Merge pull request 'Adding more embeddings' (#1) from better-embeddings into main
Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
27
main.py
27
main.py
@@ -1,4 +1,5 @@
|
|||||||
import ollama
|
import ollama
|
||||||
|
import os
|
||||||
from uuid import uuid4, UUID
|
from uuid import uuid4, UUID
|
||||||
|
|
||||||
from request import PaperlessNGXService
|
from request import PaperlessNGXService
|
||||||
@@ -7,9 +8,17 @@ from math import ceil
|
|||||||
|
|
||||||
import chromadb
|
import chromadb
|
||||||
|
|
||||||
|
from chromadb.utils.embedding_functions.ollama_embedding_function import (
|
||||||
|
OllamaEmbeddingFunction,
|
||||||
|
)
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
client = chromadb.EphemeralClient()
|
client = chromadb.EphemeralClient()
|
||||||
collection = client.create_collection(name="docs")
|
collection = client.create_collection(name="docs")
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
class Chunk:
|
class Chunk:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -29,6 +38,11 @@ class Chunk:
|
|||||||
|
|
||||||
class Chunker:
|
class Chunker:
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
|
self.embedding_fx = OllamaEmbeddingFunction(
|
||||||
|
url=os.getenv("OLLAMA_URL", ""),
|
||||||
|
model_name="mxbai-embed-large",
|
||||||
|
)
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
|
def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
|
||||||
@@ -47,15 +61,20 @@ class Chunker:
|
|||||||
)
|
)
|
||||||
text_chunk = document[curr_pos:to_pos]
|
text_chunk = document[curr_pos:to_pos]
|
||||||
|
|
||||||
|
embedding = self.embedding_fx([text_chunk])
|
||||||
collection.add(
|
collection.add(
|
||||||
ids=[str(doc_uuid) + ":" + str(i)],
|
ids=[str(doc_uuid) + ":" + str(i)],
|
||||||
documents=[text_chunk],
|
documents=[text_chunk],
|
||||||
|
embeddings=embedding,
|
||||||
)
|
)
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
# Setup
|
embedding_fx = OllamaEmbeddingFunction(
|
||||||
|
url=os.getenv("OLLAMA_URL", ""),
|
||||||
|
model_name="mxbai-embed-large",
|
||||||
|
)
|
||||||
|
|
||||||
# Step 1: Get the text
|
# Step 1: Get the text
|
||||||
ppngx = PaperlessNGXService()
|
ppngx = PaperlessNGXService()
|
||||||
@@ -70,9 +89,9 @@ for text in texts:
|
|||||||
chunker.chunk_document(document=text)
|
chunker.chunk_document(document=text)
|
||||||
|
|
||||||
# Ask
|
# Ask
|
||||||
input = "How many teeth has Simba had removed?"
|
input = "How many teeth has Simba had removed? Who is his current vet?"
|
||||||
response = ollama.embed(model="mxbai-embed-large", input=input)
|
embeddings = embedding_fx(input=[input])
|
||||||
results = collection.query(query_texts=[input], n_results=1)
|
results = collection.query(query_texts=[input], query_embeddings=embeddings)
|
||||||
print(results)
|
print(results)
|
||||||
# Generate
|
# Generate
|
||||||
output = ollama.generate(
|
output = ollama.generate(
|
||||||
|
|||||||
@@ -17,3 +17,8 @@ class PaperlessNGXService:
|
|||||||
print(f"Getting data from: {self.url}")
|
print(f"Getting data from: {self.url}")
|
||||||
r = httpx.get(self.url, headers=self.headers)
|
r = httpx.get(self.url, headers=self.headers)
|
||||||
return r.json()["results"]
|
return r.json()["results"]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pp = PaperlessNGXService()
|
||||||
|
print(pp.get_data()[0].keys())
|
||||||
|
|||||||
Reference in New Issue
Block a user