From 994b3fdf1f54a2d6bdf3a1fc564ba2eb8b7617d0 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Sat, 26 Jul 2025 19:23:08 -0400 Subject: [PATCH 1/2] Adding more embeddings --- main.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ff890b2..9069559 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,10 @@ from math import ceil import chromadb +from chromadb.utils.embedding_functions.ollama_embedding_function import ( + OllamaEmbeddingFunction, +) + client = chromadb.EphemeralClient() collection = client.create_collection(name="docs") @@ -29,6 +33,11 @@ class Chunk: class Chunker: def __init__(self) -> None: + self.embedding_fx = OllamaEmbeddingFunction( + url="http://localhost:11434", + model_name="mxbai-embed-large", + ) + pass def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]: @@ -47,15 +56,20 @@ class Chunker: ) text_chunk = document[curr_pos:to_pos] + embedding = self.embedding_fx([text_chunk]) collection.add( ids=[str(doc_uuid) + ":" + str(i)], documents=[text_chunk], + embeddings=embedding, ) return chunks -# Setup +embedding_fx = OllamaEmbeddingFunction( + url="http://localhost:11434", + model_name="mxbai-embed-large", +) # Step 1: Get the text ppngx = PaperlessNGXService() @@ -71,8 +85,8 @@ for text in texts: # Ask input = "How many teeth has Simba had removed?" -response = ollama.embed(model="mxbai-embed-large", input=input) -results = collection.query(query_texts=[input], n_results=1) +embeddings = embedding_fx(input=[input]) +results = collection.query(query_texts=[input], query_embeddings=embeddings) print(results) # Generate output = ollama.generate( -- 2.49.1 From 943a22401b683c19a75e2c0ba94bfdf78757996e Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Sat, 26 Jul 2025 19:46:55 -0400 Subject: [PATCH 2/2] Adding getenv to main --- main.py | 11 ++++++++--- request.py | 5 +++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 9069559..e911e69 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import ollama +import os from uuid import uuid4, UUID from request import PaperlessNGXService @@ -11,9 +12,13 @@ from chromadb.utils.embedding_functions.ollama_embedding_function import ( OllamaEmbeddingFunction, ) +from dotenv import load_dotenv + client = chromadb.EphemeralClient() collection = client.create_collection(name="docs") +load_dotenv() + class Chunk: def __init__( @@ -34,7 +39,7 @@ class Chunk: class Chunker: def __init__(self) -> None: self.embedding_fx = OllamaEmbeddingFunction( - url="http://localhost:11434", + url=os.getenv("OLLAMA_URL", ""), model_name="mxbai-embed-large", ) @@ -67,7 +72,7 @@ class Chunker: embedding_fx = OllamaEmbeddingFunction( - url="http://localhost:11434", + url=os.getenv("OLLAMA_URL", ""), model_name="mxbai-embed-large", ) @@ -84,7 +89,7 @@ for text in texts: chunker.chunk_document(document=text) # Ask -input = "How many teeth has Simba had removed?" +input = "How many teeth has Simba had removed? Who is his current vet?" embeddings = embedding_fx(input=[input]) results = collection.query(query_texts=[input], query_embeddings=embeddings) print(results) diff --git a/request.py b/request.py index 7770fcf..229d619 100644 --- a/request.py +++ b/request.py @@ -17,3 +17,8 @@ class PaperlessNGXService: print(f"Getting data from: {self.url}") r = httpx.get(self.url, headers=self.headers) return r.json()["results"] + + +if __name__ == "__main__": + pp = PaperlessNGXService() + print(pp.get_data()[0].keys()) -- 2.49.1