Compare commits
3 Commits
b698109183...data-prepr

| Author | SHA1 | Date |
|---|---|---|
| | c7152d3f32 | |
| | 0a88a03c90 | |
| | b43ef63449 | |
chunker.py (new file, 127 lines)
@@ -0,0 +1,127 @@
```python
import os
from math import ceil
import re
from uuid import UUID, uuid4

from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
from dotenv import load_dotenv

load_dotenv()


def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    if header_patterns is None:
        header_patterns = [r"^.*Header.*$"]
    if footer_patterns is None:
        footer_patterns = [r"^.*Footer.*$"]

    for pattern in header_patterns + footer_patterns:
        text = re.sub(pattern, "", text, flags=re.MULTILINE)

    return text.strip()


def remove_special_characters(text, special_chars=None):
    if special_chars is None:
        special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"

    text = re.sub(special_chars, "", text)
    return text.strip()


def remove_repeated_substrings(text, pattern=r"\.{2,}"):
    text = re.sub(pattern, ".", text)
    return text.strip()


def remove_extra_spaces(text):
    text = re.sub(r"\n\s*\n", "\n\n", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def preprocess_text(text):
    # Remove headers and footers
    text = remove_headers_footers(text)

    # Remove special characters
    text = remove_special_characters(text)

    # Remove repeated substrings like dots
    text = remove_repeated_substrings(text)

    # Remove extra spaces between lines and within lines
    text = remove_extra_spaces(text)

    # Additional cleaning steps can be added here

    return text.strip()


class Chunk:
    def __init__(
        self,
        text: str,
        size: int,
        document_id: UUID,
        chunk_id: int,
        embedding,
    ):
        self.text = text
        self.size = size
        self.document_id = document_id
        self.chunk_id = chunk_id
        self.embedding = embedding


class Chunker:
    embedding_fx = OllamaEmbeddingFunction(
        url=os.getenv("OLLAMA_URL", ""),
        model_name="mxbai-embed-large",
    )

    def __init__(self, collection) -> None:
        self.collection = collection

    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
        doc_uuid = uuid4()

        chunk_size = min(chunk_size, len(document))

        chunks = []
        num_chunks = ceil(len(document) / chunk_size)
        document_length = len(document)

        for i in range(num_chunks):
            # Advance by chunk_size so the windows tile the document
            curr_pos = i * chunk_size
            to_pos = (
                curr_pos + chunk_size
                if curr_pos + chunk_size < document_length
                else document_length
            )
            text_chunk = self.clean_document(document[curr_pos:to_pos])

            embedding = self.embedding_fx([text_chunk])
            self.collection.add(
                ids=[str(doc_uuid) + ":" + str(i)],
                documents=[text_chunk],
                embeddings=embedding,
            )
            # Record what was indexed so callers get a non-empty return value
            chunks.append(
                Chunk(
                    text=text_chunk,
                    size=len(text_chunk),
                    document_id=doc_uuid,
                    chunk_id=i,
                    embedding=embedding,
                )
            )

        return chunks

    def clean_document(self, document: str) -> str:
        """Remove information that is noise or already known.

        Example: we already know everything in here is Simba-related, so we
        don't need lines like "Summary of Simba's visit".
        """

        document = document.replace("\\n", "")
        document = document.strip()

        return preprocess_text(document)
```
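For context, a minimal sketch of how chunker.py is meant to be driven, assuming a reachable Ollama server behind `OLLAMA_URL`; the collection name and sample text are illustrative, not from the commit:

```python
import chromadb

from chunker import Chunker, preprocess_text

# Illustrative smoke test: clean a noisy string, then chunk and index a document.
client = chromadb.EphemeralClient()
collection = client.create_collection(name="scratch_docs")

print(preprocess_text("Summary of Simba's visit....   lots  of   spacing"))

chunker = Chunker(collection)
chunks = chunker.chunk_document("Simba was seen for a dental cleaning. " * 50, chunk_size=200)

# Every chunk is stored under the id "<document uuid>:<chunk index>".
print(collection.count(), len(chunks))
```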
main.py (128)
```diff
@@ -1,102 +1,84 @@
-import ollama
+import logging
 import os
 from uuid import uuid4, UUID
 
+import argparse
+import chromadb
+import ollama
+
 from request import PaperlessNGXService
+from chunker import Chunker
-
-from math import ceil
-
-import chromadb
-
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 
 from dotenv import load_dotenv
 
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
-
 load_dotenv()
 
+client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
+
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
-
-
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
-        )
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
 
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
+)
 
-        return chunks
-
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
 
+def chunk_data(texts: list[str], collection):
     # Step 2: Create chunks
-    chunker = Chunker()
+    chunker = Chunker(collection)
 
     print(f"chunking {len(texts)} documents")
     for text in texts:
         chunker.chunk_document(document=text)
 
 
+def consult_oracle(input: str, collection):
     # Ask
-    input = "How many teeth has Simba had removed? Who is his current vet?"
-    embeddings = embedding_fx(input=[input])
+    embeddings = Chunker.embedding_fx(input=[input])
     results = collection.query(query_texts=[input], query_embeddings=embeddings)
     print(results)
 
     # Generate
     output = ollama.generate(
         model="gemma3n:e4b",
-        prompt=f"Using this data: {results}. Respond to this prompt: {input}",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
     )
 
     print(output["response"])
 
 
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
+        )
+    else:
+        print("please provide a query")
```
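With this refactor, main.py becomes a small CLI: `python main.py "Who is Simba's current vet?"` answers a query against the persisted `simba_docs` collection, and adding `--reindex` re-fetches and re-chunks the Paperless-NGX documents first. Note that because `query` is a required positional argument, the `else: print("please provide a query")` branch is only reachable with an empty-string query.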
petmd_scrape_index.py (new file, 24 lines)
@@ -0,0 +1,24 @@
```python
from bs4 import BeautifulSoup
import chromadb
import httpx

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")

# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"

QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text, "html.parser")  # explicit parser avoids bs4's guess warning

container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)

new_texts = []

for link in a_s:
    endpoint = link["href"]
    query_url = BASE_URL + endpoint
    r2 = httpx.get(query_url)
    article_soup = BeautifulSoup(r2.text, "html.parser")
```
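The loop body shown stops after parsing each article page. A hedged sketch of a typical continuation, where the `field-body` selector and the `get_text` call are assumptions rather than anything in this commit:

```python
# Assumed continuation of the loop body above: extract each article's
# readable text and collect it for later indexing into the collection.
body = article_soup.find("div", class_="field-body")
if body is not None:
    new_texts.append(body.get_text(separator=" ", strip=True))
```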
query.py (new file, 98 lines)
@@ -0,0 +1,98 @@
```python
import json
from typing import Literal

from ollama import chat, ChatResponse

from pydantic import BaseModel, Field

# This uses inferred filters: an LLM extracts the metadata filters from the user's query.


class FilterOperation(BaseModel):
    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
    value: str | list[str]


class FilterQuery(BaseModel):
    field_name: Literal["created_date", "tags"]
    op: FilterOperation


class AndQuery(BaseModel):
    op: Literal["$and", "$or"]
    subqueries: list[FilterQuery]


class GeneratedQuery(BaseModel):
    fields: list[str]
    extracted_metadata_fields: str


PROMPT = """
You are an information specialist that processes user queries. The user queries are all about
a cat, Simba, and his records. The types of records are listed below. Using the query, extract the
type of record the user is trying to query and the date range the user is trying to query.

You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal to
- $eq: equal to
- $ne: not equal to
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in

Logical operators:
- $and, $or

### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: {"created_date", "tags"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}, "tags": {"$in": ["bill", "medical records", "aftercare"]}}]}

### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {"tags"}
Extracted metadata fields: {"tags": "medical records"}

### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"tags", "created_date"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}, "tags": {"$in": ["bill"]}}]}

document_types:
- aftercare
- bill
- insurance claim
- medical records

Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
"""


class QueryGenerator:
    def __init__(self) -> None:
        pass

    def get_query(self, input: str):
        response: ChatResponse = chat(
            model="gemma3n:e4b",
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": input},
            ],
            format=GeneratedQuery.model_json_schema(),
        )

        # The model returns GeneratedQuery-shaped JSON; the filter itself is a
        # JSON string inside that payload, hence the double json.loads.
        print(
            json.loads(
                json.loads(response["message"]["content"])["extracted_metadata_fields"]
            )
        )


if __name__ == "__main__":
    qg = QueryGenerator()
    qg.get_query("How old is Simba?")
```
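query.py stops at printing the inferred filter. The natural next step, sketched here as an assumption rather than anything wired up in this commit, is to hand that dict to chromadb as a `where` clause:

```python
import os

import chromadb

# Assumed wiring: reuse the persistent collection that main.py creates.
client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
simba_docs = client.get_or_create_collection(name="simba_docs")

# `where_filter` stands in for the dict that QueryGenerator.get_query prints.
where_filter = {"tags": {"$eq": "medical records"}}

results = simba_docs.query(
    query_texts=["How many teeth has Simba had removed?"],
    where=where_filter,
    n_results=5,
)
print(results["documents"])
```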
request.py
```diff
@@ -21,4 +21,4 @@ class PaperlessNGXService:
 
 if __name__ == "__main__":
     pp = PaperlessNGXService()
-    print(pp.get_data()[0].keys())
+    pp.get_data()
```
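For reference, this mirrors how the rest of the repo consumes the service (see main.py): `get_data()` returns document dicts whose text lives under the `content` key.

```python
from request import PaperlessNGXService

# Fetch documents from Paperless-NGX and pull out their text content.
ppngx = PaperlessNGXService()
docs = ppngx.get_data()
texts = [doc["content"] for doc in docs]
print(f"fetched {len(texts)} documents")
```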