5 Commits

Author | SHA1 | Message | Date
ryan | fc504d3e9c | Merge pull request 'Adding some funny stuff' (#2) from data-preprocessing into main (Reviewed-on: #2; implements #1) | 2025-07-30 20:30:34 -04:00
Ryan Chen | c7152d3f32 | Moving chromadb to env var | 2025-07-30 20:27:03 -04:00
Ryan Chen | 0a88a03c90 | Expanded context window, CLI'd the app, and added preprocessing | 2025-07-30 19:58:29 -04:00
Ryan Chen | b43ef63449 | Adding some funny stuff | 2025-07-29 22:59:40 -04:00
ryan | b698109183 | Merge pull request 'Adding more embeddings' (#1) from better-embeddings into main (Reviewed-on: #1) | 2025-07-26 19:55:31 -04:00
5 changed files with 321 additions and 90 deletions

127 chunker.py Normal file

@@ -0,0 +1,127 @@
import os
from math import ceil
import re
from uuid import UUID, uuid4
from chromadb.utils.embedding_functions.ollama_embedding_function import (
OllamaEmbeddingFunction,
)
from dotenv import load_dotenv
load_dotenv()
def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
if header_patterns is None:
header_patterns = [r"^.*Header.*$"]
if footer_patterns is None:
footer_patterns = [r"^.*Footer.*$"]
for pattern in header_patterns + footer_patterns:
text = re.sub(pattern, "", text, flags=re.MULTILINE)
return text.strip()
def remove_special_characters(text, special_chars=None):
if special_chars is None:
special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"
text = re.sub(special_chars, "", text)
return text.strip()
def remove_repeated_substrings(text, pattern=r"\.{2,}"):
text = re.sub(pattern, ".", text)
return text.strip()
def remove_extra_spaces(text):
text = re.sub(r"\n\s*\n", "\n\n", text)
text = re.sub(r"\s+", " ", text)
return text.strip()
def preprocess_text(text):
# Remove headers and footers
text = remove_headers_footers(text)
# Remove special characters
text = remove_special_characters(text)
# Remove repeated substrings like dots
text = remove_repeated_substrings(text)
# Remove extra spaces between lines and within lines
text = remove_extra_spaces(text)
# Additional cleaning steps can be added here
return text.strip()
class Chunk:
def __init__(
self,
text: str,
size: int,
document_id: UUID,
chunk_id: int,
embedding,
):
self.text = text
self.size = size
self.document_id = document_id
self.chunk_id = chunk_id
self.embedding = embedding
class Chunker:
embedding_fx = OllamaEmbeddingFunction(
url=os.getenv("OLLAMA_URL", ""),
model_name="mxbai-embed-large",
)
def __init__(self, collection) -> None:
self.collection = collection
def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
doc_uuid = uuid4()
chunk_size = min(chunk_size, len(document))
chunks = []
num_chunks = ceil(len(document) / chunk_size)
document_length = len(document)
for i in range(num_chunks):
curr_pos = i * chunk_size  # start offset of the i-th chunk
to_pos = (
curr_pos + chunk_size
if curr_pos + chunk_size < document_length
else document_length
)
text_chunk = self.clean_document(document[curr_pos:to_pos])
embedding = self.embedding_fx([text_chunk])
self.collection.add(
ids=[str(doc_uuid) + ":" + str(i)],
documents=[text_chunk],
embeddings=embedding,
)
return chunks
def clean_document(self, document: str) -> str:
"""This function will remove information that is noise or already known.
Example: We already know all the things in here are Simba-related, so we don't need things like
"Sumamry of simba's visit"
"""
document = document.replace("\\n", "")
document = document.strip()
return preprocess_text(document)
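A minimal usage sketch for the Chunker above (illustration only, not part of this diff). It assumes OLLAMA_URL in .env points at a running Ollama instance with the mxbai-embed-large model pulled, and that ./chromadb is a writable path; note that chunk_document stores chunks in the collection but still returns an empty list, so callers should query the collection rather than rely on the return value.

# Illustration only, not part of this PR. Assumes OLLAMA_URL is set and ./chromadb is writable.
import chromadb
from chunker import Chunker

client = chromadb.PersistentClient(path="./chromadb")
collection = client.get_or_create_collection(name="simba_docs")

chunker = Chunker(collection)
chunker.chunk_document("Simba was seen on 2025-03-02. Two teeth were extracted.", chunk_size=1000)

# Query with the same Ollama embedding function used for indexing.
question = "How many teeth has Simba had removed?"
results = collection.query(
    query_embeddings=Chunker.embedding_fx([question]),
    n_results=3,
)
print(results["documents"])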

160 main.py

@@ -1,102 +1,84 @@
-import ollama
+import logging
 import os
-from uuid import uuid4, UUID
+import argparse
+import chromadb
+import ollama
 from request import PaperlessNGXService
+from chunker import Chunker
-from math import ceil
-import chromadb
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 from dotenv import load_dotenv
-
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
 
 load_dotenv()
+
+client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
+)
 
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
-        )
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-        return chunks
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-print(output["response"])
+
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts:
+        chunker.chunk_document(document=text)
+
+
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
+        )
+    else:
+        print("please provide a query")
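Given the argparse setup in the new main.py, the tool is presumably run as python main.py --reindex "How many teeth has Simba had removed?" to index first and then query, or with just the query once the ChromaDB store is populated. The indexing path can also be driven from Python; a rough sketch, assuming CHROMADB_PATH, OLLAMA_URL, and the Paperless-NGX credentials are set in .env:

# Rough sketch, not part of the diff. Importing main is safe because parse_args()
# only runs under the __main__ guard; importing just sets up the client and parser.
from main import chunk_data, simba_docs
from request import PaperlessNGXService

docs = PaperlessNGXService().get_data()
chunk_data([doc["content"] for doc in docs], collection=simba_docs)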

24 petmd_scrape_index.py Normal file

@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text)
container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)
new_texts = []
for link in a_s:
endpoint = link["href"]
query_url = BASE_URL + endpoint
r2 = httpx.get(query_url)
article_soup = BeautifulSoup(r2.text)
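The scrape loop above stops after parsing each article page and never fills new_texts (the remaining lines of the 24-line file, if any, are not shown here). As a sketch of how the loop could be finished, which is an assumption rather than the file's contents, the article text could be collected into new_texts and indexed with the Chunker from this PR into the feline_vet_lookup collection that main.py creates:

# Sketch only, not the file's remaining lines. Reuses BASE_URL, a_s, new_texts, and
# client from above, plus the Chunker added in this PR.
from chunker import Chunker

for link in a_s:
    r2 = httpx.get(BASE_URL + link["href"])
    article_soup = BeautifulSoup(r2.text, "html.parser")
    new_texts.append(article_soup.get_text(separator=" ", strip=True))

chunker = Chunker(client.get_or_create_collection(name="feline_vet_lookup"))
for text in new_texts:
    chunker.chunk_document(document=text)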

98 query.py Normal file

@@ -0,0 +1,98 @@
import json
from typing import Literal
from ollama import chat, ChatResponse
from pydantic import BaseModel, Field
# This uses inferred filters — which means using LLM to create the metadata filters
class FilterOperation(BaseModel):
op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
value: str | list[str]
class FilterQuery(BaseModel):
field_name: Literal["created_date", "tags"]
op: FilterOperation
class AndQuery(BaseModel):
op: Literal["$and", "$or"]
subqueries: list[FilterQuery]
class GeneratedQuery(BaseModel):
fields: list[str]
extracted_metadata_fields: str
PROMPT = """
You are an information specialist that processes user queries. The user queries are all about
a cat, Simba, and its records. The types of records are listed below. Using the query, extract the
type of record the user is trying to query and the date range the user is trying to query.
You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in
Logical operators:
- $and, $or
### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: "{"created_date, tags"}"
Extracted metadata fields: {"$and": [{"created_date: {"$gt": "2025-01-01"}, "tags": {"$in": ["bill", "medical records", "aftercare"]}}]}
### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {"tags"}
Extracted metadata fields: {"tags": "medical records"}
### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"tags", "created_date"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}, "tags": {"$in": ["bill"]}}]}
document_types:
- aftercare
- bill
- insurance claim
- medical records
Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON
"""
class QueryGenerator:
def __init__(self) -> None:
pass
def get_query(self, input: str):
response: ChatResponse = chat(
model="gemma3n:e4b",
messages=[
{"role": "system", "content": PROMPT},
{"role": "user", "content": input},
],
format=GeneratedQuery.model_json_schema(),
)
print(
json.loads(
json.loads(response["message"]["content"])["extracted_metadata_fields"]
)
)
if __name__ == "__main__":
qg = QueryGenerator()
qg.get_query("How old is Simba?")
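The extracted metadata fields are shaped like a ChromaDB where clause, so the intended use is presumably to combine them with the embedding search over the Simba collections. A minimal sketch, assuming get_query is adapted to return the parsed filter dict instead of printing it, and that chunks are stored with created_date and tags metadata (chunker.py does not add metadata yet):

# Sketch only. Assumes get_query() returns the parsed filter dict and that documents
# in the collection carry "created_date" and "tags" metadata.
import chromadb
from chunker import Chunker
from query import QueryGenerator

client = chromadb.PersistentClient(path="./chromadb")  # assumed path
simba_docs = client.get_or_create_collection(name="simba_docs")

question = "How many times has Simba been to the vet this year?"
where_filter = QueryGenerator().get_query(question)

results = simba_docs.query(
    query_embeddings=Chunker.embedding_fx([question]),
    where=where_filter,  # ChromaDB applies the $and/$gt/$in operators as metadata filters
    n_results=5,
)
print(results["documents"])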

request.py

@@ -21,4 +21,4 @@ class PaperlessNGXService:
 if __name__ == "__main__":
     pp = PaperlessNGXService()
-    print(pp.get_data()[0].keys())
+    pp.get_data()
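Elsewhere in this PR, main.py treats get_data() as returning a list of dicts that each expose at least a "content" key; a small sketch of that assumed contract:

# Sketch of the contract assumed by main.py: each returned dict has a "content" field.
from request import PaperlessNGXService

pp = PaperlessNGXService()
for doc in pp.get_data():
    print(len(doc["content"]))  # e.g. inspect document sizes before chunking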