From b43ef63449231c0068e0828377fde02c181687a5 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Tue, 29 Jul 2025 22:59:40 -0400
Subject: [PATCH 1/3] Adding some funny stuff

---
 chunker.py            | 127 +++++++++++++++++++++++++++++++++
 main.py               | 158 +++++++++++++++++++-----------------------
 petmd_scrape_index.py |  24 +++++++
 query.py              |  98 ++++++++++++++++++++++++++
 4 files changed, 319 insertions(+), 88 deletions(-)
 create mode 100644 chunker.py
 create mode 100644 petmd_scrape_index.py
 create mode 100644 query.py

diff --git a/chunker.py b/chunker.py
new file mode 100644
index 0000000..0018740
--- /dev/null
+++ b/chunker.py
@@ -0,0 +1,127 @@
+import os
+from math import ceil
+import re
+from uuid import UUID, uuid4
+
+from chromadb.utils.embedding_functions.ollama_embedding_function import (
+    OllamaEmbeddingFunction,
+)
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
+
+def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
+    if header_patterns is None:
+        header_patterns = [r"^.*Header.*$"]
+    if footer_patterns is None:
+        footer_patterns = [r"^.*Footer.*$"]
+
+    for pattern in header_patterns + footer_patterns:
+        text = re.sub(pattern, "", text, flags=re.MULTILINE)
+
+    return text.strip()
+
+
+def remove_special_characters(text, special_chars=None):
+    if special_chars is None:
+        special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"
+
+    text = re.sub(special_chars, "", text)
+    return text.strip()
+
+
+def remove_repeated_substrings(text, pattern=r"\.{2,}"):
+    text = re.sub(pattern, ".", text)
+    return text.strip()
+
+
+def remove_extra_spaces(text):
+    text = re.sub(r"\n\s*\n", "\n\n", text)
+    text = re.sub(r"\s+", " ", text)
+
+    return text.strip()
+
+
+def preprocess_text(text):
+    # Remove headers and footers
+    text = remove_headers_footers(text)
+
+    # Remove special characters
+    text = remove_special_characters(text)
+
+    # Remove repeated substrings like dots
+    text = remove_repeated_substrings(text)
+
+    # Remove extra spaces between lines and within lines
+    text = remove_extra_spaces(text)
+
+    # Additional cleaning steps can be added here
+
+    return text.strip()
+
+
+class Chunk:
+    def __init__(
+        self,
+        text: str,
+        size: int,
+        document_id: UUID,
+        chunk_id: int,
+        embedding,
+    ):
+        self.text = text
+        self.size = size
+        self.document_id = document_id
+        self.chunk_id = chunk_id
+        self.embedding = embedding
+
+
+class Chunker:
+    embedding_fx = OllamaEmbeddingFunction(
+        url=os.getenv("OLLAMA_URL", ""),
+        model_name="mxbai-embed-large",
+    )
+
+    def __init__(self, collection) -> None:
+        self.collection = collection
+
+    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
+        doc_uuid = uuid4()
+
+        chunk_size = min(chunk_size, len(document))
+
+        chunks = []
+        num_chunks = ceil(len(document) / chunk_size)
+        document_length = len(document)
+
+        for i in range(num_chunks):
+            curr_pos = i * chunk_size
+            to_pos = (
+                curr_pos + chunk_size
+                if curr_pos + chunk_size < document_length
+                else document_length
+            )
+            text_chunk = self.clean_document(document[curr_pos:to_pos])
+
+            embedding = self.embedding_fx([text_chunk])
+            self.collection.add(
+                ids=[str(doc_uuid) + ":" + str(i)],
+                documents=[text_chunk],
+                embeddings=embedding,
+            )
+
+        return chunks
+
+    def clean_document(self, document: str) -> str:
+        """This function will remove information that is noise or already known.
+
+        Example: We already know all the things in here are Simba-related, so we don't need things like
+        "Summary of Simba's visit"
+        """
+
+        document = document.replace("\\n", "")
+        document = document.strip()
+
+        return preprocess_text(document)
diff --git a/main.py b/main.py
index e911e69..968c7b3 100644
--- a/main.py
+++ b/main.py
@@ -1,102 +1,84 @@
+import logging
+
+import argparse
+import chromadb
 import ollama
-import os
-from uuid import uuid4, UUID
+
 from request import PaperlessNGXService
+from chunker import Chunker
 
-from math import ceil
-
-import chromadb
-
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 from dotenv import load_dotenv
 
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
+
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
 )
 
 load_dotenv()
 
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts[: len(texts) // 2]:
+        chunker.chunk_document(document=text)
 
 
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+    print(results)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
         )
-
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-
-        return chunks
-
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-
-print(output["response"])
+    else:
+        print("please provide a query")
diff --git a/petmd_scrape_index.py b/petmd_scrape_index.py
new file mode 100644
index 0000000..24ee508
--- /dev/null
+++ b/petmd_scrape_index.py
@@ -0,0 +1,24 @@
+from bs4 import BeautifulSoup
+import chromadb
+import httpx
+
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+
+# Scrape
+BASE_URL = "https://www.vet.cornell.edu"
+LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
+
+QUERY_URL = BASE_URL + LIST_URL
+r = httpx.get(QUERY_URL)
+soup = BeautifulSoup(r.text)
+
+container = soup.find("div", class_="field-body")
+a_s = container.find_all("a", href=True)
+
+new_texts = []
+
+for link in a_s:
+    endpoint = link["href"]
+    query_url = BASE_URL + endpoint
+    r2 = httpx.get(query_url)
+    article_soup = BeautifulSoup(r2.text)
diff --git a/query.py b/query.py
new file mode 100644
index 0000000..0064bb6
--- /dev/null
+++ b/query.py
@@ -0,0 +1,98 @@
+import json
+from typing import Literal
+
+from ollama import chat, ChatResponse
+
+from pydantic import BaseModel, Field
+
+# This uses inferred filters: an LLM generates the metadata filters from the user query.
+
+
+class FilterOperation(BaseModel):
+    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
+    value: str | list[str]
+
+
+class FilterQuery(BaseModel):
+    field_name: Literal["created_date", "tags"]
+    op: FilterOperation
+
+
+class AndQuery(BaseModel):
+    op: Literal["$and", "$or"]
+    subqueries: list[FilterQuery]
+
+
+class GeneratedQuery(BaseModel):
+    fields: list[str]
+    extracted_metadata_fields: str
+
+
+PROMPT = """
+You are an information specialist that processes user queries. The user queries are all about
+a cat, Simba, and his records. The types of records are listed below. Using the query, extract the
+type of record and the date range the user is trying to query.
+
+
+You have several operators at your disposal:
+- $gt: greater than
+- $gte: greater than or equal
+- $eq: equal
+- $ne: not equal
+- $lt: less than
+- $lte: less than or equal to
+- $in: in
+- $nin: not in
+
+Logical operators:
+- $and, $or
+
+### Example 1
+Query: "Who is Simba's current vet?"
+Metadata fields: {"created_date", "tags"}
+Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill", "medical records", "aftercare"]}}]}
+
+### Example 2
+Query: "How many teeth has Simba had removed?"
+Metadata fields: {"tags"}
+Extracted metadata fields: {"tags": "medical records"}
+
+### Example 3
+Query: "How many times has Simba been to the vet this year?"
+Metadata fields: {"tags", "created_date"}
+Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}
+
+document_types:
+- aftercare
+- bill
+- insurance claim
+- medical records
+
+Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
+"""
+
+
+class QueryGenerator:
+    def __init__(self) -> None:
+        pass
+
+    def get_query(self, input: str):
+        response: ChatResponse = chat(
+            model="gemma3n:e4b",
+            messages=[
+                {"role": "system", "content": PROMPT},
+                {"role": "user", "content": input},
+            ],
+            format=GeneratedQuery.model_json_schema(),
+        )
+
+        print(
+            json.loads(
+                json.loads(response["message"]["content"])["extracted_metadata_fields"]
+            )
+        )
+
+
+if __name__ == "__main__":
+    qg = QueryGenerator()
+    qg.get_query("How old is Simba?")
-- 
2.49.1


From 0a88a03c90aee40663fbb86e973bb330aa6f97d6 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Wed, 30 Jul 2025 19:58:29 -0400
Subject: [PATCH 2/3] Expanded context window, CLI'd the app, and added
 preprocessing

---
 main.py    | 3 +--
 request.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 968c7b3..e22a907 100644
--- a/main.py
+++ b/main.py
@@ -32,7 +32,7 @@ def chunk_data(texts: list[str], collection):
     chunker = Chunker(collection)
 
     print(f"chunking {len(texts)} documents")
-    for text in texts[: len(texts) // 2]:
+    for text in texts:
         chunker.chunk_document(document=text)
 
 
@@ -40,7 +40,6 @@ def consult_oracle(input: str, collection):
     # Ask
     embeddings = Chunker.embedding_fx(input=[input])
     results = collection.query(query_texts=[input], query_embeddings=embeddings)
-    print(results)
 
     # Generate
     output = ollama.generate(
diff --git a/request.py b/request.py
index 229d619..9357096 100644
--- a/request.py
+++ b/request.py
@@ -21,4 +21,4 @@ class PaperlessNGXService:
 
 if __name__ == "__main__":
     pp = PaperlessNGXService()
-    print(pp.get_data()[0].keys())
+    pp.get_data()
-- 
2.49.1


From c7152d3f32b9cc622e1db31aba9852e9ebc6f4c3 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Wed, 30 Jul 2025 20:27:03 -0400
Subject: [PATCH 3/3] Moving chromadb to env var

---
 main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index e22a907..b568c46 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import argparse
 import chromadb
@@ -11,7 +12,9 @@ from chunker import Chunker
 
 from dotenv import load_dotenv
 
-client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+load_dotenv()
+
+client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
 
@@ -24,8 +27,6 @@ parser = argparse.ArgumentParser(
     description="An LLM tool to query information about Simba <3"
 )
 
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
-
-load_dotenv()
-- 
2.49.1
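
Note on wiring these pieces together: query.py builds ChromaDB-style metadata filters, but nothing in this series passes them to collection.query, and QueryGenerator.get_query only prints the parsed result. The sketch below shows one way the inferred filter could feed the retrieval step in main.py. It is an illustration under assumptions, not part of the patches: it assumes get_query is refactored to return the parsed dict instead of printing it, and that chunks are stored with "created_date"/"tags" metadata, which the chunker above does not add yet.

# Sketch only: assumes QueryGenerator.get_query returns the parsed filter dict
# and that chunks were indexed with "created_date" and "tags" metadata.
import os

import chromadb
import ollama

from chunker import Chunker
from query import QueryGenerator


def consult_oracle_filtered(question: str, collection) -> str:
    # Infer a ChromaDB metadata filter from the natural-language question,
    # e.g. {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}
    metadata_filter = QueryGenerator().get_query(question)  # assumed to return a dict

    # Embed the question with the same Ollama embedding model used at indexing time.
    embeddings = Chunker.embedding_fx(input=[question])

    # Retrieve only chunks whose metadata matches the inferred filter.
    results = collection.query(
        query_embeddings=embeddings,
        n_results=5,
        where=metadata_filter or None,
    )

    output = ollama.generate(
        model="gemma3n:e4b",
        prompt=f"Using this data: {results}. Respond to this prompt: {question}",
    )
    return output["response"]


if __name__ == "__main__":
    client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
    simba_docs = client.get_or_create_collection(name="simba_docs")
    print(consult_oracle_filtered("How many vet bills were there this year?", simba_docs))

Passing the inferred filter through the where= argument keeps the filtering inside ChromaDB's vector search rather than post-filtering results in Python.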