Adding some funny stuff
chunker.py (new file, 127 lines)
@@ -0,0 +1,127 @@
import os
from math import ceil
import re
from uuid import UUID, uuid4

from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
from dotenv import load_dotenv


load_dotenv()


def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    if header_patterns is None:
        header_patterns = [r"^.*Header.*$"]
    if footer_patterns is None:
        footer_patterns = [r"^.*Footer.*$"]

    for pattern in header_patterns + footer_patterns:
        text = re.sub(pattern, "", text, flags=re.MULTILINE)

    return text.strip()


def remove_special_characters(text, special_chars=None):
    if special_chars is None:
        special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"

    text = re.sub(special_chars, "", text)
    return text.strip()


def remove_repeated_substrings(text, pattern=r"\.{2,}"):
    text = re.sub(pattern, ".", text)
    return text.strip()


def remove_extra_spaces(text):
    text = re.sub(r"\n\s*\n", "\n\n", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def preprocess_text(text):
    # Remove headers and footers
    text = remove_headers_footers(text)

    # Remove special characters
    text = remove_special_characters(text)

    # Remove repeated substrings like dots
    text = remove_repeated_substrings(text)

    # Remove extra spaces between lines and within lines
    text = remove_extra_spaces(text)

    # Additional cleaning steps can be added here

    return text.strip()


class Chunk:
    def __init__(
        self,
        text: str,
        size: int,
        document_id: UUID,
        chunk_id: int,
        embedding,
    ):
        self.text = text
        self.size = size
        self.document_id = document_id
        self.chunk_id = chunk_id
        self.embedding = embedding


class Chunker:
    embedding_fx = OllamaEmbeddingFunction(
        url=os.getenv("OLLAMA_URL", ""),
        model_name="mxbai-embed-large",
    )

    def __init__(self, collection) -> None:
        self.collection = collection

    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
        doc_uuid = uuid4()

        chunk_size = min(chunk_size, len(document))

        chunks = []
        num_chunks = ceil(len(document) / chunk_size)
        document_length = len(document)

        for i in range(num_chunks):
            # Advance by chunk_size (not num_chunks) so the chunks tile the document
            curr_pos = i * chunk_size
            to_pos = (
                curr_pos + chunk_size
                if curr_pos + chunk_size < document_length
                else document_length
            )
            text_chunk = self.clean_document(document[curr_pos:to_pos])

            embedding = self.embedding_fx([text_chunk])
            self.collection.add(
                ids=[str(doc_uuid) + ":" + str(i)],
                documents=[text_chunk],
                embeddings=embedding,
            )
            # Track the chunk so callers get the list the signature promises
            chunks.append(
                Chunk(
                    text=text_chunk,
                    size=len(text_chunk),
                    document_id=doc_uuid,
                    chunk_id=i,
                    embedding=embedding,
                )
            )

        return chunks

    def clean_document(self, document: str) -> str:
        """Remove information that is noise or already known.

        Example: we already know everything in here is Simba-related, so we don't
        need headings like "Summary of Simba's visit".
        """

        document = document.replace("\\n", "")
        document = document.strip()

        return preprocess_text(document)
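For a concrete sense of the offset arithmetic in chunk_document (with the stride corrected to chunk_size), here is a minimal sketch with illustrative numbers only, no embedding or ChromaDB calls:

from math import ceil

document = "x" * 2500
chunk_size = 1000

num_chunks = ceil(len(document) / chunk_size)  # 3
for i in range(num_chunks):
    curr_pos = i * chunk_size                  # 0, 1000, 2000
    to_pos = min(curr_pos + chunk_size, len(document))
    print(i, curr_pos, to_pos)                 # the last chunk is the 500-char remainder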
main.py (136 changed lines)
@@ -1,102 +1,84 @@
+import logging
+
+import argparse
+import chromadb
 import ollama
-import os
-from uuid import uuid4, UUID
 
 from request import PaperlessNGXService
+from chunker import Chunker
-
-from math import ceil
-
-import chromadb
-
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 
 from dotenv import load_dotenv
 
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
+
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
+)
 
 load_dotenv()
 
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
-
-
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
-        )
-
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-
-        return chunks
-
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-
-print(output["response"])
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts[: len(texts) // 2]:
+        chunker.chunk_document(document=text)
+
+
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+    print(results)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
+        )
+    else:
+        print("please provide a query")
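With the argparse setup above, the tool is driven from the command line. Assuming the .env file loaded by load_dotenv supplies the Ollama and Paperless-NGX settings that chunker.py and request.py expect, a typical run would be `python main.py --reindex "How many teeth has Simba had removed?"` to re-chunk the documents and then query them, or `python main.py "Who is Simba's current vet?"` to query the existing simba_docs collection.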
petmd_scrape_index.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")

# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"

QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text, "html.parser")

container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)

new_texts = []

for link in a_s:
    endpoint = link["href"]
    query_url = BASE_URL + endpoint
    r2 = httpx.get(query_url)
    article_soup = BeautifulSoup(r2.text, "html.parser")
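The scraper stops after parsing each linked article, and the commit never fills new_texts. A plausible continuation, purely an assumption and not part of this commit, would extract the article text inside the for link in a_s: loop so it can later be chunked into a collection such as feline_vet_lookup:

    # Hypothetical continuation (not in this commit): collect the article body text
    paragraphs = article_soup.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
    new_texts.append(article_text)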
query.py (new file, 98 lines)
@@ -0,0 +1,98 @@
import json
from typing import Literal

from ollama import chat, ChatResponse

from pydantic import BaseModel, Field

# This uses inferred filters: an LLM creates the metadata filters


class FilterOperation(BaseModel):
    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
    value: str | list[str]


class FilterQuery(BaseModel):
    field_name: Literal["created_date", "tags"]
    op: FilterOperation


class AndQuery(BaseModel):
    op: Literal["$and", "$or"]
    subqueries: list[FilterQuery]


class GeneratedQuery(BaseModel):
    fields: list[str]
    extracted_metadata_fields: str


PROMPT = """
You are an information specialist that processes user queries. The user queries are all about
a cat, Simba, and his records. The types of records are listed below. Using the query, extract the
type of record the user is trying to query and the date range the user is trying to query.

You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in

Logical operators:
- $and, $or

### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: {"created_date", "tags"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill", "medical records", "aftercare"]}}]}

### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {"tags"}
Extracted metadata fields: {"tags": "medical records"}

### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"tags", "created_date"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}

document_types:
- aftercare
- bill
- insurance claim
- medical records

Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
"""


class QueryGenerator:
    def __init__(self) -> None:
        pass

    def get_query(self, input: str):
        response: ChatResponse = chat(
            model="gemma3n:e4b",
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": input},
            ],
            format=GeneratedQuery.model_json_schema(),
        )

        print(
            json.loads(
                json.loads(response["message"]["content"])["extracted_metadata_fields"]
            )
        )


if __name__ == "__main__":
    qg = QueryGenerator()
    qg.get_query("How old is Simba?")
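The extracted metadata fields follow ChromaDB's where-filter syntax, but get_query only prints the parsed result. A sketch of how such a filter might be applied, assuming get_query were changed to return the parsed dictionary (that return value and the example filter below are assumptions, not part of this commit):

# Hypothetical wiring (not in this commit): feed the generated filter into a ChromaDB query.
import chromadb

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
simba_docs = client.get_or_create_collection(name="simba_docs")

where_filter = {"tags": {"$in": ["bill", "medical records"]}}  # e.g. what get_query might return

results = simba_docs.query(
    query_texts=["How many times has Simba been to the vet this year?"],
    where=where_filter,
    n_results=5,
)
print(results["documents"])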