From b43ef63449231c0068e0828377fde02c181687a5 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Tue, 29 Jul 2025 22:59:40 -0400
Subject: [PATCH 1/3] Adding some funny stuff

---
 chunker.py            | 127 +++++++++++++++++++++++++++++++++
 main.py               | 158 +++++++++++++++++++-----------------------
 petmd_scrape_index.py |  24 +++++++
 query.py              |  98 ++++++++++++++++++++++++++
 4 files changed, 319 insertions(+), 88 deletions(-)
 create mode 100644 chunker.py
 create mode 100644 petmd_scrape_index.py
 create mode 100644 query.py

diff --git a/chunker.py b/chunker.py
new file mode 100644
index 0000000..0018740
--- /dev/null
+++ b/chunker.py
@@ -0,0 +1,127 @@
+import os
+from math import ceil
+import re
+from uuid import UUID, uuid4
+
+from chromadb.utils.embedding_functions.ollama_embedding_function import (
+    OllamaEmbeddingFunction,
+)
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
+
+def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
+    if header_patterns is None:
+        header_patterns = [r"^.*Header.*$"]
+    if footer_patterns is None:
+        footer_patterns = [r"^.*Footer.*$"]
+
+    for pattern in header_patterns + footer_patterns:
+        text = re.sub(pattern, "", text, flags=re.MULTILINE)
+
+    return text.strip()
+
+
+def remove_special_characters(text, special_chars=None):
+    if special_chars is None:
+        special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"
+
+    text = re.sub(special_chars, "", text)
+    return text.strip()
+
+
+def remove_repeated_substrings(text, pattern=r"\.{2,}"):
+    text = re.sub(pattern, ".", text)
+    return text.strip()
+
+
+def remove_extra_spaces(text):
+    text = re.sub(r"\n\s*\n", "\n\n", text)
+    text = re.sub(r"\s+", " ", text)
+
+    return text.strip()
+
+
+def preprocess_text(text):
+    # Remove headers and footers
+    text = remove_headers_footers(text)
+
+    # Remove special characters
+    text = remove_special_characters(text)
+
+    # Remove repeated substrings like dots
+    text = remove_repeated_substrings(text)
+
+    # Remove extra spaces between lines and within lines
+    text = remove_extra_spaces(text)
+
+    # Additional cleaning steps can be added here
+
+    return text.strip()
+
+
+class Chunk:
+    def __init__(
+        self,
+        text: str,
+        size: int,
+        document_id: UUID,
+        chunk_id: int,
+        embedding,
+    ):
+        self.text = text
+        self.size = size
+        self.document_id = document_id
+        self.chunk_id = chunk_id
+        self.embedding = embedding
+
+
+class Chunker:
+    embedding_fx = OllamaEmbeddingFunction(
+        url=os.getenv("OLLAMA_URL", ""),
+        model_name="mxbai-embed-large",
+    )
+
+    def __init__(self, collection) -> None:
+        self.collection = collection
+
+    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
+        doc_uuid = uuid4()
+
+        chunk_size = min(chunk_size, len(document))
+
+        chunks = []
+        num_chunks = ceil(len(document) / chunk_size)
+        document_length = len(document)
+
+        for i in range(num_chunks):
+            curr_pos = i * chunk_size
+            to_pos = (
+                curr_pos + chunk_size
+                if curr_pos + chunk_size < document_length
+                else document_length
+            )
+            text_chunk = self.clean_document(document[curr_pos:to_pos])
+
+            embedding = self.embedding_fx([text_chunk])
+            self.collection.add(
+                ids=[str(doc_uuid) + ":" + str(i)],
+                documents=[text_chunk],
+                embeddings=embedding,
+            )
+
+        return chunks
+
+    def clean_document(self, document: str) -> str:
+        """This function will remove information that is noise or already known.
+
+        Example: We already know all the things in here are Simba-related, so we don't need things like
+        "Summary of Simba's visit"
+        """
+
+        document = document.replace("\\n", "")
+        document = document.strip()
+
+        return preprocess_text(document)
diff --git a/main.py b/main.py
index e911e69..968c7b3 100644
--- a/main.py
+++ b/main.py
@@ -1,102 +1,84 @@
+import logging
+
+import argparse
+import chromadb
 import ollama
-import os
-from uuid import uuid4, UUID
+
 from request import PaperlessNGXService
+from chunker import Chunker
 
-from math import ceil
-
-import chromadb
-
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 from dotenv import load_dotenv
 
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
+
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
 )
 
 load_dotenv()
 
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts[: len(texts) // 2]:
+        chunker.chunk_document(document=text)
 
 
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+    print(results)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
         )
-
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-
-        return chunks
-
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-
-print(output["response"])
+    else:
+        print("please provide a query")
diff --git a/petmd_scrape_index.py b/petmd_scrape_index.py
new file mode 100644
index 0000000..24ee508
--- /dev/null
+++ b/petmd_scrape_index.py
@@ -0,0 +1,24 @@
+from bs4 import BeautifulSoup
+import chromadb
+import httpx
+
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+
+# Scrape
+BASE_URL = "https://www.vet.cornell.edu"
+LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
+
+QUERY_URL = BASE_URL + LIST_URL
+r = httpx.get(QUERY_URL)
+soup = BeautifulSoup(r.text)
+
+container = soup.find("div", class_="field-body")
+a_s = container.find_all("a", href=True)
+
+new_texts = []
+
+for link in a_s:
+    endpoint = link["href"]
+    query_url = BASE_URL + endpoint
+    r2 = httpx.get(query_url)
+    article_soup = BeautifulSoup(r2.text)
diff --git a/query.py b/query.py
new file mode 100644
index 0000000..0064bb6
--- /dev/null
+++ b/query.py
@@ -0,0 +1,98 @@
+import json
+from typing import Literal
+
+from ollama import chat, ChatResponse
+
+from pydantic import BaseModel, Field
+
+# This uses inferred filters: an LLM generates the metadata filters from the user query.
+
+
+class FilterOperation(BaseModel):
+    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
+    value: str | list[str]
+
+
+class FilterQuery(BaseModel):
+    field_name: Literal["created_date", "tags"]
+    op: FilterOperation
+
+
+class AndQuery(BaseModel):
+    op: Literal["$and", "$or"]
+    subqueries: list[FilterQuery]
+
+
+class GeneratedQuery(BaseModel):
+    fields: list[str]
+    extracted_metadata_fields: str
+
+
+PROMPT = """
+You are an information specialist that processes user queries. The user queries are all about
+a cat, Simba, and his records. The types of records are listed below. Using the query, extract the
+type of record and the date range the user is trying to query.
+
+
+You have several operators at your disposal:
+- $gt: greater than
+- $gte: greater than or equal
+- $eq: equal
+- $ne: not equal
+- $lt: less than
+- $lte: less than or equal to
+- $in: in
+- $nin: not in
+
+Logical operators:
+- $and, $or
+
+### Example 1
+Query: "Who is Simba's current vet?"
+Metadata fields: {"created_date", "tags"}
+Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill", "medical records", "aftercare"]}}]}
+
+### Example 2
+Query: "How many teeth has Simba had removed?"
+Metadata fields: {"tags"}
+Extracted metadata fields: {"tags": "medical records"}
+
+### Example 3
+Query: "How many times has Simba been to the vet this year?"
+Metadata fields: {"tags", "created_date"}
+Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}
+
+document_types:
+- aftercare
+- bill
+- insurance claim
+- medical records
+
+Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
+"""
+
+
+class QueryGenerator:
+    def __init__(self) -> None:
+        pass
+
+    def get_query(self, input: str):
+        response: ChatResponse = chat(
+            model="gemma3n:e4b",
+            messages=[
+                {"role": "system", "content": PROMPT},
+                {"role": "user", "content": input},
+            ],
+            format=GeneratedQuery.model_json_schema(),
+        )
+
+        print(
+            json.loads(
+                json.loads(response["message"]["content"])["extracted_metadata_fields"]
+            )
+        )
+
+
+if __name__ == "__main__":
+    qg = QueryGenerator()
+    qg.get_query("How old is Simba?")
-- 
2.49.1


From 0a88a03c90aee40663fbb86e973bb330aa6f97d6 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Wed, 30 Jul 2025 19:58:29 -0400
Subject: [PATCH 2/3] Expanded context window, CLI'd the app, and added
 preprocessing

---
 main.py    | 3 +--
 request.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 968c7b3..e22a907 100644
--- a/main.py
+++ b/main.py
@@ -32,7 +32,7 @@ def chunk_data(texts: list[str], collection):
     chunker = Chunker(collection)
 
     print(f"chunking {len(texts)} documents")
-    for text in texts[: len(texts) // 2]:
+    for text in texts:
         chunker.chunk_document(document=text)
 
 
@@ -40,7 +40,6 @@ def consult_oracle(input: str, collection):
     # Ask
     embeddings = Chunker.embedding_fx(input=[input])
     results = collection.query(query_texts=[input], query_embeddings=embeddings)
-    print(results)
 
     # Generate
     output = ollama.generate(
diff --git a/request.py b/request.py
index 229d619..9357096 100644
--- a/request.py
+++ b/request.py
@@ -21,4 +21,4 @@ class PaperlessNGXService:
 
 if __name__ == "__main__":
     pp = PaperlessNGXService()
-    print(pp.get_data()[0].keys())
+    pp.get_data()
-- 
2.49.1


From c7152d3f32b9cc622e1db31aba9852e9ebc6f4c3 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Wed, 30 Jul 2025 20:27:03 -0400
Subject: [PATCH 3/3] Moving chromadb to env var

---
 main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index e22a907..b568c46 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import argparse
 import chromadb
@@ -11,7 +12,9 @@ from chunker import Chunker
 
 from dotenv import load_dotenv
 
-client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+load_dotenv()
+
+client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
 
@@ -24,8 +27,6 @@ parser = argparse.ArgumentParser(
     description="An LLM tool to query information about Simba <3"
 )
 
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
-
-load_dotenv()
-- 
2.49.1
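
Note on wiring these pieces together: query.py builds ChromaDB-style metadata filters, but nothing in this series passes them to collection.query, and QueryGenerator.get_query only prints the parsed result. The sketch below shows one way the inferred filter could feed the retrieval step in main.py. It is an illustration under assumptions, not part of the patches: it assumes get_query is refactored to return the parsed dict instead of printing it, and that chunks are stored with "created_date"/"tags" metadata, which the chunker above does not add yet.

# Sketch only: assumes QueryGenerator.get_query returns the parsed filter dict
# and that chunks were indexed with "created_date" and "tags" metadata.
import os

import chromadb
import ollama

from chunker import Chunker
from query import QueryGenerator


def consult_oracle_filtered(question: str, collection) -> str:
    # Infer a ChromaDB metadata filter from the natural-language question,
    # e.g. {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}
    metadata_filter = QueryGenerator().get_query(question)  # assumed to return a dict

    # Embed the question with the same Ollama embedding model used at indexing time.
    embeddings = Chunker.embedding_fx(input=[question])

    # Retrieve only chunks whose metadata matches the inferred filter.
    results = collection.query(
        query_embeddings=embeddings,
        n_results=5,
        where=metadata_filter or None,
    )

    output = ollama.generate(
        model="gemma3n:e4b",
        prompt=f"Using this data: {results}. Respond to this prompt: {question}",
    )
    return output["response"]


if __name__ == "__main__":
    client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
    simba_docs = client.get_or_create_collection(name="simba_docs")
    print(consult_oracle_filtered("How many vet bills were there this year?", simba_docs))

Passing the inferred filter through the where= argument keeps the filtering inside ChromaDB's vector search rather than post-filtering results in Python.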