5 Commits

Author | SHA1 | Message | Date
ryan | fc504d3e9c | Merge pull request 'Adding some funny stuff' (#2) from data-preprocessing into main (Reviewed-on: #2; implements #1) | 2025-07-30 20:30:34 -04:00
Ryan Chen | c7152d3f32 | Moving chromadb to env var | 2025-07-30 20:27:03 -04:00
Ryan Chen | 0a88a03c90 | Expanded context window, CLI'd the app, and added preprocessing | 2025-07-30 19:58:29 -04:00
Ryan Chen | b43ef63449 | Adding some funny stuff | 2025-07-29 22:59:40 -04:00
ryan | b698109183 | Merge pull request 'Adding more embeddings' (#1) from better-embeddings into main (Reviewed-on: #1) | 2025-07-26 19:55:31 -04:00
5 changed files with 321 additions and 90 deletions

127 chunker.py Normal file

@@ -0,0 +1,127 @@
import os
from math import ceil
import re
from uuid import UUID, uuid4
from chromadb.utils.embedding_functions.ollama_embedding_function import (
OllamaEmbeddingFunction,
)
from dotenv import load_dotenv
load_dotenv()
def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
if header_patterns is None:
header_patterns = [r"^.*Header.*$"]
if footer_patterns is None:
footer_patterns = [r"^.*Footer.*$"]
for pattern in header_patterns + footer_patterns:
text = re.sub(pattern, "", text, flags=re.MULTILINE)
return text.strip()
def remove_special_characters(text, special_chars=None):
if special_chars is None:
special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"
text = re.sub(special_chars, "", text)
return text.strip()
def remove_repeated_substrings(text, pattern=r"\.{2,}"):
text = re.sub(pattern, ".", text)
return text.strip()
def remove_extra_spaces(text):
text = re.sub(r"\n\s*\n", "\n\n", text)
text = re.sub(r"\s+", " ", text)
return text.strip()
def preprocess_text(text):
# Remove headers and footers
text = remove_headers_footers(text)
# Remove special characters
text = remove_special_characters(text)
# Remove repeated substrings like dots
text = remove_repeated_substrings(text)
# Remove extra spaces between lines and within lines
text = remove_extra_spaces(text)
# Additional cleaning steps can be added here
return text.strip()
class Chunk:
def __init__(
self,
text: str,
size: int,
document_id: UUID,
chunk_id: int,
embedding,
):
self.text = text
self.size = size
self.document_id = document_id
self.chunk_id = chunk_id
self.embedding = embedding
class Chunker:
embedding_fx = OllamaEmbeddingFunction(
url=os.getenv("OLLAMA_URL", ""),
model_name="mxbai-embed-large",
)
def __init__(self, collection) -> None:
self.collection = collection
def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
doc_uuid = uuid4()
chunk_size = min(chunk_size, len(document))
chunks = []
num_chunks = ceil(len(document) / chunk_size)
document_length = len(document)
for i in range(num_chunks):
curr_pos = i * chunk_size  # start offset of the i-th chunk
to_pos = (
curr_pos + chunk_size
if curr_pos + chunk_size < document_length
else document_length
)
text_chunk = self.clean_document(document[curr_pos:to_pos])
embedding = self.embedding_fx([text_chunk])
self.collection.add(
ids=[str(doc_uuid) + ":" + str(i)],
documents=[text_chunk],
embeddings=embedding,
)
return chunks
def clean_document(self, document: str) -> str:
"""This function will remove information that is noise or already known.
Example: We already know all the things in here are Simba-related, so we don't need things like
"Sumamry of simba's visit"
"""
document = document.replace("\\n", "")
document = document.strip()
return preprocess_text(document)
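A minimal usage sketch for the Chunker above (illustration only, not part of this diff). It assumes OLLAMA_URL in .env points at a running Ollama instance with the mxbai-embed-large model pulled, and that ./chromadb is a writable path; note that chunk_document stores chunks in the collection but still returns an empty list, so callers should query the collection rather than rely on the return value.

# Illustration only, not part of this PR. Assumes OLLAMA_URL is set and ./chromadb is writable.
import chromadb
from chunker import Chunker

client = chromadb.PersistentClient(path="./chromadb")
collection = client.get_or_create_collection(name="simba_docs")

chunker = Chunker(collection)
chunker.chunk_document("Simba was seen on 2025-03-02. Two teeth were extracted.", chunk_size=1000)

# Query with the same Ollama embedding function used for indexing.
question = "How many teeth has Simba had removed?"
results = collection.query(
    query_embeddings=Chunker.embedding_fx([question]),
    n_results=3,
)
print(results["documents"])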

160 main.py

@@ -1,102 +1,84 @@
-import ollama
+import logging
 import os
-from uuid import uuid4, UUID
+import argparse
+import chromadb
+import ollama
 from request import PaperlessNGXService
+from chunker import Chunker
-from math import ceil
-import chromadb
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 from dotenv import load_dotenv
-
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
 
 load_dotenv()
+
+client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
+)
 
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
-        )
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-        return chunks
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-print(output["response"])
+
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts:
+        chunker.chunk_document(document=text)
+
+
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
+        )
+    else:
+        print("please provide a query")
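Given the argparse setup in the new main.py, the tool is presumably run as python main.py --reindex "How many teeth has Simba had removed?" to index first and then query, or with just the query once the ChromaDB store is populated. The indexing path can also be driven from Python; a rough sketch, assuming CHROMADB_PATH, OLLAMA_URL, and the Paperless-NGX credentials are set in .env:

# Rough sketch, not part of the diff. Importing main is safe because parse_args()
# only runs under the __main__ guard; importing just sets up the client and parser.
from main import chunk_data, simba_docs
from request import PaperlessNGXService

docs = PaperlessNGXService().get_data()
chunk_data([doc["content"] for doc in docs], collection=simba_docs)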

24 petmd_scrape_index.py Normal file

@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text)
container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)
new_texts = []
for link in a_s:
endpoint = link["href"]
query_url = BASE_URL + endpoint
r2 = httpx.get(query_url)
article_soup = BeautifulSoup(r2.text)
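The scrape loop above stops after parsing each article page and never fills new_texts (the remaining lines of the 24-line file, if any, are not shown here). As a sketch of how the loop could be finished, which is an assumption rather than the file's contents, the article text could be collected into new_texts and indexed with the Chunker from this PR into the feline_vet_lookup collection that main.py creates:

# Sketch only, not the file's remaining lines. Reuses BASE_URL, a_s, new_texts, and
# client from above, plus the Chunker added in this PR.
from chunker import Chunker

for link in a_s:
    r2 = httpx.get(BASE_URL + link["href"])
    article_soup = BeautifulSoup(r2.text, "html.parser")
    new_texts.append(article_soup.get_text(separator=" ", strip=True))

chunker = Chunker(client.get_or_create_collection(name="feline_vet_lookup"))
for text in new_texts:
    chunker.chunk_document(document=text)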

98 query.py Normal file

@@ -0,0 +1,98 @@
import json
from typing import Literal
from ollama import chat, ChatResponse
from pydantic import BaseModel, Field
# This uses inferred filters — which means using LLM to create the metadata filters
class FilterOperation(BaseModel):
op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
value: str | list[str]
class FilterQuery(BaseModel):
field_name: Literal["created_date", "tags"]
op: FilterOperation
class AndQuery(BaseModel):
op: Literal["$and", "$or"]
subqueries: list[FilterQuery]
class GeneratedQuery(BaseModel):
fields: list[str]
extracted_metadata_fields: str
PROMPT = """
You are an information specialist that processes user queries. The user queries are all about
a cat, Simba, and its records. The types of records are listed below. Using the query, extract the
type of record the user is trying to query and the date range the user is trying to query.
You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in
Logical operators:
- $and, $or
### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: "{"created_date, tags"}"
Extracted metadata fields: {"$and": [{"created_date: {"$gt": "2025-01-01"}, "tags": {"$in": ["bill", "medical records", "aftercare"]}}]}
### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {"tags"}
Extracted metadata fields: {"tags": "medical records"}
### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"tags", "created_date"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}, "tags": {"$in": ["bill"]}}]}
document_types:
- aftercare
- bill
- insurance claim
- medical records
Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON
"""
class QueryGenerator:
def __init__(self) -> None:
pass
def get_query(self, input: str):
response: ChatResponse = chat(
model="gemma3n:e4b",
messages=[
{"role": "system", "content": PROMPT},
{"role": "user", "content": input},
],
format=GeneratedQuery.model_json_schema(),
)
print(
json.loads(
json.loads(response["message"]["content"])["extracted_metadata_fields"]
)
)
if __name__ == "__main__":
qg = QueryGenerator()
qg.get_query("How old is Simba?")
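The extracted metadata fields are shaped like a ChromaDB where clause, so the intended use is presumably to combine them with the embedding search over the Simba collections. A minimal sketch, assuming get_query is adapted to return the parsed filter dict instead of printing it, and that chunks are stored with created_date and tags metadata (chunker.py does not add metadata yet):

# Sketch only. Assumes get_query() returns the parsed filter dict and that documents
# in the collection carry "created_date" and "tags" metadata.
import chromadb
from chunker import Chunker
from query import QueryGenerator

client = chromadb.PersistentClient(path="./chromadb")  # assumed path
simba_docs = client.get_or_create_collection(name="simba_docs")

question = "How many times has Simba been to the vet this year?"
where_filter = QueryGenerator().get_query(question)

results = simba_docs.query(
    query_embeddings=Chunker.embedding_fx([question]),
    where=where_filter,  # ChromaDB applies the $and/$gt/$in operators as metadata filters
    n_results=5,
)
print(results["documents"])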

request.py

@@ -21,4 +21,4 @@ class PaperlessNGXService:
 if __name__ == "__main__":
     pp = PaperlessNGXService()
-    print(pp.get_data()[0].keys())
+    pp.get_data()
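Elsewhere in this PR, main.py treats get_data() as returning a list of dicts that each expose at least a "content" key; a small sketch of that assumed contract:

# Sketch of the contract assumed by main.py: each returned dict has a "content" field.
from request import PaperlessNGXService

pp = PaperlessNGXService()
for doc in pp.get_data():
    print(len(doc["content"]))  # e.g. inspect document sizes before chunking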