Adding some funny stuff
chunker.py (new file, 127 lines)
@@ -0,0 +1,127 @@
import os
from math import ceil
import re
from uuid import UUID, uuid4

from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
from dotenv import load_dotenv


load_dotenv()


def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    if header_patterns is None:
        header_patterns = [r"^.*Header.*$"]
    if footer_patterns is None:
        footer_patterns = [r"^.*Footer.*$"]

    for pattern in header_patterns + footer_patterns:
        text = re.sub(pattern, "", text, flags=re.MULTILINE)

    return text.strip()


def remove_special_characters(text, special_chars=None):
    if special_chars is None:
        special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"

    text = re.sub(special_chars, "", text)
    return text.strip()


def remove_repeated_substrings(text, pattern=r"\.{2,}"):
    text = re.sub(pattern, ".", text)
    return text.strip()


def remove_extra_spaces(text):
    text = re.sub(r"\n\s*\n", "\n\n", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()


def preprocess_text(text):
    # Remove headers and footers
    text = remove_headers_footers(text)

    # Remove special characters
    text = remove_special_characters(text)

    # Remove repeated substrings like dots
    text = remove_repeated_substrings(text)

    # Remove extra spaces between lines and within lines
    text = remove_extra_spaces(text)

    # Additional cleaning steps can be added here

    return text.strip()


class Chunk:
    def __init__(
        self,
        text: str,
        size: int,
        document_id: UUID,
        chunk_id: int,
        embedding,
    ):
        self.text = text
        self.size = size
        self.document_id = document_id
        self.chunk_id = chunk_id
        self.embedding = embedding


class Chunker:
    embedding_fx = OllamaEmbeddingFunction(
        url=os.getenv("OLLAMA_URL", ""),
        model_name="mxbai-embed-large",
    )

    def __init__(self, collection) -> None:
        self.collection = collection

    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
        doc_uuid = uuid4()

        chunk_size = min(chunk_size, len(document))

        chunks = []
        num_chunks = ceil(len(document) / chunk_size)
        document_length = len(document)

        for i in range(num_chunks):
            # Advance by chunk_size (not num_chunks) so the chunks tile the document
            curr_pos = i * chunk_size
            to_pos = (
                curr_pos + chunk_size
                if curr_pos + chunk_size < document_length
                else document_length
            )
            text_chunk = self.clean_document(document[curr_pos:to_pos])

            embedding = self.embedding_fx([text_chunk])
            self.collection.add(
                ids=[str(doc_uuid) + ":" + str(i)],
                documents=[text_chunk],
                embeddings=embedding,
            )
            # Track the chunk so callers get the list the signature promises
            chunks.append(
                Chunk(
                    text=text_chunk,
                    size=len(text_chunk),
                    document_id=doc_uuid,
                    chunk_id=i,
                    embedding=embedding,
                )
            )

        return chunks

    def clean_document(self, document: str) -> str:
        """Remove information that is noise or already known.

        Example: we already know everything in here is Simba-related, so we don't
        need headings like "Summary of Simba's visit".
        """

        document = document.replace("\\n", "")
        document = document.strip()

        return preprocess_text(document)
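For a concrete sense of the offset arithmetic in chunk_document (with the stride corrected to chunk_size), here is a minimal sketch with illustrative numbers only, no embedding or ChromaDB calls:

from math import ceil

document = "x" * 2500
chunk_size = 1000

num_chunks = ceil(len(document) / chunk_size)  # 3
for i in range(num_chunks):
    curr_pos = i * chunk_size                  # 0, 1000, 2000
    to_pos = min(curr_pos + chunk_size, len(document))
    print(i, curr_pos, to_pos)                 # the last chunk is the 500-char remainder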
main.py (136 changed lines)
@@ -1,102 +1,84 @@
+import logging
+
+import argparse
+import chromadb
 import ollama
-import os
-from uuid import uuid4, UUID
 
 from request import PaperlessNGXService
+from chunker import Chunker
-
-from math import ceil
-
-import chromadb
-
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
-)
 
 from dotenv import load_dotenv
 
-client = chromadb.EphemeralClient()
-collection = client.create_collection(name="docs")
+client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
+simba_docs = client.get_or_create_collection(name="simba_docs")
+feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
+
+parser = argparse.ArgumentParser(
+    description="An LLM tool to query information about Simba <3"
+)
+
+parser.add_argument("query", type=str, help="questions about simba's health")
+parser.add_argument(
+    "--reindex", action="store_true", help="re-index the simba documents"
+)
 
 load_dotenv()
 
 
-class Chunk:
-    def __init__(
-        self,
-        text: str,
-        size: int,
-        document_id: UUID,
-        chunk_id: int,
-        embedding,
-    ):
-        self.text = text
-        self.size = size
-        self.document_id = document_id
-        self.chunk_id = chunk_id
-        self.embedding = embedding
-
-
-class Chunker:
-    def __init__(self) -> None:
-        self.embedding_fx = OllamaEmbeddingFunction(
-            url=os.getenv("OLLAMA_URL", ""),
-            model_name="mxbai-embed-large",
-        )
-
-        pass
-
-    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
-        doc_uuid = uuid4()
-
-        chunks = []
-        num_chunks = ceil(len(document) / chunk_size)
-        document_length = len(document)
-
-        for i in range(num_chunks):
-            curr_pos = i * num_chunks
-            to_pos = (
-                curr_pos + num_chunks
-                if curr_pos + num_chunks < document_length
-                else document_length
-            )
-            text_chunk = document[curr_pos:to_pos]
-
-            embedding = self.embedding_fx([text_chunk])
-            collection.add(
-                ids=[str(doc_uuid) + ":" + str(i)],
-                documents=[text_chunk],
-                embeddings=embedding,
-            )
-
-        return chunks
-
-
-embedding_fx = OllamaEmbeddingFunction(
-    url=os.getenv("OLLAMA_URL", ""),
-    model_name="mxbai-embed-large",
-)
-
-# Step 1: Get the text
-ppngx = PaperlessNGXService()
-docs = ppngx.get_data()
-texts = [doc["content"] for doc in docs]
-
-# Step 2: Create chunks
-chunker = Chunker()
-
-print(f"chunking {len(texts)} documents")
-for text in texts:
-    chunker.chunk_document(document=text)
-
-# Ask
-input = "How many teeth has Simba had removed? Who is his current vet?"
-embeddings = embedding_fx(input=[input])
-results = collection.query(query_texts=[input], query_embeddings=embeddings)
-print(results)
-
-# Generate
-output = ollama.generate(
-    model="gemma3n:e4b",
-    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
-)
-
-print(output["response"])
+def chunk_data(texts: list[str], collection):
+    # Step 2: Create chunks
+    chunker = Chunker(collection)
+
+    print(f"chunking {len(texts)} documents")
+    for text in texts[: len(texts) // 2]:
+        chunker.chunk_document(document=text)
+
+
+def consult_oracle(input: str, collection):
+    # Ask
+    embeddings = Chunker.embedding_fx(input=[input])
+    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+    print(results)
+
+    # Generate
+    output = ollama.generate(
+        model="gemma3n:e4b",
+        prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    )
+
+    print(output["response"])
+
+
+def paperless_workflow(input):
+    # Step 1: Get the text
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    texts = [doc["content"] for doc in docs]
+
+    chunk_data(texts, collection=simba_docs)
+    consult_oracle(input, simba_docs)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if args.reindex:
+        logging.info(msg="Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        texts = [doc["content"] for doc in docs]
+        logging.info(msg=f"Fetched {len(texts)} documents")
+
+        logging.info(msg="Chunking documents now ...")
+        chunk_data(texts, collection=simba_docs)
+        logging.info(msg="Done chunking documents")
+
+    if args.query:
+        logging.info("Consulting oracle ...")
+        consult_oracle(
+            input=args.query,
+            collection=simba_docs,
+        )
+    else:
+        print("please provide a query")
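With the argparse setup above, the tool is driven from the command line. Assuming the .env file loaded by load_dotenv supplies the Ollama and Paperless-NGX settings that chunker.py and request.py expect, a typical run would be `python main.py --reindex "How many teeth has Simba had removed?"` to re-chunk the documents and then query them, or `python main.py "Who is Simba's current vet?"` to query the existing simba_docs collection.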
petmd_scrape_index.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")

# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"

QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text, "html.parser")

container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)

new_texts = []

for link in a_s:
    endpoint = link["href"]
    query_url = BASE_URL + endpoint
    r2 = httpx.get(query_url)
    article_soup = BeautifulSoup(r2.text, "html.parser")
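The scraper stops after parsing each linked article, and the commit never fills new_texts. A plausible continuation, purely an assumption and not part of this commit, would extract the article text inside the for link in a_s: loop so it can later be chunked into a collection such as feline_vet_lookup:

    # Hypothetical continuation (not in this commit): collect the article body text
    paragraphs = article_soup.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
    new_texts.append(article_text)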
query.py (new file, 98 lines)
@@ -0,0 +1,98 @@
import json
from typing import Literal

from ollama import chat, ChatResponse

from pydantic import BaseModel, Field

# This uses inferred filters: an LLM creates the metadata filters


class FilterOperation(BaseModel):
    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
    value: str | list[str]


class FilterQuery(BaseModel):
    field_name: Literal["created_date", "tags"]
    op: FilterOperation


class AndQuery(BaseModel):
    op: Literal["$and", "$or"]
    subqueries: list[FilterQuery]


class GeneratedQuery(BaseModel):
    fields: list[str]
    extracted_metadata_fields: str


PROMPT = """
You are an information specialist that processes user queries. The user queries are all about
a cat, Simba, and his records. The types of records are listed below. Using the query, extract the
type of record the user is trying to query and the date range the user is trying to query.

You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in

Logical operators:
- $and, $or

### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: {"created_date", "tags"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill", "medical records", "aftercare"]}}]}

### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {"tags"}
Extracted metadata fields: {"tags": "medical records"}

### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"tags", "created_date"}
Extracted metadata fields: {"$and": [{"created_date": {"$gt": "2025-01-01"}}, {"tags": {"$in": ["bill"]}}]}

document_types:
- aftercare
- bill
- insurance claim
- medical records

Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
"""


class QueryGenerator:
    def __init__(self) -> None:
        pass

    def get_query(self, input: str):
        response: ChatResponse = chat(
            model="gemma3n:e4b",
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": input},
            ],
            format=GeneratedQuery.model_json_schema(),
        )

        print(
            json.loads(
                json.loads(response["message"]["content"])["extracted_metadata_fields"]
            )
        )


if __name__ == "__main__":
    qg = QueryGenerator()
    qg.get_query("How old is Simba?")
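The extracted metadata fields follow ChromaDB's where-filter syntax, but get_query only prints the parsed result. A sketch of how such a filter might be applied, assuming get_query were changed to return the parsed dictionary (that return value and the example filter below are assumptions, not part of this commit):

# Hypothetical wiring (not in this commit): feed the generated filter into a ChromaDB query.
import chromadb

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
simba_docs = client.get_or_create_collection(name="simba_docs")

where_filter = {"tags": {"$in": ["bill", "medical records"]}}  # e.g. what get_query might return

results = simba_docs.query(
    query_texts=["How many times has Simba been to the vet this year?"],
    where=where_filter,
    n_results=5,
)
print(results["documents"])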