7 Commits

Author     SHA1        Message                       Date
Ryan Chen  e577cb335b  query classification          2025-10-26 17:29:00 -04:00
Ryan Chen  591788dfa4  reindex pls                   2025-10-26 11:06:32 -04:00
Ryan Chen  561b5bddce  reindex pls                   2025-10-26 11:04:33 -04:00
Ryan Chen  ddd455a4c6  reindex pls                   2025-10-26 11:02:51 -04:00
ryan       07424e77e0  Merge pull request 'favicon' (#7) from update-favicon-and-title into main (Reviewed-on: #7)  2025-10-26 10:49:27 -04:00
Ryan Chen  a56f752917  favicon                       2025-10-26 10:48:59 -04:00
Ryan Chen  e8264e80ce  Changing DB thing             2025-10-26 09:36:33 -04:00
10 changed files with 147 additions and 45 deletions


@@ -24,7 +24,6 @@ RUN uv pip install --system -e .
 # Copy application code
 COPY *.py ./
 COPY blueprints ./blueprints
-COPY aerich.toml ./
 COPY migrations ./migrations
 COPY startup.sh ./
 RUN chmod +x startup.sh
@@ -35,8 +34,8 @@ WORKDIR /app/raggr-frontend
 RUN yarn install && yarn build
 WORKDIR /app
-# Create ChromaDB directory
-RUN mkdir -p /app/chromadb
+# Create ChromaDB and database directories
+RUN mkdir -p /app/chromadb /app/database
 # Expose port
 EXPOSE 8080
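Note on the new /app/database directory: SQLite will not create a missing parent directory, so the db_url and sqlite3.connect() changes to database/raggr.db and database/visited.db further down only work because this RUN mkdir -p line (and the named volume added in docker-compose below) guarantees the directory exists. A minimal illustration in Python, with a local ./database path assumed for a non-Docker run:

    import os
    import sqlite3

    # sqlite3 raises "unable to open database file" if the parent directory is
    # missing, so create ./database first (mirrors `mkdir -p /app/database`).
    os.makedirs("database", exist_ok=True)
    conn = sqlite3.connect("database/visited.db")
    conn.close()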


@@ -10,7 +10,7 @@ from blueprints.users.models import User
 async def add_user(username: str, email: str, password: str):
     """Add a new user to the database"""
     await Tortoise.init(
-        db_url="sqlite://raggr.db",
+        db_url="sqlite://database/raggr.db",
         modules={
             "models": [
                 "blueprints.users.models",
@@ -56,7 +56,7 @@ async def add_user(username: str, email: str, password: str):
 async def list_users():
     """List all users in the database"""
     await Tortoise.init(
-        db_url="sqlite://raggr.db",
+        db_url="sqlite://database/raggr.db",
         modules={
             "models": [
                 "blueprints.users.models",


@@ -1,7 +1,7 @@
 import os
 TORTOISE_ORM = {
-    "connections": {"default": os.getenv("DATABASE_URL", "sqlite:///app/raggr.db")},
+    "connections": {"default": os.getenv("DATABASE_URL", "sqlite:///app/database/raggr.db")},
     "apps": {
         "models": {
             "models": [

app.py

@@ -27,7 +27,7 @@ app.register_blueprint(blueprints.conversation.conversation_blueprint)
 TORTOISE_CONFIG = {
-    "connections": {"default": "sqlite://raggr.db"},
+    "connections": {"default": "sqlite://database/raggr.db"},
     "apps": {
         "models": {
             "models": [


@@ -12,6 +12,8 @@ services:
       - OPENAI_API_KEY=${OPENAI_API_KEY}
     volumes:
       - chromadb_data:/app/chromadb
+      - database_data:/app/database
 volumes:
   chromadb_data:
+  database_data:
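With the named volume added above, everything under /app/database (both raggr.db and visited.db after the path changes in this changeset) should survive container rebuilds. A trivial sanity-check sketch one could run inside the container (hypothetical check, not part of the change):

    from pathlib import Path

    # Both SQLite files now live on the database_data volume mounted at /app/database.
    for name in ("raggr.db", "visited.db"):
        print(name, "exists:", (Path("/app/database") / name).exists())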


@@ -27,7 +27,7 @@ headers = {"x-api-key": API_KEY, "Content-Type": "application/json"}
 VISITED = {}
 if __name__ == "__main__":
-    conn = sqlite3.connect("./visited.db")
+    conn = sqlite3.connect("./database/visited.db")
     c = conn.cursor()
     c.execute("select immich_id from visited")
     rows = c.fetchall()

main.py

@@ -7,6 +7,8 @@ import argparse
 import chromadb
 import ollama
+import time
 from request import PaperlessNGXService
 from chunker import Chunker
@@ -36,6 +38,7 @@ parser.add_argument("query", type=str, help="questions about simba's health")
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
+parser.add_argument("--classify", action="store_true", help="test classification")
 parser.add_argument("--index", help="index a file")
 ppngx = PaperlessNGXService()
@@ -77,7 +80,7 @@ def chunk_data(docs, collection, doctypes):
     logging.info(f"chunking {len(docs)} documents")
     texts: list[str] = [doc["content"] for doc in docs]
-    with sqlite3.connect("visited.db") as conn:
+    with sqlite3.connect("database/visited.db") as conn:
         to_insert = []
         c = conn.cursor()
         for index, text in enumerate(texts):
@@ -113,13 +116,22 @@ def chunk_text(texts: list[str], collection):
     )
+def classify_query(query: str, transcript: str) -> bool:
+    logging.info("Starting query generation")
+    qg_start = time.time()
+    qg = QueryGenerator()
+    query_type = qg.get_query_type(input=query, transcript=transcript)
+    logging.info(query_type)
+    qg_end = time.time()
+    logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")
+    return query_type == "Simba"
 def consult_oracle(
     input: str,
     collection,
     transcript: str = "",
 ):
-    import time
     chunker = Chunker(collection)
     start_time = time.time()
@@ -171,6 +183,16 @@ def consult_oracle(
     return output
+def llm_chat(input: str, transcript: str = "") -> str:
+    system_prompt = "You are a helpful assistant that understands veterinary terms."
+    transcript_prompt = f"Here is the message transcript thus far {transcript}."
+    prompt = f"""Answer the user in a humorous way as if you were a cat named Simba. Be very coy.
+    {transcript_prompt if len(transcript) > 0 else ""}
+    Respond to this prompt: {input}"""
+    output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
+    return output
 def paperless_workflow(input):
     # Step 1: Get the text
     ppngx = PaperlessNGXService()
@@ -181,15 +203,23 @@ def paperless_workflow(input):
 def consult_simba_oracle(input: str, transcript: str = ""):
-    return consult_oracle(
-        input=input,
-        collection=simba_docs,
-        transcript=transcript,
-    )
+    is_simba_related = classify_query(query=input, transcript=transcript)
+    if is_simba_related:
+        logging.info("Query is related to simba")
+        return consult_oracle(
+            input=input,
+            collection=simba_docs,
+            transcript=transcript,
+        )
+    logging.info("Query is NOT related to simba")
+    return llm_chat(input=input, transcript=transcript)
 def filter_indexed_files(docs):
-    with sqlite3.connect("visited.db") as conn:
+    with sqlite3.connect("database/visited.db") as conn:
         c = conn.cursor()
         c.execute(
             "CREATE TABLE IF NOT EXISTS indexed_documents (id INTEGER PRIMARY KEY AUTOINCREMENT, paperless_id INTEGER)"
@@ -202,38 +232,45 @@ def filter_indexed_files(docs):
     return [doc for doc in docs if doc["id"] not in visited]
+def reindex():
+    with sqlite3.connect("database/visited.db") as conn:
+        c = conn.cursor()
+        c.execute("DELETE FROM indexed_documents")
+        conn.commit()
+    # Delete all documents from the collection
+    all_docs = simba_docs.get()
+    if all_docs["ids"]:
+        simba_docs.delete(ids=all_docs["ids"])
+    logging.info("Fetching documents from Paperless-NGX")
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    docs = filter_indexed_files(docs)
+    logging.info(f"Fetched {len(docs)} documents")
+    # Delete all chromadb data
+    ids = simba_docs.get(ids=None, limit=None, offset=0)
+    all_ids = ids["ids"]
+    if len(all_ids) > 0:
+        simba_docs.delete(ids=all_ids)
+    # Chunk documents
+    logging.info("Chunking documents now ...")
+    doctype_lookup = ppngx.get_doctypes()
+    chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
+    logging.info("Done chunking documents")
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.reindex:
-        logging.info("Fetching documents from Paperless-NGX")
-        ppngx = PaperlessNGXService()
-        docs = ppngx.get_data()
-        docs = filter_indexed_files(docs)
-        logging.info(f"Fetched {len(docs)} documents")
-        # Delete all chromadb data
-        ids = simba_docs.get(ids=None, limit=None, offset=0)
-        all_ids = ids["ids"]
-        if len(all_ids) > 0:
-            simba_docs.delete(ids=all_ids)
-        # Chunk documents
-        logging.info("Chunking documents now ...")
-        tag_lookup = ppngx.get_tags()
-        doctype_lookup = ppngx.get_doctypes()
-        chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
-        logging.info("Done chunking documents")
-    # if args.index:
-    #     with open(args.index) as file:
-    #         extension = args.index.split(".")[-1]
-    #         if extension == "pdf":
-    #             pdf_path = ppngx.download_pdf_from_id(id=document_id)
-    #             image_paths = pdf_to_image(filepath=pdf_path)
-    #             print(f"summarizing {file}")
-    #             generated_summary = summarize_pdf_image(filepaths=image_paths)
-    #         elif extension in [".md", ".txt"]:
-    #             chunk_text(texts=[file.readall()], collection=simba_docs)
+        reindex()
+    if args.classify:
+        consult_simba_oracle(input="yohohoho testing")
+        consult_simba_oracle(input="write an email")
+        consult_simba_oracle(input="how much does simba weigh")
     if args.query:
         logging.info("Consulting oracle ...")


@@ -49,11 +49,20 @@ DOCTYPE_OPTIONS = [
     "Letter",
 ]
+QUERY_TYPE_OPTIONS = [
+    "Simba",
+    "Other",
+]
 class DocumentType(BaseModel):
     type: list[str] = Field(description="type of document", enum=DOCTYPE_OPTIONS)
+class QueryType(BaseModel):
+    type: str = Field(desciption="type of query", enum=QUERY_TYPE_OPTIONS)
 PROMPT = """
 You are an information specialist that processes user queries. The current year is 2025. The user queries are all about
 a cat, Simba, and its records. The types of records are listed below. Using the query, extract the
@@ -111,6 +120,27 @@ Query: "Who does Simba know?"
 Tags: ["Letter", "Documentation"]
 """
+QUERY_TYPE_PROMPT = f"""You are an information specialist that processes user queries.
+A query can have one tag attached from the following options. Based on the query and the transcript which is listed below, determine
+which of the following options is most appropriate: {",".join(QUERY_TYPE_OPTIONS)}
+### Example 1
+Query: "Who is Simba's current vet?"
+Tags: ["Simba"]
+### Example 2
+Query: "What is the capital of Tokyo?"
+Tags: ["Other"]
+### Example 3
+Query: "Can you help me write an email?"
+Tags: ["Other"]
+TRANSCRIPT:
+"""
 class QueryGenerator:
     def __init__(self) -> None:
@@ -154,6 +184,33 @@ class QueryGenerator:
         metadata_query = {"document_type": {"$in": type_data["type"]}}
         return metadata_query
+    def get_query_type(self, input: str, transcript: str):
+        client = OpenAI()
+        response = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an information specialist that is really good at deciding what tags a query should have",
+                },
+                {
+                    "role": "user",
+                    "content": f"{QUERY_TYPE_PROMPT}\nTRANSCRIPT:\n{transcript}\nQUERY:{input}",
+                },
+            ],
+            model="gpt-4o",
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "query_type",
+                    "schema": QueryType.model_json_schema(),
+                },
+            },
+        )
+        response_json_str = response.choices[0].message.content
+        type_data = json.loads(response_json_str)
+        return type_data["type"]
     def get_query(self, input: str):
         client = OpenAI()
         response = client.responses.parse(
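A minimal sketch of exercising the new classifier by hand (the module name in the import is an assumption; only QueryGenerator, get_query_type, and QUERY_TYPE_OPTIONS appear in the diff, and OPENAI_API_KEY must be set):

    from query import QUERY_TYPE_OPTIONS, QueryGenerator  # assumed module name

    qg = QueryGenerator()
    # The json_schema response_format asks the model for an object like
    # {"type": "Simba"}, which get_query_type unpacks into a plain label,
    # normally one of QUERY_TYPE_OPTIONS ("Simba" or "Other").
    label = qg.get_query_type(input="how much does simba weigh", transcript="")
    print(label)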


@@ -3,4 +3,8 @@ import { pluginReact } from '@rsbuild/plugin-react';
 export default defineConfig({
   plugins: [pluginReact()],
+  html: {
+    title: 'Raggr',
+    favicon: './src/assets/favicon.svg',
+  },
 });


@@ -0,0 +1,3 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
+  <text y="80" font-size="80" font-family="system-ui, -apple-system, sans-serif">🐱</text>
+</svg>
