reorganization

2026-01-31 17:13:27 -05:00
parent 1fd2e860b2
commit ad39904dda
87 changed files with 1019 additions and 237 deletions

scripts/__init__.py Normal file (0 additions)

scripts/add_user.py Normal file (146 additions)

@@ -0,0 +1,146 @@
# GENERATED BY CLAUDE
import os
import sys
import uuid
import asyncio
from tortoise import Tortoise
from blueprints.users.models import User
from dotenv import load_dotenv
load_dotenv()
# Database configuration with environment variable support
DATABASE_PATH = os.getenv("DATABASE_PATH", "database/raggr.db")
DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite://{DATABASE_PATH}")
print(DATABASE_URL)
async def add_user(username: str, email: str, password: str):
"""Add a new user to the database"""
await Tortoise.init(
db_url=DATABASE_URL,
modules={
"models": [
"blueprints.users.models",
"blueprints.conversation.models",
]
},
)
try:
# Check if user already exists
existing_user = await User.filter(email=email).first()
if existing_user:
print(f"Error: User with email '{email}' already exists!")
return False
existing_username = await User.filter(username=username).first()
if existing_username:
print(f"Error: Username '{username}' is already taken!")
return False
# Create new user
user = User(
id=uuid.uuid4(),
username=username,
email=email,
)
user.set_password(password)
await user.save()
print("✓ User created successfully!")
print(f" Username: {username}")
print(f" Email: {email}")
print(f" ID: {user.id}")
return True
except Exception as e:
print(f"Error creating user: {e}")
return False
finally:
await Tortoise.close_connections()
async def list_users():
"""List all users in the database"""
await Tortoise.init(
db_url=DATABASE_URL,
modules={
"models": [
"blueprints.users.models",
"blueprints.conversation.models",
]
},
)
try:
users = await User.all()
if not users:
print("No users found in database.")
return
print(f"\nFound {len(users)} user(s):")
print("-" * 60)
for user in users:
print(f"Username: {user.username}")
print(f"Email: {user.email}")
print(f"ID: {user.id}")
print(f"Created: {user.created_at}")
print("-" * 60)
except Exception as e:
print(f"Error listing users: {e}")
finally:
await Tortoise.close_connections()
def print_usage():
"""Print usage instructions"""
print("Usage:")
print(" python add_user.py add <username> <email> <password>")
print(" python add_user.py list")
print("\nExamples:")
print(" python add_user.py add ryan ryan@example.com mypassword123")
print(" python add_user.py list")
print("\nEnvironment Variables:")
print(" DATABASE_PATH - Path to database file (default: database/raggr.db)")
print(" DATABASE_URL - Full database URL (overrides DATABASE_PATH)")
print("\n Example with custom database:")
print(" DATABASE_PATH=dev.db python add_user.py list")
async def main():
if len(sys.argv) < 2:
print_usage()
sys.exit(1)
command = sys.argv[1].lower()
if command == "add":
if len(sys.argv) != 5:
print("Error: Missing arguments for 'add' command")
print_usage()
sys.exit(1)
username = sys.argv[2]
email = sys.argv[3]
password = sys.argv[4]
success = await add_user(username, email, password)
sys.exit(0 if success else 1)
elif command == "list":
await list_users()
sys.exit(0)
else:
print(f"Error: Unknown command '{command}'")
print_usage()
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())

scripts/index_immich.py Normal file (118 additions)

@@ -0,0 +1,118 @@
import logging
import os
import sqlite3
import httpx
from dotenv import load_dotenv
import sys
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.image_process import describe_simba_image
from utils.request import PaperlessNGXService
logging.basicConfig(level=logging.INFO)
load_dotenv()
# Configuration from environment variables
IMMICH_URL = os.getenv("IMMICH_URL", "http://localhost:2283")
API_KEY = os.getenv("IMMICH_API_KEY")
PERSON_NAME = os.getenv("PERSON_NAME", "Simba") # Name of the tagged person/pet
DOWNLOAD_DIR = os.getenv("DOWNLOAD_DIR", "./simba_photos")
# Set up headers
headers = {"x-api-key": API_KEY, "Content-Type": "application/json"}
VISITED = set()  # immich asset ids already processed (loaded from sqlite below)
if __name__ == "__main__":
conn = sqlite3.connect("./database/visited.db")
c = conn.cursor()
c.execute("select immich_id from visited")
rows = c.fetchall()
for row in rows:
VISITED.add(row[0])
ppngx = PaperlessNGXService()
    people_url = f"{IMMICH_URL}/api/search/person?name={PERSON_NAME}"
people = httpx.get(people_url, headers=headers).json()
simba_id = people[0]["id"]
ids = {}
asset_search = f"{IMMICH_URL}/api/search/smart"
request_body = {"query": "orange cat"}
results = httpx.post(asset_search, headers=headers, json=request_body)
assets = results.json()["assets"]
for asset in assets["items"]:
if asset["type"] == "IMAGE" and asset["id"] not in VISITED:
ids[asset["id"]] = asset.get("originalFileName")
nextPage = assets.get("nextPage")
# while nextPage != None:
# logging.info(f"next page: {nextPage}")
# request_body["page"] = nextPage
# results = httpx.post(asset_search, headers=headers, json=request_body)
# assets = results.json()["assets"]
# for asset in assets["items"]:
# if asset["type"] == "IMAGE":
# ids.add(asset['id'])
# nextPage = assets.get("nextPage")
asset_search = f"{IMMICH_URL}/api/search/smart"
request_body = {"query": "simba"}
results = httpx.post(asset_search, headers=headers, json=request_body)
for asset in results.json()["assets"]["items"]:
if asset["type"] == "IMAGE":
ids[asset["id"]] = asset.get("originalFileName")
for immich_asset_id, immich_filename in ids.items():
try:
response = httpx.get(
f"{IMMICH_URL}/api/assets/{immich_asset_id}/original", headers=headers
)
path = os.path.join("/Users/ryanchen/Programs/raggr", immich_filename)
            # Write the download to disk and close the handle before processing
            with open(path, "wb+") as image_file:
                for chunk in response.iter_bytes(chunk_size=8192):
                    image_file.write(chunk)
logging.info("Processing image ...")
description = describe_simba_image(path)
image_description = description.description
image_date = description.image_date
description_filepath = os.path.join(
"/Users/ryanchen/Programs/raggr", "SIMBA_DESCRIBE_001.txt"
)
file = open(description_filepath, "w+")
file.write(image_description)
file.close()
file = open(description_filepath, "rb")
ppngx.upload_description(
description_filepath=description_filepath,
file=file,
title="SIMBA_DESCRIBE_001.txt",
exif_date=image_date,
)
file.close()
c.execute("INSERT INTO visited (immich_id) values (?)", (immich_asset_id,))
conn.commit()
logging.info("Processing complete. Deleting file.")
os.remove(file.name)
        except Exception as e:
            logging.error(f"something went wrong for {immich_filename}")
            logging.error(e)
conn.close()
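The commented-out pagination loop above never made it into the run path, and as written it calls ids.add(...) on what is a dict elsewhere in the script. A minimal corrected sketch, assuming the smart-search endpoint keeps returning a nextPage token until results are exhausted:

# Sketch of the pagination loop the comments gesture at (not in the original run path)
next_page = assets.get("nextPage")
while next_page is not None:
    logging.info(f"next page: {next_page}")
    request_body["page"] = next_page
    results = httpx.post(asset_search, headers=headers, json=request_body)
    assets = results.json()["assets"]
    for asset in assets["items"]:
        if asset["type"] == "IMAGE" and asset["id"] not in VISITED:
            ids[asset["id"]] = asset.get("originalFileName")
    next_page = assets.get("nextPage")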


@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""CLI tool to inspect the vector store contents."""
import argparse
import asyncio
import os
from dotenv import load_dotenv
from blueprints.rag.logic import (
get_vector_store_stats,
index_documents,
list_all_documents,
)
# Load .env from the repo root (assumes this script sits one level below it, in scripts/)
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
env_path = os.path.join(root_dir, ".env")
load_dotenv(env_path)
def print_stats():
"""Print vector store statistics."""
stats = get_vector_store_stats()
print("=== Vector Store Statistics ===")
print(f"Collection Name: {stats['collection_name']}")
print(f"Total Documents: {stats['total_documents']}")
print()
def print_documents(limit: int = 10, show_content: bool = False):
"""Print documents in the vector store."""
docs = list_all_documents(limit=limit)
print(f"=== Documents (showing {len(docs)} of {limit} requested) ===\n")
for i, doc in enumerate(docs, 1):
print(f"Document {i}:")
print(f" ID: {doc['id']}")
print(f" Metadata: {doc['metadata']}")
if show_content:
print(f" Content Preview: {doc['content_preview']}")
print()
async def run_index():
"""Run the indexing process."""
print("Starting indexing process...")
await index_documents()
print("Indexing complete!")
print_stats()
def main():
parser = argparse.ArgumentParser(description="Inspect the vector store contents")
parser.add_argument(
"--stats", action="store_true", help="Show vector store statistics"
)
parser.add_argument(
"--list", type=int, metavar="N", help="List N documents from the vector store"
)
parser.add_argument(
"--show-content",
action="store_true",
help="Show content preview when listing documents",
)
parser.add_argument(
"--index",
action="store_true",
help="Index documents from Paperless-NGX into the vector store",
)
args = parser.parse_args()
# Handle indexing first if requested
if args.index:
asyncio.run(run_index())
return
# If no arguments provided, show stats by default
if not any([args.stats, args.list]):
args.stats = True
if args.stats:
print_stats()
if args.list:
print_documents(limit=args.list, show_content=args.show_content)
if __name__ == "__main__":
main()


@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Management script for vector store operations."""
import argparse
import asyncio
import sys
from blueprints.rag.logic import (
get_vector_store_stats,
index_documents,
list_all_documents,
vector_store,
)
def stats():
    """Show vector store statistics."""
    info = get_vector_store_stats()
    print("=== Vector Store Statistics ===")
    print(f"Collection: {info['collection_name']}")
    print(f"Total Documents: {info['total_documents']}")
async def index():
"""Index documents from Paperless-NGX."""
print("Starting indexing process...")
print("Fetching documents from Paperless-NGX...")
await index_documents()
print("✓ Indexing complete!")
stats()
async def reindex():
"""Clear and reindex all documents."""
print("Clearing existing documents...")
collection = vector_store._collection
all_docs = collection.get()
if all_docs["ids"]:
print(f"Deleting {len(all_docs['ids'])} existing documents...")
collection.delete(ids=all_docs["ids"])
print("✓ Cleared")
else:
print("Collection is already empty")
await index()
def list_docs(limit: int = 10, show_content: bool = False):
"""List documents in the vector store."""
docs = list_all_documents(limit=limit)
print(f"\n=== Documents (showing {len(docs)}) ===\n")
for i, doc in enumerate(docs, 1):
print(f"Document {i}:")
print(f" ID: {doc['id']}")
print(f" Metadata: {doc['metadata']}")
if show_content:
print(f" Content: {doc['content_preview']}")
print()
def main():
parser = argparse.ArgumentParser(
description="Manage vector store for RAG system",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
%(prog)s stats # Show vector store statistics
%(prog)s index # Index new documents from Paperless-NGX
%(prog)s reindex # Clear and reindex all documents
%(prog)s list 10 # List first 10 documents
%(prog)s list 20 --show-content # List 20 documents with content preview
""",
)
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Stats command
subparsers.add_parser("stats", help="Show vector store statistics")
# Index command
subparsers.add_parser("index", help="Index documents from Paperless-NGX")
# Reindex command
subparsers.add_parser("reindex", help="Clear and reindex all documents")
# List command
list_parser = subparsers.add_parser("list", help="List documents in vector store")
list_parser.add_argument(
"limit", type=int, default=10, nargs="?", help="Number of documents to list"
)
list_parser.add_argument(
"--show-content", action="store_true", help="Show content preview"
)
args = parser.parse_args()
if not args.command:
parser.print_help()
sys.exit(1)
try:
if args.command == "stats":
stats()
elif args.command == "index":
asyncio.run(index())
elif args.command == "reindex":
asyncio.run(reindex())
elif args.command == "list":
list_docs(limit=args.limit, show_content=args.show_content)
except KeyboardInterrupt:
print("\n\nOperation cancelled by user")
sys.exit(1)
except Exception as e:
print(f"\n❌ Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()
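reindex clears the store by reaching into vector_store._collection, a private attribute of what looks like LangChain's Chroma wrapper. Assuming that is the wrapper in use, the same clearing step has a public-API spelling; a sketch:

# Hedged sketch, assuming vector_store is langchain's Chroma wrapper
existing = vector_store.get()  # public accessor; returns {"ids": [...], ...}
if existing["ids"]:
    vector_store.delete(ids=existing["ids"])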


@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text, "html.parser")
container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)
new_texts = []
for link in a_s:
    endpoint = link["href"]
    query_url = BASE_URL + endpoint
    r2 = httpx.get(query_url)
    article_soup = BeautifulSoup(r2.text, "html.parser")
    # The committed script stops here; collecting the page text is the
    # minimal plausible completion of the loop.
    new_texts.append(article_soup.get_text(separator="\n", strip=True))
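The chromadb client is created but never written to in the script as committed. A minimal sketch of the likely next step, assuming a hypothetical collection name "feline_health" and chromadb's default embedding function:

# Hedged sketch; "feline_health" is a hypothetical collection name, not from the original.
collection = client.get_or_create_collection("feline_health")
collection.add(
    documents=new_texts,
    ids=[f"cornell-feline-{i}" for i in range(len(new_texts))],
)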

scripts/query.py Normal file (251 additions)

@@ -0,0 +1,251 @@
import json
import os
from typing import Literal
import datetime
from ollama import Client
from openai import OpenAI
from pydantic import BaseModel, Field
# Configure ollama client with URL from environment or default to localhost
ollama_client = Client(
host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
)
# This uses inferred filters: an LLM generates the metadata filters from the query
class FilterOperation(BaseModel):
op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
value: str | list[str]
class FilterQuery(BaseModel):
    field_name: Literal["created_date", "tags"]
op: FilterOperation
class AndQuery(BaseModel):
op: Literal["$and", "$or"]
subqueries: list[FilterQuery]
class GeneratedQuery(BaseModel):
fields: list[str]
extracted_metadata_fields: str
class Time(BaseModel):
time: int
DOCTYPE_OPTIONS = [
"Bill",
"Image Description",
"Insurance",
"Medical Record",
"Documentation",
"Letter",
]
QUERY_TYPE_OPTIONS = [
"Simba",
"Other",
]
class DocumentType(BaseModel):
type: list[str] = Field(description="type of document", enum=DOCTYPE_OPTIONS)
class QueryType(BaseModel):
    type: str = Field(description="type of query", enum=QUERY_TYPE_OPTIONS)
PROMPT = """
You are an information specialist that processes user queries. The current year is 2025. The user queries are all about
a cat, Simba, and its records. The types of records are listed below. Using the query, extract
the date range the user is trying to query. Return it as JSON. The date tag is created_date. Return the date in epoch time.
If the created_date cannot be ascertained, set it to epoch time start.
You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in
Logical operators:
- $and, $or
### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: {"created_date"}
Extracted metadata fields: {"created_date": {"$gt": "2025-01-01"}}
### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {}
Extracted metadata fields: {}
### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"created_date"}
Extracted metadata fields: {"created_date": {"$gt": "2025-01-01"}}
document_types:
- aftercare
- bill
- insurance claim
- medical records
Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON
"""
DOCTYPE_PROMPT = f"""You are an information specialist that processes user queries. A query can have two tags attached from the following options. Based on the query, determine which of the following options is most appropriate: {",".join(DOCTYPE_OPTIONS)}
### Example 1
Query: "Who is Simba's current vet?"
Tags: ["Bill", "Medical Record"]
### Example 2
Query: "Who does Simba know?"
Tags: ["Letter", "Documentation"]
"""
QUERY_TYPE_PROMPT = f"""You are an information specialist that processes user queries.
A query can have one tag attached from the following options. Based on the query and the transcript which is listed below, determine
which of the following options is most appropriate: {",".join(QUERY_TYPE_OPTIONS)}
### Example 1
Query: "Who is Simba's current vet?"
Tags: ["Simba"]
### Example 2
Query: "What is the capital of Tokyo?"
Tags: ["Other"]
### Example 3
Query: "Can you help me write an email?"
Tags: ["Other"]
TRANSCRIPT:
"""
class QueryGenerator:
def __init__(self) -> None:
pass
def date_to_epoch(self, date_str: str) -> float:
split_date = date_str.split("-")
date = datetime.datetime(
int(split_date[0]),
int(split_date[1]),
int(split_date[2]),
0,
0,
0,
)
return date.timestamp()
def get_doctype_query(self, input: str):
client = OpenAI()
response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "You are an information specialist that is really good at deciding what tags a query should have",
},
{"role": "user", "content": DOCTYPE_PROMPT + " " + input},
],
model="gpt-4o",
response_format={
"type": "json_schema",
"json_schema": {
"name": "document_type",
"schema": DocumentType.model_json_schema(),
},
},
)
response_json_str = response.choices[0].message.content
type_data = json.loads(response_json_str)
metadata_query = {"document_type": {"$in": type_data["type"]}}
return metadata_query
def get_query_type(self, input: str, transcript: str):
client = OpenAI()
response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "You are an information specialist that is really good at deciding what tags a query should have",
},
{
"role": "user",
"content": f"{QUERY_TYPE_PROMPT}\nTRANSCRIPT:\n{transcript}\nQUERY:{input}",
},
],
model="gpt-4o",
response_format={
"type": "json_schema",
"json_schema": {
"name": "query_type",
"schema": QueryType.model_json_schema(),
},
},
)
response_json_str = response.choices[0].message.content
type_data = json.loads(response_json_str)
return type_data["type"]
def get_query(self, input: str):
client = OpenAI()
response = client.responses.parse(
model="gpt-4o",
input=[
{"role": "system", "content": PROMPT},
{"role": "user", "content": input},
],
text_format=GeneratedQuery,
)
print(response.output)
query = json.loads(response.output_parsed.extracted_metadata_fields)
# response: ChatResponse = ollama_client.chat(
# model="gemma3n:e4b",
# messages=[
# {"role": "system", "content": PROMPT},
# {"role": "user", "content": input},
# ],
# format=GeneratedQuery.model_json_schema(),
# )
# query = json.loads(
# json.loads(response["message"]["content"])["extracted_metadata_fields"]
# )
# date_key = list(query["created_date"].keys())[0]
# query["created_date"][date_key] = self.date_to_epoch(
# query["created_date"][date_key]
# )
# if "$" not in date_key:
# query["created_date"]["$" + date_key] = query["created_date"][date_key]
return query
if __name__ == "__main__":
qg = QueryGenerator()
print(qg.get_doctype_query("How heavy is Simba?"))
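For reference, the shape of the Chroma where filter these helpers produce: get_doctype_query returns the $in clause, and the date prompt is meant to yield clauses like $gt; combining both uses the $and operator listed in PROMPT. The values below are illustrative, not from the original:

# Illustrative combined filter (values made up for the example)
where = {
    "$and": [
        {"document_type": {"$in": ["Medical Record", "Bill"]}},
        {"created_date": {"$gt": 1735689600}},  # epoch seconds for 2025-01-01
    ]
}
# which would then be passed to a Chroma query, e.g.:
# collection.query(query_texts=["vet visits this year"], where=where)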

scripts/test_query.py Normal file (39 additions)

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Test the query_vector_store function."""
import asyncio
import os
from dotenv import load_dotenv
from blueprints.rag.logic import query_vector_store
# Load .env from the repo root (this script lives one level below it, in scripts/)
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
env_path = os.path.join(root_dir, ".env")
load_dotenv(env_path)
async def test_query(query: str):
"""Test a query against the vector store."""
print(f"Query: {query}\n")
result, docs = await query_vector_store(query)
print(f"Found {len(docs)} documents\n")
print("Serialized result:")
print(result)
print("\n" + "=" * 80 + "\n")
async def main():
queries = [
"What is Simba's weight?",
"What medications is Simba taking?",
"Tell me about Simba's recent vet visits",
]
for query in queries:
await test_query(query)
if __name__ == "__main__":
asyncio.run(main())


@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Script to show how many messages each user has written
"""
import asyncio
import os
from tortoise import Tortoise
from blueprints.users.models import User
from blueprints.conversation.models import Speaker
async def get_user_message_stats():
"""Get message count statistics per user"""
# Initialize database connection
database_url = os.getenv("DATABASE_URL", "sqlite://raggr.db")
await Tortoise.init(
db_url=database_url,
modules={
"models": [
"blueprints.users.models",
"blueprints.conversation.models",
]
},
)
print("\n📊 User Message Statistics\n")
print(
f"{'Username':<20} {'Total Messages':<15} {'User Messages':<15} {'Conversations':<15}"
)
print("=" * 70)
# Get all users
users = await User.all()
total_users = 0
total_messages = 0
for user in users:
# Get all conversations for this user
conversations = await user.conversations.all()
if not conversations:
continue
total_users += 1
# Count messages across all conversations
user_message_count = 0
total_message_count = 0
for conversation in conversations:
messages = await conversation.messages.all()
total_message_count += len(messages)
# Count only user messages (not assistant responses)
user_messages = [msg for msg in messages if msg.speaker == Speaker.USER]
user_message_count += len(user_messages)
total_messages += user_message_count
print(
f"{user.username:<20} {total_message_count:<15} {user_message_count:<15} {len(conversations):<15}"
)
print("=" * 70)
print("\n📈 Summary:")
print(f" Total active users: {total_users}")
print(f" Total user messages: {total_messages}")
print(
f" Average messages per user: {total_messages / total_users if total_users > 0 else 0:.1f}\n"
)
await Tortoise.close_connections()
if __name__ == "__main__":
asyncio.run(get_user_message_stats())
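get_user_message_stats issues one query per conversation (an N+1 pattern). Tortoise can prefetch the nested relations in a single pass; a sketch using the same related names as above:

# Hedged sketch: load users, conversations, and messages in one prefetch
users = await User.all().prefetch_related("conversations__messages")
for user in users:
    for conversation in user.conversations:
        # relations are already loaded; no extra queries per conversation
        user_msgs = [m for m in conversation.messages if m.speaker == Speaker.USER]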