reorganization
0
scripts/__init__.py
Normal file
146
scripts/add_user.py
Normal file
@@ -0,0 +1,146 @@
# GENERATED BY CLAUDE

import os
import sys
import uuid
import asyncio
from tortoise import Tortoise
from blueprints.users.models import User

from dotenv import load_dotenv

load_dotenv()

# Database configuration with environment variable support
DATABASE_PATH = os.getenv("DATABASE_PATH", "database/raggr.db")
DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite://{DATABASE_PATH}")

print(DATABASE_URL)


async def add_user(username: str, email: str, password: str):
    """Add a new user to the database"""
    await Tortoise.init(
        db_url=DATABASE_URL,
        modules={
            "models": [
                "blueprints.users.models",
                "blueprints.conversation.models",
            ]
        },
    )

    try:
        # Check if user already exists
        existing_user = await User.filter(email=email).first()
        if existing_user:
            print(f"Error: User with email '{email}' already exists!")
            return False

        existing_username = await User.filter(username=username).first()
        if existing_username:
            print(f"Error: Username '{username}' is already taken!")
            return False

        # Create new user
        user = User(
            id=uuid.uuid4(),
            username=username,
            email=email,
        )
        user.set_password(password)
        await user.save()

        print("✓ User created successfully!")
        print(f"  Username: {username}")
        print(f"  Email: {email}")
        print(f"  ID: {user.id}")
        return True

    except Exception as e:
        print(f"Error creating user: {e}")
        return False
    finally:
        await Tortoise.close_connections()


async def list_users():
    """List all users in the database"""
    await Tortoise.init(
        db_url=DATABASE_URL,
        modules={
            "models": [
                "blueprints.users.models",
                "blueprints.conversation.models",
            ]
        },
    )

    try:
        users = await User.all()
        if not users:
            print("No users found in database.")
            return

        print(f"\nFound {len(users)} user(s):")
        print("-" * 60)
        for user in users:
            print(f"Username: {user.username}")
            print(f"Email: {user.email}")
            print(f"ID: {user.id}")
            print(f"Created: {user.created_at}")
            print("-" * 60)

    except Exception as e:
        print(f"Error listing users: {e}")
    finally:
        await Tortoise.close_connections()


def print_usage():
    """Print usage instructions"""
    print("Usage:")
    print("  python add_user.py add <username> <email> <password>")
    print("  python add_user.py list")
    print("\nExamples:")
    print("  python add_user.py add ryan ryan@example.com mypassword123")
    print("  python add_user.py list")
    print("\nEnvironment Variables:")
    print("  DATABASE_PATH - Path to database file (default: database/raggr.db)")
    print("  DATABASE_URL  - Full database URL (overrides DATABASE_PATH)")
    print("\nExample with custom database:")
    print("  DATABASE_PATH=dev.db python add_user.py list")


async def main():
    if len(sys.argv) < 2:
        print_usage()
        sys.exit(1)

    command = sys.argv[1].lower()

    if command == "add":
        if len(sys.argv) != 5:
            print("Error: Missing arguments for 'add' command")
            print_usage()
            sys.exit(1)

        username = sys.argv[2]
        email = sys.argv[3]
        password = sys.argv[4]

        success = await add_user(username, email, password)
        sys.exit(0 if success else 1)

    elif command == "list":
        await list_users()
        sys.exit(0)

    else:
        print(f"Error: Unknown command '{command}'")
        print_usage()
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
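Note: add_user leans on User.set_password to hash the password before saving. The actual model in blueprints.users.models is not part of this commit; a hypothetical stdlib-only sketch of such a helper, for orientation:

import hashlib
import os

class User:  # illustrative stand-in, not the actual Tortoise model
    def set_password(self, password: str) -> None:
        # PBKDF2 with a random salt; "password_hash" is an assumed column name
        salt = os.urandom(16)
        digest = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, 100_000)
        self.password_hash = f"{salt.hex()}:{digest.hex()}"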
118
scripts/index_immich.py
Normal file
@@ -0,0 +1,118 @@
import logging
import os
import sqlite3
import sys
from pathlib import Path

import httpx
from dotenv import load_dotenv

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from utils.image_process import describe_simba_image
from utils.request import PaperlessNGXService

logging.basicConfig(level=logging.INFO)

load_dotenv()

# Configuration from environment variables
IMMICH_URL = os.getenv("IMMICH_URL", "http://localhost:2283")
API_KEY = os.getenv("IMMICH_API_KEY")
PERSON_NAME = os.getenv("PERSON_NAME", "Simba")  # Name of the tagged person/pet
DOWNLOAD_DIR = os.getenv("DOWNLOAD_DIR", "./simba_photos")

# Set up headers
headers = {"x-api-key": API_KEY, "Content-Type": "application/json"}

VISITED = set()  # was `{}`; a dict has no .add(), membership tracking needs a set

if __name__ == "__main__":
    conn = sqlite3.connect("./database/visited.db")
    c = conn.cursor()
    c.execute("select immich_id from visited")
    rows = c.fetchall()
    for row in rows:
        VISITED.add(row[0])

    ppngx = PaperlessNGXService()
    # Use the configured PERSON_NAME instead of hardcoding "Simba"
    people_url = f"{IMMICH_URL}/api/search/person?name={PERSON_NAME}"
    people = httpx.get(people_url, headers=headers).json()

    simba_id = people[0]["id"]

    ids = {}

    asset_search = f"{IMMICH_URL}/api/search/smart"
    request_body = {"query": "orange cat"}
    results = httpx.post(asset_search, headers=headers, json=request_body)

    assets = results.json()["assets"]
    for asset in assets["items"]:
        if asset["type"] == "IMAGE" and asset["id"] not in VISITED:
            ids[asset["id"]] = asset.get("originalFileName")
    nextPage = assets.get("nextPage")

    # while nextPage != None:
    #     logging.info(f"next page: {nextPage}")
    #     request_body["page"] = nextPage
    #     results = httpx.post(asset_search, headers=headers, json=request_body)
    #     assets = results.json()["assets"]
    #
    #     for asset in assets["items"]:
    #         if asset["type"] == "IMAGE":
    #             ids.add(asset['id'])
    #
    #     nextPage = assets.get("nextPage")

    asset_search = f"{IMMICH_URL}/api/search/smart"
    request_body = {"query": "simba"}
    results = httpx.post(asset_search, headers=headers, json=request_body)
    for asset in results.json()["assets"]["items"]:
        if asset["type"] == "IMAGE":
            ids[asset["id"]] = asset.get("originalFileName")

    for immich_asset_id, immich_filename in ids.items():
        try:
            response = httpx.get(
                f"{IMMICH_URL}/api/assets/{immich_asset_id}/original", headers=headers
            )

            path = os.path.join("/Users/ryanchen/Programs/raggr", immich_filename)
            # Context manager ensures the download is flushed and closed
            # before describe_simba_image() reads it back from disk.
            with open(path, "wb") as file:
                for chunk in response.iter_bytes(chunk_size=8192):
                    file.write(chunk)

            logging.info("Processing image ...")
            description = describe_simba_image(path)

            image_description = description.description
            image_date = description.image_date

            description_filepath = os.path.join(
                "/Users/ryanchen/Programs/raggr", "SIMBA_DESCRIBE_001.txt"
            )
            with open(description_filepath, "w+") as file:
                file.write(image_description)

            file = open(description_filepath, "rb")
            ppngx.upload_description(
                description_filepath=description_filepath,
                file=file,
                title="SIMBA_DESCRIBE_001.txt",
                exif_date=image_date,
            )
            file.close()

            c.execute("INSERT INTO visited (immich_id) values (?)", (immich_asset_id,))
            conn.commit()
            logging.info("Processing complete. Deleting files.")
            # Delete both temp artifacts; the original removed only the
            # description file (via file.name) and leaked the downloaded photo.
            os.remove(path)
            os.remove(description_filepath)
        except Exception as e:
            logging.error(f"something went wrong for {immich_filename}")
            logging.error(e)

    conn.close()
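The script assumes ./database/visited.db already contains a `visited` table. A one-time bootstrap sketch; the single-column schema is an assumption inferred from the SELECT and INSERT above:

import sqlite3

conn = sqlite3.connect("./database/visited.db")
conn.execute("CREATE TABLE IF NOT EXISTS visited (immich_id TEXT PRIMARY KEY)")
conn.commit()
conn.close()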
92
scripts/inspect_vector_store.py
Normal file
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""CLI tool to inspect the vector store contents."""

import argparse
import asyncio
import os

from dotenv import load_dotenv

from blueprints.rag.logic import (
    get_vector_store_stats,
    index_documents,
    list_all_documents,
)

# Load .env from the root directory (scripts/ sits one level below the root,
# so ".." rather than the stale "../.." reaches it)
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
env_path = os.path.join(root_dir, ".env")
load_dotenv(env_path)


def print_stats():
    """Print vector store statistics."""
    stats = get_vector_store_stats()
    print("=== Vector Store Statistics ===")
    print(f"Collection Name: {stats['collection_name']}")
    print(f"Total Documents: {stats['total_documents']}")
    print()


def print_documents(limit: int = 10, show_content: bool = False):
    """Print documents in the vector store."""
    docs = list_all_documents(limit=limit)
    print(f"=== Documents (showing {len(docs)} of {limit} requested) ===\n")

    for i, doc in enumerate(docs, 1):
        print(f"Document {i}:")
        print(f"  ID: {doc['id']}")
        print(f"  Metadata: {doc['metadata']}")
        if show_content:
            print(f"  Content Preview: {doc['content_preview']}")
        print()


async def run_index():
    """Run the indexing process."""
    print("Starting indexing process...")
    await index_documents()
    print("Indexing complete!")
    print_stats()


def main():
    parser = argparse.ArgumentParser(description="Inspect the vector store contents")
    parser.add_argument(
        "--stats", action="store_true", help="Show vector store statistics"
    )
    parser.add_argument(
        "--list", type=int, metavar="N", help="List N documents from the vector store"
    )
    parser.add_argument(
        "--show-content",
        action="store_true",
        help="Show content preview when listing documents",
    )
    parser.add_argument(
        "--index",
        action="store_true",
        help="Index documents from Paperless-NGX into the vector store",
    )

    args = parser.parse_args()

    # Handle indexing first if requested
    if args.index:
        asyncio.run(run_index())
        return

    # If no arguments provided, show stats by default
    if not any([args.stats, args.list]):
        args.stats = True

    if args.stats:
        print_stats()

    if args.list:
        print_documents(limit=args.list, show_content=args.show_content)


if __name__ == "__main__":
    main()
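For reference, print_documents relies on exactly three keys per record. The shape list_all_documents is assumed to return, inferred from the keys used above rather than from blueprints.rag.logic itself:

# Assumed return shape of list_all_documents(limit=...); values are placeholders
docs = [
    {
        "id": "doc-123",
        "metadata": {"document_type": "Bill"},
        "content_preview": "First N characters of the document...",
    },
]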
121
scripts/manage_vectorstore.py
Normal file
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Management script for vector store operations."""

import argparse
import asyncio
import sys

from blueprints.rag.logic import (
    get_vector_store_stats,
    index_documents,
    list_all_documents,
    vector_store,
)


def stats():
    """Show vector store statistics."""
    stats = get_vector_store_stats()
    print("=== Vector Store Statistics ===")
    print(f"Collection: {stats['collection_name']}")
    print(f"Total Documents: {stats['total_documents']}")


async def index():
    """Index documents from Paperless-NGX."""
    print("Starting indexing process...")
    print("Fetching documents from Paperless-NGX...")
    await index_documents()
    print("✓ Indexing complete!")
    stats()


async def reindex():
    """Clear and reindex all documents."""
    print("Clearing existing documents...")
    collection = vector_store._collection
    all_docs = collection.get()

    if all_docs["ids"]:
        print(f"Deleting {len(all_docs['ids'])} existing documents...")
        collection.delete(ids=all_docs["ids"])
        print("✓ Cleared")
    else:
        print("Collection is already empty")

    await index()


def list_docs(limit: int = 10, show_content: bool = False):
    """List documents in the vector store."""
    docs = list_all_documents(limit=limit)
    print(f"\n=== Documents (showing {len(docs)}) ===\n")

    for i, doc in enumerate(docs, 1):
        print(f"Document {i}:")
        print(f"  ID: {doc['id']}")
        print(f"  Metadata: {doc['metadata']}")
        if show_content:
            print(f"  Content: {doc['content_preview']}")
        print()


def main():
    parser = argparse.ArgumentParser(
        description="Manage vector store for RAG system",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s stats                    # Show vector store statistics
  %(prog)s index                    # Index new documents from Paperless-NGX
  %(prog)s reindex                  # Clear and reindex all documents
  %(prog)s list 10                  # List first 10 documents
  %(prog)s list 20 --show-content   # List 20 documents with content preview
""",
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to execute")

    # Stats command
    subparsers.add_parser("stats", help="Show vector store statistics")

    # Index command
    subparsers.add_parser("index", help="Index documents from Paperless-NGX")

    # Reindex command
    subparsers.add_parser("reindex", help="Clear and reindex all documents")

    # List command
    list_parser = subparsers.add_parser("list", help="List documents in vector store")
    list_parser.add_argument(
        "limit", type=int, default=10, nargs="?", help="Number of documents to list"
    )
    list_parser.add_argument(
        "--show-content", action="store_true", help="Show content preview"
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    try:
        if args.command == "stats":
            stats()
        elif args.command == "index":
            asyncio.run(index())
        elif args.command == "reindex":
            asyncio.run(reindex())
        elif args.command == "list":
            list_docs(limit=args.limit, show_content=args.show_content)
    except KeyboardInterrupt:
        print("\n\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
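Note that reindex() reaches into the wrapper's private `_collection` attribute. Assuming `vector_store` is LangChain's Chroma wrapper (not confirmed by this commit), the same clear can be sketched through its public methods:

existing = vector_store.get()  # proxies the underlying collection.get()
if existing["ids"]:
    vector_store.delete(ids=existing["ids"])  # public equivalent of collection.delete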
24
scripts/petmd_scrape_index.py
Normal file
@@ -0,0 +1,24 @@
from bs4 import BeautifulSoup
import chromadb
import httpx

client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")

# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"

QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text, "html.parser")  # explicit parser avoids bs4's warning

container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)

new_texts = []

for link in a_s:
    endpoint = link["href"]
    query_url = BASE_URL + endpoint
    r2 = httpx.get(query_url)
    article_soup = BeautifulSoup(r2.text, "html.parser")
    # Collect the article text; the committed loop ended here without storing it
    new_texts.append(article_soup.get_text(separator="\n", strip=True))
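As committed, `client` and `new_texts` go unused: pages are scraped but never indexed. A minimal sketch of the missing indexing step; the collection name and id scheme are placeholders, not from the commit:

collection = client.get_or_create_collection(name="feline_health")  # hypothetical name
collection.add(
    documents=new_texts,
    ids=[f"cornell-feline-{i}" for i in range(len(new_texts))],  # placeholder ids
)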
251
scripts/query.py
Normal file
@@ -0,0 +1,251 @@
import json
import os
from typing import Literal
import datetime
from ollama import Client

from openai import OpenAI

from pydantic import BaseModel, Field

# Configure ollama client with URL from environment or default to localhost
ollama_client = Client(
    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
)

# This uses inferred filters, i.e. an LLM creates the metadata filters


class FilterOperation(BaseModel):
    op: Literal["$gt", "$gte", "$eq", "$ne", "$lt", "$lte", "$in", "$nin"]
    value: str | list[str]


class FilterQuery(BaseModel):
    # was Literal["created_date, tags"], a single malformed option
    field_name: Literal["created_date", "tags"]
    op: FilterOperation


class AndQuery(BaseModel):
    op: Literal["$and", "$or"]
    subqueries: list[FilterQuery]


class GeneratedQuery(BaseModel):
    fields: list[str]
    extracted_metadata_fields: str


class Time(BaseModel):
    time: int


DOCTYPE_OPTIONS = [
    "Bill",
    "Image Description",
    "Insurance",
    "Medical Record",
    "Documentation",
    "Letter",
]

QUERY_TYPE_OPTIONS = [
    "Simba",
    "Other",
]


class DocumentType(BaseModel):
    type: list[str] = Field(description="type of document", enum=DOCTYPE_OPTIONS)


class QueryType(BaseModel):
    type: str = Field(description="type of query", enum=QUERY_TYPE_OPTIONS)


PROMPT = """
You are an information specialist that processes user queries. The current year is 2025. The user queries are all about
a cat, Simba, and its records. The types of records are listed below. Using the query, extract
the date range the user is trying to query. You should return it as JSON. The date tag is created_date. Return the date in epoch time.

If the created_date cannot be ascertained, set it to epoch time start.

You have several operators at your disposal:
- $gt: greater than
- $gte: greater than or equal
- $eq: equal
- $ne: not equal
- $lt: less than
- $lte: less than or equal to
- $in: in
- $nin: not in

Logical operators:
- $and, $or

### Example 1
Query: "Who is Simba's current vet?"
Metadata fields: {"created_date"}
Extracted metadata fields: {"created_date": {"$gt": "2025-01-01"}}

### Example 2
Query: "How many teeth has Simba had removed?"
Metadata fields: {}
Extracted metadata fields: {}

### Example 3
Query: "How many times has Simba been to the vet this year?"
Metadata fields: {"created_date"}
Extracted metadata fields: {"created_date": {"$gt": "2025-01-01"}}

document_types:
- aftercare
- bill
- insurance claim
- medical records

Only return the extracted metadata fields. Make sure the extracted metadata fields are valid JSON.
"""


DOCTYPE_PROMPT = f"""You are an information specialist that processes user queries. A query can have two tags attached from the following options. Based on the query, determine which of the following options is most appropriate: {",".join(DOCTYPE_OPTIONS)}

### Example 1
Query: "Who is Simba's current vet?"
Tags: ["Bill", "Medical Record"]


### Example 2
Query: "Who does Simba know?"
Tags: ["Letter", "Documentation"]
"""

QUERY_TYPE_PROMPT = f"""You are an information specialist that processes user queries.
A query can have one tag attached from the following options. Based on the query and the transcript which is listed below, determine
which of the following options is most appropriate: {",".join(QUERY_TYPE_OPTIONS)}

### Example 1
Query: "Who is Simba's current vet?"
Tags: ["Simba"]


### Example 2
Query: "What is the capital of Tokyo?"
Tags: ["Other"]


### Example 3
Query: "Can you help me write an email?"
Tags: ["Other"]

TRANSCRIPT:
"""


class QueryGenerator:
    def __init__(self) -> None:
        pass

    def date_to_epoch(self, date_str: str) -> float:
        split_date = date_str.split("-")
        date = datetime.datetime(
            int(split_date[0]),
            int(split_date[1]),
            int(split_date[2]),
            0,
            0,
            0,
        )

        return date.timestamp()

    def get_doctype_query(self, input: str):
        client = OpenAI()
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an information specialist that is really good at deciding what tags a query should have",
                },
                {"role": "user", "content": DOCTYPE_PROMPT + " " + input},
            ],
            model="gpt-4o",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "document_type",
                    "schema": DocumentType.model_json_schema(),
                },
            },
        )

        response_json_str = response.choices[0].message.content
        type_data = json.loads(response_json_str)
        metadata_query = {"document_type": {"$in": type_data["type"]}}
        return metadata_query

    def get_query_type(self, input: str, transcript: str):
        client = OpenAI()
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an information specialist that is really good at deciding what tags a query should have",
                },
                {
                    "role": "user",
                    "content": f"{QUERY_TYPE_PROMPT}\nTRANSCRIPT:\n{transcript}\nQUERY:{input}",
                },
            ],
            model="gpt-4o",
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "query_type",
                    "schema": QueryType.model_json_schema(),
                },
            },
        )

        response_json_str = response.choices[0].message.content
        type_data = json.loads(response_json_str)
        return type_data["type"]

    def get_query(self, input: str):
        client = OpenAI()
        response = client.responses.parse(
            model="gpt-4o",
            input=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": input},
            ],
            text_format=GeneratedQuery,
        )
        print(response.output)
        query = json.loads(response.output_parsed.extracted_metadata_fields)
        # response: ChatResponse = ollama_client.chat(
        #     model="gemma3n:e4b",
        #     messages=[
        #         {"role": "system", "content": PROMPT},
        #         {"role": "user", "content": input},
        #     ],
        #     format=GeneratedQuery.model_json_schema(),
        # )

        # query = json.loads(
        #     json.loads(response["message"]["content"])["extracted_metadata_fields"]
        # )
        # date_key = list(query["created_date"].keys())[0]
        # query["created_date"][date_key] = self.date_to_epoch(
        #     query["created_date"][date_key]
        # )

        # if "$" not in date_key:
        #     query["created_date"]["$" + date_key] = query["created_date"][date_key]

        return query


if __name__ == "__main__":
    qg = QueryGenerator()
    print(qg.get_doctype_query("How heavy is Simba?"))
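A hedged sketch of how the two generated filters might combine downstream; the assumption (not shown in this commit) is that they feed a Chroma-style metadata `where` clause:

qg = QueryGenerator()
doctype_filter = qg.get_doctype_query("How heavy is Simba?")  # {"document_type": {"$in": [...]}}
date_filter = qg.get_query("How many vet visits this year?")  # {"created_date": {"$gt": ...}}
where = {"$and": [doctype_filter, date_filter]}               # combined metadata filter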
39
scripts/test_query.py
Normal file
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Test the query_vector_store function."""

import asyncio
import os

from dotenv import load_dotenv

from blueprints.rag.logic import query_vector_store

# Load .env from the root directory (scripts/ sits one level below the root,
# so ".." rather than the stale "../.." reaches it)
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
env_path = os.path.join(root_dir, ".env")
load_dotenv(env_path)


async def test_query(query: str):
    """Test a query against the vector store."""
    print(f"Query: {query}\n")
    result, docs = await query_vector_store(query)
    print(f"Found {len(docs)} documents\n")
    print("Serialized result:")
    print(result)
    print("\n" + "=" * 80 + "\n")


async def main():
    queries = [
        "What is Simba's weight?",
        "What medications is Simba taking?",
        "Tell me about Simba's recent vet visits",
    ]

    for query in queries:
        await test_query(query)


if __name__ == "__main__":
    asyncio.run(main())
79
scripts/user_message_stats.py
Normal file
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Script to show how many messages each user has written
"""

import asyncio
import os

from tortoise import Tortoise

from blueprints.users.models import User
from blueprints.conversation.models import Speaker


async def get_user_message_stats():
    """Get message count statistics per user"""

    # Initialize database connection
    database_url = os.getenv("DATABASE_URL", "sqlite://raggr.db")
    await Tortoise.init(
        db_url=database_url,
        modules={
            "models": [
                "blueprints.users.models",
                "blueprints.conversation.models",
            ]
        },
    )

    print("\n📊 User Message Statistics\n")
    print(
        f"{'Username':<20} {'Total Messages':<15} {'User Messages':<15} {'Conversations':<15}"
    )
    print("=" * 70)

    # Get all users
    users = await User.all()

    total_users = 0
    total_messages = 0

    for user in users:
        # Get all conversations for this user
        conversations = await user.conversations.all()

        if not conversations:
            continue

        total_users += 1

        # Count messages across all conversations
        user_message_count = 0
        total_message_count = 0

        for conversation in conversations:
            messages = await conversation.messages.all()
            total_message_count += len(messages)

            # Count only user messages (not assistant responses)
            user_messages = [msg for msg in messages if msg.speaker == Speaker.USER]
            user_message_count += len(user_messages)

        total_messages += user_message_count

        print(
            f"{user.username:<20} {total_message_count:<15} {user_message_count:<15} {len(conversations):<15}"
        )

    print("=" * 70)
    print("\n📈 Summary:")
    print(f"  Total active users: {total_users}")
    print(f"  Total user messages: {total_messages}")
    print(
        f"  Average messages per user: {total_messages / total_users if total_users > 0 else 0:.1f}\n"
    )

    await Tortoise.close_connections()


if __name__ == "__main__":
    asyncio.run(get_user_message_stats())
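The nested loops materialize every message just to count them. A sketch of pushing the counts into the database instead, assuming the same related names and Speaker enum as imported above:

# Per conversation, count in SQL rather than fetching all rows
total_message_count = await conversation.messages.all().count()
user_message_count = await conversation.messages.filter(speaker=Speaker.USER).count()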