Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ffbe992f64 |
+4
-5
@@ -19,11 +19,10 @@ BASE_URL=192.168.1.5:8000
|
||||
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
|
||||
LLAMA_MODEL_NAME=llama-3.1-8b-instruct
|
||||
|
||||
# Embedding Server Configuration
|
||||
# If set, uses a custom OpenAI-compatible embedding server (e.g. llama-server)
|
||||
# Falls back to OpenAI embeddings if not set
|
||||
EMBEDDING_SERVER_URL=http://192.168.1.7:8086/v1
|
||||
EMBEDDING_MODEL_NAME=all-minilm
|
||||
# ChromaDB Configuration
|
||||
# For Docker: This is automatically set to /app/data/chromadb
|
||||
# For local development: Set to a local directory path
|
||||
CHROMADB_PATH=./data/chromadb
|
||||
|
||||
# OpenAI Configuration
|
||||
OPENAI_API_KEY=your-openai-api-key
|
||||
|
||||
@@ -13,6 +13,9 @@ wheels/
|
||||
.env
|
||||
|
||||
# Database files
|
||||
chromadb/
|
||||
chromadb_openai/
|
||||
chroma_db/
|
||||
database/
|
||||
*.db
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
||||
|
||||
## Project Overview
|
||||
|
||||
SimbaRAG is a RAG (Retrieval-Augmented Generation) conversational AI system for querying information about Simba (a cat). It ingests documents from Paperless-NGX, stores embeddings in PostgreSQL via pgvector, and uses LLMs (Ollama or OpenAI) to answer questions.
|
||||
SimbaRAG is a RAG (Retrieval-Augmented Generation) conversational AI system for querying information about Simba (a cat). It ingests documents from Paperless-NGX, stores embeddings in ChromaDB, and uses LLMs (Ollama or OpenAI) to answer questions.
|
||||
|
||||
## Commands
|
||||
|
||||
@@ -54,8 +54,9 @@ docker compose up -d
|
||||
│ Docker Compose │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ raggr (port 8080) │ postgres (port 5432) │
|
||||
│ ├── Quart backend │ PostgreSQL 16 + pgvector│
|
||||
│ └── React frontend (served) │ │
|
||||
│ ├── Quart backend │ PostgreSQL 16 │
|
||||
│ ├── React frontend (served) │ │
|
||||
│ └── ChromaDB (volume) │ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
|
||||
+3
-2
@@ -37,14 +37,15 @@ WORKDIR /app/raggr-frontend
|
||||
RUN yarn install && yarn build
|
||||
WORKDIR /app
|
||||
|
||||
# Create database directory
|
||||
RUN mkdir -p /app/database
|
||||
# Create ChromaDB and database directories
|
||||
RUN mkdir -p /app/chromadb /app/database
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8080
|
||||
|
||||
# Set environment variables
|
||||
ENV PYTHONPATH=/app
|
||||
ENV CHROMADB_PATH=/app/chromadb
|
||||
|
||||
# Run the startup script
|
||||
CMD ["./startup.sh"]
|
||||
|
||||
+3
-2
@@ -34,15 +34,16 @@ COPY . .
|
||||
WORKDIR /app/raggr-frontend
|
||||
RUN yarn build
|
||||
|
||||
# Create database directory
|
||||
# Create ChromaDB and database directories
|
||||
WORKDIR /app
|
||||
RUN mkdir -p /app/database
|
||||
RUN mkdir -p /app/chromadb /app/database
|
||||
|
||||
# Make startup script executable
|
||||
RUN chmod +x /app/startup-dev.sh
|
||||
|
||||
# Set environment variables
|
||||
ENV PYTHONPATH=/app
|
||||
ENV CHROMADB_PATH=/app/chromadb
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
# Expose port
|
||||
|
||||
@@ -3,7 +3,7 @@ import os
|
||||
from datetime import timedelta
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from quart import Quart, jsonify, render_template, send_from_directory
|
||||
from quart import Quart, jsonify, render_template, request, send_from_directory
|
||||
from quart_jwt_extended import JWTManager, get_jwt_identity, jwt_refresh_token_required
|
||||
from tortoise import Tortoise
|
||||
|
||||
@@ -15,6 +15,7 @@ import blueprints.users
|
||||
import blueprints.whatsapp
|
||||
import blueprints.users.models
|
||||
from config.db import TORTOISE_CONFIG
|
||||
from main import consult_simba_oracle
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
@@ -77,6 +78,39 @@ async def serve_react_app(path):
|
||||
return await render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/query", methods=["POST"])
|
||||
@jwt_refresh_token_required
|
||||
async def query():
|
||||
current_user_uuid = get_jwt_identity()
|
||||
user = await blueprints.users.models.User.get(id=current_user_uuid)
|
||||
data = await request.get_json()
|
||||
query = data.get("query")
|
||||
conversation_id = data.get("conversation_id")
|
||||
conversation = await blueprints.conversation.logic.get_conversation_by_id(
|
||||
conversation_id
|
||||
)
|
||||
await conversation.fetch_related("messages")
|
||||
await blueprints.conversation.logic.add_message_to_conversation(
|
||||
conversation=conversation,
|
||||
message=query,
|
||||
speaker="user",
|
||||
user=user,
|
||||
)
|
||||
|
||||
transcript = await blueprints.conversation.logic.get_conversation_transcript(
|
||||
user=user, conversation=conversation
|
||||
)
|
||||
|
||||
response = consult_simba_oracle(input=query, transcript=transcript)
|
||||
await blueprints.conversation.logic.add_message_to_conversation(
|
||||
conversation=conversation,
|
||||
message=response,
|
||||
speaker="simba",
|
||||
user=user,
|
||||
)
|
||||
return jsonify({"response": response})
|
||||
|
||||
|
||||
@app.route("/api/messages", methods=["GET"])
|
||||
@jwt_refresh_token_required
|
||||
async def get_messages():
|
||||
|
||||
@@ -328,7 +328,7 @@ async def obsidian_search_notes(query: str) -> str:
|
||||
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
|
||||
|
||||
try:
|
||||
# Query vector store for obsidian documents
|
||||
# Query ChromaDB for obsidian documents
|
||||
serialized, docs = await query_vector_store(query=query)
|
||||
return serialized
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
SIMBA_SYSTEM_PROMPT = """You are Simba, Ryan's helpful personal assistant. You're named after his orange cat. You have a warm, friendly personality with a light cat-themed touch, but your priority is always being genuinely useful — give thorough, detailed answers and think things through carefully. When asked about Simba the cat, you speak as him in first person. For everything else, you're just a great assistant who happens to have a cat's name.
|
||||
SIMBA_SYSTEM_PROMPT = """You are a helpful cat assistant named Simba that understands veterinary terms. When there are questions to you specifically, they are referring to Simba the cat. Answer the user in as if you were a cat named Simba. Don't act too catlike. Be assertive.
|
||||
|
||||
SIMBA FACTS (as of January 2026):
|
||||
- Name: Simba
|
||||
|
||||
@@ -1,12 +1,7 @@
|
||||
from quart import Blueprint, jsonify
|
||||
from quart_jwt_extended import jwt_refresh_token_required
|
||||
|
||||
from .logic import (
|
||||
delete_all_documents,
|
||||
get_vector_store_stats,
|
||||
index_documents,
|
||||
index_obsidian_documents,
|
||||
)
|
||||
from .logic import fetch_obsidian_documents, get_vector_store_stats, index_documents, index_obsidian_documents, vector_store
|
||||
from blueprints.users.decorators import admin_required
|
||||
|
||||
rag_blueprint = Blueprint("rag_api", __name__, url_prefix="/api/rag")
|
||||
@@ -37,7 +32,14 @@ async def trigger_index():
|
||||
async def trigger_reindex():
|
||||
"""Clear and reindex all documents. Admin only."""
|
||||
try:
|
||||
delete_all_documents()
|
||||
# Clear existing documents
|
||||
collection = vector_store._collection
|
||||
all_docs = collection.get()
|
||||
|
||||
if all_docs["ids"]:
|
||||
collection.delete(ids=all_docs["ids"])
|
||||
|
||||
# Reindex
|
||||
await index_documents()
|
||||
stats = get_vector_store_stats()
|
||||
return jsonify({"status": "success", "stats": stats})
|
||||
|
||||
+28
-167
@@ -1,14 +1,11 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_chroma import Chroma
|
||||
from langchain_core.documents import Document
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain_postgres import PGVector
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from sqlalchemy import create_engine, text
|
||||
|
||||
from .fetchers import PaperlessNGXService
|
||||
from utils.obsidian_service import ObsidianService
|
||||
@@ -16,51 +13,13 @@ from utils.obsidian_service import ObsidianService
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
||||
|
||||
_embedding_server_url = os.getenv("EMBEDDING_SERVER_URL")
|
||||
_embedding_model = os.getenv("EMBEDDING_MODEL_NAME", "text-embedding-3-small")
|
||||
|
||||
if _embedding_server_url:
|
||||
embeddings = OpenAIEmbeddings(
|
||||
model=_embedding_model,
|
||||
base_url=_embedding_server_url,
|
||||
api_key="not-needed",
|
||||
check_embedding_ctx_length=False,
|
||||
)
|
||||
else:
|
||||
embeddings = OpenAIEmbeddings(model=_embedding_model)
|
||||
|
||||
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
|
||||
_db_url = os.getenv(
|
||||
"DATABASE_URL", "postgres://raggr:raggr_dev_password@localhost:5432/raggr"
|
||||
)
|
||||
_pgvector_url = _db_url.replace("postgres://", "postgresql+psycopg://")
|
||||
|
||||
# Lazy-initialized vector store (defers DB connection to first use)
|
||||
_vector_store = None
|
||||
|
||||
|
||||
def _get_vector_store() -> PGVector:
|
||||
global _vector_store
|
||||
if _vector_store is None:
|
||||
_vector_store = PGVector(
|
||||
embeddings=embeddings,
|
||||
vector_store = Chroma(
|
||||
collection_name="simba_docs",
|
||||
connection=_pgvector_url,
|
||||
use_jsonb=True,
|
||||
create_extension=False, # created by docker init script
|
||||
async_mode=True,
|
||||
)
|
||||
return _vector_store
|
||||
|
||||
|
||||
def _get_engine():
|
||||
"""Get a SQLAlchemy engine for direct queries."""
|
||||
if not hasattr(_get_engine, "_engine"):
|
||||
_get_engine._engine = create_engine(_pgvector_url)
|
||||
return _get_engine._engine
|
||||
|
||||
embedding_function=embeddings,
|
||||
persist_directory=os.getenv("CHROMADB_PATH", ""),
|
||||
)
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=1000, # chunk size (characters)
|
||||
@@ -69,22 +28,6 @@ text_splitter = RecursiveCharacterTextSplitter(
|
||||
)
|
||||
|
||||
|
||||
def _get_collection_id():
|
||||
"""Get the UUID of our collection from the langchain_pg_collection table."""
|
||||
engine = _get_engine()
|
||||
try:
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text("SELECT uuid FROM langchain_pg_collection WHERE name = :name"),
|
||||
{"name": "simba_docs"},
|
||||
)
|
||||
row = result.fetchone()
|
||||
return row[0] if row else None
|
||||
except Exception:
|
||||
# Table doesn't exist yet (first run before any indexing)
|
||||
return None
|
||||
|
||||
|
||||
def date_to_epoch(date_str: str) -> float:
|
||||
split_date = date_str.split("-")
|
||||
date = datetime.datetime(
|
||||
@@ -115,43 +58,12 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
|
||||
return documents
|
||||
|
||||
|
||||
def _sanitize_text(text_content: str) -> str:
|
||||
"""Strip non-printable and invalid characters that break embedding tokenizers."""
|
||||
# Remove null bytes and control characters (keep newlines and tabs)
|
||||
text_content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text_content)
|
||||
# Remove Unicode surrogates and other problematic Unicode
|
||||
text_content = re.sub(r"[\ud800-\udfff\ufffe\uffff]", "", text_content)
|
||||
# Remove replacement character clusters
|
||||
text_content = text_content.replace("\ufffd", "")
|
||||
# Collapse excessive whitespace
|
||||
text_content = re.sub(r" {3,}", " ", text_content)
|
||||
return text_content.strip()
|
||||
|
||||
|
||||
def _sanitize_documents(documents: list[Document]) -> list[Document]:
|
||||
"""Sanitize page_content of all documents for embedding compatibility."""
|
||||
for doc in documents:
|
||||
doc.page_content = _sanitize_text(doc.page_content)
|
||||
return [doc for doc in documents if doc.page_content]
|
||||
|
||||
|
||||
async def index_documents():
|
||||
"""Index Paperless-NGX documents into vector store."""
|
||||
documents = await fetch_documents_from_paperless_ngx()
|
||||
|
||||
splits = text_splitter.split_documents(documents)
|
||||
splits = _sanitize_documents(splits)
|
||||
logger.info(f"Indexing {len(splits)} chunks from {len(documents)} documents")
|
||||
vector_store = _get_vector_store()
|
||||
for i, split in enumerate(splits):
|
||||
try:
|
||||
await vector_store.aadd_documents(documents=[split])
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to embed chunk {i} from {split.metadata.get('filename', 'unknown')}: {e}"
|
||||
)
|
||||
logger.debug(f"Chunk content preview: {split.page_content[:200]!r}")
|
||||
raise
|
||||
await vector_store.aadd_documents(documents=splits)
|
||||
|
||||
|
||||
async def fetch_obsidian_documents() -> list[Document]:
|
||||
@@ -180,17 +92,13 @@ async def fetch_obsidian_documents() -> list[Document]:
|
||||
"filepath": parsed["filepath"],
|
||||
"tags": parsed["tags"],
|
||||
"created_at": parsed["metadata"].get("created_at"),
|
||||
**{
|
||||
k: v
|
||||
for k, v in parsed["metadata"].items()
|
||||
if k not in ["created_at", "created_by"]
|
||||
},
|
||||
**{k: v for k, v in parsed["metadata"].items() if k not in ["created_at", "created_by"]},
|
||||
},
|
||||
)
|
||||
documents.append(document)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error reading {md_path}: {e}")
|
||||
print(f"Error reading {md_path}: {e}")
|
||||
continue
|
||||
|
||||
return documents
|
||||
@@ -201,26 +109,26 @@ async def index_obsidian_documents():
|
||||
|
||||
Deletes existing obsidian source chunks before re-indexing.
|
||||
"""
|
||||
obsidian_service = ObsidianService()
|
||||
documents = await fetch_obsidian_documents()
|
||||
|
||||
if not documents:
|
||||
logger.info("No Obsidian documents found to index")
|
||||
print("No Obsidian documents found to index")
|
||||
return {"indexed": 0}
|
||||
|
||||
# Delete existing obsidian chunks
|
||||
delete_documents_by_metadata("source", "obsidian")
|
||||
existing_results = vector_store.get(where={"source": "obsidian"})
|
||||
if existing_results.get("ids"):
|
||||
await vector_store.adelete(existing_results["ids"])
|
||||
|
||||
# Split, sanitize, and index documents
|
||||
# Split and index documents
|
||||
splits = text_splitter.split_documents(documents)
|
||||
splits = _sanitize_documents(splits)
|
||||
vector_store = _get_vector_store()
|
||||
await vector_store.aadd_documents(documents=splits)
|
||||
|
||||
return {"indexed": len(documents)}
|
||||
|
||||
|
||||
async def query_vector_store(query: str):
|
||||
vector_store = _get_vector_store()
|
||||
retrieved_docs = await vector_store.asimilarity_search(query, k=2)
|
||||
serialized = "\n\n".join(
|
||||
(f"Source: {doc.metadata}\nContent: {doc.page_content}")
|
||||
@@ -229,79 +137,32 @@ async def query_vector_store(query: str):
|
||||
return serialized, retrieved_docs
|
||||
|
||||
|
||||
def delete_all_documents():
|
||||
"""Delete all documents from the vector store collection."""
|
||||
collection_id = _get_collection_id()
|
||||
if not collection_id:
|
||||
return
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
conn.execute(
|
||||
text("DELETE FROM langchain_pg_embedding WHERE collection_id = :cid"),
|
||||
{"cid": collection_id},
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def delete_documents_by_metadata(key: str, value: str):
|
||||
"""Delete documents matching a metadata key/value pair."""
|
||||
collection_id = _get_collection_id()
|
||||
if not collection_id:
|
||||
return
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
conn.execute(
|
||||
text(
|
||||
"DELETE FROM langchain_pg_embedding "
|
||||
"WHERE collection_id = :cid AND cmetadata->>:key = :value"
|
||||
),
|
||||
{"cid": collection_id, "key": key, "value": value},
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def get_vector_store_stats():
|
||||
"""Get statistics about the vector store."""
|
||||
collection_id = _get_collection_id()
|
||||
count = 0
|
||||
if collection_id:
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text(
|
||||
"SELECT COUNT(*) FROM langchain_pg_embedding WHERE collection_id = :cid"
|
||||
),
|
||||
{"cid": collection_id},
|
||||
)
|
||||
count = result.scalar()
|
||||
collection = vector_store._collection
|
||||
count = collection.count()
|
||||
return {
|
||||
"total_documents": count,
|
||||
"collection_name": "simba_docs",
|
||||
"collection_name": collection.name,
|
||||
}
|
||||
|
||||
|
||||
def list_all_documents(limit: int = 10):
|
||||
"""List documents in the vector store with their metadata."""
|
||||
collection_id = _get_collection_id()
|
||||
if not collection_id:
|
||||
return []
|
||||
collection = vector_store._collection
|
||||
results = collection.get(limit=limit, include=["metadatas", "documents"])
|
||||
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text(
|
||||
"SELECT id, document, cmetadata FROM langchain_pg_embedding "
|
||||
"WHERE collection_id = :cid LIMIT :limit"
|
||||
),
|
||||
{"cid": collection_id, "limit": limit},
|
||||
)
|
||||
documents = []
|
||||
for row in result:
|
||||
for i, doc_id in enumerate(results["ids"]):
|
||||
documents.append(
|
||||
{
|
||||
"id": str(row[0]),
|
||||
"metadata": row[2],
|
||||
"content_preview": row[1][:200] if row[1] else None,
|
||||
"id": doc_id,
|
||||
"metadata": results["metadatas"][i]
|
||||
if results.get("metadatas")
|
||||
else None,
|
||||
"content_preview": results["documents"][i][:200]
|
||||
if results.get("documents")
|
||||
else None,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
+4
-4
@@ -2,7 +2,7 @@ version: "3.8"
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: pgvector/pgvector:pg16
|
||||
image: postgres:16-alpine
|
||||
ports:
|
||||
- "5432:5432"
|
||||
environment:
|
||||
@@ -11,7 +11,6 @@ services:
|
||||
- POSTGRES_DB=${POSTGRES_DB:-raggr}
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./docker/init-pgvector.sql:/docker-entrypoint-initdb.d/init-pgvector.sql
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-raggr}"]
|
||||
interval: 10s
|
||||
@@ -30,9 +29,8 @@ services:
|
||||
- PAPERLESS_TOKEN=${PAPERLESS_TOKEN}
|
||||
- BASE_URL=${BASE_URL}
|
||||
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
|
||||
- CHROMADB_PATH=/app/data/chromadb
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
- EMBEDDING_SERVER_URL=${EMBEDDING_SERVER_URL}
|
||||
- EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME}
|
||||
- JWT_SECRET_KEY=${JWT_SECRET_KEY}
|
||||
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
|
||||
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}
|
||||
@@ -68,8 +66,10 @@ services:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- chromadb_data:/app/data/chromadb
|
||||
- ./obvault:/app/data/obsidian
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
chromadb_data:
|
||||
postgres_data:
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
@@ -0,0 +1,278 @@
|
||||
import argparse
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import time
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import chromadb
|
||||
from utils.chunker import Chunker
|
||||
from utils.cleaner import pdf_to_image, summarize_pdf_image
|
||||
from llm import LLMClient
|
||||
from scripts.query import QueryGenerator
|
||||
from utils.request import PaperlessNGXService
|
||||
|
||||
_dotenv_loaded = load_dotenv()
|
||||
|
||||
client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
|
||||
simba_docs = client.get_or_create_collection(name="simba_docs2")
|
||||
feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="An LLM tool to query information about Simba <3"
|
||||
)
|
||||
|
||||
parser.add_argument("query", type=str, help="questions about simba's health")
|
||||
parser.add_argument(
|
||||
"--reindex", action="store_true", help="re-index the simba documents"
|
||||
)
|
||||
parser.add_argument("--classify", action="store_true", help="test classification")
|
||||
parser.add_argument("--index", help="index a file")
|
||||
|
||||
ppngx = PaperlessNGXService()
|
||||
|
||||
llm_client = LLMClient()
|
||||
|
||||
|
||||
def index_using_pdf_llm(doctypes):
|
||||
logging.info("reindex data...")
|
||||
files = ppngx.get_data()
|
||||
for file in files:
|
||||
document_id: int = file["id"]
|
||||
pdf_path = ppngx.download_pdf_from_id(id=document_id)
|
||||
image_paths = pdf_to_image(filepath=pdf_path)
|
||||
logging.info(f"summarizing {file}")
|
||||
generated_summary = summarize_pdf_image(filepaths=image_paths)
|
||||
file["content"] = generated_summary
|
||||
|
||||
chunk_data(files, simba_docs, doctypes=doctypes)
|
||||
|
||||
|
||||
def date_to_epoch(date_str: str) -> float:
|
||||
split_date = date_str.split("-")
|
||||
date = datetime.datetime(
|
||||
int(split_date[0]),
|
||||
int(split_date[1]),
|
||||
int(split_date[2]),
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
)
|
||||
|
||||
return date.timestamp()
|
||||
|
||||
|
||||
def chunk_data(docs, collection, doctypes):
|
||||
# Step 2: Create chunks
|
||||
chunker = Chunker(collection)
|
||||
|
||||
logging.info(f"chunking {len(docs)} documents")
|
||||
texts: list[str] = [doc["content"] for doc in docs]
|
||||
with sqlite3.connect("database/visited.db") as conn:
|
||||
to_insert = []
|
||||
c = conn.cursor()
|
||||
for index, text in enumerate(texts):
|
||||
metadata = {
|
||||
"created_date": date_to_epoch(docs[index]["created_date"]),
|
||||
"filename": docs[index]["original_file_name"],
|
||||
"document_type": doctypes.get(docs[index]["document_type"], ""),
|
||||
}
|
||||
|
||||
if doctypes:
|
||||
metadata["type"] = doctypes.get(docs[index]["document_type"])
|
||||
|
||||
chunker.chunk_document(
|
||||
document=text,
|
||||
metadata=metadata,
|
||||
)
|
||||
to_insert.append((docs[index]["id"],))
|
||||
|
||||
c.executemany(
|
||||
"INSERT INTO indexed_documents (paperless_id) values (?)", to_insert
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def chunk_text(texts: list[str], collection):
|
||||
chunker = Chunker(collection)
|
||||
|
||||
for index, text in enumerate(texts):
|
||||
metadata = {}
|
||||
chunker.chunk_document(
|
||||
document=text,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
def classify_query(query: str, transcript: str) -> bool:
|
||||
logging.info("Starting query generation")
|
||||
qg_start = time.time()
|
||||
qg = QueryGenerator()
|
||||
query_type = qg.get_query_type(input=query, transcript=transcript)
|
||||
logging.info(query_type)
|
||||
qg_end = time.time()
|
||||
logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")
|
||||
return query_type == "Simba"
|
||||
|
||||
|
||||
def consult_oracle(
|
||||
input: str,
|
||||
collection,
|
||||
transcript: str = "",
|
||||
):
|
||||
chunker = Chunker(collection)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# Ask
|
||||
logging.info("Starting query generation")
|
||||
qg_start = time.time()
|
||||
qg = QueryGenerator()
|
||||
doctype_query = qg.get_doctype_query(input=input)
|
||||
# metadata_filter = qg.get_query(input)
|
||||
metadata_filter = {**doctype_query}
|
||||
logging.info(metadata_filter)
|
||||
qg_end = time.time()
|
||||
logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")
|
||||
|
||||
logging.info("Starting embedding generation")
|
||||
embedding_start = time.time()
|
||||
embeddings = chunker.embedding_fx(inputs=[input])
|
||||
embedding_end = time.time()
|
||||
logging.info(
|
||||
f"Embedding generation took {embedding_end - embedding_start:.2f} seconds"
|
||||
)
|
||||
|
||||
logging.info("Starting collection query")
|
||||
query_start = time.time()
|
||||
results = collection.query(
|
||||
query_texts=[input],
|
||||
query_embeddings=embeddings,
|
||||
where=metadata_filter,
|
||||
)
|
||||
query_end = time.time()
|
||||
logging.info(f"Collection query took {query_end - query_start:.2f} seconds")
|
||||
|
||||
# Generate
|
||||
logging.info("Starting LLM generation")
|
||||
llm_start = time.time()
|
||||
system_prompt = "You are a helpful assistant that understands veterinary terms."
|
||||
transcript_prompt = f"Here is the message transcript thus far {transcript}."
|
||||
prompt = f"""Using the following data, help answer the user's query by providing as many details as possible.
|
||||
Using this data: {results}. {transcript_prompt if len(transcript) > 0 else ""}
|
||||
Respond to this prompt: {input}"""
|
||||
output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
|
||||
llm_end = time.time()
|
||||
logging.info(f"LLM generation took {llm_end - llm_start:.2f} seconds")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
logging.info(f"Total consult_oracle execution took {total_time:.2f} seconds")
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def llm_chat(input: str, transcript: str = "") -> str:
|
||||
system_prompt = "You are a helpful assistant that understands veterinary terms."
|
||||
transcript_prompt = f"Here is the message transcript thus far {transcript}."
|
||||
prompt = f"""Answer the user in as if you were a cat named Simba. Don't act too catlike. Be assertive.
|
||||
{transcript_prompt if len(transcript) > 0 else ""}
|
||||
Respond to this prompt: {input}"""
|
||||
output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
|
||||
return output
|
||||
|
||||
|
||||
def paperless_workflow(input):
|
||||
# Step 1: Get the text
|
||||
ppngx = PaperlessNGXService()
|
||||
docs = ppngx.get_data()
|
||||
|
||||
chunk_data(docs, collection=simba_docs)
|
||||
consult_oracle(input, simba_docs)
|
||||
|
||||
|
||||
def consult_simba_oracle(input: str, transcript: str = ""):
|
||||
is_simba_related = classify_query(query=input, transcript=transcript)
|
||||
|
||||
if is_simba_related:
|
||||
logging.info("Query is related to simba")
|
||||
return consult_oracle(
|
||||
input=input,
|
||||
collection=simba_docs,
|
||||
transcript=transcript,
|
||||
)
|
||||
|
||||
logging.info("Query is NOT related to simba")
|
||||
|
||||
return llm_chat(input=input, transcript=transcript)
|
||||
|
||||
|
||||
def filter_indexed_files(docs):
|
||||
with sqlite3.connect("database/visited.db") as conn:
|
||||
c = conn.cursor()
|
||||
c.execute(
|
||||
"CREATE TABLE IF NOT EXISTS indexed_documents (id INTEGER PRIMARY KEY AUTOINCREMENT, paperless_id INTEGER)"
|
||||
)
|
||||
c.execute("SELECT paperless_id FROM indexed_documents")
|
||||
rows = c.fetchall()
|
||||
conn.commit()
|
||||
|
||||
visited = {row[0] for row in rows}
|
||||
return [doc for doc in docs if doc["id"] not in visited]
|
||||
|
||||
|
||||
def reindex():
|
||||
with sqlite3.connect("database/visited.db") as conn:
|
||||
c = conn.cursor()
|
||||
# Ensure the table exists before trying to delete from it
|
||||
c.execute(
|
||||
"CREATE TABLE IF NOT EXISTS indexed_documents (id INTEGER PRIMARY KEY AUTOINCREMENT, paperless_id INTEGER)"
|
||||
)
|
||||
c.execute("DELETE FROM indexed_documents")
|
||||
conn.commit()
|
||||
|
||||
# Delete all documents from the collection
|
||||
all_docs = simba_docs.get()
|
||||
if all_docs["ids"]:
|
||||
simba_docs.delete(ids=all_docs["ids"])
|
||||
|
||||
logging.info("Fetching documents from Paperless-NGX")
|
||||
ppngx = PaperlessNGXService()
|
||||
docs = ppngx.get_data()
|
||||
docs = filter_indexed_files(docs)
|
||||
logging.info(f"Fetched {len(docs)} documents")
|
||||
|
||||
# Delete all chromadb data
|
||||
ids = simba_docs.get(ids=None, limit=None, offset=0)
|
||||
all_ids = ids["ids"]
|
||||
if len(all_ids) > 0:
|
||||
simba_docs.delete(ids=all_ids)
|
||||
|
||||
# Chunk documents
|
||||
logging.info("Chunking documents now ...")
|
||||
doctype_lookup = ppngx.get_doctypes()
|
||||
chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
|
||||
logging.info("Done chunking documents")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
if args.reindex:
|
||||
reindex()
|
||||
|
||||
if args.classify:
|
||||
consult_simba_oracle(input="yohohoho testing")
|
||||
consult_simba_oracle(input="write an email")
|
||||
consult_simba_oracle(input="how much does simba weigh")
|
||||
|
||||
if args.query:
|
||||
logging.info("Consulting oracle ...")
|
||||
print(
|
||||
consult_oracle(
|
||||
input=args.query,
|
||||
collection=simba_docs,
|
||||
)
|
||||
)
|
||||
else:
|
||||
logging.info("please provide a query")
|
||||
+2
-2
@@ -5,8 +5,7 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"langchain-postgres>=0.0.13",
|
||||
"psycopg[binary]>=3.1.0",
|
||||
"chromadb>=1.1.0",
|
||||
"python-dotenv>=1.0.0",
|
||||
"flask>=3.1.2",
|
||||
"httpx>=0.28.1",
|
||||
@@ -31,6 +30,7 @@ dependencies = [
|
||||
"asyncpg>=0.30.0",
|
||||
"langchain-openai>=1.1.6",
|
||||
"langchain>=1.2.0",
|
||||
"langchain-chroma>=1.0.0",
|
||||
"langchain-community>=0.4.1",
|
||||
"jq>=1.10.0",
|
||||
"tavily-python>=0.7.17",
|
||||
|
||||
@@ -1,13 +1,48 @@
|
||||
import { useState, useEffect } from "react";
|
||||
|
||||
import "./App.css";
|
||||
import { AuthProvider } from "./contexts/AuthContext";
|
||||
import { ChatScreen } from "./components/ChatScreen";
|
||||
import { LoginScreen } from "./components/LoginScreen";
|
||||
import { useAuthCheck } from "./hooks/useAuthCheck";
|
||||
import { conversationService } from "./api/conversationService";
|
||||
import catIcon from "./assets/cat.png";
|
||||
|
||||
const AppContainer = () => {
|
||||
const { isAuthenticated, isChecking, isAdmin, setAuthenticated } = useAuthCheck();
|
||||
const [isAuthenticated, setAuthenticated] = useState<boolean>(false);
|
||||
const [isChecking, setIsChecking] = useState<boolean>(true);
|
||||
|
||||
useEffect(() => {
|
||||
const checkAuth = async () => {
|
||||
const accessToken = localStorage.getItem("access_token");
|
||||
const refreshToken = localStorage.getItem("refresh_token");
|
||||
|
||||
// No tokens at all, not authenticated
|
||||
if (!accessToken && !refreshToken) {
|
||||
setIsChecking(false);
|
||||
setAuthenticated(false);
|
||||
return;
|
||||
}
|
||||
|
||||
// Try to verify token by making a request
|
||||
try {
|
||||
await conversationService.getAllConversations();
|
||||
// If successful, user is authenticated
|
||||
setAuthenticated(true);
|
||||
} catch (error) {
|
||||
// Token is invalid or expired
|
||||
console.error("Authentication check failed:", error);
|
||||
localStorage.removeItem("access_token");
|
||||
localStorage.removeItem("refresh_token");
|
||||
setAuthenticated(false);
|
||||
} finally {
|
||||
setIsChecking(false);
|
||||
}
|
||||
};
|
||||
|
||||
checkAuth();
|
||||
}, []);
|
||||
|
||||
// Show loading state while checking authentication
|
||||
if (isChecking) {
|
||||
return (
|
||||
<div className="h-screen flex flex-col items-center justify-center bg-cream gap-4">
|
||||
@@ -26,7 +61,7 @@ const AppContainer = () => {
|
||||
return (
|
||||
<>
|
||||
{isAuthenticated ? (
|
||||
<ChatScreen setAuthenticated={setAuthenticated} isAdmin={isAdmin} />
|
||||
<ChatScreen setAuthenticated={setAuthenticated} />
|
||||
) : (
|
||||
<LoginScreen setAuthenticated={setAuthenticated} />
|
||||
)}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { useState } from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { X, Phone, PhoneOff, Pencil, Check, Mail, Copy } from "lucide-react";
|
||||
import { userService, type AdminUserRecord } from "../api/userService";
|
||||
import { cn } from "../lib/utils";
|
||||
@@ -12,19 +12,27 @@ import {
|
||||
TableHeader,
|
||||
TableRow,
|
||||
} from "./ui/table";
|
||||
import { useAdminUsers } from "../hooks/useAdminUsers";
|
||||
|
||||
type Props = {
|
||||
onClose: () => void;
|
||||
};
|
||||
|
||||
export const AdminPanel = ({ onClose }: Props) => {
|
||||
const { users, loading, updateUser } = useAdminUsers();
|
||||
const [users, setUsers] = useState<AdminUserRecord[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [editingId, setEditingId] = useState<string | null>(null);
|
||||
const [editValue, setEditValue] = useState("");
|
||||
const [rowError, setRowError] = useState<Record<string, string>>({});
|
||||
const [rowSuccess, setRowSuccess] = useState<Record<string, string>>({});
|
||||
|
||||
useEffect(() => {
|
||||
userService
|
||||
.adminListUsers()
|
||||
.then(setUsers)
|
||||
.catch(() => {})
|
||||
.finally(() => setLoading(false));
|
||||
}, []);
|
||||
|
||||
const startEdit = (user: AdminUserRecord) => {
|
||||
setEditingId(user.id);
|
||||
setEditValue(user.whatsapp_number ?? "");
|
||||
@@ -41,8 +49,8 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
setRowError((p) => ({ ...p, [userId]: "" }));
|
||||
try {
|
||||
const updated = await userService.adminSetWhatsapp(userId, editValue);
|
||||
updateUser(userId, () => updated);
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Saved" }));
|
||||
setUsers((p) => p.map((u) => (u.id === userId ? updated : u)));
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Saved ✓" }));
|
||||
setEditingId(null);
|
||||
setTimeout(() => setRowSuccess((p) => ({ ...p, [userId]: "" })), 2000);
|
||||
} catch (err) {
|
||||
@@ -57,8 +65,10 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
setRowError((p) => ({ ...p, [userId]: "" }));
|
||||
try {
|
||||
await userService.adminUnlinkWhatsapp(userId);
|
||||
updateUser(userId, (u) => ({ ...u, whatsapp_number: null }));
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Unlinked" }));
|
||||
setUsers((p) =>
|
||||
p.map((u) => (u.id === userId ? { ...u, whatsapp_number: null } : u)),
|
||||
);
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Unlinked ✓" }));
|
||||
setTimeout(() => setRowSuccess((p) => ({ ...p, [userId]: "" })), 2000);
|
||||
} catch (err) {
|
||||
setRowError((p) => ({
|
||||
@@ -72,8 +82,8 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
setRowError((p) => ({ ...p, [userId]: "" }));
|
||||
try {
|
||||
const updated = await userService.adminToggleEmail(userId);
|
||||
updateUser(userId, () => updated);
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Email enabled" }));
|
||||
setUsers((p) => p.map((u) => (u.id === userId ? updated : u)));
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Email enabled ✓" }));
|
||||
setTimeout(() => setRowSuccess((p) => ({ ...p, [userId]: "" })), 2000);
|
||||
} catch (err) {
|
||||
setRowError((p) => ({
|
||||
@@ -87,8 +97,10 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
setRowError((p) => ({ ...p, [userId]: "" }));
|
||||
try {
|
||||
await userService.adminDisableEmail(userId);
|
||||
updateUser(userId, (u) => ({ ...u, email_enabled: false, email_address: null }));
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Email disabled" }));
|
||||
setUsers((p) =>
|
||||
p.map((u) => (u.id === userId ? { ...u, email_enabled: false, email_address: null } : u)),
|
||||
);
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Email disabled ✓" }));
|
||||
setTimeout(() => setRowSuccess((p) => ({ ...p, [userId]: "" })), 2000);
|
||||
} catch (err) {
|
||||
setRowError((p) => ({
|
||||
@@ -100,7 +112,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
|
||||
const copyToClipboard = (text: string, userId: string) => {
|
||||
navigator.clipboard.writeText(text);
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Copied" }));
|
||||
setRowSuccess((p) => ({ ...p, [userId]: "Copied ✓" }));
|
||||
setTimeout(() => setRowSuccess((p) => ({ ...p, [userId]: "" })), 2000);
|
||||
};
|
||||
|
||||
@@ -116,6 +128,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
"border border-sand-light/60",
|
||||
)}
|
||||
>
|
||||
{/* Header */}
|
||||
<div className="flex items-center justify-between px-6 py-4 border-b border-sand-light/60">
|
||||
<div className="flex items-center gap-2.5">
|
||||
<div className="w-8 h-8 rounded-xl bg-leaf-pale flex items-center justify-center">
|
||||
@@ -133,6 +146,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Body */}
|
||||
<div className="overflow-y-auto flex-1 rounded-b-3xl">
|
||||
{loading ? (
|
||||
<div className="px-6 py-12 text-center text-warm-gray text-sm">
|
||||
@@ -141,7 +155,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
<span className="loading-dot w-2 h-2 rounded-full bg-amber-soft inline-block" />
|
||||
<span className="loading-dot w-2 h-2 rounded-full bg-amber-soft inline-block" />
|
||||
</div>
|
||||
Loading users...
|
||||
Loading users…
|
||||
</div>
|
||||
) : (
|
||||
<Table>
|
||||
@@ -190,7 +204,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
: "text-warm-gray/40 italic",
|
||||
)}
|
||||
>
|
||||
{user.whatsapp_number ?? "\u2014"}
|
||||
{user.whatsapp_number ?? "—"}
|
||||
</span>
|
||||
{rowSuccess[user.id] && (
|
||||
<span className="text-xs text-leaf-dark">
|
||||
@@ -221,7 +235,7 @@ export const AdminPanel = ({ onClose }: Props) => {
|
||||
</button>
|
||||
</div>
|
||||
) : (
|
||||
<span className="text-sm text-warm-gray/40 italic">\u2014</span>
|
||||
<span className="text-sm text-warm-gray/40 italic">—</span>
|
||||
)}
|
||||
</div>
|
||||
</TableCell>
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import React from "react";
|
||||
import ReactMarkdown from "react-markdown";
|
||||
import { cn } from "../lib/utils";
|
||||
|
||||
@@ -7,7 +6,7 @@ type AnswerBubbleProps = {
|
||||
loading?: boolean;
|
||||
};
|
||||
|
||||
export const AnswerBubble = React.memo(({ text, loading }: AnswerBubbleProps) => {
|
||||
export const AnswerBubble = ({ text, loading }: AnswerBubbleProps) => {
|
||||
return (
|
||||
<div className="flex justify-start message-enter">
|
||||
<div
|
||||
@@ -18,6 +17,7 @@ export const AnswerBubble = React.memo(({ text, loading }: AnswerBubbleProps) =>
|
||||
"overflow-hidden",
|
||||
)}
|
||||
>
|
||||
{/* amber accent bar */}
|
||||
<div className="h-0.5 w-full bg-gradient-to-r from-amber-soft via-amber-glow/50 to-transparent" />
|
||||
|
||||
<div className="px-4 py-3">
|
||||
@@ -36,4 +36,4 @@ export const AnswerBubble = React.memo(({ text, loading }: AnswerBubbleProps) =>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
});
|
||||
};
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import { useCallback, useState, useRef } from "react";
|
||||
import { useCallback, useEffect, useState, useRef } from "react";
|
||||
import { LogOut, Shield, PanelLeftClose, PanelLeftOpen, Menu, X } from "lucide-react";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
import { userService } from "../api/userService";
|
||||
import { QuestionBubble } from "./QuestionBubble";
|
||||
import { AnswerBubble } from "./AnswerBubble";
|
||||
import { ToolBubble } from "./ToolBubble";
|
||||
@@ -7,79 +9,205 @@ import { MessageInput } from "./MessageInput";
|
||||
import { ConversationList } from "./ConversationList";
|
||||
import { AdminPanel } from "./AdminPanel";
|
||||
import { cn } from "../lib/utils";
|
||||
import { useConversations } from "../hooks/useConversations";
|
||||
import { useChat } from "../hooks/useChat";
|
||||
import catIcon from "../assets/cat.png";
|
||||
|
||||
type Message = {
|
||||
text: string;
|
||||
speaker: "simba" | "user" | "tool";
|
||||
image_key?: string | null;
|
||||
};
|
||||
|
||||
type Conversation = {
|
||||
title: string;
|
||||
id: string;
|
||||
};
|
||||
|
||||
type ChatScreenProps = {
|
||||
setAuthenticated: (isAuth: boolean) => void;
|
||||
isAdmin: boolean;
|
||||
};
|
||||
|
||||
export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
const [query, setQuery] = useState("");
|
||||
const [simbaMode, setSimbaMode] = useState(false);
|
||||
const [showConversations, setShowConversations] = useState(false);
|
||||
const [sidebarCollapsed, setSidebarCollapsed] = useState(false);
|
||||
const [showAdminPanel, setShowAdminPanel] = useState(false);
|
||||
const TOOL_MESSAGES: Record<string, string> = {
|
||||
simba_search: "🔍 Searching Simba's records...",
|
||||
web_search: "🌐 Searching the web...",
|
||||
get_current_date: "📅 Checking today's date...",
|
||||
ynab_budget_summary: "💰 Checking budget summary...",
|
||||
ynab_search_transactions: "💳 Looking up transactions...",
|
||||
ynab_category_spending: "📊 Analyzing category spending...",
|
||||
ynab_insights: "📈 Generating budget insights...",
|
||||
obsidian_search_notes: "📝 Searching notes...",
|
||||
obsidian_read_note: "📖 Reading note...",
|
||||
obsidian_create_note: "✏️ Saving note...",
|
||||
obsidian_create_task: "✅ Creating task...",
|
||||
journal_get_today: "📔 Reading today's journal...",
|
||||
journal_get_tasks: "📋 Getting tasks...",
|
||||
journal_add_task: "➕ Adding task...",
|
||||
journal_complete_task: "✔️ Completing task...",
|
||||
};
|
||||
|
||||
export const ChatScreen = ({ setAuthenticated }: ChatScreenProps) => {
|
||||
const [query, setQuery] = useState<string>("");
|
||||
const [simbaMode, setSimbaMode] = useState<boolean>(false);
|
||||
const [messages, setMessages] = useState<Message[]>([]);
|
||||
const [conversations, setConversations] = useState<Conversation[]>([]);
|
||||
const [showConversations, setShowConversations] = useState<boolean>(false);
|
||||
const [selectedConversation, setSelectedConversation] =
|
||||
useState<Conversation | null>(null);
|
||||
const [sidebarCollapsed, setSidebarCollapsed] = useState<boolean>(false);
|
||||
const [isLoading, setIsLoading] = useState<boolean>(false);
|
||||
const [isAdmin, setIsAdmin] = useState<boolean>(false);
|
||||
const [showAdminPanel, setShowAdminPanel] = useState<boolean>(false);
|
||||
const [pendingImage, setPendingImage] = useState<File | null>(null);
|
||||
|
||||
const messagesEndRef = useRef<HTMLDivElement>(null);
|
||||
const isLoadingRef = useRef(false);
|
||||
const isMountedRef = useRef<boolean>(true);
|
||||
const abortControllerRef = useRef<AbortController | null>(null);
|
||||
const simbaAnswers = ["meow.", "hiss...", "purrrrrr", "yowOWROWWowowr"];
|
||||
|
||||
const scrollToBottom = useCallback(() => {
|
||||
requestAnimationFrame(() => {
|
||||
messagesEndRef.current?.scrollIntoView({
|
||||
behavior: isLoadingRef.current ? "instant" : "smooth",
|
||||
behavior: isLoading ? "instant" : "smooth",
|
||||
});
|
||||
});
|
||||
}, [isLoading]);
|
||||
|
||||
useEffect(() => {
|
||||
isMountedRef.current = true;
|
||||
return () => {
|
||||
isMountedRef.current = false;
|
||||
abortControllerRef.current?.abort();
|
||||
};
|
||||
}, []);
|
||||
|
||||
const {
|
||||
conversations,
|
||||
selectedConversation,
|
||||
selectConversation,
|
||||
createConversation,
|
||||
refreshConversations,
|
||||
} = useConversations();
|
||||
|
||||
const onSessionExpired = useCallback(() => setAuthenticated(false), [setAuthenticated]);
|
||||
|
||||
const {
|
||||
messages,
|
||||
setMessages,
|
||||
isLoading,
|
||||
pendingImage,
|
||||
setPendingImage,
|
||||
sendMessage,
|
||||
} = useChat({
|
||||
selectedConversation,
|
||||
createConversation,
|
||||
refreshConversations,
|
||||
onSessionExpired,
|
||||
scrollToBottom,
|
||||
});
|
||||
|
||||
// Keep ref in sync for scrollToBottom behavior
|
||||
isLoadingRef.current = isLoading;
|
||||
|
||||
const handleSelectConversation = useCallback(
|
||||
async (conversation: { title: string; id: string }) => {
|
||||
const handleSelectConversation = (conversation: Conversation) => {
|
||||
setShowConversations(false);
|
||||
const loaded = await selectConversation(conversation);
|
||||
setMessages(loaded);
|
||||
},
|
||||
[selectConversation, setMessages],
|
||||
setSelectedConversation(conversation);
|
||||
const load = async () => {
|
||||
try {
|
||||
const fetched = await conversationService.getConversation(conversation.id);
|
||||
setMessages(
|
||||
fetched.messages.map((m) => ({ text: m.text, speaker: m.speaker, image_key: m.image_key })),
|
||||
);
|
||||
} catch (err) {
|
||||
console.error("Failed to load messages:", err);
|
||||
}
|
||||
};
|
||||
load();
|
||||
};
|
||||
|
||||
const handleCreateNewConversation = useCallback(async () => {
|
||||
await createConversation();
|
||||
setMessages([]);
|
||||
}, [createConversation, setMessages]);
|
||||
const loadConversations = async () => {
|
||||
try {
|
||||
const fetched = await conversationService.getAllConversations();
|
||||
const parsed = fetched.map((c) => ({ id: c.id, title: c.name }));
|
||||
setConversations(parsed);
|
||||
} catch (err) {
|
||||
console.error("Failed to load conversations:", err);
|
||||
}
|
||||
};
|
||||
|
||||
const handleQuestionSubmit = useCallback(() => {
|
||||
sendMessage(query, simbaMode);
|
||||
const handleCreateNewConversation = async () => {
|
||||
const newConv = await conversationService.createConversation();
|
||||
await loadConversations();
|
||||
setSelectedConversation({ title: newConv.name, id: newConv.id });
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
loadConversations();
|
||||
userService.getMe().then((me) => setIsAdmin(me.is_admin)).catch(() => {});
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
scrollToBottom();
|
||||
}, [messages]);
|
||||
|
||||
const handleQuestionSubmit = useCallback(async () => {
|
||||
if ((!query.trim() && !pendingImage) || isLoading) return;
|
||||
|
||||
let activeConversation = selectedConversation;
|
||||
if (!activeConversation) {
|
||||
const newConv = await conversationService.createConversation();
|
||||
activeConversation = { title: newConv.name, id: newConv.id };
|
||||
setSelectedConversation(activeConversation);
|
||||
setConversations((prev) => [activeConversation!, ...prev]);
|
||||
}
|
||||
|
||||
// Capture pending image before clearing state
|
||||
const imageFile = pendingImage;
|
||||
|
||||
const currMessages = messages.concat([{ text: query, speaker: "user" }]);
|
||||
setMessages(currMessages);
|
||||
setQuery("");
|
||||
}, [query, simbaMode, sendMessage]);
|
||||
setPendingImage(null);
|
||||
setIsLoading(true);
|
||||
|
||||
if (simbaMode) {
|
||||
const randomElement = simbaAnswers[Math.floor(Math.random() * simbaAnswers.length)];
|
||||
setMessages((prev) => prev.concat([{ text: randomElement, speaker: "simba" }]));
|
||||
setIsLoading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
const abortController = new AbortController();
|
||||
abortControllerRef.current = abortController;
|
||||
|
||||
try {
|
||||
// Upload image first if present
|
||||
let imageKey: string | undefined;
|
||||
if (imageFile) {
|
||||
const uploadResult = await conversationService.uploadImage(
|
||||
imageFile,
|
||||
activeConversation.id,
|
||||
);
|
||||
imageKey = uploadResult.image_key;
|
||||
|
||||
// Update the user message with the image key
|
||||
setMessages((prev) => {
|
||||
const updated = [...prev];
|
||||
// Find the last user message we just added
|
||||
for (let i = updated.length - 1; i >= 0; i--) {
|
||||
if (updated[i].speaker === "user") {
|
||||
updated[i] = { ...updated[i], image_key: imageKey };
|
||||
break;
|
||||
}
|
||||
}
|
||||
return updated;
|
||||
});
|
||||
}
|
||||
|
||||
await conversationService.streamQuery(
|
||||
query,
|
||||
activeConversation.id,
|
||||
(event) => {
|
||||
if (!isMountedRef.current) return;
|
||||
if (event.type === "tool_start") {
|
||||
const friendly = TOOL_MESSAGES[event.tool] ?? `🔧 Using ${event.tool}...`;
|
||||
setMessages((prev) => prev.concat([{ text: friendly, speaker: "tool" }]));
|
||||
} else if (event.type === "response") {
|
||||
setMessages((prev) => prev.concat([{ text: event.message, speaker: "simba" }]));
|
||||
} else if (event.type === "error") {
|
||||
console.error("Stream error:", event.message);
|
||||
}
|
||||
},
|
||||
abortController.signal,
|
||||
imageKey,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.name === "AbortError") {
|
||||
console.log("Request was aborted");
|
||||
} else {
|
||||
console.error("Failed to send query:", error);
|
||||
if (error instanceof Error && error.message.includes("Session expired")) {
|
||||
setAuthenticated(false);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (isMountedRef.current) {
|
||||
setIsLoading(false);
|
||||
loadConversations();
|
||||
}
|
||||
abortControllerRef.current = null;
|
||||
}
|
||||
}, [query, pendingImage, isLoading, selectedConversation, simbaMode, messages, setAuthenticated]);
|
||||
|
||||
const handleQueryChange = useCallback((event: React.ChangeEvent<HTMLTextAreaElement>) => {
|
||||
setQuery(event.target.value);
|
||||
@@ -93,8 +221,8 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
}
|
||||
}, [handleQuestionSubmit]);
|
||||
|
||||
const handleImageSelect = useCallback((file: File) => setPendingImage(file), [setPendingImage]);
|
||||
const handleClearImage = useCallback(() => setPendingImage(null), [setPendingImage]);
|
||||
const handleImageSelect = useCallback((file: File) => setPendingImage(file), []);
|
||||
const handleClearImage = useCallback(() => setPendingImage(null), []);
|
||||
|
||||
const handleLogout = () => {
|
||||
localStorage.removeItem("access_token");
|
||||
@@ -104,7 +232,7 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
|
||||
return (
|
||||
<div className="h-screen h-[100dvh] flex flex-row bg-cream overflow-hidden">
|
||||
{/* Desktop Sidebar */}
|
||||
{/* ── Desktop Sidebar ─────────────────────────────── */}
|
||||
<aside
|
||||
className={cn(
|
||||
"hidden md:flex md:flex-col",
|
||||
@@ -113,6 +241,7 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
)}
|
||||
>
|
||||
{sidebarCollapsed ? (
|
||||
/* Collapsed state */
|
||||
<div className="flex flex-col items-center py-4 gap-4 h-full">
|
||||
<button
|
||||
onClick={() => setSidebarCollapsed(false)}
|
||||
@@ -127,7 +256,9 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
/>
|
||||
</div>
|
||||
) : (
|
||||
/* Expanded state */
|
||||
<div className="flex flex-col h-full">
|
||||
{/* Header */}
|
||||
<div className="flex items-center justify-between px-4 py-4 border-b border-white/8">
|
||||
<div className="flex items-center gap-2.5">
|
||||
<img src={catIcon} alt="Simba" className="w-12 h-12" />
|
||||
@@ -146,6 +277,7 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Conversations */}
|
||||
<div className="flex-1 overflow-y-auto px-2 py-3">
|
||||
<ConversationList
|
||||
conversations={conversations}
|
||||
@@ -155,6 +287,7 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Footer */}
|
||||
<div className="px-2 pb-3 pt-2 border-t border-white/8 flex flex-col gap-0.5">
|
||||
{isAdmin && (
|
||||
<button
|
||||
@@ -177,9 +310,12 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
)}
|
||||
</aside>
|
||||
|
||||
{/* Admin Panel modal */}
|
||||
{showAdminPanel && <AdminPanel onClose={() => setShowAdminPanel(false)} />}
|
||||
|
||||
{/* ── Main chat area ──────────────────────────────── */}
|
||||
<div className="flex-1 flex flex-col h-full overflow-hidden min-w-0">
|
||||
{/* Mobile header */}
|
||||
<header className="md:hidden flex items-center justify-between px-4 py-3 bg-warm-white border-b border-sand-light/60">
|
||||
<div className="flex items-center gap-2">
|
||||
<img src={catIcon} alt="Simba" className="w-12 h-12" />
|
||||
@@ -207,7 +343,9 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
</header>
|
||||
|
||||
{messages.length === 0 ? (
|
||||
/* ── Empty / homepage state ── */
|
||||
<div className="flex-1 flex flex-col items-center justify-center px-4 gap-6">
|
||||
{/* Mobile conversation drawer */}
|
||||
{showConversations && (
|
||||
<div className="md:hidden w-full max-w-2xl bg-warm-white rounded-2xl border border-sand-light p-3 shadow-sm">
|
||||
<ConversationList
|
||||
@@ -244,9 +382,11 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
/* ── Active chat state ── */
|
||||
<>
|
||||
<div className="flex-1 overflow-y-auto px-4 py-6">
|
||||
<div className="max-w-2xl mx-auto flex flex-col gap-3">
|
||||
{/* Mobile conversation drawer */}
|
||||
{showConversations && (
|
||||
<div className="md:hidden mb-3 bg-warm-white rounded-2xl border border-sand-light p-3 shadow-sm">
|
||||
<ConversationList
|
||||
@@ -282,8 +422,8 @@ export const ChatScreen = ({ setAuthenticated, isAdmin }: ChatScreenProps) => {
|
||||
setSimbaMode={setSimbaMode}
|
||||
isLoading={isLoading}
|
||||
pendingImage={pendingImage}
|
||||
onImageSelect={handleImageSelect}
|
||||
onClearImage={handleClearImage}
|
||||
onImageSelect={(file) => setPendingImage(file)}
|
||||
onClearImage={() => setPendingImage(null)}
|
||||
/>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { Plus } from "lucide-react";
|
||||
import { cn } from "../lib/utils";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
|
||||
type Conversation = {
|
||||
title: string;
|
||||
@@ -21,8 +23,32 @@ export const ConversationList = ({
|
||||
selectedId,
|
||||
variant = "dark",
|
||||
}: ConversationProps) => {
|
||||
const [items, setItems] = useState(conversations);
|
||||
|
||||
useEffect(() => {
|
||||
const load = async () => {
|
||||
try {
|
||||
let fetched = await conversationService.getAllConversations();
|
||||
if (fetched.length === 0) {
|
||||
await conversationService.createConversation();
|
||||
fetched = await conversationService.getAllConversations();
|
||||
}
|
||||
setItems(fetched.map((c) => ({ id: c.id, title: c.name })));
|
||||
} catch (err) {
|
||||
console.error("Failed to load conversations:", err);
|
||||
}
|
||||
};
|
||||
load();
|
||||
}, []);
|
||||
|
||||
// Keep in sync when parent updates conversations
|
||||
useEffect(() => {
|
||||
setItems(conversations);
|
||||
}, [conversations]);
|
||||
|
||||
return (
|
||||
<div className="flex flex-col gap-1">
|
||||
{/* New thread button */}
|
||||
<button
|
||||
onClick={onCreateNewConversation}
|
||||
className={cn(
|
||||
@@ -37,7 +63,8 @@ export const ConversationList = ({
|
||||
<span>New thread</span>
|
||||
</button>
|
||||
|
||||
{conversations.map((conv) => {
|
||||
{/* Conversation items */}
|
||||
{items.map((conv) => {
|
||||
const isActive = conv.id === selectedId;
|
||||
return (
|
||||
<button
|
||||
|
||||
@@ -1,19 +1,66 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { userService } from "../api/userService";
|
||||
import { oidcService } from "../api/oidcService";
|
||||
import catIcon from "../assets/cat.png";
|
||||
import { cn } from "../lib/utils";
|
||||
import { useOIDCAuth } from "../hooks/useOIDCAuth";
|
||||
|
||||
type LoginScreenProps = {
|
||||
setAuthenticated: (isAuth: boolean) => void;
|
||||
};
|
||||
|
||||
export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
const { isChecking, isLoggingIn, error, handleLogin } = useOIDCAuth({
|
||||
setAuthenticated,
|
||||
});
|
||||
const [error, setError] = useState<string>("");
|
||||
const [isChecking, setIsChecking] = useState<boolean>(true);
|
||||
const [isLoggingIn, setIsLoggingIn] = useState<boolean>(false);
|
||||
|
||||
useEffect(() => {
|
||||
const initAuth = async () => {
|
||||
const callbackParams = oidcService.getCallbackParamsFromURL();
|
||||
if (callbackParams) {
|
||||
try {
|
||||
setIsLoggingIn(true);
|
||||
const result = await oidcService.handleCallback(
|
||||
callbackParams.code,
|
||||
callbackParams.state,
|
||||
);
|
||||
localStorage.setItem("access_token", result.access_token);
|
||||
localStorage.setItem("refresh_token", result.refresh_token);
|
||||
oidcService.clearCallbackParams();
|
||||
setAuthenticated(true);
|
||||
setIsChecking(false);
|
||||
return;
|
||||
} catch (err) {
|
||||
console.error("OIDC callback error:", err);
|
||||
setError("Login failed. Please try again.");
|
||||
oidcService.clearCallbackParams();
|
||||
setIsLoggingIn(false);
|
||||
setIsChecking(false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
const isValid = await userService.validateToken();
|
||||
if (isValid) setAuthenticated(true);
|
||||
setIsChecking(false);
|
||||
};
|
||||
initAuth();
|
||||
}, [setAuthenticated]);
|
||||
|
||||
const handleOIDCLogin = async () => {
|
||||
try {
|
||||
setIsLoggingIn(true);
|
||||
setError("");
|
||||
const authUrl = await oidcService.initiateLogin();
|
||||
window.location.href = authUrl;
|
||||
} catch {
|
||||
setError("Failed to initiate login. Please try again.");
|
||||
setIsLoggingIn(false);
|
||||
}
|
||||
};
|
||||
|
||||
if (isChecking || isLoggingIn) {
|
||||
return (
|
||||
<div className="h-screen flex flex-col items-center justify-center bg-cream gap-4">
|
||||
{/* Subtle dot grid */}
|
||||
<div
|
||||
className="fixed inset-0 pointer-events-none opacity-[0.035]"
|
||||
style={{
|
||||
@@ -38,6 +85,7 @@ export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
|
||||
return (
|
||||
<div className="h-screen bg-cream flex items-center justify-center p-4 relative overflow-hidden">
|
||||
{/* Background dot texture */}
|
||||
<div
|
||||
className="fixed inset-0 pointer-events-none opacity-[0.04]"
|
||||
style={{
|
||||
@@ -46,10 +94,12 @@ export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
}}
|
||||
/>
|
||||
|
||||
{/* Decorative background blobs */}
|
||||
<div className="absolute top-1/4 -left-20 w-72 h-72 rounded-full bg-leaf-pale/60 blur-3xl pointer-events-none" />
|
||||
<div className="absolute bottom-1/4 -right-20 w-64 h-64 rounded-full bg-amber-pale/70 blur-3xl pointer-events-none" />
|
||||
|
||||
<div className="relative w-full max-w-sm">
|
||||
{/* Branding */}
|
||||
<div className="flex flex-col items-center mb-8">
|
||||
<div className="relative mb-5">
|
||||
<div className="absolute -inset-5 bg-amber-soft/30 rounded-full blur-2xl" />
|
||||
@@ -70,6 +120,7 @@ export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Card */}
|
||||
<div
|
||||
className={cn(
|
||||
"bg-warm-white rounded-3xl border border-sand-light",
|
||||
@@ -87,7 +138,7 @@ export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
</p>
|
||||
|
||||
<button
|
||||
onClick={handleLogin}
|
||||
onClick={handleOIDCLogin}
|
||||
disabled={isLoggingIn}
|
||||
className={cn(
|
||||
"w-full py-3.5 px-4 rounded-2xl text-sm font-semibold tracking-wide",
|
||||
@@ -103,7 +154,7 @@ export const LoginScreen = ({ setAuthenticated }: LoginScreenProps) => {
|
||||
</div>
|
||||
|
||||
<p className="text-center text-sand mt-5 text-xs tracking-widest select-none">
|
||||
* meow *
|
||||
✦ meow ✦
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1,14 +1,26 @@
|
||||
import React from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { cn } from "../lib/utils";
|
||||
import { usePresignedUrl } from "../hooks/usePresignedUrl";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
|
||||
type QuestionBubbleProps = {
|
||||
text: string;
|
||||
image_key?: string | null;
|
||||
};
|
||||
|
||||
export const QuestionBubble = React.memo(({ text, image_key }: QuestionBubbleProps) => {
|
||||
const { imageUrl, imageError } = usePresignedUrl(image_key);
|
||||
export const QuestionBubble = ({ text, image_key }: QuestionBubbleProps) => {
|
||||
const [imageUrl, setImageUrl] = useState<string | null>(null);
|
||||
const [imageError, setImageError] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
if (!image_key) return;
|
||||
conversationService
|
||||
.getPresignedImageUrl(image_key)
|
||||
.then(setImageUrl)
|
||||
.catch((err) => {
|
||||
console.error("Failed to load image:", err);
|
||||
setImageError(true);
|
||||
});
|
||||
}, [image_key]);
|
||||
|
||||
return (
|
||||
<div className="flex justify-end message-enter">
|
||||
@@ -22,6 +34,7 @@ export const QuestionBubble = React.memo(({ text, image_key }: QuestionBubblePro
|
||||
>
|
||||
{imageError && (
|
||||
<div className="flex items-center gap-2 text-xs text-charcoal/50 bg-charcoal/5 rounded-xl px-3 py-2 mb-2">
|
||||
<span>🖼️</span>
|
||||
<span>Image failed to load</span>
|
||||
</div>
|
||||
)}
|
||||
@@ -36,4 +49,4 @@ export const QuestionBubble = React.memo(({ text, image_key }: QuestionBubblePro
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
});
|
||||
};
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import React from "react";
|
||||
import { cn } from "../lib/utils";
|
||||
|
||||
export const ToolBubble = React.memo(({ text }: { text: string }) => (
|
||||
export const ToolBubble = ({ text }: { text: string }) => (
|
||||
<div className="flex justify-center message-enter">
|
||||
<div
|
||||
className={cn(
|
||||
@@ -13,4 +12,4 @@ export const ToolBubble = React.memo(({ text }: { text: string }) => (
|
||||
{text}
|
||||
</div>
|
||||
</div>
|
||||
));
|
||||
);
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { userService, type AdminUserRecord } from "../api/userService";
|
||||
|
||||
export function useAdminUsers() {
|
||||
const [users, setUsers] = useState<AdminUserRecord[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
|
||||
useEffect(() => {
|
||||
userService
|
||||
.adminListUsers()
|
||||
.then(setUsers)
|
||||
.catch(() => {})
|
||||
.finally(() => setLoading(false));
|
||||
}, []);
|
||||
|
||||
const updateUser = (userId: string, updater: (u: AdminUserRecord) => AdminUserRecord) => {
|
||||
setUsers((prev) => prev.map((u) => (u.id === userId ? updater(u) : u)));
|
||||
};
|
||||
|
||||
return { users, loading, updateUser };
|
||||
}
|
||||
@@ -1,37 +0,0 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { userService } from "../api/userService";
|
||||
|
||||
export function useAuthCheck() {
|
||||
const [isAuthenticated, setAuthenticated] = useState(false);
|
||||
const [isChecking, setIsChecking] = useState(true);
|
||||
const [isAdmin, setIsAdmin] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
const checkAuth = async () => {
|
||||
const accessToken = localStorage.getItem("access_token");
|
||||
const refreshToken = localStorage.getItem("refresh_token");
|
||||
|
||||
if (!accessToken && !refreshToken) {
|
||||
setIsChecking(false);
|
||||
setAuthenticated(false);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const me = await userService.getMe();
|
||||
setAuthenticated(true);
|
||||
setIsAdmin(me.is_admin);
|
||||
} catch {
|
||||
localStorage.removeItem("access_token");
|
||||
localStorage.removeItem("refresh_token");
|
||||
setAuthenticated(false);
|
||||
} finally {
|
||||
setIsChecking(false);
|
||||
}
|
||||
};
|
||||
|
||||
checkAuth();
|
||||
}, []);
|
||||
|
||||
return { isAuthenticated, isChecking, isAdmin, setAuthenticated };
|
||||
}
|
||||
@@ -1,183 +0,0 @@
|
||||
import { useState, useCallback, useEffect, useRef } from "react";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
import type { Conversation } from "./useConversations";
|
||||
|
||||
type Message = {
|
||||
text: string;
|
||||
speaker: "simba" | "user" | "tool";
|
||||
image_key?: string | null;
|
||||
};
|
||||
|
||||
const TOOL_MESSAGES: Record<string, string> = {
|
||||
simba_search: "Searching Simba's records...",
|
||||
web_search: "Searching the web...",
|
||||
get_current_date: "Checking today's date...",
|
||||
ynab_budget_summary: "Checking budget summary...",
|
||||
ynab_search_transactions: "Looking up transactions...",
|
||||
ynab_category_spending: "Analyzing category spending...",
|
||||
ynab_insights: "Generating budget insights...",
|
||||
obsidian_search_notes: "Searching notes...",
|
||||
obsidian_read_note: "Reading note...",
|
||||
obsidian_create_note: "Saving note...",
|
||||
obsidian_create_task: "Creating task...",
|
||||
journal_get_today: "Reading today's journal...",
|
||||
journal_get_tasks: "Getting tasks...",
|
||||
journal_add_task: "Adding task...",
|
||||
journal_complete_task: "Completing task...",
|
||||
};
|
||||
|
||||
const simbaAnswers = ["meow.", "hiss...", "purrrrrr", "yowOWROWWowowr"];
|
||||
|
||||
type UseChatOptions = {
|
||||
selectedConversation: Conversation | null;
|
||||
createConversation: () => Promise<Conversation>;
|
||||
refreshConversations: () => Promise<void>;
|
||||
onSessionExpired: () => void;
|
||||
scrollToBottom: () => void;
|
||||
};
|
||||
|
||||
export function useChat({
|
||||
selectedConversation,
|
||||
createConversation,
|
||||
refreshConversations,
|
||||
onSessionExpired,
|
||||
scrollToBottom,
|
||||
}: UseChatOptions) {
|
||||
const [messages, setMessages] = useState<Message[]>([]);
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
const [pendingImage, setPendingImage] = useState<File | null>(null);
|
||||
|
||||
const isMountedRef = useRef(true);
|
||||
const abortControllerRef = useRef<AbortController | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
isMountedRef.current = true;
|
||||
return () => {
|
||||
isMountedRef.current = false;
|
||||
abortControllerRef.current?.abort();
|
||||
};
|
||||
}, []);
|
||||
|
||||
const updateMessages = useCallback(
|
||||
(updater: Message[] | ((prev: Message[]) => Message[])) => {
|
||||
setMessages(updater);
|
||||
scrollToBottom();
|
||||
},
|
||||
[scrollToBottom],
|
||||
);
|
||||
|
||||
const sendMessage = useCallback(
|
||||
async (query: string, simbaMode: boolean) => {
|
||||
if ((!query.trim() && !pendingImage) || isLoading) return;
|
||||
|
||||
let activeConversation = selectedConversation;
|
||||
let createdNew = false;
|
||||
if (!activeConversation) {
|
||||
activeConversation = await createConversation();
|
||||
createdNew = true;
|
||||
}
|
||||
|
||||
const imageFile = pendingImage;
|
||||
|
||||
updateMessages((prev) => prev.concat([{ text: query, speaker: "user" }]));
|
||||
setPendingImage(null);
|
||||
setIsLoading(true);
|
||||
|
||||
if (simbaMode) {
|
||||
const randomElement =
|
||||
simbaAnswers[Math.floor(Math.random() * simbaAnswers.length)];
|
||||
updateMessages((prev) =>
|
||||
prev.concat([{ text: randomElement, speaker: "simba" }]),
|
||||
);
|
||||
setIsLoading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
const abortController = new AbortController();
|
||||
abortControllerRef.current = abortController;
|
||||
|
||||
try {
|
||||
let imageKey: string | undefined;
|
||||
if (imageFile) {
|
||||
const uploadResult = await conversationService.uploadImage(
|
||||
imageFile,
|
||||
activeConversation.id,
|
||||
);
|
||||
imageKey = uploadResult.image_key;
|
||||
|
||||
updateMessages((prev) => {
|
||||
const updated = [...prev];
|
||||
for (let i = updated.length - 1; i >= 0; i--) {
|
||||
if (updated[i].speaker === "user") {
|
||||
updated[i] = { ...updated[i], image_key: imageKey };
|
||||
break;
|
||||
}
|
||||
}
|
||||
return updated;
|
||||
});
|
||||
}
|
||||
|
||||
await conversationService.streamQuery(
|
||||
query,
|
||||
activeConversation.id,
|
||||
(event) => {
|
||||
if (!isMountedRef.current) return;
|
||||
if (event.type === "tool_start") {
|
||||
const friendly =
|
||||
TOOL_MESSAGES[event.tool] ?? `Using ${event.tool}...`;
|
||||
updateMessages((prev) =>
|
||||
prev.concat([{ text: friendly, speaker: "tool" }]),
|
||||
);
|
||||
} else if (event.type === "response") {
|
||||
updateMessages((prev) =>
|
||||
prev.concat([{ text: event.message, speaker: "simba" }]),
|
||||
);
|
||||
} else if (event.type === "error") {
|
||||
console.error("Stream error:", event.message);
|
||||
}
|
||||
},
|
||||
abortController.signal,
|
||||
imageKey,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.name === "AbortError") {
|
||||
console.log("Request was aborted");
|
||||
} else {
|
||||
console.error("Failed to send query:", error);
|
||||
if (
|
||||
error instanceof Error &&
|
||||
error.message.includes("Session expired")
|
||||
) {
|
||||
onSessionExpired();
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (isMountedRef.current) {
|
||||
setIsLoading(false);
|
||||
if (createdNew) {
|
||||
refreshConversations();
|
||||
}
|
||||
}
|
||||
abortControllerRef.current = null;
|
||||
}
|
||||
},
|
||||
[
|
||||
pendingImage,
|
||||
isLoading,
|
||||
selectedConversation,
|
||||
createConversation,
|
||||
refreshConversations,
|
||||
onSessionExpired,
|
||||
updateMessages,
|
||||
],
|
||||
);
|
||||
|
||||
return {
|
||||
messages,
|
||||
setMessages: updateMessages,
|
||||
isLoading,
|
||||
pendingImage,
|
||||
setPendingImage,
|
||||
sendMessage,
|
||||
};
|
||||
}
|
||||
@@ -1,69 +0,0 @@
|
||||
import { useState, useCallback, useEffect } from "react";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
|
||||
export type Conversation = {
|
||||
title: string;
|
||||
id: string;
|
||||
};
|
||||
|
||||
type Message = {
|
||||
text: string;
|
||||
speaker: "simba" | "user" | "tool";
|
||||
image_key?: string | null;
|
||||
};
|
||||
|
||||
export function useConversations() {
|
||||
const [conversations, setConversations] = useState<Conversation[]>([]);
|
||||
const [selectedConversation, setSelectedConversation] =
|
||||
useState<Conversation | null>(null);
|
||||
|
||||
const refreshConversations = useCallback(async () => {
|
||||
try {
|
||||
const fetched = await conversationService.getAllConversations();
|
||||
setConversations(fetched.map((c) => ({ id: c.id, title: c.name })));
|
||||
} catch (err) {
|
||||
console.error("Failed to load conversations:", err);
|
||||
}
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
refreshConversations();
|
||||
}, [refreshConversations]);
|
||||
|
||||
const selectConversation = useCallback(
|
||||
async (conversation: Conversation): Promise<Message[]> => {
|
||||
setSelectedConversation(conversation);
|
||||
try {
|
||||
const fetched = await conversationService.getConversation(
|
||||
conversation.id,
|
||||
);
|
||||
return fetched.messages.map((m) => ({
|
||||
text: m.text,
|
||||
speaker: m.speaker,
|
||||
image_key: m.image_key,
|
||||
}));
|
||||
} catch (err) {
|
||||
console.error("Failed to load messages:", err);
|
||||
return [];
|
||||
}
|
||||
},
|
||||
[],
|
||||
);
|
||||
|
||||
const createConversation = useCallback(async (): Promise<Conversation> => {
|
||||
const newConv = await conversationService.createConversation();
|
||||
const conversation = { title: newConv.name, id: newConv.id };
|
||||
setConversations((prev) => [conversation, ...prev]);
|
||||
setSelectedConversation(conversation);
|
||||
return conversation;
|
||||
}, []);
|
||||
|
||||
return {
|
||||
conversations,
|
||||
selectedConversation,
|
||||
setSelectedConversation,
|
||||
selectConversation,
|
||||
createConversation,
|
||||
refreshConversations,
|
||||
};
|
||||
}
|
||||
@@ -1,59 +0,0 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { userService } from "../api/userService";
|
||||
import { oidcService } from "../api/oidcService";
|
||||
|
||||
type UseOIDCAuthOptions = {
|
||||
setAuthenticated: (isAuth: boolean) => void;
|
||||
};
|
||||
|
||||
export function useOIDCAuth({ setAuthenticated }: UseOIDCAuthOptions) {
|
||||
const [isChecking, setIsChecking] = useState(true);
|
||||
const [isLoggingIn, setIsLoggingIn] = useState(false);
|
||||
const [error, setError] = useState("");
|
||||
|
||||
useEffect(() => {
|
||||
const initAuth = async () => {
|
||||
const callbackParams = oidcService.getCallbackParamsFromURL();
|
||||
if (callbackParams) {
|
||||
try {
|
||||
setIsLoggingIn(true);
|
||||
const result = await oidcService.handleCallback(
|
||||
callbackParams.code,
|
||||
callbackParams.state,
|
||||
);
|
||||
localStorage.setItem("access_token", result.access_token);
|
||||
localStorage.setItem("refresh_token", result.refresh_token);
|
||||
oidcService.clearCallbackParams();
|
||||
setAuthenticated(true);
|
||||
setIsChecking(false);
|
||||
return;
|
||||
} catch (err) {
|
||||
console.error("OIDC callback error:", err);
|
||||
setError("Login failed. Please try again.");
|
||||
oidcService.clearCallbackParams();
|
||||
setIsLoggingIn(false);
|
||||
setIsChecking(false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
const isValid = await userService.validateToken();
|
||||
if (isValid) setAuthenticated(true);
|
||||
setIsChecking(false);
|
||||
};
|
||||
initAuth();
|
||||
}, [setAuthenticated]);
|
||||
|
||||
const handleLogin = async () => {
|
||||
try {
|
||||
setIsLoggingIn(true);
|
||||
setError("");
|
||||
const authUrl = await oidcService.initiateLogin();
|
||||
window.location.href = authUrl;
|
||||
} catch {
|
||||
setError("Failed to initiate login. Please try again.");
|
||||
setIsLoggingIn(false);
|
||||
}
|
||||
};
|
||||
|
||||
return { isChecking, isLoggingIn, error, handleLogin };
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
import { useState, useEffect } from "react";
|
||||
import { conversationService } from "../api/conversationService";
|
||||
|
||||
const urlCache = new Map<string, string>();
|
||||
|
||||
export function usePresignedUrl(imageKey: string | null | undefined) {
|
||||
const [imageUrl, setImageUrl] = useState<string | null>(
|
||||
imageKey ? (urlCache.get(imageKey) ?? null) : null,
|
||||
);
|
||||
const [imageError, setImageError] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
if (!imageKey) return;
|
||||
|
||||
const cached = urlCache.get(imageKey);
|
||||
if (cached) {
|
||||
setImageUrl(cached);
|
||||
return;
|
||||
}
|
||||
|
||||
conversationService
|
||||
.getPresignedImageUrl(imageKey)
|
||||
.then((url) => {
|
||||
urlCache.set(imageKey, url);
|
||||
setImageUrl(url);
|
||||
})
|
||||
.catch((err) => {
|
||||
console.error("Failed to load image:", err);
|
||||
setImageError(true);
|
||||
});
|
||||
}, [imageKey]);
|
||||
|
||||
return { imageUrl, imageError };
|
||||
}
|
||||
@@ -6,19 +6,19 @@ import asyncio
|
||||
import sys
|
||||
|
||||
from blueprints.rag.logic import (
|
||||
delete_all_documents,
|
||||
get_vector_store_stats,
|
||||
index_documents,
|
||||
list_all_documents,
|
||||
vector_store,
|
||||
)
|
||||
|
||||
|
||||
def stats():
|
||||
"""Show vector store statistics."""
|
||||
s = get_vector_store_stats()
|
||||
stats = get_vector_store_stats()
|
||||
print("=== Vector Store Statistics ===")
|
||||
print(f"Collection: {s['collection_name']}")
|
||||
print(f"Total Documents: {s['total_documents']}")
|
||||
print(f"Collection: {stats['collection_name']}")
|
||||
print(f"Total Documents: {stats['total_documents']}")
|
||||
|
||||
|
||||
async def index():
|
||||
@@ -26,15 +26,23 @@ async def index():
|
||||
print("Starting indexing process...")
|
||||
print("Fetching documents from Paperless-NGX...")
|
||||
await index_documents()
|
||||
print("Indexing complete!")
|
||||
print("✓ Indexing complete!")
|
||||
stats()
|
||||
|
||||
|
||||
async def reindex():
|
||||
"""Clear and reindex all documents."""
|
||||
print("Clearing existing documents...")
|
||||
delete_all_documents()
|
||||
print("Cleared")
|
||||
collection = vector_store._collection
|
||||
all_docs = collection.get()
|
||||
|
||||
if all_docs["ids"]:
|
||||
print(f"Deleting {len(all_docs['ids'])} existing documents...")
|
||||
collection.delete(ids=all_docs["ids"])
|
||||
print("✓ Cleared")
|
||||
else:
|
||||
print("Collection is already empty")
|
||||
|
||||
await index()
|
||||
|
||||
|
||||
@@ -105,7 +113,7 @@ Examples:
|
||||
print("\n\nOperation cancelled by user")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\nError: {e}", file=sys.stderr)
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import chromadb
|
||||
import httpx
|
||||
|
||||
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
|
||||
|
||||
# Scrape
|
||||
BASE_URL = "https://www.vet.cornell.edu"
|
||||
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
|
||||
|
||||
QUERY_URL = BASE_URL + LIST_URL
|
||||
r = httpx.get(QUERY_URL)
|
||||
soup = BeautifulSoup(r.text)
|
||||
|
||||
container = soup.find("div", class_="field-body")
|
||||
a_s = container.find_all("a", href=True)
|
||||
|
||||
new_texts = []
|
||||
|
||||
for link in a_s:
|
||||
endpoint = link["href"]
|
||||
query_url = BASE_URL + endpoint
|
||||
r2 = httpx.get(query_url)
|
||||
article_soup = BeautifulSoup(r2.text)
|
||||
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Management command to rename all conversations.
|
||||
|
||||
- Conversations with >10 messages: renamed to an LLM-generated summary
|
||||
- Conversations with <=10 messages: renamed to a truncation of the first user message
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from tortoise import Tortoise
|
||||
|
||||
from blueprints.conversation.models import Conversation, Speaker
|
||||
from llm import LLMClient
|
||||
|
||||
|
||||
async def rename_conversations(dry_run: bool = False):
|
||||
"""Rename all conversations based on message count."""
|
||||
|
||||
database_url = os.getenv("DATABASE_URL", "sqlite://raggr.db")
|
||||
await Tortoise.init(
|
||||
db_url=database_url,
|
||||
modules={
|
||||
"models": [
|
||||
"blueprints.users.models",
|
||||
"blueprints.conversation.models",
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
llm = LLMClient()
|
||||
conversations = await Conversation.all().prefetch_related("messages")
|
||||
|
||||
renamed = 0
|
||||
skipped = 0
|
||||
|
||||
for conversation in conversations:
|
||||
messages = sorted(conversation.messages, key=lambda m: m.created_at)
|
||||
user_messages = [m for m in messages if m.speaker == Speaker.USER]
|
||||
|
||||
if not user_messages:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
if len(messages) > 10:
|
||||
# Summarize via LLM
|
||||
message_text = "\n".join(
|
||||
f"{m.speaker.value}: {m.text}" for m in messages[:30]
|
||||
)
|
||||
new_name = llm.chat(
|
||||
prompt=message_text,
|
||||
system_prompt=(
|
||||
"You are naming a conversation. Given the messages below, "
|
||||
"produce a short, descriptive title (max 8 words). "
|
||||
"Reply with ONLY the title, nothing else."
|
||||
),
|
||||
)
|
||||
new_name = new_name.strip().strip('"').strip("'")[:100]
|
||||
else:
|
||||
# Truncate first user message
|
||||
new_name = user_messages[0].text[:100]
|
||||
|
||||
old_name = conversation.name
|
||||
if old_name == new_name:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
if dry_run:
|
||||
print(f" [dry-run] '{old_name}' -> '{new_name}'")
|
||||
else:
|
||||
conversation.name = new_name
|
||||
await conversation.save()
|
||||
print(f" '{old_name}' -> '{new_name}'")
|
||||
|
||||
renamed += 1
|
||||
|
||||
print(f"\nRenamed: {renamed} Skipped: {skipped}")
|
||||
if dry_run:
|
||||
print("(dry run — no changes were saved)")
|
||||
finally:
|
||||
await Tortoise.close_connections()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Rename conversations based on message count"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview renames without saving",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
asyncio.run(rename_conversations(dry_run=args.dry_run))
|
||||
@@ -1,6 +1,9 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
echo "Initializing directories..."
|
||||
mkdir -p /app/data/chromadb
|
||||
|
||||
echo "Rebuilding frontend..."
|
||||
cd /app/raggr-frontend
|
||||
yarn build
|
||||
|
||||
@@ -0,0 +1,139 @@
|
||||
"""Tests for text preprocessing functions in utils/chunker.py."""
|
||||
|
||||
from utils.chunker import (
|
||||
remove_headers_footers,
|
||||
remove_special_characters,
|
||||
remove_repeated_substrings,
|
||||
remove_extra_spaces,
|
||||
preprocess_text,
|
||||
)
|
||||
|
||||
|
||||
class TestRemoveHeadersFooters:
|
||||
def test_removes_default_header(self):
|
||||
text = "Header Line\nActual content here"
|
||||
result = remove_headers_footers(text)
|
||||
assert "Header" not in result
|
||||
assert "Actual content here" in result
|
||||
|
||||
def test_removes_default_footer(self):
|
||||
text = "Actual content\nFooter Line"
|
||||
result = remove_headers_footers(text)
|
||||
assert "Footer" not in result
|
||||
assert "Actual content" in result
|
||||
|
||||
def test_custom_patterns(self):
|
||||
text = "PAGE 1\nContent\nCopyright 2024"
|
||||
result = remove_headers_footers(
|
||||
text,
|
||||
header_patterns=[r"^PAGE \d+$"],
|
||||
footer_patterns=[r"^Copyright.*$"],
|
||||
)
|
||||
assert "PAGE 1" not in result
|
||||
assert "Copyright" not in result
|
||||
assert "Content" in result
|
||||
|
||||
def test_no_match_preserves_text(self):
|
||||
text = "Just normal content"
|
||||
result = remove_headers_footers(text)
|
||||
assert result == "Just normal content"
|
||||
|
||||
def test_empty_string(self):
|
||||
assert remove_headers_footers("") == ""
|
||||
|
||||
|
||||
class TestRemoveSpecialCharacters:
|
||||
def test_removes_special_chars(self):
|
||||
text = "Hello @world #test $100"
|
||||
result = remove_special_characters(text)
|
||||
assert "@" not in result
|
||||
assert "#" not in result
|
||||
assert "$" not in result
|
||||
|
||||
def test_preserves_allowed_chars(self):
|
||||
text = "Hello, world! How's it going? Yes-no."
|
||||
result = remove_special_characters(text)
|
||||
assert "," in result
|
||||
assert "!" in result
|
||||
assert "'" in result
|
||||
assert "?" in result
|
||||
assert "-" in result
|
||||
assert "." in result
|
||||
|
||||
def test_custom_pattern(self):
|
||||
text = "keep @this but not #that"
|
||||
result = remove_special_characters(text, special_chars=r"[#]")
|
||||
assert "@this" in result
|
||||
assert "#" not in result
|
||||
|
||||
def test_empty_string(self):
|
||||
assert remove_special_characters("") == ""
|
||||
|
||||
|
||||
class TestRemoveRepeatedSubstrings:
|
||||
def test_collapses_dots(self):
|
||||
text = "Item.....Value"
|
||||
result = remove_repeated_substrings(text)
|
||||
assert result == "Item.Value"
|
||||
|
||||
def test_single_dot_preserved(self):
|
||||
text = "End of sentence."
|
||||
result = remove_repeated_substrings(text)
|
||||
assert result == "End of sentence."
|
||||
|
||||
def test_custom_pattern(self):
|
||||
text = "hello---world"
|
||||
result = remove_repeated_substrings(text, pattern=r"-{2,}")
|
||||
# Function always replaces matched pattern with "."
|
||||
assert result == "hello.world"
|
||||
|
||||
def test_empty_string(self):
|
||||
assert remove_repeated_substrings("") == ""
|
||||
|
||||
|
||||
class TestRemoveExtraSpaces:
|
||||
def test_collapses_multiple_blank_lines(self):
|
||||
text = "Line 1\n\n\n\nLine 2"
|
||||
result = remove_extra_spaces(text)
|
||||
# After collapsing newlines to \n\n, then \s+ collapses everything to single spaces
|
||||
assert "\n\n\n" not in result
|
||||
|
||||
def test_collapses_multiple_spaces(self):
|
||||
text = "Hello world"
|
||||
result = remove_extra_spaces(text)
|
||||
assert result == "Hello world"
|
||||
|
||||
def test_strips_whitespace(self):
|
||||
text = " Hello world "
|
||||
result = remove_extra_spaces(text)
|
||||
assert result == "Hello world"
|
||||
|
||||
def test_empty_string(self):
|
||||
assert remove_extra_spaces("") == ""
|
||||
|
||||
|
||||
class TestPreprocessText:
|
||||
def test_full_pipeline(self):
|
||||
text = "Header Info\nHello @world... with spaces\nFooter Info"
|
||||
result = preprocess_text(text)
|
||||
assert "Header" not in result
|
||||
assert "Footer" not in result
|
||||
assert "@" not in result
|
||||
assert "..." not in result
|
||||
assert " " not in result
|
||||
|
||||
def test_preserves_meaningful_content(self):
|
||||
text = "The cat weighs 10 pounds."
|
||||
result = preprocess_text(text)
|
||||
assert "cat" in result
|
||||
assert "10" in result
|
||||
assert "pounds" in result
|
||||
|
||||
def test_empty_string(self):
|
||||
assert preprocess_text("") == ""
|
||||
|
||||
def test_already_clean(self):
|
||||
text = "Simple clean text here."
|
||||
result = preprocess_text(text)
|
||||
assert "Simple" in result
|
||||
assert "clean" in result
|
||||
@@ -0,0 +1,137 @@
|
||||
import os
|
||||
from math import ceil
|
||||
import re
|
||||
from typing import Union
|
||||
from uuid import UUID, uuid4
|
||||
from chromadb.utils.embedding_functions.openai_embedding_function import (
|
||||
OpenAIEmbeddingFunction,
|
||||
)
|
||||
from dotenv import load_dotenv
|
||||
from llm import LLMClient
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
|
||||
if header_patterns is None:
|
||||
header_patterns = [r"^.*Header.*$"]
|
||||
if footer_patterns is None:
|
||||
footer_patterns = [r"^.*Footer.*$"]
|
||||
|
||||
for pattern in header_patterns + footer_patterns:
|
||||
text = re.sub(pattern, "", text, flags=re.MULTILINE)
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
def remove_special_characters(text, special_chars=None):
|
||||
if special_chars is None:
|
||||
special_chars = r"[^A-Za-z0-9\s\.,;:\'\"\?\!\-]"
|
||||
|
||||
text = re.sub(special_chars, "", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def remove_repeated_substrings(text, pattern=r"\.{2,}"):
|
||||
text = re.sub(pattern, ".", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def remove_extra_spaces(text):
|
||||
text = re.sub(r"\n\s*\n", "\n\n", text)
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
def preprocess_text(text):
|
||||
# Remove headers and footers
|
||||
text = remove_headers_footers(text)
|
||||
|
||||
# Remove special characters
|
||||
text = remove_special_characters(text)
|
||||
|
||||
# Remove repeated substrings like dots
|
||||
text = remove_repeated_substrings(text)
|
||||
|
||||
# Remove extra spaces between lines and within lines
|
||||
text = remove_extra_spaces(text)
|
||||
|
||||
# Additional cleaning steps can be added here
|
||||
|
||||
return text.strip()
|
||||
|
||||
|
||||
class Chunk:
|
||||
def __init__(
|
||||
self,
|
||||
text: str,
|
||||
size: int,
|
||||
document_id: UUID,
|
||||
chunk_id: int,
|
||||
embedding,
|
||||
):
|
||||
self.text = text
|
||||
self.size = size
|
||||
self.document_id = document_id
|
||||
self.chunk_id = chunk_id
|
||||
self.embedding = embedding
|
||||
|
||||
|
||||
class Chunker:
|
||||
def __init__(self, collection) -> None:
|
||||
self.collection = collection
|
||||
self.llm_client = LLMClient()
|
||||
|
||||
def embedding_fx(self, inputs):
|
||||
openai_embedding_fx = OpenAIEmbeddingFunction(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
model_name="text-embedding-3-small",
|
||||
)
|
||||
return openai_embedding_fx(inputs)
|
||||
|
||||
def chunk_document(
|
||||
self,
|
||||
document: str,
|
||||
chunk_size: int = 1000,
|
||||
metadata: dict[str, Union[str, float]] = {},
|
||||
) -> list[Chunk]:
|
||||
doc_uuid = uuid4()
|
||||
|
||||
chunk_size = min(chunk_size, len(document)) or 1
|
||||
|
||||
chunks = []
|
||||
num_chunks = ceil(len(document) / chunk_size)
|
||||
document_length = len(document)
|
||||
|
||||
for i in range(num_chunks):
|
||||
curr_pos = i * num_chunks
|
||||
to_pos = (
|
||||
curr_pos + chunk_size
|
||||
if curr_pos + chunk_size < document_length
|
||||
else document_length
|
||||
)
|
||||
text_chunk = self.clean_document(document[curr_pos:to_pos])
|
||||
|
||||
embedding = self.embedding_fx([text_chunk])
|
||||
self.collection.add(
|
||||
ids=[str(doc_uuid) + ":" + str(i)],
|
||||
documents=[text_chunk],
|
||||
embeddings=embedding,
|
||||
metadatas=[metadata],
|
||||
)
|
||||
|
||||
return chunks
|
||||
|
||||
def clean_document(self, document: str) -> str:
|
||||
"""This function will remove information that is noise or already known.
|
||||
|
||||
Example: We already know all the things in here are Simba-related, so we don't need things like
|
||||
"Sumamry of simba's visit"
|
||||
"""
|
||||
|
||||
document = document.replace("\\n", "")
|
||||
document = document.strip()
|
||||
|
||||
return preprocess_text(document)
|
||||
Reference in New Issue
Block a user