Commit: reorganization
blueprints/rag/__init__.py (new file, 47 lines added)
@@ -0,0 +1,47 @@
from quart import Blueprint, jsonify
from quart_jwt_extended import jwt_refresh_token_required

from .logic import get_vector_store_stats, index_documents, vector_store
from blueprints.users.decorators import admin_required

rag_blueprint = Blueprint("rag_api", __name__, url_prefix="/api/rag")


@rag_blueprint.get("/stats")
@jwt_refresh_token_required
async def get_stats():
    """Get vector store statistics."""
    stats = get_vector_store_stats()
    return jsonify(stats)


@rag_blueprint.post("/index")
@admin_required
async def trigger_index():
    """Trigger indexing of documents from Paperless-NGX. Admin only."""
    try:
        await index_documents()
        stats = get_vector_store_stats()
        return jsonify({"status": "success", "stats": stats})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500


@rag_blueprint.post("/reindex")
@admin_required
async def trigger_reindex():
    """Clear and reindex all documents. Admin only."""
    try:
        # Clear existing documents
        collection = vector_store._collection
        all_docs = collection.get()

        if all_docs["ids"]:
            collection.delete(ids=all_docs["ids"])

        # Reindex
        await index_documents()
        stats = get_vector_store_stats()
        return jsonify({"status": "success", "stats": stats})
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
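These routes only become reachable once the blueprint is registered on the application; that wiring is not part of this commit. A minimal sketch follows, assuming a Quart application factory; the create_app name, module layout, and the JWT/auth setup living elsewhere are assumptions.

# Sketch only: registering the rag blueprint on a Quart app.
# create_app is an assumed factory, not part of this commit;
# quart_jwt_extended / auth initialization is omitted for brevity.
from quart import Quart

from blueprints.rag import rag_blueprint


def create_app() -> Quart:
    app = Quart(__name__)
    app.register_blueprint(rag_blueprint)  # mounts /api/rag/stats, /index, /reindex
    return app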
blueprints/rag/fetchers.py (new file, 75 lines added)
@@ -0,0 +1,75 @@
import os
import tempfile

import httpx


class PaperlessNGXService:
    def __init__(self):
        self.base_url = os.getenv("BASE_URL")
        self.token = os.getenv("PAPERLESS_TOKEN")
        self.url = f"http://{os.getenv('BASE_URL')}/api/documents/?tags__id=8"
        self.headers = {"Authorization": f"Token {os.getenv('PAPERLESS_TOKEN')}"}

    def get_data(self):
        print(f"Getting data from: {self.url}")
        r = httpx.get(self.url, headers=self.headers)
        results = r.json()["results"]

        nextLink = r.json().get("next")

        while nextLink:
            r = httpx.get(nextLink, headers=self.headers)
            results += r.json()["results"]
            nextLink = r.json().get("next")

        return results

    def get_doc_by_id(self, doc_id: int):
        url = f"http://{os.getenv('BASE_URL')}/api/documents/{doc_id}/"
        r = httpx.get(url, headers=self.headers)
        return r.json()

    def download_pdf_from_id(self, id: int) -> str:
        download_url = f"http://{os.getenv('BASE_URL')}/api/documents/{id}/download/"
        response = httpx.get(
            download_url, headers=self.headers, follow_redirects=True, timeout=30
        )
        response.raise_for_status()
        # Use a temporary file for the downloaded PDF
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        temp_file.write(response.content)
        temp_file.close()
        temp_pdf_path = temp_file.name
        pdf_to_process = temp_pdf_path
        return pdf_to_process

    def upload_cleaned_content(self, document_id, data):
        PUTS_URL = f"http://{os.getenv('BASE_URL')}/api/documents/{document_id}/"
        r = httpx.put(PUTS_URL, headers=self.headers, data=data)
        r.raise_for_status()

    def upload_description(self, description_filepath, file, title, exif_date: str):
        POST_URL = f"http://{os.getenv('BASE_URL')}/api/documents/post_document/"
        files = {"document": ("description_filepath", file, "application/txt")}
        data = {
            "title": title,
            "create": exif_date,
            "document_type": 3,
            "tags": [7],
        }

        r = httpx.post(POST_URL, headers=self.headers, data=data, files=files)
        r.raise_for_status()

    def get_tags(self):
        GET_URL = f"http://{os.getenv('BASE_URL')}/api/tags/"
        r = httpx.get(GET_URL, headers=self.headers)
        data = r.json()
        return {tag["id"]: tag["name"] for tag in data["results"]}

    def get_doctypes(self):
        GET_URL = f"http://{os.getenv('BASE_URL')}/api/document_types/"
        r = httpx.get(GET_URL, headers=self.headers)
        data = r.json()
        return {doctype["id"]: doctype["name"] for doctype in data["results"]}
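A minimal usage sketch of the service follows, assuming BASE_URL and PAPERLESS_TOKEN point at a reachable Paperless-NGX instance; the tag filter (tags__id=8) comes from the code above, while the example flow itself is illustrative rather than part of this commit.

# Sketch only: fetch all documents with tag id 8 and download one as a PDF.
# Assumes BASE_URL and PAPERLESS_TOKEN are set in the environment; "id" is a
# standard field on Paperless-NGX document results.
import os

from blueprints.rag.fetchers import PaperlessNGXService

service = PaperlessNGXService()
documents = service.get_data()  # follows pagination across every results page
print(f"Fetched {len(documents)} documents")

if documents:
    pdf_path = service.download_pdf_from_id(documents[0]["id"])
    print(f"Saved PDF to temporary file: {pdf_path}")
    os.remove(pdf_path)  # the caller owns the temp file and must clean it up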
blueprints/rag/logic.py (new file, 101 lines added)
@@ -0,0 +1,101 @@
import datetime
import os

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .fetchers import PaperlessNGXService

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = Chroma(
    collection_name="simba_docs",
    embedding_function=embeddings,
    persist_directory=os.getenv("CHROMADB_PATH", ""),
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)


def date_to_epoch(date_str: str) -> float:
    split_date = date_str.split("-")
    date = datetime.datetime(
        int(split_date[0]),
        int(split_date[1]),
        int(split_date[2]),
        0,
        0,
        0,
    )

    return date.timestamp()


async def fetch_documents_from_paperless_ngx() -> list[Document]:
    ppngx = PaperlessNGXService()
    data = ppngx.get_data()
    doctypes = ppngx.get_doctypes()
    documents = []
    for doc in data:
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            "document_type": doctypes.get(doc["document_type"], ""),
        }
        documents.append(Document(page_content=doc["content"], metadata=metadata))

    return documents


async def index_documents():
    documents = await fetch_documents_from_paperless_ngx()

    splits = text_splitter.split_documents(documents)
    await vector_store.aadd_documents(documents=splits)


async def query_vector_store(query: str):
    retrieved_docs = await vector_store.asimilarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs


def get_vector_store_stats():
    """Get statistics about the vector store."""
    collection = vector_store._collection
    count = collection.count()
    return {
        "total_documents": count,
        "collection_name": collection.name,
    }


def list_all_documents(limit: int = 10):
    """List documents in the vector store with their metadata."""
    collection = vector_store._collection
    results = collection.get(limit=limit, include=["metadatas", "documents"])

    documents = []
    for i, doc_id in enumerate(results["ids"]):
        documents.append(
            {
                "id": doc_id,
                "metadata": results["metadatas"][i]
                if results.get("metadatas")
                else None,
                "content_preview": results["documents"][i][:200]
                if results.get("documents")
                else None,
            }
        )

    return documents
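A minimal sketch of querying the store from an async context follows, assuming OPENAI_API_KEY and CHROMADB_PATH are configured and index_documents() has already populated the collection; the query string is illustrative and not taken from this commit.

# Sketch only: run a similarity search against the indexed collection.
# Assumes the environment is configured as above and the store has been indexed.
import asyncio

from blueprints.rag.logic import query_vector_store


async def main() -> None:
    serialized, docs = await query_vector_store("What does my home insurance cover?")
    print(f"Retrieved {len(docs)} chunks")  # k=2 inside query_vector_store
    print(serialized)


asyncio.run(main())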
blueprints/rag/models.py (new empty file)