Replace ChromaDB with pgvector for vector storage

Consolidate onto PostgreSQL by using pgvector instead of a separate ChromaDB instance. This removes a Docker volume and a large dependency, and simplifies the stack without meaningful performance impact at our document scale.

- Swap langchain-chroma for langchain-postgres (PGVector)
- Use pgvector/pgvector:pg16 Docker image with init script
- Lazy-initialize vector store to avoid eager DB connections
- Add SQL helpers for stats/delete/list (replacing _collection access)
- Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint

Re-index required after deploy (POST /api/rag/index + /index-obsidian).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-24 08:43:52 -04:00
parent 9ed4ca126a
commit 438399646f
19 changed files with 241 additions and 1690 deletions
@@ -1,7 +1,12 @@
 from quart import Blueprint, jsonify
 from quart_jwt_extended import jwt_refresh_token_required

-from .logic import fetch_obsidian_documents, get_vector_store_stats, index_documents, index_obsidian_documents, vector_store
+from .logic import (
+    delete_all_documents,
+    get_vector_store_stats,
+    index_documents,
+    index_obsidian_documents,
+)
 from blueprints.users.decorators import admin_required

 rag_blueprint = Blueprint("rag_api", __name__, url_prefix="/api/rag")
@@ -32,14 +37,7 @@ async def trigger_index():
 async def trigger_reindex():
    """Clear and reindex all documents. Admin only."""
    try:
-        # Clear existing documents
-        collection = vector_store._collection
-        all_docs = collection.get()
-
-        if all_docs["ids"]:
-            collection.delete(ids=all_docs["ids"])
-
-        # Reindex
+        delete_all_documents()
        await index_documents()
        stats = get_vector_store_stats()
        return jsonify({"status": "success", "stats": stats})