Replace ChromaDB with pgvector for vector storage

Consolidate onto PostgreSQL by using pgvector instead of a separate ChromaDB instance. This removes a Docker volume and a large dependency, and simplifies the stack without meaningful performance impact at our document scale.

- Swap langchain-chroma for langchain-postgres (PGVector)
- Use pgvector/pgvector:pg16 Docker image with init script
- Lazy-initialize vector store to avoid eager DB connections
- Add SQL helpers for stats/delete/list (replacing _collection access)
- Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint

Re-index required after deploy (POST /api/rag/index + /index-obsidian).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-24 08:43:52 -04:00
parent 9ed4ca126a
commit 438399646f
19 changed files with 241 additions and 1690 deletions
@@ -6,19 +6,19 @@ import asyncio
 import sys

 from blueprints.rag.logic import (
+    delete_all_documents,
    get_vector_store_stats,
    index_documents,
    list_all_documents,
-    vector_store,
 )


 def stats():
    """Show vector store statistics."""
-    stats = get_vector_store_stats()
+    s = get_vector_store_stats()
    print("=== Vector Store Statistics ===")
-    print(f"Collection: {stats['collection_name']}")
-    print(f"Total Documents: {stats['total_documents']}")
+    print(f"Collection: {s['collection_name']}")
+    print(f"Total Documents: {s['total_documents']}")


 async def index():
@@ -26,23 +26,15 @@ async def index():
    print("Starting indexing process...")
    print("Fetching documents from Paperless-NGX...")
    await index_documents()
-    print("✓ Indexing complete!")
+    print("Indexing complete!")
    stats()


 async def reindex():
    """Clear and reindex all documents."""
    print("Clearing existing documents...")
-    collection = vector_store._collection
-    all_docs = collection.get()
-
-    if all_docs["ids"]:
-        print(f"Deleting {len(all_docs['ids'])} existing documents...")
-        collection.delete(ids=all_docs["ids"])
-        print("✓ Cleared")
-    else:
-        print("Collection is already empty")
-
+    delete_all_documents()
+    print("Cleared")
    await index()


@@ -113,7 +105,7 @@ Examples:
        print("\n\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
-        print(f"\n❌ Error: {e}", file=sys.stderr)
+        print(f"\nError: {e}", file=sys.stderr)
        sys.exit(1)


@@ -1,24 +0,0 @@
-from bs4 import BeautifulSoup
-import chromadb
-import httpx
-
-client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
-
-# Scrape
-BASE_URL = "https://www.vet.cornell.edu"
-LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
-
-QUERY_URL = BASE_URL + LIST_URL
-r = httpx.get(QUERY_URL)
-soup = BeautifulSoup(r.text)
-
-container = soup.find("div", class_="field-body")
-a_s = container.find_all("a", href=True)
-
-new_texts = []
-
-for link in a_s:
-    endpoint = link["href"]
-    query_url = BASE_URL + endpoint
-    r2 = httpx.get(query_url)
-    article_soup = BeautifulSoup(r2.text)