Replace ChromaDB with pgvector for vector storage

Consolidate onto PostgreSQL by using pgvector instead of a separate
ChromaDB instance. This removes a Docker volume, a large dependency,
and simplifies the stack without meaningful performance impact at
our document scale.

- Swap langchain-chroma for langchain-postgres (PGVector)
- Use pgvector/pgvector:pg16 Docker image with init script
- Lazy-initialize vector store to avoid eager DB connections
- Add SQL helpers for stats/delete/list (replacing _collection access)
- Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint

Re-index required after deploy (POST /api/rag/index + /index-obsidian).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-24 08:43:52 -04:00
parent 9ed4ca126a
commit 438399646f
19 changed files with 241 additions and 1690 deletions
+8 -16
View File
@@ -6,19 +6,19 @@ import asyncio
import sys
from blueprints.rag.logic import (
delete_all_documents,
get_vector_store_stats,
index_documents,
list_all_documents,
vector_store,
)
def stats():
"""Show vector store statistics."""
stats = get_vector_store_stats()
s = get_vector_store_stats()
print("=== Vector Store Statistics ===")
print(f"Collection: {stats['collection_name']}")
print(f"Total Documents: {stats['total_documents']}")
print(f"Collection: {s['collection_name']}")
print(f"Total Documents: {s['total_documents']}")
async def index():
@@ -26,23 +26,15 @@ async def index():
print("Starting indexing process...")
print("Fetching documents from Paperless-NGX...")
await index_documents()
print("Indexing complete!")
print("Indexing complete!")
stats()
async def reindex():
"""Clear and reindex all documents."""
print("Clearing existing documents...")
collection = vector_store._collection
all_docs = collection.get()
if all_docs["ids"]:
print(f"Deleting {len(all_docs['ids'])} existing documents...")
collection.delete(ids=all_docs["ids"])
print("✓ Cleared")
else:
print("Collection is already empty")
delete_all_documents()
print("Cleared")
await index()
@@ -113,7 +105,7 @@ Examples:
print("\n\nOperation cancelled by user")
sys.exit(1)
except Exception as e:
print(f"\nError: {e}", file=sys.stderr)
print(f"\nError: {e}", file=sys.stderr)
sys.exit(1)
-24
View File
@@ -1,24 +0,0 @@
from bs4 import BeautifulSoup
import chromadb
import httpx
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
# Scrape
BASE_URL = "https://www.vet.cornell.edu"
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
QUERY_URL = BASE_URL + LIST_URL
r = httpx.get(QUERY_URL)
soup = BeautifulSoup(r.text)
container = soup.find("div", class_="field-body")
a_s = container.find_all("a", href=True)
new_texts = []
for link in a_s:
endpoint = link["href"]
query_url = BASE_URL + endpoint
r2 = httpx.get(query_url)
article_soup = BeautifulSoup(r2.text)