Replace ChromaDB with pgvector for vector storage
Consolidate onto PostgreSQL by using pgvector instead of a separate ChromaDB instance. This removes a Docker volume, a large dependency, and simplifies the stack without meaningful performance impact at our document scale. - Swap langchain-chroma for langchain-postgres (PGVector) - Use pgvector/pgvector:pg16 Docker image with init script - Lazy-initialize vector store to avoid eager DB connections - Add SQL helpers for stats/delete/list (replacing _collection access) - Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint Re-index required after deploy (POST /api/rag/index + /index-obsidian). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,19 +6,19 @@ import asyncio
|
||||
import sys
|
||||
|
||||
from blueprints.rag.logic import (
|
||||
delete_all_documents,
|
||||
get_vector_store_stats,
|
||||
index_documents,
|
||||
list_all_documents,
|
||||
vector_store,
|
||||
)
|
||||
|
||||
|
||||
def stats():
|
||||
"""Show vector store statistics."""
|
||||
stats = get_vector_store_stats()
|
||||
s = get_vector_store_stats()
|
||||
print("=== Vector Store Statistics ===")
|
||||
print(f"Collection: {stats['collection_name']}")
|
||||
print(f"Total Documents: {stats['total_documents']}")
|
||||
print(f"Collection: {s['collection_name']}")
|
||||
print(f"Total Documents: {s['total_documents']}")
|
||||
|
||||
|
||||
async def index():
|
||||
@@ -26,23 +26,15 @@ async def index():
|
||||
print("Starting indexing process...")
|
||||
print("Fetching documents from Paperless-NGX...")
|
||||
await index_documents()
|
||||
print("✓ Indexing complete!")
|
||||
print("Indexing complete!")
|
||||
stats()
|
||||
|
||||
|
||||
async def reindex():
|
||||
"""Clear and reindex all documents."""
|
||||
print("Clearing existing documents...")
|
||||
collection = vector_store._collection
|
||||
all_docs = collection.get()
|
||||
|
||||
if all_docs["ids"]:
|
||||
print(f"Deleting {len(all_docs['ids'])} existing documents...")
|
||||
collection.delete(ids=all_docs["ids"])
|
||||
print("✓ Cleared")
|
||||
else:
|
||||
print("Collection is already empty")
|
||||
|
||||
delete_all_documents()
|
||||
print("Cleared")
|
||||
await index()
|
||||
|
||||
|
||||
@@ -113,7 +105,7 @@ Examples:
|
||||
print("\n\nOperation cancelled by user")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
print(f"\nError: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import chromadb
|
||||
import httpx
|
||||
|
||||
client = chromadb.PersistentClient(path="/Users/ryanchen/Programs/raggr/chromadb")
|
||||
|
||||
# Scrape
|
||||
BASE_URL = "https://www.vet.cornell.edu"
|
||||
LIST_URL = "/departments-centers-and-institutes/cornell-feline-health-center/health-information/feline-health-topics"
|
||||
|
||||
QUERY_URL = BASE_URL + LIST_URL
|
||||
r = httpx.get(QUERY_URL)
|
||||
soup = BeautifulSoup(r.text)
|
||||
|
||||
container = soup.find("div", class_="field-body")
|
||||
a_s = container.find_all("a", href=True)
|
||||
|
||||
new_texts = []
|
||||
|
||||
for link in a_s:
|
||||
endpoint = link["href"]
|
||||
query_url = BASE_URL + endpoint
|
||||
r2 = httpx.get(query_url)
|
||||
article_soup = BeautifulSoup(r2.text)
|
||||
Reference in New Issue
Block a user