Replace ChromaDB with pgvector for vector storage

Consolidate onto PostgreSQL by using pgvector instead of a separate
ChromaDB instance. This removes a Docker volume and a large dependency,
and simplifies the stack without meaningful performance impact at
our document scale.

- Swap langchain-chroma for langchain-postgres (PGVector)
- Use pgvector/pgvector:pg16 Docker image with init script
- Lazy-initialize vector store to avoid eager DB connections
- Add SQL helpers for stats/delete/list (replacing _collection access)
- Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint

Re-index required after deploy (POST /api/rag/index + /index-obsidian).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-24 08:43:52 -04:00
parent 9ed4ca126a
commit 438399646f
19 changed files with 241 additions and 1690 deletions
+1 -1
View File
@@ -328,7 +328,7 @@ async def obsidian_search_notes(query: str) -> str:
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
try:
# Query ChromaDB for obsidian documents
# Query vector store for obsidian documents
serialized, docs = await query_vector_store(query=query)
return serialized
+7 -9
View File
@@ -1,7 +1,12 @@
from quart import Blueprint, jsonify
from quart_jwt_extended import jwt_refresh_token_required
from .logic import fetch_obsidian_documents, get_vector_store_stats, index_documents, index_obsidian_documents, vector_store
from .logic import (
delete_all_documents,
get_vector_store_stats,
index_documents,
index_obsidian_documents,
)
from blueprints.users.decorators import admin_required
rag_blueprint = Blueprint("rag_api", __name__, url_prefix="/api/rag")
@@ -32,14 +37,7 @@ async def trigger_index():
async def trigger_reindex():
"""Clear and reindex all documents. Admin only."""
try:
# Clear existing documents
collection = vector_store._collection
all_docs = collection.get()
if all_docs["ids"]:
collection.delete(ids=all_docs["ids"])
# Reindex
delete_all_documents()
await index_documents()
stats = get_vector_store_stats()
return jsonify({"status": "success", "stats": stats})
+120 -29
View File
@@ -1,11 +1,13 @@
import datetime
import logging
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sqlalchemy import create_engine, text
from .fetchers import PaperlessNGXService
from utils.obsidian_service import ObsidianService
@@ -13,13 +15,39 @@ from utils.obsidian_service import ObsidianService
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma(
collection_name="simba_docs",
embedding_function=embeddings,
persist_directory=os.getenv("CHROMADB_PATH", ""),
# Convert the Tortoise-style URL (postgres://) into the SQLAlchemy form that
# selects the psycopg driver (postgresql+psycopg://).
_db_url = os.getenv(
    "DATABASE_URL", "postgres://raggr:raggr_dev_password@localhost:5432/raggr"
)
# Rewrite only the leading scheme. A blanket str.replace() would also touch a
# "postgres://" substring appearing later in the URL, and would leave a
# "postgresql://" URL untouched, so SQLAlchemy would fall back to its default
# PostgreSQL driver instead of psycopg.
_scheme, _sep, _rest = _db_url.partition("://")
_pgvector_url = (
    f"postgresql+psycopg://{_rest}"
    if _sep and _scheme in ("postgres", "postgresql")
    else _db_url
)

# Lazy-initialized vector store (defers DB connection to first use)
_vector_store = None
def _get_vector_store() -> PGVector:
    """Return the module-wide PGVector store, constructing it on first use.

    The instance is cached in the module-level ``_vector_store`` so later
    calls reuse it instead of building a new store.
    """
    global _vector_store
    if _vector_store is not None:
        return _vector_store

    store_options = {
        "embeddings": embeddings,
        "collection_name": "simba_docs",
        "connection": _pgvector_url,
        "use_jsonb": True,
        # The extension is created by the docker init script.
        "create_extension": False,
    }
    _vector_store = PGVector(**store_options)
    return _vector_store
def _get_engine():
    """Return the SQLAlchemy engine used for direct SQL queries.

    The engine is created from ``_pgvector_url`` on the first call and then
    memoized as an attribute on this function.
    """
    try:
        return _get_engine._engine
    except AttributeError:
        # First call: build the engine and remember it for later calls.
        _get_engine._engine = create_engine(_pgvector_url)
        return _get_engine._engine
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, # chunk size (characters)
@@ -28,6 +56,18 @@ text_splitter = RecursiveCharacterTextSplitter(
)
def _get_collection_id():
    """Look up the UUID of the ``simba_docs`` collection.

    Returns:
        The ``uuid`` column of the matching ``langchain_pg_collection`` row,
        or ``None`` when no such row exists.
    """
    lookup = text("SELECT uuid FROM langchain_pg_collection WHERE name = :name")
    with _get_engine().connect() as conn:
        match = conn.execute(lookup, {"name": "simba_docs"}).fetchone()
    if not match:
        return None
    return match[0]
def date_to_epoch(date_str: str) -> float:
split_date = date_str.split("-")
date = datetime.datetime(
@@ -63,6 +103,7 @@ async def index_documents():
documents = await fetch_documents_from_paperless_ngx()
splits = text_splitter.split_documents(documents)
vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits)
@@ -92,13 +133,17 @@ async def fetch_obsidian_documents() -> list[Document]:
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
**{k: v for k, v in parsed["metadata"].items() if k not in ["created_at", "created_by"]},
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
},
)
documents.append(document)
except Exception as e:
print(f"Error reading {md_path}: {e}")
logger.warning(f"Error reading {md_path}: {e}")
continue
return documents
@@ -109,26 +154,25 @@ async def index_obsidian_documents():
Deletes existing obsidian source chunks before re-indexing.
"""
obsidian_service = ObsidianService()
documents = await fetch_obsidian_documents()
if not documents:
print("No Obsidian documents found to index")
logger.info("No Obsidian documents found to index")
return {"indexed": 0}
# Delete existing obsidian chunks
existing_results = vector_store.get(where={"source": "obsidian"})
if existing_results.get("ids"):
await vector_store.adelete(existing_results["ids"])
delete_documents_by_metadata("source", "obsidian")
# Split and index documents
splits = text_splitter.split_documents(documents)
vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits)
return {"indexed": len(documents)}
async def query_vector_store(query: str):
vector_store = _get_vector_store()
retrieved_docs = await vector_store.asimilarity_search(query, k=2)
serialized = "\n\n".join(
(f"Source: {doc.metadata}\nContent: {doc.page_content}")
@@ -137,33 +181,80 @@ async def query_vector_store(query: str):
return serialized, retrieved_docs
def delete_all_documents():
    """Remove every embedding row that belongs to our collection.

    Does nothing when the collection row cannot be found.
    """
    cid = _get_collection_id()
    if not cid:
        return

    statement = text("DELETE FROM langchain_pg_embedding WHERE collection_id = :cid")
    # engine.begin() commits when the block exits without an error.
    with _get_engine().begin() as conn:
        conn.execute(statement, {"cid": cid})
def delete_documents_by_metadata(key: str, value: str):
    """Remove embeddings whose metadata field ``key`` equals ``value``.

    Args:
        key: Name of the field inside the ``cmetadata`` column.
        value: Value the field is compared against (``->>`` yields text).

    Does nothing when the collection row cannot be found.
    """
    cid = _get_collection_id()
    if not cid:
        return

    statement = text(
        "DELETE FROM langchain_pg_embedding "
        "WHERE collection_id = :cid AND cmetadata->>:key = :value"
    )
    params = {"cid": cid, "key": key, "value": value}
    # engine.begin() commits when the block exits without an error.
    with _get_engine().begin() as conn:
        conn.execute(statement, params)
def get_vector_store_stats():
    """Get statistics about the vector store.

    Returns:
        A dict with ``total_documents`` (number of rows in
        ``langchain_pg_embedding`` for our collection, ``0`` when the
        collection row does not exist) and ``collection_name``.
    """
    collection_id = _get_collection_id()
    count = 0
    if collection_id:
        engine = _get_engine()
        with engine.connect() as conn:
            result = conn.execute(
                text(
                    "SELECT COUNT(*) FROM langchain_pg_embedding "
                    "WHERE collection_id = :cid"
                ),
                {"cid": collection_id},
            )
            count = result.scalar()

    # The stale Chroma `_collection` access is gone: there is no module-level
    # `vector_store` any more, so the name is reported as a literal.
    return {
        "total_documents": count,
        "collection_name": "simba_docs",
    }
def list_all_documents(limit: int = 10):
    """List documents in the vector store with their metadata.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with ``id`` (stringified row id), ``metadata`` (the
        ``cmetadata`` column) and ``content_preview`` (first 200 characters
        of the stored document, or ``None`` when the document is empty).
        An empty list is returned when the collection row does not exist.
    """
    collection_id = _get_collection_id()
    if not collection_id:
        return []

    engine = _get_engine()
    with engine.connect() as conn:
        result = conn.execute(
            text(
                "SELECT id, document, cmetadata FROM langchain_pg_embedding "
                "WHERE collection_id = :cid LIMIT :limit"
            ),
            {"cid": collection_id, "limit": limit},
        )
        # Materialize inside the `with` block, while the connection is open.
        documents = [
            {
                "id": str(row[0]),
                "metadata": row[2],
                "content_preview": row[1][:200] if row[1] else None,
            }
            for row in result
        ]
    return documents