Replace ChromaDB with pgvector for vector storage
Consolidate onto PostgreSQL by using pgvector instead of a separate ChromaDB instance. This removes a Docker volume, a large dependency, and simplifies the stack without meaningful performance impact at our document scale.

- Swap langchain-chroma for langchain-postgres (PGVector)
- Use pgvector/pgvector:pg16 Docker image with init script
- Lazy-initialize vector store to avoid eager DB connections
- Add SQL helpers for stats/delete/list (replacing _collection access)
- Remove legacy main.py, chunker, petmd scraper, and /api/query endpoint

Re-index required after deploy (POST /api/rag/index + /index-obsidian).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -328,7 +328,7 @@ async def obsidian_search_notes(query: str) -> str:
|
||||
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
|
||||
|
||||
try:
|
||||
# Query ChromaDB for obsidian documents
|
||||
# Query vector store for obsidian documents
|
||||
serialized, docs = await query_vector_store(query=query)
|
||||
return serialized
|
||||
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
from quart import Blueprint, jsonify
|
||||
from quart_jwt_extended import jwt_refresh_token_required
|
||||
|
||||
from .logic import fetch_obsidian_documents, get_vector_store_stats, index_documents, index_obsidian_documents, vector_store
|
||||
from .logic import (
|
||||
delete_all_documents,
|
||||
get_vector_store_stats,
|
||||
index_documents,
|
||||
index_obsidian_documents,
|
||||
)
|
||||
from blueprints.users.decorators import admin_required
|
||||
|
||||
rag_blueprint = Blueprint("rag_api", __name__, url_prefix="/api/rag")
|
||||
@@ -32,14 +37,7 @@ async def trigger_index():
|
||||
async def trigger_reindex():
|
||||
"""Clear and reindex all documents. Admin only."""
|
||||
try:
|
||||
# Clear existing documents
|
||||
collection = vector_store._collection
|
||||
all_docs = collection.get()
|
||||
|
||||
if all_docs["ids"]:
|
||||
collection.delete(ids=all_docs["ids"])
|
||||
|
||||
# Reindex
|
||||
delete_all_documents()
|
||||
await index_documents()
|
||||
stats = get_vector_store_stats()
|
||||
return jsonify({"status": "success", "stats": stats})
|
||||
|
||||
+120
-29
@@ -1,11 +1,13 @@
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_chroma import Chroma
|
||||
from langchain_core.documents import Document
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain_postgres import PGVector
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from sqlalchemy import create_engine, text
|
||||
|
||||
from .fetchers import PaperlessNGXService
|
||||
from utils.obsidian_service import ObsidianService
|
||||
@@ -13,13 +15,39 @@ from utils.obsidian_service import ObsidianService
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
||||
|
||||
vector_store = Chroma(
|
||||
collection_name="simba_docs",
|
||||
embedding_function=embeddings,
|
||||
persist_directory=os.getenv("CHROMADB_PATH", ""),
|
||||
# DATABASE_URL is written for Tortoise ORM, whose PostgreSQL URL schemes
# SQLAlchemy does not accept as-is. PGVector and the helper engine need an
# SQLAlchemy URL that names the psycopg (v3) driver explicitly.
_SQLALCHEMY_PG_SCHEME = "postgresql+psycopg"
_CONVERTIBLE_PG_SCHEMES = ("postgres", "postgresql", "asyncpg", "psycopg")


def _to_sqlalchemy_url(db_url: str) -> str:
    """Return *db_url* with a PostgreSQL scheme rewritten to ``postgresql+psycopg``.

    Only the scheme in front of the first ``://`` is examined, so text that
    merely looks like a scheme later in the URL (for example inside the
    password) is left alone. URLs whose scheme is not one of
    ``_CONVERTIBLE_PG_SCHEMES`` (including ones that already name a driver,
    such as ``postgresql+psycopg://``) are returned unchanged.
    """
    scheme, separator, remainder = db_url.partition("://")
    if separator and scheme in _CONVERTIBLE_PG_SCHEMES:
        return f"{_SQLALCHEMY_PG_SCHEME}://{remainder}"
    return db_url


_db_url = os.getenv(
    "DATABASE_URL", "postgres://raggr:raggr_dev_password@localhost:5432/raggr"
)
_pgvector_url = _to_sqlalchemy_url(_db_url)

# Lazy-initialized vector store (defers DB connection to first use)
_vector_store = None
|
||||
|
||||
|
||||
def _get_vector_store() -> PGVector:
    """Return the module-wide PGVector store, building it on the first call.

    The instance is kept in the module global ``_vector_store`` so later
    calls reuse it instead of constructing a new store.

    The store is created with ``async_mode=True`` because the callers in this
    module only use its async API (``aadd_documents`` and
    ``asimilarity_search``). NOTE(review): langchain-postgres is understood to
    serve those coroutines only from a store built in async mode, and an
    async-mode store in turn rejects the sync methods - confirm against the
    pinned langchain-postgres version and check for sync callers elsewhere.
    """
    global _vector_store
    if _vector_store is None:
        _vector_store = PGVector(
            embeddings=embeddings,
            collection_name="simba_docs",
            connection=_pgvector_url,
            use_jsonb=True,
            create_extension=False,  # created by docker init script
            async_mode=True,  # callers await aadd_documents / asimilarity_search
        )
    return _vector_store
|
||||
|
||||
|
||||
def _get_engine():
    """Return the SQLAlchemy engine used for the raw SQL helpers.

    The engine is created from ``_pgvector_url`` on the first call and then
    stored as the ``_engine`` attribute of this function, so later calls
    return the same object.
    """
    try:
        return _get_engine._engine
    except AttributeError:
        # First call: nothing cached on the function object yet.
        engine = create_engine(_pgvector_url)
        _get_engine._engine = engine
        return engine
|
||||
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=1000, # chunk size (characters)
|
||||
@@ -28,6 +56,18 @@ text_splitter = RecursiveCharacterTextSplitter(
|
||||
)
|
||||
|
||||
|
||||
def _get_collection_id():
    """Look up the UUID of the ``simba_docs`` collection.

    Queries ``langchain_pg_collection`` by name and returns the ``uuid``
    column of the first matching row, or ``None`` when no row matches.
    """
    lookup = text("SELECT uuid FROM langchain_pg_collection WHERE name = :name")
    with _get_engine().connect() as conn:
        first_row = conn.execute(lookup, {"name": "simba_docs"}).fetchone()
    if not first_row:
        return None
    return first_row[0]
|
||||
|
||||
|
||||
def date_to_epoch(date_str: str) -> float:
|
||||
split_date = date_str.split("-")
|
||||
date = datetime.datetime(
|
||||
@@ -63,6 +103,7 @@ async def index_documents():
|
||||
documents = await fetch_documents_from_paperless_ngx()
|
||||
|
||||
splits = text_splitter.split_documents(documents)
|
||||
vector_store = _get_vector_store()
|
||||
await vector_store.aadd_documents(documents=splits)
|
||||
|
||||
|
||||
@@ -92,13 +133,17 @@ async def fetch_obsidian_documents() -> list[Document]:
|
||||
"filepath": parsed["filepath"],
|
||||
"tags": parsed["tags"],
|
||||
"created_at": parsed["metadata"].get("created_at"),
|
||||
**{k: v for k, v in parsed["metadata"].items() if k not in ["created_at", "created_by"]},
|
||||
**{
|
||||
k: v
|
||||
for k, v in parsed["metadata"].items()
|
||||
if k not in ["created_at", "created_by"]
|
||||
},
|
||||
},
|
||||
)
|
||||
documents.append(document)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading {md_path}: {e}")
|
||||
logger.warning(f"Error reading {md_path}: {e}")
|
||||
continue
|
||||
|
||||
return documents
|
||||
@@ -109,26 +154,25 @@ async def index_obsidian_documents():
|
||||
|
||||
Deletes existing obsidian source chunks before re-indexing.
|
||||
"""
|
||||
obsidian_service = ObsidianService()
|
||||
documents = await fetch_obsidian_documents()
|
||||
|
||||
if not documents:
|
||||
print("No Obsidian documents found to index")
|
||||
logger.info("No Obsidian documents found to index")
|
||||
return {"indexed": 0}
|
||||
|
||||
# Delete existing obsidian chunks
|
||||
existing_results = vector_store.get(where={"source": "obsidian"})
|
||||
if existing_results.get("ids"):
|
||||
await vector_store.adelete(existing_results["ids"])
|
||||
delete_documents_by_metadata("source", "obsidian")
|
||||
|
||||
# Split and index documents
|
||||
splits = text_splitter.split_documents(documents)
|
||||
vector_store = _get_vector_store()
|
||||
await vector_store.aadd_documents(documents=splits)
|
||||
|
||||
return {"indexed": len(documents)}
|
||||
|
||||
|
||||
async def query_vector_store(query: str):
|
||||
vector_store = _get_vector_store()
|
||||
retrieved_docs = await vector_store.asimilarity_search(query, k=2)
|
||||
serialized = "\n\n".join(
|
||||
(f"Source: {doc.metadata}\nContent: {doc.page_content}")
|
||||
@@ -137,33 +181,80 @@ async def query_vector_store(query: str):
|
||||
return serialized, retrieved_docs
|
||||
|
||||
|
||||
def delete_all_documents():
    """Remove every embedding row of the collection from the vector store.

    The collection is the one resolved by ``_get_collection_id()``; when that
    lookup finds nothing, this function returns without touching the database.
    """
    cid = _get_collection_id()
    if cid:
        purge = text("DELETE FROM langchain_pg_embedding WHERE collection_id = :cid")
        with _get_engine().connect() as conn:
            conn.execute(purge, {"cid": cid})
            # Commit explicitly: the DELETE is not persisted otherwise.
            conn.commit()
|
||||
|
||||
|
||||
def delete_documents_by_metadata(key: str, value: str):
    """Delete the collection's embedding rows whose metadata has ``key == value``.

    The comparison uses ``cmetadata->>key``, i.e. the text form of the
    top-level metadata entry named ``key``. Both ``key`` and ``value`` are
    sent as bound parameters. Returns without doing anything when
    ``_get_collection_id()`` finds no collection.
    """
    cid = _get_collection_id()
    if cid:
        statement = text(
            "DELETE FROM langchain_pg_embedding "
            "WHERE collection_id = :cid AND cmetadata->>:key = :value"
        )
        bound = {"cid": cid, "key": key, "value": value}
        with _get_engine().connect() as conn:
            conn.execute(statement, bound)
            # Commit explicitly: the DELETE is not persisted otherwise.
            conn.commit()
|
||||
|
||||
|
||||
def get_vector_store_stats():
|
||||
"""Get statistics about the vector store."""
|
||||
collection = vector_store._collection
|
||||
count = collection.count()
|
||||
collection_id = _get_collection_id()
|
||||
count = 0
|
||||
if collection_id:
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text(
|
||||
"SELECT COUNT(*) FROM langchain_pg_embedding WHERE collection_id = :cid"
|
||||
),
|
||||
{"cid": collection_id},
|
||||
)
|
||||
count = result.scalar()
|
||||
return {
|
||||
"total_documents": count,
|
||||
"collection_name": collection.name,
|
||||
"collection_name": "simba_docs",
|
||||
}
|
||||
|
||||
|
||||
def list_all_documents(limit: int = 10):
|
||||
"""List documents in the vector store with their metadata."""
|
||||
collection = vector_store._collection
|
||||
results = collection.get(limit=limit, include=["metadatas", "documents"])
|
||||
collection_id = _get_collection_id()
|
||||
if not collection_id:
|
||||
return []
|
||||
|
||||
documents = []
|
||||
for i, doc_id in enumerate(results["ids"]):
|
||||
documents.append(
|
||||
{
|
||||
"id": doc_id,
|
||||
"metadata": results["metadatas"][i]
|
||||
if results.get("metadatas")
|
||||
else None,
|
||||
"content_preview": results["documents"][i][:200]
|
||||
if results.get("documents")
|
||||
else None,
|
||||
}
|
||||
engine = _get_engine()
|
||||
with engine.connect() as conn:
|
||||
result = conn.execute(
|
||||
text(
|
||||
"SELECT id, document, cmetadata FROM langchain_pg_embedding "
|
||||
"WHERE collection_id = :cid LIMIT :limit"
|
||||
),
|
||||
{"cid": collection_id, "limit": limit},
|
||||
)
|
||||
documents = []
|
||||
for row in result:
|
||||
documents.append(
|
||||
{
|
||||
"id": str(row[0]),
|
||||
"metadata": row[2],
|
||||
"content_preview": row[1][:200] if row[1] else None,
|
||||
}
|
||||
)
|
||||
|
||||
return documents
|
||||
|
||||
Reference in New Issue
Block a user