Add incremental Obsidian-to-pgvector sync with background watcher

Replace full delete-and-reindex with mtime-based incremental sync that
only re-indexes changed/new files and removes deleted ones. A background
polling task keeps the vector store up-to-date automatically when
OBSIDIAN_CONTINUOUS_SYNC=true.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 07:05:48 -04:00
parent 2cd77c68c1
commit 869de1c250
3 changed files with 136 additions and 3 deletions
+3 -3
View File
@@ -5,7 +5,7 @@ from .logic import (
delete_all_documents,
get_vector_store_stats,
index_documents,
index_obsidian_documents,
sync_obsidian_documents,
)
from blueprints.users.decorators import admin_required
@@ -48,9 +48,9 @@ async def trigger_reindex():
@rag_blueprint.post("/index-obsidian")
@admin_required
async def trigger_obsidian_index():
"""Index all Obsidian markdown documents into vector store. Admin only."""
"""Incrementally sync Obsidian documents into vector store. Admin only."""
try:
result = await index_obsidian_documents()
result = await sync_obsidian_documents()
stats = get_vector_store_stats()
return jsonify({"status": "success", "result": result, "stats": stats})
except Exception as e:
+102
View File
@@ -2,6 +2,7 @@ import datetime
import logging
import os
import re
import time
from dotenv import load_dotenv
from langchain_core.documents import Document
@@ -180,6 +181,7 @@ async def fetch_obsidian_documents() -> list[Document]:
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
@@ -219,6 +221,106 @@ async def index_obsidian_documents():
return {"indexed": len(documents)}
def _get_obsidian_indexed_files() -> dict[str, float]:
"""Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
collection_id = _get_collection_id()
if not collection_id:
return {}
engine = _get_engine()
with engine.connect() as conn:
result = conn.execute(
text(
"SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
"MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
"FROM langchain_pg_embedding "
"WHERE collection_id = :cid AND cmetadata->>'source' = 'obsidian' "
"GROUP BY cmetadata->>'filepath'"
),
{"cid": collection_id},
)
return {row[0]: row[1] for row in result if row[0] is not None}
async def sync_obsidian_documents() -> dict[str, int]:
"""Incrementally sync Obsidian documents to pgvector.
Compares file mtimes against stored indexed_at timestamps to only
re-index changed/new files and remove deleted ones.
Returns:
Dict with counts of added, updated, and deleted files.
"""
obsidian_service = ObsidianService()
indexed_files = _get_obsidian_indexed_files()
# Build map of current vault files -> mtime
vault_files: dict[str, float] = {}
for md_path in obsidian_service.walk_vault():
vault_files[str(md_path)] = md_path.stat().st_mtime
added = 0
updated = 0
deleted = 0
# Find files to add or update
files_to_index: list[str] = []
for filepath, mtime in vault_files.items():
indexed_at = indexed_files.get(filepath)
if indexed_at is None:
files_to_index.append(filepath)
added += 1
elif mtime > indexed_at:
# Delete old chunks first
delete_documents_by_metadata("filepath", filepath)
files_to_index.append(filepath)
updated += 1
# Find deleted files (in DB but not on disk)
for filepath in indexed_files:
if filepath not in vault_files:
delete_documents_by_metadata("filepath", filepath)
deleted += 1
# Index new/changed files
if files_to_index:
documents = []
for filepath in files_to_index:
try:
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
parsed = obsidian_service.parse_markdown(content, filepath)
document = Document(
page_content=parsed["content"],
metadata={
"source": "obsidian",
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
},
)
documents.append(document)
except Exception as e:
logger.warning(f"Error reading {filepath}: {e}")
continue
if documents:
splits = text_splitter.split_documents(documents)
splits = _sanitize_documents(splits)
vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits)
logger.info(
f"Obsidian sync complete: {added} added, {updated} updated, {deleted} deleted"
)
return {"added": added, "updated": updated, "deleted": deleted}
async def query_vector_store(query: str):
vector_store = _get_vector_store()
retrieved_docs = await vector_store.asimilarity_search(query, k=2)