Add incremental Obsidian-to-pgvector sync with background watcher
Replace full delete-and-reindex with mtime-based incremental sync that only re-indexes changed/new files and removes deleted ones. A background polling task keeps the vector store up-to-date automatically when OBSIDIAN_CONTINUOUS_SYNC=true. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
@@ -51,13 +52,43 @@ app.register_blueprint(blueprints.rag.rag_blueprint)
|
|||||||
app.register_blueprint(blueprints.whatsapp.whatsapp_blueprint)
|
app.register_blueprint(blueprints.whatsapp.whatsapp_blueprint)
|
||||||
|
|
||||||
|
|
||||||
|
async def _obsidian_sync_loop():
|
||||||
|
"""Background task that incrementally syncs Obsidian documents to pgvector."""
|
||||||
|
from blueprints.rag.logic import sync_obsidian_documents
|
||||||
|
|
||||||
|
interval = int(os.getenv("OBSIDIAN_SYNC_INTERVAL", "60"))
|
||||||
|
logger = logging.getLogger("obsidian_sync")
|
||||||
|
logger.info(f"Obsidian sync watcher started (interval={interval}s)")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
result = await sync_obsidian_documents()
|
||||||
|
if result["added"] or result["updated"] or result["deleted"]:
|
||||||
|
logger.info(
|
||||||
|
f"Obsidian sync: {result['added']} added, "
|
||||||
|
f"{result['updated']} updated, {result['deleted']} deleted"
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Obsidian sync error")
|
||||||
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
# Initialize Tortoise ORM with lifecycle hooks
|
# Initialize Tortoise ORM with lifecycle hooks
|
||||||
@app.while_serving
|
@app.while_serving
|
||||||
async def lifespan():
|
async def lifespan():
|
||||||
logging.info("Initializing Tortoise ORM...")
|
logging.info("Initializing Tortoise ORM...")
|
||||||
await Tortoise.init(config=TORTOISE_CONFIG)
|
await Tortoise.init(config=TORTOISE_CONFIG)
|
||||||
logging.info("Tortoise ORM initialized successfully")
|
logging.info("Tortoise ORM initialized successfully")
|
||||||
|
|
||||||
|
watcher_task = None
|
||||||
|
if os.getenv("OBSIDIAN_CONTINUOUS_SYNC") == "true":
|
||||||
|
watcher_task = asyncio.create_task(_obsidian_sync_loop())
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
if watcher_task is not None:
|
||||||
|
watcher_task.cancel()
|
||||||
|
|
||||||
logging.info("Closing Tortoise ORM connections...")
|
logging.info("Closing Tortoise ORM connections...")
|
||||||
await Tortoise.close_connections()
|
await Tortoise.close_connections()
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from .logic import (
|
|||||||
delete_all_documents,
|
delete_all_documents,
|
||||||
get_vector_store_stats,
|
get_vector_store_stats,
|
||||||
index_documents,
|
index_documents,
|
||||||
index_obsidian_documents,
|
sync_obsidian_documents,
|
||||||
)
|
)
|
||||||
from blueprints.users.decorators import admin_required
|
from blueprints.users.decorators import admin_required
|
||||||
|
|
||||||
@@ -48,9 +48,9 @@ async def trigger_reindex():
|
|||||||
@rag_blueprint.post("/index-obsidian")
|
@rag_blueprint.post("/index-obsidian")
|
||||||
@admin_required
|
@admin_required
|
||||||
async def trigger_obsidian_index():
|
async def trigger_obsidian_index():
|
||||||
"""Index all Obsidian markdown documents into vector store. Admin only."""
|
"""Incrementally sync Obsidian documents into vector store. Admin only."""
|
||||||
try:
|
try:
|
||||||
result = await index_obsidian_documents()
|
result = await sync_obsidian_documents()
|
||||||
stats = get_vector_store_stats()
|
stats = get_vector_store_stats()
|
||||||
return jsonify({"status": "success", "result": result, "stats": stats})
|
return jsonify({"status": "success", "result": result, "stats": stats})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import datetime
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@@ -180,6 +181,7 @@ async def fetch_obsidian_documents() -> list[Document]:
|
|||||||
"filepath": parsed["filepath"],
|
"filepath": parsed["filepath"],
|
||||||
"tags": parsed["tags"],
|
"tags": parsed["tags"],
|
||||||
"created_at": parsed["metadata"].get("created_at"),
|
"created_at": parsed["metadata"].get("created_at"),
|
||||||
|
"indexed_at": time.time(),
|
||||||
**{
|
**{
|
||||||
k: v
|
k: v
|
||||||
for k, v in parsed["metadata"].items()
|
for k, v in parsed["metadata"].items()
|
||||||
@@ -219,6 +221,106 @@ async def index_obsidian_documents():
|
|||||||
return {"indexed": len(documents)}
|
return {"indexed": len(documents)}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_obsidian_indexed_files() -> dict[str, float]:
|
||||||
|
"""Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
|
||||||
|
collection_id = _get_collection_id()
|
||||||
|
if not collection_id:
|
||||||
|
return {}
|
||||||
|
engine = _get_engine()
|
||||||
|
with engine.connect() as conn:
|
||||||
|
result = conn.execute(
|
||||||
|
text(
|
||||||
|
"SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
|
||||||
|
"MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
|
||||||
|
"FROM langchain_pg_embedding "
|
||||||
|
"WHERE collection_id = :cid AND cmetadata->>'source' = 'obsidian' "
|
||||||
|
"GROUP BY cmetadata->>'filepath'"
|
||||||
|
),
|
||||||
|
{"cid": collection_id},
|
||||||
|
)
|
||||||
|
return {row[0]: row[1] for row in result if row[0] is not None}
|
||||||
|
|
||||||
|
|
||||||
|
async def sync_obsidian_documents() -> dict[str, int]:
|
||||||
|
"""Incrementally sync Obsidian documents to pgvector.
|
||||||
|
|
||||||
|
Compares file mtimes against stored indexed_at timestamps to only
|
||||||
|
re-index changed/new files and remove deleted ones.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with counts of added, updated, and deleted files.
|
||||||
|
"""
|
||||||
|
obsidian_service = ObsidianService()
|
||||||
|
indexed_files = _get_obsidian_indexed_files()
|
||||||
|
|
||||||
|
# Build map of current vault files -> mtime
|
||||||
|
vault_files: dict[str, float] = {}
|
||||||
|
for md_path in obsidian_service.walk_vault():
|
||||||
|
vault_files[str(md_path)] = md_path.stat().st_mtime
|
||||||
|
|
||||||
|
added = 0
|
||||||
|
updated = 0
|
||||||
|
deleted = 0
|
||||||
|
|
||||||
|
# Find files to add or update
|
||||||
|
files_to_index: list[str] = []
|
||||||
|
for filepath, mtime in vault_files.items():
|
||||||
|
indexed_at = indexed_files.get(filepath)
|
||||||
|
if indexed_at is None:
|
||||||
|
files_to_index.append(filepath)
|
||||||
|
added += 1
|
||||||
|
elif mtime > indexed_at:
|
||||||
|
# Delete old chunks first
|
||||||
|
delete_documents_by_metadata("filepath", filepath)
|
||||||
|
files_to_index.append(filepath)
|
||||||
|
updated += 1
|
||||||
|
|
||||||
|
# Find deleted files (in DB but not on disk)
|
||||||
|
for filepath in indexed_files:
|
||||||
|
if filepath not in vault_files:
|
||||||
|
delete_documents_by_metadata("filepath", filepath)
|
||||||
|
deleted += 1
|
||||||
|
|
||||||
|
# Index new/changed files
|
||||||
|
if files_to_index:
|
||||||
|
documents = []
|
||||||
|
for filepath in files_to_index:
|
||||||
|
try:
|
||||||
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
|
content = f.read()
|
||||||
|
parsed = obsidian_service.parse_markdown(content, filepath)
|
||||||
|
document = Document(
|
||||||
|
page_content=parsed["content"],
|
||||||
|
metadata={
|
||||||
|
"source": "obsidian",
|
||||||
|
"filepath": parsed["filepath"],
|
||||||
|
"tags": parsed["tags"],
|
||||||
|
"created_at": parsed["metadata"].get("created_at"),
|
||||||
|
"indexed_at": time.time(),
|
||||||
|
**{
|
||||||
|
k: v
|
||||||
|
for k, v in parsed["metadata"].items()
|
||||||
|
if k not in ["created_at", "created_by"]
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
documents.append(document)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error reading {filepath}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if documents:
|
||||||
|
splits = text_splitter.split_documents(documents)
|
||||||
|
splits = _sanitize_documents(splits)
|
||||||
|
vector_store = _get_vector_store()
|
||||||
|
await vector_store.aadd_documents(documents=splits)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Obsidian sync complete: {added} added, {updated} updated, {deleted} deleted"
|
||||||
|
)
|
||||||
|
return {"added": added, "updated": updated, "deleted": deleted}
|
||||||
|
|
||||||
|
|
||||||
async def query_vector_store(query: str):
|
async def query_vector_store(query: str):
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
retrieved_docs = await vector_store.asimilarity_search(query, k=2)
|
retrieved_docs = await vector_store.asimilarity_search(query, k=2)
|
||||||
|
|||||||
Reference in New Issue
Block a user