Preserve wikilink text in Obsidian indexing and fix duplicate sync

Two fixes:

- Convert wikilinks to their display text instead of stripping them entirely: [[Noah]] becomes "Noah" and [[target|display]] becomes "display". Previously, stripping made names and references inside wikilinks invisible to search.
- Switch _get_obsidian_indexed_files to the async engine to avoid stale reads from the separate sync engine, which caused files to be re-indexed on every sync cycle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-31 07:35:24 -04:00
parent 73e952c617
commit 00c9b44c0e
2 changed files with 19 additions and 7 deletions
@@ -10,6 +10,7 @@ from langchain_openai import OpenAIEmbeddings
 from langchain_postgres import PGVector
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from sqlalchemy import create_engine, text
+from sqlalchemy.ext.asyncio import create_async_engine

 from .fetchers import PaperlessNGXService
 from utils.obsidian_service import ObsidianService
@@ -63,6 +64,13 @@ def _get_engine():
    return _get_engine._engine


+def _get_async_engine():
+    """Get an async SQLAlchemy engine for direct queries."""
+    if not hasattr(_get_async_engine, "_engine"):
+        _get_async_engine._engine = create_async_engine(_pgvector_url)
+    return _get_async_engine._engine
+
+
 text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
@@ -233,14 +241,14 @@ async def index_obsidian_documents():
    return {"indexed": len(documents)}


-def _get_obsidian_indexed_files() -> dict[str, float]:
+async def _get_obsidian_indexed_files() -> dict[str, float]:
    """Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
    collection_id = _get_collection_id()
    if not collection_id:
        return {}
-    engine = _get_engine()
-    with engine.connect() as conn:
-        result = conn.execute(
+    engine = _get_async_engine()
+    async with engine.connect() as conn:
+        result = await conn.execute(
            text(
                "SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
                "MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
@@ -263,7 +271,7 @@ async def sync_obsidian_documents() -> dict[str, int]:
        Dict with counts of added, updated, and deleted files.
    """
    obsidian_service = ObsidianService()
-    indexed_files = _get_obsidian_indexed_files()
+    indexed_files = await _get_obsidian_indexed_files()

    # Build map of current vault files -> mtime
    vault_files: dict[str, float] = {}