Preserve wikilink text in Obsidian indexing and fix duplicate sync

Two fixes:
- Convert wikilinks to display text instead of stripping them entirely.
  [[Noah]] becomes "Noah", [[target|display]] becomes "display". Stripping
  them made names and references inside wikilinks invisible to search.
- Switch _get_obsidian_indexed_files to the async engine to avoid the stale
  reads from the separate sync engine that caused files to be re-indexed
  every cycle.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 07:35:24 -04:00
parent 73e952c617
commit 00c9b44c0e
2 changed files with 19 additions and 7 deletions
+13 -5
View File
@@ -10,6 +10,7 @@ from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_text_splitters import RecursiveCharacterTextSplitter
from sqlalchemy import create_engine, text from sqlalchemy import create_engine, text
from sqlalchemy.ext.asyncio import create_async_engine
from .fetchers import PaperlessNGXService from .fetchers import PaperlessNGXService
from utils.obsidian_service import ObsidianService from utils.obsidian_service import ObsidianService
@@ -63,6 +64,13 @@ def _get_engine():
return _get_engine._engine return _get_engine._engine
def _get_async_engine():
    """Return the shared async SQLAlchemy engine, creating it on first use.

    The engine is built from ``_pgvector_url`` and stored as an attribute on
    this function, so every call after the first returns the same object.
    """
    try:
        # Fast path: the engine was already created by an earlier call.
        return _get_async_engine._engine
    except AttributeError:
        # First call: build the engine and remember it on the function.
        _get_async_engine._engine = create_async_engine(_pgvector_url)
        return _get_async_engine._engine
text_splitter = RecursiveCharacterTextSplitter( text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, # chunk size (characters) chunk_size=1000, # chunk size (characters)
chunk_overlap=200, # chunk overlap (characters) chunk_overlap=200, # chunk overlap (characters)
@@ -233,14 +241,14 @@ async def index_obsidian_documents():
return {"indexed": len(documents)} return {"indexed": len(documents)}
def _get_obsidian_indexed_files() -> dict[str, float]: async def _get_obsidian_indexed_files() -> dict[str, float]:
"""Return {filepath: indexed_at} for all obsidian chunks in pgvector.""" """Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
collection_id = _get_collection_id() collection_id = _get_collection_id()
if not collection_id: if not collection_id:
return {} return {}
engine = _get_engine() engine = _get_async_engine()
with engine.connect() as conn: async with engine.connect() as conn:
result = conn.execute( result = await conn.execute(
text( text(
"SELECT DISTINCT cmetadata->>'filepath' AS filepath, " "SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
"MAX((cmetadata->>'indexed_at')::float) AS indexed_at " "MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
@@ -263,7 +271,7 @@ async def sync_obsidian_documents() -> dict[str, int]:
Dict with counts of added, updated, and deleted files. Dict with counts of added, updated, and deleted files.
""" """
obsidian_service = ObsidianService() obsidian_service = ObsidianService()
indexed_files = _get_obsidian_indexed_files() indexed_files = await _get_obsidian_indexed_files()
# Build map of current vault files -> mtime # Build map of current vault files -> mtime
vault_files: dict[str, float] = {} vault_files: dict[str, float] = {}
+6 -2
View File
@@ -106,8 +106,12 @@ class ObsidianService:
embeds = [e.split(":")[0].strip() if ":" in e else e.strip() for e in embeds] embeds = [e.split(":")[0].strip() if ":" in e else e.strip() for e in embeds]
# Clean body content # Clean body content
# Remove wikilinks [[...]] and embeds [[!...]] # Remove embeds ![[...]]
cleaned_content = re.sub(r"\[\[.*?\]\]", "", body_content) cleaned_content = re.sub(r"!\[\[.*?\]\]", "", body_content)
# Convert wikilinks to display text: [[target|display]] → display, [[target]] → target
cleaned_content = re.sub(
r"\[\[([^\]|]+\|)?([^\]]+)\]\]", r"\2", cleaned_content
)
cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content).strip() cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content).strip()
return { return {