Preserve wikilink text in Obsidian indexing and fix duplicate sync
Two fixes: (1) Convert wikilinks to their display text instead of stripping them entirely: [[Noah]] becomes "Noah", and [[target|display]] becomes "display". Stripping them made names and references inside wikilinks invisible to search. (2) Switch _get_obsidian_indexed_files to the async engine to avoid stale reads from the separate sync engine, which caused files to be re-indexed every cycle. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+13
-5
@@ -10,6 +10,7 @@ from langchain_openai import OpenAIEmbeddings
|
|||||||
from langchain_postgres import PGVector
|
from langchain_postgres import PGVector
|
||||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||||
from sqlalchemy import create_engine, text
|
from sqlalchemy import create_engine, text
|
||||||
|
from sqlalchemy.ext.asyncio import create_async_engine
|
||||||
|
|
||||||
from .fetchers import PaperlessNGXService
|
from .fetchers import PaperlessNGXService
|
||||||
from utils.obsidian_service import ObsidianService
|
from utils.obsidian_service import ObsidianService
|
||||||
@@ -63,6 +64,13 @@ def _get_engine():
|
|||||||
return _get_engine._engine
|
return _get_engine._engine
|
||||||
|
|
||||||
|
|
||||||
|
def _get_async_engine():
    """Return the shared async SQLAlchemy engine used for direct queries.

    The engine is created from ``_pgvector_url`` on the first call and stored
    as an attribute on this function; later calls return that same object.
    """
    # Fast path: an engine was already created by an earlier call.
    if hasattr(_get_async_engine, "_engine"):
        return _get_async_engine._engine

    engine = create_async_engine(_pgvector_url)
    _get_async_engine._engine = engine
    return engine
|
||||||
|
|
||||||
|
|
||||||
text_splitter = RecursiveCharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=1000, # chunk size (characters)
|
chunk_size=1000, # chunk size (characters)
|
||||||
chunk_overlap=200, # chunk overlap (characters)
|
chunk_overlap=200, # chunk overlap (characters)
|
||||||
@@ -233,14 +241,14 @@ async def index_obsidian_documents():
|
|||||||
return {"indexed": len(documents)}
|
return {"indexed": len(documents)}
|
||||||
|
|
||||||
|
|
||||||
def _get_obsidian_indexed_files() -> dict[str, float]:
|
async def _get_obsidian_indexed_files() -> dict[str, float]:
|
||||||
"""Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
|
"""Return {filepath: indexed_at} for all obsidian chunks in pgvector."""
|
||||||
collection_id = _get_collection_id()
|
collection_id = _get_collection_id()
|
||||||
if not collection_id:
|
if not collection_id:
|
||||||
return {}
|
return {}
|
||||||
engine = _get_engine()
|
engine = _get_async_engine()
|
||||||
with engine.connect() as conn:
|
async with engine.connect() as conn:
|
||||||
result = conn.execute(
|
result = await conn.execute(
|
||||||
text(
|
text(
|
||||||
"SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
|
"SELECT DISTINCT cmetadata->>'filepath' AS filepath, "
|
||||||
"MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
|
"MAX((cmetadata->>'indexed_at')::float) AS indexed_at "
|
||||||
@@ -263,7 +271,7 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
Dict with counts of added, updated, and deleted files.
|
Dict with counts of added, updated, and deleted files.
|
||||||
"""
|
"""
|
||||||
obsidian_service = ObsidianService()
|
obsidian_service = ObsidianService()
|
||||||
indexed_files = _get_obsidian_indexed_files()
|
indexed_files = await _get_obsidian_indexed_files()
|
||||||
|
|
||||||
# Build map of current vault files -> mtime
|
# Build map of current vault files -> mtime
|
||||||
vault_files: dict[str, float] = {}
|
vault_files: dict[str, float] = {}
|
||||||
|
|||||||
@@ -106,8 +106,12 @@ class ObsidianService:
|
|||||||
embeds = [e.split(":")[0].strip() if ":" in e else e.strip() for e in embeds]
|
embeds = [e.split(":")[0].strip() if ":" in e else e.strip() for e in embeds]
|
||||||
|
|
||||||
# Clean body content
|
# Clean body content
|
||||||
# Remove wikilinks [[...]] and embeds [[!...]]
|
# Remove embeds ![[...]]
|
||||||
cleaned_content = re.sub(r"\[\[.*?\]\]", "", body_content)
|
cleaned_content = re.sub(r"!\[\[.*?\]\]", "", body_content)
|
||||||
|
# Convert wikilinks to display text: [[target|display]] → display, [[target]] → target
|
||||||
|
cleaned_content = re.sub(
|
||||||
|
r"\[\[([^\]|]+\|)?([^\]]+)\]\]", r"\2", cleaned_content
|
||||||
|
)
|
||||||
cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content).strip()
|
cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content).strip()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user