Improve Obsidian RAG retrieval for large vaults

- Markdown-aware chunking (split on headers before size-based splitting)
- Prepend note filename to each chunk for self-contained context
- Source-filtered retrieval (obsidian/paperless queries stay isolated)
- MMR (maximal marginal relevance) search with k=8, fetch_k=24 for better recall and diversity
- Add source metadata to Paperless docs and folder metadata to Obsidian docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-04 13:34:15 -04:00
parent 9bccac82f3
commit add9946bc2
2 changed files with 81 additions and 9 deletions
+3 -3
View File
@@ -121,7 +121,7 @@ async def simba_search(query: str):
Relevant information from Simba's documents Relevant information from Simba's documents
""" """
print(f"[SIMBA SEARCH] Tool called with query: {query}") print(f"[SIMBA SEARCH] Tool called with query: {query}")
serialized, docs = await query_vector_store(query=query) serialized, docs = await query_vector_store(query=query, source="paperless")
print(f"[SIMBA SEARCH] Found {len(docs)} documents") print(f"[SIMBA SEARCH] Found {len(docs)} documents")
print(f"[SIMBA SEARCH] Serialized result length: {len(serialized)}") print(f"[SIMBA SEARCH] Serialized result length: {len(serialized)}")
print(f"[SIMBA SEARCH] First 200 chars: {serialized[:200]}") print(f"[SIMBA SEARCH] First 200 chars: {serialized[:200]}")
@@ -329,8 +329,8 @@ async def obsidian_search_notes(query: str) -> str:
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable." return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
try: try:
# Query vector store for obsidian documents # Query vector store filtered to obsidian source only
serialized, docs = await query_vector_store(query=query) serialized, docs = await query_vector_store(query=query, source="obsidian")
return serialized return serialized
except Exception as e: except Exception as e:
+78 -6
View File
@@ -3,12 +3,16 @@ import logging
import os import os
import re import re
import time import time
from pathlib import Path
from dotenv import load_dotenv from dotenv import load_dotenv
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_text_splitters import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)
from sqlalchemy import create_engine, text from sqlalchemy import create_engine, text
from .fetchers import PaperlessNGXService from .fetchers import PaperlessNGXService
@@ -69,6 +73,46 @@ text_splitter = RecursiveCharacterTextSplitter(
add_start_index=True, # track index in original document add_start_index=True, # track index in original document
) )
# First pass for markdown notes: cut on "#", "##" and "###" headings, labelled
# h1..h3. strip_headers=False leaves the heading text inside the section body.
md_header_splitter = MarkdownHeaderTextSplitter(
    strip_headers=False,
    headers_to_split_on=[("#" * depth, f"h{depth}") for depth in range(1, 4)],
)

# Second pass for markdown notes: size-based splitting applied to the header
# sections, with overlapping chunks and a start index recorded per chunk.
md_chunk_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
def _split_markdown_document(doc: Document) -> list[Document]:
    """Chunk one markdown note: header-based split first, size-based second.

    Every header section receives the note's own metadata before the
    size-based pass runs. When the note's metadata has a non-empty
    "filepath", each resulting chunk is prefixed with a "[Note: <stem>]"
    line so the chunk identifies its note without needing the metadata.

    Args:
        doc: The note to split; its page_content is treated as markdown.

    Returns:
        The chunks produced by the size-based splitter, in splitter order.
    """
    filepath = doc.metadata.get("filepath")
    # Only derive a title when a filepath is present; otherwise no prefix.
    title = Path(filepath).stem if filepath else ""

    # Pass 1: one section per markdown header region.
    sections = md_header_splitter.split_text(doc.page_content)
    for section in sections:
        # Merge the note's metadata into whatever the header splitter set.
        section.metadata.update(doc.metadata)

    # Pass 2: break oversized sections down by size.
    chunks = md_chunk_splitter.split_documents(sections)
    if not title:
        return chunks

    prefix = f"[Note: {title}]\n"
    for chunk in chunks:
        chunk.page_content = f"{prefix}{chunk.page_content}"
    return chunks
def _get_collection_id(): def _get_collection_id():
"""Get the UUID of our collection from the langchain_pg_collection table.""" """Get the UUID of our collection from the langchain_pg_collection table."""
@@ -107,6 +151,7 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
documents = [] documents = []
for doc in data: for doc in data:
metadata = { metadata = {
"source": "paperless",
"created_date": date_to_epoch(doc["created_date"]), "created_date": date_to_epoch(doc["created_date"]),
"filename": doc["original_file_name"], "filename": doc["original_file_name"],
"document_type": doctypes.get(doc["document_type"], ""), "document_type": doctypes.get(doc["document_type"], ""),
@@ -188,6 +233,9 @@ async def fetch_obsidian_documents() -> list[Document]:
metadata = { metadata = {
"source": "obsidian", "source": "obsidian",
"filepath": parsed["filepath"], "filepath": parsed["filepath"],
"folder": str(Path(parsed["filepath"]).parent)
if parsed["filepath"]
else "",
"tags": parsed["tags"], "tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"), "created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(), "indexed_at": time.time(),
@@ -224,8 +272,10 @@ async def index_obsidian_documents():
# Delete existing obsidian chunks # Delete existing obsidian chunks
delete_documents_by_metadata("source", "obsidian") delete_documents_by_metadata("source", "obsidian")
# Split, sanitize, and index documents # Split using markdown-aware chunking, sanitize, and index
splits = text_splitter.split_documents(documents) splits = []
for doc in documents:
splits.extend(_split_markdown_document(doc))
splits = _sanitize_documents(splits) splits = _sanitize_documents(splits)
vector_store = _get_vector_store() vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits) await vector_store.aadd_documents(documents=splits)
@@ -315,6 +365,9 @@ async def sync_obsidian_documents() -> dict[str, int]:
metadata = { metadata = {
"source": "obsidian", "source": "obsidian",
"filepath": parsed["filepath"], "filepath": parsed["filepath"],
"folder": str(Path(parsed["filepath"]).parent)
if parsed["filepath"]
else "",
"tags": parsed["tags"], "tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"), "created_at": parsed["metadata"].get("created_at"),
"indexed_at": now, "indexed_at": now,
@@ -334,7 +387,9 @@ async def sync_obsidian_documents() -> dict[str, int]:
continue continue
if documents: if documents:
splits = text_splitter.split_documents(documents) splits = []
for doc in documents:
splits.extend(_split_markdown_document(doc))
splits = _sanitize_documents(splits) splits = _sanitize_documents(splits)
if splits: if splits:
vector_store = _get_vector_store() vector_store = _get_vector_store()
@@ -350,9 +405,26 @@ async def sync_obsidian_documents() -> dict[str, int]:
return {"added": added, "updated": updated, "deleted": deleted} return {"added": added, "updated": updated, "deleted": deleted}
async def query_vector_store(query: str): async def query_vector_store(
query: str,
source: str | None = None,
k: int = 8,
):
"""Query the vector store with optional source filtering and MMR.
Args:
query: Search query text
source: Filter by source metadata (e.g., "obsidian", "paperless")
k: Number of results to return
"""
vector_store = _get_vector_store() vector_store = _get_vector_store()
retrieved_docs = await vector_store.asimilarity_search(query, k=6) filter_dict = {"source": source} if source else None
retrieved_docs = await vector_store.amax_marginal_relevance_search(
query,
k=k,
fetch_k=k * 3,
filter=filter_dict,
)
serialized = "\n\n".join( serialized = "\n\n".join(
(f"Source: {doc.metadata}\nContent: {doc.page_content}") (f"Source: {doc.metadata}\nContent: {doc.page_content}")
for doc in retrieved_docs for doc in retrieved_docs