Improve Obsidian RAG retrieval for large vaults
- Markdown-aware chunking (split on headers before size-based splitting) - Prepend note filename to each chunk for self-contained context - Source-filtered retrieval (obsidian/paperless queries stay isolated) - MMR search with k=8, fetch_k=24 for better recall and diversity - Add source metadata to Paperless docs and folder metadata to Obsidian docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -121,7 +121,7 @@ async def simba_search(query: str):
|
|||||||
Relevant information from Simba's documents
|
Relevant information from Simba's documents
|
||||||
"""
|
"""
|
||||||
print(f"[SIMBA SEARCH] Tool called with query: {query}")
|
print(f"[SIMBA SEARCH] Tool called with query: {query}")
|
||||||
serialized, docs = await query_vector_store(query=query)
|
serialized, docs = await query_vector_store(query=query, source="paperless")
|
||||||
print(f"[SIMBA SEARCH] Found {len(docs)} documents")
|
print(f"[SIMBA SEARCH] Found {len(docs)} documents")
|
||||||
print(f"[SIMBA SEARCH] Serialized result length: {len(serialized)}")
|
print(f"[SIMBA SEARCH] Serialized result length: {len(serialized)}")
|
||||||
print(f"[SIMBA SEARCH] First 200 chars: {serialized[:200]}")
|
print(f"[SIMBA SEARCH] First 200 chars: {serialized[:200]}")
|
||||||
@@ -329,8 +329,8 @@ async def obsidian_search_notes(query: str) -> str:
|
|||||||
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
|
return "Obsidian integration is not configured. Please set OBSIDIAN_VAULT_PATH environment variable."
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Query vector store for obsidian documents
|
# Query vector store filtered to obsidian source only
|
||||||
serialized, docs = await query_vector_store(query=query)
|
serialized, docs = await query_vector_store(query=query, source="obsidian")
|
||||||
return serialized
|
return serialized
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
+78
-6
@@ -3,12 +3,16 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_openai import OpenAIEmbeddings
|
from langchain_openai import OpenAIEmbeddings
|
||||||
from langchain_postgres import PGVector
|
from langchain_postgres import PGVector
|
||||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
from langchain_text_splitters import (
|
||||||
|
MarkdownHeaderTextSplitter,
|
||||||
|
RecursiveCharacterTextSplitter,
|
||||||
|
)
|
||||||
from sqlalchemy import create_engine, text
|
from sqlalchemy import create_engine, text
|
||||||
|
|
||||||
from .fetchers import PaperlessNGXService
|
from .fetchers import PaperlessNGXService
|
||||||
@@ -69,6 +73,46 @@ text_splitter = RecursiveCharacterTextSplitter(
|
|||||||
add_start_index=True, # track index in original document
|
add_start_index=True, # track index in original document
|
||||||
)
|
)
|
||||||
|
|
||||||
|
md_header_splitter = MarkdownHeaderTextSplitter(
|
||||||
|
headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")],
|
||||||
|
strip_headers=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
md_chunk_splitter = RecursiveCharacterTextSplitter(
|
||||||
|
chunk_size=1000,
|
||||||
|
chunk_overlap=200,
|
||||||
|
add_start_index=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_markdown_document(doc: Document) -> list[Document]:
|
||||||
|
"""Split a markdown document by headers first, then by size.
|
||||||
|
|
||||||
|
Prepends the note filename to each chunk so chunks are self-contained.
|
||||||
|
"""
|
||||||
|
note_name = (
|
||||||
|
Path(doc.metadata.get("filepath", "")).stem
|
||||||
|
if doc.metadata.get("filepath")
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
|
||||||
|
# Split by markdown headers
|
||||||
|
header_splits = md_header_splitter.split_text(doc.page_content)
|
||||||
|
|
||||||
|
# Carry over original document metadata to each header split
|
||||||
|
for split in header_splits:
|
||||||
|
split.metadata.update(doc.metadata)
|
||||||
|
|
||||||
|
# Then apply size-based splitting on large sections
|
||||||
|
sized_splits = md_chunk_splitter.split_documents(header_splits)
|
||||||
|
|
||||||
|
# Prepend note name for self-contained context
|
||||||
|
if note_name:
|
||||||
|
for split in sized_splits:
|
||||||
|
split.page_content = f"[Note: {note_name}]\n{split.page_content}"
|
||||||
|
|
||||||
|
return sized_splits
|
||||||
|
|
||||||
|
|
||||||
def _get_collection_id():
|
def _get_collection_id():
|
||||||
"""Get the UUID of our collection from the langchain_pg_collection table."""
|
"""Get the UUID of our collection from the langchain_pg_collection table."""
|
||||||
@@ -107,6 +151,7 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
|
|||||||
documents = []
|
documents = []
|
||||||
for doc in data:
|
for doc in data:
|
||||||
metadata = {
|
metadata = {
|
||||||
|
"source": "paperless",
|
||||||
"created_date": date_to_epoch(doc["created_date"]),
|
"created_date": date_to_epoch(doc["created_date"]),
|
||||||
"filename": doc["original_file_name"],
|
"filename": doc["original_file_name"],
|
||||||
"document_type": doctypes.get(doc["document_type"], ""),
|
"document_type": doctypes.get(doc["document_type"], ""),
|
||||||
@@ -188,6 +233,9 @@ async def fetch_obsidian_documents() -> list[Document]:
|
|||||||
metadata = {
|
metadata = {
|
||||||
"source": "obsidian",
|
"source": "obsidian",
|
||||||
"filepath": parsed["filepath"],
|
"filepath": parsed["filepath"],
|
||||||
|
"folder": str(Path(parsed["filepath"]).parent)
|
||||||
|
if parsed["filepath"]
|
||||||
|
else "",
|
||||||
"tags": parsed["tags"],
|
"tags": parsed["tags"],
|
||||||
"created_at": parsed["metadata"].get("created_at"),
|
"created_at": parsed["metadata"].get("created_at"),
|
||||||
"indexed_at": time.time(),
|
"indexed_at": time.time(),
|
||||||
@@ -224,8 +272,10 @@ async def index_obsidian_documents():
|
|||||||
# Delete existing obsidian chunks
|
# Delete existing obsidian chunks
|
||||||
delete_documents_by_metadata("source", "obsidian")
|
delete_documents_by_metadata("source", "obsidian")
|
||||||
|
|
||||||
# Split, sanitize, and index documents
|
# Split using markdown-aware chunking, sanitize, and index
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = []
|
||||||
|
for doc in documents:
|
||||||
|
splits.extend(_split_markdown_document(doc))
|
||||||
splits = _sanitize_documents(splits)
|
splits = _sanitize_documents(splits)
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
await vector_store.aadd_documents(documents=splits)
|
await vector_store.aadd_documents(documents=splits)
|
||||||
@@ -315,6 +365,9 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
metadata = {
|
metadata = {
|
||||||
"source": "obsidian",
|
"source": "obsidian",
|
||||||
"filepath": parsed["filepath"],
|
"filepath": parsed["filepath"],
|
||||||
|
"folder": str(Path(parsed["filepath"]).parent)
|
||||||
|
if parsed["filepath"]
|
||||||
|
else "",
|
||||||
"tags": parsed["tags"],
|
"tags": parsed["tags"],
|
||||||
"created_at": parsed["metadata"].get("created_at"),
|
"created_at": parsed["metadata"].get("created_at"),
|
||||||
"indexed_at": now,
|
"indexed_at": now,
|
||||||
@@ -334,7 +387,9 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if documents:
|
if documents:
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = []
|
||||||
|
for doc in documents:
|
||||||
|
splits.extend(_split_markdown_document(doc))
|
||||||
splits = _sanitize_documents(splits)
|
splits = _sanitize_documents(splits)
|
||||||
if splits:
|
if splits:
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
@@ -350,9 +405,26 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
return {"added": added, "updated": updated, "deleted": deleted}
|
return {"added": added, "updated": updated, "deleted": deleted}
|
||||||
|
|
||||||
|
|
||||||
async def query_vector_store(query: str):
|
async def query_vector_store(
|
||||||
|
query: str,
|
||||||
|
source: str | None = None,
|
||||||
|
k: int = 8,
|
||||||
|
):
|
||||||
|
"""Query the vector store with optional source filtering and MMR.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
query: Search query text
|
||||||
|
source: Filter by source metadata (e.g., "obsidian", "paperless")
|
||||||
|
k: Number of results to return
|
||||||
|
"""
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
retrieved_docs = await vector_store.asimilarity_search(query, k=6)
|
filter_dict = {"source": source} if source else None
|
||||||
|
retrieved_docs = await vector_store.amax_marginal_relevance_search(
|
||||||
|
query,
|
||||||
|
k=k,
|
||||||
|
fetch_k=k * 3,
|
||||||
|
filter=filter_dict,
|
||||||
|
)
|
||||||
serialized = "\n\n".join(
|
serialized = "\n\n".join(
|
||||||
(f"Source: {doc.metadata}\nContent: {doc.page_content}")
|
(f"Source: {doc.metadata}\nContent: {doc.page_content}")
|
||||||
for doc in retrieved_docs
|
for doc in retrieved_docs
|
||||||
|
|||||||
Reference in New Issue
Block a user