From b4097730efdf3bb850ae14388e84d75bf85516ef Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Mon, 11 May 2026 23:38:03 -0400 Subject: [PATCH] Add per-chunk error logging and broaden text sanitizer Indexes chunks one at a time, logging the chunk index and source filename before re-raising, to identify which document/chunk causes embedding failures. Also strips Unicode surrogates, the noncharacters U+FFFE/U+FFFF, and replacement characters (U+FFFD). Co-Authored-By: Claude Opus 4.6 --- blueprints/rag/logic.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/blueprints/rag/logic.py b/blueprints/rag/logic.py index 4ea730d..cff8393 100644 --- a/blueprints/rag/logic.py +++ b/blueprints/rag/logic.py @@ -118,6 +118,10 @@ def _sanitize_text(text_content: str) -> str: """Strip non-printable and invalid characters that break embedding tokenizers.""" # Remove null bytes and control characters (keep newlines and tabs) text_content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text_content) + # Remove lone Unicode surrogates and the noncharacters U+FFFE/U+FFFF + text_content = re.sub(r"[\ud800-\udfff\ufffe\uffff]", "", text_content) + # Remove every Unicode replacement character (U+FFFD) + text_content = text_content.replace("\ufffd", "") # Collapse excessive whitespace text_content = re.sub(r" {3,}", " ", text_content) return text_content.strip() @@ -136,8 +140,17 @@ async def index_documents(): splits = text_splitter.split_documents(documents) splits = _sanitize_documents(splits) + logger.info(f"Indexing {len(splits)} chunks from {len(documents)} documents") vector_store = _get_vector_store() - await vector_store.aadd_documents(documents=splits) + for i, split in enumerate(splits): + try: + await vector_store.aadd_documents(documents=[split]) + except Exception as e: + logger.error( + f"Failed to embed chunk {i} from {split.metadata.get('filename', 'unknown')}: {e}" + ) + logger.debug(f"Chunk content preview: {split.page_content[:200]!r}") + raise async def fetch_obsidian_documents() -> list[Document]: