Fix datetime serialization in Obsidian metadata for pgvector

YAML frontmatter can contain datetime objects, which aren't
JSON-serializable. Add _make_serializable() to coerce all metadata
values before storing in pgvector.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 07:11:29 -04:00
parent 869de1c250
commit 1e6bc536b4
+37 -24
View File
@@ -116,6 +116,17 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
return documents return documents
def _make_serializable(value):
"""Convert a value to a JSON-serializable type."""
if isinstance(value, (str, int, float, bool, type(None))):
return value
if isinstance(value, (list, tuple)):
return [_make_serializable(v) for v in value]
if isinstance(value, dict):
return {k: _make_serializable(v) for k, v in value.items()}
return str(value)
def _sanitize_text(text_content: str) -> str: def _sanitize_text(text_content: str) -> str:
"""Strip non-printable and invalid characters that break embedding tokenizers.""" """Strip non-printable and invalid characters that break embedding tokenizers."""
# Remove null bytes and control characters (keep newlines and tabs) # Remove null bytes and control characters (keep newlines and tabs)
@@ -174,20 +185,21 @@ async def fetch_obsidian_documents() -> list[Document]:
parsed = obsidian_service.parse_markdown(content, md_path) parsed = obsidian_service.parse_markdown(content, md_path)
# Create LangChain Document with obsidian source # Create LangChain Document with obsidian source
metadata = {
"source": "obsidian",
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
}
document = Document( document = Document(
page_content=parsed["content"], page_content=parsed["content"],
metadata={ metadata=_make_serializable(metadata),
"source": "obsidian",
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
},
) )
documents.append(document) documents.append(document)
@@ -289,20 +301,21 @@ async def sync_obsidian_documents() -> dict[str, int]:
with open(filepath, "r", encoding="utf-8") as f: with open(filepath, "r", encoding="utf-8") as f:
content = f.read() content = f.read()
parsed = obsidian_service.parse_markdown(content, filepath) parsed = obsidian_service.parse_markdown(content, filepath)
metadata = {
"source": "obsidian",
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
}
document = Document( document = Document(
page_content=parsed["content"], page_content=parsed["content"],
metadata={ metadata=_make_serializable(metadata),
"source": "obsidian",
"filepath": parsed["filepath"],
"tags": parsed["tags"],
"created_at": parsed["metadata"].get("created_at"),
"indexed_at": time.time(),
**{
k: v
for k, v in parsed["metadata"].items()
if k not in ["created_at", "created_by"]
},
},
) )
documents.append(document) documents.append(document)
except Exception as e: except Exception as e: