Fix datetime serialization in Obsidian metadata for pgvector
YAML frontmatter can contain datetime objects, which aren't JSON-serializable. Add _make_serializable() to coerce all metadata values before storing in pgvector.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+19
-6
@@ -116,6 +116,17 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
|
|||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
def _make_serializable(value):
|
||||||
|
"""Convert a value to a JSON-serializable type."""
|
||||||
|
if isinstance(value, (str, int, float, bool, type(None))):
|
||||||
|
return value
|
||||||
|
if isinstance(value, (list, tuple)):
|
||||||
|
return [_make_serializable(v) for v in value]
|
||||||
|
if isinstance(value, dict):
|
||||||
|
return {k: _make_serializable(v) for k, v in value.items()}
|
||||||
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_text(text_content: str) -> str:
|
def _sanitize_text(text_content: str) -> str:
|
||||||
"""Strip non-printable and invalid characters that break embedding tokenizers."""
|
"""Strip non-printable and invalid characters that break embedding tokenizers."""
|
||||||
# Remove null bytes and control characters (keep newlines and tabs)
|
# Remove null bytes and control characters (keep newlines and tabs)
|
||||||
@@ -174,8 +185,6 @@ async def fetch_obsidian_documents() -> list[Document]:
|
|||||||
parsed = obsidian_service.parse_markdown(content, md_path)
|
parsed = obsidian_service.parse_markdown(content, md_path)
|
||||||
|
|
||||||
# Create LangChain Document with obsidian source
|
# Create LangChain Document with obsidian source
|
||||||
document = Document(
|
|
||||||
page_content=parsed["content"],
|
|
||||||
metadata = {
|
metadata = {
|
||||||
"source": "obsidian",
|
"source": "obsidian",
|
||||||
"filepath": parsed["filepath"],
|
"filepath": parsed["filepath"],
|
||||||
@@ -187,7 +196,10 @@ async def fetch_obsidian_documents() -> list[Document]:
|
|||||||
for k, v in parsed["metadata"].items()
|
for k, v in parsed["metadata"].items()
|
||||||
if k not in ["created_at", "created_by"]
|
if k not in ["created_at", "created_by"]
|
||||||
},
|
},
|
||||||
},
|
}
|
||||||
|
document = Document(
|
||||||
|
page_content=parsed["content"],
|
||||||
|
metadata=_make_serializable(metadata),
|
||||||
)
|
)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
|
|
||||||
@@ -289,8 +301,6 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
with open(filepath, "r", encoding="utf-8") as f:
|
with open(filepath, "r", encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
parsed = obsidian_service.parse_markdown(content, filepath)
|
parsed = obsidian_service.parse_markdown(content, filepath)
|
||||||
document = Document(
|
|
||||||
page_content=parsed["content"],
|
|
||||||
metadata = {
|
metadata = {
|
||||||
"source": "obsidian",
|
"source": "obsidian",
|
||||||
"filepath": parsed["filepath"],
|
"filepath": parsed["filepath"],
|
||||||
@@ -302,7 +312,10 @@ async def sync_obsidian_documents() -> dict[str, int]:
|
|||||||
for k, v in parsed["metadata"].items()
|
for k, v in parsed["metadata"].items()
|
||||||
if k not in ["created_at", "created_by"]
|
if k not in ["created_at", "created_by"]
|
||||||
},
|
},
|
||||||
},
|
}
|
||||||
|
document = Document(
|
||||||
|
page_content=parsed["content"],
|
||||||
|
metadata=_make_serializable(metadata),
|
||||||
)
|
)
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user