Compare commits

..

7 Commits

Author SHA1 Message Date
Ryan Chen 9629bfcef4 Fix embedding tokenizer mismatch with custom embedding server
Disable tiktoken pre-encoding for custom embedding servers. LangChain
was encoding text into OpenAI token IDs then sending them to llama-server
which has a different vocabulary, causing "invalid tokens" errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-11 23:42:23 -04:00
Ryan Chen b4097730ef Add per-chunk error logging and broaden text sanitizer
Indexes chunks one at a time with error logging to identify which
document/chunk causes embedding failures. Also strips Unicode surrogates
and replacement characters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-11 23:38:03 -04:00
Ryan Chen abb06b78e2 Sanitize document text before embedding to fix tokenizer errors
Strips null bytes, control characters, and excessive whitespace from
document content before sending to embedding models. Fixes 400 errors
from BERT-based tokenizers (e.g. nomic-embed-text) on PDF-extracted text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-11 23:35:25 -04:00
Ryan Chen 92171cbfb6 Support custom OpenAI-compatible embedding server with OpenAI fallback
Adds EMBEDDING_SERVER_URL and EMBEDDING_MODEL_NAME env vars, mirroring
the existing LLAMA_SERVER_URL pattern for LLM configuration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-11 23:24:54 -04:00
ryan 8e884b5e76 Merge pull request 'Refactor frontend to hook-based architecture' (#33) from refactor/frontend-hooks into main
Reviewed-on: #33
2026-04-24 09:12:47 -04:00
ryan ed973357e8 Merge pull request 'Improve Simba system prompt' (#31) from feat/improve-system-prompt into main
Reviewed-on: #31
2026-04-24 09:12:39 -04:00
Ryan Chen db977270a3 Improve Simba system prompt for more helpful responses
Shift focus from cat persona to genuine helpfulness. Keep light
cat flavor but prioritize thorough, detailed answers over the
assertive cat act.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-24 08:58:29 -04:00
4 changed files with 55 additions and 4 deletions
+6
View File
@@ -19,6 +19,12 @@ BASE_URL=192.168.1.5:8000
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1 LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
LLAMA_MODEL_NAME=llama-3.1-8b-instruct LLAMA_MODEL_NAME=llama-3.1-8b-instruct
# Embedding Server Configuration
# If set, uses a custom OpenAI-compatible embedding server (e.g. llama-server)
# Falls back to OpenAI embeddings if EMBEDDING_SERVER_URL is not set
EMBEDDING_SERVER_URL=http://192.168.1.7:8086/v1
EMBEDDING_MODEL_NAME=all-minilm
# OpenAI Configuration # OpenAI Configuration
OPENAI_API_KEY=your-openai-api-key OPENAI_API_KEY=your-openai-api-key
+1 -1
View File
@@ -1,4 +1,4 @@
SIMBA_SYSTEM_PROMPT = """You are a helpful cat assistant named Simba that understands veterinary terms. When there are questions to you specifically, they are referring to Simba the cat. Answer the user in as if you were a cat named Simba. Don't act too catlike. Be assertive. SIMBA_SYSTEM_PROMPT = """You are Simba, Ryan's helpful personal assistant. You're named after his orange cat. You have a warm, friendly personality with a light cat-themed touch, but your priority is always being genuinely useful — give thorough, detailed answers and think things through carefully. When asked about Simba the cat, you speak as him in first person. For everything else, you're just a great assistant who happens to have a cat's name.
SIMBA FACTS (as of January 2026): SIMBA FACTS (as of January 2026):
- Name: Simba - Name: Simba
+46 -3
View File
@@ -1,6 +1,7 @@
import datetime import datetime
import logging import logging
import os import os
import re
from dotenv import load_dotenv from dotenv import load_dotenv
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -17,7 +18,18 @@ load_dotenv()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small") _embedding_server_url = os.getenv("EMBEDDING_SERVER_URL")
# Treat an empty value the same as an unset one: a compose entry of the form
# EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME} hands the process an empty
# string when the host variable is undefined, and a plain getenv default
# would not apply in that case.
_embedding_model = os.getenv("EMBEDDING_MODEL_NAME") or "text-embedding-3-small"
if _embedding_server_url:
    # Custom OpenAI-compatible embedding server (e.g. llama-server).
    embeddings = OpenAIEmbeddings(
        model=_embedding_model,
        base_url=_embedding_server_url,
        # Placeholder value; presumably the custom server does not validate
        # it -- confirm against the server's configuration.
        api_key="not-needed",
        # Send raw text instead of tiktoken-encoded OpenAI token IDs, which a
        # server with a different vocabulary rejects as invalid tokens.
        check_embedding_ctx_length=False,
    )
else:
    # No custom server configured: use the default OpenAI endpoint.
    embeddings = OpenAIEmbeddings(model=_embedding_model)
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg:// # Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
_db_url = os.getenv( _db_url = os.getenv(
@@ -103,13 +115,43 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
return documents return documents
def _sanitize_text(text_content: str) -> str:
"""Strip non-printable and invalid characters that break embedding tokenizers."""
# Remove null bytes and control characters (keep newlines and tabs)
text_content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text_content)
# Remove Unicode surrogates and other problematic Unicode
text_content = re.sub(r"[\ud800-\udfff\ufffe\uffff]", "", text_content)
# Remove replacement character clusters
text_content = text_content.replace("\ufffd", "")
# Collapse excessive whitespace
text_content = re.sub(r" {3,}", " ", text_content)
return text_content.strip()
def _sanitize_documents(documents: list[Document]) -> list[Document]:
    """Sanitize page_content of all documents for embedding compatibility.

    Each document's ``page_content`` is overwritten in place with its
    sanitized form, so the caller's objects are modified.

    Args:
        documents: Documents (typically text-splitter chunks) to clean.

    Returns:
        A new list, in the original order, of the documents whose sanitized
        ``page_content`` is non-empty.
    """
    kept: list[Document] = []
    for doc in documents:
        doc.page_content = _sanitize_text(doc.page_content)
        # Drop chunks that have nothing left once sanitized.
        if doc.page_content:
            kept.append(doc)
    return kept
async def index_documents(): async def index_documents():
"""Index Paperless-NGX documents into vector store.""" """Index Paperless-NGX documents into vector store."""
documents = await fetch_documents_from_paperless_ngx() documents = await fetch_documents_from_paperless_ngx()
splits = text_splitter.split_documents(documents) splits = text_splitter.split_documents(documents)
splits = _sanitize_documents(splits)
logger.info(f"Indexing {len(splits)} chunks from {len(documents)} documents")
vector_store = _get_vector_store() vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits) for i, split in enumerate(splits):
try:
await vector_store.aadd_documents(documents=[split])
except Exception as e:
logger.error(
f"Failed to embed chunk {i} from {split.metadata.get('filename', 'unknown')}: {e}"
)
logger.debug(f"Chunk content preview: {split.page_content[:200]!r}")
raise
async def fetch_obsidian_documents() -> list[Document]: async def fetch_obsidian_documents() -> list[Document]:
@@ -168,8 +210,9 @@ async def index_obsidian_documents():
# Delete existing obsidian chunks # Delete existing obsidian chunks
delete_documents_by_metadata("source", "obsidian") delete_documents_by_metadata("source", "obsidian")
# Split and index documents # Split, sanitize, and index documents
splits = text_splitter.split_documents(documents) splits = text_splitter.split_documents(documents)
splits = _sanitize_documents(splits)
vector_store = _get_vector_store() vector_store = _get_vector_store()
await vector_store.aadd_documents(documents=splits) await vector_store.aadd_documents(documents=splits)
+2
View File
@@ -31,6 +31,8 @@ services:
- BASE_URL=${BASE_URL} - BASE_URL=${BASE_URL}
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434} - OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
- OPENAI_API_KEY=${OPENAI_API_KEY} - OPENAI_API_KEY=${OPENAI_API_KEY}
- EMBEDDING_SERVER_URL=${EMBEDDING_SERVER_URL}
- EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME}
- JWT_SECRET_KEY=${JWT_SECRET_KEY} - JWT_SECRET_KEY=${JWT_SECRET_KEY}
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL} - LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME} - LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}