Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9629bfcef4 | |||
| b4097730ef | |||
| abb06b78e2 | |||
| 92171cbfb6 | |||
| 8e884b5e76 | |||
| ed973357e8 | |||
| db977270a3 |
@@ -19,6 +19,12 @@ BASE_URL=192.168.1.5:8000
|
|||||||
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
|
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
|
||||||
LLAMA_MODEL_NAME=llama-3.1-8b-instruct
|
LLAMA_MODEL_NAME=llama-3.1-8b-instruct
|
||||||
|
|
||||||
|
# Embedding Server Configuration
|
||||||
|
# If set, uses a custom OpenAI-compatible embedding server (e.g. llama-server)
|
||||||
|
# Falls back to OpenAI embeddings if not set
|
||||||
|
EMBEDDING_SERVER_URL=http://192.168.1.7:8086/v1
|
||||||
|
EMBEDDING_MODEL_NAME=all-minilm
|
||||||
|
|
||||||
# OpenAI Configuration
|
# OpenAI Configuration
|
||||||
OPENAI_API_KEY=your-openai-api-key
|
OPENAI_API_KEY=your-openai-api-key
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SIMBA_SYSTEM_PROMPT = """You are a helpful cat assistant named Simba that understands veterinary terms. When there are questions to you specifically, they are referring to Simba the cat. Answer the user in as if you were a cat named Simba. Don't act too catlike. Be assertive.
|
SIMBA_SYSTEM_PROMPT = """You are Simba, Ryan's helpful personal assistant. You're named after his orange cat. You have a warm, friendly personality with a light cat-themed touch, but your priority is always being genuinely useful — give thorough, detailed answers and think things through carefully. When asked about Simba the cat, you speak as him in first person. For everything else, you're just a great assistant who happens to have a cat's name.
|
||||||
|
|
||||||
SIMBA FACTS (as of January 2026):
|
SIMBA FACTS (as of January 2026):
|
||||||
- Name: Simba
|
- Name: Simba
|
||||||
|
|||||||
+46
-3
@@ -1,6 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@@ -17,7 +18,18 @@ load_dotenv()
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
_embedding_server_url = os.getenv("EMBEDDING_SERVER_URL")
|
||||||
|
_embedding_model = os.getenv("EMBEDDING_MODEL_NAME", "text-embedding-3-small")
|
||||||
|
|
||||||
|
if _embedding_server_url:
|
||||||
|
embeddings = OpenAIEmbeddings(
|
||||||
|
model=_embedding_model,
|
||||||
|
base_url=_embedding_server_url,
|
||||||
|
api_key="not-needed",
|
||||||
|
check_embedding_ctx_length=False,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
embeddings = OpenAIEmbeddings(model=_embedding_model)
|
||||||
|
|
||||||
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
|
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
|
||||||
_db_url = os.getenv(
|
_db_url = os.getenv(
|
||||||
@@ -103,13 +115,43 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
|
|||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_text(text_content: str) -> str:
|
||||||
|
"""Strip non-printable and invalid characters that break embedding tokenizers."""
|
||||||
|
# Remove null bytes, C0/C1 control characters and DEL (keep tabs, newlines, and carriage returns)
|
||||||
|
text_content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text_content)
|
||||||
|
# Remove Unicode surrogates (U+D800-U+DFFF) and the noncharacters U+FFFE and U+FFFF
|
||||||
|
text_content = re.sub(r"[\ud800-\udfff\ufffe\uffff]", "", text_content)
|
||||||
|
# Remove every U+FFFD replacement character
|
||||||
|
text_content = text_content.replace("\ufffd", "")
|
||||||
|
# Collapse runs of three or more consecutive spaces
|
||||||
|
text_content = re.sub(r" {3,}", " ", text_content)
|
||||||
|
return text_content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_documents(documents: list[Document]) -> list[Document]:
|
||||||
|
"""Sanitize page_content of all documents for embedding compatibility."""
|
||||||
|
for doc in documents:
|
||||||
|
doc.page_content = _sanitize_text(doc.page_content)
|
||||||
|
return [doc for doc in documents if doc.page_content]
|
||||||
|
|
||||||
|
|
||||||
async def index_documents():
|
async def index_documents():
|
||||||
"""Index Paperless-NGX documents into vector store."""
|
"""Index Paperless-NGX documents into vector store."""
|
||||||
documents = await fetch_documents_from_paperless_ngx()
|
documents = await fetch_documents_from_paperless_ngx()
|
||||||
|
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = text_splitter.split_documents(documents)
|
||||||
|
splits = _sanitize_documents(splits)
|
||||||
|
logger.info(f"Indexing {len(splits)} chunks from {len(documents)} documents")
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
await vector_store.aadd_documents(documents=splits)
|
for i, split in enumerate(splits):
|
||||||
|
try:
|
||||||
|
await vector_store.aadd_documents(documents=[split])
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to embed chunk {i} from {split.metadata.get('filename', 'unknown')}: {e}"
|
||||||
|
)
|
||||||
|
logger.debug(f"Chunk content preview: {split.page_content[:200]!r}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
async def fetch_obsidian_documents() -> list[Document]:
|
async def fetch_obsidian_documents() -> list[Document]:
|
||||||
@@ -168,8 +210,9 @@ async def index_obsidian_documents():
|
|||||||
# Delete existing obsidian chunks
|
# Delete existing obsidian chunks
|
||||||
delete_documents_by_metadata("source", "obsidian")
|
delete_documents_by_metadata("source", "obsidian")
|
||||||
|
|
||||||
# Split and index documents
|
# Split, sanitize, and index documents
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = text_splitter.split_documents(documents)
|
||||||
|
splits = _sanitize_documents(splits)
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
await vector_store.aadd_documents(documents=splits)
|
await vector_store.aadd_documents(documents=splits)
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ services:
|
|||||||
- BASE_URL=${BASE_URL}
|
- BASE_URL=${BASE_URL}
|
||||||
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
|
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
|
||||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- EMBEDDING_SERVER_URL=${EMBEDDING_SERVER_URL}
|
||||||
|
- EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME}
|
||||||
- JWT_SECRET_KEY=${JWT_SECRET_KEY}
|
- JWT_SECRET_KEY=${JWT_SECRET_KEY}
|
||||||
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
|
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
|
||||||
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}
|
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}
|
||||||
|
|||||||
Reference in New Issue
Block a user