Compare commits
4 Commits
8e884b5e76
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 9629bfcef4 | |||
| b4097730ef | |||
| abb06b78e2 | |||
| 92171cbfb6 |
@@ -19,6 +19,12 @@ BASE_URL=192.168.1.5:8000
|
|||||||
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
|
LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
|
||||||
LLAMA_MODEL_NAME=llama-3.1-8b-instruct
|
LLAMA_MODEL_NAME=llama-3.1-8b-instruct
|
||||||
|
|
||||||
|
# Embedding Server Configuration
|
||||||
|
# If set, uses a custom OpenAI-compatible embedding server (e.g. llama-server)
|
||||||
|
# Falls back to OpenAI embeddings if not set
|
||||||
|
EMBEDDING_SERVER_URL=http://192.168.1.7:8086/v1
|
||||||
|
EMBEDDING_MODEL_NAME=all-minilm
|
||||||
|
|
||||||
# OpenAI Configuration
|
# OpenAI Configuration
|
||||||
OPENAI_API_KEY=your-openai-api-key
|
OPENAI_API_KEY=your-openai-api-key
|
||||||
|
|
||||||
|
|||||||
+46
-3
@@ -1,6 +1,7 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@@ -17,7 +18,18 @@ load_dotenv()
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
_embedding_server_url = os.getenv("EMBEDDING_SERVER_URL")
|
||||||
|
_embedding_model = os.getenv("EMBEDDING_MODEL_NAME", "text-embedding-3-small")
|
||||||
|
|
||||||
|
if _embedding_server_url:
|
||||||
|
embeddings = OpenAIEmbeddings(
|
||||||
|
model=_embedding_model,
|
||||||
|
base_url=_embedding_server_url,
|
||||||
|
api_key="not-needed",
|
||||||
|
check_embedding_ctx_length=False,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
embeddings = OpenAIEmbeddings(model=_embedding_model)
|
||||||
|
|
||||||
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
|
# Convert Tortoise-style postgres:// URL to SQLAlchemy-style postgresql+psycopg://
|
||||||
_db_url = os.getenv(
|
_db_url = os.getenv(
|
||||||
@@ -103,13 +115,43 @@ async def fetch_documents_from_paperless_ngx() -> list[Document]:
|
|||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_text(text_content: str) -> str:
|
||||||
|
"""Strip non-printable and invalid characters that break embedding tokenizers."""
|
||||||
|
# Remove null bytes and control characters (keep newlines and tabs)
|
||||||
|
text_content = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text_content)
|
||||||
|
# Remove Unicode surrogates and other problematic Unicode
|
||||||
|
text_content = re.sub(r"[\ud800-\udfff\ufffe\uffff]", "", text_content)
|
||||||
|
# Remove replacement character clusters
|
||||||
|
text_content = text_content.replace("\ufffd", "")
|
||||||
|
# Collapse excessive whitespace
|
||||||
|
text_content = re.sub(r" {3,}", " ", text_content)
|
||||||
|
return text_content.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_documents(documents: list[Document]) -> list[Document]:
|
||||||
|
"""Sanitize page_content of all documents for embedding compatibility."""
|
||||||
|
for doc in documents:
|
||||||
|
doc.page_content = _sanitize_text(doc.page_content)
|
||||||
|
return [doc for doc in documents if doc.page_content]
|
||||||
|
|
||||||
|
|
||||||
async def index_documents():
|
async def index_documents():
|
||||||
"""Index Paperless-NGX documents into vector store."""
|
"""Index Paperless-NGX documents into vector store."""
|
||||||
documents = await fetch_documents_from_paperless_ngx()
|
documents = await fetch_documents_from_paperless_ngx()
|
||||||
|
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = text_splitter.split_documents(documents)
|
||||||
|
splits = _sanitize_documents(splits)
|
||||||
|
logger.info(f"Indexing {len(splits)} chunks from {len(documents)} documents")
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
await vector_store.aadd_documents(documents=splits)
|
for i, split in enumerate(splits):
|
||||||
|
try:
|
||||||
|
await vector_store.aadd_documents(documents=[split])
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to embed chunk {i} from {split.metadata.get('filename', 'unknown')}: {e}"
|
||||||
|
)
|
||||||
|
logger.debug(f"Chunk content preview: {split.page_content[:200]!r}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
async def fetch_obsidian_documents() -> list[Document]:
|
async def fetch_obsidian_documents() -> list[Document]:
|
||||||
@@ -168,8 +210,9 @@ async def index_obsidian_documents():
|
|||||||
# Delete existing obsidian chunks
|
# Delete existing obsidian chunks
|
||||||
delete_documents_by_metadata("source", "obsidian")
|
delete_documents_by_metadata("source", "obsidian")
|
||||||
|
|
||||||
# Split and index documents
|
# Split, sanitize, and index documents
|
||||||
splits = text_splitter.split_documents(documents)
|
splits = text_splitter.split_documents(documents)
|
||||||
|
splits = _sanitize_documents(splits)
|
||||||
vector_store = _get_vector_store()
|
vector_store = _get_vector_store()
|
||||||
await vector_store.aadd_documents(documents=splits)
|
await vector_store.aadd_documents(documents=splits)
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ services:
|
|||||||
- BASE_URL=${BASE_URL}
|
- BASE_URL=${BASE_URL}
|
||||||
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
|
- OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
|
||||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- EMBEDDING_SERVER_URL=${EMBEDDING_SERVER_URL}
|
||||||
|
- EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME}
|
||||||
- JWT_SECRET_KEY=${JWT_SECRET_KEY}
|
- JWT_SECRET_KEY=${JWT_SECRET_KEY}
|
||||||
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
|
- LLAMA_SERVER_URL=${LLAMA_SERVER_URL}
|
||||||
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}
|
- LLAMA_MODEL_NAME=${LLAMA_MODEL_NAME}
|
||||||
|
|||||||
Reference in New Issue
Block a user