This commit is contained in:
2025-10-05 20:31:46 -04:00
parent 0bb3e3172b
commit 910097d13b
7 changed files with 146 additions and 98 deletions

View File

@@ -3,15 +3,19 @@ from math import ceil
import re
from typing import Union
from uuid import UUID, uuid4
from ollama import Client
from chromadb.utils.embedding_functions.openai_embedding_function import (
OpenAIEmbeddingFunction,
)
from dotenv import load_dotenv
# Load variables from a .env file *before* reading any of them; otherwise an
# OPENAI_API_KEY that is defined only in .env is not yet in the environment
# when USE_OPENAI is computed below.
load_dotenv()

# True when an OpenAI API key is present in the environment.
USE_OPENAI = os.getenv("OPENAI_API_KEY") is not None

# Ollama client; the host comes from OLLAMA_HOST, defaulting to a local server.
ollama_client = Client(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))
def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
if header_patterns is None:
@@ -88,6 +92,17 @@ class Chunker:
def __init__(self, collection) -> None:
self.collection = collection
def embedding_fx(self, inputs):
    """Embed a batch of texts with whichever backend is configured.

    When ``USE_OPENAI`` is true the texts are embedded with OpenAI's
    ``text-embedding-3-small`` model; otherwise they are sent to the
    ``mxbai-embed-large`` model through the module-level Ollama client.

    Args:
        inputs: The texts to embed (presumably a list of strings, as both
            backends are called with it directly -- confirm against callers).

    Returns:
        The embeddings returned by the selected backend.
    """
    if USE_OPENAI:
        openai_embedding_fx = OpenAIEmbeddingFunction(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_name="text-embedding-3-small",
        )
        return openai_embedding_fx(inputs)

    # Pass the whole batch, not just inputs[0]: embedding only the first
    # element silently dropped every other text and made this branch disagree
    # with the OpenAI branch (and raised IndexError on an empty batch).
    response = ollama_client.embed(model="mxbai-embed-large", input=inputs)
    return response["embeddings"]
def chunk_document(
self,
document: str,