diff --git a/main.py b/main.py
index e6f4873..101d36a 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,14 @@
+import datetime
+
 from chromadb import HttpClient
 from quart import Quart, jsonify, request
 from tortoise.contrib.quart import register_tortoise
 
+from chunker import Chunker
+from models import EmbeddingEngine, PaperlessDocument
+from request import PaperlessNGXService
+from utils import date_to_epoch
+
 app = Quart(__name__)
 
 TORTOISE_CONFIG = {
@@ -21,6 +28,7 @@ TORTOISE_CONFIG = {
 
 chroma_client = HttpClient(host="localhost", port=8333)
 chroma_client.heartbeat()
+simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")
 
 register_tortoise(
     app,
@@ -29,6 +37,46 @@ register_tortoise(
 )
 
 
+async def chunk_data(docs, collection, doctypes):
+    """Chunk Paperless documents into ``collection`` and record each one.
+
+    Each document's text is passed to ``Chunker.chunk_document`` with its
+    metadata, then a ``PaperlessDocument`` row is created for it.
+
+    Args:
+        docs: Paperless document dicts providing ``id``, ``content``,
+            ``created_date``, ``original_file_name`` and ``document_type``.
+        collection: Collection handed to ``Chunker``.
+        doctypes: Lookup keyed by a document's ``document_type`` value
+            (presumably id -> name; confirm against
+            ``PaperlessNGXService.get_doctypes``). May be ``None`` or empty.
+    """
+    chunker = Chunker(collection)
+    # Normalise once so the lookups below cannot fail when no lookup is given.
+    doctypes = doctypes or {}
+
+    for doc in docs:
+        doctype = doctypes.get(doc["document_type"])
+        metadata = {
+            "created_date": date_to_epoch(doc["created_date"]),
+            "filename": doc["original_file_name"],
+            "document_type": "" if doctype is None else doctype,
+        }
+        # Attach "type" only when the lookup resolved, so no None value is
+        # stored in the metadata.
+        if doctype is not None:
+            metadata["type"] = doctype
+
+        chunker.chunk_document(
+            document=doc["content"],
+            metadata=metadata,
+        )
+        await PaperlessDocument.create(
+            paperless_id=doc["id"],
+            embedding_model=EmbeddingEngine.OPENAI,
+        )
+
+
 @app.route("/api/query")
 async def query():
     data = await request.get_json()
@@ -45,7 +93,28 @@ async def index():
 
 
 @app.route("/api/index/paperless/all")
-async def reindex_all_documents():
+async def reindex_all_paperless_documents():
+    """Drop every indexed Paperless document and rebuild the index.
+
+    Deletes all ``PaperlessDocument`` rows and all entries of the
+    ``simba_docs`` collection, then re-chunks every document returned by
+    ``PaperlessNGXService``.
+    """
+    # Delete from database
+    await PaperlessDocument.all().delete()
+
+    # Delete from ChromaDB; only the ids are needed, so skip the payloads.
+    all_docs = simba_docs.get(include=[])
+    if all_docs["ids"]:
+        simba_docs.delete(ids=all_docs["ids"])
+
+    # Everything was just wiped, so there are no already-indexed documents
+    # to skip: every document from Paperless is re-chunked.
+    ppngx = PaperlessNGXService()
+    docs = ppngx.get_data()
+    doctype_lookup = ppngx.get_doctypes()
+    await chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
+
     return ""
 
 
diff --git a/models.py b/models.py
index bbd6754..8e09d37 100644
--- a/models.py
+++ b/models.py
@@ -12,4 +12,5 @@ class EmbeddingEngine(enum.Enum):
 class PaperlessDocument(Model):
     id = fields.UUIDField(primary_key=True)
-    paperless_id = fields.CharField(unique=True)
-    indexing_engine = fields.CharEnumField(enum_type=EmbeddingEngine)
+    # CharField needs an explicit max_length.
+    paperless_id = fields.CharField(max_length=255, unique=True)
+    embedding_model = fields.CharEnumField(enum_type=EmbeddingEngine)