"""Quart application that indexes Paperless-NGX documents into ChromaDB.

Importing this module has side effects: it connects to the ChromaDB HTTP
server, opens (or creates) the ``simba_docs2`` collection, and registers
Tortoise ORM on the Quart app.
"""

import datetime

from chromadb import HttpClient
from quart import Quart, jsonify, request
from tortoise.contrib.quart import register_tortoise

from chunker import Chunker
from models import EmbeddingEngine, PaperlessDocument
from request import PaperlessNGXService
from utils import date_to_epoch

app = Quart(__name__)

# NOTE(review): credentials are hard-coded in the connection URL below;
# consider loading them from configuration before deploying.
TORTOISE_CONFIG = {
    "connections": {
        "default": "postgresql://postgres:password@localhost?statusColor=&env=local&name=simbarag&tLSMode=0&usePrivateKey=false&safeModeLevel=0&advancedSafeModeLevel=0&driverVersion=0&showSystemSchemas=0&driverVersion=0&lazyload=False"
    },
    "apps": {
        "models": {
            "models": [
                "blueprints.conversation.models",
                "blueprints.users.models",
                "aerich.models",
            ]
        },
    },
}

chroma_client = HttpClient(host="localhost", port=8333)
# Called at import time, before any collection is opened.
chroma_client.heartbeat()
simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")

register_tortoise(
    app,
    config=TORTOISE_CONFIG,
    generate_schemas=False,  # Disabled - using Aerich for migrations
)


async def chunk_data(docs, collection, doctypes):
    """Chunk each document into ``collection`` and record it as indexed.

    Args:
        docs: Iterable of document dicts. Each one must provide the keys
            ``content``, ``created_date``, ``original_file_name``,
            ``document_type`` and ``id``.
        collection: Collection object handed to ``Chunker``.
        doctypes: Mapping from a document's ``document_type`` value to the
            value stored in the chunk metadata. ``None`` or an empty
            mapping is accepted; ``document_type`` then falls back to ``""``
            and no ``type`` key is written.

    Raises:
        KeyError: If a document is missing one of the required keys.
    """
    chunker = Chunker(collection)
    # Normalise a missing lookup table so the ``.get`` calls below are safe.
    doctype_lookup = doctypes or {}

    for doc in docs:
        doctype_key = doc["document_type"]
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            "document_type": doctype_lookup.get(doctype_key, ""),
        }
        if doctype_lookup:
            # NOTE(review): this is None when the key is absent from the
            # lookup - confirm the chunker/collection accepts a None value.
            metadata["type"] = doctype_lookup.get(doctype_key)

        chunker.chunk_document(
            document=doc["content"],
            metadata=metadata,
        )

        # Persist a row per document so it can be recognised as indexed.
        await PaperlessDocument.create(
            paperless_id=doc["id"],
            embedding_model=EmbeddingEngine.OPENAI,
        )


@app.route("/api/query")
async def query():
    """Placeholder query endpoint; always responds with the JSON "asdf"."""
    # The request body is parsed but not used yet.
    _ = await request.get_json()
    return jsonify("asdf")


@app.route("/api/index")
async def index():
    """Placeholder for adding a corpus of text to ChromaDB.

    Currently performs no work and returns an empty response body.
    """
    return ""


@app.route("/api/index/paperless/all")
async def reindex_all_paperless_documents():
    """Wipe the existing index and re-index documents from Paperless-NGX.

    Deletes every ``PaperlessDocument`` row and every entry in the
    ``simba_docs`` collection, then fetches documents and document types
    from ``PaperlessNGXService`` and passes them to ``chunk_data``.

    Returns:
        An empty response body.
    """
    # NOTE(review): this destructive operation is reachable via the default
    # route method (no ``methods=`` given) - confirm that is intended.

    # Delete from database
    await PaperlessDocument.all().delete()

    # Delete from ChromaDB
    all_docs = simba_docs.get()
    if all_docs["ids"]:
        simba_docs.delete(ids=all_docs["ids"])

    # The rows were deleted just above, so this set is normally empty; the
    # filter only excludes documents recorded in the meantime.
    visited_documents = set(
        await PaperlessDocument.all().values_list("paperless_id", flat=True)
    )

    ppngx = PaperlessNGXService()
    docs = ppngx.get_data()
    docs = [doc for doc in docs if doc["id"] not in visited_documents]

    doctype_lookup = ppngx.get_doctypes()
    await chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)

    return ""


@app.route("/api/index/paperless/single", methods=["POST"])
async def index_single_document():
    """Placeholder for indexing one document; returns an empty body."""
    return ""


if __name__ == "__main__":
    app.run()