# simbarag-index/main.py

import datetime

from chromadb import HttpClient
from quart import Quart, jsonify, request
from tortoise.contrib.quart import register_tortoise

from chunker import Chunker
from models import EmbeddingEngine, PaperlessDocument
from request import PaperlessNGXService
from utils import date_to_epoch

# Quart application instance; the route handlers in this module register on it.
app = Quart(__name__)

# Tortoise ORM configuration handed to register_tortoise().
# NOTE(review): the DSN hard-codes local credentials and carries a long tail of
# query parameters (driverVersion appears twice) — confirm which of them the
# database driver actually needs, and consider loading the DSN from the
# environment before running this anywhere but a local machine.
TORTOISE_CONFIG = {
    "connections": {
        # Plain string literal: there is nothing to interpolate, so no f-prefix.
        "default": "postgresql://postgres:password@localhost?statusColor=&env=local&name=simbarag&tLSMode=0&usePrivateKey=false&safeModeLevel=0&advancedSafeModeLevel=0&driverVersion=0&showSystemSchemas=0&driverVersion=0&lazyload=False"
    },
    "apps": {
        "models": {
            # Modules whose models are registered under the "models" app.
            "models": [
                "blueprints.conversation.models",
                "blueprints.users.models",
                "aerich.models",
            ]
        },
    },
}

# ChromaDB client and the collection the indexing routes write to. These
# statements run at import time against localhost:8333.
chroma_client = HttpClient(host="localhost", port=8333)
# NOTE(review): the return value is discarded — presumably a fail-fast
# connectivity check before the collection is opened; confirm.
chroma_client.heartbeat()
simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")

# Attach Tortoise ORM to the Quart app using TORTOISE_CONFIG.
register_tortoise(
    app,
    config=TORTOISE_CONFIG,
    generate_schemas=False,  # Disabled - using Aerich for migrations
)


async def chunk_data(docs, collection, doctypes):
    """Chunk every document into ``collection`` and record it in the database.

    Args:
        docs: Iterable of document dicts. Each must provide the keys
            ``content``, ``created_date``, ``original_file_name``,
            ``document_type`` and ``id``.
        collection: Collection object handed to ``Chunker``.
        doctypes: Lookup table keyed by the documents' ``document_type``
            values. May be ``None`` or empty, in which case ``document_type``
            is stored as ``""`` and no ``type`` entry is added to the metadata.

    Raises:
        KeyError: If a document is missing one of the required keys.
    """
    chunker = Chunker(collection)
    # Normalise once so a missing lookup table cannot break the .get() calls.
    lookup = doctypes or {}

    for doc in docs:
        doctype_key = doc["document_type"]
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            "document_type": lookup.get(doctype_key, ""),
        }
        if lookup:
            # Unlike "document_type", this entry is None for an unknown type.
            metadata["type"] = lookup.get(doctype_key)

        chunker.chunk_document(document=doc["content"], metadata=metadata)

        # Create one PaperlessDocument row per chunked document.
        await PaperlessDocument.create(
            paperless_id=doc["id"],
            embedding_model=EmbeddingEngine.OPENAI,
        )


@app.route("/api/query")
async def query():
    """Placeholder query endpoint: reads the JSON body and returns a fixed value."""
    # The parsed body is not consumed yet; the response is a constant.
    _payload = await request.get_json()
    return jsonify("asdf")


@app.route("/api/index")
async def index():
    """
    Placeholder for the endpoint that adds a corpus of text to ChromaDB.

    Not implemented yet: it currently only returns an empty response body.
    """

    return ""


@app.route("/api/index/paperless/all")
async def reindex_all_paperless_documents():
    """Rebuild the index from scratch using the documents in Paperless-NGX.

    Fetches the documents and the document-type lookup, removes every
    ``PaperlessDocument`` row and every entry of the ``simba_docs`` collection,
    then chunks the fetched documents back in.

    Returns:
        An empty response body.

    NOTE(review): this route is registered without ``methods`` although it
    deletes data; it is left as-is here so existing callers keep working.
    """
    # Fetch before deleting anything, so a failed fetch does not leave the
    # database and the collection empty.
    ppngx = PaperlessNGXService()
    docs = ppngx.get_data()
    doctype_lookup = ppngx.get_doctypes()

    # Delete from database
    await PaperlessDocument.all().delete()

    # Delete from ChromaDB. Only the ids are needed, so ask for nothing else.
    existing_ids = simba_docs.get(include=[])["ids"]
    if existing_ids:
        simba_docs.delete(ids=existing_ids)

    # Chunk the freshly fetched documents into the now-empty collection.
    await chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)

    return ""


@app.route("/api/index/paperless/single", methods=["POST"])
async def index_single_document():
    """Placeholder POST endpoint; it currently only returns an empty body."""
    return ""


# Start the Quart server only when this file is executed as a script.
if __name__ == "__main__":
    app.run()