Files
simbarag-index/main.py
2025-11-19 20:53:24 -05:00

110 lines
2.9 KiB
Python

import datetime
from chromadb import HttpClient
from quart import Quart, jsonify, request
from tortoise.contrib.quart import register_tortoise
from chunker import Chunker
from models import EmbeddingEngine, PaperlessDocument
from request import PaperlessNGXService
from utils import date_to_epoch
# The Quart application that every route below is registered on.
app = Quart(__name__)

# Tortoise ORM configuration: one "default" connection and one "models" app
# listing the modules whose models should be registered.
#
# SECURITY NOTE(review): the database credentials are hard-coded here; they
# should come from the environment or a secrets store before this is deployed.
# NOTE(review): the query parameters on the URL (statusColor, tLSMode,
# driverVersion twice, ...) look like they were pasted from a GUI client's
# connection string -- confirm the database driver accepts or ignores them.
TORTOISE_CONFIG = {
    "connections": {
        # Plain string literal: there is nothing to interpolate, so no f-prefix.
        "default": "postgresql://postgres:password@localhost?statusColor=&env=local&name=simbarag&tLSMode=0&usePrivateKey=false&safeModeLevel=0&advancedSafeModeLevel=0&driverVersion=0&showSystemSchemas=0&driverVersion=0&lazyload=False"
    },
    "apps": {
        "models": {
            "models": [
                "blueprints.conversation.models",
                "blueprints.users.models",
                "aerich.models",
            ]
        },
    },
}

# ChromaDB client and the collection used by the indexing routes. The
# heartbeat call runs at import time, so importing this module requires the
# Chroma server at localhost:8333 to be reachable.
chroma_client = HttpClient(host="localhost", port=8333)
chroma_client.heartbeat()
simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")

register_tortoise(
    app,
    config=TORTOISE_CONFIG,
    generate_schemas=False,  # Disabled - using Aerich for migrations
)
async def chunk_data(docs, collection, doctypes):
    """Chunk every document into ``collection`` and record it as indexed.

    For each entry in ``docs`` this builds a metadata dict, passes the
    document text and metadata to a ``Chunker`` bound to ``collection``, and
    then creates a ``PaperlessDocument`` row for the document's id.

    Args:
        docs: Iterable of mappings. Each one must provide the keys
            ``content``, ``created_date``, ``original_file_name``,
            ``document_type`` and ``id``.
        collection: Handed unchanged to ``Chunker``.
        doctypes: Mapping from a document's ``document_type`` value to the
            label stored in the metadata. May be ``None`` or empty; in that
            case ``document_type`` is stored as ``""`` and no ``type`` key is
            added.

    Raises:
        KeyError: If a document is missing one of the required keys.
    """
    chunker = Chunker(collection)

    # A missing lookup must not crash the "document_type" resolution below;
    # the original called ``doctypes.get`` even when ``doctypes`` was None.
    doctype_lookup = doctypes or {}

    for doc in docs:
        doctype_key = doc["document_type"]
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            "document_type": doctype_lookup.get(doctype_key, ""),
        }
        # "type" is only present when a non-empty lookup was supplied, and
        # (unlike "document_type") is None for an unknown document type.
        if doctypes:
            metadata["type"] = doctypes.get(doctype_key)

        chunker.chunk_document(
            document=doc["content"],
            metadata=metadata,
        )

        # Record the document so it is known to have been embedded.
        await PaperlessDocument.create(
            paperless_id=doc["id"],
            embedding_model=EmbeddingEngine.OPENAI,
        )
@app.route("/api/query")
async def query():
    """Handle ``/api/query``.

    Parses the request's JSON body and replies with a fixed JSON string.
    The parsed body is not used yet; the response is always ``"asdf"``.
    """
    _payload = await request.get_json()  # parsed but not consumed yet
    response = jsonify("asdf")
    return response
@app.route("/api/index")
async def index():
    """Placeholder handler for ``/api/index``.

    Meant to add a corpus of text to the ChromaDB collection; at present it
    performs no indexing and simply returns an empty response body.
    """
    empty_body = ""
    return empty_body
@app.route("/api/index/paperless/all")
async def reindex_all_paperless_documents():
    """Rebuild the whole index from Paperless-NGX.

    Fetches all documents and the document-type lookup from Paperless, then
    deletes every ``PaperlessDocument`` row and every entry in the
    ``simba_docs`` collection, and finally re-chunks the fetched documents.

    Returns:
        An empty string (empty response body).
    """
    # Fetch first: if Paperless is unreachable this raises before anything
    # has been deleted, so the existing index stays intact.
    # NOTE(review): assumes get_data()/get_doctypes() do not depend on the
    # index having been cleared beforehand -- confirm against the service.
    ppngx = PaperlessNGXService()
    docs = ppngx.get_data()
    doctype_lookup = ppngx.get_doctypes()

    # Delete from database
    await PaperlessDocument.all().delete()

    # Delete from ChromaDB. Only the ids are needed, so skip transferring
    # the stored documents and metadata (ids are always returned).
    existing_ids = simba_docs.get(include=[])["ids"]
    if existing_ids:
        simba_docs.delete(ids=existing_ids)

    # Chunk them
    await chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
    return ""
@app.route("/api/index/paperless/single", methods=["POST"])
async def index_single_document():
    """Placeholder handler for ``POST /api/index/paperless/single``.

    Not implemented yet: it does no work and returns an empty response body.
    """
    return ""
# Run the app with its default settings only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    app.run()