110 lines
2.9 KiB
Python
110 lines
2.9 KiB
Python
import datetime
|
|
|
|
from chromadb import HttpClient
|
|
from quart import Quart, jsonify, request
|
|
from tortoise.contrib.quart import register_tortoise
|
|
|
|
from chunker import Chunker
|
|
from models import EmbeddingEngine, PaperlessDocument
|
|
from request import PaperlessNGXService
|
|
from utils import date_to_epoch
|
|
|
|
app = Quart(__name__)  # Quart ASGI application instance (async, Flask-compatible API)
|
|
|
|
# Tortoise ORM configuration (connection + model discovery).
# NOTE(review): credentials are hardcoded in the DSN — move user/password to
# environment variables or a secrets manager before deploying.
# NOTE(review): the query string carries GUI-client parameters (statusColor,
# tLSMode, driverVersion, ...) that the async driver most likely ignores —
# confirm and trim; also no database name is given, so the server default
# ("postgres") is used.
TORTOISE_CONFIG = {
    "connections": {
        # Plain string literal: the original used an f-string with no
        # placeholders (ruff F541); the value is byte-identical.
        "default": "postgresql://postgres:password@localhost?statusColor=&env=local&name=simbarag&tLSMode=0&usePrivateKey=false&safeModeLevel=0&advancedSafeModeLevel=0&driverVersion=0&showSystemSchemas=0&driverVersion=0&lazyload=False"
    },
    "apps": {
        "models": {
            "models": [
                "blueprints.conversation.models",
                "blueprints.users.models",
                "aerich.models",  # required so Aerich can track migrations
            ]
        },
    },
}
|
|
|
|
# ChromaDB vector store, reachable over HTTP on a non-default port (8333).
chroma_client = HttpClient(host="localhost", port=8333)
# Fail fast at import time if the ChromaDB server is unreachable.
chroma_client.heartbeat()
# Collection holding the chunked document embeddings.
# NOTE(review): the name "simba_docs2" looks like a versioned rename of an
# older collection — confirm the "2" suffix is intentional.
simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")
|
|
|
|
# Bind Tortoise ORM to the Quart app lifecycle: connections are opened on
# startup and closed on shutdown.
register_tortoise(
    app,
    config=TORTOISE_CONFIG,
    generate_schemas=False,  # Disabled - using Aerich for migrations
)
|
|
|
|
|
|
async def chunk_data(docs, collection, doctypes):
    """Chunk each document into ``collection`` and record it in the database.

    Args:
        docs: Iterable of Paperless document dicts; each must provide
            ``content``, ``created_date``, ``original_file_name``,
            ``document_type`` and ``id``.
        collection: ChromaDB collection that the chunks are written into.
        doctypes: Mapping of Paperless document-type id -> readable name.
    """
    chunker = Chunker(collection)

    # Iterate the documents directly instead of building a parallel `texts`
    # list and re-indexing `docs[index]` everywhere (original shape).
    for doc in docs:
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            # Default to "" so the key is always present for filtering.
            "document_type": doctypes.get(doc["document_type"], ""),
        }

        if doctypes:
            # NOTE(review): unlike "document_type" above, this lookup has no
            # default, so an unknown type id stores None — confirm intended.
            metadata["type"] = doctypes.get(doc["document_type"])

        chunker.chunk_document(
            document=doc["content"],
            metadata=metadata,
        )

        # Record that this Paperless document has been embedded (and with
        # which engine) so later reindexing can find or reset it.
        # (Removed the original `to_insert` accumulator: it was appended to
        # but never read.)
        await PaperlessDocument.create(
            paperless_id=doc["id"],
            embedding_model=EmbeddingEngine.OPENAI,
        )
|
|
|
|
|
|
@app.route("/api/query")
async def query():
    """Placeholder query endpoint.

    Reads (and currently discards) the JSON request body; the actual
    retrieval logic is not implemented yet.
    """
    # Still consume the body so malformed JSON surfaces here, but drop the
    # unused `data` binding the original created (ruff F841).
    await request.get_json()
    return jsonify("asdf")
|
|
|
|
|
|
@app.route("/api/index")
async def index():
    """Stub endpoint for adding a corpus of text to the ChromaDB library.

    Currently a no-op: it accepts the request and returns an empty body.
    """
    return ""
|
|
|
|
|
|
@app.route("/api/index/paperless/all")
async def reindex_all_paperless_documents():
    """Wipe every indexed document and rebuild the index from Paperless-ngx."""
    # Clear the relational bookkeeping first.
    await PaperlessDocument.all().delete()

    # Then drop every vector currently stored in the collection.
    existing = simba_docs.get()
    if existing["ids"]:
        simba_docs.delete(ids=existing["ids"])

    # Pull a fresh copy of every document plus the doc-type lookup table.
    paperless = PaperlessNGXService()
    documents = paperless.get_data()
    doctype_lookup = paperless.get_doctypes()

    # Re-chunk and re-embed everything into the (now empty) collection.
    await chunk_data(documents, collection=simba_docs, doctypes=doctype_lookup)

    return ""
|
|
|
|
|
|
@app.route("/api/index/paperless/single", methods=["POST"])
async def index_single_document():
    """Stub: index a single Paperless document. Not implemented yet."""
    return ""
|
|
|
|
|
|
if __name__ == "__main__":
    # Development entry point; in production, serve the ASGI app with a
    # dedicated server (e.g. hypercorn) instead of the built-in runner.
    app.run()
|