Add main re-indexing
This commit is contained in:
55
main.py
55
main.py
@@ -1,7 +1,14 @@
|
|||||||
|
import datetime
|
||||||
|
|
||||||
from chromadb import HttpClient
|
from chromadb import HttpClient
|
||||||
from quart import Quart, jsonify, request
|
from quart import Quart, jsonify, request
|
||||||
from tortoise.contrib.quart import register_tortoise
|
from tortoise.contrib.quart import register_tortoise
|
||||||
|
|
||||||
|
from chunker import Chunker
|
||||||
|
from models import EmbeddingEngine, PaperlessDocument
|
||||||
|
from request import PaperlessNGXService
|
||||||
|
from utils import date_to_epoch
|
||||||
|
|
||||||
app = Quart(__name__)
|
app = Quart(__name__)
|
||||||
|
|
||||||
TORTOISE_CONFIG = {
|
TORTOISE_CONFIG = {
|
||||||
@@ -21,6 +28,7 @@ TORTOISE_CONFIG = {
|
|||||||
|
|
||||||
chroma_client = HttpClient(host="localhost", port=8333)
|
chroma_client = HttpClient(host="localhost", port=8333)
|
||||||
chroma_client.heartbeat()
|
chroma_client.heartbeat()
|
||||||
|
simba_docs = chroma_client.get_or_create_collection(name="simba_docs2")
|
||||||
|
|
||||||
register_tortoise(
|
register_tortoise(
|
||||||
app,
|
app,
|
||||||
@@ -29,6 +37,33 @@ register_tortoise(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def chunk_data(docs, collection, doctypes):
    """Chunk every document into ``collection`` and record it as indexed.

    For each entry in ``docs`` the document text is handed to a ``Chunker``
    bound to ``collection`` together with per-document metadata, after which
    a ``PaperlessDocument`` row is created for the document's id.

    Args:
        docs: Sequence of mappings. Each entry must provide the keys
            ``content``, ``created_date``, ``original_file_name``,
            ``document_type`` and ``id``.
        collection: Target passed straight to ``Chunker``.
        doctypes: Mapping from a document's ``document_type`` value to the
            name stored in the metadata. May be empty or ``None``.
    """
    # The lookup below runs unconditionally, so normalise a missing table to
    # an empty dict instead of failing on ``None.get``.
    doctypes = doctypes or {}
    chunker = Chunker(collection)

    for doc in docs:
        metadata = {
            "created_date": date_to_epoch(doc["created_date"]),
            "filename": doc["original_file_name"],
            "document_type": doctypes.get(doc["document_type"], ""),
        }

        if doctypes:
            # NOTE(review): unlike "document_type" this has no "" fallback,
            # so it is None for an unknown type -- confirm the vector store
            # accepts None metadata values.
            metadata["type"] = doctypes.get(doc["document_type"])

        chunker.chunk_document(
            document=doc["content"],
            metadata=metadata,
        )

        # Record the document only after its chunks were handed to the chunker.
        await PaperlessDocument.create(
            paperless_id=doc["id"],
            embedding_model=EmbeddingEngine.OPENAI,
        )
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/query")
|
@app.route("/api/query")
|
||||||
async def query():
|
async def query():
|
||||||
data = await request.get_json()
|
data = await request.get_json()
|
||||||
@@ -45,7 +80,25 @@ async def index():
|
|||||||
|
|
||||||
|
|
||||||
@app.route("/api/index/paperless/all")
async def reindex_all_paperless_documents():
    """Wipe the existing index and rebuild it from every Paperless document.

    Deletes all ``PaperlessDocument`` rows and all entries of the
    ``simba_docs`` collection, then fetches the documents and the
    document-type lookup from ``PaperlessNGXService`` and re-chunks them via
    ``chunk_data``.

    Returns:
        An empty string.
    """
    # Delete from database
    await PaperlessDocument.all().delete()

    # Delete from ChromaDB. Only the ids are needed for the delete call, so
    # skip fetching documents and metadata (ids are always returned).
    existing = simba_docs.get(include=[])
    if existing["ids"]:
        simba_docs.delete(ids=existing["ids"])

    # Every tracking row was just removed, so there is nothing to filter out:
    # all documents are re-indexed. Materialise the result because
    # ``chunk_data`` may index into it.
    ppngx = PaperlessNGXService()
    docs = list(ppngx.get_data())
    doctype_lookup = ppngx.get_doctypes()
    await chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)

    return ""
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,4 +12,4 @@ class EmbeddingEngine(enum.Enum):
|
|||||||
class PaperlessDocument(Model):
    """Tracking row for a Paperless document that has been indexed."""

    # Surrogate primary key.
    id = fields.UUIDField(primary_key=True)
    # Id of the document on the Paperless side; unique, so each document is
    # tracked at most once. Tortoise's CharField requires an explicit
    # max_length.
    # NOTE(review): 255 is a generous default -- adjust if a schema already
    # exists with a different length.
    paperless_id = fields.CharField(max_length=255, unique=True)
    # Embedding engine used when the document was indexed.
    embedding_model = fields.CharEnumField(enum_type=EmbeddingEngine)
|
||||||
|
|||||||
Reference in New Issue
Block a user