2025-08-07 17:43:24 -04:00
parent fc504d3e9c
commit 679cfb08e4
5 changed files with 294 additions and 32 deletions

main.py

@@ -1,5 +1,7 @@
import datetime
import logging
import os
from typing import Any, Union
import argparse
import chromadb
@@ -8,7 +10,8 @@ import ollama
from request import PaperlessNGXService
from chunker import Chunker
from query import QueryGenerator
from cleaner import pdf_to_image, summarize_pdf_image
from dotenv import load_dotenv
@@ -27,20 +30,66 @@ parser.add_argument(
"--reindex", action="store_true", help="re-index the simba documents"
)
ppngx = PaperlessNGXService()


def index_using_pdf_llm():
    # Index documents by rendering each PDF to images and summarizing the
    # pages with a vision LLM, instead of relying on the stored OCR text.
    files = ppngx.get_data()
    for file in files:
        document_id = file["id"]
        pdf_path = ppngx.download_pdf_from_id(id=document_id)
        image_paths = pdf_to_image(filepath=pdf_path)
        generated_summary = summarize_pdf_image(filepaths=image_paths)
        file["content"] = generated_summary
    chunk_data(files, simba_docs)
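
# For reference, a minimal sketch of what summarize_pdf_image might do
# (hypothetical; the real implementation lives in cleaner.py and is not part
# of this diff). It assumes a vision-capable model such as llava is available
# in ollama and that one summary per page image is acceptable:
def summarize_pdf_image_sketch(filepaths: list[str]) -> str:
    summaries = []
    for path in filepaths:
        response = ollama.generate(
            model="llava",  # assumed model name
            prompt="Summarize the contents of this document page.",
            images=[path],
        )
        summaries.append(response["response"])
    return "\n\n".join(summaries)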


def date_to_epoch(date_str: str) -> float:
    # Convert a "YYYY-MM-DD" date string into a Unix timestamp at midnight.
    split_date = date_str.split("-")
    date = datetime.datetime(
        int(split_date[0]),
        int(split_date[1]),
        int(split_date[2]),
        0,
        0,
        0,
    )
    return date.timestamp()
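
# Example usage: the result is midnight local time on the given date, and it
# round-trips through the standard library:
#
#     >>> epoch = date_to_epoch("2025-08-07")
#     >>> datetime.datetime.fromtimestamp(epoch)
#     datetime.datetime(2025, 8, 7, 0, 0)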


def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):
    # Step 2: Create chunks
    chunker = Chunker(collection)
    print(f"chunking {len(docs)} documents")
    texts: list[str] = [doc["content"] for doc in docs]
    for index, text in enumerate(texts):
        # Store the created date as an epoch float so the vector store can
        # range-filter on it later.
        metadata = {
            "created_date": date_to_epoch(docs[index]["created_date"]),
        }
        chunker.chunk_document(
            document=text,
            metadata=metadata,
        )
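
# For context, a rough sketch of what Chunker.chunk_document presumably
# forwards to Chroma (hypothetical; the real logic lives in chunker.py).
# Chroma's collection.add attaches one metadata dict per stored chunk:
#
#     chunks = split_text(text)  # hypothetical splitter
#     collection.add(
#         ids=[f"{doc_id}-{i}" for i in range(len(chunks))],
#         documents=chunks,
#         metadatas=[metadata] * len(chunks),
#     )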


def consult_oracle(input: str, collection):
    # Ask: derive a metadata filter from the question, then run a filtered
    # similarity query against the collection.
    qg = QueryGenerator()
    metadata_filter = qg.get_query(input)  # pass the question itself, not the literal string "input"
    embeddings = Chunker.embedding_fx(input=[input])
    results = collection.query(
        query_texts=[input],
        query_embeddings=embeddings,
        where=metadata_filter,
    )

    # Generate
    output = ollama.generate(
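# A note on the filter's shape: Chroma `where` clauses are dicts keyed by
# metadata field, with comparison operators nested inside. Assuming
# QueryGenerator emits something of this form (a sketch; its output format is
# not shown in this diff), the epoch-encoded created_date supports range
# queries such as:
#
#     {"created_date": {"$gte": date_to_epoch("2025-01-01")}}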
@@ -55,24 +104,23 @@ def paperless_workflow(input):
    # Step 1: Get the text
    ppngx = PaperlessNGXService()
    docs = ppngx.get_data()
    chunk_data(docs, collection=simba_docs)
    consult_oracle(input, simba_docs)


if __name__ == "__main__":
    args = parser.parse_args()

    if args.reindex:
        # logging.info(msg="Fetching documents from Paperless-NGX")
        # ppngx = PaperlessNGXService()
        # docs = ppngx.get_data()
        # logging.info(msg=f"Fetched {len(docs)} documents")
        #
        # logging.info(msg="Chunking documents now ...")
        # chunk_data(docs, collection=simba_docs)
        # logging.info(msg="Done chunking documents")
        index_using_pdf_llm()
    if args.query:
        logging.info("Consulting oracle ...")
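
# Typical invocations, given the flags above (assuming --query takes the
# question text; only --reindex is visible in this hunk):
#
#     python main.py --reindex
#     python main.py --query "What did the March invoice cover?"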