This commit is contained in:
2025-10-16 22:43:14 -04:00
parent acaf681927
commit 8479898cc4

39
main.py
View File

@@ -50,7 +50,7 @@ def index_using_pdf_llm(doctypes):
document_id: int = file["id"] document_id: int = file["id"]
pdf_path = ppngx.download_pdf_from_id(id=document_id) pdf_path = ppngx.download_pdf_from_id(id=document_id)
image_paths = pdf_to_image(filepath=pdf_path) image_paths = pdf_to_image(filepath=pdf_path)
print(f"summarizing {file}") logging.info(f"summarizing {file}")
generated_summary = summarize_pdf_image(filepaths=image_paths) generated_summary = summarize_pdf_image(filepaths=image_paths)
file["content"] = generated_summary file["content"] = generated_summary
@@ -75,7 +75,7 @@ def chunk_data(docs, collection, doctypes):
# Step 2: Create chunks # Step 2: Create chunks
chunker = Chunker(collection) chunker = Chunker(collection)
print(f"chunking {len(docs)} documents") logging.info(f"chunking {len(docs)} documents")
texts: list[str] = [doc["content"] for doc in docs] texts: list[str] = [doc["content"] for doc in docs]
with sqlite3.connect("visited.db") as conn: with sqlite3.connect("visited.db") as conn:
to_insert = [] to_insert = []
@@ -121,44 +121,45 @@ def consult_oracle(input: str, collection):
start_time = time.time() start_time = time.time()
# Ask # Ask
print("Starting query generation") logging.info("Starting query generation")
qg_start = time.time() qg_start = time.time()
qg = QueryGenerator() qg = QueryGenerator()
doctype_query = qg.get_doctype_query(input=input) doctype_query = qg.get_doctype_query(input=input)
# metadata_filter = qg.get_query(input) # metadata_filter = qg.get_query(input)
metadata_filter = {**doctype_query} metadata_filter = {**doctype_query}
print(metadata_filter) logging.info(metadata_filter)
qg_end = time.time() qg_end = time.time()
print(f"Query generation took {qg_end - qg_start:.2f} seconds") logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")
print("Starting embedding generation") logging.info("Starting embedding generation")
embedding_start = time.time() embedding_start = time.time()
embeddings = chunker.embedding_fx(inputs=[input]) embeddings = chunker.embedding_fx(inputs=[input])
embedding_end = time.time() embedding_end = time.time()
print(f"Embedding generation took {embedding_end - embedding_start:.2f} seconds") logging.info(
f"Embedding generation took {embedding_end - embedding_start:.2f} seconds"
)
print("Starting collection query") logging.info("Starting collection query")
query_start = time.time() query_start = time.time()
results = collection.query( results = collection.query(
query_texts=[input], query_texts=[input],
query_embeddings=embeddings, query_embeddings=embeddings,
where=metadata_filter, where=metadata_filter,
) )
print(results)
query_end = time.time() query_end = time.time()
print(f"Collection query took {query_end - query_start:.2f} seconds") logging.info(f"Collection query took {query_end - query_start:.2f} seconds")
# Generate # Generate
print("Starting LLM generation") logging.info("Starting LLM generation")
llm_start = time.time() llm_start = time.time()
system_prompt = "You are a helpful assistant that understands veterinary terms." system_prompt = "You are a helpful assistant that understands veterinary terms."
prompt = f"Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}" prompt = f"Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}"
output = llm_client.chat(prompt=prompt, system_prompt=system_prompt) output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
llm_end = time.time() llm_end = time.time()
print(f"LLM generation took {llm_end - llm_start:.2f} seconds") logging.info(f"LLM generation took {llm_end - llm_start:.2f} seconds")
total_time = time.time() - start_time total_time = time.time() - start_time
print(f"Total consult_oracle execution took {total_time:.2f} seconds") logging.info(f"Total consult_oracle execution took {total_time:.2f} seconds")
return output return output
@@ -200,11 +201,11 @@ if __name__ == "__main__":
c = conn.cursor() c = conn.cursor()
c.execute("DELETE FROM indexed_documents") c.execute("DELETE FROM indexed_documents")
print("Fetching documents from Paperless-NGX") logging.info("Fetching documents from Paperless-NGX")
ppngx = PaperlessNGXService() ppngx = PaperlessNGXService()
docs = ppngx.get_data() docs = ppngx.get_data()
docs = filter_indexed_files(docs) docs = filter_indexed_files(docs)
print(f"Fetched {len(docs)} documents") logging.info(f"Fetched {len(docs)} documents")
# Delete all chromadb data # Delete all chromadb data
ids = simba_docs.get(ids=None, limit=None, offset=0) ids = simba_docs.get(ids=None, limit=None, offset=0)
@@ -213,11 +214,11 @@ if __name__ == "__main__":
simba_docs.delete(ids=all_ids) simba_docs.delete(ids=all_ids)
# Chunk documents # Chunk documents
print("Chunking documents now ...") logging.info("Chunking documents now ...")
tag_lookup = ppngx.get_tags() tag_lookup = ppngx.get_tags()
doctype_lookup = ppngx.get_doctypes() doctype_lookup = ppngx.get_doctypes()
chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup) chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
print("Done chunking documents") logging.info("Done chunking documents")
# if args.index: # if args.index:
# with open(args.index) as file: # with open(args.index) as file:
@@ -231,7 +232,7 @@ if __name__ == "__main__":
# chunk_text(texts=[file.readall()], collection=simba_docs) # chunk_text(texts=[file.readall()], collection=simba_docs)
if args.query: if args.query:
print("Consulting oracle ...") logging.info("Consulting oracle ...")
print( print(
consult_oracle( consult_oracle(
input=args.query, input=args.query,
@@ -239,4 +240,4 @@ if __name__ == "__main__":
) )
) )
else: else:
print("please provide a query") logging.info("please provide a query")