From 8479898cc4c4defebf1ef2696c3c860e04ae1927 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Thu, 16 Oct 2025 22:43:14 -0400
Subject: [PATCH] Logging

---
 main.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/main.py b/main.py
index 6f1b095..2eda189 100644
--- a/main.py
+++ b/main.py
@@ -50,7 +50,7 @@ def index_using_pdf_llm(doctypes):
         document_id: int = file["id"]
         pdf_path = ppngx.download_pdf_from_id(id=document_id)
         image_paths = pdf_to_image(filepath=pdf_path)
-        print(f"summarizing {file}")
+        logging.info(f"summarizing {file}")
         generated_summary = summarize_pdf_image(filepaths=image_paths)
         file["content"] = generated_summary
 
@@ -75,7 +75,7 @@ def chunk_data(docs, collection, doctypes):
 
     # Step 2: Create chunks
     chunker = Chunker(collection)
-    print(f"chunking {len(docs)} documents")
+    logging.info(f"chunking {len(docs)} documents")
    texts: list[str] = [doc["content"] for doc in docs]
     with sqlite3.connect("visited.db") as conn:
         to_insert = []
@@ -121,44 +121,45 @@ def consult_oracle(input: str, collection):
     start_time = time.time()
 
     # Ask
-    print("Starting query generation")
+    logging.info("Starting query generation")
     qg_start = time.time()
     qg = QueryGenerator()
     doctype_query = qg.get_doctype_query(input=input)
     # metadata_filter = qg.get_query(input)
     metadata_filter = {**doctype_query}
-    print(metadata_filter)
+    logging.info(metadata_filter)
     qg_end = time.time()
-    print(f"Query generation took {qg_end - qg_start:.2f} seconds")
+    logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")
 
-    print("Starting embedding generation")
+    logging.info("Starting embedding generation")
     embedding_start = time.time()
     embeddings = chunker.embedding_fx(inputs=[input])
     embedding_end = time.time()
-    print(f"Embedding generation took {embedding_end - embedding_start:.2f} seconds")
+    logging.info(
+        f"Embedding generation took {embedding_end - embedding_start:.2f} seconds"
+    )
 
-    print("Starting collection query")
+    logging.info("Starting collection query")
     query_start = time.time()
     results = collection.query(
         query_texts=[input],
         query_embeddings=embeddings,
         where=metadata_filter,
     )
-    print(results)
     query_end = time.time()
-    print(f"Collection query took {query_end - query_start:.2f} seconds")
+    logging.info(f"Collection query took {query_end - query_start:.2f} seconds")
 
     # Generate
-    print("Starting LLM generation")
+    logging.info("Starting LLM generation")
     llm_start = time.time()
     system_prompt = "You are a helpful assistant that understands veterinary terms."
     prompt = f"Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}"
     output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
     llm_end = time.time()
-    print(f"LLM generation took {llm_end - llm_start:.2f} seconds")
+    logging.info(f"LLM generation took {llm_end - llm_start:.2f} seconds")
 
     total_time = time.time() - start_time
-    print(f"Total consult_oracle execution took {total_time:.2f} seconds")
+    logging.info(f"Total consult_oracle execution took {total_time:.2f} seconds")
 
     return output
 
@@ -200,11 +201,11 @@ if __name__ == "__main__":
         c = conn.cursor()
         c.execute("DELETE FROM indexed_documents")
 
-    print("Fetching documents from Paperless-NGX")
+    logging.info("Fetching documents from Paperless-NGX")
     ppngx = PaperlessNGXService()
     docs = ppngx.get_data()
     docs = filter_indexed_files(docs)
-    print(f"Fetched {len(docs)} documents")
+    logging.info(f"Fetched {len(docs)} documents")
 
     # Delete all chromadb data
     ids = simba_docs.get(ids=None, limit=None, offset=0)
@@ -213,11 +214,11 @@ if __name__ == "__main__":
         simba_docs.delete(ids=all_ids)
 
     # Chunk documents
-    print("Chunking documents now ...")
+    logging.info("Chunking documents now ...")
     tag_lookup = ppngx.get_tags()
     doctype_lookup = ppngx.get_doctypes()
     chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
-    print("Done chunking documents")
+    logging.info("Done chunking documents")
 
     # if args.index:
     #     with open(args.index) as file:
@@ -231,7 +232,7 @@ if __name__ == "__main__":
     #     chunk_text(texts=[file.readall()], collection=simba_docs)
 
     if args.query:
-        print("Consulting oracle ...")
+        logging.info("Consulting oracle ...")
         print(
             consult_oracle(
                 input=args.query,
@@ -239,4 +240,4 @@ if __name__ == "__main__":
             )
         )
     else:
-        print("please provide a query")
+        logging.info("please provide a query")