commit 99c98b7e42 (parent a69f7864f3)
2025-10-01 21:00:05 -04:00
18 changed files with 3032 additions and 31 deletions

main.py (110 changed lines)

@@ -6,6 +6,7 @@ from typing import Any, Union
import argparse
import chromadb
import ollama
from openai import OpenAI
from request import PaperlessNGXService
@@ -29,9 +30,13 @@ parser.add_argument("query", type=str, help="questions about simba's health")
parser.add_argument(
"--reindex", action="store_true", help="re-index the simba documents"
)
parser.add_argument(
"--index", help="index a file"
)
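# Example invocations (assumed from the argparse setup above; note that the
# positional `query` argument is required even when only indexing):
#   python main.py --reindex "noop"            # re-fetch and chunk everything
#   python main.py --index notes.md "noop"     # index a single local file
#   python main.py "how is simba's weight trending?"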
ppngx = PaperlessNGXService()
openai_client = OpenAI()
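# Fetch every document from Paperless-NGX, render each PDF to images, and
# replace the document's text content with a vision-model summary.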
def index_using_pdf_llm():
files = ppngx.get_data()
@@ -39,6 +44,7 @@ def index_using_pdf_llm():
document_id = file["id"]
pdf_path = ppngx.download_pdf_from_id(id=document_id)
image_paths = pdf_to_image(filepath=pdf_path)
print(f"summarizing {file}")
generated_summary = summarize_pdf_image(filepaths=image_paths)
file["content"] = generated_summary
@@ -68,36 +74,75 @@ def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):
print(docs)
texts: list[str] = [doc["content"] for doc in docs]
for index, text in enumerate(texts):
print(docs[index]["original_file_name"])
metadata = {
"created_date": date_to_epoch(docs[index]["created_date"]),
"created_date": date_to_epoch(docs[index]["created_date"]),
"filename": docs[index]["original_file_name"]
}
chunker.chunk_document(
document=text,
metadata=metadata,
)
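# Chunk raw text strings into the collection without per-chunk metadata.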
def chunk_text(texts: list[str], collection):
chunker = Chunker(collection)
for index, text in enumerate(texts):
metadata = {}
chunker.chunk_document(
document=text,
metadata=metadata,
)
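# RAG flow: embed the query, retrieve matching chunks from the Chroma
# collection, then have an LLM answer from the retrieved context; the
# timing prints below instrument each stage.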
def consult_oracle(input: str, collection):
print(input)
import time
start_time = time.time()
# print("Starting query generation")
# qg_start = time.time()
# qg = QueryGenerator()
# metadata_filter = qg.get_query(input)
# qg_end = time.time()
# print(f"Query generation took {qg_end - qg_start:.2f} seconds")
# print(metadata_filter)
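    # If query generation is re-enabled, metadata_filter must be a Chroma
    # `where` dict, e.g. {"created_date": {"$gte": 1700000000}} (a sketch;
    # the exact shape depends on what QueryGenerator returns).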
print("Starting embedding generation")
embedding_start = time.time()
embeddings = Chunker.embedding_fx(input=[input])
embedding_end = time.time()
print(f"Embedding generation took {embedding_end - embedding_start:.2f} seconds")
print("Starting collection query")
query_start = time.time()
results = collection.query(
query_texts=[input],
query_embeddings=embeddings,
        # where=metadata_filter,
)
print(results)
query_end = time.time()
print(f"Collection query took {query_end - query_start:.2f} seconds")
# Generate
print("Starting LLM generation")
llm_start = time.time()
# output = ollama.generate(
# model="gemma3n:e4b",
# prompt=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
# )
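    # Generate the final answer with the OpenAI Responses API rather than
    # the local Ollama model commented out above.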
response = openai_client.responses.create(
model="gpt-4o-mini",
input=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
)
llm_end = time.time()
print(f"LLM generation took {llm_end - llm_start:.2f} seconds")
total_time = time.time() - start_time
print(f"Total consult_oracle execution took {total_time:.2f} seconds")
return response.output_text
def paperless_workflow(input):
@@ -109,24 +154,47 @@ def paperless_workflow(input):
consult_oracle(input, simba_docs)
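# Convenience wrapper that queries the shared simba_docs collection.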
def consult_simba_oracle(input: str):
return consult_oracle(
input=input,
collection=simba_docs,
)
if __name__ == "__main__":
args = parser.parse_args()
if args.reindex:
print("Fetching documents from Paperless-NGX")
ppngx = PaperlessNGXService()
docs = ppngx.get_data()
print(docs)
print(f"Fetched {len(docs)} documents")
print("Chunking documents now ...")
chunk_data(docs, collection=simba_docs)
print("Done chunking documents")
# index_using_pdf_llm()
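    # --index: ingest one local file; PDFs go through the image-summary
    # path, markdown/plain text is chunked directly.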
    if args.index:
        with open(args.index) as file:
            extension = args.index.split(".")[-1]
            if extension == "pdf":
                # args.index is already a local path, so skip the
                # Paperless download step and render it directly
                image_paths = pdf_to_image(filepath=args.index)
                print(f"summarizing {args.index}")
                generated_summary = summarize_pdf_image(filepaths=image_paths)
                # assumed intent: index the summary, as index_using_pdf_llm does
                chunk_text(texts=[generated_summary], collection=simba_docs)
            elif extension in ["md", "txt"]:
                # split(".") yields bare extensions without the leading dot
                chunk_text(texts=[file.read()], collection=simba_docs)
if args.query:
print("Consulting oracle ...")
print(consult_oracle(
input=args.query,
collection=simba_docs,
))
else:
print("please provide a query")