diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..24ee5b1
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..9fe390b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,34 @@
+import os
+
+from flask import Flask, request, jsonify, render_template, send_from_directory
+
+from main import consult_simba_oracle
+
+app = Flask(__name__, static_folder="raggr-frontend/dist/static", template_folder="raggr-frontend/dist")
+
+
+# Serve React static files
+@app.route('/static/<path:filename>')
+def static_files(filename):
+    return send_from_directory(app.static_folder, filename)
+
+# Serve the React app for all routes (catch-all)
+@app.route('/', defaults={'path': ''})
+@app.route('/<path:path>')
+def serve_react_app(path):
+    if path and os.path.exists(os.path.join(app.template_folder, path)):
+        return send_from_directory(app.template_folder, path)
+    return render_template('index.html')
+
+@app.route("/api/query", methods=["POST"])
+def query():
+    data = request.get_json()
+    query = data.get("query")
+    return jsonify({"response": consult_simba_oracle(query)})
+
+@app.route("/api/ingest", methods=["POST"])
+def webhook():
+    data = request.get_json()
+    print(data)
+    # Acknowledge the webhook; a Flask view must return a response
+    return jsonify({"status": "received"})
diff --git a/chunker.py b/chunker.py
index e398730..ec84bbe 100644
--- a/chunker.py
+++ b/chunker.py
@@ -96,7 +96,7 @@ class Chunker:
     ) -> list[Chunk]:
         doc_uuid = uuid4()
 
-        chunk_size = min(chunk_size, len(document))
+        chunk_size = min(chunk_size, len(document)) or 1
 
         chunks = []
         num_chunks = ceil(len(document) / chunk_size)
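Note on the `chunker.py` change: for an empty document, `min(chunk_size, len(document))` evaluates to `0`, and the `ceil(len(document) / chunk_size)` a few lines later would raise `ZeroDivisionError`; `or 1` clamps the divisor to at least 1. A minimal standalone sketch of the arithmetic (not the project's `Chunker` class):

```python
from math import ceil

def num_chunks(document: str, chunk_size: int = 512) -> int:
    # min(...) is 0 for an empty document; `or 1` keeps the divisor positive
    chunk_size = min(chunk_size, len(document)) or 1
    return ceil(len(document) / chunk_size)

assert num_chunks("") == 0          # no ZeroDivisionError for empty input
assert num_chunks("abcde", 2) == 3  # ceil(5 / 2)
```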
diff --git a/main.py b/main.py
index babc996..060bdb2 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@ from typing import Any, Union
 import argparse
 import chromadb
 import ollama
+from openai import OpenAI
 
 from request import PaperlessNGXService
 
@@ -29,9 +30,13 @@ parser.add_argument("query", type=str, help="questions about simba's health")
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
+parser.add_argument(
+    "--index", help="index a single file"
+)
 
 ppngx = PaperlessNGXService()
+openai_client = OpenAI()
 
 def index_using_pdf_llm():
     files = ppngx.get_data()
@@ -39,6 +44,7 @@ def index_using_pdf_llm():
         document_id = file["id"]
         pdf_path = ppngx.download_pdf_from_id(id=document_id)
         image_paths = pdf_to_image(filepath=pdf_path)
+        print(f"summarizing {file['original_file_name']}")
         generated_summary = summarize_pdf_image(filepaths=image_paths)
         file["content"] = generated_summary
 
@@ -68,36 +74,75 @@ def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):
     print(docs)
     texts: list[str] = [doc["content"] for doc in docs]
     for index, text in enumerate(texts):
+        print(docs[index]["original_file_name"])
         metadata = {
-            "created_date": date_to_epoch(docs[index]["created_date"]),
+            "created_date": date_to_epoch(docs[index]["created_date"]),
+            "filename": docs[index]["original_file_name"],
         }
         chunker.chunk_document(
             document=text,
             metadata=metadata,
         )
 
+def chunk_text(texts: list[str], collection):
+    chunker = Chunker(collection)
+
+    for text in texts:
+        chunker.chunk_document(
+            document=text,
+            metadata={},
+        )
+
 def consult_oracle(input: str, collection):
+    print(input)
+    import time
+    start_time = time.time()
+
     # Ask
-    qg = QueryGenerator()
-    metadata_filter = qg.get_query("input")
-    print(metadata_filter)
+    # print("Starting query generation")
+    # qg_start = time.time()
+    # qg = QueryGenerator()
+    # metadata_filter = qg.get_query(input)
+    # qg_end = time.time()
+    # print(f"Query generation took {qg_end - qg_start:.2f} seconds")
+    # print(metadata_filter)
+
+    print("Starting embedding generation")
+    embedding_start = time.time()
     embeddings = Chunker.embedding_fx(input=[input])
+    embedding_end = time.time()
+    print(f"Embedding generation took {embedding_end - embedding_start:.2f} seconds")
+
+    print("Starting collection query")
+    query_start = time.time()
     results = collection.query(
         query_texts=[input],
         query_embeddings=embeddings,
-        where=metadata_filter,
+        # where=metadata_filter,
     )
-    print(results)
+    query_end = time.time()
+    print(f"Collection query took {query_end - query_start:.2f} seconds")
 
     # Generate
-    output = ollama.generate(
-        model="gemma3n:e4b",
-        prompt=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    print("Starting LLM generation")
+    llm_start = time.time()
+    # output = ollama.generate(
+    #     model="gemma3n:e4b",
+    #     prompt=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    # )
+    response = openai_client.responses.create(
+        model="gpt-4o-mini",
+        input=f"You are a helpful assistant that understands veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
     )
+    llm_end = time.time()
+    print(f"LLM generation took {llm_end - llm_start:.2f} seconds")
 
-    print(output["response"])
+    total_time = time.time() - start_time
+    print(f"Total consult_oracle execution took {total_time:.2f} seconds")
+
+    return response.output_text
 
 
 def paperless_workflow(input):
@@ -109,24 +154,47 @@ def paperless_workflow(input):
 
     consult_oracle(input, simba_docs)
 
+def consult_simba_oracle(input: str):
+    return consult_oracle(
+        input=input,
+        collection=simba_docs,
+    )
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.reindex:
-        # logging.info(msg="Fetching documents from Paperless-NGX")
-        # ppngx = PaperlessNGXService()
-        # docs = ppngx.get_data()
-        # logging.info(msg=f"Fetched {len(docs)} documents")
+        print("Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        print(docs)
+        print(f"Fetched {len(docs)} documents")
         #
-        # logging.info(msg="Chunking documents now ...")
-        # chunk_data(docs, collection=simba_docs)
-        # logging.info(msg="Done chunking documents")
-        index_using_pdf_llm()
+        print("Chunking documents now ...")
+        chunk_data(docs, collection=simba_docs)
+        print("Done chunking documents")
+        # index_using_pdf_llm()
+
+    if args.index:
+        extension = args.index.split(".")[-1]
+
+        if extension == "pdf":
+            # Summarize the local PDF, then index the generated summary
+            image_paths = pdf_to_image(filepath=args.index)
+            print(f"summarizing {args.index}")
+            generated_summary = summarize_pdf_image(filepaths=image_paths)
+            chunk_text(texts=[generated_summary], collection=simba_docs)
+        elif extension in ["md", "txt"]:
+            with open(args.index) as file:
+                chunk_text(texts=[file.read()], collection=simba_docs)
 
     if args.query:
-        logging.info("Consulting oracle ...")
-        consult_oracle(
+        print("Consulting oracle ...")
+        print(consult_oracle(
             input=args.query,
             collection=simba_docs,
-        )
+        ))
     else:
         print("please provide a query")
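With `consult_simba_oracle` exposed through the `/api/query` route in `app.py`, the pipeline can be smoke-tested end to end over HTTP. A sketch using `httpx` (added to the dependencies below); the port and the question are placeholder assumptions:

```python
import httpx

# Assumes app.py is running locally, e.g. `flask run` on the default port 5000
resp = httpx.post(
    "http://localhost:5000/api/query",
    json={"query": "When was Simba's last dental cleaning?"},
    timeout=120.0,  # embedding, retrieval, and LLM generation can be slow
)
resp.raise_for_status()
print(resp.json()["response"])
```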
description = "Add your description here" readme = "README.md" requires-python = ">=3.13" -dependencies = [] +dependencies = [ + "chromadb>=1.1.0", + "dotenv>=0.9.9", + "flask>=3.1.2", + "httpx>=0.28.1", + "ollama>=0.6.0", + "openai>=2.0.1", + "pydantic>=2.11.9", +] diff --git a/query.py b/query.py index 82309fa..453ed85 100644 --- a/query.py +++ b/query.py @@ -3,6 +3,8 @@ from typing import Literal import datetime from ollama import chat, ChatResponse +from openai import OpenAI + from pydantic import BaseModel, Field # This uses inferred filters — which means using LLM to create the metadata filters @@ -27,11 +29,15 @@ class GeneratedQuery(BaseModel): fields: list[str] extracted_metadata_fields: str +class Time(BaseModel): + time: int PROMPT = """ You are an information specialist that processes user queries. The current year is 2025. The user queries are all about a cat, Simba, and its records. The types of records are listed below. Using the query, extract the -the date range the user is trying to query. You should return the it as a JSON. The date tag is created_date. Return the date in epoch time +the date range the user is trying to query. You should return it as a JSON. The date tag is created_date. Return the date in epoch time. + +If the created_date cannot be ascertained, set it to epoch time start. You have several operators at your disposal: @@ -90,18 +96,31 @@ class QueryGenerator: return date.timestamp() def get_query(self, input: str): - response: ChatResponse = chat( - model="gemma3n:e4b", - messages=[ + client = OpenAI() + print(input) + response = client.responses.parse( + model="gpt-4o", + input=[ {"role": "system", "content": PROMPT}, {"role": "user", "content": input}, ], - format=GeneratedQuery.model_json_schema(), + text_format=Time, ) + print(response) + query = json.loads(response.output_parsed.extracted_metadata_fields) - query = json.loads( - json.loads(response["message"]["content"])["extracted_metadata_fields"] - ) + # response: ChatResponse = chat( + # model="gemma3n:e4b", + # messages=[ + # {"role": "system", "content": PROMPT}, + # {"role": "user", "content": input}, + # ], + # format=GeneratedQuery.model_json_schema(), + # ) + + # query = json.loads( + # json.loads(response["message"]["content"])["extracted_metadata_fields"] + # ) date_key = list(query["created_date"].keys())[0] query["created_date"][date_key] = self.date_to_epoch( query["created_date"][date_key] diff --git a/raggr-frontend/.gitignore b/raggr-frontend/.gitignore new file mode 100644 index 0000000..6f3092c --- /dev/null +++ b/raggr-frontend/.gitignore @@ -0,0 +1,16 @@ +# Local +.DS_Store +*.local +*.log* + +# Dist +node_modules +dist/ + +# Profile +.rspack-profile-*/ + +# IDE +.vscode/* +!.vscode/extensions.json +.idea diff --git a/raggr-frontend/README.md b/raggr-frontend/README.md new file mode 100644 index 0000000..a80dc73 --- /dev/null +++ b/raggr-frontend/README.md @@ -0,0 +1,36 @@ +# Rsbuild project + +## Setup + +Install the dependencies: + +```bash +pnpm install +``` + +## Get started + +Start the dev server, and the app will be available at [http://localhost:3000](http://localhost:3000). + +```bash +pnpm dev +``` + +Build the app for production: + +```bash +pnpm build +``` + +Preview the production build locally: + +```bash +pnpm preview +``` + +## Learn more + +To learn more about Rsbuild, check out the following resources: + +- [Rsbuild documentation](https://rsbuild.rs) - explore Rsbuild features and APIs. 
diff --git a/raggr-frontend/.gitignore b/raggr-frontend/.gitignore
new file mode 100644
index 0000000..6f3092c
--- /dev/null
+++ b/raggr-frontend/.gitignore
@@ -0,0 +1,16 @@
+# Local
+.DS_Store
+*.local
+*.log*
+
+# Dist
+node_modules
+dist/
+
+# Profile
+.rspack-profile-*/
+
+# IDE
+.vscode/*
+!.vscode/extensions.json
+.idea
diff --git a/raggr-frontend/README.md b/raggr-frontend/README.md
new file mode 100644
index 0000000..a80dc73
--- /dev/null
+++ b/raggr-frontend/README.md
@@ -0,0 +1,36 @@
+# Rsbuild project
+
+## Setup
+
+Install the dependencies:
+
+```bash
+pnpm install
+```
+
+## Get started
+
+Start the dev server, and the app will be available at [http://localhost:3000](http://localhost:3000).
+
+```bash
+pnpm dev
+```
+
+Build the app for production:
+
+```bash
+pnpm build
+```
+
+Preview the production build locally:
+
+```bash
+pnpm preview
+```
+
+## Learn more
+
+To learn more about Rsbuild, check out the following resources:
+
+- [Rsbuild documentation](https://rsbuild.rs) - explore Rsbuild features and APIs.
+- [Rsbuild GitHub repository](https://github.com/web-infra-dev/rsbuild) - your feedback and contributions are welcome!
diff --git a/raggr-frontend/package.json b/raggr-frontend/package.json
new file mode 100644
index 0000000..a5f73e5
--- /dev/null
+++ b/raggr-frontend/package.json
@@ -0,0 +1,27 @@
+{
+  "name": "raggr-frontend",
+  "version": "1.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "build": "rsbuild build",
+    "dev": "rsbuild dev --open",
+    "preview": "rsbuild preview"
+  },
+  "dependencies": {
+    "axios": "^1.7.0",
+    "marked": "^16.3.0",
+    "react": "^19.1.1",
+    "react-dom": "^19.1.1",
+    "react-markdown": "^10.1.0"
+  },
+  "devDependencies": {
+    "@rsbuild/core": "^1.5.6",
+    "@rsbuild/plugin-react": "^1.4.0",
+    "@tailwindcss/postcss": "^4.0.0",
+    "@types/react": "^19.1.13",
+    "@types/react-dom": "^19.1.9",
+    "tailwindcss": "^4.0.0",
+    "typescript": "^5.9.2"
+  }
+}
diff --git a/raggr-frontend/postcss.config.mjs b/raggr-frontend/postcss.config.mjs
new file mode 100644
index 0000000..017b34b
--- /dev/null
+++ b/raggr-frontend/postcss.config.mjs
@@ -0,0 +1,5 @@
+export default {
+  plugins: {
+    "@tailwindcss/postcss": {},
+  },
+};
diff --git a/raggr-frontend/rsbuild.config.ts b/raggr-frontend/rsbuild.config.ts
new file mode 100644
index 0000000..c9962d3
--- /dev/null
+++ b/raggr-frontend/rsbuild.config.ts
@@ -0,0 +1,6 @@
+import { defineConfig } from '@rsbuild/core';
+import { pluginReact } from '@rsbuild/plugin-react';
+
+export default defineConfig({
+  plugins: [pluginReact()],
+});
diff --git a/raggr-frontend/src/App.css b/raggr-frontend/src/App.css
new file mode 100644
index 0000000..9cad0ff
--- /dev/null
+++ b/raggr-frontend/src/App.css
@@ -0,0 +1,6 @@
+@import "tailwindcss";
+
+body {
+  margin: 0;
+  font-family: Inter, Avenir, Helvetica, Arial, sans-serif;
+}
diff --git a/raggr-frontend/src/App.tsx b/raggr-frontend/src/App.tsx
new file mode 100644
index 0000000..8a7571b
--- /dev/null
+++ b/raggr-frontend/src/App.tsx
@@ -0,0 +1,35 @@
+import { useState } from "react";
+import axios from "axios";
+import ReactMarkdown from "react-markdown";
+
+import "./App.css";
+
+const App = () => {
+  const [query, setQuery] = useState("");
+  const [answer, setAnswer] = useState("");
+  const [loading, setLoading] = useState(false);
+
+  const handleQuestionSubmit = () => {
+    const payload = { query: query };
+    setLoading(true);
+    axios
+      .post("/api/query", payload)
+      .then((result) => setAnswer(result.data.response))
+      .finally(() => setLoading(false));
+  };
+  const handleQueryChange = (event) => {
+    setQuery(event.target.value);
+  };
+  return (
+    <div>
+      <h1>ask simba!</h1>
+      <textarea value={query} onChange={handleQueryChange} />
+      <button onClick={handleQuestionSubmit} disabled={loading}>
+        {loading ? "asking..." : "ask"}
+      </button>
+      <ReactMarkdown>{answer}</ReactMarkdown>
+    </div>
+  );
+};
+
+export default App;
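The `/api/ingest` webhook in `app.py` currently only logs and acknowledges its payload, so it can be exercised the same way; the payload shape here is a hypothetical placeholder, not taken from the Paperless-NGX webhook documentation:

```python
import httpx

# Hypothetical payload; adjust to whatever Paperless-NGX actually sends
payload = {"id": 42, "original_file_name": "simba_bloodwork.pdf"}
resp = httpx.post("http://localhost:5000/api/ingest", json=payload)
print(resp.status_code, resp.json())
```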