Compare commits
16 Commits
data-prepr ... 3ffc95a1b0

Commit SHA1s:
3ffc95a1b0
c5091dc07a
c140758560
ab3a0eb442
c619d78922
c20ae0a4b9
26cc01b58b
746b60e070
577c9144ac
2b2891bd79
03b033e9a4
a640ae5fed
99c98b7e42
a69f7864f3
679cfb08e4
fc504d3e9c

.dockerignore (new file, 16 lines)
@@ -0,0 +1,16 @@
+.git
+.gitignore
+README.md
+.env
+.DS_Store
+chromadb/
+chroma_db/
+raggr-frontend/node_modules/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+.venv/
+venv/
+.pytest_cache/

.python-version (new file, 1 line)
@@ -0,0 +1 @@
+3.13

Dockerfile (new file, 46 lines)
@@ -0,0 +1,46 @@
+FROM python:3.13-slim
+
+WORKDIR /app
+
+# Install system dependencies, Node.js, Yarn, and uv
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
+    && apt-get install -y nodejs \
+    && npm install -g yarn \
+    && rm -rf /var/lib/apt/lists/* \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Add uv to PATH
+ENV PATH="/root/.local/bin:$PATH"
+
+# Copy dependency files
+COPY pyproject.toml ./
+
+# Install Python dependencies using uv
+RUN uv pip install --system -e .
+
+# Copy application code
+COPY *.py ./
+COPY startup.sh ./
+RUN chmod +x startup.sh
+
+# Copy frontend code and build
+COPY raggr-frontend ./raggr-frontend
+WORKDIR /app/raggr-frontend
+RUN yarn install && yarn build
+WORKDIR /app
+
+# Create ChromaDB directory
+RUN mkdir -p /app/chromadb
+
+# Expose port
+EXPOSE 8080
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV CHROMADB_PATH=/app/chromadb
+
+# Run the startup script
+CMD ["./startup.sh"]

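A minimal sketch of building and running this image locally, assuming the torrtle/simbarag:latest tag used later in docker-compose.yml, a local .env file holding the variables the app reads (PAPERLESS_TOKEN, BASE_URL, OLLAMA_URL, OPENAI_API_KEY), and a hypothetical named volume for ChromaDB; none of these names are mandated by the Dockerfile itself.

```bash
# Build the image from the repository root (tag borrowed from docker-compose.yml)
docker build -t torrtle/simbarag:latest .

# Run it, passing secrets via an env file instead of baking them into the image
# (.env is excluded by the .dockerignore above); "simbarag_chromadb" is a
# placeholder volume name for persisting /app/chromadb
docker run --rm -p 8080:8080 --env-file .env \
  -v simbarag_chromadb:/app/chromadb \
  torrtle/simbarag:latest
```
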
app.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import os
+
+from flask import Flask, request, jsonify, render_template, send_from_directory
+
+from main import consult_simba_oracle
+
+app = Flask(__name__, static_folder="raggr-frontend/dist/static", template_folder="raggr-frontend/dist")
+
+
+# Serve React static files
+@app.route('/static/<path:filename>')
+def static_files(filename):
+    return send_from_directory(app.static_folder, filename)
+
+
+# Serve the React app for all routes (catch-all)
+@app.route('/', defaults={'path': ''})
+@app.route('/<path:path>')
+def serve_react_app(path):
+    if path and os.path.exists(os.path.join(app.template_folder, path)):
+        return send_from_directory(app.template_folder, path)
+    return render_template('index.html')
+
+
+@app.route("/api/query", methods=["POST"])
+def query():
+    data = request.get_json()
+    query = data.get("query")
+    return jsonify({"response": consult_simba_oracle(query)})
+
+
+@app.route("/api/ingest", methods=["POST"])
+def webhook():
+    data = request.get_json()
+    print(data)
+    return jsonify({"status": "received"})
+
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=8080, debug=True)

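For reference, a quick way to exercise the two JSON endpoints once the server is up on port 8080; the question text is just an example.

```bash
# Ask the RAG endpoint a question (returns {"response": "..."})
curl -X POST http://localhost:8080/api/query \
  -H "Content-Type: application/json" \
  -d '{"query": "When did Simba last visit the vet?"}'

# The ingest webhook currently just logs its payload and acknowledges it
curl -X POST http://localhost:8080/api/ingest \
  -H "Content-Type: application/json" \
  -d '{}'
```
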
chunker.py (21 lines changed)
@@ -1,10 +1,11 @@
 import os
 from math import ceil
 import re
+from typing import Union
 from uuid import UUID, uuid4
 
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
-    OllamaEmbeddingFunction,
+from chromadb.utils.embedding_functions.openai_embedding_function import (
+    OpenAIEmbeddingFunction,
 )
 from dotenv import load_dotenv
 
@@ -79,18 +80,23 @@ class Chunk:
 
 
 class Chunker:
-    embedding_fx = OllamaEmbeddingFunction(
-        url=os.getenv("OLLAMA_URL", ""),
-        model_name="mxbai-embed-large",
+    embedding_fx = OpenAIEmbeddingFunction(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        model_name="text-embedding-3-small",
     )
 
     def __init__(self, collection) -> None:
        self.collection = collection
 
-    def chunk_document(self, document: str, chunk_size: int = 1000) -> list[Chunk]:
+    def chunk_document(
+        self,
+        document: str,
+        chunk_size: int = 1000,
+        metadata: dict[str, Union[str, float]] = {},
+    ) -> list[Chunk]:
         doc_uuid = uuid4()
 
-        chunk_size = min(chunk_size, len(document))
+        chunk_size = min(chunk_size, len(document)) or 1
 
         chunks = []
         num_chunks = ceil(len(document) / chunk_size)
@@ -110,6 +116,7 @@ class Chunker:
                 ids=[str(doc_uuid) + ":" + str(i)],
                 documents=[text_chunk],
                 embeddings=embedding,
+                metadatas=[metadata],
             )
 
         return chunks

cleaner.py (new file, 165 lines)
@@ -0,0 +1,165 @@
+import os
+import sys
+import tempfile
+
+import argparse
+from dotenv import load_dotenv
+import ollama
+from PIL import Image
+import fitz
+
+from request import PaperlessNGXService
+
+load_dotenv()
+
+# Configure ollama client with URL from environment or default to localhost
+ollama_client = ollama.Client(host=os.getenv("OLLAMA_URL", "http://localhost:11434"))
+
+parser = argparse.ArgumentParser(description="use llm to clean documents")
+parser.add_argument("document_id", type=str, help="questions about simba's health")
+
+
+def pdf_to_image(filepath: str, dpi=300) -> list[str]:
+    """Returns the filepaths to the created images"""
+    image_temp_files = []
+    try:
+        pdf_document = fitz.open(filepath)
+        print(f"\nConverting '{os.path.basename(filepath)}' to temporary images...")
+
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            zoom = dpi / 72
+            mat = fitz.Matrix(zoom, zoom)
+            pix = page.get_pixmap(matrix=mat)
+
+            # Create a temporary file for the image. delete=False is crucial.
+            with tempfile.NamedTemporaryFile(
+                delete=False,
+                suffix=".png",
+                prefix=f"pdf_page_{page_num + 1}_",
+            ) as temp_image_file:
+                temp_image_path = temp_image_file.name
+
+            # Save the pixel data to the temporary file
+            pix.save(temp_image_path)
+            image_temp_files.append(temp_image_path)
+            print(
+                f"  -> Saved page {page_num + 1} to temporary file: '{temp_image_path}'"
+            )
+
+        print("\nConversion successful! ✨")
+        return image_temp_files
+
+    except Exception as e:
+        print(f"An error occurred during PDF conversion: {e}", file=sys.stderr)
+        # Clean up any image files that were created before the error
+        for path in image_temp_files:
+            os.remove(path)
+        return []
+
+
+def merge_images_vertically_to_tempfile(image_paths):
+    """
+    Merges a list of images vertically and saves the result to a temporary file.
+
+    Args:
+        image_paths (list): A list of strings, where each string is the
+                            filepath to an image.
+
+    Returns:
+        str: The filepath of the temporary merged image file.
+    """
+    if not image_paths:
+        print("Error: The list of image paths is empty.")
+        return None
+
+    # Open all images and check for consistency
+    try:
+        images = [Image.open(path) for path in image_paths]
+    except FileNotFoundError as e:
+        print(f"Error: Could not find image file: {e}")
+        return None
+
+    widths, heights = zip(*(img.size for img in images))
+    max_width = max(widths)
+
+    # All images must have the same width
+    if not all(width == max_width for width in widths):
+        print("Warning: Images have different widths. They will be resized.")
+        resized_images = []
+        for img in images:
+            if img.size[0] != max_width:
+                img = img.resize(
+                    (max_width, int(img.size[1] * (max_width / img.size[0])))
+                )
+            resized_images.append(img)
+        images = resized_images
+        heights = [img.size[1] for img in images]
+
+    # Calculate the total height of the merged image
+    total_height = sum(heights)
+
+    # Create a new blank image with the combined dimensions
+    merged_image = Image.new("RGB", (max_width, total_height))
+
+    # Paste each image onto the new blank image
+    y_offset = 0
+    for img in images:
+        merged_image.paste(img, (0, y_offset))
+        y_offset += img.height
+
+    # Create a temporary file and save the image
+    temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+    temp_path = temp_file.name
+    merged_image.save(temp_path)
+    temp_file.close()
+
+    print(f"Successfully merged {len(images)} images into temporary file: {temp_path}")
+    return temp_path
+
+
+OCR_PROMPT = """
+You job is to extract text from the images I provide you. Extract every bit of the text in the image. Don't say anything just do your job. Text should be same as in the images. If there are multiple images, categorize the transcriptions by page.
+
+Things to avoid:
+- Don't miss anything to extract from the images
+
+Things to include:
+- Include everything, even anything inside [], (), {} or anything.
+- Include any repetitive things like "..." or anything
+- If you think there is any mistake in image just include it too
+
+Someone will kill the innocent kittens if you don't extract the text exactly. So, make sure you extract every bit of the text. Only output the extracted text.
+"""
+
+
+def summarize_pdf_image(filepaths: list[str]):
+    res = ollama_client.chat(
+        model="gemma3:4b",
+        messages=[
+            {
+                "role": "user",
+                "content": OCR_PROMPT,
+                "images": filepaths,
+            }
+        ],
+    )
+
+    return res["message"]["content"]
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    ppngx = PaperlessNGXService()
+
+    if args.document_id:
+        doc_id = args.document_id
+        file = ppngx.get_doc_by_id(doc_id=doc_id)
+        pdf_path = ppngx.download_pdf_from_id(doc_id)
+        print(pdf_path)
+        image_paths = pdf_to_image(filepath=pdf_path)
+        summary = summarize_pdf_image(filepaths=image_paths)
+        print(summary)
+        file["content"] = summary
+        print(file)
+        ppngx.upload_cleaned_content(doc_id, file)

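The script is driven by the argparse setup above: it takes a Paperless-NGX document id, renders the PDF pages to images, OCRs them through the Ollama model, and pushes the cleaned text back. A usage sketch with a made-up document id; BASE_URL, PAPERLESS_TOKEN and OLLAMA_URL are expected to come from the environment or .env.

```bash
# 42 is a placeholder document id
python cleaner.py 42
```
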
docker-compose.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
+version: '3.8'
+
+services:
+  raggr:
+    image: torrtle/simbarag:latest
+    network_mode: host
+    environment:
+      - PAPERLESS_TOKEN=${PAPERLESS_TOKEN}
+      - BASE_URL=${BASE_URL}
+      - OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
+      - CHROMADB_PATH=/app/chromadb
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+    volumes:
+      - chromadb_data:/app/chromadb
+
+volumes:
+  chromadb_data:

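A sketch of bringing the stack up with Compose, assuming the referenced variables live in a .env file next to docker-compose.yml (Compose reads it automatically).

```bash
# .env should define PAPERLESS_TOKEN, BASE_URL, OLLAMA_URL and OPENAI_API_KEY
docker compose up -d

# Follow the reindex + Flask startup logs driven by startup.sh
docker compose logs -f raggr
```
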
main.py (178 lines changed)
@@ -1,19 +1,26 @@
+import datetime
 import logging
 import os
+from typing import Any, Union
 
 import argparse
 import chromadb
 import ollama
+from openai import OpenAI
 
 
 from request import PaperlessNGXService
 from chunker import Chunker
+from query import QueryGenerator
+from cleaner import pdf_to_image, summarize_pdf_image
 
 from dotenv import load_dotenv
 
 load_dotenv()
 
+# Configure ollama client with URL from environment or default to localhost
+ollama_client = ollama.Client(host=os.getenv("OLLAMA_URL", "http://localhost:11434"))
 
 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
@@ -26,59 +33,174 @@ parser.add_argument("query", type=str, help="questions about simba's health")
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
+parser.add_argument(
+    "--index", help="index a file"
+)
+
+ppngx = PaperlessNGXService()
+
+openai_client = OpenAI()
+
+
+def index_using_pdf_llm():
+    files = ppngx.get_data()
+    for file in files:
+        document_id = file["id"]
+        pdf_path = ppngx.download_pdf_from_id(id=document_id)
+        image_paths = pdf_to_image(filepath=pdf_path)
+        print(f"summarizing {file}")
+        generated_summary = summarize_pdf_image(filepaths=image_paths)
+        file["content"] = generated_summary
+
+    chunk_data(files, simba_docs)
+
+
+def date_to_epoch(date_str: str) -> float:
+    split_date = date_str.split("-")
+    print(split_date)
+    date = datetime.datetime(
+        int(split_date[0]),
+        int(split_date[1]),
+        int(split_date[2]),
+        0,
+        0,
+        0,
+    )
+
+    return date.timestamp()
+
+
-def chunk_data(texts: list[str], collection):
+def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):
     # Step 2: Create chunks
     chunker = Chunker(collection)
 
-    print(f"chunking {len(texts)} documents")
-    for text in texts:
-        chunker.chunk_document(document=text)
+    print(f"chunking {len(docs)} documents")
+    print(docs)
+    texts: list[str] = [doc["content"] for doc in docs]
+    for index, text in enumerate(texts):
+        print(docs[index]["original_file_name"])
+        metadata = {
+            "created_date": date_to_epoch(docs[index]["created_date"]),
+            "filename": docs[index]["original_file_name"]
+        }
+        chunker.chunk_document(
+            document=text,
+            metadata=metadata,
+        )
 
 
-def consult_oracle(input: str, collection):
-    # Ask
-    embeddings = Chunker.embedding_fx(input=[input])
-    results = collection.query(query_texts=[input], query_embeddings=embeddings)
+def chunk_text(texts: list[str], collection):
+    chunker = Chunker(collection)
 
-    # Generate
-    output = ollama.generate(
-        model="gemma3n:e4b",
-        prompt=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
-    )
+    for index, text in enumerate(texts):
+        metadata = {}
+        chunker.chunk_document(
+            document=text,
+            metadata=metadata,
+        )
 
-    print(output["response"])
+
+def consult_oracle(input: str, collection):
+    print(input)
+    import time
+    start_time = time.time()
+
+    # Ask
+    # print("Starting query generation")
+    # qg_start = time.time()
+    # qg = QueryGenerator()
+    # metadata_filter = qg.get_query(input)
+    # qg_end = time.time()
+    # print(f"Query generation took {qg_end - qg_start:.2f} seconds")
+    # print(metadata_filter)
+
+    print("Starting embedding generation")
+    embedding_start = time.time()
+    embeddings = Chunker.embedding_fx(input=[input])
+    embedding_end = time.time()
+    print(f"Embedding generation took {embedding_end - embedding_start:.2f} seconds")
+
+    print("Starting collection query")
+    query_start = time.time()
+    results = collection.query(
+        query_texts=[input],
+        query_embeddings=embeddings,
+        #where=metadata_filter,
+    )
+    print(results)
+    query_end = time.time()
+    print(f"Collection query took {query_end - query_start:.2f} seconds")
+
+    # Generate
+    print("Starting LLM generation")
+    llm_start = time.time()
+    # output = ollama_client.generate(
+    #     model="gemma3n:e4b",
+    #     prompt=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
+    # )
+    response = openai_client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant that understands veterinary terms."},
+            {"role": "user", "content": f"Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}"}
+        ]
+    )
+    llm_end = time.time()
+    print(f"LLM generation took {llm_end - llm_start:.2f} seconds")
+
+    total_time = time.time() - start_time
+    print(f"Total consult_oracle execution took {total_time:.2f} seconds")
+
+    return response.choices[0].message.content
 
 
 def paperless_workflow(input):
     # Step 1: Get the text
     ppngx = PaperlessNGXService()
     docs = ppngx.get_data()
-    texts = [doc["content"] for doc in docs]
 
-    chunk_data(texts, collection=simba_docs)
+    chunk_data(docs, collection=simba_docs)
     consult_oracle(input, simba_docs)
 
 
+def consult_simba_oracle(input: str):
+    return consult_oracle(
+        input=input,
+        collection=simba_docs,
+    )
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     if args.reindex:
-        logging.info(msg="Fetching documents from Paperless-NGX")
+        print("Fetching documents from Paperless-NGX")
         ppngx = PaperlessNGXService()
         docs = ppngx.get_data()
-        texts = [doc["content"] for doc in docs]
-        logging.info(msg=f"Fetched {len(texts)} documents")
+        print(docs)
+        print(f"Fetched {len(docs)} documents")
+        #
+        print("Chunking documents now ...")
+        chunk_data(docs, collection=simba_docs)
+        print("Done chunking documents")
+        # index_using_pdf_llm()
 
-        logging.info(msg="Chunking documents now ...")
-        chunk_data(texts, collection=simba_docs)
-        logging.info(msg="Done chunking documents")
+    if args.index:
+        with open(args.index) as file:
+            extension = args.index.split(".")[-1]
+
+            if extension == "pdf":
+                pdf_path = ppngx.download_pdf_from_id(id=document_id)
+                image_paths = pdf_to_image(filepath=pdf_path)
+                print(f"summarizing {file}")
+                generated_summary = summarize_pdf_image(filepaths=image_paths)
+            elif extension in [".md", ".txt"]:
+                chunk_text(texts=[file.readall()], collection=simba_docs)
 
     if args.query:
-        logging.info("Consulting oracle ...")
-        consult_oracle(
+        print("Consulting oracle ...")
+        print(consult_oracle(
             input=args.query,
             collection=simba_docs,
-        )
+        ))
     else:
         print("please provide a query")

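Judging from the argparse flags added above, main.py now exposes three entry points; a hedged usage sketch (notes.md is a placeholder filename, and the sample question mirrors the examples in query.py):

```bash
# Re-chunk every Paperless-NGX document into the simba_docs collection
python main.py "" --reindex

# Ask a one-off question against the indexed documents
python main.py "How many teeth has Simba had removed?"

# Intended usage of the new flag for indexing a local file
python main.py "" --index notes.md
```
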
pyproject.toml
@@ -4,4 +4,14 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = []
+dependencies = [
+    "chromadb>=1.1.0",
+    "python-dotenv>=1.0.0",
+    "flask>=3.1.2",
+    "httpx>=0.28.1",
+    "ollama>=0.6.0",
+    "openai>=2.0.1",
+    "pydantic>=2.11.9",
+    "pillow>=10.0.0",
+    "pymupdf>=1.24.0",
+]

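For local development outside Docker, the same dependency set can be installed with uv, mirroring the `uv pip install --system -e .` step in the Dockerfile but targeting a project virtualenv instead of the system interpreter.

```bash
uv venv              # creates .venv/ (already ignored by .dockerignore)
uv pip install -e .  # installs the dependencies declared above
```
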
query.py (81 lines changed)
@@ -1,10 +1,16 @@
 import json
+import os
 from typing import Literal
+import datetime
+from ollama import chat, ChatResponse, Client
 
-from ollama import chat, ChatResponse
+from openai import OpenAI
 
 from pydantic import BaseModel, Field
 
+# Configure ollama client with URL from environment or default to localhost
+ollama_client = Client(host=os.getenv("OLLAMA_URL", "http://localhost:11434"))
+
 # This uses inferred filters — which means using LLM to create the metadata filters
 
 
@@ -27,11 +33,15 @@ class GeneratedQuery(BaseModel):
     fields: list[str]
     extracted_metadata_fields: str
 
+
+class Time(BaseModel):
+    time: int
+
 PROMPT = """
-You are an information specialist that processes user queries. The user queries are all about
+You are an information specialist that processes user queries. The current year is 2025. The user queries are all about
 a cat, Simba, and its records. The types of records are listed below. Using the query, extract the
-type of record the user is trying to query and the date range the user is trying to query.
+the date range the user is trying to query. You should return it as a JSON. The date tag is created_date. Return the date in epoch time.
+
+If the created_date cannot be ascertained, set it to epoch time start.
 
 You have several operators at your disposal:
@@ -49,18 +59,18 @@ Logical operators:
 
 ### Example 1
 Query: "Who is Simba's current vet?"
-Metadata fields: "{"created_date, tags"}"
-Extracted metadata fields: {"$and": [{"created_date: {"$gt": "2025-01-01"}, "tags": {"$in": ["bill", "medical records", "aftercare"]}}]}
+Metadata fields: "{"created_date"}"
+Extracted metadata fields: {"created_date: {"$gt": "2025-01-01"}}
 
 ### Example 2
 Query: "How many teeth has Simba had removed?"
-Metadata fields: {"tags"}
-Extracted metadata fields: {"tags": "medical records"}
+Metadata fields: {}
+Extracted metadata fields: {}
 
 ### Example 3
 Query: "How many times has Simba been to the vet this year?"
-Metadata fields: {"tags", "created_date"}
-Extracted metadata fields: {"$and": [{"created_date": {"gt": "2025-01-01"}, "tags": {"$in": ["bill"]}}]}
+Metadata fields: {"created_date"}
+Extracted metadata fields: {"created_date": {"gt": "2025-01-01"}}
 
 document_types:
 - aftercare
@@ -76,23 +86,56 @@ class QueryGenerator:
     def __init__(self) -> None:
         pass
 
+    def date_to_epoch(self, date_str: str) -> float:
+        split_date = date_str.split("-")
+        date = datetime.datetime(
+            int(split_date[0]),
+            int(split_date[1]),
+            int(split_date[2]),
+            0,
+            0,
+            0,
+        )
+
+        return date.timestamp()
+
     def get_query(self, input: str):
-        response: ChatResponse = chat(
-            model="gemma3n:e4b",
-            messages=[
+        client = OpenAI()
+        print(input)
+        response = client.responses.parse(
+            model="gpt-4o",
+            input=[
                 {"role": "system", "content": PROMPT},
                 {"role": "user", "content": input},
             ],
-            format=GeneratedQuery.model_json_schema(),
+            text_format=Time,
         )
+        print(response)
+        query = json.loads(response.output_parsed.extracted_metadata_fields)
 
-        print(
-            json.loads(
-                json.loads(response["message"]["content"])["extracted_metadata_fields"]
-            )
-        )
+        # response: ChatResponse = ollama_client.chat(
+        #     model="gemma3n:e4b",
+        #     messages=[
+        #         {"role": "system", "content": PROMPT},
+        #         {"role": "user", "content": input},
+        #     ],
+        #     format=GeneratedQuery.model_json_schema(),
+        # )
+
+        # query = json.loads(
+        #     json.loads(response["message"]["content"])["extracted_metadata_fields"]
+        # )
+        date_key = list(query["created_date"].keys())[0]
+        query["created_date"][date_key] = self.date_to_epoch(
+            query["created_date"][date_key]
+        )
+
+        if "$" not in date_key:
+            query["created_date"]["$" + date_key] = query["created_date"][date_key]
+
+        return query
 
 
 if __name__ == "__main__":
     qg = QueryGenerator()
-    qg.get_query("How old is Simba?")
+    print(qg.get_query("How heavy is Simba?"))

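The __main__ block above doubles as a smoke test for the inferred-filter path; a sketch of running it directly, assuming an OpenAI key is available in the environment (the key value shown is a placeholder).

```bash
# Prints the generated created_date metadata filter for the sample question
OPENAI_API_KEY=sk-... python query.py
```
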
raggr-frontend/.gitignore (new file, vendored, 16 lines)
@@ -0,0 +1,16 @@
+# Local
+.DS_Store
+*.local
+*.log*
+
+# Dist
+node_modules
+dist/
+
+# Profile
+.rspack-profile-*/
+
+# IDE
+.vscode/*
+!.vscode/extensions.json
+.idea

raggr-frontend/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
+# Rsbuild project
+
+## Setup
+
+Install the dependencies:
+
+```bash
+pnpm install
+```
+
+## Get started
+
+Start the dev server, and the app will be available at [http://localhost:3000](http://localhost:3000).
+
+```bash
+pnpm dev
+```
+
+Build the app for production:
+
+```bash
+pnpm build
+```
+
+Preview the production build locally:
+
+```bash
+pnpm preview
+```
+
+## Learn more
+
+To learn more about Rsbuild, check out the following resources:
+
+- [Rsbuild documentation](https://rsbuild.rs) - explore Rsbuild features and APIs.
+- [Rsbuild GitHub repository](https://github.com/web-infra-dev/rsbuild) - your feedback and contributions are welcome!

raggr-frontend/package.json (new file, 26 lines)
@@ -0,0 +1,26 @@
+{
+  "name": "raggr-frontend",
+  "version": "1.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "build": "rsbuild build",
+    "dev": "rsbuild dev --open",
+    "preview": "rsbuild preview"
+  },
+  "dependencies": {
+    "axios": "^1.12.2",
+    "marked": "^16.3.0",
+    "react": "^19.1.1",
+    "react-dom": "^19.1.1",
+    "react-markdown": "^10.1.0"
+  },
+  "devDependencies": {
+    "@rsbuild/core": "^1.5.6",
+    "@rsbuild/plugin-react": "^1.4.0",
+    "@tailwindcss/postcss": "^4.0.0",
+    "@types/react": "^19.1.13",
+    "@types/react-dom": "^19.1.9",
+    "typescript": "^5.9.2"
+  }
+}

raggr-frontend/postcss.config.mjs (new file, 5 lines)
@@ -0,0 +1,5 @@
+export default {
+  plugins: {
+    "@tailwindcss/postcss": {},
+  },
+};

raggr-frontend/rsbuild.config.ts (new file, 6 lines)
@@ -0,0 +1,6 @@
+import { defineConfig } from '@rsbuild/core';
+import { pluginReact } from '@rsbuild/plugin-react';
+
+export default defineConfig({
+  plugins: [pluginReact()],
+});

raggr-frontend/src/App.css (new file, 6 lines)
@@ -0,0 +1,6 @@
+@import "tailwindcss";
+
+body {
+  margin: 0;
+  font-family: Inter, Avenir, Helvetica, Arial, sans-serif;
+}

raggr-frontend/src/App.tsx (new file, 66 lines)
@@ -0,0 +1,66 @@
+import { useState } from "react";
+import axios from "axios";
+import ReactMarkdown from "react-markdown";
+
+import "./App.css";
+
+const App = () => {
+  const [query, setQuery] = useState<string>("");
+  const [answer, setAnswer] = useState<string>("");
+  const [loading, setLoading] = useState<boolean>(false);
+
+  const handleQuestionSubmit = () => {
+    const payload = { query: query };
+    setLoading(true);
+    axios
+      .post("/api/query", payload)
+      .then((result) => setAnswer(result.data.response))
+      .finally(() => setLoading(false));
+  };
+  const handleQueryChange = (event) => {
+    setQuery(event.target.value);
+  };
+  return (
+    <div className="flex flex-row justify-center py-4">
+      <div className="flex flex-col gap-4 min-w-xl max-w-xl">
+        <div className="flex flex-row justify-center gap-2 grow">
+          <h1 className="text-3xl">ask simba!</h1>
+        </div>
+        <div className="flex flex-row justify-between gap-2 grow">
+          <textarea
+            type="text"
+            className="p-4 border border-blue-200 rounded-md grow"
+            onChange={handleQueryChange}
+          />
+        </div>
+        <div className="flex flex-row justify-between gap-2 grow">
+          <button
+            className="p-4 border border-blue-400 bg-blue-200 hover:bg-blue-400 cursor-pointer rounded-md flex-grow"
+            onClick={() => handleQuestionSubmit()}
+            type="submit"
+          >
+            Submit
+          </button>
+        </div>
+        {loading ? (
+          <div className="flex flex-col w-full animate-pulse gap-2">
+            <div className="flex flex-row gap-2 w-full">
+              <div className="bg-gray-400 w-1/2 p-3 rounded-lg" />
+              <div className="bg-gray-400 w-1/2 p-3 rounded-lg" />
+            </div>
+            <div className="flex flex-row gap-2 w-full">
+              <div className="bg-gray-400 w-1/3 p-3 rounded-lg" />
+              <div className="bg-gray-400 w-2/3 p-3 rounded-lg" />
+            </div>
+          </div>
+        ) : (
+          <div className="flex flex-col">
+            <ReactMarkdown>{answer}</ReactMarkdown>
+          </div>
+        )}
+      </div>
+    </div>
+  );
+};
+
+export default App;

raggr-frontend/src/env.d.ts (new file, vendored, 11 lines)
@@ -0,0 +1,11 @@
+/// <reference types="@rsbuild/core/types" />
+
+/**
+ * Imports the SVG file as a React component.
+ * @requires [@rsbuild/plugin-svgr](https://npmjs.com/package/@rsbuild/plugin-svgr)
+ */
+declare module '*.svg?react' {
+  import type React from 'react';
+  const ReactComponent: React.FunctionComponent<React.SVGProps<SVGSVGElement>>;
+  export default ReactComponent;
+}

raggr-frontend/src/index.tsx (new file, 13 lines)
@@ -0,0 +1,13 @@
+import React from 'react';
+import ReactDOM from 'react-dom/client';
+import App from './App';
+
+const rootEl = document.getElementById('root');
+if (rootEl) {
+  const root = ReactDOM.createRoot(rootEl);
+  root.render(
+    <React.StrictMode>
+      <App />
+    </React.StrictMode>,
+  );
+}

raggr-frontend/tsconfig.json (new file, 25 lines)
@@ -0,0 +1,25 @@
+{
+  "compilerOptions": {
+    "lib": ["DOM", "ES2020"],
+    "jsx": "react-jsx",
+    "target": "ES2020",
+    "noEmit": true,
+    "skipLibCheck": true,
+    "useDefineForClassFields": true,
+
+    /* modules */
+    "module": "ESNext",
+    "moduleDetection": "force",
+    "moduleResolution": "bundler",
+    "verbatimModuleSyntax": true,
+    "resolveJsonModule": true,
+    "allowImportingTsExtensions": true,
+    "noUncheckedSideEffectImports": true,
+
+    /* type checking */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true
+  },
+  "include": ["src"]
+}

raggr-frontend/yarn.lock (new file, 1424 lines)
File diff suppressed because it is too large.

request.py (25 lines changed)
@@ -1,4 +1,5 @@
 import os
+import tempfile
 import httpx
 
 from dotenv import load_dotenv
@@ -18,6 +19,30 @@ class PaperlessNGXService:
         r = httpx.get(self.url, headers=self.headers)
         return r.json()["results"]
 
+    def get_doc_by_id(self, doc_id: int):
+        url = f"http://{os.getenv("BASE_URL")}/api/documents/{doc_id}/"
+        r = httpx.get(url, headers=self.headers)
+        return r.json()
+
+    def download_pdf_from_id(self, id: int) -> str:
+        download_url = f"http://{os.getenv("BASE_URL")}/api/documents/{id}/download/"
+        response = httpx.get(
+            download_url, headers=self.headers, follow_redirects=True, timeout=30
+        )
+        response.raise_for_status()
+        # Use a temporary file for the downloaded PDF
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+        temp_file.write(response.content)
+        temp_file.close()
+        temp_pdf_path = temp_file.name
+        pdf_to_process = temp_pdf_path
+        return pdf_to_process
+
+    def upload_cleaned_content(self, document_id, data):
+        PUTS_URL = f"http://{os.getenv("BASE_URL")}/api/documents/{document_id}/"
+        r = httpx.put(PUTS_URL, headers=self.headers, data=data)
+        r.raise_for_status()
+
 
 if __name__ == "__main__":
     pp = PaperlessNGXService()

startup.sh (new file, 7 lines)
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+echo "Starting reindex process..."
+python main.py "" --reindex
+
+echo "Starting Flask application..."
+python app.py

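The container's CMD runs this script; it can also be run directly on a host that already has the Python dependencies installed and the environment variables set.

```bash
chmod +x startup.sh   # the Dockerfile already does this inside the image
./startup.sh          # reindexes, then starts Flask on port 8080
```
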