Compare commits
29 commits: a69f7864f3 ... user-suppo
Commits (SHA1):
5054b4a859, 8479898cc4, acaf681927, 2bbe33fedc, b872750444, 376baccadb,
c978b1a255, 51b9932389, ebf39480b6, e4a04331cb, 166ffb4c09, 64e286e623,
c6c14729dd, 910097d13b, 0bb3e3172b, 24b30bc8a3, 3ffc95a1b0, c5091dc07a,
c140758560, ab3a0eb442, c619d78922, c20ae0a4b9, 26cc01b58b, 746b60e070,
577c9144ac, 2b2891bd79, 03b033e9a4, a640ae5fed, 99c98b7e42
.dockerignore (new file, 16 lines)
@@ -0,0 +1,16 @@
.git
.gitignore
README.md
.env
.DS_Store
chromadb/
chroma_db/
raggr-frontend/node_modules/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
.venv/
venv/
.pytest_cache/
.python-version (new file, 1 line)
@@ -0,0 +1 @@
3.13
Dockerfile (new file, 46 lines)
@@ -0,0 +1,46 @@
FROM python:3.13-slim

WORKDIR /app

# Install system dependencies, Node.js, Yarn, and uv
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
    && apt-get install -y nodejs \
    && npm install -g yarn \
    && rm -rf /var/lib/apt/lists/* \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"

# Copy dependency files
COPY pyproject.toml ./

# Install Python dependencies using uv
RUN uv pip install --system -e .

# Copy application code
COPY *.py ./
COPY startup.sh ./
RUN chmod +x startup.sh

# Copy frontend code and build
COPY raggr-frontend ./raggr-frontend
WORKDIR /app/raggr-frontend
RUN yarn install && yarn build
WORKDIR /app

# Create ChromaDB directory
RUN mkdir -p /app/chromadb

# Expose port
EXPOSE 8080

# Set environment variables
ENV PYTHONPATH=/app
ENV CHROMADB_PATH=/app/chromadb

# Run the startup script
CMD ["./startup.sh"]
app.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import os

from quart import Quart, request, jsonify, render_template, send_from_directory
from tortoise.contrib.quart import register_tortoise

from quart_jwt_extended import JWTManager

from main import consult_simba_oracle
from blueprints.conversation.logic import (
    get_the_only_conversation,
    add_message_to_conversation,
)

app = Quart(
    __name__,
    static_folder="raggr-frontend/dist/static",
    template_folder="raggr-frontend/dist",
)

app.config["JWT_SECRET_KEY"] = os.getenv("JWT_SECRET_KEY", "SECRET_KEY")
jwt = JWTManager(app)

# Initialize Tortoise ORM
register_tortoise(
    app,
    db_url=os.getenv("DATABASE_URL", "sqlite://raggr.db"),
    modules={"models": ["blueprints.conversation.models"]},
    generate_schemas=True,
)


# Serve React static files
@app.route("/static/<path:filename>")
async def static_files(filename):
    return await send_from_directory(app.static_folder, filename)


# Serve the React app for all routes (catch-all)
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
async def serve_react_app(path):
    if path and os.path.exists(os.path.join(app.template_folder, path)):
        return await send_from_directory(app.template_folder, path)
    return await render_template("index.html")


@app.route("/api/query", methods=["POST"])
async def query():
    data = await request.get_json()
    query = data.get("query")
    # add message to database
    conversation = await get_the_only_conversation()
    print(conversation)
    await add_message_to_conversation(
        conversation=conversation, message=query, speaker="user"
    )

    response = consult_simba_oracle(query)
    await add_message_to_conversation(
        conversation=conversation, message=response, speaker="simba"
    )
    return jsonify({"response": response})


@app.route("/api/messages", methods=["GET"])
async def get_messages():
    conversation = await get_the_only_conversation()
    # Prefetch related messages
    await conversation.fetch_related("messages")

    # Manually serialize the conversation with messages
    messages = []
    for msg in conversation.messages:
        messages.append(
            {
                "id": str(msg.id),
                "text": msg.text,
                "speaker": msg.speaker.value,
                "created_at": msg.created_at.isoformat(),
            }
        )

    return jsonify(
        {
            "id": str(conversation.id),
            "name": conversation.name,
            "messages": messages,
            "created_at": conversation.created_at.isoformat(),
            "updated_at": conversation.updated_at.isoformat(),
        }
    )


# @app.route("/api/ingest", methods=["POST"])
# def webhook():
#     data = request.get_json()
#     print(data)
#     return jsonify({"status": "received"})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080, debug=True)
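The two JSON endpoints above can be smoke-tested once the app is running. A minimal sketch, assuming the server is up on localhost:8080 (the host/port app.py binds) and using the httpx dependency already in pyproject.toml; the example question is hypothetical:

```python
# Minimal smoke test for /api/query and /api/messages (see app.py above).
import httpx

# Ask a question; the server persists both sides of the exchange.
resp = httpx.post(
    "http://localhost:8080/api/query",
    json={"query": "When was Simba's last vet visit?"},  # hypothetical query
    timeout=120.0,  # LLM generation can take a while
)
print(resp.json()["response"])

# Read back the whole conversation that /api/query appends to.
history = httpx.get("http://localhost:8080/api/messages").json()
for msg in history["messages"]:
    print(f"{msg['speaker']}: {msg['text']}")
```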
blueprints/conversation/__init__.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from quart import Blueprint, jsonify
from .models import (
    Conversation,
    PydConversation,
)

conversation_blueprint = Blueprint(
    "conversation_api", __name__, url_prefix="/api/conversation"
)


@conversation_blueprint.route("/<conversation_id>")
async def get_conversation(conversation_id: str):
    conversation = await Conversation.get(id=conversation_id)
    serialized_conversation = await PydConversation.from_tortoise_orm(conversation)

    return jsonify(serialized_conversation.model_dump_json())
blueprints/conversation/logic.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from .models import Conversation, ConversationMessage


async def create_conversation(name: str = "") -> Conversation:
    conversation = await Conversation.create(name=name)
    return conversation


async def add_message_to_conversation(
    conversation: Conversation,
    message: str,
    speaker: str,
) -> ConversationMessage:
    print(conversation, message, speaker)
    message = await ConversationMessage.create(
        text=message,
        speaker=speaker,
        conversation=conversation,
    )

    return message


async def get_the_only_conversation() -> Conversation:
    try:
        conversation = await Conversation.all().first()
        if conversation is None:
            conversation = await Conversation.create(name="simba_chat")
    except Exception as _e:
        conversation = await Conversation.create(name="simba_chat")

    return conversation
blueprints/conversation/models.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import enum

from tortoise.models import Model
from tortoise import fields
from tortoise.contrib.pydantic import (
    pydantic_queryset_creator,
    pydantic_model_creator,
)


class Speaker(enum.Enum):
    USER = "user"
    SIMBA = "simba"


class Conversation(Model):
    id = fields.UUIDField(primary_key=True)
    name = fields.CharField(max_length=255)
    created_at = fields.DatetimeField(auto_now_add=True)
    updated_at = fields.DatetimeField(auto_now=True)

    class Meta:
        table = "conversations"


class ConversationMessage(Model):
    id = fields.UUIDField(primary_key=True)
    text = fields.TextField()
    conversation = fields.ForeignKeyField(
        "models.Conversation", related_name="messages"
    )
    created_at = fields.DatetimeField(auto_now_add=True)
    speaker = fields.CharEnumField(enum_type=Speaker, max_length=10)

    class Meta:
        table = "conversation_messages"


PydConversationMessage = pydantic_model_creator(ConversationMessage)
PydConversation = pydantic_model_creator(Conversation, name="Conversation")
PydListConversationMessage = pydantic_queryset_creator(ConversationMessage)
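For working with these models outside the Quart app (register_tortoise in app.py normally handles initialization), a minimal standalone sketch, assuming the same sqlite database and module path used in app.py:

```python
# Minimal sketch: initialize Tortoise ORM by hand and write one message.
import asyncio

from tortoise import Tortoise

from blueprints.conversation.models import (
    Conversation,
    ConversationMessage,
    Speaker,
)


async def demo():
    await Tortoise.init(
        db_url="sqlite://raggr.db",  # same default as app.py
        modules={"models": ["blueprints.conversation.models"]},
    )
    await Tortoise.generate_schemas()

    convo = await Conversation.create(name="simba_chat")
    await ConversationMessage.create(
        text="meow", speaker=Speaker.SIMBA, conversation=convo
    )
    await Tortoise.close_connections()


asyncio.run(demo())
```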
chunker.py (26 changed lines)
@@ -3,15 +3,20 @@ from math import ceil
 import re
 from typing import Union
 from uuid import UUID, uuid4
+from ollama import Client
-from chromadb.utils.embedding_functions.ollama_embedding_function import (
+from chromadb.utils.embedding_functions.openai_embedding_function import (
-    OllamaEmbeddingFunction,
+    OpenAIEmbeddingFunction,
 )
 from dotenv import load_dotenv
+from llm import LLMClient


 load_dotenv()

+ollama_client = Client(
+    host=os.getenv("OLLAMA_HOST", "http://localhost:11434"), timeout=10.0
+)


 def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
     if header_patterns is None:
@@ -80,13 +85,16 @@ class Chunk:


 class Chunker:
-    embedding_fx = OllamaEmbeddingFunction(
-        url=os.getenv("OLLAMA_URL", ""),
-        model_name="mxbai-embed-large",
-    )

     def __init__(self, collection) -> None:
         self.collection = collection
+        self.llm_client = LLMClient()

+    def embedding_fx(self, inputs):
+        openai_embedding_fx = OpenAIEmbeddingFunction(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            model_name="text-embedding-3-small",
+        )
+        return openai_embedding_fx(inputs)

     def chunk_document(
         self,
@@ -96,7 +104,7 @@ class Chunker:
     ) -> list[Chunk]:
         doc_uuid = uuid4()

-        chunk_size = min(chunk_size, len(document))
+        chunk_size = min(chunk_size, len(document)) or 1

         chunks = []
         num_chunks = ceil(len(document) / chunk_size)
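The net effect of this change is that chunk embeddings now come from OpenAI rather than a local Ollama model. A minimal sketch of the new path, mirroring the embedding_fx method above (assumes OPENAI_API_KEY is set in the environment):

```python
# Sketch of the embedding call Chunker.embedding_fx now performs.
import os

from chromadb.utils.embedding_functions.openai_embedding_function import (
    OpenAIEmbeddingFunction,
)

embed = OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small",
)
vectors = embed(["How heavy is Simba?"])  # one embedding vector per input string
```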
cleaner.py (changed lines)
@@ -12,6 +12,9 @@ from request import PaperlessNGXService

 load_dotenv()

+# Configure ollama client with URL from environment or default to localhost
+ollama_client = ollama.Client(host=os.getenv("OLLAMA_URL", "http://localhost:11434"))

 parser = argparse.ArgumentParser(description="use llm to clean documents")
 parser.add_argument("document_id", type=str, help="questions about simba's health")

@@ -131,7 +134,7 @@ Someone will kill the innocent kittens if you don't extract the text exactly. So


 def summarize_pdf_image(filepaths: list[str]):
-    res = ollama.chat(
+    res = ollama_client.chat(
         model="gemma3:4b",
         messages=[
             {
docker-compose.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
version: "3.8"

services:
  raggr:
    image: torrtle/simbarag:latest
    network_mode: host
    environment:
      - PAPERLESS_TOKEN=${PAPERLESS_TOKEN}
      - BASE_URL=${BASE_URL}
      - OLLAMA_URL=${OLLAMA_URL:-http://localhost:11434}
      - CHROMADB_PATH=/app/chromadb
      - OPENAI_API_KEY=${OPENAI_API_KEY}
    volumes:
      - chromadb_data:/app/chromadb

volumes:
  chromadb_data:
image_process.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from ollama import Client
import argparse
import os
import logging
from PIL import Image, ExifTags
from pillow_heif import register_heif_opener
from pydantic import BaseModel

from dotenv import load_dotenv

load_dotenv()

register_heif_opener()

logging.basicConfig(level=logging.INFO)


parser = argparse.ArgumentParser(
    prog="SimbaImageProcessor",
    description="What the program does",
    epilog="Text at the bottom of help",
)

parser.add_argument("filepath")

client = Client(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))


class SimbaImageDescription(BaseModel):
    image_date: str
    description: str


def describe_simba_image(input):
    logging.info("Opening image of Simba ...")
    if "heic" in input.lower() or "heif" in input.lower():
        # Convert HEIC/HEIF to JPEG, but read EXIF from the original image.
        new_filepath = input.split(".")[0] + ".jpg"
        img = Image.open(input)
        img.save(new_filepath, "JPEG")
        logging.info("Extracting EXIF...")
        exif = {
            ExifTags.TAGS[k]: v for k, v in img.getexif().items() if k in ExifTags.TAGS
        }
        img = Image.open(new_filepath)
        input = new_filepath
    else:
        img = Image.open(input)

        logging.info("Extracting EXIF...")
        exif = {
            ExifTags.TAGS[k]: v for k, v in img.getexif().items() if k in ExifTags.TAGS
        }

    if "MakerNote" in exif:
        exif.pop("MakerNote")

    logging.info(exif)

    prompt = f"Simba is an orange cat belonging to Ryan Chen. In 2025, they lived in New York. In 2024, they lived in California. Analyze the following image and tell me what Simba seems to be doing. Be extremely descriptive about Simba, things in the background, and the setting of the image. I will also include the EXIF data of the image, please use it to help you determine information about Simba. EXIF: {exif}. Put the notes in the description field and the date in the image_date field."

    logging.info("Sending info to Ollama ...")
    response = client.chat(
        model="gemma3:4b",
        messages=[
            {
                "role": "system",
                "content": "you are a very shrewd and descriptive note taker. all of your responses will be formatted like notes in bullet points. be very descriptive. do not leave a single thing out.",
            },
            {"role": "user", "content": prompt, "images": [input]},
        ],
        format=SimbaImageDescription.model_json_schema(),
    )

    result = SimbaImageDescription.model_validate_json(response["message"]["content"])

    return result


if __name__ == "__main__":
    args = parser.parse_args()
    if args.filepath:
        logging.info(f"Processing {args.filepath} ...")
        describe_simba_image(input=args.filepath)
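Standalone usage of the helper above, assuming a local Ollama server with the gemma3:4b model pulled; the file path is hypothetical:

```python
from image_process import describe_simba_image

# HEIC/HEIF inputs are converted to JPEG first, per the branch above.
result = describe_simba_image(input="photo_of_simba.jpg")  # hypothetical path
print(result.image_date)
print(result.description)
```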
index_immich.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import httpx
import os
from pathlib import Path
import logging
import tempfile

from image_process import describe_simba_image
from request import PaperlessNGXService
import sqlite3

logging.basicConfig(level=logging.INFO)


from dotenv import load_dotenv

load_dotenv()

# Configuration from environment variables
IMMICH_URL = os.getenv("IMMICH_URL", "http://localhost:2283")
API_KEY = os.getenv("IMMICH_API_KEY")
PERSON_NAME = os.getenv("PERSON_NAME", "Simba")  # Name of the tagged person/pet
DOWNLOAD_DIR = os.getenv("DOWNLOAD_DIR", "./simba_photos")

# Set up headers
headers = {"x-api-key": API_KEY, "Content-Type": "application/json"}

# IDs of Immich assets that have already been processed (a set, so .add works)
VISITED = set()

if __name__ == "__main__":
    conn = sqlite3.connect("./visited.db")
    c = conn.cursor()
    c.execute("select immich_id from visited")
    rows = c.fetchall()
    for row in rows:
        VISITED.add(row[0])

    ppngx = PaperlessNGXService()
    people_url = f"{IMMICH_URL}/api/search/person?name=Simba"
    people = httpx.get(people_url, headers=headers).json()

    simba_id = people[0]["id"]

    ids = {}

    asset_search = f"{IMMICH_URL}/api/search/smart"
    request_body = {"query": "orange cat"}
    results = httpx.post(asset_search, headers=headers, json=request_body)

    assets = results.json()["assets"]
    for asset in assets["items"]:
        if asset["type"] == "IMAGE" and asset["id"] not in VISITED:
            ids[asset["id"]] = asset.get("originalFileName")
    nextPage = assets.get("nextPage")

    # while nextPage != None:
    #     logging.info(f"next page: {nextPage}")
    #     request_body["page"] = nextPage
    #     results = httpx.post(asset_search, headers=headers, json=request_body)
    #     assets = results.json()["assets"]

    #     for asset in assets["items"]:
    #         if asset["type"] == "IMAGE":
    #             ids.add(asset['id'])

    #     nextPage = assets.get("nextPage")

    asset_search = f"{IMMICH_URL}/api/search/smart"
    request_body = {"query": "simba"}
    results = httpx.post(asset_search, headers=headers, json=request_body)
    for asset in results.json()["assets"]["items"]:
        if asset["type"] == "IMAGE":
            ids[asset["id"]] = asset.get("originalFileName")

    for immich_asset_id, immich_filename in ids.items():
        try:
            response = httpx.get(
                f"{IMMICH_URL}/api/assets/{immich_asset_id}/original", headers=headers
            )

            path = os.path.join("/Users/ryanchen/Programs/raggr", immich_filename)
            file = open(path, "wb+")
            for chunk in response.iter_bytes(chunk_size=8192):
                file.write(chunk)

            logging.info("Processing image ...")
            description = describe_simba_image(path)

            image_description = description.description
            image_date = description.image_date

            description_filepath = os.path.join(
                "/Users/ryanchen/Programs/raggr", "SIMBA_DESCRIBE_001.txt"
            )
            file = open(description_filepath, "w+")
            file.write(image_description)
            file.close()

            file = open(description_filepath, "rb")
            ppngx.upload_description(
                description_filepath=description_filepath,
                file=file,
                title="SIMBA_DESCRIBE_001.txt",
                exif_date=image_date,
            )
            file.close()

            c.execute("INSERT INTO visited (immich_id) values (?)", (immich_asset_id,))
            conn.commit()
            logging.info("Processing complete. Deleting file.")
            os.remove(file.name)
        except Exception as e:
            logging.info(f"something went wrong for {immich_filename}")
            logging.info(e)

    conn.close()
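The script reads from a visited table before it ever inserts into it, so the table has to exist up front. A one-time setup sketch; the column name comes from the SELECT/INSERT statements above, while the exact column type is an assumption:

```python
# One-time setup for visited.db (schema inferred from index_immich.py).
import sqlite3

with sqlite3.connect("./visited.db") as conn:
    conn.execute(
        "CREATE TABLE IF NOT EXISTS visited "
        "(id INTEGER PRIMARY KEY AUTOINCREMENT, immich_id TEXT)"  # type assumed
    )
```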
llm.py (new file, 64 lines)
@@ -0,0 +1,64 @@
import os

from ollama import Client
from openai import OpenAI

import logging

logging.basicConfig(level=logging.INFO)


class LLMClient:
    def __init__(self):
        try:
            self.ollama_client = Client(
                host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
            )
            self.ollama_client.chat(
                model="gemma3:4b", messages=[{"role": "system", "content": "test"}]
            )
            self.PROVIDER = "ollama"
            logging.info("Using Ollama as LLM backend")
        except Exception as e:
            print(e)
            self.openai_client = OpenAI()
            self.PROVIDER = "openai"
            logging.info("Using OpenAI as LLM backend")

    def chat(
        self,
        prompt: str,
        system_prompt: str,
    ):
        if self.PROVIDER == "ollama":
            response = self.ollama_client.chat(
                model="gemma3:4b",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt,
                    },
                    {"role": "user", "content": prompt},
                ],
            )
            print(response)
            output = response.message.content
        elif self.PROVIDER == "openai":
            response = self.openai_client.responses.create(
                model="gpt-4o-mini",
                input=[
                    {
                        "role": "system",
                        "content": system_prompt,
                    },
                    {"role": "user", "content": prompt},
                ],
            )
            output = response.output_text

        return output


if __name__ == "__main__":
    client = Client()
    client.chat(model="gemma3:4b", messages=[{"role": "system", "content": "hack"}])
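Typical use of the fallback client, assuming either a reachable Ollama server (OLLAMA_URL) or an OPENAI_API_KEY in the environment; the prompts are illustrative:

```python
from llm import LLMClient

client = LLMClient()  # probes Ollama first, falls back to OpenAI
answer = client.chat(
    prompt="Summarize Simba's most recent vet visit.",  # hypothetical prompt
    system_prompt="You are a helpful assistant that understands veterinary terms.",
)
print(answer)
```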
main.py (173 changed lines)
@@ -1,7 +1,7 @@
 import datetime
 import logging
 import os
-from typing import Any, Union
+import sqlite3

 import argparse
 import chromadb
@@ -10,15 +10,22 @@ import ollama

 from request import PaperlessNGXService
 from chunker import Chunker
-from query import QueryGenerator
 from cleaner import pdf_to_image, summarize_pdf_image
+from llm import LLMClient
+from query import QueryGenerator


 from dotenv import load_dotenv

-load_dotenv()
+_dotenv_loaded = load_dotenv()

+# Configure ollama client with URL from environment or default to localhost
+ollama_client = ollama.Client(
+    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
+)

 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
-simba_docs = client.get_or_create_collection(name="simba_docs")
+simba_docs = client.get_or_create_collection(name="simba_docs2")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")

 parser = argparse.ArgumentParser(
@@ -29,25 +36,29 @@ parser.add_argument("query", type=str, help="questions about simba's health")
 parser.add_argument(
     "--reindex", action="store_true", help="re-index the simba documents"
 )
+parser.add_argument("--index", help="index a file")

 ppngx = PaperlessNGXService()

+llm_client = LLMClient()


-def index_using_pdf_llm():
+def index_using_pdf_llm(doctypes):
+    logging.info("reindex data...")
     files = ppngx.get_data()
     for file in files:
-        document_id = file["id"]
+        document_id: int = file["id"]
         pdf_path = ppngx.download_pdf_from_id(id=document_id)
         image_paths = pdf_to_image(filepath=pdf_path)
+        logging.info(f"summarizing {file}")
         generated_summary = summarize_pdf_image(filepaths=image_paths)
         file["content"] = generated_summary

-    chunk_data(files, simba_docs)
+    chunk_data(files, simba_docs, doctypes=doctypes)


 def date_to_epoch(date_str: str) -> float:
     split_date = date_str.split("-")
-    print(split_date)
     date = datetime.datetime(
         int(split_date[0]),
         int(split_date[1]),
@@ -60,17 +71,42 @@ def date_to_epoch(date_str: str) -> float:
     return date.timestamp()


-def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):
+def chunk_data(docs, collection, doctypes):
     # Step 2: Create chunks
     chunker = Chunker(collection)

-    print(f"chunking {len(docs)} documents")
-    print(docs)
+    logging.info(f"chunking {len(docs)} documents")
     texts: list[str] = [doc["content"] for doc in docs]
+    with sqlite3.connect("visited.db") as conn:
+        to_insert = []
+        c = conn.cursor()
+        for index, text in enumerate(texts):
+            metadata = {
+                "created_date": date_to_epoch(docs[index]["created_date"]),
+                "filename": docs[index]["original_file_name"],
+                "document_type": doctypes.get(docs[index]["document_type"], ""),
+            }

+            if doctypes:
+                metadata["type"] = doctypes.get(docs[index]["document_type"])

+            chunker.chunk_document(
+                document=text,
+                metadata=metadata,
+            )
+            to_insert.append((docs[index]["id"],))

+        c.executemany(
+            "INSERT INTO indexed_documents (paperless_id) values (?)", to_insert
+        )
+        conn.commit()


+def chunk_text(texts: list[str], collection):
+    chunker = Chunker(collection)

+    for index, text in enumerate(texts):
+        metadata = {}
         chunker.chunk_document(
             document=text,
             metadata=metadata,
@@ -78,26 +114,54 @@ def chunk_data(docs: list[dict[str, Union[str, Any]]], collection):


 def consult_oracle(input: str, collection):
+    import time

+    chunker = Chunker(collection)

+    start_time = time.time()

     # Ask
+    logging.info("Starting query generation")
+    qg_start = time.time()
     qg = QueryGenerator()
-    metadata_filter = qg.get_query("input")
-    print(metadata_filter)
-    embeddings = Chunker.embedding_fx(input=[input])
+    doctype_query = qg.get_doctype_query(input=input)
+    # metadata_filter = qg.get_query(input)
+    metadata_filter = {**doctype_query}
+    logging.info(metadata_filter)
+    qg_end = time.time()
+    logging.info(f"Query generation took {qg_end - qg_start:.2f} seconds")

+    logging.info("Starting embedding generation")
+    embedding_start = time.time()
+    embeddings = chunker.embedding_fx(inputs=[input])
+    embedding_end = time.time()
+    logging.info(
+        f"Embedding generation took {embedding_end - embedding_start:.2f} seconds"
+    )

+    logging.info("Starting collection query")
+    query_start = time.time()
     results = collection.query(
         query_texts=[input],
         query_embeddings=embeddings,
         where=metadata_filter,
     )
+    query_end = time.time()
-    print(results)
+    logging.info(f"Collection query took {query_end - query_start:.2f} seconds")

     # Generate
-    output = ollama.generate(
-        model="gemma3n:e4b",
-        prompt=f"You are a helpful assistant that understandings veterinary terms. Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}",
-    )
+    logging.info("Starting LLM generation")
+    llm_start = time.time()
+    system_prompt = "You are a helpful assistant that understands veterinary terms."
+    prompt = f"Using the following data, help answer the user's query by providing as many details as possible. Using this data: {results}. Respond to this prompt: {input}"
+    output = llm_client.chat(prompt=prompt, system_prompt=system_prompt)
+    llm_end = time.time()
+    logging.info(f"LLM generation took {llm_end - llm_start:.2f} seconds")

-    print(output["response"])
+    total_time = time.time() - start_time
+    logging.info(f"Total consult_oracle execution took {total_time:.2f} seconds")

+    return output


 def paperless_workflow(input):
@@ -109,24 +173,71 @@ def paperless_workflow(input):
     consult_oracle(input, simba_docs)


+def consult_simba_oracle(input: str):
+    return consult_oracle(
+        input=input,
+        collection=simba_docs,
+    )


+def filter_indexed_files(docs):
+    with sqlite3.connect("visited.db") as conn:
+        c = conn.cursor()
+        c.execute(
+            "CREATE TABLE IF NOT EXISTS indexed_documents (id INTEGER PRIMARY KEY AUTOINCREMENT, paperless_id INTEGER)"
+        )
+        c.execute("SELECT paperless_id FROM indexed_documents")
+        rows = c.fetchall()
+        conn.commit()

+    visited = {row[0] for row in rows}
+    return [doc for doc in docs if doc["id"] not in visited]


 if __name__ == "__main__":
     args = parser.parse_args()
     if args.reindex:
-        # logging.info(msg="Fetching documents from Paperless-NGX")
-        # ppngx = PaperlessNGXService()
-        # docs = ppngx.get_data()
-        # logging.info(msg=f"Fetched {len(docs)} documents")
-        #
-        # logging.info(msg="Chunking documents now ...")
-        # chunk_data(docs, collection=simba_docs)
-        # logging.info(msg="Done chunking documents")
-        index_using_pdf_llm()
+        with sqlite3.connect("./visited.db") as conn:
+            c = conn.cursor()
+            c.execute("DELETE FROM indexed_documents")

+        logging.info("Fetching documents from Paperless-NGX")
+        ppngx = PaperlessNGXService()
+        docs = ppngx.get_data()
+        docs = filter_indexed_files(docs)
+        logging.info(f"Fetched {len(docs)} documents")

+        # Delete all chromadb data
+        ids = simba_docs.get(ids=None, limit=None, offset=0)
+        all_ids = ids["ids"]
+        if len(all_ids) > 0:
+            simba_docs.delete(ids=all_ids)

+        # Chunk documents
+        logging.info("Chunking documents now ...")
+        tag_lookup = ppngx.get_tags()
+        doctype_lookup = ppngx.get_doctypes()
+        chunk_data(docs, collection=simba_docs, doctypes=doctype_lookup)
+        logging.info("Done chunking documents")

+    # if args.index:
+    #     with open(args.index) as file:
+    #         extension = args.index.split(".")[-1]
+    #         if extension == "pdf":
+    #             pdf_path = ppngx.download_pdf_from_id(id=document_id)
+    #             image_paths = pdf_to_image(filepath=pdf_path)
+    #             print(f"summarizing {file}")
+    #             generated_summary = summarize_pdf_image(filepaths=image_paths)
+    #         elif extension in [".md", ".txt"]:
+    #             chunk_text(texts=[file.readall()], collection=simba_docs)

     if args.query:
         logging.info("Consulting oracle ...")
+        print(
             consult_oracle(
                 input=args.query,
                 collection=simba_docs,
             )
+        )
     else:
-        print("please provide a query")
+        logging.info("please provide a query")
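The new indexed_documents table is what makes reindexing incremental: filter_indexed_files skips any Paperless-NGX document whose id is already recorded. A small sketch for inspecting that bookkeeping, using the schema from the CREATE TABLE above:

```python
# List the Paperless-NGX ids that have already been chunked and indexed.
import sqlite3

with sqlite3.connect("visited.db") as conn:
    c = conn.cursor()
    c.execute("SELECT paperless_id FROM indexed_documents ORDER BY paperless_id")
    print([row[0] for row in c.fetchall()])
```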
pyproject.toml (changed lines)
@@ -4,4 +4,24 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = []
+dependencies = [
+    "chromadb>=1.1.0",
+    "python-dotenv>=1.0.0",
+    "flask>=3.1.2",
+    "httpx>=0.28.1",
+    "ollama>=0.6.0",
+    "openai>=2.0.1",
+    "pydantic>=2.11.9",
+    "pillow>=10.0.0",
+    "pymupdf>=1.24.0",
+    "black>=25.9.0",
+    "pillow-heif>=1.1.1",
+    "flask-jwt-extended>=4.7.1",
+    "bcrypt>=5.0.0",
+    "pony>=0.7.19",
+    "flask-login>=0.6.3",
+    "quart>=0.20.0",
+    "tortoise-orm>=0.25.1",
+    "quart-jwt-extended>=0.1.0",
+    "pre-commit>=4.3.0",
+]
query.py (108 changed lines)
@@ -1,10 +1,18 @@
 import json
+import os
 from typing import Literal
 import datetime
-from ollama import chat, ChatResponse
+from ollama import Client

+from openai import OpenAI

 from pydantic import BaseModel, Field

+# Configure ollama client with URL from environment or default to localhost
+ollama_client = Client(
+    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
+)

 # This uses inferred filters — which means using LLM to create the metadata filters

@@ -28,11 +36,30 @@ class GeneratedQuery(BaseModel):
     extracted_metadata_fields: str


+class Time(BaseModel):
+    time: int


+DOCTYPE_OPTIONS = [
+    "Bill",
+    "Image Description",
+    "Insurance",
+    "Medical Record",
+    "Documentation",
+    "Letter",
+]


+class DocumentType(BaseModel):
+    type: list[str] = Field(description="type of document", enum=DOCTYPE_OPTIONS)


 PROMPT = """
 You are an information specialist that processes user queries. The current year is 2025. The user queries are all about
 a cat, Simba, and its records. The types of records are listed below. Using the query, extract the
-the date range the user is trying to query. You should return the it as a JSON. The date tag is created_date. Return the date in epoch time
+the date range the user is trying to query. You should return it as a JSON. The date tag is created_date. Return the date in epoch time.

+If the created_date cannot be ascertained, set it to epoch time start.

 You have several operators at your disposal:
 - $gt: greater than
@@ -72,6 +99,19 @@ Only return the extracted metadata fields. Make sure the extracted metadata fiel
 """


+DOCTYPE_PROMPT = f"""You are an information specialist that processes user queries. A query can have two tags attached from the following options. Based on the query, determine which of the following options is most appropriate: {",".join(DOCTYPE_OPTIONS)}

+### Example 1
+Query: "Who is Simba's current vet?"
+Tags: ["Bill", "Medical Record"]


+### Example 2
+Query: "Who does Simba know?"
+Tags: ["Letter", "Documentation"]
+"""


 class QueryGenerator:
     def __init__(self) -> None:
         pass
@@ -89,30 +129,66 @@ class QueryGenerator:

         return date.timestamp()

-    def get_query(self, input: str):
+    def get_doctype_query(self, input: str):
-        response: ChatResponse = chat(
+        client = OpenAI()
-            model="gemma3n:e4b",
+        response = client.chat.completions.create(
             messages=[
+                {
+                    "role": "system",
+                    "content": "You are an information specialist that is really good at deciding what tags a query should have",
+                },
+                {"role": "user", "content": DOCTYPE_PROMPT + " " + input},
+            ],
+            model="gpt-4o",
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "document_type",
+                    "schema": DocumentType.model_json_schema(),
+                },
+            },
+        )

+        response_json_str = response.choices[0].message.content
+        type_data = json.loads(response_json_str)
+        metadata_query = {"document_type": {"$in": type_data["type"]}}
+        return metadata_query

+    def get_query(self, input: str):
+        client = OpenAI()
+        response = client.responses.parse(
+            model="gpt-4o",
+            input=[
                 {"role": "system", "content": PROMPT},
                 {"role": "user", "content": input},
             ],
-            format=GeneratedQuery.model_json_schema(),
+            text_format=GeneratedQuery,
         )
+        print(response.output)
+        query = json.loads(response.output_parsed.extracted_metadata_fields)
+        # response: ChatResponse = ollama_client.chat(
+        #     model="gemma3n:e4b",
+        #     messages=[
+        #         {"role": "system", "content": PROMPT},
+        #         {"role": "user", "content": input},
+        #     ],
+        #     format=GeneratedQuery.model_json_schema(),
+        # )

-        query = json.loads(
+        # query = json.loads(
-            json.loads(response["message"]["content"])["extracted_metadata_fields"]
+        #     json.loads(response["message"]["content"])["extracted_metadata_fields"]
-        )
+        # )
-        date_key = list(query["created_date"].keys())[0]
+        # date_key = list(query["created_date"].keys())[0]
-        query["created_date"][date_key] = self.date_to_epoch(
+        # query["created_date"][date_key] = self.date_to_epoch(
-            query["created_date"][date_key]
+        #     query["created_date"][date_key]
-        )
+        # )

-        if "$" not in date_key:
+        # if "$" not in date_key:
-            query["created_date"]["$" + date_key] = query["created_date"][date_key]
+        #     query["created_date"]["$" + date_key] = query["created_date"][date_key]

         return query


 if __name__ == "__main__":
     qg = QueryGenerator()
-    print(qg.get_query("How heavy is Simba?"))
+    print(qg.get_doctype_query("How heavy is Simba?"))
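For a query like "How heavy is Simba?", get_doctype_query returns a Chroma where-filter of the shape below, which consult_oracle in main.py passes straight to collection.query; the specific tags are hypothetical since they depend on the model's answer:

```python
# Hypothetical return value of qg.get_doctype_query("How heavy is Simba?"):
metadata_filter = {"document_type": {"$in": ["Medical Record", "Bill"]}}

# consult_oracle then uses it as the metadata filter:
# collection.query(query_texts=[...], query_embeddings=..., where=metadata_filter)
```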
raggr-frontend/.gitignore (new file, 16 lines, vendored)
@@ -0,0 +1,16 @@
# Local
.DS_Store
*.local
*.log*

# Dist
node_modules
dist/

# Profile
.rspack-profile-*/

# IDE
.vscode/*
!.vscode/extensions.json
.idea
raggr-frontend/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Rsbuild project

## Setup

Install the dependencies:

```bash
pnpm install
```

## Get started

Start the dev server, and the app will be available at [http://localhost:3000](http://localhost:3000).

```bash
pnpm dev
```

Build the app for production:

```bash
pnpm build
```

Preview the production build locally:

```bash
pnpm preview
```

## Learn more

To learn more about Rsbuild, check out the following resources:

- [Rsbuild documentation](https://rsbuild.rs) - explore Rsbuild features and APIs.
- [Rsbuild GitHub repository](https://github.com/web-infra-dev/rsbuild) - your feedback and contributions are welcome!
raggr-frontend/package.json (new file, 26 lines)
@@ -0,0 +1,26 @@
{
  "name": "raggr-frontend",
  "version": "1.0.0",
  "private": true,
  "type": "module",
  "scripts": {
    "build": "rsbuild build",
    "dev": "rsbuild dev --open",
    "preview": "rsbuild preview"
  },
  "dependencies": {
    "axios": "^1.12.2",
    "marked": "^16.3.0",
    "react": "^19.1.1",
    "react-dom": "^19.1.1",
    "react-markdown": "^10.1.0"
  },
  "devDependencies": {
    "@rsbuild/core": "^1.5.6",
    "@rsbuild/plugin-react": "^1.4.0",
    "@tailwindcss/postcss": "^4.0.0",
    "@types/react": "^19.1.13",
    "@types/react-dom": "^19.1.9",
    "typescript": "^5.9.2"
  }
}
raggr-frontend/postcss.config.mjs (new file, 5 lines)
@@ -0,0 +1,5 @@
export default {
  plugins: {
    "@tailwindcss/postcss": {},
  },
};
raggr-frontend/rsbuild.config.ts (new file, 6 lines)
@@ -0,0 +1,6 @@
import { defineConfig } from '@rsbuild/core';
import { pluginReact } from '@rsbuild/plugin-react';

export default defineConfig({
  plugins: [pluginReact()],
});
raggr-frontend/src/.App.tsx.swp (new binary file; not shown)
raggr-frontend/src/App.css (new file, 6 lines)
@@ -0,0 +1,6 @@
@import "tailwindcss";

body {
  margin: 0;
  font-family: Inter, Avenir, Helvetica, Arial, sans-serif;
}
raggr-frontend/src/App.tsx (new file, 204 lines)
@@ -0,0 +1,204 @@
import { useEffect, useState } from "react";
import axios from "axios";
import ReactMarkdown from "react-markdown";

import "./App.css";

type QuestionAnswer = {
  question: string;
  answer: string;
};

type QuestionBubbleProps = {
  text: string;
};

type AnswerBubbleProps = {
  text: string;
  loading: string;
};

type QuestionAnswerPairProps = {
  question: string;
  answer: string;
  loading: boolean;
};

type Conversation = {
  title: string;
  id: string;
};

type Message = {
  text: string;
  speaker: "simba" | "user";
};

type ConversationMenuProps = {
  conversations: Conversation[];
};

const ConversationMenu = ({ conversations }: ConversationMenuProps) => {
  return (
    <div className="absolute bg-white w-md rounded-md shadow-xl m-4 p-4">
      <p className="py-2 px-4 rounded-md w-full text-xl font-bold">askSimba!</p>
      {conversations.map((conversation) => (
        <p className="py-2 px-4 rounded-md hover:bg-stone-200 w-full text-xl font-bold cursor-pointer">
          {conversation.title}
        </p>
      ))}
    </div>
  );
};

const QuestionBubble = ({ text }: QuestionBubbleProps) => {
  return <div className="rounded-md bg-stone-200 p-3">🤦: {text}</div>;
};

const AnswerBubble = ({ text, loading }: AnswerBubbleProps) => {
  return (
    <div className="rounded-md bg-orange-100 p-3">
      {loading ? (
        <div className="flex flex-col w-full animate-pulse gap-2">
          <div className="flex flex-row gap-2 w-full">
            <div className="bg-gray-400 w-1/2 p-3 rounded-lg" />
            <div className="bg-gray-400 w-1/2 p-3 rounded-lg" />
          </div>
          <div className="flex flex-row gap-2 w-full">
            <div className="bg-gray-400 w-1/3 p-3 rounded-lg" />
            <div className="bg-gray-400 w-2/3 p-3 rounded-lg" />
          </div>
        </div>
      ) : (
        <div className="flex flex-col">
          <ReactMarkdown>{"🐈: " + text}</ReactMarkdown>
        </div>
      )}
    </div>
  );
};

const QuestionAnswerPair = ({
  question,
  answer,
  loading,
}: QuestionAnswerPairProps) => {
  return (
    <div className="flex flex-col gap-4">
      <QuestionBubble text={question} />
      <AnswerBubble text={answer} loading={loading} />
    </div>
  );
};

const App = () => {
  const [query, setQuery] = useState<string>("");
  const [answer, setAnswer] = useState<string>("");
  const [simbaMode, setSimbaMode] = useState<boolean>(false);
  const [questionsAnswers, setQuestionsAnswers] = useState<QuestionAnswer[]>(
    [],
  );
  const [messages, setMessages] = useState<Message[]>([]);
  const [conversations, setConversations] = useState<Conversation[]>([
    { title: "simba meow meow", id: "uuid" },
  ]);

  const simbaAnswers = ["meow.", "hiss...", "purrrrrr", "yowOWROWWowowr"];

  useEffect(() => {
    axios.get("/api/messages").then((result) => {
      setMessages(
        result.data.messages.map((message) => {
          return {
            text: message.text,
            speaker: message.speaker,
          };
        }),
      );
    });
  }, []);

  const handleQuestionSubmit = () => {
    let currMessages = messages.concat([{ text: query, speaker: "user" }]);
    setMessages(currMessages);
    if (simbaMode) {
      console.log("simba mode activated");
      const randomIndex = Math.floor(Math.random() * simbaAnswers.length);
      const randomElement = simbaAnswers[randomIndex];
      setAnswer(randomElement);
      setQuestionsAnswers(
        questionsAnswers.concat([
          {
            question: query,
            answer: randomElement,
          },
        ]),
      );
      return;
    }
    const payload = { query: query };
    axios.post("/api/query", payload).then((result) => {
      setQuestionsAnswers(
        questionsAnswers.concat([
          { question: query, answer: result.data.response },
        ]),
      );
      setMessages(
        currMessages.concat([{ text: result.data.response, speaker: "simba" }]),
      );
    });
  };
  const handleQueryChange = (event) => {
    setQuery(event.target.value);
  };
  return (
    <div className="h-screen bg-opacity-20">
      <div className="bg-white/85 h-screen">
        <div className="flex flex-row justify-center py-4">
          <div className="flex flex-col gap-4 min-w-xl max-w-xl">
            <header className="flex flex-row justify-center gap-2 grow sticky top-0 z-10 bg-white">
              <h1 className="text-3xl">ask simba!</h1>
            </header>
            {/*{questionsAnswers.map((qa) => (
              <QuestionAnswerPair question={qa.question} answer={qa.answer} />
            ))}*/}
            {messages.map((msg) => {
              if (msg.speaker == "simba") {
                return <AnswerBubble text={msg.text} loading="" />;
              }

              return <QuestionBubble text={msg.text} />;
            })}
            <footer className="flex flex-col gap-2 sticky bottom-0">
              <div className="flex flex-row justify-between gap-2 grow">
                <textarea
                  type="text"
                  className="p-4 border border-blue-200 rounded-md grow bg-white"
                  onChange={handleQueryChange}
                />
              </div>
              <div className="flex flex-row justify-between gap-2 grow">
                <button
                  className="p-4 border border-blue-400 bg-blue-200 hover:bg-blue-400 cursor-pointer rounded-md flex-grow"
                  onClick={() => handleQuestionSubmit()}
                  type="submit"
                >
                  Submit
                </button>
              </div>
              <div className="flex flex-row justify-center gap-2 grow">
                <input
                  type="checkbox"
                  onChange={(event) => setSimbaMode(event.target.checked)}
                />
                <p>simba mode?</p>
              </div>
            </footer>
          </div>
        </div>
      </div>
    </div>
  );
};

export default App;
raggr-frontend/src/env.d.ts (new file, 11 lines, vendored)
@@ -0,0 +1,11 @@
/// <reference types="@rsbuild/core/types" />

/**
 * Imports the SVG file as a React component.
 * @requires [@rsbuild/plugin-svgr](https://npmjs.com/package/@rsbuild/plugin-svgr)
 */
declare module '*.svg?react' {
  import type React from 'react';
  const ReactComponent: React.FunctionComponent<React.SVGProps<SVGSVGElement>>;
  export default ReactComponent;
}
raggr-frontend/src/index.tsx (new file, 13 lines)
@@ -0,0 +1,13 @@
import React from 'react';
import ReactDOM from 'react-dom/client';
import App from './App';

const rootEl = document.getElementById('root');
if (rootEl) {
  const root = ReactDOM.createRoot(rootEl);
  root.render(
    <React.StrictMode>
      <App />
    </React.StrictMode>,
  );
}
raggr-frontend/src/simba_cute.jpeg (new binary file, 3.4 MiB; not shown)
raggr-frontend/src/simba_troll.jpeg (new binary file, 2.1 MiB; not shown)
raggr-frontend/tsconfig.json (new file, 25 lines)
@@ -0,0 +1,25 @@
{
  "compilerOptions": {
    "lib": ["DOM", "ES2020"],
    "jsx": "react-jsx",
    "target": "ES2020",
    "noEmit": true,
    "skipLibCheck": true,
    "useDefineForClassFields": true,

    /* modules */
    "module": "ESNext",
    "moduleDetection": "force",
    "moduleResolution": "bundler",
    "verbatimModuleSyntax": true,
    "resolveJsonModule": true,
    "allowImportingTsExtensions": true,
    "noUncheckedSideEffectImports": true,

    /* type checking */
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true
  },
  "include": ["src"]
}
raggr-frontend/yarn.lock (new file, 1424 lines; diff suppressed because it is too large)
request.py (49 changed lines)
@@ -1,31 +1,43 @@
 import os
 import tempfile
 import httpx
+import logging

 from dotenv import load_dotenv

 load_dotenv()

+logging.basicConfig(level=logging.INFO)


 class PaperlessNGXService:
     def __init__(self):
         self.base_url = os.getenv("BASE_URL")
         self.token = os.getenv("PAPERLESS_TOKEN")
-        self.url = f"http://{os.getenv("BASE_URL")}/api/documents/?query=simba"
+        self.url = f"http://{os.getenv('BASE_URL')}/api/documents/?tags__id=8"
-        self.headers = {"Authorization": f"Token {os.getenv("PAPERLESS_TOKEN")}"}
+        self.headers = {"Authorization": f"Token {os.getenv('PAPERLESS_TOKEN')}"}

     def get_data(self):
         print(f"Getting data from: {self.url}")
         r = httpx.get(self.url, headers=self.headers)
-        return r.json()["results"]
+        results = r.json()["results"]

+        nextLink = r.json().get("next")

+        while nextLink:
+            r = httpx.get(nextLink, headers=self.headers)
+            results += r.json()["results"]
+            nextLink = r.json().get("next")

+        return results

     def get_doc_by_id(self, doc_id: int):
-        url = f"http://{os.getenv("BASE_URL")}/api/documents/{doc_id}/"
+        url = f"http://{os.getenv('BASE_URL')}/api/documents/{doc_id}/"
         r = httpx.get(url, headers=self.headers)
         return r.json()

     def download_pdf_from_id(self, id: int) -> str:
-        download_url = f"http://{os.getenv("BASE_URL")}/api/documents/{id}/download/"
+        download_url = f"http://{os.getenv('BASE_URL')}/api/documents/{id}/download/"
         response = httpx.get(
             download_url, headers=self.headers, follow_redirects=True, timeout=30
         )
@@ -39,10 +51,35 @@ class PaperlessNGXService:
         return pdf_to_process

     def upload_cleaned_content(self, document_id, data):
-        PUTS_URL = f"http://{os.getenv("BASE_URL")}/api/documents/{document_id}/"
+        PUTS_URL = f"http://{os.getenv('BASE_URL')}/api/documents/{document_id}/"
         r = httpx.put(PUTS_URL, headers=self.headers, data=data)
         r.raise_for_status()

+    def upload_description(self, description_filepath, file, title, exif_date: str):
+        POST_URL = f"http://{os.getenv('BASE_URL')}/api/documents/post_document/"
+        files = {"document": ("description_filepath", file, "application/txt")}
+        data = {
+            "title": title,
+            "create": exif_date,
+            "document_type": 3,
+            "tags": [7],
+        }

+        r = httpx.post(POST_URL, headers=self.headers, data=data, files=files)
+        r.raise_for_status()

+    def get_tags(self):
+        GET_URL = f"http://{os.getenv('BASE_URL')}/api/tags/"
+        r = httpx.get(GET_URL, headers=self.headers)
+        data = r.json()
+        return {tag["id"]: tag["name"] for tag in data["results"]}

+    def get_doctypes(self):
+        GET_URL = f"http://{os.getenv('BASE_URL')}/api/document_types/"
+        r = httpx.get(GET_URL, headers=self.headers)
+        data = r.json()
+        return {doctype["id"]: doctype["name"] for doctype in data["results"]}


 if __name__ == "__main__":
     pp = PaperlessNGXService()
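A quick sanity check for the new paginated fetch and the two lookup helpers, assuming BASE_URL and PAPERLESS_TOKEN are set in .env:

```python
from request import PaperlessNGXService

ppngx = PaperlessNGXService()
docs = ppngx.get_data()  # follows "next" links until the listing is exhausted
print(f"{len(docs)} documents fetched")
print(ppngx.get_doctypes())  # e.g. {1: "Bill", 2: "Medical Record"} (illustrative)
print(ppngx.get_tags())
```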
startup.sh (new file, 7 lines)
@@ -0,0 +1,7 @@
#!/bin/bash

echo "Starting reindex process..."
python main.py "" --reindex

echo "Starting Flask application..."
python app.py