From 2bc498f8c0c402c8b5b451f783f8b2f179fb8a13 Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Fri, 25 Jul 2025 23:08:16 -0400
Subject: [PATCH] initial commit

---
 .gitignore     | 13 ++++++++
 README.md      |  0
 main.py        | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml |  7 +++++
 request.py     | 19 ++++++++++++
 5 files changed, 133 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 main.py
 create mode 100644 pyproject.toml
 create mode 100644 request.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..edf0ab1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+
+.env
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..ff890b2
--- /dev/null
+++ b/main.py
@@ -0,0 +1,94 @@
+import ollama
+from uuid import uuid4, UUID
+
+from request import PaperlessNGXService
+
+from math import ceil
+
+import chromadb
+
+client = chromadb.EphemeralClient()
+collection = client.create_collection(name="docs")
+
+
+class Chunk:
+    def __init__(
+        self,
+        text: str,
+        size: int,
+        document_id: UUID,
+        chunk_id: int,
+        embedding,
+    ):
+        self.text = text
+        self.size = size
+        self.document_id = document_id
+        self.chunk_id = chunk_id
+        self.embedding = embedding
+
+
+class Chunker:
+    def __init__(self) -> None:
+        pass
+
+    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
+        doc_uuid = uuid4()
+
+        chunks = []
+        num_chunks = ceil(len(document) / chunk_size)
+        document_length = len(document)
+
+        for i in range(num_chunks):
+            # Step through the document chunk_size characters at a time
+            curr_pos = i * chunk_size
+            to_pos = min(curr_pos + chunk_size, document_length)
+            text_chunk = document[curr_pos:to_pos]
+
+            # Embed the chunk with the same model used for queries below
+            embedded = ollama.embed(model="mxbai-embed-large", input=text_chunk)
+
+            collection.add(
+                ids=[str(doc_uuid) + ":" + str(i)],
+                documents=[text_chunk],
+                embeddings=embedded["embeddings"],
+            )
+
+            chunks.append(
+                Chunk(
+                    text=text_chunk,
+                    size=len(text_chunk),
+                    document_id=doc_uuid,
+                    chunk_id=i,
+                    embedding=embedded["embeddings"][0],
+                )
+            )
+
+        return chunks
+
+
+# Setup
+
+# Step 1: Get the text
+ppngx = PaperlessNGXService()
+docs = ppngx.get_data()
+texts = [doc["content"] for doc in docs]
+
+# Step 2: Create chunks
+chunker = Chunker()
+
+print(f"chunking {len(texts)} documents")
+for text in texts:
+    chunker.chunk_document(document=text)
+
+# Ask: embed the question and retrieve the most relevant chunk
+question = "How many teeth has Simba had removed?"
+response = ollama.embed(model="mxbai-embed-large", input=question)
+results = collection.query(query_embeddings=response["embeddings"], n_results=1)
+print(results)
+# Generate an answer grounded in the retrieved chunk
+output = ollama.generate(
+    model="gemma3n:e4b",
+    prompt=f"Using this data: {results}. Respond to this prompt: {question}",
+)
+
+print(output["response"])
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b0a5584
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "raggr"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = ["chromadb", "httpx", "ollama", "python-dotenv"]
diff --git a/request.py b/request.py
new file mode 100644
index 0000000..7770fcf
--- /dev/null
+++ b/request.py
@@ -0,0 +1,19 @@
+import os
+import httpx
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class PaperlessNGXService:
+    def __init__(self):
+        self.base_url = os.getenv("BASE_URL")
+        self.token = os.getenv("PAPERLESS_TOKEN")
+        self.url = f"http://{self.base_url}/api/documents/?query=simba"
+        self.headers = {"Authorization": f"Token {self.token}"}
+
+    def get_data(self):
+        print(f"Getting data from: {self.url}")
+        r = httpx.get(self.url, headers=self.headers)
+        return r.json()["results"]