initial commit

2025-07-25 23:08:16 -04:00
commit 2bc498f8c0
5 changed files with 122 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,13 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+
+.env
--- a/README.md
+++ b/README.md
--- a/main.py
+++ b/main.py
@@ -0,0 +1,83 @@
+import ollama
+from uuid import uuid4, UUID
+
+from request import PaperlessNGXService
+
+from math import ceil
+
+import chromadb
+
+client = chromadb.EphemeralClient()
+collection = client.create_collection(name="docs")
+
+
+class Chunk:
+    def __init__(
+        self,
+        text: str,
+        size: int,
+        document_id: UUID,
+        chunk_id: int,
+        embedding,
+    ):
+        self.text = text
+        self.size = size
+        self.document_id = document_id
+        self.chunk_id = chunk_id
+        self.embedding = embedding
+
+
+class Chunker:
+    def __init__(self) -> None:
+        pass
+
+    def chunk_document(self, document: str, chunk_size: int = 300) -> list[Chunk]:
+        doc_uuid = uuid4()
+
+        chunks = []
+        num_chunks = ceil(len(document) / chunk_size)
+        document_length = len(document)
+
+        for i in range(num_chunks):
+            curr_pos = i * num_chunks
+            to_pos = (
+                curr_pos + num_chunks
+                if curr_pos + num_chunks < document_length
+                else document_length
+            )
+            text_chunk = document[curr_pos:to_pos]
+
+            collection.add(
+                ids=[str(doc_uuid) + ":" + str(i)],
+                documents=[text_chunk],
+            )
+
+        return chunks
+
+
+# Setup
+
+# Step 1: Get the text
+ppngx = PaperlessNGXService()
+docs = ppngx.get_data()
+texts = [doc["content"] for doc in docs]
+
+# Step 2: Create chunks
+chunker = Chunker()
+
+print(f"chunking {len(texts)} documents")
+for text in texts:
+    chunker.chunk_document(document=text)
+
+# Ask
+input = "How many teeth has Simba had removed?"
+response = ollama.embed(model="mxbai-embed-large", input=input)
+results = collection.query(query_texts=[input], n_results=1)
+print(results)
+# Generate
+output = ollama.generate(
+    model="gemma3n:e4b",
+    prompt=f"Using this data: {results}. Respond to this prompt: {input}",
+)
+
+print(output["response"])
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "raggr"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = []
--- a/request.py
+++ b/request.py
@@ -0,0 +1,19 @@
+import os
+import httpx
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class PaperlessNGXService:
+    def __init__(self):
+        self.base_url = os.getenv("BASE_URL")
+        self.token = os.getenv("PAPERLESS_TOKEN")
+        self.url = f"http://{os.getenv("BASE_URL")}/api/documents/?query=simba"
+        self.headers = {"Authorization": f"Token {os.getenv("PAPERLESS_TOKEN")}"}
+
+    def get_data(self):
+        print(f"Getting data from: {self.url}")
+        r = httpx.get(self.url, headers=self.headers)
+        return r.json()["results"]