initial commit

2025-07-25 23:08:16 -04:00
commit 2bc498f8c0
5 changed files with 122 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,13 @@
 # Python-generated files
 __pycache__/
 *.py[oc]
 build/
 dist/
 wheels/
 *.egg-info
 # Virtual environments
 .venv
 # Local environment file (read by python-dotenv's load_dotenv)
 .env
--- a/README.md
+++ b/README.md
--- a/main.py
+++ b/main.py
@@ -0,0 +1,83 @@
 import ollama
 from uuid import uuid4, UUID
 from request import PaperlessNGXService
 from math import ceil
 import chromadb
 # Ephemeral Chroma client and its "docs" collection, both created at import
 # time. Chunker.chunk_document adds text to this collection and the script
 # section further down queries it.
 client = chromadb.EphemeralClient()
 collection = client.create_collection(name="docs")
 class Chunk:
    """Value holder for one piece of a document and its identifying metadata.

    Every constructor argument is stored unchanged on the attribute of the
    same name; nothing is validated or derived.
    """
    def __init__(
        self,
        text: str,
        size: int,
        document_id: UUID,
        chunk_id: int,
        embedding,
    ):
        # Content of the piece and the size value supplied by the caller.
        self.text, self.size = text, size
        # Which document the piece belongs to and its index within it.
        self.document_id, self.chunk_id = document_id, chunk_id
        # Stored as given; the constructor places no constraint on its type.
        self.embedding = embedding
 class Chunker:
    """Splits document text into fixed-size pieces and indexes them in ``collection``."""
    def __init__(self) -> None:
        pass
    @staticmethod
    def _split_text(document: str, chunk_size: int) -> list[str]:
        """Return consecutive ``chunk_size``-character slices of ``document``.

        The last slice may be shorter; an empty document yields an empty list.
        """
        return [
            document[start : start + chunk_size]
            for start in range(0, len(document), chunk_size)
        ]
    def chunk_document(self, document: str, chunk_size: int = 300) -> "list[Chunk]":
        """Split ``document`` into pieces, index each one, and return them.

        Every piece is added to the module-level ``collection`` under the id
        ``"<document uuid>:<chunk index>"``, where the uuid is freshly
        generated for this call.

        Args:
            document: Full text to split.
            chunk_size: Maximum number of characters per piece; must be
                positive.

        Returns:
            The pieces in document order (empty for an empty document).
            ``embedding`` is ``None`` on each returned ``Chunk`` because this
            method does not compute one.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        doc_uuid = uuid4()
        chunks = []
        # Offsets advance by chunk_size. Stepping by the number of chunks
        # instead would index only a small prefix of a long document.
        for i, text_chunk in enumerate(self._split_text(document, chunk_size)):
            collection.add(
                ids=[f"{doc_uuid}:{i}"],
                documents=[text_chunk],
            )
            chunks.append(
                Chunk(
                    text=text_chunk,
                    size=len(text_chunk),
                    document_id=doc_uuid,
                    chunk_id=i,
                    embedding=None,
                )
            )
        return chunks
 # Setup
 # Step 1: Get the text
 # Fetch the matching documents and keep only each one's "content" field.
 ppngx = PaperlessNGXService()
 docs = ppngx.get_data()
 texts = [doc["content"] for doc in docs]
 # Step 2: Create chunks
 # chunk_document adds every chunk to the module-level collection as it goes.
 chunker = Chunker()
 print(f"chunking {len(texts)} documents")
 for text in texts:
    chunker.chunk_document(document=text)
 # Ask
 # Named ``question`` rather than ``input`` so the builtin is not shadowed.
 question = "How many teeth has Simba had removed?"
 # The query is passed as ``query_texts``, so the collection handles embedding
 # it; the separate Ollama embedding request was dropped because its result
 # was never used.
 results = collection.query(query_texts=[question], n_results=1)
 print(results)
 # Generate
 # The raw query result is interpolated into the prompt as retrieval context.
 output = ollama.generate(
    model="gemma3n:e4b",
    prompt=f"Using this data: {results}. Respond to this prompt: {question}",
 )
 print(output["response"])
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
 [project]
 name = "raggr"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = []
--- a/request.py
+++ b/request.py
@@ -0,0 +1,19 @@
 import os
 import httpx
 from dotenv import load_dotenv
 # Runs at import time, before PaperlessNGXService reads BASE_URL and
 # PAPERLESS_TOKEN via os.getenv.
 load_dotenv()
 class PaperlessNGXService:
    """Small client for the ``/api/documents/`` search endpoint.

    Connection settings come from the environment: ``BASE_URL`` (host, used
    after an ``http://`` prefix) and ``PAPERLESS_TOKEN`` (sent as a ``Token``
    authorization header).
    """
    def __init__(self, query: str = "simba"):
        """Build the request URL and headers from the environment.

        Args:
            query: Search term for the ``query`` URL parameter. Defaults to
                ``"simba"``, the value that was previously hard-coded.

        Raises:
            RuntimeError: If ``BASE_URL`` or ``PAPERLESS_TOKEN`` is unset or
                empty.
        """
        from urllib.parse import quote
        self.base_url = os.getenv("BASE_URL")
        self.token = os.getenv("PAPERLESS_TOKEN")
        # Fail early instead of building "http://None/..." or "Token None".
        if not self.base_url or not self.token:
            raise RuntimeError(
                "BASE_URL and PAPERLESS_TOKEN must be set in the environment"
            )
        # Percent-encode the term so spaces and '&' cannot corrupt the URL.
        encoded_query = quote(query, safe="")
        self.url = f"http://{self.base_url}/api/documents/?query={encoded_query}"
        self.headers = {"Authorization": f"Token {self.token}"}
    def get_data(self):
        """Fetch the search response and return its ``"results"`` entry.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        print(f"Getting data from: {self.url}")
        r = httpx.get(self.url, headers=self.headers)
        # Surface auth/server errors here rather than as a KeyError below.
        r.raise_for_status()
        # NOTE(review): only this single response's "results" is returned;
        # confirm whether the API paginates and further pages should be read.
        return r.json()["results"]