From 142fac3a845804c3476bb96d30e89f05d4375c6c Mon Sep 17 00:00:00 2001
From: Ryan Chen
Date: Sat, 4 Apr 2026 08:06:51 -0400
Subject: [PATCH] Switch image analysis from Ollama to llama-server

Use the same llama-server (OpenAI-compatible API) for vision analysis
that the main agent uses, with OpenAI fallback. Sends images as base64
in the standard OpenAI vision message format.

Co-Authored-By: Claude Opus 4.6
---
 utils/image_process.py | 61 +++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/utils/image_process.py b/utils/image_process.py
index 9bf14e8..3da666a 100644
--- a/utils/image_process.py
+++ b/utils/image_process.py
@@ -79,34 +79,45 @@ def describe_simba_image(input):
 async def analyze_user_image(file_bytes: bytes) -> str:
     """Analyze an image uploaded by a user and return a text description.
 
-    Uses Ollama vision model to describe the image contents.
-    Works with JPEG, PNG, WebP bytes (HEIC should be converted before calling).
+    Uses llama-server (OpenAI-compatible API) with vision support.
+    Falls back to OpenAI if llama-server is not configured.
     """
-    import tempfile
+    import base64
 
-    # Write to temp file since ollama client expects a file path
-    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
-        f.write(file_bytes)
-        temp_path = f.name
+    from openai import AsyncOpenAI
 
-    try:
-        response = client.chat(
-            model="gemma3:4b",
-            messages=[
-                {
-                    "role": "system",
-                    "content": "You are a helpful image analyst. Describe what you see in the image in detail. Be thorough but concise.",
-                },
-                {
-                    "role": "user",
-                    "content": "Please describe this image in detail.",
-                    "images": [temp_path],
-                },
-            ],
-        )
-        return response["message"]["content"]
-    finally:
-        os.remove(temp_path)
+    llama_url = os.getenv("LLAMA_SERVER_URL")
+    if llama_url:
+        aclient = AsyncOpenAI(base_url=llama_url, api_key="not-needed")
+        model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
+    else:
+        aclient = AsyncOpenAI()
+        model = "gpt-4o-mini"
+
+    b64 = base64.b64encode(file_bytes).decode("utf-8")
+
+    response = await aclient.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful image analyst. Describe what you see in the image in detail. Be thorough but concise.",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Please describe this image in detail."},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{b64}",
+                        },
+                    },
+                ],
+            },
+        ],
+    )
+    return response.choices[0].message.content
 
 
 if __name__ == "__main__":