Files
simbarag/utils/image_process.py
Ryan Chen 142fac3a84 Switch image analysis from Ollama to llama-server
Use the same llama-server (OpenAI-compatible API) for vision analysis
that the main agent uses, with OpenAI fallback. Sends images as base64
in the standard OpenAI vision message format.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-04 08:06:51 -04:00

128 lines
4.0 KiB
Python

from ollama import Client
import argparse
import os
import logging
from PIL import Image, ExifTags
from pillow_heif import register_heif_opener
from pydantic import BaseModel
from dotenv import load_dotenv
# Load variables from a .env file into the environment first, so the
# os.getenv() lookups further down can see them.
load_dotenv()
# Register the pillow_heif plugin so PIL's Image.open() can read HEIC/HEIF files.
register_heif_opener()
# Show the INFO-level progress messages logged by the functions below.
logging.basicConfig(level=logging.INFO)
# Command-line interface for running this module as a script: one required
# positional argument naming the image to describe. The help strings replace
# the placeholder text that had been copied from the argparse documentation.
parser = argparse.ArgumentParser(
    prog="SimbaImageProcessor",
    description="Describe a photo of Simba using a vision model served by Ollama.",
    epilog="HEIC/HEIF inputs are converted to JPEG before being analyzed.",
)
parser.add_argument("filepath", help="Path to the image file to describe.")
# Module-level Ollama client used by describe_simba_image(). The host is read
# from OLLAMA_HOST and falls back to http://localhost:11434 when it is unset.
client = Client(host=os.getenv("OLLAMA_HOST", "http://localhost:11434"))
# Structured reply expected from the vision model in describe_simba_image():
# this model's JSON schema is passed as the chat `format`, and the reply text
# is validated back into an instance of it.
# NOTE(review): documented with comments rather than a docstring on purpose;
# pydantic copies a model docstring into the generated JSON schema as its
# "description", which would change what is sent to the model.
class SimbaImageDescription(BaseModel):
    # Date of the photo as plain text; the prompt asks the model to fill it in.
    image_date: str
    # The model's descriptive notes about the image.
    description: str
def _extract_exif(img):
    """Return the image's EXIF entries keyed by tag name, skipping unknown tag ids."""
    return {
        ExifTags.TAGS[tag_id]: value
        for tag_id, value in img.getexif().items()
        if tag_id in ExifTags.TAGS
    }


def describe_simba_image(input):
    """Describe a photo of Simba using the Ollama vision model.

    HEIC/HEIF files are first converted to a JPEG written next to the original
    (same path, ``.jpg`` extension) and that JPEG is what gets sent to the
    model. The image's EXIF data (minus the bulky ``MakerNote`` entry) is
    embedded in the prompt to help the model date the photo.

    Args:
        input: Filesystem path of the image. (The name shadows the builtin but
            is kept because callers pass it by keyword.)

    Returns:
        A ``SimbaImageDescription`` parsed from the model's JSON reply.

    Raises:
        Whatever ``Image.open`` / ``Image.save`` raise for unreadable or
        unwritable files, and a validation error from
        ``SimbaImageDescription.model_validate_json`` if the reply does not
        match the schema.
    """
    logging.info("Opening image of Simba ...")
    image_path = input
    # Look only at the real file extension: a substring test on the whole path
    # misfires on e.g. "heic_pics/cat.jpg", and splitting on the first "."
    # mangles paths such as "./cat.heic" or "my.photos/cat.heic".
    stem, extension = os.path.splitext(image_path)

    # The context manager closes the underlying file handle once we are done.
    with Image.open(image_path) as img:
        if extension.lower() in (".heic", ".heif"):
            image_path = stem + ".jpg"
            img.save(image_path, "JPEG")
        logging.info("Extracting EXIF...")
        # EXIF is always read from the original image, before any conversion
        # could drop it.
        exif = _extract_exif(img)

    # MakerNote is a vendor-specific blob that is dropped before the EXIF data
    # is logged and embedded in the prompt.
    exif.pop("MakerNote", None)
    logging.info(exif)

    prompt = f"Simba is an orange cat belonging to Ryan Chen. In 2025, they lived in New York. In 2024, they lived in California. Analyze the following image and tell me what Simba seems to be doing. Be extremely descriptive about Simba, things in the background, and the setting of the image. I will also include the EXIF data of the image, please use it to help you determine information about Simba. EXIF: {exif}. Put the notes in the description field and the date in the image_date field."
    logging.info("Sending info to Ollama ...")
    response = client.chat(
        model="gemma3:4b",
        messages=[
            {
                "role": "system",
                "content": "you are a very shrewd and descriptive note taker. all of your responses will be formatted like notes in bullet points. be very descriptive. do not leave a single thing out.",
            },
            {"role": "user", "content": prompt, "images": [image_path]},
        ],
        # Constrain the reply to the JSON schema of the result model.
        format=SimbaImageDescription.model_json_schema(),
    )
    return SimbaImageDescription.model_validate_json(response["message"]["content"])
def _sniff_image_mime(data: bytes) -> str:
    """Guess an image MIME type from its leading magic bytes.

    Recognizes PNG, GIF and WebP; anything else is reported as ``image/jpeg``,
    which was the previously hard-coded value.
    """
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


async def analyze_user_image(file_bytes: bytes) -> str:
    """Analyze an image uploaded by a user and return a text description.

    Uses llama-server (OpenAI-compatible API) with vision support when
    ``LLAMA_SERVER_URL`` is set, with the model name taken from
    ``LLAMA_MODEL_NAME``. Falls back to OpenAI (``gpt-4o-mini``) if
    llama-server is not configured.

    Args:
        file_bytes: Raw bytes of the uploaded image file.

    Returns:
        The model's description, or an empty string if the reply carried no
        text content.
    """
    # Imported lazily so the rest of the module works without the openai package.
    import base64

    from openai import AsyncOpenAI

    llama_url = os.getenv("LLAMA_SERVER_URL")
    if llama_url:
        # The client insists on an API key, so a dummy one is passed.
        aclient = AsyncOpenAI(base_url=llama_url, api_key="not-needed")
        model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
    else:
        aclient = AsyncOpenAI()
        model = "gpt-4o-mini"

    b64 = base64.b64encode(file_bytes).decode("utf-8")
    # Label the data URL with the detected type instead of always claiming JPEG.
    mime_type = _sniff_image_mime(file_bytes)

    # A client is created per call, so close it (and its HTTP connection pool)
    # as soon as the request finishes.
    async with aclient:
        response = await aclient.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful image analyst. Describe what you see in the image in detail. Be thorough but concise.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Please describe this image in detail."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{b64}",
                            },
                        },
                    ],
                },
            ],
        )

    # `content` is optional in the response schema; honor the `-> str` contract.
    return response.choices[0].message.content or ""
if __name__ == "__main__":
    # Script entry point: describe the image named on the command line.
    args = parser.parse_args()
    # `filepath` is a required positional, so this guard only skips an
    # explicitly empty string.
    if args.filepath:
        # The original had a bare `logging.info` here, which is a no-op
        # expression; log what is being processed instead.
        logging.info("Describing image %s ...", args.filepath)
        result = describe_simba_image(input=args.filepath)
        # Surface the description; previously the result was discarded.
        logging.info(result)