2 Commits

ryan | f0f72cce36 | 2026-01-31 21:41:19 -05:00
Merge pull request 'Replace Ollama with llama-server (OpenAI-compatible API)' (#14) from feature/llama-cpp-integration into main
Reviewed-on: #14

Ryan Chen | 32020a6c60 | 2026-01-31 21:39:23 -05:00
Replace Ollama with llama-server (OpenAI-compatible API)
- Update llm.py to use OpenAI client with custom base_url for llama-server
- Update agents.py to use ChatOpenAI instead of ChatOllama
- Remove unused ollama imports from main.py, chunker.py, query.py
- Add LLAMA_SERVER_URL and LLAMA_MODEL_NAME env vars
- Remove ollama and langchain-ollama dependencies

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
7 changed files with 35 additions and 71 deletions
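
At its core, the change points the standard OpenAI Python client at llama-server's /v1 endpoint instead of talking to Ollama. A minimal sketch of that pattern, using the two environment variables the commit adds (the localhost default shown here is illustrative; the repo reads the URL from LLAMA_SERVER_URL):

```python
import os

from openai import OpenAI

# llama-server speaks the OpenAI chat-completions protocol; an api_key is
# required by the client but ignored by the server.
client = OpenAI(
    base_url=os.getenv("LLAMA_SERVER_URL", "http://localhost:8080/v1"),
    api_key="not-needed",
)

response = client.chat.completions.create(
    model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
)
print(response.choices[0].message.content)
```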


@@ -14,9 +14,10 @@ JWT_SECRET_KEY=your-secret-key-here
 PAPERLESS_TOKEN=your-paperless-token
 BASE_URL=192.168.1.5:8000
-# Ollama Configuration
-OLLAMA_URL=http://192.168.1.14:11434
-OLLAMA_HOST=http://192.168.1.14:11434
+# llama-server Configuration (OpenAI-compatible API)
+# If set, uses llama-server as the primary LLM backend with OpenAI as fallback
+LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
+LLAMA_MODEL_NAME=llama-3.1-8b-instruct
 # ChromaDB Configuration
 # For Docker: This is automatically set to /app/data/chromadb
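
A quick way to confirm that LLAMA_SERVER_URL points at a live llama-server instance is to ask it for its model list over the same OpenAI-style API (a sketch, assuming the server address from the example config above):

```python
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("LLAMA_SERVER_URL", "http://192.168.1.213:8080/v1"),
    api_key="not-needed",
)

# Prints whatever model id(s) the server reports, e.g. the GGUF it loaded.
for model in client.models.list():
    print(model.id)
```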

agents.py

@@ -4,16 +4,26 @@ from typing import cast
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.tools import tool
-from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
 from tavily import AsyncTavilyClient
 from blueprints.rag.logic import query_vector_store
-openai_gpt_5_mini = ChatOpenAI(model="gpt-5-mini")
-ollama_deepseek = ChatOllama(model="llama3.1:8b", base_url=os.getenv("OLLAMA_URL"))
+# Configure LLM with llama-server or OpenAI fallback
+llama_url = os.getenv("LLAMA_SERVER_URL")
+if llama_url:
+    llama_chat = ChatOpenAI(
+        base_url=llama_url,
+        api_key="not-needed",
+        model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
+    )
+else:
+    llama_chat = None
+openai_fallback = ChatOpenAI(model="gpt-5-mini")
 model_with_fallback = cast(
-    BaseChatModel, ollama_deepseek.with_fallbacks([openai_gpt_5_mini])
+    BaseChatModel,
+    llama_chat.with_fallbacks([openai_fallback]) if llama_chat else openai_fallback,
 )
 client = AsyncTavilyClient(os.getenv("TAVILY_KEY"), "")
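
with_fallbacks wraps the primary chat model in a RunnableWithFallbacks, so a request that fails against llama-server is transparently retried against the gpt-5-mini fallback. Roughly how a caller would use the resulting model (the prompt is illustrative, not from the repo):

```python
from langchain_core.messages import HumanMessage

# model_with_fallback behaves like any LangChain chat model; if the llama-server
# call raises, the same messages are re-sent to the OpenAI fallback.
reply = model_with_fallback.invoke(
    [HumanMessage(content="Summarize the attached vet record in one sentence.")]
)
print(reply.content)
```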

llm.py

@@ -1,32 +1,25 @@
 import os
-from ollama import Client
-from openai import OpenAI
 import logging
+from openai import OpenAI
 from dotenv import load_dotenv
 load_dotenv()
 logging.basicConfig(level=logging.INFO)
-TRY_OLLAMA = os.getenv("TRY_OLLAMA", False)
 class LLMClient:
     def __init__(self):
-        try:
-            self.ollama_client = Client(
-                host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=1.0
-            )
-            self.ollama_client.chat(
-                model="gemma3:4b", messages=[{"role": "system", "content": "test"}]
-            )
-            self.PROVIDER = "ollama"
-            logging.info("Using Ollama as LLM backend")
-        except Exception as e:
-            print(e)
-            self.openai_client = OpenAI()
+        llama_url = os.getenv("LLAMA_SERVER_URL")
+        if llama_url:
+            self.client = OpenAI(base_url=llama_url, api_key="not-needed")
+            self.model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
+            self.PROVIDER = "llama_server"
+            logging.info("Using llama_server as LLM backend")
+        else:
+            self.client = OpenAI()
+            self.model = "gpt-4o-mini"
+            self.PROVIDER = "openai"
+            logging.info("Using OpenAI as LLM backend")
@@ -35,11 +28,8 @@ class LLMClient:
         prompt: str,
         system_prompt: str,
     ):
-        # Instituting a fallback if my gaming PC is not on
-        if self.PROVIDER == "ollama":
-            try:
-                response = self.ollama_client.chat(
-                    model="gemma3:4b",
+        response = self.client.chat.completions.create(
+            model=self.model,
             messages=[
                 {
                     "role": "system",
@@ -48,26 +38,9 @@ class LLMClient:
{"role": "user", "content": prompt},
],
)
output = response.message.content
return output
except Exception as e:
logging.error(f"Could not connect to OLLAMA: {str(e)}")
response = self.openai_client.responses.create(
model="gpt-4o-mini",
input=[
{
"role": "system",
"content": system_prompt,
},
{"role": "user", "content": prompt},
],
)
output = response.output_text
return output
return response.choices[0].message.content
if __name__ == "__main__":
client = Client()
client.chat(model="gemma3:4b", messages=[{"role": "system", "promp": "hack"}])
client = LLMClient()
print(client.chat(prompt="Hello!", system_prompt="You are a helpful assistant."))
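
The rewritten LLMClient picks one backend at startup, so the per-request fallback the old Ollama path had (try local, fall back to OpenAI) is gone. If that behavior is still wanted, one possible sketch is a thin wrapper that retries a failed llama-server call against a plain OpenAI client; the wrapper name and the choice of gpt-4o-mini as the fallback model are assumptions, not part of the commit:

```python
import logging

from openai import OpenAI

from llm import LLMClient


def chat_with_fallback(client: LLMClient, prompt: str, system_prompt: str) -> str:
    """Try the configured backend first; fall back to OpenAI if it is unreachable."""
    try:
        return client.chat(prompt=prompt, system_prompt=system_prompt)
    except Exception as exc:  # e.g. the llama-server host is powered off
        logging.error(f"Primary LLM backend failed, retrying with OpenAI: {exc}")
        fallback = OpenAI()  # reads OPENAI_API_KEY from the environment
        response = fallback.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
```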

main.py

@@ -5,7 +5,6 @@ import os
 import sqlite3
 import time
-import ollama
 from dotenv import load_dotenv
 import chromadb
@@ -17,11 +16,6 @@ from utils.request import PaperlessNGXService
 _dotenv_loaded = load_dotenv()
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = ollama.Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs2")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")

pyproject.toml

@@ -9,7 +9,6 @@ dependencies = [
"python-dotenv>=1.0.0",
"flask>=3.1.2",
"httpx>=0.28.1",
"ollama>=0.6.0",
"openai>=2.0.1",
"pydantic>=2.11.9",
"pillow>=10.0.0",
@@ -34,7 +33,6 @@ dependencies = [
"langchain-chroma>=1.0.0",
"langchain-community>=0.4.1",
"jq>=1.10.0",
"langchain-ollama>=1.0.1",
"tavily-python>=0.7.17",
]

query.py

@@ -1,18 +1,11 @@
 import json
 import os
 from typing import Literal
 import datetime
-from ollama import Client
 from openai import OpenAI
 from pydantic import BaseModel, Field
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
 # This uses inferred filters — which means using LLM to create the metadata filters
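
The surviving comment describes the inferred-filters approach: the LLM reads the user's question and emits structured metadata filters, which query.py then applies to the vector store. A hedged sketch of that idea using the imports the file keeps; the filter fields and the json_object response format are assumptions, and the real schema lives in query.py:

```python
import json
from typing import Literal, Optional

from openai import OpenAI
from pydantic import BaseModel, Field


class InferredFilter(BaseModel):
    # Hypothetical filter fields; replace with the metadata actually stored in Chroma.
    doc_type: Optional[Literal["invoice", "medical", "receipt"]] = None
    year: Optional[int] = Field(default=None, description="Four-digit year, if mentioned")


def infer_filters(question: str, client: OpenAI, model: str) -> dict:
    """Ask the LLM to turn a natural-language question into metadata filters."""
    response = client.chat.completions.create(
        model=model,
        # json_object mode keeps the reply parseable (assumes the backend supports it).
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": "Extract search filters as JSON matching this schema: "
                + json.dumps(InferredFilter.model_json_schema()),
            },
            {"role": "user", "content": question},
        ],
    )
    filters = InferredFilter.model_validate_json(response.choices[0].message.content or "{}")
    return filters.model_dump(exclude_none=True)
```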

chunker.py

@@ -3,7 +3,6 @@ from math import ceil
 import re
 from typing import Union
 from uuid import UUID, uuid4
-from ollama import Client
 from chromadb.utils.embedding_functions.openai_embedding_function import (
     OpenAIEmbeddingFunction,
 )
@@ -13,10 +12,6 @@ from llm import LLMClient
 load_dotenv()
-ollama_client = Client(
-    host=os.getenv("OLLAMA_HOST", "http://localhost:11434"), timeout=1.0
-)
 def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
     if header_patterns is None: