Merge pull request 'Replace Ollama with llama-server (OpenAI-compatible API)' (#14) from feature/llama-cpp-integration into main

Reviewed-on: #14
This commit was merged in pull request #14.
2026-01-31 21:41:19 -05:00
7 changed files with 35 additions and 71 deletions

View File

@@ -14,9 +14,10 @@ JWT_SECRET_KEY=your-secret-key-here
 PAPERLESS_TOKEN=your-paperless-token
 BASE_URL=192.168.1.5:8000

-# Ollama Configuration
-OLLAMA_URL=http://192.168.1.14:11434
-OLLAMA_HOST=http://192.168.1.14:11434
+# llama-server Configuration (OpenAI-compatible API)
+# If set, uses llama-server as the primary LLM backend with OpenAI as fallback
+LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
+LLAMA_MODEL_NAME=llama-3.1-8b-instruct

 # ChromaDB Configuration
 # For Docker: This is automatically set to /app/data/chromadb
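For reference, the new variables point at llama-server's OpenAI-compatible /v1 endpoint, so any OpenAI SDK client can talk to it. A minimal connectivity check, as a sketch (the host, port, and model name are the example values above; llama-server ignores the API key unless it was started with --api-key):

import os

from openai import OpenAI

# Point the stock OpenAI client at the llama-server endpoint from the env file.
client = OpenAI(
    base_url=os.getenv("LLAMA_SERVER_URL", "http://192.168.1.213:8080/v1"),
    api_key="not-needed",  # placeholder; llama-server does not validate it by default
)

# /v1/models lists what the server is serving, a quick way to confirm the URL works.
for model in client.models.list():
    print(model.id)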

View File

@@ -4,16 +4,26 @@ from typing import cast
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.tools import tool
-from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
 from tavily import AsyncTavilyClient

 from blueprints.rag.logic import query_vector_store

-openai_gpt_5_mini = ChatOpenAI(model="gpt-5-mini")
-ollama_deepseek = ChatOllama(model="llama3.1:8b", base_url=os.getenv("OLLAMA_URL"))
+# Configure LLM with llama-server or OpenAI fallback
+llama_url = os.getenv("LLAMA_SERVER_URL")
+if llama_url:
+    llama_chat = ChatOpenAI(
+        base_url=llama_url,
+        api_key="not-needed",
+        model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
+    )
+else:
+    llama_chat = None
+openai_fallback = ChatOpenAI(model="gpt-5-mini")
 model_with_fallback = cast(
-    BaseChatModel, ollama_deepseek.with_fallbacks([openai_gpt_5_mini])
+    BaseChatModel,
+    llama_chat.with_fallbacks([openai_fallback]) if llama_chat else openai_fallback,
 )

 client = AsyncTavilyClient(os.getenv("TAVILY_KEY"), "")
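The fallback wiring above keeps the same LangChain semantics as before: the llama-server-backed ChatOpenAI is tried first, and if it raises (for example, the server is offline), the call is retried on the OpenAI model. A short usage sketch, assuming the module has been imported as configured:

# Hypothetical call site; model_with_fallback is a Runnable, so .invoke() works directly.
# When LLAMA_SERVER_URL is unset, this is simply the plain OpenAI model.
reply = model_with_fallback.invoke("Summarize the latest document in one sentence.")
print(reply.content)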

llm.py (59 lines changed)
View File

@@ -1,32 +1,25 @@
 import os
-from ollama import Client
-from openai import OpenAI
 import logging
+from openai import OpenAI
 from dotenv import load_dotenv

 load_dotenv()
 logging.basicConfig(level=logging.INFO)

-TRY_OLLAMA = os.getenv("TRY_OLLAMA", False)

 class LLMClient:
     def __init__(self):
-        try:
-            self.ollama_client = Client(
-                host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=1.0
-            )
-            self.ollama_client.chat(
-                model="gemma3:4b", messages=[{"role": "system", "content": "test"}]
-            )
-            self.PROVIDER = "ollama"
-            logging.info("Using Ollama as LLM backend")
-        except Exception as e:
-            print(e)
-            self.openai_client = OpenAI()
+        llama_url = os.getenv("LLAMA_SERVER_URL")
+        if llama_url:
+            self.client = OpenAI(base_url=llama_url, api_key="not-needed")
+            self.model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
+            self.PROVIDER = "llama_server"
+            logging.info("Using llama_server as LLM backend")
+        else:
+            self.client = OpenAI()
+            self.model = "gpt-4o-mini"
             self.PROVIDER = "openai"
             logging.info("Using OpenAI as LLM backend")
@@ -35,11 +28,8 @@ class LLMClient:
         prompt: str,
         system_prompt: str,
     ):
-        # Instituting a fallback if my gaming PC is not on
-        if self.PROVIDER == "ollama":
-            try:
-                response = self.ollama_client.chat(
-                    model="gemma3:4b",
+        response = self.client.chat.completions.create(
+            model=self.model,
             messages=[
                 {
                     "role": "system",
@@ -48,26 +38,9 @@ class LLMClient:
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
], ],
) )
output = response.message.content return response.choices[0].message.content
return output
except Exception as e:
logging.error(f"Could not connect to OLLAMA: {str(e)}")
response = self.openai_client.responses.create(
model="gpt-4o-mini",
input=[
{
"role": "system",
"content": system_prompt,
},
{"role": "user", "content": prompt},
],
)
output = response.output_text
return output
if __name__ == "__main__": if __name__ == "__main__":
client = Client() client = LLMClient()
client.chat(model="gemma3:4b", messages=[{"role": "system", "promp": "hack"}]) print(client.chat(prompt="Hello!", system_prompt="You are a helpful assistant."))

View File

@@ -5,7 +5,6 @@ import os
 import sqlite3
 import time
-import ollama
 from dotenv import load_dotenv
 import chromadb
@@ -17,11 +16,6 @@ from utils.request import PaperlessNGXService
 _dotenv_loaded = load_dotenv()

-# Configure ollama client with URL from environment or default to localhost
-ollama_client = ollama.Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs2")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")

View File

@@ -9,7 +9,6 @@ dependencies = [
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"flask>=3.1.2", "flask>=3.1.2",
"httpx>=0.28.1", "httpx>=0.28.1",
"ollama>=0.6.0",
"openai>=2.0.1", "openai>=2.0.1",
"pydantic>=2.11.9", "pydantic>=2.11.9",
"pillow>=10.0.0", "pillow>=10.0.0",
@@ -34,7 +33,6 @@ dependencies = [
"langchain-chroma>=1.0.0", "langchain-chroma>=1.0.0",
"langchain-community>=0.4.1", "langchain-community>=0.4.1",
"jq>=1.10.0", "jq>=1.10.0",
"langchain-ollama>=1.0.1",
"tavily-python>=0.7.17", "tavily-python>=0.7.17",
] ]

View File

@@ -1,18 +1,11 @@
 import json
-import os
 from typing import Literal
 import datetime
-from ollama import Client
 from openai import OpenAI
 from pydantic import BaseModel, Field

-# Configure ollama client with URL from environment or default to localhost
-ollama_client = Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
 # This uses inferred filters — which means using LLM to create the metadata filters

View File

@@ -3,7 +3,6 @@ from math import ceil
 import re
 from typing import Union
 from uuid import UUID, uuid4
-from ollama import Client
 from chromadb.utils.embedding_functions.openai_embedding_function import (
     OpenAIEmbeddingFunction,
 )
@@ -13,10 +12,6 @@ from llm import LLMClient
 load_dotenv()

-ollama_client = Client(
-    host=os.getenv("OLLAMA_HOST", "http://localhost:11434"), timeout=1.0
-)

 def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
     if header_patterns is None: