diff --git a/.env.example b/.env.example
index 25a9215..0e0693d 100644
--- a/.env.example
+++ b/.env.example
@@ -14,9 +14,10 @@ JWT_SECRET_KEY=your-secret-key-here
 PAPERLESS_TOKEN=your-paperless-token
 BASE_URL=192.168.1.5:8000
 
-# Ollama Configuration
-OLLAMA_URL=http://192.168.1.14:11434
-OLLAMA_HOST=http://192.168.1.14:11434
+# llama-server Configuration (OpenAI-compatible API)
+# If set, uses llama-server as the primary LLM backend with OpenAI as fallback
+LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
+LLAMA_MODEL_NAME=llama-3.1-8b-instruct
 
 # ChromaDB Configuration
 # For Docker: This is automatically set to /app/data/chromadb
diff --git a/blueprints/conversation/agents.py b/blueprints/conversation/agents.py
index 71c472f..13f914c 100644
--- a/blueprints/conversation/agents.py
+++ b/blueprints/conversation/agents.py
@@ -4,16 +4,26 @@ from typing import cast
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.tools import tool
-from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
 from tavily import AsyncTavilyClient
 
 from blueprints.rag.logic import query_vector_store
 
-openai_gpt_5_mini = ChatOpenAI(model="gpt-5-mini")
-ollama_deepseek = ChatOllama(model="llama3.1:8b", base_url=os.getenv("OLLAMA_URL"))
+# Configure LLM with llama-server or OpenAI fallback
+llama_url = os.getenv("LLAMA_SERVER_URL")
+if llama_url:
+    llama_chat = ChatOpenAI(
+        base_url=llama_url,
+        api_key="not-needed",
+        model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
+    )
+else:
+    llama_chat = None
+
+openai_fallback = ChatOpenAI(model="gpt-5-mini")
 
 model_with_fallback = cast(
-    BaseChatModel, ollama_deepseek.with_fallbacks([openai_gpt_5_mini])
+    BaseChatModel,
+    llama_chat.with_fallbacks([openai_fallback]) if llama_chat else openai_fallback,
 )
 client = AsyncTavilyClient(os.getenv("TAVILY_KEY"), "")
diff --git a/llm.py b/llm.py
index 280b481..1417a6a 100644
--- a/llm.py
+++ b/llm.py
@@ -1,32 +1,25 @@
 import os
-
-from ollama import Client
-from openai import OpenAI
-
 import logging
+
+from openai import OpenAI
 from dotenv import load_dotenv
 
 load_dotenv()
 logging.basicConfig(level=logging.INFO)
 
-TRY_OLLAMA = os.getenv("TRY_OLLAMA", False)
-
 
 class LLMClient:
     def __init__(self):
-        try:
-            self.ollama_client = Client(
-                host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=1.0
-            )
-            self.ollama_client.chat(
-                model="gemma3:4b", messages=[{"role": "system", "content": "test"}]
-            )
-            self.PROVIDER = "ollama"
-            logging.info("Using Ollama as LLM backend")
-        except Exception as e:
-            print(e)
-            self.openai_client = OpenAI()
+        llama_url = os.getenv("LLAMA_SERVER_URL")
+        if llama_url:
+            self.client = OpenAI(base_url=llama_url, api_key="not-needed")
+            self.model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
+            self.PROVIDER = "llama_server"
+            logging.info("Using llama_server as LLM backend")
+        else:
+            self.client = OpenAI()
+            self.model = "gpt-4o-mini"
             self.PROVIDER = "openai"
             logging.info("Using OpenAI as LLM backend")
@@ -35,27 +28,9 @@ class LLMClient:
         prompt: str,
         system_prompt: str,
     ):
-        # Instituting a fallback if my gaming PC is not on
-        if self.PROVIDER == "ollama":
-            try:
-                response = self.ollama_client.chat(
-                    model="gemma3:4b",
-                    messages=[
-                        {
-                            "role": "system",
-                            "content": system_prompt,
-                        },
-                        {"role": "user", "content": prompt},
-                    ],
-                )
-                output = response.message.content
-                return output
-            except Exception as e:
-                logging.error(f"Could not connect to OLLAMA: {str(e)}")
-
-        response = self.openai_client.responses.create(
-            model="gpt-4o-mini",
-            input=[
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
                 {
                     "role": "system",
                     "content": system_prompt,
@@ -63,11 +38,9 @@ class LLMClient:
                 {"role": "user", "content": prompt},
             ],
         )
-        output = response.output_text
-
-        return output
+        return response.choices[0].message.content
 
 
 if __name__ == "__main__":
-    client = Client()
-    client.chat(model="gemma3:4b", messages=[{"role": "system", "promp": "hack"}])
+    client = LLMClient()
+    print(client.chat(prompt="Hello!", system_prompt="You are a helpful assistant."))
diff --git a/main.py b/main.py
index 63f7dd1..d424dfd 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,6 @@ import os
 import sqlite3
 import time
 
-import ollama
 from dotenv import load_dotenv
 
 import chromadb
@@ -17,11 +16,6 @@ from utils.request import PaperlessNGXService
 
 _dotenv_loaded = load_dotenv()
 
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = ollama.Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
-
 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs2")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
diff --git a/pyproject.toml b/pyproject.toml
index 337f023..357816b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,6 @@ dependencies = [
     "python-dotenv>=1.0.0",
     "flask>=3.1.2",
     "httpx>=0.28.1",
-    "ollama>=0.6.0",
    "openai>=2.0.1",
     "pydantic>=2.11.9",
     "pillow>=10.0.0",
@@ -34,7 +33,6 @@ dependencies = [
     "langchain-chroma>=1.0.0",
     "langchain-community>=0.4.1",
     "jq>=1.10.0",
-    "langchain-ollama>=1.0.1",
     "tavily-python>=0.7.17",
 ]
diff --git a/scripts/query.py b/scripts/query.py
index 974b824..f1fdfba 100644
--- a/scripts/query.py
+++ b/scripts/query.py
@@ -1,18 +1,11 @@
 import json
-import os
 from typing import Literal
 import datetime
 
-from ollama import Client
 from openai import OpenAI
 from pydantic import BaseModel, Field
 
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
-
 
 # This uses inferred filters — which means using LLM to create the metadata filters
diff --git a/utils/chunker.py b/utils/chunker.py
index b0179bb..d46a532 100644
--- a/utils/chunker.py
+++ b/utils/chunker.py
@@ -3,7 +3,6 @@ from math import ceil
 import re
 from typing import Union
 from uuid import UUID, uuid4
-from ollama import Client
 from chromadb.utils.embedding_functions.openai_embedding_function import (
     OpenAIEmbeddingFunction,
 )
@@ -13,10 +12,6 @@ from llm import LLMClient
 
 load_dotenv()
 
-ollama_client = Client(
-    host=os.getenv("OLLAMA_HOST", "http://localhost:11434"), timeout=1.0
-)
-
 
 def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
     if header_patterns is None:
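
A quick way to sanity-check the new backend outside the app is to call the llama-server endpoint directly with the same OpenAI-compatible client this diff introduces. A minimal sketch, assuming LLAMA_SERVER_URL and LLAMA_MODEL_NAME are set as in .env.example; the prompts are illustrative and the placeholder API key mirrors the "not-needed" value used throughout the diff (this snippet is not part of the commit):

# Smoke test for the llama-server backend (illustrative, not committed code).
# Uses the same OpenAI-compatible wiring as the new LLMClient in llm.py.
import os

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

base_url = os.getenv("LLAMA_SERVER_URL")  # e.g. http://192.168.1.213:8080/v1
model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")

# A local llama-server typically does not check the key, so a placeholder is fine.
client = OpenAI(base_url=base_url, api_key="not-needed")
response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Reply with OK if you can hear me."},
    ],
)
print(response.choices[0].message.content)

Note that llm.py picks its backend once, at construction time, based on whether LLAMA_SERVER_URL is set, while agents.py additionally falls back to OpenAI at request time via with_fallbacks if the llama-server call fails.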