Merge pull request 'Replace Ollama with llama-server (OpenAI-compatible API)' (#14) from feature/llama-cpp-integration into main
Reviewed-on: #14
This commit was merged in pull request #14.
@@ -14,9 +14,10 @@ JWT_SECRET_KEY=your-secret-key-here
 PAPERLESS_TOKEN=your-paperless-token
 BASE_URL=192.168.1.5:8000
 
-# Ollama Configuration
-OLLAMA_URL=http://192.168.1.14:11434
-OLLAMA_HOST=http://192.168.1.14:11434
+# llama-server Configuration (OpenAI-compatible API)
+# If set, uses llama-server as the primary LLM backend with OpenAI as fallback
+LLAMA_SERVER_URL=http://192.168.1.213:8080/v1
+LLAMA_MODEL_NAME=llama-3.1-8b-instruct
 
 # ChromaDB Configuration
 # For Docker: This is automatically set to /app/data/chromadb
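The new variables point the application at a llama.cpp llama-server instance exposing an OpenAI-compatible /v1 API; when LLAMA_SERVER_URL is unset, OpenAI is used instead. A quick connectivity check against that endpoint might look like the sketch below (not part of this diff; it reuses the example URL and model name above and assumes the standard openai Python client):

# Minimal connectivity check (sketch): assumes llama-server is reachable at
# LLAMA_SERVER_URL and exposes the OpenAI-compatible /v1 chat endpoint.
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("LLAMA_SERVER_URL", "http://192.168.1.213:8080/v1"),
    api_key="not-needed",  # llama-server does not require a real key by default
)

response = client.chat.completions.create(
    model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)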
@@ -4,16 +4,26 @@ from typing import cast
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.tools import tool
-from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
 from tavily import AsyncTavilyClient
 
 from blueprints.rag.logic import query_vector_store
 
-openai_gpt_5_mini = ChatOpenAI(model="gpt-5-mini")
-ollama_deepseek = ChatOllama(model="llama3.1:8b", base_url=os.getenv("OLLAMA_URL"))
+# Configure LLM with llama-server or OpenAI fallback
+llama_url = os.getenv("LLAMA_SERVER_URL")
+if llama_url:
+    llama_chat = ChatOpenAI(
+        base_url=llama_url,
+        api_key="not-needed",
+        model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
+    )
+else:
+    llama_chat = None
+
+openai_fallback = ChatOpenAI(model="gpt-5-mini")
 model_with_fallback = cast(
-    BaseChatModel, ollama_deepseek.with_fallbacks([openai_gpt_5_mini])
+    BaseChatModel,
+    llama_chat.with_fallbacks([openai_fallback]) if llama_chat else openai_fallback,
 )
 client = AsyncTavilyClient(os.getenv("TAVILY_KEY"), "")
 
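The agent now builds its primary chat model from llama-server via ChatOpenAI and keeps gpt-5-mini as the runtime fallback through with_fallbacks. A standalone sketch of the same pattern (environment values assumed from the example .env above; the prompt is hypothetical):

import os

from langchain_openai import ChatOpenAI

# Sketch: ChatOpenAI pointed at llama-server, with gpt-5-mini as the fallback.
primary = ChatOpenAI(
    base_url=os.getenv("LLAMA_SERVER_URL", "http://192.168.1.213:8080/v1"),
    api_key="not-needed",
    model=os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct"),
)
fallback = ChatOpenAI(model="gpt-5-mini")
model = primary.with_fallbacks([fallback])

reply = model.invoke("Hello from the agent blueprint!")  # hypothetical prompt
print(reply.content)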
llm.py (61 changed lines)
@@ -1,32 +1,25 @@
 import os
 
-from ollama import Client
-from openai import OpenAI
-
 import logging
 
+from openai import OpenAI
 from dotenv import load_dotenv
 
 load_dotenv()
 
 logging.basicConfig(level=logging.INFO)
 
-TRY_OLLAMA = os.getenv("TRY_OLLAMA", False)
-
 
 class LLMClient:
     def __init__(self):
-        try:
-            self.ollama_client = Client(
-                host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=1.0
-            )
-            self.ollama_client.chat(
-                model="gemma3:4b", messages=[{"role": "system", "content": "test"}]
-            )
-            self.PROVIDER = "ollama"
-            logging.info("Using Ollama as LLM backend")
-        except Exception as e:
-            print(e)
-            self.openai_client = OpenAI()
+        llama_url = os.getenv("LLAMA_SERVER_URL")
+        if llama_url:
+            self.client = OpenAI(base_url=llama_url, api_key="not-needed")
+            self.model = os.getenv("LLAMA_MODEL_NAME", "llama-3.1-8b-instruct")
+            self.PROVIDER = "llama_server"
+            logging.info("Using llama_server as LLM backend")
+        else:
+            self.client = OpenAI()
+            self.model = "gpt-4o-mini"
             self.PROVIDER = "openai"
             logging.info("Using OpenAI as LLM backend")
@@ -35,27 +28,9 @@ class LLMClient:
         prompt: str,
         system_prompt: str,
     ):
-        # Instituting a fallback if my gaming PC is not on
-        if self.PROVIDER == "ollama":
-            try:
-                response = self.ollama_client.chat(
-                    model="gemma3:4b",
-                    messages=[
-                        {
-                            "role": "system",
-                            "content": system_prompt,
-                        },
-                        {"role": "user", "content": prompt},
-                    ],
-                )
-                output = response.message.content
-                return output
-            except Exception as e:
-                logging.error(f"Could not connect to OLLAMA: {str(e)}")
-
-        response = self.openai_client.responses.create(
-            model="gpt-4o-mini",
-            input=[
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
                 {
                     "role": "system",
                     "content": system_prompt,
@@ -63,11 +38,9 @@ class LLMClient:
                 {"role": "user", "content": prompt},
             ],
         )
-        output = response.output_text
-
-        return output
+        return response.choices[0].message.content
 
 
 if __name__ == "__main__":
-    client = Client()
-    client.chat(model="gemma3:4b", messages=[{"role": "system", "promp": "hack"}])
+    client = LLMClient()
+    print(client.chat(prompt="Hello!", system_prompt="You are a helpful assistant."))
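The rewritten LLMClient picks its backend once in __init__ (llama-server when LLAMA_SERVER_URL is set, otherwise OpenAI with gpt-4o-mini) and routes every chat() call through the Chat Completions API, so the old request-time try/except fallback is gone. If request-time fallback is still wanted, a caller could wrap the call; a sketch with a hypothetical helper, not part of this diff:

# Sketch (hypothetical helper): request-time fallback around LLMClient.chat,
# assuming the llama-server backend may become unreachable after startup.
import logging

from openai import OpenAI


def chat_with_fallback(client, prompt: str, system_prompt: str) -> str:
    try:
        return client.chat(prompt=prompt, system_prompt=system_prompt)
    except Exception as exc:  # e.g. llama-server went offline
        logging.warning("llama-server call failed, retrying with OpenAI: %s", exc)
        response = OpenAI().chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content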
main.py (6 changed lines)
@@ -5,7 +5,6 @@ import os
 import sqlite3
 import time
 
-import ollama
 from dotenv import load_dotenv
 
 import chromadb
@@ -17,11 +16,6 @@ from utils.request import PaperlessNGXService
 
 _dotenv_loaded = load_dotenv()
 
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = ollama.Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
-
 client = chromadb.PersistentClient(path=os.getenv("CHROMADB_PATH", ""))
 simba_docs = client.get_or_create_collection(name="simba_docs2")
 feline_vet_lookup = client.get_or_create_collection(name="feline_vet_lookup")
@@ -9,7 +9,6 @@ dependencies = [
     "python-dotenv>=1.0.0",
     "flask>=3.1.2",
     "httpx>=0.28.1",
-    "ollama>=0.6.0",
     "openai>=2.0.1",
     "pydantic>=2.11.9",
     "pillow>=10.0.0",
@@ -34,7 +33,6 @@ dependencies = [
     "langchain-chroma>=1.0.0",
     "langchain-community>=0.4.1",
     "jq>=1.10.0",
-    "langchain-ollama>=1.0.1",
     "tavily-python>=0.7.17",
 ]
 
@@ -1,18 +1,11 @@
 import json
-import os
 from typing import Literal
 import datetime
-from ollama import Client
 
 from openai import OpenAI
 
 from pydantic import BaseModel, Field
 
-# Configure ollama client with URL from environment or default to localhost
-ollama_client = Client(
-    host=os.getenv("OLLAMA_URL", "http://localhost:11434"), timeout=10.0
-)
-
 # This uses inferred filters — which means using LLM to create the metadata filters
 
 
@@ -3,7 +3,6 @@ from math import ceil
 import re
 from typing import Union
 from uuid import UUID, uuid4
-from ollama import Client
 from chromadb.utils.embedding_functions.openai_embedding_function import (
     OpenAIEmbeddingFunction,
 )
@@ -13,10 +12,6 @@ from llm import LLMClient
 
 load_dotenv()
 
-ollama_client = Client(
-    host=os.getenv("OLLAMA_HOST", "http://localhost:11434"), timeout=1.0
-)
-
 
 def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
     if header_patterns is None: