Local LLM Setup with Ollama

Run your AI trading agents completely locally with Ollama. Zero API costs, complete privacy, no rate limits.

Why Local LLMs?

| Benefit | Cloud LLM | Local LLM |
| --- | --- | --- |
| Cost | $0.01-0.03/call | Free |
| Privacy | Data sent to provider | Stays on your machine |
| Latency | 500ms-2s | 100-500ms |
| Rate Limits | Yes | No |
| Uptime | Depends on provider | You control |
| Customization | Limited | Full fine-tuning |

Prerequisites

  • Mac with Apple Silicon (M1/M2/M3) or Linux with NVIDIA GPU
  • 16GB+ RAM (32GB recommended)
  • Ollama installed

Step 1: Install Ollama

macOS

# Download the app from https://ollama.com/download, or install with Homebrew
brew install ollama

Linux

curl -fsSL https://ollama.ai/install.sh | sh

Verify Installation

ollama --version

Step 2: Pull Models

For Trading Analysis

# Llama 3.1 8B - Good balance of speed and quality
ollama pull llama3.1:8b

# Qwen 2.5 7B - Excellent for structured output
ollama pull qwen2.5:7b

# Mistral 7B - Fast inference
ollama pull mistral:7b

For Complex Reasoning (requires more RAM)

# Llama 3.1 70B - Highest quality (needs 48GB+ RAM)
ollama pull llama3.1:70b

# Qwen 2.5 32B - Great for analysis (needs 32GB+ RAM)
ollama pull qwen2.5:32b
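
Confirm what you have downloaded:

ollama list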

Step 3: Test the Model

# Start Ollama server (usually auto-starts)
ollama serve

# Test in another terminal
ollama run llama3.1:8b "What is the RSI indicator in trading?"
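
Ollama also exposes a local REST API on port 11434, which is the same endpoint the LangChain integration below talks to. A quick way to confirm the server is answering:

curl http://localhost:11434/api/generate -d '{
  "model": "llama3.1:8b",
  "prompt": "What is the RSI indicator in trading?",
  "stream": false
}'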

Step 4: Integrate with LangChain

Installation

pip install langchain-ollama

Basic Usage

from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3.1:8b",
    base_url="http://localhost:11434",  # Default Ollama URL
    temperature=0,
)

response = llm.invoke("What is a good RSI level to buy?")
print(response.content)
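
For longer answers you can stream tokens as they are generated instead of waiting for the complete message; a minimal sketch reusing the same llm object:

for chunk in llm.stream("Explain the MACD indicator in two sentences"):
    print(chunk.content, end="", flush=True)
print()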

Structured Output

import json
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate

llm = ChatOllama(
    model="qwen2.5:7b",  # Qwen is great for JSON output
    temperature=0,
    format="json",  # Request JSON output
)

prompt = PromptTemplate.from_template("""
Analyze this market data and respond with a JSON trading recommendation:

RSI: {rsi}
MACD: {macd}
Price Change 24h: {change}%

Response format:
{{
  "action": "BUY" | "SELL" | "HOLD",
  "confidence": 0-100,
  "reasoning": "brief explanation"
}}
""")

chain = prompt | llm

result = chain.invoke({
    "rsi": 28,
    "macd": "bullish crossover",
    "change": -5.2,
})

print(json.loads(result.content))

Step 5: Build a Local Trading Agent

# src/local_trading_agent.py
import os
import asyncio
from web3 import Web3
from langchain_ollama import ChatOllama
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from zeroquant import ZeroQuantClient
from zeroquant.langchain.tools import ExecuteSwapTool


class LocalTradingAgent:
    def __init__(self, model: str, client: ZeroQuantClient, vault_address: str):
        self.llm = ChatOllama(
            model=model,
            base_url="http://localhost:11434",
            temperature=0,
            num_predict=500,  # Limit output length
            num_ctx=4096,     # Context window
        )
        self.client = client
        self.vault_address = vault_address
        self.executor = None

    async def initialize(self) -> None:
        tools = [
            ExecuteSwapTool(client=self.client),
        ]

        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a DeFi trading assistant running locally.
You help users execute trades safely through their ZeroQuant vault.
Always confirm trade details before executing.
Be concise in your responses."""),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])

        # Tool-calling agent; llama3.1 supports tool calling through Ollama
        agent = create_tool_calling_agent(
            llm=self.llm,
            tools=tools,
            prompt=prompt,
        )

        self.executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            max_iterations=3,
        )

    async def chat(self, input_text: str) -> str:
        if not self.executor:
            raise RuntimeError("Agent not initialized")

        result = await self.executor.ainvoke({"input": input_text})
        return result["output"]


# Usage
async def main():
    w3 = Web3(Web3.HTTPProvider(os.getenv("RPC_URL")))

    client = ZeroQuantClient(
        web3=w3,
        private_key=os.getenv("PRIVATE_KEY"),
        factory_address=os.getenv("FACTORY_ADDRESS"),
        permission_manager_address=os.getenv("PERMISSION_MANAGER_ADDRESS"),
    )
    await client.connect()
    await client.connect_vault(os.getenv("VAULT_ADDRESS"))

    agent = LocalTradingAgent(
        model="llama3.1:8b",
        client=client,
        vault_address=os.getenv("VAULT_ADDRESS"),
    )

    await agent.initialize()

    # Interactive chat
    print("Local Trading Agent Ready (using Ollama)")
    print("Type your commands:\n")

    while True:
        try:
            user_input = input("You: ")
            if user_input.lower() == "exit":
                break

            response = await agent.chat(user_input)
            print(f"\nAgent: {response}\n")
        except KeyboardInterrupt:
            break


if __name__ == "__main__":
    asyncio.run(main())
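
The script reads its configuration from environment variables. A hypothetical .env layout for the values referenced above (all addresses and keys are placeholders):

RPC_URL=https://your-rpc-endpoint
PRIVATE_KEY=0x...
FACTORY_ADDRESS=0x...
PERMISSION_MANAGER_ADDRESS=0x...
VAULT_ADDRESS=0x...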

Step 6: Local Decision Engine

For high-frequency analysis, bypass LangChain overhead:

# src/local_decision_engine.py
import json
from dataclasses import dataclass
from langchain_ollama import ChatOllama


@dataclass
class MarketData:
    symbol: str
    price: float
    rsi: float
    macd_trend: str  # "BULLISH", "BEARISH", "NEUTRAL"
    volume_ratio: float


@dataclass
class TradingDecision:
    action: str  # "LONG", "SHORT", "HOLD"
    confidence: int
    reasoning: str


class LocalDecisionEngine:
    def __init__(self, model: str = "qwen2.5:7b"):
        self.llm = ChatOllama(
            model=model,
            base_url="http://localhost:11434",
            temperature=0,
            format="json",
            num_predict=200,
        )

    async def analyze(self, data: MarketData) -> TradingDecision:
        prompt = f"""You are a crypto trading analyst. Analyze this data:

Symbol: {data.symbol}
Price: ${data.price}
RSI: {data.rsi} (oversold<30, overbought>70)
MACD: {data.macd_trend}
Volume: {data.volume_ratio}x average

Respond ONLY with this JSON:
{{"action":"LONG"|"SHORT"|"HOLD","confidence":0-100,"reasoning":"brief"}}"""

        try:
            response = await self.llm.ainvoke(prompt)
            result = json.loads(response.content)
            return TradingDecision(
                action=result["action"],
                confidence=result["confidence"],
                reasoning=result["reasoning"],
            )
        except Exception as e:
            print(f"Decision failed: {e}")
            return TradingDecision(
                action="HOLD",
                confidence=0,
                reasoning="Analysis failed",
            )
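
To exercise the engine on a single reading (the numbers below are illustrative):

import asyncio

engine = LocalDecisionEngine("qwen2.5:7b")

decision = asyncio.run(engine.analyze(MarketData(
    symbol="ETH",
    price=3500,
    rsi=28,
    macd_trend="BULLISH",
    volume_ratio=1.4,
)))
print(decision)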

Performance Optimization

1. Model Quantization

Ollama's default tags are already quantized (typically 4-bit). You can also pick an explicit quantization level:

# 4-bit quantization (fastest, lower quality)
ollama pull llama3.1:8b-instruct-q4_0

# 8-bit quantization (balanced)
ollama pull llama3.1:8b-instruct-q8_0

2. Context Window Management

llm = ChatOllama(
    model="llama3.1:8b",
    num_ctx=2048,     # Smaller context = faster
    num_predict=100,  # Limit output tokens
)

3. Batch Processing

import asyncio

# Process multiple symbols in parallel
decisions = await asyncio.gather(*[
    engine.analyze(MarketData(symbol=symbol, **data))
    for symbol in symbols
])
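
Ollama only serves a limited number of requests concurrently (governed by its OLLAMA_NUM_PARALLEL setting), so very large gather calls mostly queue on the server. A sketch that caps in-flight requests client-side, assuming market_data maps each symbol to its indicator fields:

sem = asyncio.Semaphore(4)  # at most 4 analyses in flight

async def analyze_limited(symbol: str, fields: dict) -> TradingDecision:
    async with sem:
        return await engine.analyze(MarketData(symbol=symbol, **fields))

decisions = await asyncio.gather(*[
    analyze_limited(symbol, fields) for symbol, fields in market_data.items()
])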

4. Response Caching

from cachetools import TTLCache

cache = TTLCache(maxsize=100, ttl=300)  # 5 min cache


async def cached_analyze(data: MarketData) -> TradingDecision:
    # Round RSI so near-identical readings reuse the cached decision
    key = f"{data.symbol}-{round(data.rsi)}-{data.macd_trend}"

    if key in cache:
        return cache[key]

    decision = await engine.analyze(data)
    cache[key] = decision
    return decision

Model Comparison

| Model | RAM | Speed | Quality | Best For |
| --- | --- | --- | --- | --- |
| llama3.1:8b | 8GB | Fast | Good | General trading |
| qwen2.5:7b | 8GB | Fast | Good | JSON output |
| mistral:7b | 8GB | Fastest | Good | Quick decisions |
| llama3.1:70b | 48GB | Slow | Excellent | Complex analysis |
| qwen2.5:32b | 32GB | Medium | Very Good | Detailed reasoning |

Troubleshooting

Model Too Slow

# Check whether the model is running on GPU or CPU
ollama ps

# Use a more aggressively quantized model
ollama pull llama3.1:8b-instruct-q4_0

Out of Memory

# Reduce the default context window (recent Ollama releases),
# or pass a smaller num_ctx from ChatOllama as shown above
OLLAMA_CONTEXT_LENGTH=2048 ollama serve

Connection Refused

# Ensure Ollama is running
ollama serve

# Check port
curl http://localhost:11434/api/tags
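
It can also help to verify the server from Python before starting an agent; a small preflight sketch using only the standard library:

import urllib.request

def ollama_is_up(base_url: str = "http://localhost:11434") -> bool:
    try:
        with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp:
            return resp.status == 200
    except OSError:
        return False

if not ollama_is_up():
    raise SystemExit("Ollama is not reachable - run `ollama serve` first")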

Complete Example

import os
import asyncio
from dotenv import load_dotenv
from web3 import Web3

from zeroquant import ZeroQuantClient
from local_decision_engine import LocalDecisionEngine, MarketData

load_dotenv()


async def main():
    # Setup
    w3 = Web3(Web3.HTTPProvider(os.getenv("RPC_URL")))

    client = ZeroQuantClient(
        web3=w3,
        private_key=os.getenv("PRIVATE_KEY"),
        factory_address=os.getenv("FACTORY_ADDRESS"),
        permission_manager_address=os.getenv("PERMISSION_MANAGER_ADDRESS"),
    )
    await client.connect()
    await client.connect_vault(os.getenv("VAULT_ADDRESS"))

    # Local decision engine
    engine = LocalDecisionEngine("qwen2.5:7b")

    print("Local Trading System Ready")
    print("Using Ollama with qwen2.5:7b")
    print("Zero API costs, complete privacy\n")

    # Trading loop
    while True:
        decision = await engine.analyze(MarketData(
            symbol="ETH",
            price=3500,
            rsi=35,
            macd_trend="BULLISH",
            volume_ratio=1.2,
        ))

        print(f"Decision: {decision.action} ({decision.confidence}%)")
        print(f"Reasoning: {decision.reasoning}\n")

        if decision.confidence > 70 and decision.action != "HOLD":
            # Execute trade...
            pass

        await asyncio.sleep(60)


if __name__ == "__main__":
    asyncio.run(main())

What's Next?


Tip: Start with smaller models (llama3.1:8b) and upgrade to larger ones (llama3.1:70b) as needed. The smaller models are often sufficient for trading decisions.