Local LLM Setup with Ollama
Run your AI trading agents completely locally with Ollama. Zero API costs, complete privacy, no rate limits.
Why Local LLMs?
| Benefit | Cloud LLM | Local LLM |
|---|---|---|
| Cost | ~$0.01-0.03 per call | Free (hardware and electricity only) |
| Privacy | Data sent to provider | Stays on your machine |
| Latency | 500ms-2s (includes network round-trip) | Hardware-dependent, no network round-trip |
| Rate Limits | Yes | No |
| Uptime | Depends on provider | You control |
| Customization | Limited | Full fine-tuning |
Prerequisites
- Mac with Apple Silicon (M1 or later) or Linux with an NVIDIA GPU
- 16GB+ RAM (32GB recommended)
- Ollama installed
Step 1: Install Ollama
macOS
# Install with Homebrew, or download the desktop app from https://ollama.com/download
brew install ollama
Linux
curl -fsSL https://ollama.ai/install.sh | sh
Verify Installation
ollama --version
Step 2: Pull Models
For Trading Analysis
# Llama 3.1 8B - Good balance of speed and quality
ollama pull llama3.1:8b
# Qwen 2.5 7B - Excellent for structured output
ollama pull qwen2.5:7b
# Mistral 7B - Fast inference
ollama pull mistral:7b
For Complex Reasoning (requires more RAM)
# Llama 3.1 70B - Highest quality (needs 48GB+ RAM)
ollama pull llama3.1:70b
# Qwen 2.5 32B - Great for analysis (needs 32GB+ RAM)
ollama pull qwen2.5:32b
Step 3: Test the Model
# Start Ollama server (usually auto-starts)
ollama serve
# Test in another terminal
ollama run llama3.1:8b "What is the RSI indicator in trading?"
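Before wiring the model into an agent, you can also confirm programmatically that the server is up and the model has been pulled. A minimal sketch that queries Ollama's `/api/tags` endpoint (the same endpoint used in the Troubleshooting section below; assumes the `requests` package is installed):

```python
import requests

def ollama_ready(model: str = "llama3.1:8b", host: str = "http://localhost:11434") -> bool:
    """Return True if the Ollama server responds and the model is available locally."""
    try:
        tags = requests.get(f"{host}/api/tags", timeout=5).json()
    except requests.RequestException:
        return False
    # /api/tags lists every model pulled on this machine
    return any(m["name"].startswith(model) for m in tags.get("models", []))

print("Ollama ready:", ollama_ready())
```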
Step 4: Integrate with LangChain
Installation
- Python
- TypeScript
pip install langchain-ollama
npm install @langchain/ollama
Basic Usage
- Python
- TypeScript
from langchain_ollama import ChatOllama
llm = ChatOllama(
model="llama3.1:8b",
base_url="http://localhost:11434", # Default Ollama URL
temperature=0,
)
# inside an async function (or call llm.invoke(...) synchronously)
response = await llm.ainvoke("What is a good RSI level to buy?")
print(response.content)
import { ChatOllama } from '@langchain/ollama';
const llm = new ChatOllama({
model: 'llama3.1:8b',
baseUrl: 'http://localhost:11434', // Default Ollama URL
temperature: 0,
});
const response = await llm.invoke('What is a good RSI level to buy?');
console.log(response.content);
Structured Output
- Python
- TypeScript
import json
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
llm = ChatOllama(
model="qwen2.5:7b", # Qwen is great for JSON output
temperature=0,
format="json", # Request JSON output
)
prompt = PromptTemplate.from_template("""
Analyze this market data and respond with a JSON trading recommendation:
RSI: {rsi}
MACD: {macd}
Price Change 24h: {change}%
Response format:
{{
"action": "BUY" | "SELL" | "HOLD",
"confidence": 0-100,
"reasoning": "brief explanation"
}}
""")
chain = prompt | llm
result = await chain.ainvoke({
"rsi": 28,
"macd": "bullish crossover",
"change": -5.2,
})
print(json.loads(result.content))
import { ChatOllama } from '@langchain/ollama';
import { PromptTemplate } from '@langchain/core/prompts';
const llm = new ChatOllama({
model: 'qwen2.5:7b', // Qwen is great for JSON output
temperature: 0,
format: 'json', // Request JSON output
});
const prompt = PromptTemplate.fromTemplate(`
Analyze this market data and respond with a JSON trading recommendation:
RSI: {rsi}
MACD: {macd}
Price Change 24h: {change}%
Response format:
{{
"action": "BUY" | "SELL" | "HOLD",
"confidence": 0-100,
"reasoning": "brief explanation"
}}
`);
const chain = prompt.pipe(llm);
const result = await chain.invoke({
rsi: 28,
macd: 'bullish crossover',
change: -5.2,
});
console.log(JSON.parse(result.content as string));
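Even with `format="json"`, small local models occasionally return JSON that is missing a field or uses an unexpected value, so it is worth validating the output before acting on it. A minimal sketch using Pydantic against the `result` from the Python example above (the schema mirrors the prompt; this is an optional hardening step, not part of the SDK):

```python
from typing import Literal
from pydantic import BaseModel, Field, ValidationError

class Recommendation(BaseModel):
    action: Literal["BUY", "SELL", "HOLD"]
    confidence: int = Field(ge=0, le=100)
    reasoning: str

try:
    rec = Recommendation.model_validate_json(result.content)
    print(rec.action, rec.confidence)
except ValidationError as err:
    # treat malformed output as a HOLD rather than trading on bad data
    print("Model returned invalid JSON, skipping:", err)
```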
Step 5: Build a Local Trading Agent
- Python
- TypeScript
# src/local_trading_agent.py
import os
import asyncio
from web3 import Web3
from langchain_ollama import ChatOllama
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from zeroquant import ZeroQuantClient
from zeroquant.langchain.tools import ExecuteSwapTool
class LocalTradingAgent:
def __init__(self, model: str, client: ZeroQuantClient, vault_address: str):
self.llm = ChatOllama(
model=model,
base_url="http://localhost:11434",
temperature=0,
num_predict=500, # Limit output length
num_ctx=4096, # Context window
)
self.client = client
self.vault_address = vault_address
self.executor = None
async def initialize(self) -> None:
tools = [
ExecuteSwapTool(client=self.client),
]
prompt = ChatPromptTemplate.from_messages([
("system", """You are a DeFi trading assistant running locally.
You help users execute trades safely through their ZeroQuant vault.
Always confirm trade details before executing.
Be concise in your responses."""),
("human", "{input}"),
MessagesPlaceholder(variable_name="agent_scratchpad"),
])
# the tool-calling agent pairs with the chat-style prompt above (llama3.1 supports tool calls)
agent = create_tool_calling_agent(
llm=self.llm,
tools=tools,
prompt=prompt,
)
self.executor = AgentExecutor(
agent=agent,
tools=tools,
verbose=True,
max_iterations=3,
)
async def chat(self, input_text: str) -> str:
if not self.executor:
raise RuntimeError("Agent not initialized")
result = await self.executor.ainvoke({"input": input_text})
return result["output"]
# Usage
async def main():
w3 = Web3(Web3.HTTPProvider(os.getenv("RPC_URL")))
client = ZeroQuantClient(
web3=w3,
private_key=os.getenv("PRIVATE_KEY"),
factory_address=os.getenv("FACTORY_ADDRESS"),
permission_manager_address=os.getenv("PERMISSION_MANAGER_ADDRESS"),
)
await client.connect()
await client.connect_vault(os.getenv("VAULT_ADDRESS"))
agent = LocalTradingAgent(
model="llama3.1:8b",
client=client,
vault_address=os.getenv("VAULT_ADDRESS"),
)
await agent.initialize()
# Interactive chat
print("Local Trading Agent Ready (using Ollama)")
print("Type your commands:\n")
while True:
try:
user_input = input("You: ")
if user_input.lower() == "exit":
break
response = await agent.chat(user_input)
print(f"\nAgent: {response}\n")
except KeyboardInterrupt:
break
if __name__ == "__main__":
asyncio.run(main())
// src/local-trading-agent.ts
import { ethers } from 'ethers';
import { ZeroQuantClient } from '@zeroquant/sdk';
import { ExecuteSwapTool } from '@zeroquant/langchain';
import { ChatOllama } from '@langchain/ollama';
import { AgentExecutor, createToolCallingAgent } from 'langchain/agents';
import * as readline from 'node:readline';
import { ChatPromptTemplate } from '@langchain/core/prompts';
interface LocalAgentConfig {
model: string;
client: ZeroQuantClient;
vaultAddress: string;
}
export class LocalTradingAgent {
private llm: ChatOllama;
private client: ZeroQuantClient;
private executor: AgentExecutor | null = null;
constructor(config: LocalAgentConfig) {
this.llm = new ChatOllama({
model: config.model,
baseUrl: 'http://localhost:11434',
temperature: 0,
numPredict: 500, // Limit output length
numCtx: 4096, // Context window
});
this.client = config.client;
}
async initialize(): Promise<void> {
const tools = [
new ExecuteSwapTool({ client: this.client }),
];
const prompt = ChatPromptTemplate.fromMessages([
['system', `You are a DeFi trading assistant running locally.
You help users execute trades safely through their ZeroQuant vault.
Always confirm trade details before executing.
Be concise in your responses.`],
['human', '{input}'],
['placeholder', '{agent_scratchpad}'],
]);
// the tool-calling agent pairs with the chat-style prompt above (llama3.1 supports tool calls)
const agent = await createToolCallingAgent({
llm: this.llm,
tools,
prompt,
});
this.executor = new AgentExecutor({
agent,
tools,
verbose: true,
maxIterations: 3,
});
}
async chat(input: string): Promise<string> {
if (!this.executor) {
throw new Error('Agent not initialized');
}
const result = await this.executor.invoke({ input });
return result.output;
}
}
// Usage
async function main() {
const provider = new ethers.JsonRpcProvider(process.env.RPC_URL);
const signer = new ethers.Wallet(process.env.PRIVATE_KEY!, provider);
const client = new ZeroQuantClient(provider, {
factoryAddress: process.env.FACTORY_ADDRESS!,
permissionManagerAddress: process.env.PERMISSION_MANAGER_ADDRESS!,
});
await client.connect(signer);
await client.connectVault(process.env.VAULT_ADDRESS!);
const agent = new LocalTradingAgent({
model: 'llama3.1:8b',
client,
vaultAddress: process.env.VAULT_ADDRESS!,
});
await agent.initialize();
// Interactive chat
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
console.log('Local Trading Agent Ready (using Ollama)');
console.log('Type your commands:\n');
const ask = () => {
rl.question('You: ', async (input: string) => {
if (input.toLowerCase() === 'exit') {
rl.close();
return;
}
const response = await agent.chat(input);
console.log(`\nAgent: ${response}\n`);
ask();
});
};
ask();
}
main().catch(console.error);
Step 6: Local Decision Engine
For high-frequency analysis, skip the agent framework and call the model directly with a single, tightly scoped prompt:
- Python
- TypeScript
# src/local_decision_engine.py
import json
from dataclasses import dataclass
from langchain_ollama import ChatOllama
@dataclass
class MarketData:
symbol: str
price: float
rsi: float
macd_trend: str # "BULLISH", "BEARISH", "NEUTRAL"
volume_ratio: float
@dataclass
class TradingDecision:
action: str # "LONG", "SHORT", "HOLD"
confidence: int
reasoning: str
class LocalDecisionEngine:
def __init__(self, model: str = "qwen2.5:7b"):
self.llm = ChatOllama(
model=model,
base_url="http://localhost:11434",
temperature=0,
format="json",
num_predict=200,
)
async def analyze(self, data: MarketData) -> TradingDecision:
prompt = f"""You are a crypto trading analyst. Analyze this data:
Symbol: {data.symbol}
Price: ${data.price}
RSI: {data.rsi} (oversold<30, overbought>70)
MACD: {data.macd_trend}
Volume: {data.volume_ratio}x average
Respond ONLY with this JSON:
{{"action":"LONG"|"SHORT"|"HOLD","confidence":0-100,"reasoning":"brief"}}"""
try:
response = await self.llm.ainvoke(prompt)
result = json.loads(response.content)
return TradingDecision(
action=result["action"],
confidence=result["confidence"],
reasoning=result["reasoning"],
)
except Exception as e:
print(f"Decision failed: {e}")
return TradingDecision(
action="HOLD",
confidence=0,
reasoning="Analysis failed",
)
// src/local-decision-engine.ts
import { ChatOllama } from '@langchain/ollama';
interface MarketData {
symbol: string;
price: number;
rsi: number;
macdTrend: 'BULLISH' | 'BEARISH' | 'NEUTRAL';
volumeRatio: number;
}
interface TradingDecision {
action: 'LONG' | 'SHORT' | 'HOLD';
confidence: number;
reasoning: string;
}
export class LocalDecisionEngine {
private llm: ChatOllama;
constructor(model: string = 'qwen2.5:7b') {
this.llm = new ChatOllama({
model,
baseUrl: 'http://localhost:11434',
temperature: 0,
format: 'json',
numPredict: 200,
});
}
async analyze(data: MarketData): Promise<TradingDecision> {
const prompt = `You are a crypto trading analyst. Analyze this data:
Symbol: ${data.symbol}
Price: $${data.price}
RSI: ${data.rsi} (oversold<30, overbought>70)
MACD: ${data.macdTrend}
Volume: ${data.volumeRatio}x average
Respond ONLY with this JSON:
{"action":"LONG"|"SHORT"|"HOLD","confidence":0-100,"reasoning":"brief"}`;
try {
const response = await this.llm.invoke(prompt);
return JSON.parse(response.content as string);
} catch (error) {
console.error('Decision failed:', error);
return {
action: 'HOLD',
confidence: 0,
reasoning: 'Analysis failed',
};
}
}
}
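If you want to drop the LangChain dependency entirely, Ollama also exposes a plain HTTP API on port 11434. A minimal sketch of the same JSON decision call using `requests` against the `/api/generate` endpoint (the prompt and schema mirror the engine above; assumes `requests` is installed):

```python
import json
import requests

def analyze_raw(prompt: str, model: str = "qwen2.5:7b") -> dict:
    """Call Ollama's HTTP API directly, with no framework in between."""
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "format": "json",   # constrain output to valid JSON
            "stream": False,    # return a single response object
            "options": {"temperature": 0, "num_predict": 200},
        },
        timeout=60,
    )
    resp.raise_for_status()
    # the generated text is in the "response" field
    return json.loads(resp.json()["response"])
```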
Performance Optimization
1. Model Quantization
Ollama's default tags are already quantized (typically 4-bit). You can pull specific quantization levels to trade quality for speed and memory:
# 4-bit quantization (smallest, fastest, lower quality)
ollama pull llama3.1:8b-instruct-q4_0
# 8-bit quantization (larger, higher quality)
ollama pull llama3.1:8b-instruct-q8_0
2. Context Window Management
- Python
- TypeScript
llm = ChatOllama(
model="llama3.1:8b",
num_ctx=2048, # Smaller context = faster
num_predict=100, # Limit output tokens
)
const llm = new ChatOllama({
model: 'llama3.1:8b',
numCtx: 2048, // Smaller context = faster
numPredict: 100, // Limit output tokens
});
3. Batch Processing
Note: a single Ollama server only runs as many requests in parallel as its OLLAMA_NUM_PARALLEL setting allows; extra requests are queued, so measure before assuming a linear speedup.
- Python
- TypeScript
import asyncio
# Process multiple symbols in parallel
decisions = await asyncio.gather(*[
engine.analyze(MarketData(symbol=symbol, **data))
for symbol in symbols
])
// Process multiple symbols in parallel
const decisions = await Promise.all(
symbols.map(symbol => engine.analyze({ symbol, ...data }))
);
4. Response Caching
- Python
- TypeScript
from cachetools import TTLCache
cache = TTLCache(maxsize=100, ttl=300) # 5 min cache
async def cached_analyze(data: MarketData) -> TradingDecision:
key = f"{data.symbol}-{round(data.rsi)}-{data.macd_trend}"  # round RSI so near-identical readings share a cache entry
if key in cache:
return cache[key]
decision = await engine.analyze(data)
cache[key] = decision
return decision
import NodeCache from 'node-cache';
const cache = new NodeCache({ stdTTL: 300 }); // 5 min cache
async function cachedAnalyze(data: MarketData): Promise<TradingDecision> {
const key = `${data.symbol}-${Math.round(data.rsi)}-${data.macdTrend}`; // round RSI so near-identical readings share a cache entry
const cached = cache.get<TradingDecision>(key);
if (cached) return cached;
const decision = await engine.analyze(data);
cache.set(key, decision);
return decision;
}
Model Comparison
| Model | RAM | Speed | Quality | Best For |
|---|---|---|---|---|
| llama3.1:8b | 8GB | Fast | Good | General trading |
| qwen2.5:7b | 8GB | Fast | Good | JSON output |
| mistral:7b | 8GB | Fastest | Good | Quick decisions |
| llama3.1:70b | 48GB | Slow | Excellent | Complex analysis |
| qwen2.5:32b | 32GB | Medium | Very Good | Detailed reasoning |
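Raw speed depends heavily on your hardware, so it is worth timing a short prompt on each model you have pulled before committing to one. A rough sketch (model names are whatever is in your local library; the first call also includes model load time, so run it twice for a warm measurement):

```python
import time
from langchain_ollama import ChatOllama

def benchmark(model: str, prompt: str = "Summarize the RSI indicator in one sentence.") -> float:
    """Return the seconds taken for one short completion on a local model."""
    llm = ChatOllama(model=model, temperature=0, num_predict=64)
    start = time.perf_counter()
    llm.invoke(prompt)
    return time.perf_counter() - start

for name in ["llama3.1:8b", "qwen2.5:7b", "mistral:7b"]:
    print(f"{name}: {benchmark(name):.1f}s")
```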
Troubleshooting
Model Too Slow
# Check GPU usage
ollama ps
# Use a smaller or more heavily quantized model
ollama pull llama3.1:8b-instruct-q4_0
Out of Memory
# Reduce the context window in your client code (num_ctx=2048, as shown above)
# Recent Ollama releases also let you set it server-wide:
OLLAMA_CONTEXT_LENGTH=2048 ollama serve
Connection Refused
# Ensure Ollama is running
ollama serve
# Check port
curl http://localhost:11434/api/tags
Complete Example
- Python
- TypeScript
import os
import asyncio
from dotenv import load_dotenv
from web3 import Web3
from zeroquant import ZeroQuantClient
from local_decision_engine import LocalDecisionEngine, MarketData
load_dotenv()
async def main():
# Setup
w3 = Web3(Web3.HTTPProvider(os.getenv("RPC_URL")))
client = ZeroQuantClient(
web3=w3,
private_key=os.getenv("PRIVATE_KEY"),
factory_address=os.getenv("FACTORY_ADDRESS"),
permission_manager_address=os.getenv("PERMISSION_MANAGER_ADDRESS"),
)
await client.connect()
await client.connect_vault(os.getenv("VAULT_ADDRESS"))
# Local decision engine
engine = LocalDecisionEngine("qwen2.5:7b")
print("Local Trading System Ready")
print("Using Ollama with qwen2.5:7b")
print("Zero API costs, complete privacy\n")
# Trading loop
while True:
decision = await engine.analyze(MarketData(
symbol="ETH",
price=3500,
rsi=35,
macd_trend="BULLISH",
volume_ratio=1.2,
))
print(f"Decision: {decision.action} ({decision.confidence}%)")
print(f"Reasoning: {decision.reasoning}\n")
if decision.confidence > 70 and decision.action != "HOLD":
# Execute trade...
pass
await asyncio.sleep(60)
if __name__ == "__main__":
asyncio.run(main())
import { ethers } from 'ethers';
import { ZeroQuantClient } from '@zeroquant/sdk';
import { LocalDecisionEngine } from './local-decision-engine';
import 'dotenv/config';
async function main() {
// Setup
const provider = new ethers.JsonRpcProvider(process.env.RPC_URL);
const signer = new ethers.Wallet(process.env.PRIVATE_KEY!, provider);
const client = new ZeroQuantClient(provider, {
factoryAddress: process.env.FACTORY_ADDRESS!,
permissionManagerAddress: process.env.PERMISSION_MANAGER_ADDRESS!,
});
await client.connect(signer);
await client.connectVault(process.env.VAULT_ADDRESS!);
// Local decision engine
const engine = new LocalDecisionEngine('qwen2.5:7b');
console.log('Local Trading System Ready');
console.log('Using Ollama with qwen2.5:7b');
console.log('Zero API costs, complete privacy\n');
// Trading loop
setInterval(async () => {
const decision = await engine.analyze({
symbol: 'ETH',
price: 3500,
rsi: 35,
macdTrend: 'BULLISH',
volumeRatio: 1.2,
});
console.log(`Decision: ${decision.action} (${decision.confidence}%)`);
console.log(`Reasoning: ${decision.reasoning}\n`);
if (decision.confidence > 70 && decision.action !== 'HOLD') {
// Execute trade...
}
}, 60000);
}
main().catch(console.error);
What's Next?
- AI Trading System - Full production system
- Multi-Agent Coordination - Multiple local agents
Tip: Start with smaller models (llama3.1:8b) and upgrade to larger ones (llama3.1:70b) as needed. The smaller models are often sufficient for trading decisions.