Model Performance Optimization

Optimizing model performance speeds up inference and lets you run larger models on the same hardware.

GPU Acceleration

Checking GPU status

# NVIDIA
nvidia-smi

# macOS
system_profiler SPDisplaysDataType
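
You can also ask Ollama itself what is loaded and where. A minimal sketch using ollama.ps() from the Python client (the same size_vram field appears in the resource-monitoring example later in this section):

import ollama

# List the models the Ollama server currently has loaded and report
# how much of each sits in GPU memory.
for m in ollama.ps().get('models', []):
    size = m.get('size', 0)        # total bytes the model occupies
    vram = m.get('size_vram', 0)   # bytes resident in GPU memory
    pct = 100 * vram / size if size else 0
    print(f"{m.get('name')}: {pct:.0f}% on GPU ({vram / (1024**3):.2f} GB VRAM)")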

Setting the number of GPU layers

import ollama

response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': 35  # number of model layers offloaded to the GPU
    }
)

Offloading all layers to the GPU

response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': -1  # offload all layers to the GPU
    }
)

Disabling the GPU

response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': 0  # CPU only
    }
)
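
To verify what these settings buy you on your hardware, time the same prompt under each configuration. A minimal sketch (note that changing num_gpu between calls forces a model reload, so each timing includes load time):

import ollama
import time

def time_setting(num_gpu):
    start = time.time()
    ollama.generate(
        model='llama3.2',
        prompt='Hello',
        options={'num_gpu': num_gpu, 'num_predict': 100}
    )
    return time.time() - start

# -1 = all layers on GPU, 0 = CPU only
for n in (-1, 0):
    print(f"num_gpu={n}: {time_setting(n):.2f}s")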

Batch Processing

Batched requests

import ollama
from concurrent.futures import ThreadPoolExecutor

def batch_generate(prompts, model='llama3.2', max_workers=4):
    # Each worker thread blocks on its own HTTP request, so threads
    # are enough to overlap this I/O-bound work.
    def generate(prompt):
        return ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}  # cap output length per request
        )['response']
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(generate, prompts))
    
    return results

prompts = ['Hello', 'Goodbye', 'Thank you']
results = batch_generate(prompts)

Preloading a model

import ollama

def preload_model(model_name):
    # An empty prompt loads the model into memory without generating anything.
    ollama.generate(
        model=model_name,
        prompt='',
        keep_alive='10m'  # keep it resident for 10 minutes
    )
    print(f"Model {model_name} preloaded")

preload_model('llama3.2')

Context Optimization

Reducing the context length

response = ollama.chat(
    model='llama3.2',
    messages=messages,
    options={
        'num_ctx': 2048  # a smaller context window shrinks the KV cache and memory use
    }
)

Truncating the message history

def truncate_history(messages, max_messages=10):
    # Keep every system message, plus only the most recent
    # max_messages non-system messages.
    system_messages = [m for m in messages if m['role'] == 'system']
    other_messages = [m for m in messages if m['role'] != 'system']

    truncated = other_messages[-max_messages:]

    return system_messages + truncated

messages = truncate_history(messages, max_messages=5)
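
Message count is only a rough proxy for context size. If message lengths vary widely, truncating by a character budget tracks actual context usage more closely; a sketch, using character count as a crude stand-in for tokens:

def truncate_by_budget(messages, max_chars=4000):
    # Keep every system message; fill the remaining budget with the
    # most recent non-system messages, newest first.
    system = [m for m in messages if m['role'] == 'system']
    others = [m for m in messages if m['role'] != 'system']

    kept, used = [], 0
    for m in reversed(others):
        used += len(m['content'])
        if used > max_chars:
            break
        kept.append(m)

    return system + list(reversed(kept))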

Parallel Processing

Asynchronous requests

from ollama import AsyncClient
import asyncio

async def async_generate(client, prompt):
    return await client.generate(
        model='llama3.2',
        prompt=prompt
    )

async def main():
    client = AsyncClient()
    
    tasks = [
        async_generate(client, f"Question {i}")
        for i in range(5)
    ]
    
    results = await asyncio.gather(*tasks)
    
    for r in results:
        print(r['response'][:50])

asyncio.run(main())
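
asyncio.gather launches every request at once; with many prompts this can queue more work than the server handles gracefully. A sketch that caps in-flight requests with a standard asyncio.Semaphore:

from ollama import AsyncClient
import asyncio

async def bounded_generate(client, sem, prompt):
    async with sem:  # at most N requests in flight at a time
        return await client.generate(model='llama3.2', prompt=prompt)

async def main():
    client = AsyncClient()
    sem = asyncio.Semaphore(4)  # concurrency limit (tune to your hardware)

    tasks = [bounded_generate(client, sem, f"Question {i}") for i in range(20)]
    results = await asyncio.gather(*tasks)
    print(f"{len(results)} responses received")

asyncio.run(main())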

Multiprocessing

from multiprocessing import Pool
import ollama

def generate_wrapper(prompt):
    return ollama.generate(
        model='llama3.2',
        prompt=prompt
    )['response']

# The __main__ guard is required on platforms that spawn worker
# processes (Windows, macOS) instead of forking.
if __name__ == '__main__':
    with Pool(4) as p:
        results = p.map(generate_wrapper, ['Question 1', 'Question 2', 'Question 3'])
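
Note that each worker merely blocks on I/O while the Ollama server does the actual computation, so threads (as in the batch-processing section above) are usually sufficient; separate processes mainly add pickling and startup overhead here.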

Memory Optimization

Unloading a model promptly

import ollama

def unload_model(model_name):
    ollama.generate(
        model=model_name,
        prompt='',
        keep_alive='0'  # a keep_alive of 0 unloads the model immediately
    )

unload_model('llama3.2')

Controlling how long a model stays loaded

response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    keep_alive='5m'  # unload after 5 minutes of inactivity
)
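
keep_alive also accepts a plain number of seconds, and a negative value keeps the model loaded indefinitely. For example:

# Keep the model resident until the server restarts or a later
# request changes keep_alive.
response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    keep_alive=-1
)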

Performance Monitoring

Benchmarking response time

import ollama
import time

def benchmark(model, prompt, iterations=5):
    times = []
    
    for i in range(iterations):
        start = time.time()
        
        response = ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}
        )
        
        duration = time.time() - start
        times.append(duration)
        
        tokens = response.get('eval_count', 0)
        tps = tokens / duration if duration > 0 else 0
        
        print(f"迭代 {i+1}: {duration:.2f}s, {tps:.1f} tokens/s")
    
    avg = sum(times) / len(times)
    print(f"\n平均: {avg:.2f}s")

benchmark('llama3.2', 'Write a poem')
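
Wall-clock timing lumps together model load, prompt evaluation, and generation. The response itself carries nanosecond timing fields (total_duration, load_duration, eval_count, eval_duration) that separate those phases; a sketch reading them:

import ollama

response = ollama.generate(
    model='llama3.2',
    prompt='Write a poem',
    options={'num_predict': 100}
)

NS = 1e9  # durations are reported in nanoseconds
print(f"load:       {response.get('load_duration', 0) / NS:.2f}s")
print(f"generation: {response.get('eval_duration', 0) / NS:.2f}s")

if response.get('eval_duration'):
    tps = response['eval_count'] / (response['eval_duration'] / NS)
    print(f"speed:      {tps:.1f} tokens/s")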

Resource usage monitoring

import ollama
import psutil
import time

def monitor_resources(interval=1):
    while True:
        cpu = psutil.cpu_percent()
        memory = psutil.virtual_memory().percent
        
        models = ollama.ps().get('models', [])
        vram = sum(m.get('size_vram', 0) for m in models)
        
        print(f"CPU: {cpu}%, 内存: {memory}%, 显存: {vram/(1024**3):.2f}GB")
        
        time.sleep(interval)

# monitor_resources()  # loops until interrupted (Ctrl+C)

Model comparison tool

import ollama
import time

def compare_models(models, prompt):
    results = []
    
    for model in models:
        try:
            ollama.pull(model)
        except Exception:
            pass  # ignore pull failures (e.g. offline but the model is already local)
        
        info = ollama.show(model)
        size = info.get('details', {}).get('parameter_size', 'Unknown')
        
        start = time.time()
        response = ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}
        )
        duration = time.time() - start
        
        tokens = response.get('eval_count', 0)
        tps = tokens / duration if duration > 0 else 0
        
        results.append({
            'model': model,
            'size': size,
            'duration': duration,
            'tokens': tokens,
            'tps': tps
        })
    
    print("\n性能对比:")
    print("-" * 60)
    for r in results:
        print(f"{r['model']:20} | {r['size']:8} | {r['duration']:.2f}s | {r['tps']:.1f} t/s")

compare_models(
    ['llama3.2:1b', 'llama3.2:3b', 'mistral:7b'],
    'Write a quicksort in Python'
)