Optimizing model performance makes inference faster and lets the same hardware support larger models.
# NVIDIA: check GPU status and available VRAM
nvidia-smi
# macOS: list GPU/display hardware
system_profiler SPDisplaysDataType
import ollama

# Offload a fixed number of layers to the GPU
response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': 35  # number of layers to run on the GPU
    }
)

# Offload everything to the GPU
response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': -1  # use the GPU for all layers
    }
)

# Run on the CPU only
response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    options={
        'num_gpu': 0  # CPU only
    }
)
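To verify how much of a loaded model actually ended up in VRAM, the list of running models can be inspected. This is only a sketch: it assumes, as the resource-monitoring snippet later in this section does, that ollama.ps() returns a dict-like result whose models carry size and size_vram fields.

import ollama

def gpu_offload_report():
    # List models currently loaded by the Ollama server and report
    # how much of each one is resident in VRAM
    for m in ollama.ps().get('models', []):
        size = m.get('size', 0)
        vram = m.get('size_vram', 0)
        share = vram / size if size else 0
        print(f"{m.get('name', '?')}: {vram / (1024**3):.2f} GB in VRAM ({share:.0%} of the model)")

gpu_offload_report()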
import ollama
from concurrent.futures import ThreadPoolExecutor

def batch_generate(prompts, model='llama3.2', max_workers=4):
    def generate(prompt):
        return ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}
        )['response']

    # Send the requests concurrently from a thread pool
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(generate, prompts))
    return results

prompts = ['Hello', 'Goodbye', 'Thank you']
results = batch_generate(prompts)
import ollama

def preload_model(model_name):
    # An empty prompt loads the model without generating anything;
    # keep_alive keeps it resident for 10 minutes
    ollama.generate(
        model=model_name,
        prompt='',
        keep_alive='10m'
    )
    print(f"Model {model_name} preloaded")

preload_model('llama3.2')
messages = [{'role': 'user', 'content': 'Hello'}]  # placeholder conversation

response = ollama.chat(
    model='llama3.2',
    messages=messages,
    options={
        'num_ctx': 2048  # smaller context window uses less memory
    }
)
def truncate_history(messages, max_messages=10):
    system_messages = [m for m in messages if m['role'] == 'system']
    other_messages = [m for m in messages if m['role'] != 'system']
    truncated = other_messages[-max_messages:]
    return system_messages + truncated

messages = truncate_history(messages, max_messages=5)
from ollama import AsyncClient
import asyncio

async def async_generate(client, prompt):
    return await client.generate(
        model='llama3.2',
        prompt=prompt
    )

async def main():
    client = AsyncClient()
    tasks = [
        async_generate(client, f"Question {i}")
        for i in range(5)
    ]
    # Run the requests concurrently and collect the results
    results = await asyncio.gather(*tasks)
    for r in results:
        print(r['response'][:50])

asyncio.run(main())
from multiprocessing import Pool
import ollama

def generate_wrapper(prompt):
    return ollama.generate(
        model='llama3.2',
        prompt=prompt
    )['response']

if __name__ == '__main__':
    # The __main__ guard is required on platforms that spawn worker processes
    with Pool(4) as p:
        results = p.map(generate_wrapper, ['Question 1', 'Question 2', 'Question 3'])
import ollama

def unload_model(model_name):
    # keep_alive='0' tells the server to unload the model immediately
    ollama.generate(
        model=model_name,
        prompt='',
        keep_alive='0'
    )

unload_model('llama3.2')
response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    keep_alive='5m'  # unload after 5 minutes
)
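If a model should stay resident for an entire session, keep_alive also accepts a negative value, which (per the Ollama documentation) keeps the model loaded until it is explicitly unloaded:

response = ollama.generate(
    model='llama3.2',
    prompt='Hello',
    keep_alive=-1  # negative value: keep the model loaded indefinitely
)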
import ollama
import time

def benchmark(model, prompt, iterations=5):
    times = []
    for i in range(iterations):
        start = time.time()
        response = ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}
        )
        duration = time.time() - start
        times.append(duration)
        tokens = response.get('eval_count', 0)
        tps = tokens / duration if duration > 0 else 0
        print(f"Iteration {i+1}: {duration:.2f}s, {tps:.1f} tokens/s")
    avg = sum(times) / len(times)
    print(f"\nAverage: {avg:.2f}s")

benchmark('llama3.2', 'Write a poem')
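Wall-clock timing also counts model loading and prompt processing. The generate response reports these phases separately; the sketch below assumes the nanosecond fields load_duration, prompt_eval_duration, eval_duration, and eval_count are present, as described in the Ollama API documentation.

import ollama

response = ollama.generate(
    model='llama3.2',
    prompt='Write a poem',
    options={'num_predict': 100}
)
# Durations are reported in nanoseconds
load_s = response.get('load_duration', 0) / 1e9
prompt_s = response.get('prompt_eval_duration', 0) / 1e9
eval_s = response.get('eval_duration', 0) / 1e9
tokens = response.get('eval_count', 0)
tps = tokens / eval_s if eval_s > 0 else 0
print(f"load: {load_s:.2f}s, prompt eval: {prompt_s:.2f}s, generation: {eval_s:.2f}s, {tps:.1f} tokens/s")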
import ollama
import psutil
import time

def monitor_resources(interval=1):
    while True:
        cpu = psutil.cpu_percent()
        memory = psutil.virtual_memory().percent
        # Sum the VRAM used by all models currently loaded in Ollama
        models = ollama.ps().get('models', [])
        vram = sum(m.get('size_vram', 0) for m in models)
        print(f"CPU: {cpu}%, RAM: {memory}%, VRAM: {vram/(1024**3):.2f}GB")
        time.sleep(interval)

# monitor_resources()
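Because monitor_resources() blocks in an infinite loop, one option (a generic sketch, not from the original text) is to run it in a daemon thread alongside the rest of the script:

import threading

# Run the monitor in the background; the daemon flag lets the script exit normally
threading.Thread(target=monitor_resources, daemon=True).start()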
import ollama
import time

def compare_models(models, prompt):
    results = []
    for model in models:
        try:
            ollama.pull(model)
        except Exception:
            pass  # the model may already be available locally
        info = ollama.show(model)
        size = info.get('details', {}).get('parameter_size', 'Unknown')
        start = time.time()
        response = ollama.generate(
            model=model,
            prompt=prompt,
            options={'num_predict': 100}
        )
        duration = time.time() - start
        tokens = response.get('eval_count', 0)
        tps = tokens / duration if duration > 0 else 0
        results.append({
            'model': model,
            'size': size,
            'duration': duration,
            'tokens': tokens,
            'tps': tps
        })
    print("\nPerformance comparison:")
    print("-" * 60)
    for r in results:
        print(f"{r['model']:20} | {r['size']:8} | {r['duration']:.2f}s | {r['tps']:.1f} t/s")

compare_models(
    ['llama3.2:1b', 'llama3.2:3b', 'mistral:7b'],
    'Write a quicksort in Python'
)