The running models endpoint returns the list of models currently loaded in memory, including each model's running state and resource usage.
```bash
curl http://localhost:11434/api/ps
```
Response:
```json
{
  "models": [
    {
      "name": "llama3.2:latest",
      "model": "llama3.2:latest",
      "size": 4661224676,
      "digest": "abc123...",
      "details": {
        "format": "gguf",
        "family": "llama",
        "parameter_size": "3B",
        "quantization_level": "Q4_K_M"
      },
      "expires_at": "2024-01-15T10:30:00Z",
      "size_vram": 3500000000
    }
  ]
}
```
| Field | Description |
|---|---|
| name | Model name |
| model | Model name (same as name) |
| size | Total model size, in bytes |
| digest | Model digest (hash) |
| details | Model details |
| expires_at | Expected unload time |
| size_vram | VRAM usage, in bytes |
A model is automatically unloaded after it has been idle for a while; `expires_at` shows the expected unload time. By default a model is unloaded after 5 minutes of inactivity, which can be adjusted with the `keep_alive` parameter.
`size_vram` shows how much of the model resides in VRAM. If part of the model has been offloaded to system memory, this value will be smaller than `size`.
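As a quick check, you can compare `size_vram` against `size` to see whether a loaded model has been partially offloaded to system RAM. A minimal sketch, assuming only the endpoint and fields shown above; the 0.99 threshold is an arbitrary illustration:

```python
import requests

def vram_residency(base_url="http://localhost:11434"):
    """Report, for each loaded model, what fraction of it sits in VRAM."""
    models = requests.get(f"{base_url}/api/ps").json().get("models", [])
    for m in models:
        size = m.get("size", 0) or 1  # guard against division by zero
        ratio = m.get("size_vram", 0) / size
        state = "fully in VRAM" if ratio >= 0.99 else "partially offloaded to RAM"
        print(f"{m['name']}: {ratio:.0%} in VRAM ({state})")

vram_residency()
```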
```python
import requests

def get_running_models():
    """Return the list of currently loaded models from /api/ps."""
    response = requests.get("http://localhost:11434/api/ps")
    return response.json().get("models", [])

models = get_running_models()
if models:
    for m in models:
        vram_gb = m.get("size_vram", 0) / (1024 ** 3)
        print(f"{m['name']}: VRAM usage {vram_gb:.2f} GB")
else:
    print("No models are currently running")
```
```python
def is_model_loaded(model_name):
    """Check whether a loaded model's name starts with model_name."""
    models = get_running_models()
    return any(m["name"].startswith(model_name) for m in models)

if is_model_loaded("llama3.2"):
    print("Model is loaded")
else:
    print("Model is not loaded")
```
```python
def preload_model(model_name, keep_alive="10m"):
    """Preload a model: an empty prompt makes /api/generate load the model
    and return immediately; keep_alive controls how long it stays loaded."""
    requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model_name,
            "prompt": "",
            "keep_alive": keep_alive
        }
    )
    print(f"Model {model_name} preloaded")

preload_model("llama3.2")
```
```python
def unload_model(model_name):
    """Unload a model immediately by sending keep_alive = 0."""
    requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model_name,
            "prompt": "",
            "keep_alive": "0"
        }
    )
    print(f"Model {model_name} unloaded")

unload_model("llama3.2")
```
```javascript
async function getRunningModels() {
  const response = await fetch('http://localhost:11434/api/ps');
  const data = await response.json();
  return data.models || [];
}

const models = await getRunningModels();
models.forEach(m => {
  const vramGB = (m.size_vram || 0) / (1024 ** 3);
  console.log(`${m.name}: VRAM usage ${vramGB.toFixed(2)} GB`);
});
```
```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

type RunningModel struct {
	Name      string `json:"name"`
	SizeVram  int64  `json:"size_vram"`
	ExpiresAt string `json:"expires_at"`
}

type RunningModelsResponse struct {
	Models []RunningModel `json:"models"`
}

func getRunningModels() ([]RunningModel, error) {
	resp, err := http.Get("http://localhost:11434/api/ps")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}

	var result RunningModelsResponse
	if err := json.Unmarshal(data, &result); err != nil {
		return nil, err
	}
	return result.Models, nil
}

func main() {
	models, err := getRunningModels()
	if err != nil {
		fmt.Println("Request failed:", err)
		return
	}
	if len(models) == 0 {
		fmt.Println("No models are currently running")
		return
	}
	for _, m := range models {
		vramGB := float64(m.SizeVram) / (1024 * 1024 * 1024)
		fmt.Printf("%s: VRAM usage %.2f GB\n", m.Name, vramGB)
	}
}
```
```python
import time

def monitor_resources(interval=10):
    """Poll /api/ps periodically and print an aggregate VRAM summary."""
    while True:
        models = get_running_models()
        total_vram = sum(m.get("size_vram", 0) for m in models)
        print(f"\nRunning models: {len(models)}")
        print(f"Total VRAM usage: {total_vram / (1024**3):.2f} GB")
        for m in models:
            print(f"  - {m['name']}")
        time.sleep(interval)

monitor_resources()
```
```python
class ModelScheduler:
    def __init__(self, max_vram_gb=8):
        self.max_vram = max_vram_gb * (1024 ** 3)

    def can_load_model(self, model_size):
        """Check whether model_size bytes of VRAM are still available."""
        models = get_running_models()
        used_vram = sum(m.get("size_vram", 0) for m in models)
        return (used_vram + model_size) < self.max_vram

    def make_room(self, required_size):
        """Unload loaded models until required_size bytes of VRAM are free."""
        models = get_running_models()
        used_vram = sum(m.get("size_vram", 0) for m in models)
        while (used_vram + required_size) > self.max_vram and models:
            # Unload the last model in the list and account for the freed VRAM
            victim = models.pop()
            unload_model(victim["name"])
            used_vram -= victim.get("size_vram", 0)

scheduler = ModelScheduler(max_vram_gb=8)
if scheduler.can_load_model(4 * 1024**3):
    print("Model can be loaded")
```