Ollama 支持多模态模型,可以处理图像和文本,实现图像理解、视觉问答等功能。
| 模型 | 描述 | 大小 |
|---|---|---|
| llava | 视觉语言模型 | ~4GB |
| bakllava | 改进的视觉模型 | ~4GB |
| moondream | 轻量级视觉模型 | ~1.5GB |
| llava-phi-3 | 小型视觉模型 | ~2GB |
# Pull a multimodal (vision) model from the Ollama registry
ollama pull llava
# Analyze an image directly from the command line
ollama run llava "描述这张图片" ./image.jpg
import ollama
import base64
def encode_image(image_path):
    """Read the file at *image_path* and return its contents as Base64 text."""
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
# One-shot request: attach the Base64 image to a single user turn and print the reply.
response = ollama.chat(
    model='llava',
    messages=[{
        'role': 'user',
        'content': '描述这张图片',
        'images': [encode_image('image.jpg')],
    }],
)
print(response['message']['content'])
import ollama
import base64
from typing import List, Optional
class VisionAssistant:
    """Thin wrapper around an Ollama vision model for single- and multi-image queries.

    Improvement: the chat-request payload was duplicated between ``analyze`` and
    ``compare_images``; both now delegate to the private ``_chat`` helper.
    """

    def __init__(self, model: str = 'llava'):
        # Name of the Ollama model used for every request.
        self.model = model

    def encode_image(self, image_path: str) -> str:
        """Return the Base64 text encoding of the image file at *image_path*."""
        with open(image_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _chat(self, question: str, images: List[str]) -> str:
        # Internal: send one user message with the already-encoded images attached.
        response = ollama.chat(
            model=self.model,
            messages=[{
                'role': 'user',
                'content': question,
                'images': images,
            }],
        )
        return response['message']['content']

    def analyze(self, image_path: str, question: str = '描述这张图片') -> str:
        """Ask *question* about the single image at *image_path* and return the reply."""
        return self._chat(question, [self.encode_image(image_path)])

    def describe(self, image_path: str) -> str:
        """Return a detailed description of the image."""
        return self.analyze(image_path, '详细描述这张图片的内容')

    def extract_text(self, image_path: str) -> str:
        """OCR-style extraction: ask the model for all text visible in the image."""
        return self.analyze(image_path, '提取图片中的所有文字')

    def count_objects(self, image_path: str, object_type: str) -> str:
        """Ask the model to count occurrences of *object_type* in the image."""
        return self.analyze(image_path, f'数一数图片中有多少个{object_type}')

    def compare_images(self, image_paths: List[str], question: str) -> str:
        """Ask *question* about several images attached to one message."""
        return self._chat(question, [self.encode_image(p) for p in image_paths])
# Example usage: describe a photo, then OCR a scanned document.
assistant = VisionAssistant()
print(assistant.describe('photo.jpg'))
text = assistant.extract_text('document.png')
print(f"提取的文字: {text}")
import ollama
import base64
class ImageQA:
    """Question answering over previously registered images via an Ollama vision model.

    Bug fix: ``batch_analyze`` called ``self.analyze_image``, which did not exist,
    so every call raised AttributeError. The method is now implemented. The return
    annotation was also wrong (``List[str]`` for a list of dicts) and the
    ``description`` default of ``None`` is now typed ``Optional[str]``.
    """

    def __init__(self, model: str = 'llava'):
        self.model = model
        # Each entry: {'image': <base64 str>, 'description': <caption or None>}.
        self.context = []

    def add_image(self, image_path: str, description: Optional[str] = None):
        """Base64-encode the file at *image_path* and append it to the context."""
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        self.context.append({
            'image': image_data,
            'description': description
        })

    def ask(self, question: str, image_index: int = -1) -> str:
        """Ask *question* about the image at *image_index* (default: most recent)."""
        if not self.context:
            return "请先添加图片"
        ctx = self.context[image_index]
        response = ollama.chat(
            model=self.model,
            messages=[{
                'role': 'user',
                'content': question,
                'images': [ctx['image']]
            }]
        )
        return response['message']['content']

    def analyze_image(self, image_path: str, prompt: str) -> str:
        """One-shot analysis of the file at *image_path*; does not touch the context."""
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        response = ollama.chat(
            model=self.model,
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [image_data]
            }]
        )
        return response['message']['content']

    def batch_analyze(self, image_paths: List[str], prompt: str) -> List[dict]:
        """Run *prompt* against each path; returns [{'image': path, 'result': reply}, ...]."""
        results = []
        for path in image_paths:
            result = self.analyze_image(path, prompt)
            results.append({
                'image': path,
                'result': result
            })
        return results
# Example usage: register a chart image, then query it.
qa = ImageQA()
qa.add_image('chart.png', '销售数据图表')
print(qa.ask("这个图表显示了什么趋势?"))
import ollama
import base64
from typing import List
class ImageClassifier:
    """Zero-shot image classification by prompting an Ollama vision model."""

    def __init__(self, model='llava'):
        self.model = model

    def classify(self, image_path: str, categories: List[str]) -> dict:
        """Classify one image into one of *categories*.

        Confidence is 'high' only when the model's reply matches a listed
        category exactly; anything else is reported as 'low'.
        """
        with open(image_path, 'rb') as handle:
            encoded = base64.b64encode(handle.read()).decode('utf-8')
        categories_str = '、'.join(categories)
        prompt = f"这张图片属于哪个类别?可选类别:{categories_str}。只输出类别名称。"
        reply = ollama.chat(
            model=self.model,
            messages=[{
                'role': 'user',
                'content': prompt,
                'images': [encoded],
            }],
        )
        category = reply['message']['content'].strip()
        confidence = 'high' if category in categories else 'low'
        return {'image': image_path, 'category': category, 'confidence': confidence}

    def batch_classify(self, image_paths: List[str], categories: List[str]) -> List[dict]:
        """Classify every path in *image_paths* against the same category list."""
        results = []
        for path in image_paths:
            results.append(self.classify(path, categories))
        return results
# Example usage: zero-shot classification into four fixed categories.
classifier = ImageClassifier()
result = classifier.classify('photo.jpg', ['风景', '人物', '动物', '建筑'])
print(f"分类结果: {result['category']}")
import ollama
import base64
class DocumentAnalyzer:
    """Structured document analysis (type/content/text/summary) via an Ollama vision model."""

    def __init__(self, model='llava'):
        self.model = model

    def _read_b64(self, image_path: str) -> str:
        # Internal: load the file and Base64-encode it for the chat payload.
        with open(image_path, 'rb') as fh:
            return base64.b64encode(fh.read()).decode('utf-8')

    def _ask(self, image_data: str, question: str) -> str:
        # Internal: one user turn with the encoded image attached.
        reply = ollama.chat(
            model=self.model,
            messages=[{
                'role': 'user',
                'content': question,
                'images': [image_data],
            }],
        )
        return reply['message']['content']

    def analyze_document(self, image_path: str) -> dict:
        """Ask four fixed questions about the document image.

        Returns a dict with keys 'type', 'content', 'text' and 'summary',
        each mapped to the model's free-text answer.
        """
        image_data = self._read_b64(image_path)
        questions = {
            'type': '这是什么类型的文档?',
            'content': '文档的主要内容是什么?',
            'text': '提取文档中的关键文字信息。',
            'summary': '总结文档的核心要点。'
        }
        return {key: self._ask(image_data, question)
                for key, question in questions.items()}

    def extract_table(self, image_path: str) -> str:
        """Return any table found in the image rendered as a Markdown table."""
        return self._ask(self._read_b64(image_path),
                         '提取图片中的表格数据,以 Markdown 表格格式输出。')
# Example usage: run the four standard questions against an invoice scan.
analyzer = DocumentAnalyzer()
for key, value in analyzer.analyze_document('invoice.png').items():
    print(f"{key}: {value}\n")
from fastapi import FastAPI, File, UploadFile
from pydantic import BaseModel
import ollama
import base64
import tempfile
# FastAPI application exposing the vision model over HTTP.
app = FastAPI(title="图像分析 API")


class AnalysisResponse(BaseModel):
    """Response body for /analyze: a free-text description of the image."""
    description: str
@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_image(file: UploadFile = File(...)):
    """Describe an uploaded image using the llava model."""
    raw = await file.read()
    encoded = base64.b64encode(raw).decode('utf-8')
    reply = ollama.chat(
        model='llava',
        messages=[{
            'role': 'user',
            'content': '描述这张图片',
            'images': [encoded],
        }],
    )
    return AnalysisResponse(description=reply['message']['content'])
@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    """OCR endpoint: return all text the model can read from the upload."""
    raw = await file.read()
    encoded = base64.b64encode(raw).decode('utf-8')
    reply = ollama.chat(
        model='llava',
        messages=[{
            'role': 'user',
            'content': '提取图片中的所有文字',
            'images': [encoded],
        }],
    )
    return {"text": reply['message']['content']}
if __name__ == '__main__':
    # Run a local development server when this file is executed directly.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)