Multimodal Applications

Ollama supports multimodal models that can process both images and text, enabling features such as image understanding and visual question answering.

Supported Multimodal Models

Model         Description                Size
llava         Vision-language model      ~4GB
bakllava      Improved vision model      ~4GB
moondream     Lightweight vision model   ~1.5GB
llava-phi-3   Compact vision model       ~2GB
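
The models above can also be pulled and checked from Python instead of the CLI. A minimal sketch using the official ollama client (the exact shape of the list() response may vary between client versions):

import ollama

# Download a vision model if it is not already available locally (this may take a while)
ollama.pull('llava')

# Confirm which models are installed; field names in the response depend on the client version
print(ollama.list())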

Image Understanding Basics

Command-Line Usage

# Pull a multimodal model
ollama pull llava

# Analyze an image (include the image path in the prompt)
ollama run llava "Describe this image: ./image.jpg"

Calling from Python

import ollama
import base64

def encode_image(image_path):
    # Read an image file and return its contents as a base64-encoded string
    with open(image_path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')

response = ollama.chat(
    model='llava',
    messages=[
        {
            'role': 'user',
            'content': 'Describe this image',
            'images': [encode_image('image.jpg')]
        }
    ]
)

print(response['message']['content'])
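
The same request can be sent to Ollama's REST API directly, without the Python client. A minimal sketch using requests, assuming Ollama is listening on its default port 11434 and image.jpg exists locally:

import base64
import requests

with open('image.jpg', 'rb') as f:
    image_b64 = base64.b64encode(f.read()).decode('utf-8')

resp = requests.post(
    'http://localhost:11434/api/chat',
    json={
        'model': 'llava',
        'messages': [
            {
                'role': 'user',
                'content': 'Describe this image',
                'images': [image_b64]
            }
        ],
        'stream': False  # return a single JSON object instead of a stream
    },
    timeout=120
)
print(resp.json()['message']['content'])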

A Multimodal Application Class

import ollama
import base64
from typing import List

class VisionAssistant:
    def __init__(self, model='llava'):
        self.model = model
    
    def encode_image(self, image_path: str) -> str:
        with open(image_path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')
    
    def analyze(self, image_path: str, question: str = 'Describe this image') -> str:
        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'user',
                    'content': question,
                    'images': [self.encode_image(image_path)]
                }
            ]
        )
        return response['message']['content']
    
    def describe(self, image_path: str) -> str:
        return self.analyze(image_path, 'Describe the contents of this image in detail')
    
    def extract_text(self, image_path: str) -> str:
        return self.analyze(image_path, 'Extract all text from this image')
    
    def count_objects(self, image_path: str, object_type: str) -> str:
        return self.analyze(image_path, f'Count how many {object_type} are in this image')
    
    def compare_images(self, image_paths: List[str], question: str) -> str:
        images = [self.encode_image(p) for p in image_paths]
        
        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'user',
                    'content': question,
                    'images': images
                }
            ]
        )
        return response['message']['content']

# Usage
assistant = VisionAssistant()

description = assistant.describe('photo.jpg')
print(description)

text = assistant.extract_text('document.png')
print(f"提取的文字: {text}")

Image Q&A System

import ollama
import base64
from typing import List

class ImageQA:
    def __init__(self, model='llava'):
        self.model = model
        self.context = []
    
    def add_image(self, image_path: str, description: str = None):
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        
        self.context.append({
            'image': image_data,
            'description': description
        })
    
    def ask(self, question: str, image_index: int = -1) -> str:
        if not self.context:
            return "请先添加图片"
        
        ctx = self.context[image_index]
        
        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'user',
                    'content': question,
                    'images': [ctx['image']]
                }
            ]
        )
        
        return response['message']['content']
    
    def batch_analyze(self, image_paths: List[str], prompt: str) -> List[dict]:
        # Run the same prompt against each image and collect one result per image
        results = []
        
        for path in image_paths:
            self.add_image(path)
            results.append({
                'image': path,
                'result': self.ask(prompt)
            })
        
        return results

# Usage
qa = ImageQA()
qa.add_image('chart.png', 'Sales data chart')

answer = qa.ask("What trend does this chart show?")
print(answer)
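
batch_analyze runs a single prompt over a list of images and returns one result per image. A short usage sketch with placeholder file names:

results = qa.batch_analyze(
    ['chart1.png', 'chart2.png'],
    'Summarize this chart in one sentence.'
)
for item in results:
    print(f"{item['image']}: {item['result']}")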

Image Classification Assistant

import ollama
import base64
from typing import List

class ImageClassifier:
    def __init__(self, model='llava'):
        self.model = model
    
    def classify(self, image_path: str, categories: List[str]) -> dict:
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        
        categories_str = ', '.join(categories)
        prompt = f"Which category does this image belong to? Available categories: {categories_str}. Output only the category name."
        
        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [image_data]
                }
            ]
        )
        
        category = response['message']['content'].strip()
        
        return {
            'image': image_path,
            'category': category,
            # Simple heuristic: flag the result as low confidence when the answer
            # is not an exact match for one of the provided categories
            'confidence': 'high' if category in categories else 'low'
        }
    
    def batch_classify(self, image_paths: List[str], categories: List[str]) -> List[dict]:
        return [self.classify(path, categories) for path in image_paths]

# Usage
classifier = ImageClassifier()

result = classifier.classify('photo.jpg', ['landscape', 'person', 'animal', 'building'])
print(f"Classification result: {result['category']}")

Document Analyzer

import ollama
import base64

class DocumentAnalyzer:
    def __init__(self, model='llava'):
        self.model = model
    
    def analyze_document(self, image_path: str) -> dict:
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        
        questions = {
            'type': 'What type of document is this?',
            'content': 'What is the main content of this document?',
            'text': 'Extract the key text from this document.',
            'summary': 'Summarize the key points of this document.'
        }
        
        results = {}
        for key, question in questions.items():
            response = ollama.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': question,
                        'images': [image_data]
                    }
                ]
            )
            results[key] = response['message']['content']
        
        return results
    
    def extract_table(self, image_path: str) -> str:
        with open(image_path, 'rb') as f:
            image_data = base64.b64encode(f.read()).decode('utf-8')
        
        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'user',
                    'content': 'Extract the table data from this image and output it as a Markdown table.',
                    'images': [image_data]
                }
            ]
        )
        
        return response['message']['content']

# Usage
analyzer = DocumentAnalyzer()

result = analyzer.analyze_document('invoice.png')
for key, value in result.items():
    print(f"{key}: {value}\n")

Web Application Integration

from fastapi import FastAPI, File, UploadFile
from pydantic import BaseModel
import ollama
import base64

app = FastAPI(title="Image Analysis API")

class AnalysisResponse(BaseModel):
    description: str

@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_image(file: UploadFile = File(...)):
    contents = await file.read()
    image_data = base64.b64encode(contents).decode('utf-8')
    
    response = ollama.chat(
        model='llava',
        messages=[
            {
                'role': 'user',
                'content': 'Describe this image',
                'images': [image_data]
            }
        ]
    )
    
    return AnalysisResponse(description=response['message']['content'])

@app.post("/ocr")
async def extract_text(file: UploadFile = File(...)):
    contents = await file.read()
    image_data = base64.b64encode(contents).decode('utf-8')
    
    response = ollama.chat(
        model='llava',
        messages=[
            {
                'role': 'user',
                'content': 'Extract all text from this image',
                'images': [image_data]
            }
        ]
    )
    
    return {"text": response['message']['content']}

if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
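
Once the service is running, the endpoints can be called from any HTTP client. A minimal sketch using requests, assuming the server above is listening on localhost:8000 and the sample files exist:

import requests

# Describe an image via the /analyze endpoint (multipart upload, form field named "file")
with open('photo.jpg', 'rb') as f:
    resp = requests.post(
        'http://localhost:8000/analyze',
        files={'file': ('photo.jpg', f, 'image/jpeg')}
    )
print(resp.json()['description'])

# Extract text via the /ocr endpoint
with open('document.png', 'rb') as f:
    resp = requests.post(
        'http://localhost:8000/ocr',
        files={'file': ('document.png', f, 'image/png')}
    )
print(resp.json()['text'])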