JavaScript 流式处理

流式处理让模型响应实时显示,提升用户体验。JavaScript SDK 完整支持流式操作。

基本流式聊天

// Stream a chat completion and print each token as it arrives.
import ollama from 'ollama'

const stream = await ollama.chat({
  model: 'llama3.2',
  stream: true,
  messages: [{ role: 'user', content: '写一首诗' }]
})

for await (const part of stream) {
  const text = part.message.content
  if (text) {
    process.stdout.write(text)
  }
}

基本流式生成

// Stream a raw completion (generate endpoint) token by token.
const stream = await ollama.generate({
  model: 'llama3.2',
  stream: true,
  prompt: '用 JavaScript 写一个快速排序'
})

for await (const part of stream) {
  const piece = part.response
  if (piece) {
    process.stdout.write(piece)
  }
}

封装流式处理

/**
 * Stream a chat completion, invoking `callback` for each text fragment.
 *
 * @param {string} model - Model name, e.g. 'llama3.2'.
 * @param {Array<{role: string, content: string}>} messages - Chat history.
 * @param {(text: string) => void} [callback] - Optional per-fragment handler.
 * @returns {Promise<string>} The full concatenated response text.
 */
async function streamChat(model, messages, callback) {
  const stream = await ollama.chat({
    model,
    messages,
    stream: true
  })

  let fullResponse = ''

  for await (const chunk of stream) {
    const content = chunk.message?.content
    if (content) {
      fullResponse += content
      // Invoke the handler only when one was provided.
      callback?.(content)
    }
  }

  return fullResponse
}

// Usage
await streamChat('llama3.2', [{ role: 'user', content: '写一首诗' }], (text) => {
  process.stdout.write(text)
})

流式对话类

/**
 * Multi-turn chat session that streams each assistant reply to stdout
 * while accumulating the full conversation history.
 */
class StreamingChat {
  /**
   * @param {string} [model='llama3.2'] - Model to chat with.
   * @param {string|null} [system=null] - Optional system prompt.
   */
  constructor(model = 'llama3.2', system = null) {
    this.model = model
    // Seed the history with the system prompt when one is given.
    this.messages = system ? [{ role: 'system', content: system }] : []
  }

  /**
   * Send one user message and stream the reply.
   * @param {string} content - User message text.
   * @returns {Promise<string>} The complete assistant reply.
   */
  async send(content) {
    this.messages.push({ role: 'user', content })

    const stream = await ollama.chat({
      model: this.model,
      messages: this.messages,
      stream: true
    })

    process.stdout.write('助手: ')
    let reply = ''

    for await (const part of stream) {
      const text = part.message.content
      if (text) {
        process.stdout.write(text)
        reply += text
      }
    }

    console.log()
    // Record the reply so the next turn has full context.
    this.messages.push({ role: 'assistant', content: reply })
    return reply
  }
}

// Usage: each send() call appends to the shared conversation history.
const chat = new StreamingChat('llama3.2', '你是一个友好的助手')
await chat.send('你好')
await chat.send('写一首关于春天的诗')

处理流式响应的元数据

// Stream a reply, then report timing statistics from the final chunk.
const stream = await ollama.chat({
  model: 'llama3.2',
  messages: [{ role: 'user', content: '你好' }],
  stream: true
})

for await (const part of stream) {
  const text = part.message.content
  if (text) {
    process.stdout.write(text)
  }

  // Only the last chunk (done === true) carries the run metadata.
  if (part.done) {
    console.log('\n---')
    console.log(`总耗时: ${(part.total_duration / 1e9).toFixed(2)}秒`)
    console.log(`生成token数: ${part.eval_count}`)
  }
}

浏览器流式处理

// Browser version: append streamed text into a DOM element.
const stream = await ollama.chat({
  model: 'llama3.2',
  stream: true,
  messages: [{ role: 'user', content: '写一首诗' }]
})

const outputEl = document.getElementById('output')

for await (const part of stream) {
  const text = part.message.content
  if (text) {
    // textContent (not innerHTML) avoids interpreting model output as HTML.
    outputEl.textContent += text
  }
}

Web 框架集成

Express 流式响应

import express from 'express'
import ollama from 'ollama'

const app = express()
app.use(express.json())

// SSE endpoint: streams each model chunk as a `data:` event, then [DONE].
app.post('/chat/stream', async (req, res) => {
  const { message } = req.body

  res.setHeader('Content-Type', 'text/event-stream')
  res.setHeader('Cache-Control', 'no-cache')
  res.setHeader('Connection', 'keep-alive')

  try {
    const stream = await ollama.chat({
      model: 'llama3.2',
      messages: [{ role: 'user', content: message }],
      stream: true
    })

    for await (const chunk of stream) {
      const content = chunk.message.content
      if (content) {
        res.write(`data: ${JSON.stringify({ content })}\n\n`)
      }
    }

    res.write('data: [DONE]\n\n')
  } catch (error) {
    // Headers are already sent, so report the failure as an SSE event
    // instead of letting the response hang on an unhandled rejection.
    res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`)
  } finally {
    res.end()
  }
})

app.listen(3000)

Next.js 流式响应

import ollama from 'ollama'

/**
 * Next.js route handler: proxies an Ollama chat stream as Server-Sent Events.
 *
 * @param {Request} request - JSON body: { message: string }.
 * @returns {Promise<Response>} A streaming text/event-stream response.
 */
export async function POST(request) {
  const { message } = await request.json()

  const stream = await ollama.chat({
    model: 'llama3.2',
    messages: [{ role: 'user', content: message }],
    stream: true
  })

  const encoder = new TextEncoder()

  const readable = new ReadableStream({
    async start(controller) {
      try {
        for await (const chunk of stream) {
          const content = chunk.message.content
          if (content) {
            controller.enqueue(encoder.encode(`data: ${JSON.stringify({ content })}\n\n`))
          }
        }
        controller.enqueue(encoder.encode('data: [DONE]\n\n'))
        controller.close()
      } catch (error) {
        // Propagate the failure so the client's reader rejects
        // instead of waiting forever on a stream that never closes.
        controller.error(error)
      }
    }
  })

  return new Response(readable, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
      'Connection': 'keep-alive'
    }
  })
}

AbortController 取消

// NOTE: ollama-js does not accept a fetch-style second `{ signal }` options
// argument — chat() takes a single request object, so the original signal was
// silently ignored and nothing was ever cancelled. Streamed requests are
// cancelled through the SDK itself: call `.abort()` on the returned stream
// (or `ollama.abort()` to cancel every in-flight streamed request).
async function streamWithAbort() {
  try {
    const stream = await ollama.chat({
      model: 'llama3.2',
      messages: [{ role: 'user', content: '写一篇长文章' }],
      stream: true
    })

    // Cancel this stream after 3 seconds.
    setTimeout(() => stream.abort(), 3000)

    for await (const chunk of stream) {
      if (chunk.message.content) {
        process.stdout.write(chunk.message.content)
      }
    }
  } catch (error) {
    // Aborting rejects the iteration with an AbortError.
    if (error.name === 'AbortError') {
      console.log('\n[已取消]')
    } else {
      throw error
    }
  }
}

streamWithAbort()