构建 RAG 应用

RAG(Retrieval-Augmented Generation)结合了检索和生成,让模型能基于外部知识回答问题。

RAG 基本原理

  1. 文档切分:将长文档分成小块
  2. 向量化:将文本转换为向量
  3. 存储:保存向量和原文
  4. 检索:根据问题找到相关文档
  5. 生成:结合相关文档生成回答

简单 RAG 实现

import ollama
import numpy as np
from typing import List, Dict

class SimpleRAG:
    """Minimal in-memory RAG pipeline.

    Documents are embedded via the Ollama embeddings API and kept in a
    plain list; retrieval ranks them by cosine similarity and the chat
    model answers questions grounded in the retrieved context.
    """

    def __init__(self, model='llama3.2', embed_model='nomic-embed-text'):
        # Chat model generates answers; embed model powers retrieval.
        self.model = model
        self.embed_model = embed_model
        # Each entry: {'text': str, 'embedding': list[float], 'metadata': dict}
        self.documents: List[Dict] = []

    def cosine_similarity(self, a, b):
        """Return the cosine of the angle between vectors *a* and *b*."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    def add_document(self, text: str, metadata: dict = None):
        """Embed *text* and store it alongside optional *metadata*."""
        embedding = ollama.embeddings(
            model=self.embed_model,
            prompt=text,
        )['embedding']

        self.documents.append({
            'text': text,
            'embedding': embedding,
            'metadata': metadata or {},
        })

    def add_documents(self, texts: List[str]):
        """Embed and store every string in *texts* (one document each)."""
        for item in texts:
            self.add_document(item)

    def search(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return the *top_k* stored documents most similar to *query*.

        Each result dict carries the original text, its similarity score,
        and the stored metadata.
        """
        query_embedding = ollama.embeddings(
            model=self.embed_model,
            prompt=query,
        )['embedding']

        # Stable sort by score, highest first; ties keep insertion order.
        scored = sorted(
            ((self.cosine_similarity(query_embedding, doc['embedding']), doc)
             for doc in self.documents),
            key=lambda pair: pair[0],
            reverse=True,
        )

        return [
            {'text': doc['text'], 'score': score, 'metadata': doc['metadata']}
            for score, doc in scored[:top_k]
        ]

    def query(self, question: str, top_k: int = 3) -> str:
        """Answer *question* using the most relevant stored documents."""
        hits = self.search(question, top_k)
        context = "\n\n".join(hit['text'] for hit in hits)

        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'system',
                    'content': '根据提供的上下文回答问题。如果上下文中没有相关信息,请说明。'
                },
                {
                    'role': 'user',
                    'content': f'上下文:\n{context}\n\n问题:{question}'
                },
            ],
        )

        return response['message']['content']

# Usage: index a few facts about Ollama, then ask a grounded question.
rag = SimpleRAG()

# Each string is embedded and stored as a separate retrievable document.
rag.add_documents([
    "Ollama 是一个本地运行大语言模型的工具,支持多种开源模型。",
    "Ollama 支持 Llama、Mistral、Gemma 等多种模型系列。",
    "Ollama 提供 REST API,可以通过 HTTP 请求调用。",
    "Ollama 支持流式输出,可以实时显示生成内容。"
])

# Retrieves the most similar documents and asks the chat model to answer
# using them as context.
answer = rag.query("Ollama 支持哪些模型?")
print(answer)

文档切分

from typing import List

def split_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into chunks of at most *chunk_size* characters.

    Chunks prefer to end on a sentence boundary ('。') or newline found
    within the window, and consecutive chunks overlap by roughly
    *overlap* characters so context is not lost at the cut points.

    Args:
        text: The input text; an empty string yields an empty list.
        chunk_size: Maximum characters per chunk.
        overlap: Desired character overlap between consecutive chunks.

    Returns:
        List of non-empty, stripped chunk strings.

    Bug fix: the original advanced with ``start = end - overlap``, which
    could move *backwards* (infinite loop) whenever a sentence boundary
    landed near ``start`` or ``overlap >= chunk_size``. We now always
    advance by at least one character.
    """
    chunks = []
    start = 0
    n = len(text)

    while start < n:
        end = start + chunk_size

        # Try to end the chunk at the last sentence/newline boundary
        # inside the window, so we don't cut mid-sentence.
        if end < n:
            last_period = text.rfind('。', start, end)
            last_newline = text.rfind('\n', start, end)
            split_point = max(last_period, last_newline)

            if split_point > start:
                end = split_point + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        if end >= n:
            break

        # Guarantee forward progress: never step back past the previous
        # start, even when overlap is large or the boundary was early.
        start = max(end - overlap, start + 1)

    return chunks

class DocumentRAG(SimpleRAG):
    """SimpleRAG extension that ingests whole text files as chunked documents."""

    def add_document_file(self, filepath: str, chunk_size: int = 500):
        """Read *filepath* (UTF-8), split it into chunks, and index each
        chunk with its source path and chunk index as metadata."""
        with open(filepath, 'r', encoding='utf-8') as fh:
            content = fh.read()

        for index, piece in enumerate(split_text(content, chunk_size)):
            self.add_document(piece, {'source': filepath, 'chunk': index})

# Usage: index a local file (requires 'document.txt' to exist), then query it.
rag = DocumentRAG()
rag.add_document_file('document.txt')

answer = rag.query("文档的主要内容是什么?")
print(answer)

带来源引用的 RAG

import ollama
import numpy as np

class RAGWithSources:
    """RAG pipeline that tracks where each document came from and asks the
    model to cite numbered sources ([1], [2], ...) in its answers."""

    def __init__(self, model='llama3.2', embed_model='nomic-embed-text'):
        self.model = model
        self.embed_model = embed_model
        # Each entry: {'text': str, 'embedding': list[float], 'source': str|None}
        self.documents = []

    def cosine_similarity(self, a, b):
        """Cosine similarity between two embedding vectors."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return np.dot(a, b) / denom

    def add_document(self, text, source=None):
        """Embed *text* and store it with an optional *source* label."""
        result = ollama.embeddings(
            model=self.embed_model,
            prompt=text,
        )

        self.documents.append({
            'text': text,
            'embedding': result['embedding'],
            'source': source,
        })

    def search(self, query, top_k=3):
        """Rank all stored documents against *query*; return the best *top_k*."""
        result = ollama.embeddings(
            model=self.embed_model,
            prompt=query,
        )
        q_vec = result['embedding']

        scores = [
            self.cosine_similarity(q_vec, doc['embedding'])
            for doc in self.documents
        ]
        # Stable argsort, highest score first; ties keep insertion order.
        order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

        return [
            {
                'text': self.documents[idx]['text'],
                'score': scores[idx],
                'source': self.documents[idx]['source'],
            }
            for idx in order[:top_k]
        ]

    def query_with_sources(self, question, top_k=3):
        """Answer *question*; return the answer, cited sources, and hits.

        Context passages are numbered [1], [2], ... so the model can
        reference them, and the same numbers label the returned sources.
        """
        hits = self.search(question, top_k)

        numbered_context = []
        sources = []
        for rank, hit in enumerate(hits, start=1):
            numbered_context.append(f"[{rank}] {hit['text']}")
            if hit['source']:
                sources.append(f"[{rank}] {hit['source']}")

        context = "\n\n".join(numbered_context)

        response = ollama.chat(
            model=self.model,
            messages=[
                {
                    'role': 'system',
                    'content': '根据提供的上下文回答问题,并在回答中引用来源编号(如[1]、[2])。'
                },
                {
                    'role': 'user',
                    'content': f'上下文:\n{context}\n\n问题:{question}'
                },
            ],
        )

        return {
            'answer': response['message']['content'],
            'sources': sources,
            'relevant_docs': hits,
        }

# Usage: add documents tagged with their source files, then ask a question
# and print both the cited answer and the list of sources.
rag = RAGWithSources()

rag.add_document(
    "Ollama 是一个本地运行大语言模型的工具。",
    source="ollama-intro.txt"
)
rag.add_document(
    "Ollama 支持 Llama、Mistral 等多种模型。",
    source="ollama-models.txt"
)

result = rag.query_with_sources("什么是 Ollama?")
print(f"回答: {result['answer']}")
print(f"\n来源: {', '.join(result['sources'])}")

对话式 RAG

class ConversationalRAG(RAGWithSources):
    """RAG that keeps chat history so follow-up questions retain context."""

    def __init__(self, model='llama3.2', embed_model='nomic-embed-text'):
        super().__init__(model, embed_model)
        # Alternating user/assistant turns, oldest first.
        self.conversation_history = []

    def chat(self, question, top_k=3):
        """Answer *question* using retrieval plus prior conversation turns.

        Returns a dict with the answer text and the sources of the
        retrieved documents.
        """
        hits = self.search(question, top_k)
        context = "\n\n".join(hit['text'] for hit in hits)

        messages = [{
            'role': 'system',
            'content': '你是一个助手,根据提供的上下文回答问题。保持回答简洁准确。'
        }]
        messages += self.conversation_history
        messages.append({
            'role': 'user',
            'content': f'参考信息:\n{context}\n\n问题:{question}'
        })

        reply = ollama.chat(model=self.model, messages=messages)['message']['content']

        # History records only the bare question, not the injected context.
        self.conversation_history += [
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': reply},
        ]

        return {
            'answer': reply,
            'sources': [hit['source'] for hit in hits if hit['source']],
        }

    def clear_history(self):
        """Forget all previous conversation turns."""
        self.conversation_history = []

# Usage: multi-turn conversation. The second question uses a pronoun
# ("它") that only resolves correctly because the first turn is kept
# in the conversation history.
rag = ConversationalRAG()

rag.add_document("Python 是一种高级编程语言,由 Guido van Rossum 创建。", source="python-intro.txt")
rag.add_document("Python 广泛用于 Web 开发、数据科学、人工智能等领域。", source="python-usage.txt")

result1 = rag.chat("Python 是什么?")
print(f"回答: {result1['answer']}")

result2 = rag.chat("它有哪些应用领域?")
print(f"回答: {result2['answer']}")