错误处理最佳实践

29.1 错误处理设计原则

1. 错误应该是可预期的

package main

import (
    "errors"
    "fmt"
)

// Sentinel errors that callers can match with errors.Is.
var (
    // ErrInvalidInput marks a value that failed validation.
    ErrInvalidInput = errors.New("无效的输入")
    // ErrNotFound marks a resource that does not exist.
    ErrNotFound = errors.New("资源不存在")
    // ErrPermission marks an operation rejected for lack of permission.
    ErrPermission = errors.New("权限不足")
)

// validateInput accepts any non-empty string of at most 100 bytes.
//
// It returns nil for valid input. Otherwise it returns an error that wraps
// ErrInvalidInput, so callers can detect the failure with errors.Is.
// The length limit is measured with len, i.e. in bytes, not in characters.
func validateInput(input string) error {
    const maxInputBytes = 100

    switch {
    case input == "":
        return fmt.Errorf("输入不能为空: %w", ErrInvalidInput)
    case len(input) > maxInputBytes:
        return fmt.Errorf("输入过长: %w", ErrInvalidInput)
    default:
        return nil
    }
}

// main validates an empty string and prints a message when the resulting
// error is (or wraps) ErrInvalidInput.
func main() {
    err := validateInput("")
    if err != nil && errors.Is(err, ErrInvalidInput) {
        fmt.Println("输入验证失败")
    }
}

2. 错误信息应该包含上下文

package main

import (
    "fmt"
)

// processFile always fails in this example. The returned error embeds the
// file name so the message carries the context of what was being processed.
func processFile(filename string) error {
    const format = "处理文件 %s 失败: 文件不存在"
    return fmt.Errorf(format, filename)
}

// main prints the error produced for config.json.
func main() {
    fmt.Println(processFile("config.json"))
}

3. 错误应该只处理一次

package main

import (
    "fmt"
    "log"
)

// processData always fails in this example and returns a fixed error.
func processData() error {
    err := fmt.Errorf("数据处理失败")
    return err
}

// main is the top of the call chain, so it is the one place where the error
// from processData is handled: it is logged exactly once.
//
// main has no result parameters, so the error cannot be returned from here;
// logging it and also passing it on would handle the same error twice.
func main() {
    if err := processData(); err != nil {
        log.Printf("错误: %v", err)
    }
}

29.2 错误日志记录

结构化日志

package main

import (
    "encoding/json"
    "fmt"
    "time"
)

// ErrorLog is one structured error record. The struct tags define the JSON
// field names; fields tagged omitempty are left out of the JSON when empty.
type ErrorLog struct {
    Timestamp  time.Time `json:"timestamp"`             // time attached to the record
    Level      string    `json:"level"`                 // severity label
    Error      string    `json:"error"`                 // error message text
    Component  string    `json:"component"`             // component that reported the error
    TraceID    string    `json:"trace_id"`              // identifier of the trace
    UserID     string    `json:"user_id,omitempty"`     // optional user identifier
    RequestID  string    `json:"request_id,omitempty"`  // optional request identifier
    StackTrace string    `json:"stack_trace,omitempty"` // optional stack trace text
}

// logError prints err to standard output as one JSON-encoded ErrorLog line.
//
// The record carries the current time, the level "error", the error text,
// the given component name and a freshly generated trace ID. A nil err is
// ignored because there is nothing to report. If the record cannot be
// encoded, a plain-text line containing both the encoding failure and the
// original error is printed instead, so the original error is never lost.
func logError(err error, component string) {
    if err == nil {
        return
    }

    entry := ErrorLog{
        Timestamp: time.Now(),
        Level:     "error",
        Error:     err.Error(),
        Component: component,
        TraceID:   generateTraceID(),
    }

    encoded, marshalErr := json.Marshal(entry)
    if marshalErr != nil {
        fmt.Printf("无法序列化错误日志: %v (原始错误: %v)\n", marshalErr, err)
        return
    }
    fmt.Println(string(encoded))
}

// generateTraceID builds a trace identifier of the form "trace-<n>", where
// <n> is the current Unix time in nanoseconds.
func generateTraceID() string {
    nanos := time.Now().UnixNano()
    return fmt.Sprintf("trace-%d", nanos)
}

// main logs a sample database error on behalf of the "UserService" component.
func main() {
    logError(fmt.Errorf("数据库连接失败"), "UserService")
}

分级日志

package main

import (
    "fmt"
    "log"
)

// LogLevel is the severity of a log message; larger values are more severe.
type LogLevel int

// Severity levels in ascending order, starting at zero.
const (
    LogLevelDebug LogLevel = iota
    LogLevelInfo
    LogLevelWarn
    LogLevelError
    LogLevelFatal
)

// Logger writes messages through the standard log package and drops every
// message whose severity is below its configured level.
type Logger struct {
    level LogLevel // lowest severity that is still written
}

// NewLogger returns a Logger that writes messages at level and above.
func NewLogger(level LogLevel) *Logger {
    l := new(Logger)
    l.level = level
    return l
}

// enabled reports whether a message of the given severity passes the filter.
func (l *Logger) enabled(severity LogLevel) bool {
    return severity >= l.level
}

// Debug writes msg with a "[DEBUG]" prefix if debug messages are enabled.
func (l *Logger) Debug(msg string) {
    if !l.enabled(LogLevelDebug) {
        return
    }
    log.Printf("[DEBUG] %s", msg)
}

// Info writes msg with an "[INFO]" prefix if info messages are enabled.
func (l *Logger) Info(msg string) {
    if !l.enabled(LogLevelInfo) {
        return
    }
    log.Printf("[INFO] %s", msg)
}

// Warn writes msg with a "[WARN]" prefix if warnings are enabled.
func (l *Logger) Warn(msg string) {
    if !l.enabled(LogLevelWarn) {
        return
    }
    log.Printf("[WARN] %s", msg)
}

// Error writes msg with an "[ERROR]" prefix if error messages are enabled.
func (l *Logger) Error(msg string) {
    if !l.enabled(LogLevelError) {
        return
    }
    log.Printf("[ERROR] %s", msg)
}

// Fatal writes msg with a "[FATAL]" prefix through log.Fatalf, which exits
// the program after writing. Nothing happens if the configured level is
// above LogLevelFatal.
func (l *Logger) Fatal(msg string) {
    if !l.enabled(LogLevelFatal) {
        return
    }
    log.Fatalf("[FATAL] %s", msg)
}

// main creates a logger at info level and writes one info, one warning and
// one error message through it.
func main() {
    appLog := NewLogger(LogLevelInfo)

    appLog.Info("应用启动")
    appLog.Warn("配置文件使用默认值")
    appLog.Error("数据库连接失败")
}

29.3 错误监控与告警

错误统计

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
    "time"
)

// ErrorStats counts recorded errors: a running total, a per-message count,
// and a bounded list of the most recently recorded messages.
type ErrorStats struct {
    mu           sync.RWMutex     // guards errorCounts and lastErrors
    totalCount   int64            // updated and read with sync/atomic only
    errorCounts  map[string]int64 // occurrences keyed by error message
    lastErrors   []string         // most recent messages, oldest first
    maxLastError int              // maximum length of lastErrors
}

// NewErrorStats returns an empty ErrorStats that remembers at most
// maxLastError recent messages.
func NewErrorStats(maxLastError int) *ErrorStats {
    return &ErrorStats{
        errorCounts:  make(map[string]int64),
        maxLastError: maxLastError,
    }
}

// Record counts one occurrence of err, keyed by its message, and appends the
// message to the recent list, dropping the oldest entry once the list grows
// beyond the configured maximum. A nil err is ignored; without this guard
// calling err.Error() would panic.
func (s *ErrorStats) Record(err error) {
    if err == nil {
        return
    }
    msg := err.Error() // evaluate once; it is used as key and as list entry

    atomic.AddInt64(&s.totalCount, 1)

    s.mu.Lock()
    defer s.mu.Unlock()

    s.errorCounts[msg]++
    s.lastErrors = append(s.lastErrors, msg)
    if len(s.lastErrors) > s.maxLastError {
        s.lastErrors = s.lastErrors[1:]
    }
}

// GetStats returns a copy of the per-message counts; modifying the returned
// map does not affect the ErrorStats.
func (s *ErrorStats) GetStats() map[string]int64 {
    s.mu.RLock()
    defer s.mu.RUnlock()

    snapshot := make(map[string]int64, len(s.errorCounts))
    for msg, count := range s.errorCounts {
        snapshot[msg] = count
    }
    return snapshot
}

// GetTotalCount returns the number of errors recorded so far.
func (s *ErrorStats) GetTotalCount() int64 {
    return atomic.LoadInt64(&s.totalCount)
}

// GetLastErrors returns a copy of the recent messages, oldest first;
// modifying the returned slice does not affect the ErrorStats.
func (s *ErrorStats) GetLastErrors() []string {
    s.mu.RLock()
    defer s.mu.RUnlock()

    recent := make([]string, len(s.lastErrors))
    copy(recent, s.lastErrors)
    return recent
}

// main records a handful of sample errors and prints the total, the
// per-message counts and the list of recent messages.
func main() {
    stats := NewErrorStats(10)

    messages := []string{
        "数据库连接失败",
        "超时错误",
        "数据库连接失败",
        "参数错误",
        "数据库连接失败",
    }

    for _, msg := range messages {
        // Pass the text as an argument, never as the format string, so a '%'
        // inside a message cannot be misread as a formatting verb.
        stats.Record(fmt.Errorf("%s", msg))
    }

    fmt.Printf("总错误数: %d\n", stats.GetTotalCount())
    fmt.Println("错误统计:")
    for msg, count := range stats.GetStats() {
        fmt.Printf("  %s: %d\n", msg, count)
    }
    fmt.Println("最近错误:", stats.GetLastErrors())
}

错误告警

package main

import (
    "fmt"
    "sync"
    "time"
)

// AlertLevel is the severity of an alert; larger values are more severe.
type AlertLevel int

// Alert severities in ascending order, starting at zero.
const (
    AlertLevelInfo AlertLevel = iota
    AlertLevelWarn
    AlertLevelError
    AlertLevelCritical
)

// Alert is a single notification handed to an AlertHandler.
type Alert struct {
    Level     AlertLevel        // severity of the alert
    Message   string            // human-readable description
    Timestamp time.Time         // time attached to the alert
    Metadata  map[string]string // free-form key/value details
}

// AlertHandler is implemented by anything that can consume an Alert.
type AlertHandler interface {
    // Handle processes one alert.
    Handle(alert Alert)
}

// ConsoleAlertHandler is an AlertHandler that prints alerts to standard output.
type ConsoleAlertHandler struct{}

// Handle prints the alert as "[LEVEL] message - timestamp". Levels other than
// warn, error and critical are printed as INFO.
func (h *ConsoleAlertHandler) Handle(alert Alert) {
    var label string
    switch alert.Level {
    case AlertLevelWarn:
        label = "WARN"
    case AlertLevelError:
        label = "ERROR"
    case AlertLevelCritical:
        label = "CRITICAL"
    default:
        label = "INFO"
    }
    fmt.Printf("[%s] %s - %v\n", label, alert.Message, alert.Timestamp)
}

// AlertManager fans an alert out to every registered AlertHandler.
type AlertManager struct {
    handlers []AlertHandler // registered handlers, in registration order
    mu       sync.RWMutex   // guards handlers
}

// NewAlertManager returns an AlertManager with no handlers registered.
func NewAlertManager() *AlertManager {
    return &AlertManager{
        handlers: make([]AlertHandler, 0),
    }
}

// AddHandler registers handler; it receives every alert sent afterwards.
func (m *AlertManager) AddHandler(handler AlertHandler) {
    m.mu.Lock()
    defer m.mu.Unlock()
    m.handlers = append(m.handlers, handler)
}

// SendAlert builds an Alert stamped with the current time and passes it to
// each registered handler in registration order.
//
// The handler list is snapshotted under the read lock and the handlers are
// invoked after the lock is released. Calling handlers while holding the lock
// would deadlock as soon as a handler calls AddHandler, and a slow handler
// would block every concurrent AddHandler call. Handlers registered while an
// alert is being dispatched only receive later alerts.
func (m *AlertManager) SendAlert(level AlertLevel, message string, metadata map[string]string) {
    alert := Alert{
        Level:     level,
        Message:   message,
        Timestamp: time.Now(),
        Metadata:  metadata,
    }

    // AddHandler only ever appends, so the elements visible through this
    // snapshot are never modified after the lock is released.
    m.mu.RLock()
    snapshot := m.handlers[:len(m.handlers):len(m.handlers)]
    m.mu.RUnlock()

    for _, handler := range snapshot {
        handler.Handle(alert)
    }
}

// main registers a console handler and sends one warning and one error alert.
func main() {
    manager := NewAlertManager()
    manager.AddHandler(&ConsoleAlertHandler{})

    memoryDetails := map[string]string{
        "usage":     "85%",
        "threshold": "80%",
    }
    manager.SendAlert(AlertLevelWarn, "内存使用率过高", memoryDetails)

    databaseDetails := map[string]string{
        "database": "mysql",
        "host":     "localhost:3306",
    }
    manager.SendAlert(AlertLevelError, "数据库连接失败", databaseDetails)
}

29.4 错误重试机制

指数退避重试

package main

import (
    "errors"
    "fmt"
    "math"
    "time"
)

// RetryConfig controls how RetryWithBackoff repeats a failing operation.
type RetryConfig struct {
    MaxAttempts  int           // total number of calls, including the first; must be >= 1
    InitialDelay time.Duration // pause after the first failed attempt
    MaxDelay     time.Duration // upper bound for the pause between attempts
    Multiplier   float64       // factor applied to the pause after every failed attempt
}

// DefaultRetryConfig returns a configuration with 3 attempts, an initial
// delay of 100ms that doubles after each failure, capped at 5s.
func DefaultRetryConfig() *RetryConfig {
    return &RetryConfig{
        MaxAttempts:  3,
        InitialDelay: 100 * time.Millisecond,
        MaxDelay:     5 * time.Second,
        Multiplier:   2.0,
    }
}

// RetryWithBackoff calls fn until it succeeds or config.MaxAttempts calls
// have been made, sleeping between attempts. The pause starts at
// config.InitialDelay, is multiplied by config.Multiplier after every failed
// attempt and never exceeds config.MaxDelay. A nil config selects
// DefaultRetryConfig.
//
// It returns nil as soon as fn succeeds. When every attempt fails, the
// returned error wraps the error of the last attempt. A MaxAttempts below 1
// is rejected up front: fn would never run, so there would be no error to
// wrap and the result would be a malformed "%!w(<nil>)" message.
func RetryWithBackoff(fn func() error, config *RetryConfig) error {
    if config == nil {
        config = DefaultRetryConfig()
    }
    if config.MaxAttempts < 1 {
        return fmt.Errorf("重试配置无效: MaxAttempts 必须至少为 1,当前为 %d", config.MaxAttempts)
    }

    var lastErr error
    delay := config.InitialDelay

    for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
        lastErr = fn()
        if lastErr == nil {
            return nil
        }
        if attempt == config.MaxAttempts {
            break // no point in sleeping after the final attempt
        }

        fmt.Printf("第 %d 次尝试失败,%v 后重试...\n", attempt, delay)
        time.Sleep(delay)
        delay = nextBackoffDelay(delay, config)
    }

    return fmt.Errorf("重试 %d 次后仍然失败: %w", config.MaxAttempts, lastErr)
}

// nextBackoffDelay grows current by config.Multiplier and caps the result at
// config.MaxDelay. The cap is applied before converting back to a Duration
// so that a very large product cannot overflow the conversion.
func nextBackoffDelay(current time.Duration, config *RetryConfig) time.Duration {
    grown := float64(current) * config.Multiplier
    return time.Duration(math.Min(grown, float64(config.MaxDelay)))
}

// main retries an operation that fails twice and succeeds on the third call,
// using the default retry configuration, and prints the outcome.
func main() {
    var attempts int
    operation := func() error {
        attempts++
        fmt.Printf("尝试第 %d 次操作...\n", attempts)
        if attempts >= 3 {
            return nil
        }
        return errors.New("操作失败")
    }

    if err := RetryWithBackoff(operation, DefaultRetryConfig()); err != nil {
        fmt.Println("最终失败:", err)
        return
    }
    fmt.Println("操作成功")
}

条件重试

package main

import (
    "errors"
    "fmt"
    "time"
)

// RetryableError marks the wrapped error as one that is worth retrying.
type RetryableError struct {
    Err error // the underlying error
}

// Error returns the message of the underlying error.
func (e *RetryableError) Error() string {
    return e.Err.Error()
}

// Unwrap returns the underlying error so errors.Is and errors.As can see it.
func (e *RetryableError) Unwrap() error {
    return e.Err
}

// IsRetryable reports whether err is a *RetryableError or wraps one anywhere
// in its chain. errors.As is used instead of a direct type assertion so that
// a retryable error remains recognisable after a caller has added context
// with fmt.Errorf and %w. A nil err is not retryable.
func IsRetryable(err error) bool {
    var target *RetryableError
    return errors.As(err, &target)
}

// RetryIfRetryable calls fn up to maxAttempts times, pausing one second
// between attempts, for as long as fn fails with an error that IsRetryable
// accepts.
//
// It returns nil as soon as fn succeeds and returns a non-retryable error
// immediately and unchanged. When every attempt fails with a retryable
// error, the returned error wraps the error of the last attempt. A
// maxAttempts below 1 is rejected up front: fn would never run, so there
// would be no error to wrap and the result would be a malformed
// "%!w(<nil>)" message.
func RetryIfRetryable(fn func() error, maxAttempts int) error {
    if maxAttempts < 1 {
        return fmt.Errorf("重试配置无效: maxAttempts 必须至少为 1,当前为 %d", maxAttempts)
    }

    var lastErr error

    for attempt := 1; attempt <= maxAttempts; attempt++ {
        lastErr = fn()
        if lastErr == nil {
            return nil
        }
        if !IsRetryable(lastErr) {
            return lastErr
        }

        // Only wait when another attempt is actually going to follow.
        if attempt < maxAttempts {
            fmt.Printf("可重试错误,第 %d 次重试...\n", attempt)
            time.Sleep(time.Second)
        }
    }

    return fmt.Errorf("重试 %d 次后失败: %w", maxAttempts, lastErr)
}

func main() {
    attempts := 0
    err := RetryIfRetryable(func() error {
        attempts++
        if attempts < 3 {
            return &RetryableError{Err: errors.New("临时错误")}
        }
        return nil
    }, 5)

    if err != nil {
        fmt.Println("失败:", err)
    } else {
        fmt.Println("成功")
    }

    // 测试不可重试错误
    err = RetryIfRetryable(func() error {
        return errors.New("永久错误")
    }, 5)

    if err != nil {
        fmt.Println("失败:", err)
    }
}

29.5 错误降级处理

降级策略

package main

import (
    "fmt"
    "time"
)

// FallbackStrategy selects what ExecuteWithFallback does when the primary
// operation fails.
type FallbackStrategy int

// Available fallback strategies.
const (
    FallbackStrategyCache   FallbackStrategy = iota // serve the cached value for the key
    FallbackStrategyDefault                         // serve a fixed default value
    FallbackStrategyEmpty                           // serve an empty value
)

// FallbackHandler runs an operation and degrades according to its strategy
// when the operation fails.
type FallbackHandler struct {
    strategy FallbackStrategy
    cache    map[string]string // values served by FallbackStrategyCache
}

// NewFallbackHandler returns a handler that uses strategy and whose cache is
// pre-populated with two sample users.
func NewFallbackHandler(strategy FallbackStrategy) *FallbackHandler {
    return &FallbackHandler{
        strategy: strategy,
        cache: map[string]string{
            "user:1": "Alice (缓存)",
            "user:2": "Bob (缓存)",
        },
    }
}

// ExecuteWithFallback calls primary and returns its result when it succeeds.
// When primary fails, the configured strategy decides the outcome:
//
//   - FallbackStrategyCache returns the cached value for key, or an error
//     wrapping the primary error when key is not cached;
//   - FallbackStrategyDefault returns a fixed default value;
//   - FallbackStrategyEmpty returns an empty string;
//   - any other strategy returns the primary error unchanged.
//
// The cache-miss error wraps the primary error with %w so the root cause is
// not lost and stays reachable through errors.Is and errors.As.
func (h *FallbackHandler) ExecuteWithFallback(key string, primary func() (string, error)) (string, error) {
    result, err := primary()
    if err == nil {
        return result, nil
    }

    fmt.Printf("主服务失败: %v,执行降级策略\n", err)

    switch h.strategy {
    case FallbackStrategyCache:
        cached, ok := h.cache[key]
        if !ok {
            return "", fmt.Errorf("缓存中也没有数据: %w", err)
        }
        fmt.Println("使用缓存数据")
        return cached, nil

    case FallbackStrategyDefault:
        fmt.Println("使用默认值")
        return "默认值", nil

    case FallbackStrategyEmpty:
        fmt.Println("返回空值")
        return "", nil
    }

    return "", err
}

// main runs a failing primary operation through a cache-backed fallback
// handler and prints whether the degradation succeeded.
func main() {
    handler := NewFallbackHandler(FallbackStrategyCache)

    failingPrimary := func() (string, error) {
        return "", fmt.Errorf("数据库连接失败")
    }

    result, err := handler.ExecuteWithFallback("user:1", failingPrimary)
    if err == nil {
        fmt.Println("降级成功:", result)
        return
    }
    fmt.Println("降级失败:", err)
}

熔断器模式

package main

import (
    "fmt"
    "sync"
    "time"
)

// CircuitState is the state of a CircuitBreaker.
type CircuitState int

// Circuit breaker states.
const (
    CircuitStateClosed   CircuitState = iota // requests pass through
    CircuitStateOpen                         // requests are rejected
    CircuitStateHalfOpen                     // trial requests pass through
)

// CircuitBreaker rejects calls after repeated failures and lets trial calls
// through again once a timeout has elapsed since the last failure.
type CircuitBreaker struct {
    mu               sync.RWMutex // guards every field below
    state            CircuitState
    failureCount     int
    successCount     int
    failureThreshold int           // failures that open the circuit
    successThreshold int           // half-open successes that close it again
    timeout          time.Duration // wait after the last failure before a trial call
    lastFailureTime  time.Time
}

// NewCircuitBreaker returns a closed circuit breaker with the given
// thresholds and timeout.
func NewCircuitBreaker(failureThreshold, successThreshold int, timeout time.Duration) *CircuitBreaker {
    return &CircuitBreaker{
        state:            CircuitStateClosed,
        failureThreshold: failureThreshold,
        successThreshold: successThreshold,
        timeout:          timeout,
    }
}

// Execute runs fn unless the circuit is open, records the outcome and returns
// fn's error unchanged. While the circuit is open and the timeout has not yet
// elapsed since the last failure, fn is not called and a rejection error is
// returned instead.
func (cb *CircuitBreaker) Execute(fn func() error) error {
    if err := cb.admit(); err != nil {
        return err
    }

    err := fn()
    cb.record(err)
    return err
}

// admit decides whether a call may proceed. The state check, the timeout
// check and the Open -> HalfOpen transition all happen under one write lock,
// so the decision is never based on a stale state and lastFailureTime is
// never read while another goroutine is writing it.
func (cb *CircuitBreaker) admit() error {
    cb.mu.Lock()
    defer cb.mu.Unlock()

    if cb.state != CircuitStateOpen {
        return nil
    }
    if time.Since(cb.lastFailureTime) <= cb.timeout {
        return fmt.Errorf("熔断器已打开,拒绝请求")
    }

    cb.state = CircuitStateHalfOpen
    fmt.Println("熔断器进入半开状态")
    return nil
}

// record updates the counters and the state after a call has finished with
// the given result.
func (cb *CircuitBreaker) record(err error) {
    cb.mu.Lock()
    defer cb.mu.Unlock()

    if err != nil {
        cb.failureCount++
        cb.successCount = 0
        cb.lastFailureTime = time.Now()

        // failureCount is not reset while the circuit is open or half-open,
        // so a failed trial call re-opens the circuit here as well.
        if cb.failureCount >= cb.failureThreshold {
            cb.state = CircuitStateOpen
            fmt.Println("熔断器打开")
        }
        return
    }

    cb.successCount++
    if cb.state != CircuitStateHalfOpen {
        // A success while closed clears the failure streak.
        cb.failureCount = 0
        return
    }

    if cb.successCount >= cb.successThreshold {
        cb.state = CircuitStateClosed
        cb.failureCount = 0
        cb.successCount = 0
        fmt.Println("熔断器恢复到关闭状态")
    }
}

// main sends ten requests, 100ms apart, through a circuit breaker. The
// simulated service fails on its first five calls and succeeds afterwards.
func main() {
    breaker := NewCircuitBreaker(3, 2, 5*time.Second)

    calls := 0
    request := func() error {
        calls++
        if calls > 5 {
            return nil
        }
        return fmt.Errorf("服务错误")
    }

    for round := 0; round < 10; round++ {
        if err := breaker.Execute(request); err != nil {
            fmt.Printf("请求失败: %v\n", err)
        } else {
            fmt.Println("请求成功")
        }
        time.Sleep(100 * time.Millisecond)
    }
}

29.6 错误恢复策略

优雅降级

package main

import (
    "fmt"
    "sync"
)

// Service is a named backend whose availability can be switched on and off.
type Service struct {
    name      string
    available bool
    mu        sync.RWMutex // guards available
}

// NewService returns an available service with the given name.
func NewService(name string) *Service {
    svc := new(Service)
    svc.name = name
    svc.available = true
    return svc
}

// SetAvailable marks the service as available or unavailable.
func (s *Service) SetAvailable(available bool) {
    s.mu.Lock()
    s.available = available
    s.mu.Unlock()
}

// IsAvailable reports whether the service is currently marked available.
func (s *Service) IsAvailable() bool {
    s.mu.RLock()
    up := s.available
    s.mu.RUnlock()
    return up
}

// Process returns a string combining the service name and data, or an error
// naming the service when it is unavailable.
func (s *Service) Process(data string) (string, error) {
    if s.IsAvailable() {
        return fmt.Sprintf("%s 处理: %s", s.name, data), nil
    }
    return "", fmt.Errorf("服务 %s 不可用", s.name)
}

// ServiceManager routes work to a primary service and falls back to a
// secondary one.
type ServiceManager struct {
    primary   *Service
    secondary *Service
}

// NewServiceManager returns a manager for the given primary and secondary
// services.
func NewServiceManager(primary, secondary *Service) *ServiceManager {
    sm := new(ServiceManager)
    sm.primary = primary
    sm.secondary = secondary
    return sm
}

// Process tries the primary service first and the secondary service second.
// A service is skipped when it reports itself unavailable. The first
// successful result is returned; when neither service produces one, a
// generic "all services unavailable" error is returned.
func (sm *ServiceManager) Process(data string) (string, error) {
    if primary := sm.primary; primary.IsAvailable() {
        if out, err := primary.Process(data); err != nil {
            fmt.Printf("主服务失败: %v,尝试备用服务\n", err)
        } else {
            return out, nil
        }
    }

    if secondary := sm.secondary; secondary.IsAvailable() {
        if out, err := secondary.Process(data); err != nil {
            fmt.Printf("备用服务失败: %v\n", err)
        } else {
            return out, nil
        }
    }

    return "", fmt.Errorf("所有服务都不可用")
}

// main processes the same payload twice: once with both services available
// and once after the primary service has been switched off.
func main() {
    primary := NewService("主服务")
    secondary := NewService("备用服务")
    manager := NewServiceManager(primary, secondary)

    // report runs one request through the manager and prints the outcome.
    report := func() {
        result, err := manager.Process("数据")
        if err != nil {
            fmt.Println("处理失败:", err)
            return
        }
        fmt.Println("处理成功:", result)
    }

    report()
    primary.SetAvailable(false)
    report()
}

29.7 完整的错误处理流程

package main

import (
    "encoding/json"
    "fmt"
    "log"
    "time"
)

// ErrorContext describes the circumstances under which an error occurred.
type ErrorContext struct {
    // Operation names the action, Resource the kind of object it acted on,
    // and ID the specific object.
    Operation, Resource, ID string
    // Timestamp is the time associated with the error.
    Timestamp time.Time
}

// ErrorProcessor combines logging, statistics and alerting into a single
// error-handling pipeline.
type ErrorProcessor struct {
    logger *Logger
    stats  *ErrorStats
    alert  *AlertManager
}

// NewErrorProcessor returns a processor with an info-level logger,
// statistics that remember the last 100 errors, and an alert manager without
// handlers.
func NewErrorProcessor() *ErrorProcessor {
    p := new(ErrorProcessor)
    p.logger = NewLogger(LogLevelInfo)
    p.stats = NewErrorStats(100)
    p.alert = NewAlertManager()
    return p
}

// ProcessError logs err together with the operation and resource from ctx,
// records it in the statistics and, when isCritical accepts it, sends a
// critical alert carrying the operation, resource and ID as metadata.
func (p *ErrorProcessor) ProcessError(err error, ctx ErrorContext) {
    summary := fmt.Sprintf("[%s] %s: %v", ctx.Operation, ctx.Resource, err)
    p.logger.Error(summary)
    p.stats.Record(err)

    if !isCritical(err) {
        return
    }

    details := map[string]string{
        "operation": ctx.Operation,
        "resource":  ctx.Resource,
        "id":        ctx.ID,
    }
    p.alert.SendAlert(AlertLevelCritical, "严重错误", details)
}

// isCritical reports whether err should trigger a critical alert. It is a
// placeholder that currently classifies no error as critical.
func isCritical(err error) bool {
    const critical = false
    return critical
}

func main() {
    processor := NewErrorProcessor()

    err := fmt.Errorf("数据库连接失败")
    ctx := ErrorContext{
        Operation: "query",
        Resource:  "user",
        ID:        "123",
        Timestamp: time.Now(),
    }

    processor.ProcessError(err, ctx)
}

29.8 小结

本章详细介绍了 Go 语言错误处理的最佳实践:

  1. 设计原则:错误应该可预期、包含上下文、只处理一次
  2. 日志记录:结构化日志、分级日志
  3. 监控告警:错误统计、错误告警
  4. 重试机制:指数退避重试、条件重试
  5. 降级处理:降级策略、熔断器模式
  6. 恢复策略:优雅降级、服务切换

良好的错误处理是构建可靠系统的基础。通过合理的错误设计、监控和处理机制,可以大大提高系统的稳定性和可维护性。在下一章中,我们将学习 Go 语言的包管理机制。