# Prometheus alerting rules
# Defines the alert thresholds and conditions for the KaMiXiTong system.
#
# Conventions used below:
#   - `for:` is how long the expression must stay true before the alert fires;
#     `for: 0m` fires on the first evaluation that matches.
#   - `severity` is a free-form label (critical / warning / info) presumably
#     consumed by Alertmanager routing — confirm against the Alertmanager config.

groups:
  # NOTE(review): the group name spells "kamaxitong" while the annotations say
  # "KaMiXiTong". Left unchanged in case dashboards or tooling reference it.
  - name: kamaxitong_alerts
    rules:
      # ==========================================
      # 🔥 High-priority alerts
      # ==========================================

      # Service unavailable.
      # NOTE(review): `up` has no job selector, so this fires for any scrape
      # target that is down — confirm that is intended.
      - alert: ServiceDown
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "KaMiXiTong 服务不可用"
          description: "KaMiXiTong 服务已宕机超过 0 分钟"

      # API response time too high (0.95 quantile above 2 seconds).
      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 响应时间过长"
          description: "95% 的请求响应时间超过 2 秒,已持续 5 分钟"

      # API error rate too high.
      # rate() on its own yields requests per second, not a ratio, so the 5xx
      # rate is divided by the total request rate to make the 0.1 threshold
      # mean 10% as the description states. A zero denominator gives NaN,
      # which never satisfies the comparison.
      # Assumes the selector without `status` covers all requests — confirm.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum by (job) (rate(http_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 错误率过高"
          description: "5xx 错误率超过 10%,已持续 5 分钟"

      # Database connection failure.
      - alert: DatabaseConnectionDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接失败"
          description: "无法连接到数据库"

      # ==========================================
      # ⚠️ Medium-priority alerts
      # ==========================================

      # CPU usage too high: 100 minus the per-instance average idle percentage.
      - alert: HighCPUUsage
        expr: |-
          100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率超过 80%,已持续 10 分钟"

      # Memory usage too high: share of total memory that is not "available".
      - alert: HighMemoryUsage
        expr: |-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "内存使用率超过 85%,已持续 10 分钟"

      # Disk usage too high.
      # NOTE(review): there is no fstype/mountpoint filter, so every filesystem
      # series is evaluated — confirm that is intended.
      - alert: HighDiskUsage
        expr: |-
          (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "磁盘使用率超过 90%,已持续 10 分钟"

      # Too many database connections: connected threads as a percentage of
      # max_connections.
      - alert: HighDatabaseConnections
        expr: |-
          mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "数据库连接数过多"
          description: "数据库连接数使用率超过 80%"

      # Abnormal number of active sessions.
      - alert: HighActiveSessions
        expr: flask_session_active > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "活跃会话数异常"
          description: "活跃会话数超过 1000"

      # ==========================================
      # 📊 Business-metric alerts
      # ==========================================

      # License-key verification failure ratio too high.
      # Failed rate divided by the total verification rate, so that 0.05 means
      # 5% as the description states (a bare rate() is events per second).
      # Assumes the selector without `status` covers all verifications — confirm.
      - alert: HighLicenseVerificationFailure
        expr: |-
          sum by (job) (rate(license_verification_total{status="failed"}[5m]))
          / sum by (job) (rate(license_verification_total[5m]))
          > 0.05
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "卡密验证失败率过高"
          description: "卡密验证失败率超过 5%"

      # Order payment failure ratio too high.
      # Failed rate divided by the total payment rate, so that 0.1 means 10%
      # as the description states.
      # Assumes the selector without `status` covers all payments — confirm.
      - alert: HighPaymentFailureRate
        expr: |-
          sum by (job) (rate(payment_total{status="failed"}[5m]))
          / sum by (job) (rate(payment_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "订单支付失败率过高"
          description: "订单支付失败率超过 10%"

      # Abnormal drop in the number of active license keys.
      # PromQL has no change() function, so the relative change is computed
      # against the value one hour ago; -0.2 therefore means a 20% drop.
      # Presumably license_active_total is a gauge — confirm.
      - alert: AbnormalActiveLicenses
        expr: |-
          (license_active_total - license_active_total offset 1h)
          / (license_active_total offset 1h)
          < -0.2
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "活跃卡密数量异常下降"
          description: "活跃卡密数量在 1 小时内下降超过 20%"

      # ==========================================
      # 🔍 Security alerts
      # ==========================================

      # Frequent login failures (more than 0.5 failures per second per series).
      - alert: HighLoginFailures
        expr: rate(login_failures_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "频繁登录失败"
          description: "登录失败率过高,可能存在暴力破解攻击"

      # API rate limiting triggered (more than 10 triggers per second per series).
      - alert: RateLimitTriggered
        expr: rate(rate_limit_triggered_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 频率限制频繁触发"
          description: "API 频率限制触发次数过多,可能存在 API 滥用"

      # ==========================================
      # 💡 Informational alerts
      # ==========================================

      # Certificate about to expire.
      # NOTE(review): the expression treats ssl_certificate_expiry_seconds as an
      # absolute Unix timestamp (it subtracts time()) — confirm the exporter
      # does not already report seconds remaining.
      - alert: SSLCertificateExpiring
        expr: (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "SSL 证书即将过期"
          description: "SSL 证书将在 30 天内过期"

      # Database backup failed: last successful backup is older than 86400 s.
      - alert: DatabaseBackupFailed
        expr: time() - database_backup_last_success_timestamp > 86400
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "数据库备份失败"
          description: "数据库备份已超过 24 小时未成功"