# Kamixitong/monitoring/alert_rules.yml
# 2025-12-12 11:35:14 +08:00
#
# 186 lines
# 5.8 KiB
# YAML

# Prometheus alerting rules.
# Defines the alert thresholds and conditions for the KaMiXiTong system.
#
# Layout: a single rule group whose rules are ordered by priority
# (critical -> warning -> business -> security -> informational).
groups:
  # NOTE(review): the group name is spelled "kamaxitong" while the project is
  # "KaMiXiTong"; it is left unchanged here because renaming a group is a
  # visible change for anything that refers to the group by name.
  - name: kamaxitong_alerts
    rules:
      # ==========================================
      # 🔥 High-priority alerts
      # ==========================================

      # Service unavailable: fires as soon as any scrape target reports down
      # (for: 0m means no waiting period).
      - alert: ServiceDown
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "KaMiXiTong 服务不可用"
          description: "KaMiXiTong 服务已宕机超过 0 分钟"

      # API response time too high: the pre-computed 0.95 quantile series
      # stays above 2 seconds for 5 minutes.
      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 响应时间过长"
          description: "95% 的请求响应时间超过 2 秒,已持续 5 分钟"

      # Error rate too high: share of 5xx responses among all requests.
      # The threshold is a ratio (0.1 == 10%), so the 5xx rate is divided by
      # the total request rate; a bare rate() would be requests per second,
      # not the percentage the description states.
      - alert: HighErrorRate
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 错误率过高"
          description: "5xx 错误率超过 10%,已持续 5 分钟"

      # Database connection failure: fires immediately when mysql_up is 0.
      - alert: DatabaseConnectionDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接失败"
          description: "无法连接到数据库"

      # ==========================================
      # ⚠️ Medium-priority alerts
      # ==========================================

      # CPU usage too high: 100 minus the average idle percentage per
      # instance. rate() is used instead of irate() because irate() only
      # looks at the last two samples, which makes a long `for:` window
      # reset on brief dips.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率超过 80%,已持续 10 分钟"

      # Memory usage too high: percentage of memory that is not available.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "内存使用率超过 85%,已持续 10 分钟"

      # Disk usage too high: percentage of filesystem space not available.
      # NOTE(review): no fstype/mountpoint filter is applied, so every
      # filesystem series that is exported is evaluated - confirm this is
      # intended.
      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "磁盘使用率超过 90%,已持续 10 分钟"

      # Too many database connections: connected threads as a percentage of
      # the configured max_connections.
      - alert: HighDatabaseConnections
        expr: mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "数据库连接数过多"
          description: "数据库连接数使用率超过 80%"

      # Abnormal number of active sessions.
      - alert: HighActiveSessions
        expr: flask_session_active > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "活跃会话数异常"
          description: "活跃会话数超过 1000"

      # ==========================================
      # 📊 Business-metric alerts
      # ==========================================

      # License-key verification failure ratio too high (0.05 == 5%).
      # NOTE(review): assumes license_verification_total counts every
      # verification attempt and distinguishes outcomes via the `status`
      # label - confirm against the exporter.
      - alert: HighLicenseVerificationFailure
        expr: |
          sum by (job) (rate(license_verification_total{status="failed"}[5m]))
            /
          sum by (job) (rate(license_verification_total[5m]))
            > 0.05
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "卡密验证失败率过高"
          description: "卡密验证失败率超过 5%"

      # Order payment failure ratio too high (0.1 == 10%).
      # NOTE(review): assumes payment_total counts every payment attempt and
      # distinguishes outcomes via the `status` label - confirm against the
      # exporter.
      - alert: HighPaymentFailureRate
        expr: |
          sum by (job) (rate(payment_total{status="failed"}[5m]))
            /
          sum by (job) (rate(payment_total[5m]))
            > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "订单支付失败率过高"
          description: "订单支付失败率超过 10%"

      # Abnormal drop in active license keys: relative change against the
      # value one hour ago is below -20%. PromQL has no change() function,
      # so the relative change is computed explicitly with `offset`.
      - alert: AbnormalActiveLicenses
        expr: |
          (license_active_total - license_active_total offset 1h)
            /
          (license_active_total offset 1h)
            < -0.2
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "活跃卡密数量异常下降"
          description: "活跃卡密数量在 1 小时内下降超过 20%"

      # ==========================================
      # 🔍 Security alerts
      # ==========================================

      # Frequent login failures: more than 0.5 failures per second averaged
      # over 5 minutes.
      - alert: HighLoginFailures
        expr: rate(login_failures_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "频繁登录失败"
          description: "登录失败率过高,可能存在暴力破解攻击"

      # API rate limiting triggered: more than 10 triggers per second
      # averaged over 5 minutes.
      - alert: RateLimitTriggered
        expr: rate(rate_limit_triggered_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 频率限制频繁触发"
          description: "API 频率限制触发次数过多,可能存在 API 滥用"

      # ==========================================
      # 💡 Informational alerts
      # ==========================================

      # Certificate about to expire: fewer than 30 days remain. The
      # expression subtracts time(), i.e. it treats the metric as an expiry
      # Unix timestamp.
      # NOTE(review): confirm the exporter really emits a timestamp rather
      # than a remaining-seconds value.
      - alert: SSLCertificateExpiring
        expr: (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "SSL 证书即将过期"
          description: "SSL 证书将在 30 天内过期"

      # Database backup failed: last successful backup is more than
      # 24 hours (86400 seconds) old.
      - alert: DatabaseBackupFailed
        expr: time() - database_backup_last_success_timestamp > 86400
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "数据库备份失败"
          description: "数据库备份已超过 24 小时未成功"