186 lines
5.8 KiB
YAML
186 lines
5.8 KiB
YAML
# Prometheus alerting rules.
# Defines the alert thresholds and conditions for the KaMiXiTong system.
#
# Conventions used below:
#   - `for` is how long `expr` must stay true before the alert fires.
#   - `severity` is the routing label (critical / warning / info).
#   - `summary` / `description` are the user-facing notification texts.

groups:
  # NOTE(review): the group name is spelled "kamaxitong" while the product is
  # "KaMiXiTong"; kept as-is in case something references it - confirm.
  - name: kamaxitong_alerts
    rules:
      # ==========================================
      # 🔥 High-priority alerts
      # ==========================================

      # Service unavailable: any scrape target reporting up == 0.
      # Fires immediately (for: 0m).
      - alert: ServiceDown
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "KaMiXiTong 服务不可用"
          description: "KaMiXiTong 服务已宕机超过 0 分钟"

      # API latency too high: the 0.95 quantile series exceeds 2 (seconds,
      # per the metric name) for 5 minutes.
      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 响应时间过长"
          description: "95% 的请求响应时间超过 2 秒,已持续 5 分钟"

      # Error rate too high: share of 5xx responses among all requests over
      # the last 5 minutes exceeds 10%. The ratio (rather than the raw 5xx
      # request rate) is what the description's "10%" refers to.
      # NOTE(review): aggregated over all series; add `by (...)` if a
      # per-instance or per-job ratio is wanted.
      - alert: HighErrorRate
        expr: >-
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 错误率过高"
          description: "5xx 错误率超过 10%,已持续 5 分钟"

      # Database connection down: mysql_up == 0. Fires immediately.
      - alert: DatabaseConnectionDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接失败"
          description: "无法连接到数据库"

      # ==========================================
      # ⚠️ Medium-priority alerts
      # ==========================================

      # CPU usage too high: 100 minus the per-instance average idle
      # percentage exceeds 80 for 10 minutes. rate() is used instead of
      # irate() so that a single noisy sample pair does not reset `for`.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率超过 80%,已持续 10 分钟"

      # Memory usage too high: (1 - available / total) * 100 exceeds 85
      # for 10 minutes.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "内存使用率超过 85%,已持续 10 分钟"

      # Disk usage too high: (1 - available / size) * 100 exceeds 90 for
      # 10 minutes. Evaluated for every filesystem series exported.
      - alert: HighDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))
          * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "磁盘使用率超过 90%,已持续 10 分钟"

      # Too many database connections: connected threads as a percentage of
      # max_connections exceeds 80 for 10 minutes.
      - alert: HighDatabaseConnections
        expr: >-
          mysql_global_status_threads_connected
          / mysql_global_variables_max_connections * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "数据库连接数过多"
          description: "数据库连接数使用率超过 80%"

      # Abnormal number of active sessions: gauge above 1000 for 10 minutes.
      - alert: HighActiveSessions
        expr: flask_session_active > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "活跃会话数异常"
          description: "活跃会话数超过 1000"

      # ==========================================
      # 📊 Business-metric alerts
      # ==========================================

      # License-key verification failure ratio too high: failed
      # verifications as a share of all verifications over 5 minutes
      # exceeds 5%.
      # NOTE(review): assumes license_verification_total also counts
      # non-failed outcomes under other `status` values - confirm.
      - alert: HighLicenseVerificationFailure
        expr: >-
          sum(rate(license_verification_total{status="failed"}[5m]))
          / sum(rate(license_verification_total[5m])) > 0.05
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "卡密验证失败率过高"
          description: "卡密验证失败率超过 5%"

      # Order payment failure ratio too high: failed payments as a share of
      # all payments over 5 minutes exceeds 10%.
      # NOTE(review): assumes payment_total also counts non-failed outcomes
      # under other `status` values - confirm.
      - alert: HighPaymentFailureRate
        expr: >-
          sum(rate(payment_total{status="failed"}[5m]))
          / sum(rate(payment_total[5m])) > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "订单支付失败率过高"
          description: "订单支付失败率超过 10%"

      # Abnormal drop in active license keys: the value is more than 20%
      # below what it was one hour ago, sustained for 30 minutes.
      # PromQL has no `change()` function, so the relative change is
      # computed explicitly against the `offset 1h` sample.
      - alert: AbnormalActiveLicenses
        expr: >-
          (license_active_total - license_active_total offset 1h)
          / (license_active_total offset 1h) < -0.2
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "活跃卡密数量异常下降"
          description: "活跃卡密数量在 1 小时内下降超过 20%"

      # ==========================================
      # 🔍 Security alerts
      # ==========================================

      # Frequent login failures: more than 0.5 failures per second
      # (5-minute rate) for 5 minutes.
      - alert: HighLoginFailures
        expr: rate(login_failures_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "频繁登录失败"
          description: "登录失败率过高,可能存在暴力破解攻击"

      # API rate limiting triggered: more than 10 triggers per second
      # (5-minute rate) for 5 minutes.
      - alert: RateLimitTriggered
        expr: rate(rate_limit_triggered_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 频率限制频繁触发"
          description: "API 频率限制触发次数过多,可能存在 API 滥用"

      # ==========================================
      # 💡 Informational alerts
      # ==========================================

      # Certificate expiring soon: (expiry - now) / 86400 is below 30.
      # NOTE(review): presumes ssl_certificate_expiry_seconds is a Unix
      # timestamp of the expiry, not a remaining duration - confirm.
      - alert: SSLCertificateExpiring
        expr: (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "SSL 证书即将过期"
          description: "SSL 证书将在 30 天内过期"

      # Database backup failed: more than 86400 seconds (24 hours) since
      # the last successful backup timestamp.
      - alert: DatabaseBackupFailed
        expr: time() - database_backup_last_success_timestamp > 86400
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "数据库备份失败"
          description: "数据库备份已超过 24 小时未成功"