第一次提交
This commit is contained in:
185
monitoring/alert_rules.yml
Normal file
185
monitoring/alert_rules.yml
Normal file
@@ -0,0 +1,185 @@
|
||||
# Prometheus 告警规则
|
||||
# 定义 KaMiXiTong 系统的告警阈值和条件
|
||||
|
||||
groups:
|
||||
- name: kamaxitong_alerts
|
||||
rules:
|
||||
# ==========================================
|
||||
# 🔥 高优先级告警
|
||||
# ==========================================
|
||||
|
||||
# Scrape target down: fires as soon as any scraped target reports up == 0.
# The expression is not restricted to one job, so the annotations name the
# failing job and instance instead of asserting that the application is down.
- alert: ServiceDown
  expr: up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "KaMiXiTong 监控目标不可用: {{ $labels.job }}"
    description: "采集目标 {{ $labels.instance }} (job: {{ $labels.job }}) 不可达 (up == 0)"
|
||||
|
||||
# Slow API responses: fires when the quantile="0.95" series of
# http_request_duration_seconds stays above 2 for 5 minutes.
# NOTE(review): relies on the application exporting pre-computed quantiles
# (summary-style metric) — confirm against the metrics endpoint.
- alert: HighResponseTime
  expr: 'http_request_duration_seconds{quantile="0.95"} > 2'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'API 响应时间过长'
    description: '95% 的请求响应时间超过 2 秒,已持续 5 分钟'
|
||||
|
||||
# High API error rate: share of requests answered with a 5xx status over the
# last 5 minutes, per job. Dividing the 5xx rate by the total request rate makes
# the 0.1 threshold a fraction of traffic (10%), as the description states,
# rather than an absolute 0.1 requests/second per series.
# With no traffic the division yields NaN, which never satisfies the comparison.
- alert: HighErrorRate
  expr: >-
    sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
    /
    sum by (job) (rate(http_requests_total[5m]))
    > 0.1
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "API 错误率过高"
    description: "5xx 错误率超过 10%,已持续 5 分钟"
|
||||
|
||||
# Database unreachable: fires immediately when mysql_up reports 0.
- alert: DatabaseConnectionDown
  expr: 'mysql_up == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '数据库连接失败'
    description: '无法连接到数据库'
|
||||
|
||||
# ==========================================
|
||||
# ⚠️ 中优先级告警
|
||||
# ==========================================
|
||||
|
||||
# High CPU usage: 100 minus the average idle percentage per instance, above 80
# for 10 minutes. rate() is used instead of irate(): irate() only looks at the
# last two samples in the window, which makes a sustained-load alert jumpy.
- alert: HighCPUUsage
  expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "CPU 使用率过高"
    description: "CPU 使用率超过 80%,已持续 10 分钟"
|
||||
|
||||
# High memory usage: percentage of memory not reported as available
# (1 - MemAvailable / MemTotal) above 85 for 10 minutes.
- alert: HighMemoryUsage
  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: '内存使用率过高'
    description: '内存使用率超过 85%,已持续 10 分钟'
|
||||
|
||||
# High disk usage: percentage of filesystem space not available
# (1 - avail / size) above 90 for 10 minutes.
# NOTE(review): the selector has no fstype/mountpoint filter, so every
# filesystem series the exporter reports is evaluated.
- alert: HighDiskUsage
  expr: '(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: '磁盘使用率过高'
    description: '磁盘使用率超过 90%,已持续 10 分钟'
|
||||
|
||||
# Too many database connections: connected threads as a percentage of
# max_connections above 80 for 10 minutes.
- alert: HighDatabaseConnections
  expr: 'mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100 > 80'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: '数据库连接数过多'
    description: '数据库连接数使用率超过 80%'
|
||||
|
||||
# Unusual number of active sessions: flask_session_active above 1000
# for 10 minutes.
- alert: HighActiveSessions
  expr: 'flask_session_active > 1000'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: '活跃会话数异常'
    description: '活跃会话数超过 1000'
|
||||
|
||||
# ==========================================
|
||||
# 📊 业务指标告警
|
||||
# ==========================================
|
||||
|
||||
# High license-key verification failure ratio: failed verifications divided by
# all verifications over the last 5 minutes. The ratio form makes the 0.05
# threshold mean 5%, as the description states, instead of 0.05 failures/second.
# NOTE(review): assumes license_verification_total also counts non-failed
# outcomes under other status label values — confirm against the exporter.
- alert: HighLicenseVerificationFailure
  expr: >-
    sum(rate(license_verification_total{status="failed"}[5m]))
    /
    sum(rate(license_verification_total[5m]))
    > 0.05
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "卡密验证失败率过高"
    description: "卡密验证失败率超过 5%"
|
||||
|
||||
# High payment failure ratio: failed payments divided by all payments over the
# last 5 minutes. The ratio form makes the 0.1 threshold mean 10%, as the
# description states, instead of 0.1 failures/second.
# NOTE(review): assumes payment_total also counts non-failed outcomes under
# other status label values — confirm against the exporter.
- alert: HighPaymentFailureRate
  expr: >-
    sum(rate(payment_total{status="failed"}[5m]))
    /
    sum(rate(payment_total[5m]))
    > 0.1
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "订单支付失败率过高"
    description: "订单支付失败率超过 10%"
|
||||
|
||||
# Abnormal drop in active license keys: relative change of the gauge compared
# with its value one hour earlier, below -0.2 (a drop of more than 20%) for
# 30 minutes. PromQL has no change() function; the relative difference is
# computed explicitly with the offset modifier.
# NOTE(review): the Grafana dashboard queries license_active, not
# license_active_total — confirm which name the application exports.
- alert: AbnormalActiveLicenses
  expr: >-
    (license_active_total - license_active_total offset 1h)
    /
    (license_active_total offset 1h)
    < -0.2
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "活跃卡密数量异常下降"
    description: "活跃卡密数量在 1 小时内下降超过 20%"
|
||||
|
||||
# ==========================================
|
||||
# 🔍 安全告警
|
||||
# ==========================================
|
||||
|
||||
# Frequent login failures: more than 0.5 failed logins per second, averaged
# over 5 minutes and sustained for 5 minutes.
# The annotations were spliced together in the source; they are restored here
# as two separate, correctly quoted keys.
- alert: HighLoginFailures
  expr: rate(login_failures_total[5m]) > 0.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "频繁登录失败"
    description: "登录失败率过高,可能存在暴力破解攻击"
|
||||
|
||||
# API rate limiting triggered often: more than 10 rate-limit hits per second,
# averaged over 5 minutes and sustained for 5 minutes.
- alert: RateLimitTriggered
  expr: 'rate(rate_limit_triggered_total[5m]) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'API 频率限制频繁触发'
    description: 'API 频率限制触发次数过多,可能存在 API 滥用'
|
||||
|
||||
# ==========================================
|
||||
# 💡 信息性告警
|
||||
# ==========================================
|
||||
|
||||
# Certificate close to expiry: fires when the remaining lifetime, converted
# from seconds to days (/ 86400), drops below 30.
# NOTE(review): assumes ssl_certificate_expiry_seconds is the Unix timestamp
# of expiry, not a remaining duration — confirm against the exporter.
- alert: SSLCertificateExpiring
  expr: '(ssl_certificate_expiry_seconds - time()) / 86400 < 30'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: 'SSL 证书即将过期'
    description: 'SSL 证书将在 30 天内过期'
|
||||
|
||||
# Database backup overdue: fires when more than 86400 seconds (24 hours) have
# passed since database_backup_last_success_timestamp.
# NOTE(review): if the metric is never exported the expression returns no
# series and the alert cannot fire — confirm the metric is always present.
- alert: DatabaseBackupFailed
  expr: 'time() - database_backup_last_success_timestamp > 86400'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: '数据库备份失败'
    description: '数据库备份已超过 24 小时未成功'
|
||||
1
monitoring/grafana_dashboard.json
Normal file
1
monitoring/grafana_dashboard.json
Normal file
@@ -0,0 +1 @@
|
||||
{
  "dashboard": {
    "id": null,
    "title": "KaMiXiTong 系统监控",
    "tags": ["kamaxitong", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "系统概览",
        "type": "stat",
        "targets": [
          {"expr": "up{job=\"kamaxitong\"}", "legendFormat": "服务状态"}
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "thresholds"},
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "green", "value": 1}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "API 响应时间",
        "type": "graph",
        "targets": [
          {"expr": "http_request_duration_seconds{quantile=\"0.50\"}", "legendFormat": "50%"},
          {"expr": "http_request_duration_seconds{quantile=\"0.95\"}", "legendFormat": "95%"},
          {"expr": "http_request_duration_seconds{quantile=\"0.99\"}", "legendFormat": "99%"}
        ]
      },
      {
        "id": 3,
        "title": "请求量",
        "type": "graph",
        "targets": [
          {"expr": "rate(http_requests_total[5m])", "legendFormat": "{{method}} {{status}}"}
        ]
      },
      {
        "id": 4,
        "title": "错误率",
        "type": "graph",
        "targets": [
          {"expr": "sum(rate(http_requests_total{status=~\"4..|5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100", "legendFormat": "错误率"}
        ]
      },
      {
        "id": 5,
        "title": "CPU 使用率",
        "type": "graph",
        "targets": [
          {"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "CPU 使用率"}
        ]
      },
      {
        "id": 6,
        "title": "内存使用率",
        "type": "graph",
        "targets": [
          {"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "legendFormat": "内存使用率"}
        ]
      },
      {
        "id": 7,
        "title": "磁盘使用率",
        "type": "graph",
        "targets": [
          {"expr": "(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100", "legendFormat": "{{mountpoint}}"}
        ]
      },
      {
        "id": 8,
        "title": "数据库连接数",
        "type": "graph",
        "targets": [
          {"expr": "mysql_global_status_threads_connected", "legendFormat": "活跃连接"},
          {"expr": "mysql_global_variables_max_connections", "legendFormat": "最大连接数"}
        ]
      },
      {
        "id": 9,
        "title": "卡密统计",
        "type": "stat",
        "targets": [
          {"expr": "license_total", "legendFormat": "总卡密数"},
          {"expr": "license_active", "legendFormat": "活跃卡密"},
          {"expr": "license_expired", "legendFormat": "过期卡密"}
        ]
      },
      {
        "id": 10,
        "title": "订单统计",
        "type": "stat",
        "targets": [
          {"expr": "order_total", "legendFormat": "总订单数"},
          {"expr": "order_paid", "legendFormat": "已支付"},
          {"expr": "order_pending", "legendFormat": "待支付"}
        ]
      }
    ],
    "time": {"from": "now-1h", "to": "now"},
    "refresh": "5s"
  }
}
|
||||
56
monitoring/prometheus.yml
Normal file
56
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,56 @@
|
||||
# Prometheus monitoring configuration.
# Used to monitor the performance and health of the KaMiXiTong system.

global:
  # Default interval between scrapes of every target.
  scrape_interval: 15s
  # Interval at which alerting rules are evaluated.
  evaluation_interval: 15s

# Alertmanager instances that receive alerts from this server.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Alerting rule files to load.
rule_files:
  - "alert_rules.yml"

# Scrape jobs.
scrape_configs:
  # Prometheus monitoring itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # KaMiXiTong application metrics.
  - job_name: "kamaxitong"
    metrics_path: "/api/v1/metrics"
    scrape_interval: 15s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - "localhost:5000"

  # Host metrics (requires node_exporter to be installed).
  - job_name: "node"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9100"

  # Database metrics (requires mysqld_exporter or postgres_exporter).
  - job_name: "mysql"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9104"

  # Redis metrics (requires redis_exporter).
  - job_name: "redis"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9121"

  # Nginx metrics (requires nginx-prometheus-exporter).
  - job_name: "nginx"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9113"
|
||||
Reference in New Issue
Block a user