第一次提交

This commit is contained in:
2026-03-25 15:24:22 +08:00
commit 0f8ac68d4d
156 changed files with 42365 additions and 0 deletions

185
monitoring/alert_rules.yml Normal file
View File

@@ -0,0 +1,185 @@
# Prometheus alerting rules
# Alert thresholds and conditions for the KaMiXiTong system.
#
# Every rule follows the same shape: `expr` is the PromQL condition, `for` is
# how long the condition must hold before the alert fires, `labels.severity`
# classifies the alert, and `annotations` carry the human-readable text.
groups:
  - name: kamaxitong_alerts
    rules:
      # ==========================================
      # 🔥 High-priority alerts
      # ==========================================
      # Service unavailable
      # NOTE(review): `up == 0` matches every scrape job, not only the
      # application job, while the summary names the KaMiXiTong service —
      # confirm whether a job selector is wanted here.
      - alert: ServiceDown
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "KaMiXiTong 服务不可用"
          description: "KaMiXiTong 服务已宕机超过 0 分钟"
      # API response time too high (95th-percentile latency above 2 seconds)
      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 响应时间过长"
          description: "95% 的请求响应时间超过 2 秒,已持续 5 分钟"
      # Error rate too high
      # The threshold is a ratio (5xx requests / all requests), matching the
      # "10%" wording of the description; a bare rate() would be requests/s.
      - alert: HighErrorRate
        expr: >-
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 错误率过高"
          description: "5xx 错误率超过 10%,已持续 5 分钟"
      # Database connection failure
      - alert: DatabaseConnectionDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "数据库连接失败"
          description: "无法连接到数据库"
      # ==========================================
      # ⚠️ Medium-priority alerts
      # ==========================================
      # CPU usage too high
      # rate() rather than irate(): irate() only looks at the last two samples
      # in the window, which makes a `for: 10m` condition reset on any brief dip.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高"
          description: "CPU 使用率超过 80%,已持续 10 分钟"
      # Memory usage too high
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "内存使用率超过 85%,已持续 10 分钟"
      # Disk usage too high
      # NOTE(review): no fstype/mountpoint selector, so every filesystem series
      # exported for the host is evaluated — confirm that is intended.
      - alert: HighDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))
          * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "磁盘使用率超过 90%,已持续 10 分钟"
      # Too many database connections (connected threads vs. max_connections)
      - alert: HighDatabaseConnections
        expr: >-
          mysql_global_status_threads_connected
          / mysql_global_variables_max_connections * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "数据库连接数过多"
          description: "数据库连接数使用率超过 80%"
      # Abnormal number of active sessions
      - alert: HighActiveSessions
        expr: flask_session_active > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "活跃会话数异常"
          description: "活跃会话数超过 1000"
      # ==========================================
      # 📊 Business-metric alerts
      # ==========================================
      # License-key verification failure ratio too high
      # Failed verifications divided by all verifications, so the 0.05
      # threshold is the 5% stated in the description.
      - alert: HighLicenseVerificationFailure
        expr: >-
          sum(rate(license_verification_total{status="failed"}[5m]))
          / sum(rate(license_verification_total[5m])) > 0.05
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "卡密验证失败率过高"
          description: "卡密验证失败率超过 5%"
      # Order payment failure ratio too high
      # Failed payments divided by all payments, so the 0.1 threshold is the
      # 10% stated in the description.
      - alert: HighPaymentFailureRate
        expr: >-
          sum(rate(payment_total{status="failed"}[5m]))
          / sum(rate(payment_total[5m])) > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "订单支付失败率过高"
          description: "订单支付失败率超过 10%"
      # Abnormal drop in the number of active license keys
      # Relative change against the value one hour ago; -0.2 means a 20% drop.
      # NOTE(review): assumes license_active_total is a gauge — confirm.
      - alert: AbnormalActiveLicenses
        expr: >-
          (license_active_total - license_active_total offset 1h)
          / (license_active_total offset 1h) < -0.2
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "活跃卡密数量异常下降"
          description: "活跃卡密数量在 1 小时内下降超过 20%"
      # ==========================================
      # 🔍 Security alerts
      # ==========================================
      # Frequent login failures (more than 0.5 failures per second over 5 minutes)
      - alert: HighLoginFailures
        expr: rate(login_failures_total[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "频繁登录失败"
          description: "登录失败率过高,可能存在暴力破解攻击"
      # API rate limit triggered (more than 10 triggers per second over 5 minutes)
      - alert: RateLimitTriggered
        expr: rate(rate_limit_triggered_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API 频率限制频繁触发"
          description: "API 频率限制触发次数过多,可能存在 API 滥用"
      # ==========================================
      # 💡 Informational alerts
      # ==========================================
      # Certificate about to expire (fewer than 30 days left; 86400 s = 1 day)
      - alert: SSLCertificateExpiring
        expr: (ssl_certificate_expiry_seconds - time()) / 86400 < 30
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "SSL 证书即将过期"
          description: "SSL 证书将在 30 天内过期"
      # Database backup failed (no successful backup within the last 86400 s)
      - alert: DatabaseBackupFailed
        expr: time() - database_backup_last_success_timestamp > 86400
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "数据库备份失败"
          description: "数据库备份已超过 24 小时未成功"

View File

@@ -0,0 +1 @@
{
  "dashboard": {
    "id": null,
    "title": "KaMiXiTong 系统监控",
    "tags": ["kamaxitong", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "系统概览",
        "type": "stat",
        "targets": [
          {"expr": "up{job=\"kamaxitong\"}", "legendFormat": "服务状态"}
        ],
        "fieldConfig": {
          "defaults": {
            "color": {"mode": "thresholds"},
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "green", "value": 1}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "API 响应时间",
        "type": "graph",
        "targets": [
          {"expr": "http_request_duration_seconds{quantile=\"0.50\"}", "legendFormat": "50%"},
          {"expr": "http_request_duration_seconds{quantile=\"0.95\"}", "legendFormat": "95%"},
          {"expr": "http_request_duration_seconds{quantile=\"0.99\"}", "legendFormat": "99%"}
        ]
      },
      {
        "id": 3,
        "title": "请求量",
        "type": "graph",
        "targets": [
          {"expr": "rate(http_requests_total[5m])", "legendFormat": "{{method}} {{status}}"}
        ]
      },
      {
        "id": 4,
        "title": "错误率",
        "type": "graph",
        "targets": [
          {"expr": "sum(rate(http_requests_total{status=~\"4..|5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100", "legendFormat": "错误率"}
        ]
      },
      {
        "id": 5,
        "title": "CPU 使用率",
        "type": "graph",
        "targets": [
          {"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "legendFormat": "CPU 使用率"}
        ]
      },
      {
        "id": 6,
        "title": "内存使用率",
        "type": "graph",
        "targets": [
          {"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", "legendFormat": "内存使用率"}
        ]
      },
      {
        "id": 7,
        "title": "磁盘使用率",
        "type": "graph",
        "targets": [
          {"expr": "(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100", "legendFormat": "{{mountpoint}}"}
        ]
      },
      {
        "id": 8,
        "title": "数据库连接数",
        "type": "graph",
        "targets": [
          {"expr": "mysql_global_status_threads_connected", "legendFormat": "活跃连接"},
          {"expr": "mysql_global_variables_max_connections", "legendFormat": "最大连接数"}
        ]
      },
      {
        "id": 9,
        "title": "卡密统计",
        "type": "stat",
        "targets": [
          {"expr": "license_total", "legendFormat": "总卡密数"},
          {"expr": "license_active", "legendFormat": "活跃卡密"},
          {"expr": "license_expired", "legendFormat": "过期卡密"}
        ]
      },
      {
        "id": 10,
        "title": "订单统计",
        "type": "stat",
        "targets": [
          {"expr": "order_total", "legendFormat": "总订单数"},
          {"expr": "order_paid", "legendFormat": "已支付"},
          {"expr": "order_pending", "legendFormat": "待支付"}
        ]
      }
    ],
    "time": {"from": "now-1h", "to": "now"},
    "refresh": "5s"
  }
}

56
monitoring/prometheus.yml Normal file
View File

@@ -0,0 +1,56 @@
# Prometheus monitoring configuration
# Monitors the performance and health of the KaMiXiTong system
global:
  # Global scrape interval
  scrape_interval: 15s
  # Interval at which alerting rules are evaluated
  evaluation_interval: 15s
# Alertmanager configuration: where fired alerts are sent
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
# Alerting rule files to load
rule_files:
  - 'alert_rules.yml'
# Scrape configuration: one entry per monitored target group
scrape_configs:
  # Prometheus self-monitoring
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"
  # KaMiXiTong application metrics
  - job_name: "kamaxitong"
    metrics_path: "/api/v1/metrics"
    scrape_interval: 15s
    scrape_timeout: 10s
    static_configs:
      - targets:
          - "localhost:5000"
  # Host metrics (requires node_exporter to be installed)
  - job_name: "node"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9100"
  # Database metrics (requires mysqld_exporter or postgres_exporter)
  - job_name: "mysql"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9104"
  # Redis metrics (requires redis_exporter to be installed)
  - job_name: "redis"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9121"
  # Nginx metrics (requires nginx-prometheus-exporter to be installed)
  - job_name: "nginx"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "localhost:9113"