Kamixitong/scripts/setup_monitoring.py
2025-12-12 11:35:14 +08:00

386 lines
11 KiB
Python

#!/usr/bin/env python3
"""
监控告警配置脚本
自动安装和配置 Prometheus + Grafana 监控栈
"""
import os
import sys
import subprocess
import json
import time
from pathlib import Path
class MonitoringSetup:
    """Generate the config files for a Docker-based monitoring stack and start it.

    The stack described by the generated ``docker-compose.yml`` contains
    Prometheus, Grafana, Node Exporter, AlertManager, Redis Exporter and Redis.
    All files are written below ``monitoring_dir``.
    """

    def __init__(self, monitoring_dir='monitoring'):
        """Remember where the generated files live.

        Args:
            monitoring_dir: Directory that receives the generated files.
                Defaults to ``monitoring`` relative to the current directory.
        """
        self.monitoring_dir = Path(monitoring_dir)
        self.docker_compose_file = self.monitoring_dir / 'docker-compose.yml'
        # Base command used to invoke Compose; check_docker() replaces it with
        # whichever variant it actually found on this machine.
        self.compose_cmd = ['docker-compose']

    @staticmethod
    def _probe_version(command):
        """Return the stripped stdout of *command*, or None if it is unusable.

        "Unusable" means the executable could not be launched or it exited
        with a non-zero status.
        """
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError:
            return None
        if result.returncode != 0:
            return None
        return result.stdout.strip()

    @staticmethod
    def _write_config(path, content):
        """Write *content* to *path* as UTF-8, creating parent directories."""
        path.parent.mkdir(parents=True, exist_ok=True)
        # The configs contain non-ASCII text, so never rely on the locale's
        # default encoding.
        path.write_text(content, encoding='utf-8')

    def check_docker(self):
        """Check that Docker and a Compose implementation are available.

        Tries the standalone ``docker-compose`` binary first and then the
        ``docker compose`` plugin; the variant that works is stored in
        ``self.compose_cmd`` for later use.

        Returns:
            True when both Docker and Compose respond successfully,
            False otherwise.
        """
        print("🔍 检查 Docker 环境...")

        docker_version = self._probe_version(['docker', '--version'])
        if docker_version is None:
            print("❌ Docker 未安装")
            print("\n请安装 Docker: https://docs.docker.com/get-docker/")
            return False
        print(f"✅ Docker: {docker_version}")

        candidates = (
            (['docker-compose'], ['docker-compose', '--version']),
            (['docker', 'compose'], ['docker', 'compose', 'version']),
        )
        for base_cmd, version_cmd in candidates:
            compose_version = self._probe_version(version_cmd)
            if compose_version is not None:
                self.compose_cmd = base_cmd
                print(f"✅ Docker Compose: {compose_version}")
                return True

        print("❌ Docker Compose 未安装")
        print("\n请安装 Docker Compose")
        return False

    def create_docker_compose(self):
        """Write ``docker-compose.yml`` describing the monitoring services."""
        print("\n📝 创建 Docker Compose 配置...")
        compose_content = """version: '3.8'
services:
  # Prometheus 监控
  prometheus:
    image: prom/prometheus:latest
    container_name: kamaxitong-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert_rules.yml:/etc/prometheus/alert_rules.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    networks:
      - monitoring
  # Grafana 仪表板
  grafana:
    image: grafana/grafana:latest
    container_name: kamaxitong-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - monitoring
  # Node Exporter 系统监控
  node-exporter:
    image: prom/node-exporter:latest
    container_name: kamaxitong-node-exporter
    restart: unless-stopped
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - monitoring
  # AlertManager 告警管理
  alertmanager:
    image: prom/alertmanager:latest
    container_name: kamaxitong-alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
  # Redis Exporter
  redis-exporter:
    image: oliver006/redis_exporter:latest
    container_name: kamaxitong-redis-exporter
    restart: unless-stopped
    ports:
      - "9121:9121"
    environment:
      - REDIS_ADDR=redis://redis:6379
    networks:
      - monitoring
    depends_on:
      - redis
  # Redis 数据库
  redis:
    image: redis:alpine
    container_name: kamaxitong-redis
    restart: unless-stopped
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    networks:
      - monitoring
volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:
  redis_data:
networks:
  monitoring:
    driver: bridge
"""
        self._write_config(self.docker_compose_file, compose_content)
        print(f"✅ Docker Compose 配置已创建: {self.docker_compose_file}")

    def create_alertmanager_config(self):
        """Write ``alertmanager.yml`` with webhook and e-mail receivers.

        The SMTP settings and recipient addresses are placeholders that must
        be edited before e-mail alerts can be delivered.
        """
        print("\n📝 创建 AlertManager 配置...")
        alertmanager_config = self.monitoring_dir / 'alertmanager.yml'
        # email_configs has no `subject`/`body` keys: the subject is set via
        # `headers` and the plain-text body via `text`.
        config_content = """global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@yourcompany.com'
  smtp_auth_username: 'alerts@yourcompany.com'
  smtp_auth_password: 'your-password'
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/alert'
        send_resolved: true
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@yourcompany.com'
        headers:
          Subject: '【严重告警】KaMiXiTong 系统告警'
        text: |
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          时间: {{ .StartsAt }}
          级别: {{ .Labels.severity }}
          {{ end }}
  - name: 'warning-alerts'
    email_configs:
      - to: 'admin@yourcompany.com'
        headers:
          Subject: '【警告】KaMiXiTong 系统告警'
        text: |
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          时间: {{ .StartsAt }}
          级别: {{ .Labels.severity }}
          {{ end }}
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
"""
        self._write_config(alertmanager_config, config_content)
        print(f"✅ AlertManager 配置已创建: {alertmanager_config}")

    def create_grafana_provisioning(self):
        """Write Grafana provisioning files for the data source and dashboards.

        Grafana only scans the ``datasources/`` and ``dashboards/``
        sub-directories of its provisioning directory, so each file is placed
        in the matching sub-directory.
        """
        print("\n📝 创建 Grafana 配置...")
        provisioning_dir = self.monitoring_dir / 'grafana' / 'provisioning'

        # Data source definition.
        datasource_content = """apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
"""
        self._write_config(
            provisioning_dir / 'datasources' / 'datasources.yml',
            datasource_content,
        )

        # Dashboard provider definition.
        dashboard_content = """apiVersion: 1
providers:
  - name: 'KaMiXiTong'
    orgId: 1
    folder: 'KaMiXiTong'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards
"""
        self._write_config(
            provisioning_dir / 'dashboards' / 'dashboards.yml',
            dashboard_content,
        )
        print("✅ Grafana 配置已创建")

    def copy_monitoring_files(self):
        """Put the hand-maintained monitoring files where the stack expects them.

        ``prometheus.yml`` and ``alert_rules.yml`` are bind-mounted straight
        from ``monitoring_dir``, so they only need to exist there; a warning
        is printed for each one that is missing. ``grafana_dashboard.json``
        is copied into the Grafana dashboards provisioning directory when
        present.
        """
        print("\n📁 复制监控配置文件...")

        # These two files are mounted in place by docker-compose.yml; copying
        # them onto themselves would be a no-op, so just verify they exist.
        for name in ('prometheus.yml', 'alert_rules.yml'):
            required = self.monitoring_dir / name
            if not required.exists():
                print(f"⚠️ 未找到 {required},请在启动前创建该文件")

        # Copy the Grafana dashboard into the provisioned dashboards folder.
        dashboard_src = self.monitoring_dir / 'grafana_dashboard.json'
        dashboard_dir = self.monitoring_dir / 'grafana' / 'provisioning' / 'dashboards'
        dashboard_dir.mkdir(parents=True, exist_ok=True)
        if dashboard_src.exists():
            # Byte-for-byte copy avoids any encoding or newline translation.
            dashboard_dst = dashboard_dir / 'kamaxitong_dashboard.json'
            dashboard_dst.write_bytes(dashboard_src.read_bytes())

        print("✅ 配置文件已复制")

    def start_monitoring(self):
        """Start the stack with ``<compose> up -d`` inside ``monitoring_dir``.

        Returns:
            True when Compose exits with status 0, False otherwise.
        """
        print("\n🚀 启动监控服务...")
        print("启动 Docker Compose...")
        try:
            # Run in the monitoring directory via cwd= instead of os.chdir()
            # so the process-wide working directory is left untouched.
            result = subprocess.run(
                [*self.compose_cmd, 'up', '-d'],
                cwd=self.monitoring_dir,
                capture_output=True,
                text=True,
            )
        except OSError as exc:
            print(f"❌ 启动失败: {exc}")
            return False

        if result.returncode != 0:
            print(f"❌ 启动失败: {result.stderr}")
            return False
        print("✅ 监控服务已启动")
        return True

    def show_access_info(self):
        """Print service URLs, Grafana credentials and common Compose commands."""
        compose = ' '.join(self.compose_cmd)
        print("\n" + "=" * 60)
        print("✅ 监控服务配置完成!")
        print("=" * 60)
        print("\n📊 访问地址:")
        print(" Grafana 仪表板: http://localhost:3000")
        print(" 用户名: admin")
        print(" 密码: admin123")
        print("\n Prometheus: http://localhost:9090")
        print(" AlertManager: http://localhost:9093")
        print(" Node Exporter: http://localhost:9100")
        print("\n📋 常用命令:")
        print(f" 查看服务状态: {compose} ps")
        print(f" 查看日志: {compose} logs -f")
        print(f" 停止服务: {compose} down")
        print(f" 重启服务: {compose} restart")
        print("\n⚠️ 注意:")
        print(" - 首次启动 Grafana 需要导入仪表板")
        print(" - 定期备份 Grafana 数据")
        print(" - 配置邮件告警需要修改 alertmanager.yml")

    def run(self):
        """Run the interactive setup flow.

        Checks the Docker environment, generates every config file, then asks
        whether to start the services. Exits the process with status 1 when
        the environment check or the start-up fails.
        """
        print("=" * 60)
        print("🔧 KaMiXiTong 监控告警配置工具")
        print("=" * 60)

        # Check Docker.
        if not self.check_docker():
            sys.exit(1)

        # Generate configuration.
        self.create_docker_compose()
        self.create_alertmanager_config()
        self.create_grafana_provisioning()
        self.copy_monitoring_files()

        # Ask whether to start the services now.
        print("\n是否启动监控服务? (y/N)")
        try:
            answer = input()
        except EOFError:
            # Non-interactive run (stdin closed): fall back to the default "N".
            answer = ''

        if answer.strip().lower() == 'y':
            if self.start_monitoring():
                time.sleep(5)  # Give the containers a moment to come up.
                self.show_access_info()
            else:
                print("❌ 启动失败,请检查错误信息")
                sys.exit(1)
        else:
            print("\n📋 手动启动命令:")
            print(f" cd {self.monitoring_dir}")
            print(f" {' '.join(self.compose_cmd)} up -d")
def main():
    """Script entry point: build the setup helper and run its interactive flow."""
    MonitoringSetup().run()


if __name__ == '__main__':
    main()