#!/usr/bin/env python3
"""
Monitoring and alerting setup script.

Generates the configuration for a Prometheus + Grafana monitoring stack
(Docker Compose file, Alertmanager config, Grafana provisioning files) and
optionally starts the stack with Docker Compose.
"""

import os
import sys
import subprocess
import json
import time
from pathlib import Path
from typing import List, Optional

# NOTE(review): the Grafana admin credentials below are hard-coded defaults;
# change them before exposing the stack beyond a local machine.
_COMPOSE_TEMPLATE = """version: '3.8'

services:
  # Prometheus 监控
  prometheus:
    image: prom/prometheus:latest
    container_name: kamaxitong-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert_rules.yml:/etc/prometheus/alert_rules.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    networks:
      - monitoring

  # Grafana 仪表板
  grafana:
    image: grafana/grafana:latest
    container_name: kamaxitong-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - monitoring

  # Node Exporter 系统监控
  node-exporter:
    image: prom/node-exporter:latest
    container_name: kamaxitong-node-exporter
    restart: unless-stopped
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - monitoring

  # AlertManager 告警管理
  alertmanager:
    image: prom/alertmanager:latest
    container_name: kamaxitong-alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    networks:
      - monitoring

  # Redis Exporter
  redis-exporter:
    image: oliver006/redis_exporter:latest
    container_name: kamaxitong-redis-exporter
    restart: unless-stopped
    ports:
      - "9121:9121"
    environment:
      - REDIS_ADDR=redis://redis:6379
    networks:
      - monitoring
    depends_on:
      - redis

  # Redis 数据库
  redis:
    image: redis:alpine
    container_name: kamaxitong-redis
    restart: unless-stopped
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    networks:
      - monitoring

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:
  redis_data:

networks:
  monitoring:
    driver: bridge
"""

# Alertmanager's email_config has no `subject`/`body` keys: the subject is set
# through `headers` and the plain-text body through `text`.
_ALERTMANAGER_TEMPLATE = """global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@yourcompany.com'
  smtp_auth_username: 'alerts@yourcompany.com'
  smtp_auth_password: 'your-password'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/alert'
        send_resolved: true

  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@yourcompany.com'
        headers:
          Subject: '【严重告警】KaMiXiTong 系统告警'
        text: |
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          时间: {{ .StartsAt }}
          级别: {{ .Labels.severity }}
          {{ end }}

  - name: 'warning-alerts'
    email_configs:
      - to: 'admin@yourcompany.com'
        headers:
          Subject: '【警告】KaMiXiTong 系统告警'
        text: |
          {{ range .Alerts }}
          告警: {{ .Annotations.summary }}
          描述: {{ .Annotations.description }}
          时间: {{ .StartsAt }}
          级别: {{ .Labels.severity }}
          {{ end }}

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
"""

_GRAFANA_DATASOURCE_TEMPLATE = """apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
"""

_GRAFANA_DASHBOARD_PROVIDER_TEMPLATE = """apiVersion: 1

providers:
  - name: 'KaMiXiTong'
    orgId: 1
    folder: 'KaMiXiTong'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards
"""


class MonitoringSetup:
    """Generate the monitoring stack configuration and optionally start it."""

    def __init__(self, monitoring_dir='monitoring', source_dir='monitoring'):
        """Set up the paths used by the other methods.

        Args:
            monitoring_dir: Directory that receives the generated files and
                is used as the working directory for Docker Compose.
            source_dir: Directory searched for ``prometheus.yml``,
                ``alert_rules.yml`` and ``grafana_dashboard.json``.
        """
        self.monitoring_dir = Path(monitoring_dir)
        self.source_dir = Path(source_dir)
        self.docker_compose_file = self.monitoring_dir / 'docker-compose.yml'
        # Overwritten by check_docker() with whichever variant is installed.
        self.compose_cmd: List[str] = ['docker-compose']

    @staticmethod
    def _probe_version(command: List[str]) -> Optional[str]:
        """Run a version command; return its stripped stdout, or None on failure.

        A missing executable and a non-zero exit status are both treated as
        "not available".
        """
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError:
            return None
        if result.returncode != 0:
            return None
        return result.stdout.strip()

    @staticmethod
    def _write_utf8(path: Path, content: str) -> None:
        """Write ``content`` to ``path`` as UTF-8.

        The generated files contain non-ASCII text, so the platform default
        encoding must not be relied upon.
        """
        path.write_text(content, encoding='utf-8')

    def _copy_config(self, src: Path, dst: Path) -> bool:
        """Copy one config file; return True when ``dst`` is in place afterwards."""
        if not src.exists():
            print(f"⚠️  未找到 {src},已跳过")
            return False
        # With the default directories source and destination are the same
        # file; rewriting it onto itself would be pointless.
        if src.resolve() != dst.resolve():
            self._write_utf8(dst, src.read_text(encoding='utf-8'))
        return True

    def check_docker(self) -> bool:
        """Check that Docker and Docker Compose are available.

        Tries the standalone ``docker-compose`` binary first and then the
        ``docker compose`` plugin; the variant that works is stored in
        ``self.compose_cmd`` for later use.

        Returns:
            True when both Docker and a Compose variant respond, else False.
        """
        print("🔍 检查 Docker 环境...")

        docker_version = self._probe_version(['docker', '--version'])
        if docker_version is None:
            print("❌ Docker 未安装")
            print("\n请安装 Docker: https://docs.docker.com/get-docker/")
            return False
        print(f"✅ Docker: {docker_version}")

        candidates = (
            (['docker-compose'], ['--version']),
            (['docker', 'compose'], ['version']),
        )
        for base_cmd, version_args in candidates:
            compose_version = self._probe_version(base_cmd + version_args)
            if compose_version is not None:
                self.compose_cmd = base_cmd
                print(f"✅ Docker Compose: {compose_version}")
                return True

        print("❌ Docker Compose 未安装")
        print("\n请安装 Docker Compose")
        return False

    def create_docker_compose(self) -> None:
        """Write ``docker-compose.yml`` into the monitoring directory."""
        print("\n📝 创建 Docker Compose 配置...")

        self.monitoring_dir.mkdir(parents=True, exist_ok=True)
        self._write_utf8(self.docker_compose_file, _COMPOSE_TEMPLATE)

        print(f"✅ Docker Compose 配置已创建: {self.docker_compose_file}")

    def create_alertmanager_config(self) -> None:
        """Write ``alertmanager.yml`` into the monitoring directory."""
        print("\n📝 创建 AlertManager 配置...")

        # Do not depend on create_docker_compose() having run first.
        self.monitoring_dir.mkdir(parents=True, exist_ok=True)
        alertmanager_config = self.monitoring_dir / 'alertmanager.yml'
        self._write_utf8(alertmanager_config, _ALERTMANAGER_TEMPLATE)

        print(f"✅ AlertManager 配置已创建: {alertmanager_config}")

    def create_grafana_provisioning(self) -> None:
        """Write the Grafana data source and dashboard provider files.

        Each file goes into its own ``datasources/`` or ``dashboards/``
        subdirectory of ``grafana/provisioning``, which is where Grafana
        looks for provisioning files.
        """
        print("\n📝 创建 Grafana 配置...")

        provisioning_dir = self.monitoring_dir / 'grafana' / 'provisioning'
        datasources_dir = provisioning_dir / 'datasources'
        dashboards_dir = provisioning_dir / 'dashboards'
        datasources_dir.mkdir(parents=True, exist_ok=True)
        dashboards_dir.mkdir(parents=True, exist_ok=True)

        # Data source definition
        self._write_utf8(
            datasources_dir / 'datasources.yml',
            _GRAFANA_DATASOURCE_TEMPLATE,
        )

        # Dashboard provider definition
        self._write_utf8(
            dashboards_dir / 'dashboards.yml',
            _GRAFANA_DASHBOARD_PROVIDER_TEMPLATE,
        )

        print("✅ Grafana 配置已创建")

    def copy_monitoring_files(self) -> None:
        """Copy the Prometheus, alert-rule and dashboard files into place.

        Files are read from ``self.source_dir``. A missing file is reported
        and skipped; the compose file bind-mounts ``prometheus.yml`` and
        ``alert_rules.yml``, so the warning tells the user what to provide.
        """
        print("\n📁 复制监控配置文件...")

        # Prometheus configuration
        self._copy_config(
            self.source_dir / 'prometheus.yml',
            self.monitoring_dir / 'prometheus.yml',
        )

        # Alert rules
        self._copy_config(
            self.source_dir / 'alert_rules.yml',
            self.monitoring_dir / 'alert_rules.yml',
        )

        # Grafana dashboard
        grafana_dashboard_dir = (
            self.monitoring_dir / 'grafana' / 'provisioning' / 'dashboards'
        )
        grafana_dashboard_dir.mkdir(parents=True, exist_ok=True)
        self._copy_config(
            self.source_dir / 'grafana_dashboard.json',
            grafana_dashboard_dir / 'kamaxitong_dashboard.json',
        )

        print("✅ 配置文件已复制")

    def start_monitoring(self) -> bool:
        """Start the stack with ``<compose> up -d`` in the monitoring directory.

        The command runs with ``cwd`` set to the monitoring directory, so
        the working directory of this process is left untouched.

        Returns:
            True when the command exits with status 0. False when it exits
            non-zero or cannot be launched at all.
        """
        print("\n🚀 启动监控服务...")

        # Start the services
        print("启动 Docker Compose...")
        try:
            result = subprocess.run(
                [*self.compose_cmd, 'up', '-d'],
                cwd=str(self.monitoring_dir),
                capture_output=True,
                text=True,
            )
        except OSError as exc:
            # Raised when the executable or the working directory is missing.
            print(f"❌ 启动失败: {exc}")
            return False

        if result.returncode != 0:
            print(f"❌ 启动失败: {result.stderr}")
            return False

        print("✅ 监控服务已启动")
        return True

    def show_access_info(self) -> None:
        """Print service URLs, default credentials and common commands."""
        compose = ' '.join(self.compose_cmd)

        print("\n" + "=" * 60)
        print("✅ 监控服务配置完成!")
        print("=" * 60)

        print("\n📊 访问地址:")
        print(" Grafana 仪表板: http://localhost:3000")
        print(" 用户名: admin")
        print(" 密码: admin123")
        print("\n Prometheus: http://localhost:9090")
        print(" AlertManager: http://localhost:9093")
        print(" Node Exporter: http://localhost:9100")

        print("\n📋 常用命令:")
        print(f" 查看服务状态: {compose} ps")
        print(f" 查看日志: {compose} logs -f")
        print(f" 停止服务: {compose} down")
        print(f" 重启服务: {compose} restart")

        print("\n⚠️ 注意:")
        print(" - 首次启动 Grafana 需要导入仪表板")
        print(" - 定期备份 Grafana 数据")
        print(" - 配置邮件告警需要修改 alertmanager.yml")

    def run(self) -> None:
        """Run the whole flow: check, generate files, optionally start.

        Exits the process with status 1 when Docker/Compose is unavailable
        or when starting the services fails.
        """
        print("=" * 60)
        print("🔧 KaMiXiTong 监控告警配置工具")
        print("=" * 60)

        # Check Docker
        if not self.check_docker():
            sys.exit(1)

        # Generate configuration
        self.create_docker_compose()
        self.create_alertmanager_config()
        self.create_grafana_provisioning()
        self.copy_monitoring_files()

        # Ask whether to start the services
        print("\n是否启动监控服务? (y/N)")
        try:
            answer = input().strip().lower()
        except EOFError:
            # No interactive stdin: fall back to the default answer "no".
            answer = ''

        if answer != 'y':
            print("\n📋 手动启动命令:")
            print(f" cd {self.monitoring_dir}")
            print(f" {' '.join(self.compose_cmd)} up -d")
            return

        if not self.start_monitoring():
            print("❌ 启动失败,请检查错误信息")
            sys.exit(1)

        time.sleep(5)  # give the containers a moment to come up
        self.show_access_info()


def main():
    """Script entry point."""
    setup = MonitoringSetup()
    setup.run()


if __name__ == '__main__':
    main()