生产环境运维
1. 概述
生产环境运维是AI UI生成系统稳定运行的关键保障,涉及服务监控、日志管理、备份恢复、故障排查等多个方面。本章详细介绍生产环境的部署策略、监控体系、运维工具和最佳实践,包括完整的运维脚本和故障处理流程。
2. 生产环境架构
2.1 运维架构图
监控层 -> 应用层 -> 数据层 -> 基础设施层
↓ ↓ ↓ ↓
Prometheus FastAPI Redis Docker
Grafana Nginx Models Kubernetes
AlertManager Logs Storage Cloud
2.2 运维组件
# 生产环境运维组件
production_ops_components:
monitoring:
- prometheus: 指标收集
- grafana: 可视化监控
- alertmanager: 告警管理
- node_exporter: 系统指标
logging:
- fluentd: 日志收集
- elasticsearch: 日志存储
- kibana: 日志分析
- logrotate: 日志轮转
backup:
- backup_scripts: 备份脚本
- storage: 备份存储
- restore_scripts: 恢复脚本
security:
- firewall: 防火墙配置
- ssl_certificates: SSL证书
- access_control: 访问控制
- vulnerability_scanning: 漏洞扫描
3. 部署脚本详解
3.1 主部署脚本
#!/bin/bash
# scripts/deploy.sh - Production deployment script for the AI UI system.
# Usage: deploy.sh [version] [environment]
set -e

# --- Configuration ---
readonly PROJECT_NAME="ai-ui-system"
VERSION=${1:-"latest"}
ENVIRONMENT=${2:-"production"}
readonly BACKUP_DIR="/backup/${PROJECT_NAME}"
readonly LOG_DIR="/var/log/${PROJECT_NAME}"

# ANSI color codes for log levels.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Ensure the log directory exists before the first log call; previously
# check_environment logged before create_directories ran, so tee failed.
# Best-effort so a non-root dry run does not abort under `set -e`.
mkdir -p "${LOG_DIR}" 2>/dev/null || true

# Shared logger: colored level tag to stdout, copy appended to deploy.log.
_log() {
    local color=$1 level=$2 msg=$3
    echo -e "${color}[$(date '+%Y-%m-%d %H:%M:%S')] [${level}]${NC} ${msg}" | tee -a "${LOG_DIR}/deploy.log"
}
log_info()  { _log "${GREEN}"  "INFO"  "$1"; }
log_warn()  { _log "${YELLOW}" "WARN"  "$1"; }
log_error() { _log "${RED}"    "ERROR" "$1"; }
log_debug() { _log "${BLUE}"   "DEBUG" "$1"; }
# Verify the host meets minimum deployment requirements.
# Hard failures (wrong OS, low disk, missing docker tooling) exit non-zero;
# low memory only warns.
check_environment() {
    log_info "检查部署环境..."

    # Linux is a hard requirement.
    if [[ "$(uname -s)" != "Linux" ]]; then
        log_error "仅支持Linux系统"
        exit 1
    fi

    # Warn below 4GB RAM. Default to 0 so a failed probe does not crash `[[`.
    local memory_gb
    memory_gb=$(free -g | awk '/^Mem:/{print $2}')
    if [[ "${memory_gb:-0}" -lt 4 ]]; then
        log_warn "系统内存不足4GB,可能影响性能"
    fi

    # At least 20GB free on / is required. (df -BG is GNU coreutils syntax.)
    local disk_gb
    disk_gb=$(df -BG / | awk 'NR==2{print $4}' | sed 's/G//')
    if [[ "${disk_gb:-0}" -lt 20 ]]; then
        log_error "磁盘空间不足20GB"
        exit 1
    fi

    if ! command -v docker &> /dev/null; then
        log_error "Docker未安装"
        exit 1
    fi

    if ! command -v docker-compose &> /dev/null; then
        log_error "Docker Compose未安装"
        exit 1
    fi

    log_info "环境检查通过"
}
# Create the runtime, backup and config directory trees and hand the app
# tree to the container user (uid/gid 1000, matching the image).
create_directories() {
    log_info "创建必要目录..."

    mkdir -p "${BACKUP_DIR}"/{data,config,models,logs}
    mkdir -p "${LOG_DIR}"
    mkdir -p "/opt/${PROJECT_NAME}"/{config,data,models,output}
    mkdir -p "/etc/${PROJECT_NAME}"

    # uid 1000 is assumed to be the in-container application user.
    chown -R 1000:1000 "/opt/${PROJECT_NAME}"
    chmod -R 755 "/opt/${PROJECT_NAME}"

    log_info "目录创建完成"
}
# Archive the current /opt/<project> tree into BACKUP_DIR before deploying.
# A missing tree is not an error — there is simply nothing to back up.
backup_existing() {
    log_info "备份现有数据..."

    local app_root="/opt/${PROJECT_NAME}"
    if [[ ! -d "${app_root}" ]]; then
        log_info "未发现现有数据,跳过备份"
        return 0
    fi

    local stamp archive
    stamp="backup_$(date +%Y%m%d_%H%M%S)"
    archive="${BACKUP_DIR}/${stamp}.tar.gz"
    tar -czf "${archive}" -C /opt "${PROJECT_NAME}"
    log_info "备份完成: ${archive}"
}
# Copy build/runtime configuration into /opt/<project> and generate .env.
# Fix: build_image uses Dockerfile.gpu on GPU hosts, but it was never
# copied here, so GPU builds always failed.
deploy_config() {
    log_info "部署配置文件..."

    local app_root="/opt/${PROJECT_NAME}"
    cp -r config/* "${app_root}/config/"
    cp docker-compose.prod.yml "${app_root}/"
    cp Dockerfile "${app_root}/"
    # Optional GPU Dockerfile, consumed by build_image when a GPU is present.
    if [[ -f Dockerfile.gpu ]]; then
        cp Dockerfile.gpu "${app_root}/"
    fi

    # Generate the environment file consumed by docker-compose.
    cat > "${app_root}/.env" << EOF
# 环境配置
ENV=${ENVIRONMENT}
VERSION=${VERSION}
PROJECT_NAME=${PROJECT_NAME}
# 服务配置
API_HOST=0.0.0.0
API_PORT=8000
WORKERS=4
# 数据库配置
REDIS_HOST=redis
REDIS_PORT=6379
REDIS_PASSWORD=
# 日志配置
LOG_LEVEL=INFO
LOG_FILE=${LOG_DIR}/app.log
# 监控配置
PROMETHEUS_PORT=9090
GRAFANA_PORT=3000
# 备份配置
BACKUP_DIR=${BACKUP_DIR}
BACKUP_RETENTION_DAYS=30
EOF

    log_info "配置部署完成"
}
# Build the application image, preferring the GPU Dockerfile when a GPU
# is visible AND Dockerfile.gpu actually exists (the original only checked
# nvidia-smi and then failed if the file was missing).
build_image() {
    log_info "构建Docker镜像..."
    cd "/opt/${PROJECT_NAME}"

    if command -v nvidia-smi &> /dev/null && [[ -f Dockerfile.gpu ]]; then
        log_info "检测到GPU,使用GPU版本Dockerfile"
        docker build -f Dockerfile.gpu -t "${PROJECT_NAME}:${VERSION}" .
    else
        log_info "使用CPU版本Dockerfile"
        docker build -f Dockerfile -t "${PROJECT_NAME}:${VERSION}" .
    fi

    # Also tag the freshly built image as "latest" for the compose file.
    docker tag "${PROJECT_NAME}:${VERSION}" "${PROJECT_NAME}:latest"
    log_info "镜像构建完成"
}
# Replace the running stack: tear down whatever is up, then start fresh.
start_services() {
    log_info "启动服务..."
    cd "/opt/${PROJECT_NAME}"

    local compose_file="docker-compose.prod.yml"
    # Ignore failure when no previous stack exists.
    docker-compose -f "${compose_file}" down || true
    docker-compose -f "${compose_file}" up -d

    log_info "服务启动完成"
}
# Poll a health URL until it answers or the attempt budget is exhausted.
# $1=url  $2=attempts  $3=seconds between attempts  $4=label for log messages
# On timeout: dump service logs and exit 1 (same as the original behavior).
_wait_for_url() {
    local url=$1 attempts=$2 interval=$3 label=$4 i
    for (( i = 1; i <= attempts; i++ )); do
        if curl -f "${url}" &> /dev/null; then
            log_info "${label}已就绪"
            return 0
        fi
        log_info "等待${label}启动... (${i}/${attempts})"
        sleep "${interval}"
    done
    log_error "${label}启动超时"
    show_service_logs
    exit 1
}

# Wait for the main API and then Nginx to come up. The two near-identical
# polling loops of the original are factored into _wait_for_url.
wait_for_services() {
    log_info "等待服务就绪..."
    _wait_for_url "http://localhost:8000/health" 60 10 "主服务"
    _wait_for_url "http://localhost/health" 30 5 "Nginx"
}
# Dump the most recent container logs — used as a post-mortem aid on failure.
show_service_logs() {
    log_info "显示服务日志..."
    local compose_file="/opt/${PROJECT_NAME}/docker-compose.prod.yml"
    docker-compose -f "${compose_file}" logs --tail=50
}
# Post-deployment verification: container states, API health, metrics probe.
# Returns 1 when the API health endpoint fails; a metrics failure only warns.
health_check() {
    log_info "执行健康检查..."

    local compose_file="/opt/${PROJECT_NAME}/docker-compose.prod.yml"
    # Show container states for the operator.
    docker-compose -f "${compose_file}" ps

    if curl -f http://localhost/health; then
        log_info "API健康检查通过"
    else
        log_error "API健康检查失败"
        return 1
    fi

    # NOTE(review): :9090/metrics is Prometheus' own endpoint, not the
    # application's — confirm this probe targets what was intended.
    if curl -f http://localhost:9090/metrics &> /dev/null; then
        log_info "监控服务正常"
    else
        log_warn "监控服务异常"
    fi

    log_info "健康检查完成"
}
# Remove dangling images and all but the three most recent version tags.
# Fixes vs original: the `table` --format prefix injected a header row into
# the pipeline, and `docker rmi <ID>` fails when several tags share one
# image ID — delete by repo:tag instead.
cleanup_old_images() {
    log_info "清理旧镜像..."

    # Drop dangling (untagged) layers first.
    docker image prune -f

    # `docker images` lists newest first: skip "latest", keep the first 3
    # remaining tags, remove the rest by repo:tag.
    docker images "${PROJECT_NAME}" --format "{{.Tag}}" | \
        grep -v '^latest$' | \
        tail -n +4 | \
        sed "s/^/${PROJECT_NAME}:/" | \
        xargs -r docker rmi

    log_info "镜像清理完成"
}
# Main entry point: run the deployment pipeline end to end.
# Order matters: the environment check and backup happen before any
# destructive step, and old images are cleaned only after a passing
# health check.
main() {
    log_info "开始部署 ${PROJECT_NAME} v${VERSION} 到 ${ENVIRONMENT} 环境"
    check_environment
    create_directories
    backup_existing
    deploy_config
    build_image
    start_services
    wait_for_services
    health_check
    cleanup_old_images
    log_info "部署完成!"
    log_info "访问地址:"
    log_info " - 主服务: http://localhost:8000"
    log_info " - Nginx: http://localhost"
    log_info " - API文档: http://localhost/docs"
    log_info " - 监控面板: http://localhost:3000"
    log_info " - 健康检查: http://localhost/health"
}

# Log the failing exit code on any unhandled error (`set -e` is active).
trap 'log_error "部署过程中发生错误,退出码: $?"' ERR

# Run with the original command-line arguments (version, environment).
main "$@"
3.2 回滚脚本
#!/bin/bash
# scripts/rollback.sh - Roll the deployment back to a previous backup archive.
set -e

readonly PROJECT_NAME="ai-ui-system"
readonly BACKUP_DIR="/backup/${PROJECT_NAME}"
readonly LOG_DIR="/var/log/${PROJECT_NAME}"

# Fix: GREEN/RED/NC were referenced by the loggers but never defined in
# this script, so colored output was silently lost.
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly NC='\033[0m'

# Best-effort: the log directory may not exist yet on this host.
mkdir -p "${LOG_DIR}" 2>/dev/null || true

# Loggers: colored tag to stdout, copy appended to rollback.log.
log_info() {
    echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')] [INFO]${NC} $1" | tee -a "${LOG_DIR}/rollback.log"
}
log_error() {
    echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] [ERROR]${NC} $1" | tee -a "${LOG_DIR}/rollback.log"
}
# Print every backup archive available for rollback; abort when none exist.
list_backups() {
    log_info "可用备份列表:"
    if ! ls -la "${BACKUP_DIR}"/*.tar.gz 2>/dev/null; then
        log_error "未找到备份文件"
        exit 1
    fi
}
# Restore /opt/<project> from the given tar.gz archive and restart the stack.
# The current state is archived first so this operation is itself reversible.
# Fixes vs original: `down` now tolerates an already-stopped stack (under
# `set -e` it aborted the rollback), and the final health check polls
# instead of relying on one fixed sleep.
rollback_to_backup() {
    local backup_file=$1

    if [[ ! -f "${backup_file}" ]]; then
        log_error "备份文件不存在: ${backup_file}"
        exit 1
    fi

    log_info "开始回滚到: ${backup_file}"

    log_info "停止服务..."
    # "|| true": a stack that is already down must not abort the rollback.
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" down || true

    log_info "备份当前状态..."
    local current_backup="current_$(date +%Y%m%d_%H%M%S).tar.gz"
    tar -czf "${BACKUP_DIR}/${current_backup}" -C /opt "${PROJECT_NAME}"

    log_info "恢复备份..."
    # :? guard: refuse to run `rm -rf /opt/` if PROJECT_NAME were ever empty.
    rm -rf "/opt/${PROJECT_NAME:?}"
    tar -xzf "${backup_file}" -C /opt

    log_info "重启服务..."
    cd "/opt/${PROJECT_NAME}"
    docker-compose -f docker-compose.prod.yml up -d

    log_info "等待服务就绪..."
    # Poll up to 60s in 10s steps rather than a single one-shot check.
    local i
    for i in {1..6}; do
        sleep 10
        if curl -f http://localhost/health &> /dev/null; then
            log_info "回滚成功"
            return 0
        fi
    done
    log_error "回滚后服务异常"
    exit 1
}
# Entry point: with no arguments, show available backups plus usage and
# exit non-zero; otherwise roll back to the archive given as $1.
main() {
    if (( $# == 0 )); then
        list_backups
        echo "使用方法: $0 <backup_file>"
        exit 1
    fi
    rollback_to_backup "$1"
}

main "$@"
4. 监控体系
4.1 Prometheus配置
# docker/prometheus.yml - Prometheus monitoring configuration
global:
  scrape_interval: 15s        # default scrape cadence for all jobs
  evaluation_interval: 15s    # how often alerting/recording rules run
  external_labels:            # attached to every metric/alert leaving this server
    cluster: 'ai-ui-production'
    replica: 'prometheus-1'

# Alerting/recording rule files loaded at startup.
rule_files:
  - "alert_rules.yml"

# Where fired alerts are delivered.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Main application: scraped frequently for fine-grained request metrics.
  - job_name: 'ai-ui-system'
    static_configs:
      - targets: ['ai-ui-system:8000']
    metrics_path: '/metrics'
    scrape_interval: 5s
    scrape_timeout: 5s
    honor_labels: true

  # Nginx metrics (9113 is the nginx-prometheus-exporter default port).
  - job_name: 'nginx'
    static_configs:
      - targets: ['nginx:9113']
    scrape_interval: 10s

  # Redis metrics.
  # NOTE(review): port 6379 speaks the Redis wire protocol, which Prometheus
  # cannot scrape over HTTP — this normally needs a redis_exporter sidecar
  # (default :9121) as the target. Confirm against the compose file.
  - job_name: 'redis'
    static_configs:
      - targets: ['redis:6379']
    scrape_interval: 10s

  # Host-level metrics.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 15s

  # Per-container metrics.
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    scrape_interval: 15s
4.2 告警规则
# docker/alert_rules.yml - Alerting rules for the AI UI system.
groups:
  - name: ai-ui-system
    rules:
      # Fires when Prometheus can no longer scrape the main app target.
      - alert: ServiceDown
        expr: up{job="ai-ui-system"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "AI UI系统服务不可用"
          description: "AI UI系统服务已停止响应超过1分钟"

      # 5xx error volume over 5 minutes.
      # NOTE(review): 0.1 here is requests/second, not the 10% ratio the
      # description claims — a true percentage would divide by total rate.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "API错误率过高"
          description: "5xx错误率超过10%"

      # p95 latency above 2 seconds.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "API响应时间过长"
          description: "95%的请求响应时间超过2秒"

      # Container memory relative to its configured limit.
      # NOTE(review): containers without a limit report limit 0, making the
      # expression undefined — consider filtering those series out.
      - alert: HighMemoryUsage
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器内存使用率过高"
          description: "内存使用率超过80%"

      # Free space below 10% on a filesystem.
      # NOTE(review): this matches every mount, including tmpfs/overlay —
      # usually narrowed with fstype/mountpoint label matchers.
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "磁盘空间不足"
          description: "磁盘可用空间低于10%"

      # Sustained GPU saturation.
      # NOTE(review): `nvidia_utilization_gpu` is exporter-specific (DCGM
      # exposes DCGM_FI_DEV_GPU_UTIL) — confirm it matches the GPU exporter
      # actually deployed.
      - alert: HighGPUUsage
        expr: nvidia_utilization_gpu > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU使用率过高"
          description: "GPU使用率超过90%"
4.3 Grafana仪表板
{
"dashboard": {
"title": "AI UI系统监控面板",
"panels": [
{
"title": "服务状态",
"type": "stat",
"targets": [
{
"expr": "up{job=\"ai-ui-system\"}",
"legendFormat": "服务状态"
}
]
},
{
"title": "请求速率",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{endpoint}}"
}
]
},
{
"title": "响应时间",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95%响应时间"
}
]
},
{
"title": "错误率",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total{status=~\"5..\"}[5m])",
"legendFormat": "5xx错误率"
}
]
},
{
"title": "内存使用",
"type": "graph",
"targets": [
{
"expr": "container_memory_usage_bytes",
"legendFormat": "{{container_name}}"
}
]
},
{
"title": "CPU使用",
"type": "graph",
"targets": [
{
"expr": "rate(container_cpu_usage_seconds_total[5m])",
"legendFormat": "{{container_name}}"
}
]
}
]
}
}
5. 日志管理
5.1 日志收集配置
# docker/fluentd.conf - Fluentd log collection configuration

# Accept records over the forward protocol (e.g. Docker's fluentd log driver).
<source>
  @type forward
  port 24224
  bind 0.0.0.0
</source>

# Tail container log files (Kubernetes-style paths); position file makes
# tailing resume correctly across fluentd restarts.
<source>
  @type tail
  path /var/log/containers/*.log
  pos_file /var/log/fluentd-containers.log.pos
  tag kubernetes.*
  format json
  time_key time
  time_format %Y-%m-%dT%H:%M:%S.%NZ
</source>

# Enrich records with pod/namespace metadata (requires the
# fluent-plugin-kubernetes_metadata_filter plugin to be installed).
<filter kubernetes.**>
  @type kubernetes_metadata
</filter>

# Ship container logs to Elasticsearch.
# NOTE(review): `type_name _doc` is deprecated/ignored from Elasticsearch 7
# onward and removed in 8 — confirm the ES version in use.
<match kubernetes.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  index_name ai-ui-logs
  type_name _doc
  include_tag_key true
  tag_key @log_name
  flush_interval 1s
</match>

# Fallback: print anything unmatched to stdout (debugging aid).
<match **>
  @type stdout
</match>
5.2 日志轮转配置
# /etc/logrotate.d/ai-ui-system - log rotation policy
#
# Application logs: daily rotation, 30 compressed generations kept.
# NOTE(review): the `*.log` glob below also matches deploy.log, which has
# its own stanza further down — logrotate warns about and skips duplicate
# entries, so one of the two stanzas is effectively ignored. Confirm intent.
/var/log/ai-ui-system/*.log {
    daily
    missingok           # absent files are not an error
    rotate 30
    compress
    delaycompress       # keep the newest rotated file uncompressed
    notifempty
    create 644 root root
    # NOTE(review): restarting the whole service on every rotation is heavy
    # and drops in-flight requests — consider `copytruncate` or a reload
    # signal instead.
    postrotate
        docker-compose -f /opt/ai-ui-system/docker-compose.prod.yml restart ai-ui-system
    endscript
}

# Deployment log: weekly rotation, 12 generations kept.
/var/log/ai-ui-system/deploy.log {
    weekly
    missingok
    rotate 12
    compress
    delaycompress
    notifempty
    create 644 root root
}
5.3 日志分析脚本
#!/bin/bash
# scripts/log_analysis.sh - Produce an HTML log report for the AI UI system.

LOG_DIR="/var/log/ai-ui-system"
REPORT_DIR="/opt/ai-ui-system/reports"

# Ensure the output directory for generated reports exists.
mkdir -p "${REPORT_DIR}"
# Print the integer percentage of $1 out of $2; prints 0 when the
# denominator is zero or empty instead of crashing on division by zero.
percent() {
    local part=$1 whole=$2
    if [[ -z "${whole}" || "${whole}" -eq 0 ]]; then
        echo 0
    else
        echo $(( part * 100 / whole ))
    fi
}

# Build the HTML report under ${REPORT_DIR}.
# $1 - free-text label for the covered time range (defaults to "today").
# Fixes vs original: the error-share column printed the raw count followed
# by '%', and the awk aggregations divided by zero when no lines matched.
generate_log_report() {
    local date_range=${1:-"today"}
    local report_file="${REPORT_DIR}/log_report_$(date +%Y%m%d).html"

    cat > "${report_file}" << EOF
<!DOCTYPE html>
<html>
<head>
<title>AI UI系统日志报告 - $(date)</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.header { background-color: #f0f0f0; padding: 10px; }
.section { margin: 20px 0; }
.error { color: red; }
.warning { color: orange; }
.info { color: blue; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
th { background-color: #f2f2f2; }
</style>
</head>
<body>
<div class="header">
<h1>AI UI系统日志报告</h1>
<p>生成时间: $(date)</p>
<p>时间范围: ${date_range}</p>
</div>
<div class="section">
<h2>错误统计</h2>
<table>
<tr><th>错误类型</th><th>数量</th><th>占比</th></tr>
EOF

    # Denominator for the share column.
    local total_errors
    total_errors=$(grep -h "ERROR" "${LOG_DIR}"/*.log 2>/dev/null | wc -l)

    grep -h "ERROR" "${LOG_DIR}"/*.log 2>/dev/null | \
        awk '{print $4}' | \
        sort | uniq -c | sort -nr | \
        while read -r count error; do
            echo " <tr><td>${error}</td><td>${count}</td><td>$(percent "${count}" "${total_errors}")%</td></tr>" >> "${report_file}"
        done

    cat >> "${report_file}" << EOF
</table>
</div>
<div class="section">
<h2>性能统计</h2>
<table>
<tr><th>指标</th><th>平均值</th><th>最大值</th><th>最小值</th></tr>
EOF

    # Aggregate processing_time values; emit N/A when there are none.
    local avg_time max_time min_time
    avg_time=$(grep -h "processing_time" "${LOG_DIR}"/*.log 2>/dev/null | awk '{sum+=$NF; count++} END {if (count) print sum/count; else print "N/A"}')
    max_time=$(grep -h "processing_time" "${LOG_DIR}"/*.log 2>/dev/null | awk '{if($NF>max) max=$NF} END {if (NR) print max; else print "N/A"}')
    min_time=$(grep -h "processing_time" "${LOG_DIR}"/*.log 2>/dev/null | awk 'BEGIN{min=999} {if($NF<min) min=$NF} END {if (NR) print min; else print "N/A"}')

    cat >> "${report_file}" << EOF
<tr><td>响应时间(秒)</td><td>${avg_time}</td><td>${max_time}</td><td>${min_time}</td></tr>
</table>
</div>
<div class="section">
<h2>请求统计</h2>
<table>
<tr><th>端点</th><th>请求数</th><th>成功率</th></tr>
EOF

    grep -h "POST\|GET" "${LOG_DIR}"/*.log 2>/dev/null | \
        awk '{print $7}' | \
        sort | uniq -c | sort -nr | \
        while read -r count endpoint; do
            # -F: endpoints may contain regex metacharacters (e.g. '?').
            success_count=$(grep -h -F -- "${endpoint}" "${LOG_DIR}"/*.log 2>/dev/null | grep -c "200")
            success_rate=$(percent "${success_count}" "${count}")
            echo " <tr><td>${endpoint}</td><td>${count}</td><td>${success_rate}%</td></tr>" >> "${report_file}"
        done

    cat >> "${report_file}" << EOF
</table>
</div>
</body>
</html>
EOF

    echo "日志报告已生成: ${report_file}"
}
# Entry point: pass the optional time-range label through to the builder.
main() {
    generate_log_report "$1"
}

main "$@"
6. 备份和恢复
6.1 自动备份脚本
#!/bin/bash
# scripts/auto_backup.sh - Unattended full backup of the AI UI system.
set -e

readonly PROJECT_NAME="ai-ui-system"
readonly BACKUP_DIR="/backup/${PROJECT_NAME}"
readonly RETENTION_DAYS=30
readonly LOG_FILE="/var/log/${PROJECT_NAME}/backup.log"

# Best-effort: the log directory may not exist on a fresh host, and a
# failed mkdir must not abort the backup under `set -e`.
mkdir -p "$(dirname "${LOG_FILE}")" 2>/dev/null || true

# Timestamped logger: prints to stdout and appends to ${LOG_FILE}.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${LOG_FILE}"
}
# Create a timestamped backup directory containing app data, config, logs,
# a Redis snapshot, the Docker image, and a manifest.
# Fix vs original: BGSAVE is asynchronous, so dump.rdb was copied while
# the background save could still be running — SAVE blocks until the
# snapshot is complete.
backup_data() {
    local backup_name="backup_$(date +%Y%m%d_%H%M%S)"
    local backup_path="${BACKUP_DIR}/${backup_name}"

    log "开始备份: ${backup_name}"
    mkdir -p "${backup_path}"

    log "备份应用数据..."
    tar -czf "${backup_path}/app_data.tar.gz" -C /opt "${PROJECT_NAME}"

    log "备份配置文件..."
    tar -czf "${backup_path}/config.tar.gz" -C /etc "${PROJECT_NAME}"

    log "备份日志..."
    tar -czf "${backup_path}/logs.tar.gz" -C /var/log "${PROJECT_NAME}"

    log "备份Redis数据..."
    # Synchronous snapshot, then copy the resulting dump out of the container.
    docker exec redis redis-cli SAVE
    docker cp redis:/data/dump.rdb "${backup_path}/redis.rdb"

    log "备份Docker镜像..."
    docker save "${PROJECT_NAME}:latest" | gzip > "${backup_path}/image.tar.gz"

    # Human-readable manifest describing the backup contents.
    cat > "${backup_path}/backup_manifest.txt" << EOF
备份时间: $(date)
备份名称: ${backup_name}
备份大小: $(du -sh "${backup_path}" | awk '{print $1}')
包含内容:
- 应用数据: app_data.tar.gz
- 配置文件: config.tar.gz
- 日志文件: logs.tar.gz
- Redis数据: redis.rdb
- Docker镜像: image.tar.gz
EOF

    log "备份完成: ${backup_path}"
    # Record the newest backup name for the later verification step.
    echo "${backup_name}" > "${BACKUP_DIR}/latest_backup.txt"
}
# Delete backup directories older than ${RETENTION_DAYS} days.
cleanup_old_backups() {
    log "清理旧备份..."
    # -mindepth/-maxdepth restrict matching to the top-level backup dirs so
    # find does not try to descend into directories `rm -rf` just deleted;
    # `+` batches deletions into fewer rm invocations.
    find "${BACKUP_DIR}" -mindepth 1 -maxdepth 1 -name "backup_*" -type d -mtime +"${RETENTION_DAYS}" -exec rm -rf {} +
    log "旧备份清理完成"
}
# Sanity-check that a backup directory contains the expected artifacts and
# that the main archive is readable. Returns non-zero on any problem.
verify_backup() {
    local backup_name=$1
    local backup_path="${BACKUP_DIR}/${backup_name}"

    log "验证备份: ${backup_name}"

    if [[ ! -f "${backup_path}/app_data.tar.gz" ]]; then
        log "错误: 应用数据备份文件不存在"
        return 1
    fi
    if [[ ! -f "${backup_path}/config.tar.gz" ]]; then
        log "错误: 配置文件备份不存在"
        return 1
    fi
    if [[ ! -f "${backup_path}/redis.rdb" ]]; then
        log "错误: Redis数据备份不存在"
        return 1
    fi

    # Listing the archive verifies gzip/tar integrity without extracting it.
    if ! tar -tzf "${backup_path}/app_data.tar.gz" > /dev/null; then
        log "错误: 应用数据备份文件损坏"
        return 1
    fi

    log "备份验证通过"
    return 0
}
# Entry point: create a backup, prune expired ones, then verify the backup
# that was just written (its name is read back from latest_backup.txt,
# which backup_data writes).
main() {
    log "开始自动备份..."
    backup_data
    cleanup_old_backups
    # latest_backup.txt always names the backup created moments ago.
    latest_backup=$(cat ${BACKUP_DIR}/latest_backup.txt)
    verify_backup ${latest_backup}
    log "自动备份完成"
}

# Log the exit code of any command failing under `set -e`.
trap 'log "备份过程中发生错误,退出码: $?"' ERR

# Run with original CLI arguments (none are currently used).
main "$@"
6.2 恢复脚本
#!/bin/bash
# scripts/restore.sh - Restore the AI UI system from an auto_backup snapshot.
set -e

readonly PROJECT_NAME="ai-ui-system"
readonly BACKUP_DIR="/backup/${PROJECT_NAME}"
readonly LOG_FILE="/var/log/${PROJECT_NAME}/restore.log"

# Best-effort: create the log directory; must not abort under `set -e`.
mkdir -p "$(dirname "${LOG_FILE}")" 2>/dev/null || true

# Timestamped logger: prints to stdout and appends to ${LOG_FILE}.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${LOG_FILE}"
}
# List every backup_* entry in the backup directory, indented one space.
list_backups() {
    log "可用备份列表:"
    local entry
    ls -la "${BACKUP_DIR}"/backup_* 2>/dev/null | while read -r entry; do
        echo " $entry"
    done
}
# Restore the full system state from ${BACKUP_DIR}/<backup_name>.
# The current state is archived first so the restore is reversible.
# Fix vs original: the Redis dump was copied into a *running* container
# which was then restarted — Redis writes its current dataset back to
# disk on shutdown, clobbering the restored dump.rdb. The container must
# be stopped before the copy and started afterwards.
restore_backup() {
    local backup_name=$1

    if [[ -z "${backup_name}" ]]; then
        log "错误: 请指定备份名称"
        list_backups
        exit 1
    fi

    local backup_path="${BACKUP_DIR}/${backup_name}"
    if [[ ! -d "${backup_path}" ]]; then
        log "错误: 备份目录不存在: ${backup_path}"
        exit 1
    fi

    log "开始恢复备份: ${backup_name}"

    log "停止服务..."
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" down || true

    log "备份当前状态..."
    local current_backup="current_$(date +%Y%m%d_%H%M%S)"
    mkdir -p "${BACKUP_DIR}/${current_backup}"
    tar -czf "${BACKUP_DIR}/${current_backup}/current_state.tar.gz" -C /opt "${PROJECT_NAME}"

    log "恢复应用数据..."
    # :? guard: never expand to a bare `rm -rf /opt/`.
    rm -rf "/opt/${PROJECT_NAME:?}"
    tar -xzf "${backup_path}/app_data.tar.gz" -C /opt

    log "恢复配置文件..."
    rm -rf "/etc/${PROJECT_NAME:?}"
    tar -xzf "${backup_path}/config.tar.gz" -C /etc

    log "恢复日志..."
    tar -xzf "${backup_path}/logs.tar.gz" -C /var/log

    log "恢复Docker镜像..."
    docker load < "${backup_path}/image.tar.gz"

    log "启动服务..."
    cd "/opt/${PROJECT_NAME}"
    docker-compose -f docker-compose.prod.yml up -d

    log "等待服务就绪..."
    sleep 30

    log "恢复Redis数据..."
    # Stop Redis so the restored dump is loaded on startup instead of being
    # overwritten by the shutdown save of the current (wrong) dataset.
    docker stop redis
    docker cp "${backup_path}/redis.rdb" redis:/data/dump.rdb
    docker start redis

    log "执行健康检查..."
    sleep 10
    if curl -f http://localhost/health; then
        log "恢复成功"
    else
        log "错误: 恢复后服务异常"
        log "请检查日志或回滚到之前的状态"
        exit 1
    fi
}
# Entry point: require a backup name; otherwise list what is available and
# print usage before exiting non-zero.
main() {
    if (( $# == 0 )); then
        list_backups
        echo "使用方法: $0 <backup_name>"
        exit 1
    fi
    restore_backup "$1"
}

main "$@"
7. 故障排查
7.1 故障排查脚本
#!/bin/bash
# scripts/troubleshoot.sh - Collect diagnostics for the AI UI system.
#
# Fix vs original: deliberately NOT `set -e`. This is a diagnostic tool —
# it must keep going past a failing check so the remaining checks and the
# final diagnostic report still run; the original aborted on the first
# failed check and never produced the report.

PROJECT_NAME="ai-ui-system"
LOG_DIR="/var/log/${PROJECT_NAME}"

# ANSI colors for log levels.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Simple colored loggers (stdout only; this script keeps no log file).
log_info()  { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn()  { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# Confirm the Docker daemon is running, list container states, and probe
# the health URL. Returns non-zero when Docker is stopped or the probe fails.
check_service_status() {
    log_info "检查服务状态..."

    if ! systemctl is-active --quiet docker; then
        log_error "Docker服务未运行"
        return 1
    fi

    log_info "容器状态:"
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" ps

    if curl -f http://localhost/health &> /dev/null; then
        log_info "服务健康检查通过"
        return 0
    fi
    log_error "服务健康检查失败"
    return 1
}
# Dump host memory, disk and CPU usage plus per-container resource usage.
# Output order is part of the operator-facing report, so it is fixed.
check_resource_usage() {
    log_info "检查资源使用..."
    # Host memory (human-readable units).
    log_info "内存使用:"
    free -h
    # Filesystem usage for all mounts.
    log_info "磁盘使用:"
    df -h
    # Single non-interactive top sample, CPU summary line only.
    log_info "CPU使用:"
    top -bn1 | grep "Cpu(s)"
    # One-shot (non-streaming) container stats.
    log_info "容器资源使用:"
    docker stats --no-stream
}
# Report listening sockets on the service ports plus Docker network details.
# Fix vs original: net-tools (netstat) is often absent on modern hosts —
# fall back to ss; also tolerate "no matching port" so the script continues.
check_network() {
    log_info "检查网络连接..."

    log_info "端口监听状态:"
    if command -v netstat &> /dev/null; then
        netstat -tlnp | grep -E ":(80|8000|6379|9090|3000)" || true
    else
        ss -tlnp | grep -E ":(80|8000|6379|9090|3000)" || true
    fi

    log_info "容器网络:"
    docker network ls
    docker network inspect ai-ui-system_ai-ui-network
}
# Surface the most recent ERROR lines from app logs and container logs.
# Fix vs original: a missing log glob or an error-free log made grep exit
# non-zero, which aborted the whole script when run under `set -e`.
check_log_errors() {
    log_info "检查日志错误..."

    log_info "最近的应用错误:"
    grep -h "ERROR" "${LOG_DIR}"/*.log 2>/dev/null | tail -10 || true

    log_info "容器日志错误:"
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" logs --tail=50 | grep -i error || true
}
# Verify that the compose file and .env exist; validate compose syntax.
check_config() {
    log_info "检查配置文件..."

    local compose_file="/opt/${PROJECT_NAME}/docker-compose.prod.yml"
    if [[ -f "${compose_file}" ]]; then
        log_info "Docker Compose配置文件存在"
        # --quiet: only report syntax problems, do not print the config.
        docker-compose -f "${compose_file}" config --quiet
    else
        log_error "Docker Compose配置文件不存在"
    fi

    if [[ -f "/opt/${PROJECT_NAME}/.env" ]]; then
        log_info "环境配置文件存在"
    else
        log_error "环境配置文件不存在"
    fi
}
# Report whether backups exist and show the five most recent entries.
check_backup_status() {
    log_info "检查备份状态..."

    local backup_dir="/backup/${PROJECT_NAME}"
    if [[ ! -d "${backup_dir}" ]]; then
        log_warn "备份目录不存在"
        return 0
    fi

    log_info "备份目录存在"
    log_info "最新备份:"
    ls -la "${backup_dir}"/backup_* | tail -5
}
# Write a plain-text diagnostic snapshot into ${LOG_DIR}.
# Fixes vs original: missing netstat and empty/absent log files no longer
# abort the report; all expansions quoted.
generate_diagnostic_report() {
    log_info "生成诊断报告..."
    local report_file="${LOG_DIR}/diagnostic_report_$(date +%Y%m%d_%H%M%S).txt"

    cat > "${report_file}" << EOF
AI UI系统诊断报告
生成时间: $(date)
系统信息: $(uname -a)
Docker版本: $(docker --version)
Docker Compose版本: $(docker-compose --version)
=== 服务状态 ===
EOF
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" ps >> "${report_file}"

    cat >> "${report_file}" << EOF
=== 资源使用 ===
EOF
    free -h >> "${report_file}"
    df -h >> "${report_file}"

    cat >> "${report_file}" << EOF
=== 网络状态 ===
EOF
    # Tolerate hosts without net-tools; empty matches are not an error.
    if command -v netstat &> /dev/null; then
        netstat -tlnp | grep -E ":(80|8000|6379|9090|3000)" >> "${report_file}" || true
    else
        ss -tlnp | grep -E ":(80|8000|6379|9090|3000)" >> "${report_file}" || true
    fi

    cat >> "${report_file}" << EOF
=== 最近错误 ===
EOF
    grep -h "ERROR" "${LOG_DIR}"/*.log 2>/dev/null | tail -20 >> "${report_file}" || true

    log_info "诊断报告已生成: ${report_file}"
}
# Entry point: run every check in order, then write the diagnostic report.
# NOTE(review): if this script runs under `set -e`, a check that returns
# non-zero (e.g. check_service_status on a failed health probe) aborts the
# run before the report is written — confirm whether that is intended.
main() {
    log_info "开始故障排查..."
    check_service_status
    check_resource_usage
    check_network
    check_log_errors
    check_config
    check_backup_status
    generate_diagnostic_report
    log_info "故障排查完成"
}

main "$@"
7.2 常见问题解决
#!/bin/bash
# scripts/fix_common_issues.sh - One-shot fixers for common operational issues.
#
# Fix vs original: PROJECT_NAME and log_info were referenced but never
# defined, so every fixer failed with "command not found" and operated on
# "/opt/" paths; both are defined here.

PROJECT_NAME="ai-ui-system"

# Timestamped INFO logger to stdout.
log_info() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] [INFO] $1"
}

# Recover from containers that fail to start: prune unused Docker state,
# bounce the daemon, and bring the stack back up.
fix_service_startup() {
    log_info "修复服务启动失败..."
    docker system prune -f
    systemctl restart docker
    cd "/opt/${PROJECT_NAME}"
    docker-compose -f docker-compose.prod.yml up -d
}

# Relieve memory pressure: drop caches/images, restart, halve the workers.
fix_memory_issue() {
    log_info "修复内存不足问题..."
    docker system prune -a -f
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" restart
    sed -i 's/WORKERS=4/WORKERS=2/' "/opt/${PROJECT_NAME}/.env"
}

# Free disk space: unused images, week-old logs, month-old backups.
fix_disk_space() {
    log_info "修复磁盘空间不足..."
    docker image prune -a -f
    find /var/log -name "*.log" -mtime +7 -delete
    # -mindepth/-maxdepth: do not descend into directories being removed.
    find /backup -mindepth 1 -maxdepth 1 -name "backup_*" -mtime +30 -exec rm -rf {} +
}

# Reset networking for the host and Docker.
fix_network_issue() {
    log_info "修复网络问题..."
    # NOTE(review): "networking" is a Debian/ifupdown unit name — confirm it
    # exists on the target distro (systemd-networkd/NetworkManager elsewhere).
    systemctl restart networking
    docker network prune -f
    docker-compose -f "/opt/${PROJECT_NAME}/docker-compose.prod.yml" restart
}

# Dispatch on the issue keyword.
main() {
    case $1 in
        "startup") fix_service_startup ;;
        "memory")  fix_memory_issue ;;
        "disk")    fix_disk_space ;;
        "network") fix_network_issue ;;
        *)
            echo "使用方法: $0 {startup|memory|disk|network}"
            exit 1
            ;;
    esac
}

main "$@"
8. 性能监控
8.1 性能监控脚本
#!/bin/bash
# scripts/performance_monitor.sh - Snapshot system/docker/app metrics into a
# temporary JSON file and flag values above fixed thresholds.
#
# Fixes vs original: log_info/log_warn were called but never defined; failed
# probes (curl refused, docker down) produced invalid JSON; float comparison
# now uses awk instead of the optional `bc` dependency.

log_info() { echo "[INFO] $1"; }
log_warn() { echo "[WARN] $1"; }

# Return 0 when float $1 > float $2.
float_gt() {
    awk -v a="$1" -v b="$2" 'BEGIN { exit !(a > b) }'
}

# Write a one-shot JSON metrics snapshot to /tmp and print its path.
collect_metrics() {
    local timestamp metrics_file
    timestamp=$(date +%s)
    metrics_file="/tmp/performance_metrics_${timestamp}.json"

    # Pre-compute every probed value and default it, so the emitted JSON
    # stays valid even when a probe fails or a tool is missing.
    local cpu_usage memory_usage disk_usage load_average response_time status_code
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
    memory_usage=$(free | awk 'NR==2{printf "%.2f", $3*100/$2}')
    disk_usage=$(df / | awk 'NR==2{print $5}' | sed 's/%//')
    load_average=$(uptime | awk -F'load average:' '{print $2}')
    response_time=$(curl -o /dev/null -s -w '%{time_total}' http://localhost/health || echo 0)
    status_code=$(curl -o /dev/null -s -w '%{http_code}' http://localhost/health || echo 0)

    cat > "${metrics_file}" << EOF
{
  "timestamp": ${timestamp},
  "system": {
    "cpu_usage": ${cpu_usage:-0},
    "memory_usage": ${memory_usage:-0},
    "disk_usage": ${disk_usage:-0},
    "load_average": "${load_average:-unknown}"
  },
  "docker": {
    "containers_running": $(docker ps -q 2>/dev/null | wc -l),
    "containers_total": $(docker ps -a -q 2>/dev/null | wc -l),
    "images_count": $(docker images -q 2>/dev/null | wc -l)
  },
  "application": {
    "response_time": ${response_time:-0},
    "status_code": ${status_code:-0}
  }
}
EOF
    echo "${metrics_file}"
}

# Evaluate a metrics file against fixed thresholds (CPU/mem 80%, 2s latency).
# $1 - path to the JSON snapshot (requires jq, as in the original).
analyze_performance() {
    local metrics_file=$1
    log_info "性能分析结果:"

    local cpu_usage memory_usage response_time
    cpu_usage=$(jq -r '.system.cpu_usage' "${metrics_file}")
    if float_gt "${cpu_usage}" 80; then
        log_warn "CPU使用率过高: ${cpu_usage}%"
    else
        log_info "CPU使用率正常: ${cpu_usage}%"
    fi

    memory_usage=$(jq -r '.system.memory_usage' "${metrics_file}")
    if float_gt "${memory_usage}" 80; then
        log_warn "内存使用率过高: ${memory_usage}%"
    else
        log_info "内存使用率正常: ${memory_usage}%"
    fi

    response_time=$(jq -r '.application.response_time' "${metrics_file}")
    if float_gt "${response_time}" 2; then
        log_warn "响应时间过长: ${response_time}秒"
    else
        log_info "响应时间正常: ${response_time}秒"
    fi
}

# Entry point: collect, analyze, clean up the temporary snapshot.
main() {
    local metrics_file
    metrics_file=$(collect_metrics)
    analyze_performance "${metrics_file}"
    rm -f "${metrics_file}"
}

main "$@"
9. 安全运维
9.1 安全扫描脚本
#!/bin/bash
# scripts/security_scan.sh - Image, container and network security checks.
#
# Fixes vs original: log_info/log_warn/log_error were called but never
# defined; the jq pipeline errored out when .Results or .Vulnerabilities
# was null (a clean image); non-numeric port tokens crashed the port loop.

log_info()  { echo "[INFO] $1"; }
log_warn()  { echo "[WARN] $1"; }
log_error() { echo "[ERROR] $1"; }

# Scan the production image with trivy when it is installed.
scan_docker_images() {
    log_info "扫描Docker镜像漏洞..."
    if ! command -v trivy &> /dev/null; then
        log_warn "Trivy未安装,跳过漏洞扫描"
        return 0
    fi

    trivy image ai-ui-system:latest --format json > /tmp/trivy_report.json

    # "[]?" guards: both arrays may be null for a clean image.
    local high_vulns critical_vulns
    high_vulns=$(jq '[.Results[]?.Vulnerabilities[]? | select(.Severity == "HIGH")] | length' /tmp/trivy_report.json)
    critical_vulns=$(jq '[.Results[]?.Vulnerabilities[]? | select(.Severity == "CRITICAL")] | length' /tmp/trivy_report.json)

    if [[ "${critical_vulns:-0}" -gt 0 ]]; then
        log_error "发现 ${critical_vulns} 个严重漏洞"
    elif [[ "${high_vulns:-0}" -gt 0 ]]; then
        log_warn "发现 ${high_vulns} 个高危漏洞"
    else
        log_info "未发现严重漏洞"
    fi
}

# Warn for containers running as root.
check_container_security() {
    log_info "检查容器安全配置..."
    local container user
    for container in $(docker ps --format "{{.Names}}" 2>/dev/null); do
        user=$(docker exec "${container}" whoami 2>/dev/null || echo "unknown")
        if [[ "${user}" == "root" ]]; then
            log_warn "容器 ${container} 以root用户运行"
        else
            log_info "容器 ${container} 以非root用户运行"
        fi
    done
}

# Warn about privileged ports other than 80/443 that are listening.
check_network_security() {
    log_info "检查网络安全..."
    local open_ports port
    open_ports=$(netstat -tlnp 2>/dev/null | grep LISTEN | awk '{print $4}' | cut -d: -f2 | sort -u)
    for port in ${open_ports}; do
        # Skip non-numeric tokens (e.g. from IPv6 "::" addresses).
        [[ "${port}" =~ ^[0-9]+$ ]] || continue
        if [[ ${port} -lt 1024 && ${port} -ne 80 && ${port} -ne 443 ]]; then
            log_warn "发现特权端口开放: ${port}"
        fi
    done
}

# Entry point: run all three security checks.
main() {
    scan_docker_images
    check_container_security
    check_network_security
}

main "$@"
10. 运维最佳实践
10.1 运维检查清单
#!/bin/bash
# scripts/ops_checklist.sh - Daily/weekly/monthly operations checklists.
#
# NOTE(review): the individual check steps (check_service_status,
# security_scan, ...) are expected to be provided by sourcing sibling
# scripts before running — confirm the intended wiring. The original
# crashed with "command not found" whenever a step was missing; steps are
# now run through run_step, which skips undefined ones with a message.

log_info() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [INFO] $1"; }

# Run the named step if it resolves to a function or command; otherwise
# log that it was skipped instead of crashing the checklist.
run_step() {
    if declare -F "$1" &> /dev/null || command -v "$1" &> /dev/null; then
        "$1"
    else
        log_info "跳过未定义的检查步骤: $1"
    fi
}

# Daily: service, resources, log errors, backup freshness.
daily_checklist() {
    log_info "执行日常检查清单..."
    run_step check_service_status
    run_step check_resource_usage
    run_step check_log_errors
    run_step check_backup_status
    log_info "日常检查完成"
}

# Weekly: security scan, performance analysis, capacity planning.
weekly_checklist() {
    log_info "执行周度检查清单..."
    run_step security_scan
    run_step performance_analysis
    run_step capacity_planning
    log_info "周度检查完成"
}

# Monthly: DR test, security audit, performance optimization.
monthly_checklist() {
    log_info "执行月度检查清单..."
    run_step disaster_recovery_test
    run_step security_audit
    run_step performance_optimization
    log_info "月度检查完成"
}

# Dispatch on the requested cadence.
main() {
    case $1 in
        "daily")   daily_checklist ;;
        "weekly")  weekly_checklist ;;
        "monthly") monthly_checklist ;;
        *)
            echo "使用方法: $0 {daily|weekly|monthly}"
            exit 1
            ;;
    esac
}

main "$@"
10.2 自动化运维
#!/bin/bash
# scripts/auto_ops.sh - Install the recurring operations cron jobs.
#
# Fixes vs original: log_info was called but never defined, and entries
# were appended blindly — rerunning the script duplicated every cron job.

log_info() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] [INFO] $1"; }

# Install the cron entries idempotently: strip any previous lines that
# reference our scripts directory, then append the full current set.
setup_cron_jobs() {
    log_info "设置定时任务..."
    local marker="/opt/ai-ui-system/scripts/"
    {
        crontab -l 2>/dev/null | grep -vF "${marker}"
        echo "0 2 * * * /opt/ai-ui-system/scripts/auto_backup.sh"
        echo "0 */6 * * * /opt/ai-ui-system/scripts/performance_monitor.sh"
        echo "0 0 * * 0 /opt/ai-ui-system/scripts/ops_checklist.sh weekly"
        echo "0 0 1 * * /opt/ai-ui-system/scripts/ops_checklist.sh monthly"
    } | crontab -
    log_info "定时任务设置完成"
}

# Entry point.
main() {
    setup_cron_jobs
}

main "$@"
通过掌握这些生产环境运维技术,您可以构建一个稳定、可靠、易维护的生产环境。生产环境运维是系统稳定运行的重要保障,需要系统性的规划和持续性的优化,以确保服务的高可用性和高性能。