数据库性能诊断与优化实战手册
适用读者:DBA、性能工程师、开发工程师 目标:提供跨数据库(Oracle/MySQL/PostgreSQL)的性能诊断方法论和优化实战案例。
1. 性能问题分类
1.1 问题类型
| 类型 | 症状 | 常见原因 | 优先级 |
|---|---|---|---|
| 慢查询 | 单条 SQL 执行慢 | 缺失索引、全表扫描、锁等待 | 高 |
| 高并发 | 连接数耗尽、响应慢 | 连接池配置、资源竞争 | 高 |
| 锁等待 | 事务阻塞、超时 | 长事务、死锁、锁升级 | 高 |
| IO 瓶颈 | 磁盘 IO 高 | 大表扫描、归档慢、RAID 配置 | 中 |
| 内存不足 | OOM、频繁 Swap | Buffer Pool 小、内存泄漏 | 中 |
| CPU 瓶颈 | CPU 100% | 复杂计算、排序、聚合 | 中 |
1.2 诊断流程
1. 现象确认
↓
2. 监控数据收集(CPU/内存/IO/网络)
↓
3. 数据库层面分析(慢查询/锁/等待事件)
↓
4. 应用层面分析(连接池/事务/缓存)
↓
5. 定位根因
↓
6. 制定优化方案
↓
7. 实施验证
↓
8. 持续监控
2. Oracle 性能诊断
2.1 AWR 报告分析
-- 生成 AWR 报告
@?/rdbms/admin/awrrpt.sql
-- 关键指标:
-- 1. Top 5 Timed Events(等待事件)
-- 2. SQL ordered by Elapsed Time(慢查询)
-- 3. Segment Statistics(热点表/索引)
-- 4. Instance Efficiency Percentages(缓存命中率)
AWR 报告解读:
Top 5 Timed Events:
1. db file scattered read -- 全表扫描,需要索引
2. db file sequential read -- 索引扫描,可能索引碎片
3. log file sync -- 提交频繁,考虑批量提交
4. enq: TX - row lock -- 行锁等待,检查长事务
5. CPU time -- CPU 瓶颈,优化 SQL 逻辑
2.2 实时会话诊断
-- 查看活动会话
SELECT
s.sid,
s.serial#,
s.username,
s.program,
s.machine,
s.status,
s.sql_id,
s.event,
s.wait_class,
s.seconds_in_wait,
s.blocking_session
FROM v$session s
WHERE s.status = 'ACTIVE'
AND s.username IS NOT NULL
ORDER BY s.seconds_in_wait DESC;
-- 查看当前 SQL
SELECT sql_text
FROM v$sql
WHERE sql_id = '&sql_id';
-- 查看执行计划
SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY_CURSOR('&sql_id', NULL, 'ALLSTATS LAST'));
-- 查看锁等待
SELECT
s1.username || '@' || s1.machine AS blocker,
s1.sid AS blocker_sid,
s2.username || '@' || s2.machine AS waiter,
s2.sid AS waiter_sid,
lo.oracle_username,
lo.os_user_name,
ao.object_name,
lo.locked_mode
FROM v$locked_object lo
JOIN all_objects ao ON lo.object_id = ao.object_id
JOIN v$session s1 ON lo.session_id = s1.sid
JOIN v$session s2 ON s1.blocking_session = s2.sid;
2.3 SQL 调优
-- 收集统计信息
EXEC DBMS_STATS.GATHER_TABLE_STATS('SCHEMA_NAME', 'TABLE_NAME', CASCADE => TRUE);
-- 查看执行计划
EXPLAIN PLAN FOR
SELECT * FROM orders WHERE user_id = 12345;
SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY);
-- 创建索引
CREATE INDEX idx_orders_user_id ON orders(user_id);
-- 使用 Hint 强制索引
SELECT /*+ INDEX(orders idx_orders_user_id) */ *
FROM orders
WHERE user_id = 12345;
-- 分析 SQL 性能
SELECT
sql_id,
executions,
elapsed_time / 1000000 AS elapsed_sec,
cpu_time / 1000000 AS cpu_sec,
buffer_gets,
disk_reads,
rows_processed,
ROUND(elapsed_time / executions / 1000000, 2) AS avg_elapsed_sec
FROM v$sql
WHERE executions > 0
ORDER BY elapsed_time DESC
FETCH FIRST 20 ROWS ONLY;
3. MySQL 性能诊断
3.1 慢查询日志分析
# 启用慢查询日志
[mysqld]
slow_query_log = 1
slow_query_log_file = /var/log/mysql/slow.log
long_query_time = 1
log_queries_not_using_indexes = 1
# 使用 pt-query-digest 分析
pt-query-digest /var/log/mysql/slow.log > slow_report.txt
# 输出示例:
# Query 1: 0.12 QPS, 1.23x concurrency, ID 0xABCD1234
# Calls: 1000, Time: 1234.56s (avg 1.23s)
# SELECT * FROM orders WHERE status = 'pending' ORDER BY created_at DESC LIMIT 100
3.2 实时性能监控
-- 查看当前连接
SHOW PROCESSLIST;
-- 查看慢查询
SELECT
id,
user,
host,
db,
command,
time,
state,
info
FROM information_schema.processlist
WHERE time > 5
AND command != 'Sleep'
ORDER BY time DESC;
-- 查看锁等待
SELECT
r.trx_id AS waiting_trx_id,
r.trx_mysql_thread_id AS waiting_thread,
r.trx_query AS waiting_query,
b.trx_id AS blocking_trx_id,
b.trx_mysql_thread_id AS blocking_thread,
b.trx_query AS blocking_query
FROM information_schema.innodb_lock_waits w
JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id
JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id;
-- 查看表统计信息
SELECT
table_schema,
table_name,
table_rows,
avg_row_length,
data_length / 1024 / 1024 AS data_mb,
index_length / 1024 / 1024 AS index_mb
FROM information_schema.tables
WHERE table_schema = 'mydb'
ORDER BY data_length DESC;
3.3 执行计划分析
-- 查看执行计划
EXPLAIN SELECT * FROM orders WHERE user_id = 12345;
-- 输出示例:
-- id | select_type | table | type | possible_keys | key | rows | Extra
-- 1 | SIMPLE | orders | ref | idx_user_id | idx_user_id | 100 | Using where
-- type 类型(性能从好到差):
-- system > const > eq_ref > ref > range > index > ALL
-- 详细执行计划
EXPLAIN FORMAT=JSON SELECT * FROM orders WHERE user_id = 12345;
-- 实际执行统计
EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 12345;
3.4 索引优化
-- 查看未使用的索引
SELECT
t.table_schema,
t.table_name,
s.index_name,
s.column_name
FROM information_schema.statistics s
JOIN information_schema.tables t ON s.table_schema = t.table_schema
AND s.table_name = t.table_name
LEFT JOIN sys.schema_unused_indexes u ON s.table_schema = u.object_schema
AND s.table_name = u.object_name
AND s.index_name = u.index_name
WHERE t.table_schema = 'mydb'
AND s.index_name != 'PRIMARY'
AND u.index_name IS NOT NULL;
-- 查看重复索引
SELECT
a.table_schema,
a.table_name,
a.index_name AS index1,
b.index_name AS index2,
a.column_name
FROM information_schema.statistics a
JOIN information_schema.statistics b ON a.table_schema = b.table_schema
AND a.table_name = b.table_name
AND a.column_name = b.column_name
AND a.index_name < b.index_name
WHERE a.table_schema = 'mydb'
ORDER BY a.table_name, a.column_name;
-- 创建复合索引
CREATE INDEX idx_orders_user_status ON orders(user_id, status);
-- 删除未使用索引
DROP INDEX idx_unused ON orders;
4. PostgreSQL 性能诊断
4.1 pg_stat_statements 分析
-- 启用 pg_stat_statements
CREATE EXTENSION pg_stat_statements;
-- postgresql.conf
shared_preload_libraries = 'pg_stat_statements'
pg_stat_statements.track = all
-- 查看慢查询
SELECT
query,
calls,
total_exec_time / 1000 AS total_sec,
mean_exec_time / 1000 AS avg_sec,
max_exec_time / 1000 AS max_sec,
rows,
100.0 * shared_blks_hit / NULLIF(shared_blks_hit + shared_blks_read, 0) AS cache_hit_ratio
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;
-- 重置统计
SELECT pg_stat_statements_reset();
4.2 实时会话监控
-- 查看活动会话
SELECT
pid,
usename,
application_name,
client_addr,
state,
query_start,
state_change,
wait_event_type,
wait_event,
query
FROM pg_stat_activity
WHERE state = 'active'
AND pid != pg_backend_pid()
ORDER BY query_start;
-- 查看锁等待
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS blocking_statement,
blocked_activity.application_name
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;
-- 终止慢查询
SELECT pg_cancel_backend(12345); -- 取消查询
SELECT pg_terminate_backend(12345); -- 终止连接
4.3 执行计划分析
-- 查看执行计划
EXPLAIN SELECT * FROM orders WHERE user_id = 12345;
-- 详细执行计划
EXPLAIN (ANALYZE, BUFFERS, VERBOSE)
SELECT * FROM orders WHERE user_id = 12345;
-- 输出示例:
-- Index Scan using idx_orders_user_id on orders (cost=0.43..8.45 rows=1 width=100) (actual time=0.012..0.013 rows=1 loops=1)
-- Index Cond: (user_id = 12345)
-- Buffers: shared hit=4
-- Planning Time: 0.123 ms
-- Execution Time: 0.045 ms
-- 查看表膨胀
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename) - pg_relation_size(schemaname||'.'||tablename)) AS index_size,
n_dead_tup,
n_live_tup,
ROUND(100 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_ratio
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
4.4 VACUUM 与索引维护
-- 手动 VACUUM
VACUUM VERBOSE ANALYZE orders;
-- 查看 autovacuum 状态
SELECT
schemaname,
tablename,
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze,
vacuum_count,
autovacuum_count
FROM pg_stat_user_tables
ORDER BY last_autovacuum NULLS FIRST;
-- 重建索引
REINDEX INDEX idx_orders_user_id;
REINDEX TABLE orders;
-- 查看索引使用情况
SELECT
schemaname,
tablename,
indexname,
idx_scan,
idx_tup_read,
idx_tup_fetch,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan;
5. 通用优化策略
5.1 索引优化原则
1. 选择性高的列优先建索引(唯一值多)
2. WHERE/JOIN/ORDER BY 子句中的列
3. 复合索引遵循最左前缀原则
4. 避免过多索引(影响写入性能)
5. 定期重建碎片化索引
6. 删除未使用的索引
5.2 SQL 优化技巧
-- 避免 SELECT *
-- 不推荐
SELECT * FROM orders WHERE user_id = 12345;
-- 推荐
SELECT order_id, user_id, amount, created_at FROM orders WHERE user_id = 12345;
-- 避免函数包裹索引列
-- 不推荐
SELECT * FROM orders WHERE DATE(created_at) = '2025-11-11';
-- 推荐
SELECT * FROM orders WHERE created_at >= '2025-11-11' AND created_at < '2025-11-12';
-- 使用 LIMIT 限制结果集
SELECT * FROM orders ORDER BY created_at DESC LIMIT 100;
-- 批量操作代替循环
-- 不推荐
FOR i IN 1..10000 LOOP
INSERT INTO logs VALUES (i, 'message');
END LOOP;
-- 推荐
INSERT INTO logs SELECT generate_series(1, 10000), 'message';
-- 使用 EXISTS 代替 IN(大数据集)
-- 不推荐
SELECT * FROM users WHERE user_id IN (SELECT user_id FROM orders);
-- 推荐
SELECT * FROM users u WHERE EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.user_id);
5.3 连接池优化
# HikariCP 配置
hikari.maximum-pool-size=20
hikari.minimum-idle=5
hikari.connection-timeout=30000
hikari.idle-timeout=600000
hikari.max-lifetime=1800000
hikari.leak-detection-threshold=60000
# 计算公式:
# connections = ((core_count * 2) + effective_spindle_count)
# 例如:8核 + 4块磁盘 = (8 * 2) + 4 = 20
6. 实战案例
案例1:慢查询优化
-- 问题 SQL(执行 5 秒)
SELECT * FROM orders
WHERE status = 'pending'
ORDER BY created_at DESC
LIMIT 100;
-- 诊断:
EXPLAIN ANALYZE ...
-- Seq Scan on orders (cost=0.00..1234567.89 rows=1000000 width=100)
-- 全表扫描!
-- 优化方案:
CREATE INDEX idx_orders_status_created ON orders(status, created_at DESC);
-- 优化后(执行 0.05 秒)
-- Index Scan using idx_orders_status_created on orders (cost=0.43..8.45 rows=100 width=100)
案例2:锁等待优化
-- 问题:大量锁等待
-- 原因:长事务未提交
-- 诊断:
SELECT * FROM pg_stat_activity WHERE state = 'idle in transaction';
-- 优化方案:
-- 1. 缩短事务时间
-- 2. 设置超时
SET statement_timeout = '30s';
SET lock_timeout = '10s';
-- 3. 应用层优化
-- - 避免在事务中执行耗时操作
-- - 使用乐观锁代替悲观锁
案例3:表膨胀优化
-- 问题:表膨胀严重(死元组 > 50%)
-- 诊断:
SELECT * FROM pg_stat_user_tables WHERE dead_ratio > 50;
-- 优化方案:
-- 1. 手动 VACUUM
VACUUM FULL orders; -- 锁表,慎用
-- 2. 调整 autovacuum 参数
ALTER TABLE orders SET (autovacuum_vacuum_scale_factor = 0.05);
ALTER TABLE orders SET (autovacuum_analyze_scale_factor = 0.02);
-- 3. 使用 pg_repack(在线重建)
pg_repack -t orders -d mydb
7. 监控告警
7.1 关键指标
数据库层:
- QPS/TPS
- 慢查询数量
- 连接数使用率
- 缓存命中率
- 锁等待时间
- 复制延迟
系统层:
- CPU 使用率
- 内存使用率
- 磁盘 IO(IOPS/吞吐量)
- 网络流量
7.2 告警阈值
- 慢查询 > 1s:警告
- 慢查询 > 5s:严重
- 连接数 > 80%:警告
- 连接数 > 95%:严重
- 复制延迟 > 10s:警告
- 复制延迟 > 60s:严重
- CPU > 80%:警告
- 磁盘 IO 等待 > 50%:警告
8. 工具推荐
| 工具 | 用途 | 支持数据库 |
|---|---|---|
| pt-query-digest | 慢查询分析 | MySQL |
| pg_stat_statements | 慢查询分析 | PostgreSQL |
| AWR/ASH | 性能报告 | Oracle |
| Prometheus + Grafana | 监控可视化 | 全部 |
| pgBadger | 日志分析 | PostgreSQL |
| MySQLTuner | 参数优化建议 | MySQL |
| pg_repack | 在线表重建 | PostgreSQL |
| Percona Toolkit | 性能工具集 | MySQL |
总结:性能优化是持续过程,需要监控、诊断、优化、验证的闭环。