Skip to content

SQL 数据分析

SQL 是数据分析中最常用的查询语言,用于从数据库中提取和分析数据。

查询优化

索引优化

sql
-- 创建单列索引
CREATE INDEX idx_users_email ON users(email);

-- 创建复合索引
CREATE INDEX idx_orders_customer_date ON orders(customer_id, order_date);

-- 创建唯一索引
CREATE UNIQUE INDEX idx_users_username ON users(username);

-- 删除索引
DROP INDEX idx_users_email;

查询重写

sql
-- 避免 SELECT *
SELECT name, email FROM users WHERE id = 1;

-- 使用 JOIN 代替子查询
SELECT u.name, o.order_id
FROM users u
JOIN orders o ON u.id = o.customer_id
WHERE u.country = 'China';

-- 使用 EXISTS 代替 IN
SELECT name
FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o 
    WHERE o.customer_id = u.id
);

窗口函数

排名函数

sql
-- ROW_NUMBER: 连续排名
SELECT 
    name, 
    score,
    ROW_NUMBER() OVER (ORDER BY score DESC) as row_num
FROM students;

-- RANK: 跳跃排名
SELECT 
    name, 
    score,
    RANK() OVER (ORDER BY score DESC) as rank
FROM students;

-- DENSE_RANK: 密集排名
SELECT 
    name, 
    score,
    DENSE_RANK() OVER (ORDER BY score DESC) as dense_rank
FROM students;

聚合窗口函数

sql
-- 计算累计和
SELECT 
    date,
    sales,
    SUM(sales) OVER (ORDER BY date) as cumulative_sales
FROM daily_sales;

-- 移动平均
SELECT 
    date,
    sales,
    AVG(sales) OVER (
        ORDER BY date 
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) as moving_avg
FROM daily_sales;

-- 分组统计
SELECT 
    department,
    name,
    salary,
    AVG(salary) OVER (PARTITION BY department) as dept_avg
FROM employees;

CTE 表达式

基本 CTE

sql
WITH monthly_sales AS (
    SELECT 
        DATE_TRUNC('month', order_date) as month,
        SUM(amount) as total_sales
    FROM orders
    GROUP BY DATE_TRUNC('month', order_date)
)
SELECT 
    month,
    total_sales,
    LAG(total_sales) OVER (ORDER BY month) as prev_month_sales
FROM monthly_sales;

递归 CTE

sql
-- 查询组织层级
WITH RECURSIVE org_hierarchy AS (
    -- 基础情况:顶级员工
    SELECT 
        id,
        name,
        manager_id,
        1 as level
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    -- 递归情况:下级员工
    SELECT 
        e.id,
        e.name,
        e.manager_id,
        oh.level + 1 as level
    FROM employees e
    JOIN org_hierarchy oh ON e.manager_id = oh.id
)
SELECT * FROM org_hierarchy;

数据透视

PIVOT 操作

sql
-- PostgreSQL
SELECT *
FROM (
    SELECT department, category, amount
    FROM expenses
) AS source
PIVOT (
    SUM(amount)
    FOR category IN ('salary', 'office', 'travel')
) AS pivot_table;

-- MySQL 使用 CASE
SELECT 
    department,
    SUM(CASE WHEN category = 'salary' THEN amount ELSE 0 END) as salary,
    SUM(CASE WHEN category = 'office' THEN amount ELSE 0 END) as office,
    SUM(CASE WHEN category = 'travel' THEN amount ELSE 0 END) as travel
FROM expenses
GROUP BY department;

性能分析

EXPLAIN 分析

sql
EXPLAIN ANALYZE
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.customer_id
WHERE u.created_at > '2024-01-01'
GROUP BY u.id, u.name
ORDER BY order_count DESC
LIMIT 10;

慢查询日志

sql
-- MySQL 开启慢查询日志
SET GLOBAL slow_query_log = 'ON';
SET GLOBAL long_query_time = 2; -- 超过 2 秒的查询记录
SET GLOBAL slow_query_log_file = '/var/log/mysql/slow.log';