sql-patterns

Compare original and translation side by side

🇺🇸

Original

English
🇨🇳

Translation

Chinese

SQL-Patterns

SQL模式

Advanced SQL patterns for data engineering beyond basic SELECT/JOIN.
面向数据工程的高级SQL模式,超越基础的SELECT/JOIN操作。

Common Table Expressions (CTEs)

通用表表达式(CTEs)

sql
-- Chain transformations readably
WITH
  active_users AS (
    SELECT user_id, email
    FROM users
    WHERE status = 'active'
  ),
  user_orders AS (
    SELECT u.user_id, COUNT(*) as order_count
    FROM active_users u
    JOIN orders o ON u.user_id = o.user_id
    GROUP BY u.user_id
  )
SELECT * FROM user_orders WHERE order_count > 5;
sql
-- Chain transformations readably
WITH
  active_users AS (
    SELECT user_id, email
    FROM users
    WHERE status = 'active'
  ),
  user_orders AS (
    SELECT u.user_id, COUNT(*) as order_count
    FROM active_users u
    JOIN orders o ON u.user_id = o.user_id
    GROUP BY u.user_id
  )
SELECT * FROM user_orders WHERE order_count > 5;

Window Functions

窗口函数

sql
-- Row numbering within groups
SELECT *,
  ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at DESC) as rn
FROM products;

-- Running totals
SELECT
  date,
  revenue,
  SUM(revenue) OVER (ORDER BY date) as cumulative_revenue
FROM daily_sales;

-- Percent of total
SELECT
  category,
  sales,
  sales * 100.0 / SUM(sales) OVER () as pct_of_total
FROM category_sales;

-- Lead/Lag for time series
SELECT
  date,
  value,
  LAG(value, 1) OVER (ORDER BY date) as prev_value,
  value - LAG(value, 1) OVER (ORDER BY date) as change
FROM metrics;

-- Ranking with ties
SELECT *,
  RANK() OVER (ORDER BY score DESC) as rank,       -- 1,2,2,4
  DENSE_RANK() OVER (ORDER BY score DESC) as drank -- 1,2,2,3
FROM scores;
sql
-- Row numbering within groups
SELECT *,
  ROW_NUMBER() OVER (PARTITION BY category ORDER BY created_at DESC) as rn
FROM products;

-- Running totals
SELECT
  date,
  revenue,
  SUM(revenue) OVER (ORDER BY date) as cumulative_revenue
FROM daily_sales;

-- Percent of total
SELECT
  category,
  sales,
  sales * 100.0 / SUM(sales) OVER () as pct_of_total
FROM category_sales;

-- Lead/Lag for time series
SELECT
  date,
  value,
  LAG(value, 1) OVER (ORDER BY date) as prev_value,
  value - LAG(value, 1) OVER (ORDER BY date) as change
FROM metrics;

-- Ranking with ties
SELECT *,
  RANK() OVER (ORDER BY score DESC) as rank,       -- 1,2,2,4
  DENSE_RANK() OVER (ORDER BY score DESC) as drank -- 1,2,2,3
FROM scores;

Recursive CTEs

递归CTEs

sql
-- Hierarchical data (org chart, categories)
WITH RECURSIVE org_tree AS (
  -- Base case: top-level managers
  SELECT id, name, manager_id, 1 as depth
  FROM employees
  WHERE manager_id IS NULL

  UNION ALL

  -- Recursive case: subordinates
  SELECT e.id, e.name, e.manager_id, t.depth + 1
  FROM employees e
  JOIN org_tree t ON e.manager_id = t.id
)
SELECT * FROM org_tree;

-- Generate date series
WITH RECURSIVE dates AS (
  SELECT DATE '2024-01-01' as dt
  UNION ALL
  SELECT dt + INTERVAL '1 day'
  FROM dates
  WHERE dt < DATE '2024-12-31'
)
SELECT * FROM dates;
sql
-- Hierarchical data (org chart, categories)
WITH RECURSIVE org_tree AS (
  -- Base case: top-level managers
  SELECT id, name, manager_id, 1 as depth
  FROM employees
  WHERE manager_id IS NULL

  UNION ALL

  -- Recursive case: subordinates
  SELECT e.id, e.name, e.manager_id, t.depth + 1
  FROM employees e
  JOIN org_tree t ON e.manager_id = t.id
)
SELECT * FROM org_tree;

-- Generate date series
WITH RECURSIVE dates AS (
  SELECT DATE '2024-01-01' as dt
  UNION ALL
  SELECT dt + INTERVAL '1 day'
  FROM dates
  WHERE dt < DATE '2024-12-31'
)
SELECT * FROM dates;

CASE Expressions

CASE表达式

sql
-- Simple CASE
SELECT
  CASE status
    WHEN 'A' THEN 'Active'
    WHEN 'I' THEN 'Inactive'
    ELSE 'Unknown'
  END as status_label
FROM users;

-- Searched CASE for ranges
SELECT
  CASE
    WHEN age < 18 THEN 'Minor'
    WHEN age < 65 THEN 'Adult'
    ELSE 'Senior'
  END as age_group
FROM users;

-- Conditional aggregation
SELECT
  COUNT(*) as total,
  COUNT(*) FILTER (WHERE status = 'active') as active_count,  -- PostgreSQL
  SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) as active_count  -- ANSI
FROM users;
sql
-- Simple CASE
SELECT
  CASE status
    WHEN 'A' THEN 'Active'
    WHEN 'I' THEN 'Inactive'
    ELSE 'Unknown'
  END as status_label
FROM users;

-- Searched CASE for ranges
SELECT
  CASE
    WHEN age < 18 THEN 'Minor'
    WHEN age < 65 THEN 'Adult'
    ELSE 'Senior'
  END as age_group
FROM users;

-- Conditional aggregation
SELECT
  COUNT(*) as total,
  COUNT(*) FILTER (WHERE status = 'active') as active_count,  -- PostgreSQL
  SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) as active_count  -- ANSI
FROM users;

UPSERT Patterns

UPSERT模式

sql
-- PostgreSQL: INSERT ON CONFLICT
INSERT INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, NOW())
ON CONFLICT (sku) DO UPDATE SET
  quantity = EXCLUDED.quantity,
  updated_at = EXCLUDED.updated_at;

-- MySQL: INSERT ON DUPLICATE KEY
INSERT INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, NOW())
ON DUPLICATE KEY UPDATE
  quantity = VALUES(quantity),
  updated_at = VALUES(updated_at);

-- SQLite: INSERT OR REPLACE
INSERT OR REPLACE INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, datetime('now'));
sql
-- PostgreSQL: INSERT ON CONFLICT
INSERT INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, NOW())
ON CONFLICT (sku) DO UPDATE SET
  quantity = EXCLUDED.quantity,
  updated_at = EXCLUDED.updated_at;

-- MySQL: INSERT ON DUPLICATE KEY
INSERT INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, NOW())
ON DUPLICATE KEY UPDATE
  quantity = VALUES(quantity),
  updated_at = VALUES(updated_at);

-- SQLite: INSERT OR REPLACE
INSERT OR REPLACE INTO inventory (sku, quantity, updated_at)
VALUES ('ABC123', 100, datetime('now'));

Efficient Pagination

高效分页

sql
-- BAD: OFFSET for large pages
SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 10000;

-- GOOD: Keyset pagination
SELECT * FROM orders
WHERE id > 10000  -- last seen id
ORDER BY id
LIMIT 20;
sql
-- BAD: OFFSET for large pages
SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 10000;

-- GOOD: Keyset pagination
SELECT * FROM orders
WHERE id > 10000  -- last seen id
ORDER BY id
LIMIT 20;

Batch Operations

批量操作

sql
-- Batch DELETE with limit (avoid long locks)
DELETE FROM logs
WHERE created_at < NOW() - INTERVAL '90 days'
LIMIT 10000;

-- Batch UPDATE
UPDATE orders
SET status = 'archived'
WHERE id IN (
  SELECT id FROM orders
  WHERE status = 'completed'
  AND completed_at < NOW() - INTERVAL '1 year'
  LIMIT 1000
);
sql
-- Batch DELETE with limit (avoid long locks)
DELETE FROM logs
WHERE created_at < NOW() - INTERVAL '90 days'
LIMIT 10000;

-- Batch UPDATE
UPDATE orders
SET status = 'archived'
WHERE id IN (
  SELECT id FROM orders
  WHERE status = 'completed'
  AND completed_at < NOW() - INTERVAL '1 year'
  LIMIT 1000
);

Index-Friendly Queries

索引友好的查询

sql
-- BAD: Function on indexed column
SELECT * FROM users WHERE LOWER(email) = 'test@example.com';

-- GOOD: Store lowercase or use expression index
SELECT * FROM users WHERE email_lower = 'test@example.com';
-- Or: CREATE INDEX idx_email_lower ON users (LOWER(email));

-- BAD: Leading wildcard
SELECT * FROM products WHERE name LIKE '%widget%';

-- GOOD: Full-text search or prefix match
SELECT * FROM products WHERE name LIKE 'widget%';
sql
-- BAD: Function on indexed column
SELECT * FROM users WHERE LOWER(email) = 'test@example.com';

-- GOOD: Store lowercase or use expression index
SELECT * FROM users WHERE email_lower = 'test@example.com';
-- Or: CREATE INDEX idx_email_lower ON users (LOWER(email));

-- BAD: Leading wildcard
SELECT * FROM products WHERE name LIKE '%widget%';

-- GOOD: Full-text search or prefix match
SELECT * FROM products WHERE name LIKE 'widget%';

NULL Handling

NULL值处理

sql
-- COALESCE for defaults
SELECT COALESCE(nickname, first_name, 'Anonymous') as display_name
FROM users;

-- NULLIF to convert values to NULL
SELECT NULLIF(status, '') as status  -- empty string -> NULL
FROM records;

-- IS DISTINCT FROM (NULL-safe comparison)
SELECT * FROM a
WHERE a.value IS DISTINCT FROM b.value;  -- treats NULL != NULL as false
sql
-- COALESCE for defaults
SELECT COALESCE(nickname, first_name, 'Anonymous') as display_name
FROM users;

-- NULLIF to convert values to NULL
SELECT NULLIF(status, '') as status  -- empty string -> NULL
FROM records;

-- IS DISTINCT FROM (NULL-safe comparison)
SELECT * FROM a
WHERE a.value IS DISTINCT FROM b.value;  -- treats NULL != NULL as false

LATERAL Joins

LATERAL连接

sql
-- Top N per group
SELECT d.name, t.product, t.revenue
FROM departments d
CROSS JOIN LATERAL (
  SELECT product, revenue
  FROM sales
  WHERE sales.dept_id = d.id
  ORDER BY revenue DESC
  LIMIT 3
) t;
sql
-- Top N per group
SELECT d.name, t.product, t.revenue
FROM departments d
CROSS JOIN LATERAL (
  SELECT product, revenue
  FROM sales
  WHERE sales.dept_id = d.id
  ORDER BY revenue DESC
  LIMIT 3
) t;

Materialized Views

物化视图

sql
-- Create for expensive aggregations
CREATE MATERIALIZED VIEW daily_stats AS
SELECT
  DATE_TRUNC('day', created_at) as date,
  COUNT(*) as total_orders,
  SUM(amount) as revenue
FROM orders
GROUP BY 1;

-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY daily_stats;
sql
-- Create for expensive aggregations
CREATE MATERIALIZED VIEW daily_stats AS
SELECT
  DATE_TRUNC('day', created_at) as date,
  COUNT(*) as total_orders,
  SUM(amount) as revenue
FROM orders
GROUP BY 1;

-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY daily_stats;

Query Optimization Checklist

查询优化检查清单

  1. Check EXPLAIN ANALYZE - Look for sequential scans on large tables
  2. Add missing indexes - Columns in WHERE, JOIN, ORDER BY
  3. **Avoid SELECT *** - Fetch only needed columns
  4. Use EXISTS over IN - For correlated subqueries
  5. Batch large operations - Avoid long-running transactions
  6. Partition large tables - By date or category
  7. Use connection pooling - Avoid connection overhead
  1. 检查EXPLAIN ANALYZE - 查看大表上的全表扫描
  2. 添加缺失的索引 - WHERE、JOIN、ORDER BY子句中的列
  3. **避免使用SELECT *** - 仅获取需要的列
  4. 使用EXISTS替代IN - 针对关联子查询
  5. 批量处理大型操作 - 避免长时间运行的事务
  6. 对大表进行分区 - 按日期或类别
  7. 使用连接池 - 避免连接开销