bigquery

Compare original and translation side by side

🇺🇸

Original

English
🇨🇳

Translation

Chinese

BigQuery Expert Engineer Skill

BigQuery专家工程师技能

This skill provides a comprehensive guide for BigQuery development.
本技能为BigQuery开发提供全面指南。

1. bq Command Line Tool Basics

1. bq命令行工具基础

1.1. Query Execution

1.1. 查询执行

sh
undefined
sh
undefined

Execute query with Standard SQL

使用标准SQL执行查询

bq query --use_legacy_sql=false 'SELECT * FROM
project.dataset.table
LIMIT 10'
bq query --use_legacy_sql=false 'SELECT * FROM
project.dataset.table
LIMIT 10'

Output results in CSV format

以CSV格式输出结果

bq query --use_legacy_sql=false --format=csv 'SELECT * FROM
project.dataset.table
'
bq query --use_legacy_sql=false --format=csv 'SELECT * FROM
project.dataset.table
'

Dry run (cost estimation)

试运行(成本估算)

bq query --use_legacy_sql=false --dry_run 'SELECT * FROM
project.dataset.table
'
bq query --use_legacy_sql=false --dry_run 'SELECT * FROM
project.dataset.table
'

Save results to table

将结果保存到表中

bq query --use_legacy_sql=false --destination_table=project:dataset.result_table 'SELECT * FROM
project.dataset.table
'
undefined
bq query --use_legacy_sql=false --destination_table=project:dataset.result_table 'SELECT * FROM
project.dataset.table
'
undefined

1.2. Table Operations

1.2. 表操作

sh
undefined
sh
undefined

List tables

列出表

bq ls project:dataset
bq ls project:dataset

Check table schema

查看表结构

bq show --schema --format=prettyjson project:dataset.table
bq show --schema --format=prettyjson project:dataset.table

Create table (from schema file)

从Schema文件创建表

bq mk --table project:dataset.table schema.json
bq mk --table project:dataset.table schema.json

Create partitioned table

创建分区表

bq mk --table --time_partitioning_field=created_at project:dataset.table schema.json
bq mk --table --time_partitioning_field=created_at project:dataset.table schema.json

Create clustered table

创建聚簇表

bq mk --table --clustering_fields=user_id,category project:dataset.table schema.json
bq mk --table --clustering_fields=user_id,category project:dataset.table schema.json

Delete table

删除表

bq rm -t project:dataset.table
undefined
bq rm -t project:dataset.table
undefined

1.3. Data Load/Export

1.3. 数据加载/导出

sh
undefined
sh
undefined

Load from CSV

从CSV加载数据

bq load --source_format=CSV project:dataset.table gs://bucket/data.csv schema.json
bq load --source_format=CSV project:dataset.table gs://bucket/data.csv schema.json

Load from JSON

从JSON加载数据

bq load --source_format=NEWLINE_DELIMITED_JSON project:dataset.table gs://bucket/data.json
bq load --source_format=NEWLINE_DELIMITED_JSON project:dataset.table gs://bucket/data.json

Load from Parquet (auto-detect schema)

从Parquet加载数据(自动检测Schema)

bq load --source_format=PARQUET --autodetect project:dataset.table gs://bucket/data.parquet
bq load --source_format=PARQUET --autodetect project:dataset.table gs://bucket/data.parquet

Export to Cloud Storage

导出到云存储

bq extract --destination_format=CSV project:dataset.table gs://bucket/export/*.csv
undefined
bq extract --destination_format=CSV project:dataset.table gs://bucket/export/*.csv
undefined

2. GoogleSQL Basic Syntax

2. GoogleSQL基础语法

2.1. SELECT Statement

2.1. SELECT语句

sql
-- Basic SELECT
SELECT
  column1,
  column2,
  COUNT(*) AS count
FROM
  `project.dataset.table`
WHERE
  date >= '2024-01-01'
GROUP BY
  column1, column2
HAVING
  COUNT(*) > 10
ORDER BY
  count DESC
LIMIT 100
sql
-- 基础SELECT
SELECT
  column1,
  column2,
  COUNT(*) AS count
FROM
  `project.dataset.table`
WHERE
  date >= '2024-01-01'
GROUP BY
  column1, column2
HAVING
  COUNT(*) > 10
ORDER BY
  count DESC
LIMIT 100

2.2. Common Functions

2.2. 常用函数

sql
-- String functions
CONCAT(str1, str2)
LOWER(str), UPPER(str)
TRIM(str), LTRIM(str), RTRIM(str)
SUBSTR(str, start, length)
REGEXP_CONTAINS(str, r'pattern')
REGEXP_EXTRACT(str, r'pattern')
SPLIT(str, delimiter)

-- Date/time functions
CURRENT_DATE(), CURRENT_TIMESTAMP()
DATE(timestamp), TIMESTAMP(date)
DATE_ADD(date, INTERVAL 1 DAY)
DATE_DIFF(date1, date2, DAY)
FORMAT_DATE('%Y-%m-%d', date)
PARSE_DATE('%Y%m%d', str)
EXTRACT(YEAR FROM date)

-- Aggregate functions
COUNT(*), COUNT(DISTINCT column)
SUM(column), AVG(column)
MIN(column), MAX(column)
ARRAY_AGG(column)
STRING_AGG(column, ',')

-- Window functions
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
RANK() OVER (ORDER BY col DESC)
LAG(col, 1) OVER (ORDER BY date)
LEAD(col, 1) OVER (ORDER BY date)
SUM(col) OVER (PARTITION BY category)
sql
-- 字符串函数
CONCAT(str1, str2)
LOWER(str), UPPER(str)
TRIM(str), LTRIM(str), RTRIM(str)
SUBSTR(str, start, length)
REGEXP_CONTAINS(str, r'pattern')
REGEXP_EXTRACT(str, r'pattern')
SPLIT(str, delimiter)

-- 日期/时间函数
CURRENT_DATE(), CURRENT_TIMESTAMP()
DATE(timestamp), TIMESTAMP(date)
DATE_ADD(date, INTERVAL 1 DAY)
DATE_DIFF(date1, date2, DAY)
FORMAT_DATE('%Y-%m-%d', date)
PARSE_DATE('%Y%m%d', str)
EXTRACT(YEAR FROM date)

-- 聚合函数
COUNT(*), COUNT(DISTINCT column)
SUM(column), AVG(column)
MIN(column), MAX(column)
ARRAY_AGG(column)
STRING_AGG(column, ',')

-- 窗口函数
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
RANK() OVER (ORDER BY col DESC)
LAG(col, 1) OVER (ORDER BY date)
LEAD(col, 1) OVER (ORDER BY date)
SUM(col) OVER (PARTITION BY category)

2.3. JOIN Syntax

2.3. JOIN语法

sql
-- INNER JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
INNER JOIN `project.dataset.table_b` AS b
  ON a.id = b.id

-- LEFT JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
LEFT JOIN `project.dataset.table_b` AS b
  ON a.id = b.id

-- CROSS JOIN (commonly used for array expansion)
SELECT *
FROM `project.dataset.table`,
UNNEST(array_column) AS element
sql
-- INNER JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
INNER JOIN `project.dataset.table_b` AS b
  ON a.id = b.id

-- LEFT JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
LEFT JOIN `project.dataset.table_b` AS b
  ON a.id = b.id

-- CROSS JOIN(常用于数组展开)
SELECT *
FROM `project.dataset.table`,
UNNEST(array_column) AS element

2.4. CTE (Common Table Expressions)

2.4. CTE(公共表表达式)

sql
WITH
  base_data AS (
    SELECT *
    FROM `project.dataset.table`
    WHERE date >= '2024-01-01'
  ),
  aggregated AS (
    SELECT
      category,
      COUNT(*) AS count
    FROM base_data
    GROUP BY category
  )
SELECT *
FROM aggregated
ORDER BY count DESC
sql
WITH
  base_data AS (
    SELECT *
    FROM `project.dataset.table`
    WHERE date >= '2024-01-01'
  ),
  aggregated AS (
    SELECT
      category,
      COUNT(*) AS count
    FROM base_data
    GROUP BY category
  )
SELECT *
FROM aggregated
ORDER BY count DESC

3. Table Design

3. 表设计

3.1. Partitioning

3.1. 分区

Divide data by date to reduce query scan volume.
sql
-- Create date-partitioned table
CREATE TABLE `project.dataset.partitioned_table`
PARTITION BY DATE(created_at)
AS SELECT * FROM `project.dataset.source_table`;

-- Integer partitioning
CREATE TABLE `project.dataset.int_partitioned`
PARTITION BY RANGE_BUCKET(user_id, GENERATE_ARRAY(0, 1000000, 10000))
AS SELECT * FROM source;

-- Require partition filter
CREATE TABLE `project.dataset.table`
PARTITION BY DATE(created_at)
OPTIONS (
  require_partition_filter = TRUE
);
按日期划分数据,减少查询扫描量。
sql
-- 创建日期分区表
CREATE TABLE `project.dataset.partitioned_table`
PARTITION BY DATE(created_at)
AS SELECT * FROM `project.dataset.source_table`;

-- 整数分区
CREATE TABLE `project.dataset.int_partitioned`
PARTITION BY RANGE_BUCKET(user_id, GENERATE_ARRAY(0, 1000000, 10000))
AS SELECT * FROM source;

-- 强制使用分区过滤器
CREATE TABLE `project.dataset.table`
PARTITION BY DATE(created_at)
OPTIONS (
  require_partition_filter = TRUE
);

3.2. Clustering

3.2. 聚簇

Sort and group data by specified columns.
sql
-- Clustering table
CREATE TABLE `project.dataset.clustered_table`
PARTITION BY DATE(created_at)
CLUSTER BY user_id, category
AS SELECT * FROM source;
按指定列对数据进行排序和分组。
sql
-- 创建聚簇表
CREATE TABLE `project.dataset.clustered_table`
PARTITION BY DATE(created_at)
CLUSTER BY user_id, category
AS SELECT * FROM source;

3.3. Best Practices

3.3. 最佳实践

  • Combine partitioning and clustering
  • Choose columns frequently filtered in queries
  • Maximum 4 clustering columns
  • Prioritize high-cardinality columns
  • 结合分区和聚簇
  • 选择查询中频繁过滤的列
  • 最多4个聚簇列
  • 优先选择高基数列

4. Performance Optimization

4. 性能优化

4.1. Query Optimization

4.1. 查询优化

sql
-- Avoid SELECT *
-- Bad
SELECT * FROM table;
-- Good
SELECT column1, column2 FROM table;

-- Leverage partition pruning
-- Bad (function applied to partition column)
WHERE DATE(created_at) = '2024-01-01'
-- Good
WHERE created_at >= '2024-01-01' AND created_at < '2024-01-02'

-- Use APPROX_ functions for estimates (faster)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM table;
sql
-- 避免使用SELECT *
-- 不良写法
SELECT * FROM table;
-- 推荐写法
SELECT column1, column2 FROM table;

-- 利用分区剪枝
-- 不良写法(对分区列使用函数)
WHERE DATE(created_at) = '2024-01-01'
-- 推荐写法
WHERE created_at >= '2024-01-01' AND created_at < '2024-01-02'

-- 使用APPROX_函数获取估算值(速度更快)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM table;

4.2. JOIN Optimization

4.2. JOIN优化

sql
-- Put smaller table on right side (broadcast JOIN)
SELECT *
FROM large_table
JOIN small_table ON large_table.id = small_table.id;

-- JOIN only needed columns
WITH filtered AS (
  SELECT id, needed_column FROM large_table WHERE condition
)
SELECT * FROM filtered JOIN other_table ON ...
sql
-- 将小表放在右侧(广播JOIN)
SELECT *
FROM large_table
JOIN small_table ON large_table.id = small_table.id;

-- 仅JOIN需要的列
WITH filtered AS (
  SELECT id, needed_column FROM large_table WHERE condition
)
SELECT * FROM filtered JOIN other_table ON ...

4.3. Check Slot Usage

4.3. 检查Slot使用情况

sql
-- Check job statistics
SELECT
  job_id,
  total_bytes_processed,
  total_slot_ms,
  TIMESTAMP_DIFF(end_time, start_time, SECOND) AS duration_sec
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY total_slot_ms DESC
LIMIT 10;
sql
-- 查看作业统计信息
SELECT
  job_id,
  total_bytes_processed,
  total_slot_ms,
  TIMESTAMP_DIFF(end_time, start_time, SECOND) AS duration_sec
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY total_slot_ms DESC
LIMIT 10;

5. Cost Management

5. 成本管理

5.1. Pricing Model

5.1. 定价模型

  • On-demand: Based on scanned data ($6.25/TiB; first 1 TiB per month free)
  • Flat-rate (Editions): Based on reserved slots
  • Storage: Active $0.02/GB, Long-term $0.01/GB
  • 按需付费:基于扫描数据量(6.25美元/TiB,每月前1TiB免费)
  • 固定费率(版本):基于预留Slot
  • 存储:活跃存储0.02美元/GB,长期存储0.01美元/GB

5.2. Cost Reduction Best Practices

5.2. 成本削减最佳实践

  1. Avoid SELECT *
  2. Always use partition filters
  3. Check cost with dry run before queries
  4. Optimize repeated queries with materialized views
  5. Speed up dashboard queries with BI Engine
  1. 避免使用SELECT *
  2. 始终使用分区过滤器
  3. 查询前通过试运行检查成本
  4. 使用物化视图优化重复查询
  5. 使用BI Engine加速仪表板查询

5.3. Custom Quota Settings

5.3. 自定义配额设置

sql
-- Set query byte limit per project
-- Configure in Cloud Console or gcloud
sql
-- 设置项目级查询字节限制
-- 在Cloud Console或gcloud中配置

6. Data Governance

6. 数据治理

6.1. IAM Roles

6.1. IAM角色

  • roles/bigquery.admin
    : Full permissions
  • roles/bigquery.dataEditor
    : Read/write data
  • roles/bigquery.dataViewer
    : Read-only data
  • roles/bigquery.jobUser
    : Execute jobs
  • roles/bigquery.user
    : List datasets, execute jobs
  • roles/bigquery.admin
    :完整权限
  • roles/bigquery.dataEditor
    :读写数据
  • roles/bigquery.dataViewer
    :只读数据
  • roles/bigquery.jobUser
    :执行作业
  • roles/bigquery.user
    :列出数据集、执行作业

6.2. Column-level Security

6.2. 列级安全

sql
-- Apply policy tag
ALTER TABLE `project.dataset.table`
ALTER COLUMN sensitive_column
SET OPTIONS (policy_tags = ['projects/project/locations/us/taxonomies/123/policyTags/456']);
sql
-- 应用策略标签
ALTER TABLE `project.dataset.table`
ALTER COLUMN sensitive_column
SET OPTIONS (policy_tags = ['projects/project/locations/us/taxonomies/123/policyTags/456']);

6.3. Row-level Security

6.3. 行级安全

sql
-- Create row access policy
CREATE ROW ACCESS POLICY region_filter
ON `project.dataset.table`
GRANT TO ('user:analyst@example.com')
FILTER USING (region = 'APAC');
sql
-- 创建行访问策略
CREATE ROW ACCESS POLICY region_filter
ON `project.dataset.table`
GRANT TO ('user:analyst@example.com')
FILTER USING (region = 'APAC');

7. BigQuery ML

7. BigQuery ML

7.1. Model Creation

7.1. 模型创建

sql
-- Linear regression model
CREATE OR REPLACE MODEL `project.dataset.model`
OPTIONS (
  model_type = 'LINEAR_REG',
  input_label_cols = ['target']
) AS
SELECT feature1, feature2, target
FROM `project.dataset.training_data`;

-- Logistic regression model
CREATE OR REPLACE MODEL `project.dataset.classifier`
OPTIONS (
  model_type = 'LOGISTIC_REG',
  input_label_cols = ['label']
) AS
SELECT * FROM training_data;
sql
-- 线性回归模型
CREATE OR REPLACE MODEL `project.dataset.model`
OPTIONS (
  model_type = 'LINEAR_REG',
  input_label_cols = ['target']
) AS
SELECT feature1, feature2, target
FROM `project.dataset.training_data`;

-- 逻辑回归模型
CREATE OR REPLACE MODEL `project.dataset.classifier`
OPTIONS (
  model_type = 'LOGISTIC_REG',
  input_label_cols = ['label']
) AS
SELECT * FROM training_data;

7.2. Model Evaluation and Prediction

7.2. 模型评估与预测

sql
-- Model evaluation
SELECT * FROM ML.EVALUATE(MODEL `project.dataset.model`);

-- Prediction
SELECT *
FROM ML.PREDICT(
  MODEL `project.dataset.model`,
  (SELECT * FROM `project.dataset.new_data`)
);
sql
-- 模型评估
SELECT * FROM ML.EVALUATE(MODEL `project.dataset.model`);

-- 预测
SELECT *
FROM ML.PREDICT(
  MODEL `project.dataset.model`,
  (SELECT * FROM `project.dataset.new_data`)
);

8. External Data Sources

8. 外部数据源

8.1. External Tables

8.1. 外部表

sql
-- Reference Cloud Storage CSV as external table
CREATE EXTERNAL TABLE `project.dataset.external_table`
OPTIONS (
  format = 'CSV',
  uris = ['gs://bucket/path/*.csv'],
  skip_leading_rows = 1
);

-- Parquet external table
CREATE EXTERNAL TABLE `project.dataset.parquet_table`
OPTIONS (
  format = 'PARQUET',
  uris = ['gs://bucket/path/*.parquet']
);
sql
-- 将Cloud Storage中的CSV作为外部表引用
CREATE EXTERNAL TABLE `project.dataset.external_table`
OPTIONS (
  format = 'CSV',
  uris = ['gs://bucket/path/*.csv'],
  skip_leading_rows = 1
);

-- Parquet外部表
CREATE EXTERNAL TABLE `project.dataset.parquet_table`
OPTIONS (
  format = 'PARQUET',
  uris = ['gs://bucket/path/*.parquet']
);

8.2. Federated Query

8.2. 联合查询

sql
-- Connect to Cloud SQL
SELECT * FROM EXTERNAL_QUERY(
  'projects/project/locations/us/connections/connection_id',
  'SELECT * FROM mysql_table'
);
sql
-- 连接到Cloud SQL
SELECT * FROM EXTERNAL_QUERY(
  'projects/project/locations/us/connections/connection_id',
  'SELECT * FROM mysql_table'
);

9. Scheduled Queries

9. 计划查询

9.1. Configuration Example

9.1. 配置示例

sql
-- Configure in Cloud Console or bq command
-- Run daily at 2 AM
bq query --use_legacy_sql=false \
  --schedule='every day 02:00' \
  --display_name='Daily aggregation' \
  --destination_table='project:dataset.daily_summary' \
  --replace \
  'SELECT DATE(created_at) as date, COUNT(*) as count FROM source GROUP BY 1'
sql
-- 在Cloud Console或bq命令中配置
-- 每天凌晨2点运行
bq query --use_legacy_sql=false \
  --schedule='every day 02:00' \
  --display_name='Daily aggregation' \
  --destination_table='project:dataset.daily_summary' \
  --replace \
  'SELECT DATE(created_at) as date, COUNT(*) as count FROM source GROUP BY 1'

10. Reference Links

10. 参考链接