Loading...
Loading...
BigQuery Expert Engineer Skill - Comprehensive guide for GoogleSQL queries, data management, performance optimization, and cost management Use when: - Running bq commands (query, load, extract) - Writing GoogleSQL queries (functions, JOINs, CTEs) - Designing partitioned/clustered tables - Using BigQuery ML or external data sources
npx skill4agent add i9wa4/dotfiles bigquery# Execute query with Standard SQL
bq query --use_legacy_sql=false 'SELECT * FROM `project.dataset.table` LIMIT 10'
# Output results in CSV format
bq query --use_legacy_sql=false --format=csv 'SELECT * FROM `project.dataset.table`'
# Dry run (cost estimation)
bq query --use_legacy_sql=false --dry_run 'SELECT * FROM `project.dataset.table`'
# Save results to table
bq query --use_legacy_sql=false --destination_table=project:dataset.result_table 'SELECT * FROM `project.dataset.table`'# List tables
bq ls project:dataset
# Check table schema
bq show --schema --format=prettyjson project:dataset.table
# Create table (from schema file)
bq mk --table project:dataset.table schema.json
# Create partitioned table
bq mk --table --time_partitioning_field=created_at project:dataset.table schema.json
# Create clustered table
bq mk --table --clustering_fields=user_id,category project:dataset.table schema.json
# Delete table
bq rm -t project:dataset.table# Load from CSV
bq load --source_format=CSV project:dataset.table gs://bucket/data.csv schema.json
# Load from JSON
bq load --source_format=NEWLINE_DELIMITED_JSON project:dataset.table gs://bucket/data.json
# Load from Parquet (auto-detect schema)
bq load --source_format=PARQUET --autodetect project:dataset.table gs://bucket/data.parquet
# Export to Cloud Storage
bq extract --destination_format=CSV project:dataset.table gs://bucket/export/*.csv-- Basic SELECT
SELECT
column1,
column2,
COUNT(*) AS count
FROM
`project.dataset.table`
WHERE
date >= '2024-01-01'
GROUP BY
column1, column2
HAVING
COUNT(*) > 10
ORDER BY
count DESC
LIMIT 100-- String functions
CONCAT(str1, str2)
LOWER(str), UPPER(str)
TRIM(str), LTRIM(str), RTRIM(str)
SUBSTR(str, start, length)
REGEXP_CONTAINS(str, r'pattern')
REGEXP_EXTRACT(str, r'pattern')
SPLIT(str, delimiter)
-- Date/time functions
CURRENT_DATE(), CURRENT_TIMESTAMP()
DATE(timestamp), TIMESTAMP(date)
DATE_ADD(date, INTERVAL 1 DAY)
DATE_DIFF(date1, date2, DAY)
FORMAT_DATE('%Y-%m-%d', date)
PARSE_DATE('%Y%m%d', str)
EXTRACT(YEAR FROM date)
-- Aggregate functions
COUNT(*), COUNT(DISTINCT column)
SUM(column), AVG(column)
MIN(column), MAX(column)
ARRAY_AGG(column)
STRING_AGG(column, ',')
-- Window functions
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
RANK() OVER (ORDER BY col DESC)
LAG(col, 1) OVER (ORDER BY date)
LEAD(col, 1) OVER (ORDER BY date)
SUM(col) OVER (PARTITION BY category)-- INNER JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
INNER JOIN `project.dataset.table_b` AS b
ON a.id = b.id
-- LEFT JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
LEFT JOIN `project.dataset.table_b` AS b
ON a.id = b.id
-- CROSS JOIN (commonly used for array expansion)
SELECT *
FROM `project.dataset.table`,
UNNEST(array_column) AS elementWITH
base_data AS (
SELECT *
FROM `project.dataset.table`
WHERE date >= '2024-01-01'
),
aggregated AS (
SELECT
category,
COUNT(*) AS count
FROM base_data
GROUP BY category
)
SELECT *
FROM aggregated
ORDER BY count DESC-- Create date-partitioned table
CREATE TABLE `project.dataset.partitioned_table`
PARTITION BY DATE(created_at)
AS SELECT * FROM `project.dataset.source_table`;
-- Integer partitioning
CREATE TABLE `project.dataset.int_partitioned`
PARTITION BY RANGE_BUCKET(user_id, GENERATE_ARRAY(0, 1000000, 10000))
AS SELECT * FROM source;
-- Require partition filter
CREATE TABLE `project.dataset.table`
PARTITION BY DATE(created_at)
OPTIONS (
require_partition_filter = TRUE
);-- Clustering table
CREATE TABLE `project.dataset.clustered_table`
PARTITION BY DATE(created_at)
CLUSTER BY user_id, category
AS SELECT * FROM source;-- Avoid SELECT *
-- Bad
SELECT * FROM table;
-- Good
SELECT column1, column2 FROM table;
-- Leverage partition pruning
-- Bad (function applied to partition column)
WHERE DATE(created_at) = '2024-01-01'
-- Good
WHERE created_at >= '2024-01-01' AND created_at < '2024-01-02'
-- Use APPROX_ functions for estimates (faster)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM table;-- Put smaller table on right side (broadcast JOIN)
SELECT *
FROM large_table
JOIN small_table ON large_table.id = small_table.id;
-- JOIN only needed columns
WITH filtered AS (
SELECT id, needed_column FROM large_table WHERE condition
)
SELECT * FROM filtered JOIN other_table ON ...-- Check job statistics
SELECT
job_id,
total_bytes_processed,
total_slot_ms,
TIMESTAMP_DIFF(end_time, start_time, SECOND) AS duration_sec
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY total_slot_ms DESC
LIMIT 10;-- Set query byte limit per project
-- Configure in Cloud Console or gcloudroles/bigquery.adminroles/bigquery.dataEditorroles/bigquery.dataViewerroles/bigquery.jobUserroles/bigquery.user-- Apply policy tag
ALTER TABLE `project.dataset.table`
ALTER COLUMN sensitive_column
SET OPTIONS (policy_tags = ['projects/project/locations/us/taxonomies/123/policyTags/456']);-- Create row access policy
CREATE ROW ACCESS POLICY region_filter
ON `project.dataset.table`
GRANT TO ('user:analyst@example.com')
FILTER USING (region = 'APAC');-- Linear regression model
CREATE OR REPLACE MODEL `project.dataset.model`
OPTIONS (
model_type = 'LINEAR_REG',
input_label_cols = ['target']
) AS
SELECT feature1, feature2, target
FROM `project.dataset.training_data`;
-- Logistic regression model
CREATE OR REPLACE MODEL `project.dataset.classifier`
OPTIONS (
model_type = 'LOGISTIC_REG',
input_label_cols = ['label']
) AS
SELECT * FROM training_data;-- Model evaluation
SELECT * FROM ML.EVALUATE(MODEL `project.dataset.model`);
-- Prediction
SELECT *
FROM ML.PREDICT(
MODEL `project.dataset.model`,
(SELECT * FROM `project.dataset.new_data`)
);-- Reference Cloud Storage CSV as external table
CREATE EXTERNAL TABLE `project.dataset.external_table`
OPTIONS (
format = 'CSV',
uris = ['gs://bucket/path/*.csv'],
skip_leading_rows = 1
);
-- Parquet external table
CREATE EXTERNAL TABLE `project.dataset.parquet_table`
OPTIONS (
format = 'PARQUET',
uris = ['gs://bucket/path/*.parquet']
);-- Connect to Cloud SQL
SELECT * FROM EXTERNAL_QUERY(
'projects/project/locations/us/connections/connection_id',
'SELECT * FROM mysql_table'
);-- Configure in Cloud Console or bq command
-- Run daily at 2 AM
bq query --use_legacy_sql=false \
--schedule='every 24 hours' \
--display_name='Daily aggregation' \
--destination_table='project:dataset.daily_summary' \
--replace \
'SELECT DATE(created_at) as date, COUNT(*) as count FROM source GROUP BY 1'