Loading...
Loading...
Optimizes ClickHouse queries for speed and efficiency. Helps with primary key design, sparse indexes, data skipping indexes (minmax, set, bloom filter, ngrambf_v1), partitioning strategies, projections, PREWHERE optimization, approximate functions, and query profiling with EXPLAIN. Use when writing ClickHouse queries, designing table schemas, analyzing slow queries, or implementing analytical aggregations. Works with columnar OLAP workloads.
npx skill4agent add dawiddutoit/custom-claude clickhouse-query-optimizationEXPLAIN
SELECT user_id, COUNT()
FROM events
WHERE timestamp >= '2024-01-01'
GROUP BY user_id;-- Good: country (low) → user_id → timestamp (high)
CREATE TABLE events (
user_id UInt32,
timestamp DateTime,
country String
)
ENGINE = MergeTree()
ORDER BY (country, user_id, timestamp);-- ✅ Fast: Uses index (country first)
SELECT * FROM events WHERE country = 'US';
-- ❌ Slow: Skips index (missing country)
SELECT * FROM events WHERE user_id = 12345;-- Numeric ranges
ALTER TABLE events ADD INDEX idx_duration session_duration TYPE minmax GRANULARITY 4;
-- Categorical (low cardinality)
ALTER TABLE events ADD INDEX idx_event_type event_type TYPE set(100) GRANULARITY 4;
-- String equality
ALTER TABLE events ADD INDEX idx_url url TYPE bloom_filter(0.01) GRANULARITY 4;
-- Substring search
ALTER TABLE logs ADD INDEX idx_message message TYPE ngrambf_v1(4, 512, 3, 0) GRANULARITY 1;CREATE TABLE events (
timestamp DateTime,
user_id UInt32
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp)
ORDER BY (user_id, timestamp);
-- Drop old data instantly
ALTER TABLE events DROP PARTITION '202401';
-- Or use TTL
ALTER TABLE events MODIFY TTL timestamp + INTERVAL 90 DAY;-- Main table sorted by user_id
CREATE TABLE events (
user_id UInt32,
product_id UInt32,
timestamp DateTime
)
ENGINE = MergeTree()
ORDER BY (user_id, timestamp);
-- Add projection for product queries
ALTER TABLE events ADD PROJECTION proj_by_product (
SELECT *
ORDER BY (product_id, timestamp)
);
ALTER TABLE events MATERIALIZE PROJECTION proj_by_product;
-- Both queries now fast:
SELECT * FROM events WHERE user_id = 12345; -- Uses main table
SELECT * FROM events WHERE product_id = 789; -- Uses projectionSELECT user_id, event_type, properties
FROM events
PREWHERE timestamp >= '2024-01-01' AND country = 'US' -- Small columns first
WHERE event_type IN ('purchase', 'signup'); -- Complex logic-- 10-100x faster, ~2% error
SELECT uniq(user_id) FROM events; -- vs COUNT(DISTINCT)
SELECT topK(10)(product_id) FROM events; -- Approximate top-K
SELECT quantile(0.95)(response_time) FROM events; -- Approximate percentile-- Bad: Reads all columns
SELECT * FROM events WHERE user_id = 12345;
-- Good: Columnar advantage
SELECT user_id, timestamp, event_type FROM events WHERE user_id = 12345;-- View execution plan
EXPLAIN SELECT COUNT() FROM events WHERE country = 'US';
-- Check performance
SELECT
query,
query_duration_ms,
read_rows,
read_bytes
FROM system.query_log
WHERE query LIKE '%events%'
ORDER BY event_time DESC
LIMIT 1;| Technique | Problem Solved | Impact | When to Use |
|---|---|---|---|
| Primary Key Design | Index doesn't cover queries | Foundation | Always (design first) |
| Data Skipping Indexes | Non-primary filtering slow | 10-100x | After primary key |
| Partitioning | Need to delete old data | Instant deletion | Time-series with retention |
| Projections | Multiple query patterns | 100-1000x | Different sort orders |
| Query Syntax | Large columns read unnecessarily | 2-10x | Per-query optimization |
| Profiling | Don't know why slow | Insight | When optimization unclear |
| File | Purpose |
|---|---|
| examples/examples.md | Real-world optimization scenarios with metrics |
| references/reference.md | Technical guides and decision trees |