# DuckDB

An analytical, in-process SQL database management system designed for fast analytical (OLAP) queries. It interoperates tightly with Python's data ecosystem (Pandas, NumPy, Arrow, Polars) and can query files (CSV, Parquet, JSON) directly, with no ingestion step. Use it for complex SQL over Pandas/Polars data, querying large Parquet/CSV files in place, joining data from different sources, analytical pipelines, local datasets too big for Excel, and intermediate data storage and feature engineering for ML.
Install the skill:

```bash
npx skill4agent add tondevrel/scientific-agent-skills duckdb
```

Core entry points: `duckdb.sql`, `duckdb.query`, `duckdb.read_parquet`, `duckdb.from_df`.

```bash
pip install duckdb
```

```python
import duckdb
import pandas as pd
import numpy as np
```

## Quickstart

```python
import duckdb
import pandas as pd
# 1. Create a sample DataFrame
df = pd.DataFrame({"id": [1, 2, 3], "val": [10.5, 20.0, 15.2]})
# 2. Query the DataFrame directly via SQL
# DuckDB automatically finds the 'df' variable in the local scope
result_df = duckdb.sql("SELECT id, val * 2 AS doubled FROM df WHERE val > 12").df()
print(result_df)
```

Key patterns: query files in place with `SELECT * FROM 'data.parquet'`; persist to disk with `duckdb.connect('my_data.db')`; glob many files with `FROM 'data/*.parquet'`; profile queries with `EXPLAIN ANALYZE`; for large files, prefer these over `pd.read_csv()`.
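`EXPLAIN ANALYZE` is only name-dropped above, so here is a minimal profiling sketch (the Parquet path is illustrative):

```python
import duckdb

# Runs the query and prints per-operator timings
duckdb.sql("EXPLAIN ANALYZE SELECT count(*) FROM 'data.parquet'").show()
```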
## Anti-patterns

```python
import duckdb
import pandas as pd
# ❌ BAD: Loading everything into Pandas just to do a simple filter
# df = pd.read_csv("massive.csv")
# result = df[df['val'] > 100]
# ✅ GOOD: Let DuckDB filter while reading (saves RAM)
result = duckdb.sql("SELECT * FROM 'massive.csv' WHERE val > 100").df()
# ❌ BAD: Manual string formatting for SQL queries (SQL Injection risk)
# duckdb.sql(f"SELECT * FROM data WHERE name = '{user_input}'")
# ✅ GOOD: Use prepared statements or parameters
duckdb.execute("SELECT * FROM data WHERE name = ?", [user_input]).df()
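# Named parameters are a readable alternative when binding several values
# (a sketch; assumes a DuckDB version with $-style named-parameter support)
duckdb.execute(
    "SELECT * FROM data WHERE name = $name AND val > $min_val",
    {"name": user_input, "min_val": 10},
).df()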
# ❌ BAD: Re-reading the same file in a loop
# for i in range(10):
#     res = duckdb.sql("SELECT mean(val) FROM 'large.parquet'").df()
# ✅ GOOD: Create a VIEW or TABLE first
duckdb.sql("CREATE VIEW data_view AS SELECT * FROM 'large.parquet'")
for i in range(10):
    res = duckdb.sql("SELECT mean(val) FROM data_view").df()
```

## Querying files directly

```python
# Query a CSV
res_csv = duckdb.sql("SELECT * FROM read_csv_auto('data.csv')").df()
# Query a Parquet file
res_pq = duckdb.sql("SELECT * FROM 'data.parquet' WHERE price > 50").df()
# Query multiple Parquet files and join with a Pandas DF
df_metadata = pd.DataFrame(...)  # metadata with 'id' and 'category' columns
query = """
SELECT p.*, m.category
FROM 'raw_data/*.parquet' p
JOIN df_metadata m ON p.id = m.id
LIMIT 10
"""
res = duckdb.sql(query).df()
```
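The same in-place pattern covers the JSON support mentioned above; a minimal sketch with an illustrative file name, assuming a DuckDB build where the JSON extension autoloads:

```python
res_json = duckdb.sql("SELECT * FROM read_json_auto('data.json')").df()
```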
## Relational API

```python
rel = duckdb.from_df(df)
res = rel.filter("val > 15").project("id, val * 2").order("val").limit(5)
print(res.df())
```
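Relations are lazy and composable, so aggregation chains the same way; a small sketch reusing `rel` from the block above:

```python
agg = rel.aggregate("count(*) AS n, avg(val) AS mean_val")
print(agg.df())
```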
## Window functions

```python
query = """
SELECT
    date,
    station_id,
    temp,
    AVG(temp) OVER (PARTITION BY station_id ORDER BY date ROWS BETWEEN 7 PRECEDING AND CURRENT ROW) AS rolling_7d_avg,
    temp - LAG(temp) OVER (PARTITION BY station_id ORDER BY date) AS daily_change
FROM 'weather_data.parquet'
"""
df_stats = duckdb.sql(query).df()
```
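Window output can also be filtered inline with DuckDB's QUALIFY clause; a short sketch that keeps only the latest reading per station (same illustrative file as above):

```python
latest = duckdb.sql("""
SELECT *
FROM 'weather_data.parquet'
QUALIFY row_number() OVER (PARTITION BY station_id ORDER BY date DESC) = 1
""").df()
```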
## Persistent storage

```python
# Create or open a database file
con = duckdb.connect('scientific_project.db')
# Create a table from a dataframe
con.execute("CREATE TABLE experiment_results AS SELECT * FROM df")
# Check tables
print(con.execute("SHOW TABLES").df())
# Close connection
con.close()
```
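Query results can be exported straight to disk as well; a sketch assuming the `experiment_results` table created above (the output file name is illustrative):

```python
con = duckdb.connect('scientific_project.db')
con.execute("COPY (SELECT * FROM experiment_results) TO 'results.parquet' (FORMAT PARQUET)")
con.close()
```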
duckdb.sql("SET temp_directory='/tmp/duckdb_temp/'")
duckdb.sql("SET max_memory='4GB'") # Limit RAM usageduckdb.sql("SET threads TO 8")# Random 10% sample
df_sample = duckdb.sql("SELECT * FROM 'huge.parquet' USING SAMPLE 10%").df()
```

## Recipe: aggregate experiment logs

```python
def process_experiment_logs(glob_pattern):
"""Clean and aggregate TBs of log data across many files."""
query = f"""
WITH clean_data AS (
SELECT
timestamp::TIMESTAMP as ts,
sensor_id,
value
FROM read_csv_auto('{glob_pattern}')
WHERE value IS NOT NULL AND value != -999
)
SELECT
time_bucket(INTERVAL '1 hour', ts) as hour,
sensor_id,
AVG(value) as avg_val
FROM clean_data
GROUP BY 1, 2
ORDER BY 1, 2
"""
    return duckdb.sql(query).df()
```
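A hypothetical call, with an illustrative glob pattern:

```python
hourly = process_experiment_logs('logs/run_*/sensors_*.csv')
print(hourly.head())
```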
## Recipe: ML feature engineering

```python
def create_features(df_train):
    # Use SQL to create complex lag and moving-average features
    return duckdb.sql("""
        SELECT *,
            AVG(price) OVER (PARTITION BY item ORDER BY date ROWS 3 PRECEDING) AS ma3,
            COUNT(*) OVER (PARTITION BY "user") AS user_activity_count  -- "user" quoted: reserved word
        FROM df_train
""").df()# Query data and convert to Arrow for zero-copy transfer to Deep Learning
arrow_table = duckdb.sql("SELECT * FROM 'data.parquet'").arrow()
# Then consume it in PyTorch (via torch.utils.dlpack or similar),
# or use the fast Arrow-to-NumPy/tensor path
```
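One concrete hand-off, sketched under the assumption that `polars` is installed, goes zero-copy into a Polars DataFrame:

```python
import polars as pl

df_pl = pl.from_arrow(arrow_table)  # zero-copy where the Arrow schema allows
```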
## Troubleshooting

**A DataFrame isn't found by `duckdb.sql()`** (replacement scans look up variables in the calling scope):

```python
# ✅ Solution: If calling inside a function, ensure the variable is local or passed in
def my_query(my_df):
return duckdb.sql("SELECT * FROM my_df").df()# ✅ Solution: Open as read-only for secondary processes
con = duckdb.connect('data.db', read_only=True)
```

**Dates come back as plain strings:**

```python
# ✅ Solution: Use strptime or let DuckDB auto-detect
# SELECT strptime(date_col, '%d/%m/%Y') FROM 'data.csv'
```

Quick reference: query files in place with `SELECT * FROM 'file.parquet'`; convert results with `.df()`, `.pl()`, or `.arrow()`; glob many files with `FROM 'data/*.parquet'`; control spilling and memory with the `temp_directory` and `max_memory` settings.