# Polars
Fast in-memory DataFrame library for datasets that fit in RAM. Use when pandas is too slow but data still fits in memory. Lazy evaluation, parallel execution, Apache Arrow backend. Best for 1-100GB datasets, ETL pipelines, faster pandas replacement. For larger-than-RAM data use dask or vaex.
## Installation

```bash
npx skill4agent add k-dense-ai/claude-scientific-skills polars
```

```bash
uv pip install polars
```

## Quick Start

```python
import polars as pl

# Create a DataFrame
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["NY", "LA", "SF"],
})

# Select columns
df.select("name", "age")

# Filter rows
df.filter(pl.col("age") > 25)

# Add computed columns
df.with_columns(
    age_plus_10=pl.col("age") + 10,
)
```

## Core Concepts

### Expressions

`pl.col("column_name")` creates an expression that refers to a column. Expressions compose into whole computations that Polars optimizes and executes in parallel.

```python
# Expression-based computation
df.select(
    pl.col("name"),
    (pl.col("age") * 12).alias("age_in_months"),
)
```

### Eager vs. Lazy

```python
# Eager: work happens as each call runs
df = pl.read_csv("file.csv")            # Reads immediately
result = df.filter(pl.col("age") > 25)  # Executes immediately

# Lazy: build a query plan, execute on collect()
lf = pl.scan_csv("file.csv")            # Doesn't read yet
result = lf.filter(pl.col("age") > 25).select("name", "age")
df = result.collect()                   # Now executes the optimized query
```

See `references/core_concepts.md`.
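To see what the optimizer will actually run, you can print a lazy query's plan with the standard `LazyFrame.explain()` method; a minimal sketch (the file name is a placeholder):

```python
import polars as pl

lf = pl.scan_csv("file.csv").filter(pl.col("age") > 25).select("name", "age")

# Show the optimized plan; note the predicate and projection pushed into the scan
print(lf.explain())
```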
## Operations

### Select

```python
# Select specific columns
df.select("name", "age")

# Select with expressions
df.select(
    pl.col("name"),
    (pl.col("age") * 2).alias("double_age"),
)

# Select all columns matching a regex pattern
df.select(pl.col("^.*_id$"))
```
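Beyond regex patterns, Polars also ships a selectors module for picking columns by dtype or name; a small sketch (the column names here are hypothetical):

```python
import polars as pl
import polars.selectors as cs

df = pl.DataFrame({"user_id": [1, 2], "score": [0.5, 0.9], "city": ["NY", "LA"]})

df.select(cs.numeric())          # All numeric columns
df.select(cs.ends_with("_id"))   # Name-based selection without regex
```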
### Filter

```python
# Single condition
df.filter(pl.col("age") > 25)

# Multiple conditions (cleaner than chaining with &)
df.filter(
    pl.col("age") > 25,
    pl.col("city") == "NY",
)

# Complex conditions
df.filter(
    (pl.col("age") > 25) | (pl.col("city") == "LA")
)
```
### Add Columns

```python
# Add new columns
df.with_columns(
    age_plus_10=pl.col("age") + 10,
    name_upper=pl.col("name").str.to_uppercase(),
)

# Parallel computation (all expressions computed in parallel);
# alias each one so the output names don't collide
df.with_columns(
    (pl.col("value") * 10).alias("value_x10"),
    (pl.col("value") * 100).alias("value_x100"),
)
```
### Group By

```python
# Basic grouping
df.group_by("city").agg(
    pl.col("age").mean().alias("avg_age"),
    pl.len().alias("count"),
)

# Multiple group keys
df.group_by("city", "department").agg(
    pl.col("salary").sum()
)

# Conditional aggregations
df.group_by("city").agg(
    (pl.col("age") > 30).sum().alias("over_30")
)
```

See `references/operations.md`.

Common aggregations inside `group_by().agg()`: `pl.len()`, `pl.col("x").sum()`, `pl.col("x").mean()`, `pl.col("x").min()`, `pl.col("x").max()`, `pl.first()`, and `pl.last()`.
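As a quick illustration of several of these aggregations together (assuming a hypothetical `df` with `city`, `name`, and `age` columns):

```python
# One pass over each group, all aggregations computed in parallel
df.group_by("city").agg(
    pl.len().alias("n"),
    pl.col("age").mean().alias("avg_age"),
    pl.first("name").alias("first_name"),  # First row per group
    pl.last("name").alias("last_name"),    # Last row per group
)
```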
### Window Functions (`over()`)

```python
# Add group statistics to each row
df.with_columns(
    avg_age_by_city=pl.col("age").mean().over("city"),
    rank_in_city=pl.col("salary").rank().over("city"),
)

# Multiple grouping columns
df.with_columns(
    group_avg=pl.col("value").mean().over("category", "region")
)
```

`over()` accepts a `mapping_strategy` of `"group_to_rows"` (the default), `"explode"`, or `"join"`.
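A small sketch of the `"join"` strategy, where each row receives the whole group's values as a list (assuming the same `df` as above):

```python
# Every row in a city gets the full list of that city's salaries
df.with_columns(
    city_salaries=pl.col("salary").over("city", mapping_strategy="join")
)
```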
## Input/Output

```python
# CSV, eager
df = pl.read_csv("file.csv")
df.write_csv("output.csv")

# CSV, lazy (preferred for large files)
lf = pl.scan_csv("file.csv")
result = lf.filter(...).select(...).collect()

# Parquet
df = pl.read_parquet("file.parquet")
df.write_parquet("output.parquet")

# JSON
df = pl.read_json("file.json")
df.write_json("output.json")
```

See `references/io_guide.md`.
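For lazy pipelines whose output is also large, results can be streamed straight to disk with `LazyFrame.sink_parquet()` instead of materializing a DataFrame; a sketch with placeholder file names:

```python
lf = pl.scan_csv("large.csv").filter(pl.col("age") > 25)

# Write directly from the lazy engine; no full collect() in memory
lf.sink_parquet("output.parquet")
```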
## Combining DataFrames

### Joins

```python
# Inner join
df1.join(df2, on="id", how="inner")

# Left join
df1.join(df2, on="id", how="left")

# Join on different column names
df1.join(df2, left_on="user_id", right_on="id")
```
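Polars also supports filtering joins, which keep only columns from the left frame; a brief sketch with the same `df1`/`df2` as above:

```python
# Rows of df1 that have a match in df2
df1.join(df2, on="id", how="semi")

# Rows of df1 that have no match in df2
df1.join(df2, on="id", how="anti")
```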
### Concatenation

```python
# Vertical (stack rows)
pl.concat([df1, df2], how="vertical")

# Horizontal (add columns)
pl.concat([df1, df2], how="horizontal")

# Diagonal (union with different schemas)
pl.concat([df1, df2], how="diagonal")
```
### Reshaping

```python
# Pivot (long to wide)
df.pivot(on="product", index="date", values="sales")

# Unpivot (wide to long)
df.unpivot(index="id", on=["col1", "col2"])
```

See `references/transformations.md`.

## Pandas Migration

| Operation | Pandas | Polars |
|---|---|---|
| Select column | `df["col"]` | `df.select("col")` |
| Filter | `df[df["age"] > 25]` | `df.filter(pl.col("age") > 25)` |
| Add column | `df.assign(x=...)` | `df.with_columns(x=...)` |
| Group by | `df.groupby("city").agg(...)` | `df.group_by("city").agg(...)` |
| Window | `df.groupby("city")["x"].transform("mean")` | `pl.col("x").mean().over("city")` |
```python
# Pandas: sequential lambdas
df.assign(
    col_a=lambda df_: df_.value * 10,
    col_b=lambda df_: df_.value * 100,
)

# Polars: parallel expressions
df.with_columns(
    col_a=pl.col("value") * 10,
    col_b=pl.col("value") * 100,
)
```

See `references/pandas_migration.md`.
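When migrating incrementally, the two libraries interoperate through the standard conversion helpers; a minimal sketch (requires pandas and pyarrow installed):

```python
import pandas as pd
import polars as pl

pdf = pd.DataFrame({"value": [1, 2, 3]})

df = pl.from_pandas(pdf)   # pandas -> Polars
back = df.to_pandas()      # Polars -> pandas
```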
## Best Practices

Use lazy evaluation for large files:

```python
lf = pl.scan_csv("large.csv")   # Don't use read_csv
result = lf.filter(...).select(...).collect()
```

Avoid `map_elements()`: it calls a Python function row by row, bypassing the optimizer and parallel execution. Prefer native expressions, as shown in the sketch below.

For larger-than-memory results, collect in streaming mode:

```python
lf.collect(streaming=True)
```

Select columns as early as possible:

```python
# Good: Select columns early
lf.select("col1", "col2").filter(...)

# Bad: Filter on all columns first
lf.filter(...).select("col1", "col2")
```

Use `when`/`then`/`otherwise` for conditional logic:

```python
pl.when(condition).then(value).otherwise(other_value)
```

Select column groups by regex pattern:

```python
df.select(pl.col("^.*_value$") * 2)   # Regex pattern
```
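To make the `map_elements()` advice above concrete, here is a small contrast (hypothetical column `age`):

```python
# Slow: Python function applied element by element
df.select(pl.col("age").map_elements(lambda a: a + 10, return_dtype=pl.Int64))

# Fast: native expression, vectorized and run in parallel
df.select(pl.col("age") + 10)
```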
pl.col("x").is_null()
pl.col("x").drop_nulls()references/best_practices.mdcore_concepts.mdoperations.mdpandas_migration.mdio_guide.mdtransformations.mdbest_practices.md