# Pandera Data Validation
This skill should be used when the user asks to "validate a DataFrame with pandera", "write a pandera schema", "use pandera DataFrameModel", "add data validation to a pipeline", or needs guidance on pandera best practices for data quality.
Install: npx skill4agent add the-perfect-developer/the-perfect-opencode

Importing the top-level `pandera` package directly emits a FutureWarning; use a backend-specific import instead:

import pandera.pandas as pa  # pandas (recommended)
import pandera.polars as pa  # polars
from pandera.typing.pandas import DataFrame, Series, Index

## DataFrameSchema (object API)

import pandas as pd
import pandera.pandas as pa
schema = pa.DataFrameSchema({
"user_id": pa.Column(int, pa.Check.gt(0)),
"email": pa.Column(str, pa.Check.str_matches(r"^[^@]+@[^@]+\.[^@]+$")),
"score": pa.Column(float, [pa.Check.ge(0.0), pa.Check.le(1.0)]),
"status": pa.Column(str, pa.Check.isin(["active", "inactive", "banned"])),
})
validated = schema.validate(df)

## DataFrameModel (class-based API) with @pa.check_types

import pandera.pandas as pa
from pandera.typing.pandas import DataFrame, Series
class UserSchema(pa.DataFrameModel):
    user_id: int = pa.Field(gt=0)
    email: str = pa.Field(str_matches=r"^[^@]+@[^@]+\.[^@]+$")
    score: float = pa.Field(ge=0.0, le=1.0)
    status: str = pa.Field(isin=["active", "inactive", "banned"])

    class Config:
        strict = True   # reject extra columns
        coerce = False  # do not silently cast types
# Validate directly
UserSchema.validate(df)
# Or via typing annotation + decorator
@pa.check_types
def process(df: DataFrame[UserSchema]) -> DataFrame[UserSchema]:
    return df

## Built-in checks

pa.Check.gt(0)  # greater than
pa.Check.ge(0) # greater than or equal
pa.Check.lt(100) # less than
pa.Check.le(100) # less than or equal
pa.Check.eq("value") # equal to
pa.Check.ne("value") # not equal to
pa.Check.isin(["a", "b"]) # membership
pa.Check.notin(["x"]) # exclusion
pa.Check.str_matches(r"^\d+$") # regex match
pa.Check.in_range(0, 100) # closed interval
pa.Check.str_startswith("prefix")
pa.Check.str_endswith("suffix")
pa.Check.str_length(1, 255)  # min/max string length

## Custom checks

# Vectorized (default, faster — operates on the whole Series)
pa.Check(lambda s: s.str.len() <= 255)
# Element-wise (scalar input, use only when vectorized is impractical)
pa.Check(lambda x: x > 0, element_wise=True)
# Always add an error message
pa.Check(lambda s: s > 0, error="values must be positive")

## DataFrame-level (wide) checks

schema = pa.DataFrameSchema(
    columns={...},
    checks=pa.Check(lambda df: df["end_date"] >= df["start_date"]),
)

In a DataFrameModel, use @pa.dataframe_check:

class Schema(pa.DataFrameModel):
    start_date: int
    end_date: int

    @pa.dataframe_check
    @classmethod
    def end_after_start(cls, df: pd.DataFrame) -> pd.Series:
        return df["end_date"] >= df["start_date"]

## Nullable and optional columns

# Object API: allow nulls in a column
pa.Column(float, nullable=True)
# DataFrameModel: make a column optional (may be absent)
from typing import Optional
class Schema(pa.DataFrameModel):
    required_col: Series[int]
    optional_col: Optional[Series[float]]

## Type coercion

# Per-column
pa.Column(int, coerce=True)
# Schema-wide via Config
class Schema(pa.DataFrameModel):
    year: int = pa.Field(gt=2000, coerce=True)

    class Config:
        coerce = True

## Lazy validation (lazy=True) to collect all errors

try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)  # DataFrame of all failures

## Pipeline integration

# DataFrameModel + check_types (recommended)
@pa.check_types
def transform(df: DataFrame[InputSchema]) -> DataFrame[OutputSchema]:
    return df.assign(revenue=df["units"] * df["price"])

# Object API: check_input / check_output
@pa.check_input(input_schema)
@pa.check_output(output_schema)
def pipeline_step(df):
    return df

# check_io: concisely specify both
@pa.check_io(raw=input_schema, out=output_schema)
def pipeline_step(raw):
    return raw

## Schema inheritance

class BaseEvent(pa.DataFrameModel):
    event_id: str
    timestamp: int = pa.Field(gt=0)

class ClickEvent(BaseEvent):
    url: str
    user_agent: str

    class Config:
        strict = True

## Schema serialization (YAML / script)

import pandera.io
# Save
pandera.io.to_yaml(schema, "./schema.yaml")
# Load
schema = pandera.io.from_yaml("./schema.yaml")
# Generate Python script
pandera.io.to_script(schema, "./schema_definition.py")

## Schema inference from existing data

import pandera.pandas as pa
inferred = pa.infer_schema(df)
print(inferred.to_script())  # inspect then copy-edit

## Dropping invalid rows (drop_invalid_rows=True on a DataFrameSchema)

schema = pa.DataFrameSchema(
    {"score": pa.Column(float, pa.Check.ge(0))},
    drop_invalid_rows=True,
)
cleaned = schema.validate(df_with_bad_rows)

## Error handling

from pandera.errors import SchemaError, SchemaErrors
# Single error (eager validation)
try:
    schema.validate(df)
except SchemaError as exc:
    print(exc.failure_cases)  # Series/DataFrame of failures

# Multiple errors (lazy validation)
try:
    schema.validate(df, lazy=True)
except SchemaErrors as exc:
    # Structured dict with SCHEMA and DATA keys
    print(exc.error_counts)
    print(exc.failure_cases)

## Config options

| Option | Type | Effect |
|---|---|---|
| `strict` | bool | Raise if extra columns present |
| `coerce` | bool | Cast columns to declared dtypes |
| `ordered` | bool | Require columns in declared order |
| `name` | str | Schema name shown in error messages |
| `add_missing_columns` | bool | Insert columns with default values |
## Best practices

- Prefer `DataFrameModel` over `DataFrameSchema` for new code, and set `strict=True` to catch unexpected columns.
- Reach for built-in checks (`Check.gt`, `Check.isin`, ...) before writing custom ones.
- Prefer vectorized custom checks (`element_wise=False`, the default) over `element_wise=True`.
- Always supply `error=` on a custom `Check`; use `coerce=True` deliberately, and `raise_warning=True` when a failing check should warn rather than raise.

## Further reading

- references/checks-and-validation.md
- references/dataframe-models.md