Addendum: "Data Testing" can also be thought of as testing the transformation code that produces the data.
# data_cleaner.py
import pandas as pd

raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})

def clean(raw_data):
    return (
        raw_data
        # do some cleaning 🧹✨
    )
# test_data_cleaner.py
def test_clean():
    mock_raw_data = ...  # hand-written mock data 😅
    result = clean(mock_raw_data)
    # assumptions about clean data
    assert result["continuous"].ge(0).all()
    assert result["categorical"].isin(["A", "B", "C"]).all()
Pandera is an expressive and lightweight statistical validation tool for dataframes. Defining a schema looks and feels like defining a pandas dataframe:
import pandera as pa

schema = pa.DataFrameSchema(
    columns={
        "continuous": pa.Column(float, pa.Check.ge(0)),
        "categorical": pa.Column(str, pa.Check.isin(["A", "B", "C"])),
    },
    coerce=True,
)
With coerce=True, pandera first coerces the columns to their specified types before validating them.
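For instance, a minimal sketch (the ok_data frame below is illustrative) of coercion converting raw string columns before the checks run:

```python
ok_data = pd.DataFrame({
    "continuous": ["1.0", "2.5"],
    "categorical": ["A", "B"],
})

# the string columns are coerced to float/str before any checks are applied
validated = schema.validate(ok_data)
print(validated.dtypes)  # continuous: float64, categorical: object
```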
raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})

try:
    schema.validate(raw_data, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)
|   | schema_context | column | check | check_number | failure_case | index |
|---|---|---|---|---|---|---|
| 0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
| 1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
| 2 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | Z | 3 |
| 3 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | X | 4 |
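Note that lazy=True collects every failure case before raising; by default, validation fails fast on the first error. A minimal sketch of the default behavior:

```python
try:
    schema.validate(raw_data)  # lazy=False by default: fail fast
except pa.errors.SchemaError as exc:
    print(exc)  # reports only the first failing check
```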
"pydantic [and pandera guarantee] the types and constraints of the output [data], not the input data." (Pydantic docs)
raw_data = ...
valid_data = validate(parse(raw_data))  # raises an exception if constraints are not met
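A minimal pydantic sketch of that guarantee (the Record model here is illustrative):

```python
from pydantic import BaseModel, Field

class Record(BaseModel):
    continuous: float = Field(ge=0)

rec = Record(continuous="4.2")  # string input is parsed into the declared type
print(rec.continuous)           # 4.2, guaranteed to be a float >= 0
```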
Pandera guarantees that input and output dataframes fulfill the types and constraints as defined by type annotations.
from pandera.typing import DataFrame, Series

raw_data = pd.DataFrame({
    "continuous": list("123456"),
    "categorical": list("AABBCC"),
})

class Schema(pa.SchemaModel):
    continuous: Series[float] = pa.Field(ge=0)
    categorical: Series[str] = pa.Field(isin=["A", "B", "C"])

    class Config:
        coerce = True

@pa.check_types
def summarize_data(clean_data: DataFrame[Schema]):
    return clean_data.groupby("categorical")["continuous"].mean()
display(summarize_data(raw_data).rename("mean_continuous").to_frame())
| categorical | mean_continuous |
|---|---|
| A | 1.5 |
| B | 3.5 |
| C | 5.5 |
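Because @pa.check_types validates the annotated input, invalid data fails before summarize_data ever runs. A minimal sketch (the bad_data frame is illustrative):

```python
bad_data = pd.DataFrame({
    "continuous": ["-1"],
    "categorical": ["Z"],
})

try:
    summarize_data(bad_data)
except pa.errors.SchemaError as exc:
    print(exc)  # the ge(0) check fails on the coerced input
```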
Extend parsing functionality to support arbitrary transformations
class Schema(pa.SchemaModel):
    continuous: Series[float] = pa.Field(ge=0)
    categorical: Series[str] = pa.Field(isin=["A", "B", "C"])

    class Config:
        coerce = True

    # two alternative parsing strategies for the same column:
    @pa.parser("continuous")
    def truncate_continuous(cls, series):
        """Set negative values to nan."""
        return series.mask(series < 0, pd.NA)

    @pa.parser("continuous")
    def filter_continuous(cls, series):
        """Filter out records with negative values in the continuous column."""
        return series[series >= 0]
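Since this parsing API is a proposed extension, here is only a hypothetical sketch of how validation might behave with filter_continuous registered (the dirty_data frame is illustrative):

```python
dirty_data = pd.DataFrame({
    "continuous": ["-1.5", "3.0"],
    "categorical": ["A", "B"],
})

# hypothetically, filter_continuous would drop the -1.5 record during parsing,
# so the ge(0) check would pass on the remaining rows
parsed = Schema.validate(dirty_data)
```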
Once you've defined a schema, you can use it in your source code
# data_cleaning.py
from pandera.typing import DataFrame

@pa.check_types
def clean_data(raw_data) -> DataFrame[Schema]:
    return (
        raw_data
        # do some cleaning
    )
... and your test suite (or anywhere you want, really!)
# test_data_cleaning.py
def test_clean_data():
    raw_data = ...
    clean_data(raw_data)
Now the output dataframe type is validated when you call clean_data at runtime, so our test reduces to an execution test!
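And because @pa.check_types raises a SchemaError whenever the returned dataframe violates Schema, even a deliberately bad input makes for a meaningful test. A hypothetical sketch (assuming clean_data passes these rows through unchanged):

```python
import pytest

def test_clean_data_rejects_bad_output():
    bad_raw_data = pd.DataFrame({
        "continuous": [-1.0],
        "categorical": ["Z"],
    })
    with pytest.raises(pa.errors.SchemaError):
        clean_data(bad_raw_data)
```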
class InputSchema(pa.SchemaModel):
    _categories = ["A", "B", "C"]  # store arbitrary metadata in private class attributes

    continuous: Series[float] = pa.Field(ge=0)
    categorical: Series[str] = pa.Field(isin=_categories)

    class Config:
        coerce = True

class OutputSchema(InputSchema):
    categorical_one_hot: Series[int] = pa.Field(alias="one_hot_", regex=True)

    @pa.check("one_hot_")
    def categorical_one_hot_check(cls, series):
        return series.name[-1] in cls._categories
@pa.check_types
def featurize_data(clean_data: DataFrame[InputSchema]) -> DataFrame[OutputSchema]:
    one_hot = pd.get_dummies(clean_data["categorical"], prefix="one_hot")
    return pd.concat([clean_data, one_hot], axis="columns")

display(featurize_data(raw_data).head(3))
|   | continuous | categorical | one_hot_A | one_hot_B | one_hot_C |
|---|---|---|---|---|---|
| 0 | 1.0 | A | 1 | 0 | 0 |
| 1 | 2.0 | A | 1 | 0 | 0 |
| 2 | 3.0 | B | 0 | 1 | 0 |
Support frictionless data table schemas (✨ coming out in the 0.7.0 release ✨):
from pandera.io import from_frictionless_schema

frictionless_schema = {
    "fields": [
        {
            "name": "column_1",
            "type": "integer",
            "constraints": {"minimum": 10, "maximum": 99},
        }
    ],
    "primaryKey": "column_1",
}

pandera_schema = from_frictionless_schema(frictionless_schema)
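A minimal usage sketch of the converted schema (the df below is illustrative; a frictionless primaryKey implies the column is required and unique):

```python
import pandas as pd

df = pd.DataFrame({"column_1": [10, 55, 99]})
validated = pandera_schema.validate(df)  # unique integers within [10, 99] pass
```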
You have a schema with a bunch of metadata about it... why not generate data for testing?
display(InputSchema.example(size=3))
|   | continuous | categorical |
|---|---|---|
| 0 | 1.100000e+00 | B |
| 1 | 1.192093e-07 | B |
| 2 | 9.007199e+15 | B |
input_schema_strategy = InputSchema.strategy(size=5)
print(input_schema_strategy)
print(type(input_schema_strategy))
_dataframe_strategy()
<class 'hypothesis.strategies._internal.lazy.LazyStrategy'>
from hypothesis import given

@given(input_schema_strategy)
def test_featurize_data(clean_data):
    featurize_data(clean_data)

test_featurize_data()
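In a test suite you would normally let pytest invoke the test rather than calling it directly; a sketch with an explicit example budget (the max_examples value here is arbitrary):

```python
from hypothesis import given, settings

@settings(max_examples=20)  # arbitrary budget; hypothesis defaults to 100
@given(InputSchema.strategy(size=5))
def test_featurize_data(clean_data):
    featurize_data(clean_data)
```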
Automate the tedium of hand-writing mock dataframes for testing! The same schema does double duty: it validates dataframes at run-time and synthesizes them at test-time.
Pandera uses basic data profiling to infer a schema from realistic data
realistic_data = pd.DataFrame({"continuous": [1, 2, 3, 4, 5, 6]})
bootstrapped_schema = pa.infer_schema(realistic_data)
print(bootstrapped_schema)
<Schema DataFrameSchema(
    columns={'continuous': <Schema Column(name=continuous, type=int64)>},
    checks=[],
    coerce=True,
    pandas_dtype=None,
    index=<Schema Index(name=None, type=int64)>,
    strict=False,
    name=None,
    ordered=False
)>
Write it out to a Python file with bootstrapped_schema.to_script("schemas.py"):
from pandera import (
    DataFrameSchema,
    Column,
    Check,
    Index,
    MultiIndex,
    PandasDtype,
)

schema = DataFrameSchema(
    columns={
        "continuous": Column(
            pandas_dtype=PandasDtype.Int64,
            checks=[
                Check.greater_than_or_equal_to(min_value=1.0),
                Check.less_than_or_equal_to(max_value=6.0),
            ],
            nullable=False,
            allow_duplicates=True,
            coerce=False,
            required=True,
            regex=False,
        )
    },
    index=Index(
        pandas_dtype=PandasDtype.Int64,
        checks=[
            Check.greater_than_or_equal_to(min_value=0.0),
            Check.less_than_or_equal_to(max_value=5.0),
        ],
        nullable=False,
        coerce=False,
        name=None,
    ),
    coerce=True,
    strict=False,
    name=None,
)
Write it out to a YAML file with bootstrapped_schema.to_yaml("schema.yaml"):
schema_type: dataframe
version: 0.6.5
columns:
  continuous:
    pandas_dtype: int64
    nullable: false
    checks:
      greater_than_or_equal_to: 1.0
      less_than_or_equal_to: 6.0
    allow_duplicates: true
    coerce: false
    required: true
    regex: false
checks: null
index:
- pandas_dtype: int64
  nullable: false
  checks:
    greater_than_or_equal_to: 0.0
    less_than_or_equal_to: 5.0
  name: null
  coerce: false
coerce: true
strict: false
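To sketch the round trip (assuming, per the pandera io module, that from_yaml accepts the serialized schema as a string or file path):

```python
from pandera import io

# rebuild the schema object from the serialized yaml
schema_from_yaml = io.from_yaml("schema.yaml")
schema_from_yaml.validate(realistic_data)  # behaves like the bootstrapped schema
```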
Create a schema from a pandas-profiling ProfileReport:
from pandas_profiling import ProfileReport
from pandera.io import from_pandas_profile_report

df = ...
profile = ProfileReport(df)
schema = from_pandas_profile_report(profile)
If we have pandera schema type annotations...
import pandera as pa
from pandera.typing import DataFrame as DF
from schemas import Raw, Clean, Training

@pa.check_types
def load() -> DF[Raw]:
    ...

@pa.check_types
def clean(raw_data: DF[Raw]) -> DF[Clean]:
    ...

@pa.check_types
def featurize(clean_data: DF[Clean]) -> DF[Training]:
    ...

@pa.check_types
def train_model(training_data: DF[Training]):
    ...
We can potentially create a data flow graph from these annotations (see the sketch after the table below), and collect coverage statistics of schema-annotated dataframes to identify points in the pipeline that lack dataframe type coverage:
| Function | Input Type | Output Type | Test Errors |
|---|---|---|---|
| load | - | DF[Raw] | 0 |
| clean | DF[Raw] | DF[Clean] | 1 |
| featurize | DF[Clean] | DF[Training] | 7 |
| train_model | DF[Training] | - | 2 |
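A hypothetical sketch of how such a flow graph might be derived: match each function's annotated return type against the other functions' annotated parameter types (build_flow_graph is illustrative, not a pandera API):

```python
from typing import get_type_hints

def build_flow_graph(*functions):
    """Connect producers to consumers by matching annotated dataframe types."""
    edges = []
    for producer in functions:
        output_type = get_type_hints(producer).get("return")
        if output_type is None:
            continue  # no annotated output, e.g. train_model
        for consumer in functions:
            input_types = get_type_hints(consumer)
            input_types.pop("return", None)
            if output_type in input_types.values():
                edges.append((producer.__name__, consumer.__name__))
    return edges

print(build_flow_graph(load, clean, featurize, train_model))
# e.g. [('load', 'clean'), ('clean', 'featurize'), ('featurize', 'train_model')]
```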