WARNING:root:Found pyspark version "3.2.0" installed. The pyspark version 3.2 and above has a built-in "pandas APIs on Spark" module ported from Koalas. Try `import pyspark.pandas as ps` instead.
Validating not only real data, but also the functions that produce them.
Data tests validate real data
Data tests validate functions that produce data, given some test cases
data_cleaner.py
import pandas as pd

# Example raw data: both columns arrive as strings and include values
# (negative numbers, categories "Z"/"X") that a cleaning step should null-out.
raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})
def clean(raw_data):
    """Clean the raw dataframe (stub; real implementation appears later)."""
    # do some cleaning 🧹✨
    clean_data = ...
    return clean_data
test_data_cleaner.py
import pandas as pd
import pytest

# the code under test lives in data_cleaner.py
from data_cleaner import clean


def test_clean():
    """Unit-test clean() against assumptions about valid and invalid data."""
    # assumptions about valid data
    mock_raw_data = pd.DataFrame({"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]})
    result = clean(mock_raw_data)
    # check that the result contains nulls (every mock row has a masked value)
    assert result.isna().any(axis="columns").all()
    # check data types of each column
    assert result["continuous"].dtype == float
    assert result["categorical"].dtype == object
    # check that non-null values have expected properties
    assert result["continuous"].dropna().ge(0).all()
    assert result["categorical"].dropna().isin(["A", "B", "C"]).all()
    # assumptions about invalid data: a missing "continuous" column raises
    with pytest.raises(KeyError):
        invalid_mock_raw_data = pd.DataFrame({"categorical": ["A"]})
        clean(invalid_mock_raw_data)
    print("tests pass! ✅")
Let's implement the `clean` function:
def clean(raw_data):
    """Coerce raw columns to their target dtypes and null-out invalid values.

    Negative entries in "continuous" and categories outside {"A", "B", "C"}
    in "categorical" are replaced with NaN.
    """
    # work on a proper DataFrame even if a dict-like was passed in
    frame = pd.DataFrame(raw_data)
    # cast string columns to their intended dtypes before masking 🧹✨
    typed = frame.astype({"continuous": float, "categorical": str})
    valid_categories = ["A", "B", "C"]
    clean_data = typed.assign(
        continuous=lambda df: df.continuous.mask(df.continuous < 0),
        categorical=lambda df: df.categorical.mask(~df.categorical.isin(valid_categories)),
    )
    return clean_data
clean(raw_data)
continuous | categorical | |
---|---|---|
0 | NaN | A |
1 | 4.00 | B |
2 | 10.25 | C |
3 | NaN | NaN |
4 | 5.20 | NaN |
test_clean()
tests pass! ✅
An expressive and light-weight statistical validation tool for dataframes
Defining a schema looks and feels like defining a pandas dataframe
import pandera as pa
# Schema for cleaned data: nullable non-negative floats and a nullable
# categorical restricted to {"A", "B", "C"}. coerce=True casts columns to the
# declared dtypes before the checks run.
clean_data_schema = pa.DataFrameSchema(
    columns={
        "continuous": pa.Column(float, pa.Check.ge(0), nullable=True),
        "categorical": pa.Column(str, pa.Check.isin(["A", "B", "C"]), nullable=True),
    },
    coerce=True,
)
Know Exactly What Went Wrong with Your Data
# Raw data that violates the schema: negative "continuous" entries and
# out-of-vocabulary categories "Z" and "X".
raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})
try:
    # validate with the schema defined above (CleanData is only introduced
    # later); lazy=True collects ALL failure cases before raising
    clean_data_schema.validate(raw_data, lazy=True)
except pa.errors.SchemaErrors as exc:
    # failure_cases is a dataframe with one row per failing value
    display(exc.failure_cases)
schema_context | column | check | check_number | failure_case | index | |
---|---|---|---|---|---|---|
0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
2 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | Z | 3 |
3 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | X | 4 |
Here's data_cleaner.py
again:
import pandera as pa
from pandera.typing import DataFrame, Series
class RawData(pa.SchemaModel):
    """Schema model for incoming raw data: column presence and dtypes only."""

    continuous: Series[float]
    categorical: Series[str]

    class Config:
        # coerce raw (string) columns into the annotated dtypes on validation
        coerce = True
class CleanData(RawData):
    """Schema model for cleaned data: adds value constraints on RawData."""

    # non-negative, with NaN allowed for masked invalid values
    continuous = pa.Field(ge=0, nullable=True)
    # restricted to categories "A", "B", "C"; NaN allowed
    categorical = pa.Field(isin=[*"ABC"], nullable=True)
Pandera guarantees that input and output dataframes fulfill the types and constraints as defined by type annotations
@pa.check_types
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Null-out negative continuous values and unknown categories.

    Input/output contracts are enforced by pandera via @pa.check_types.
    """
    masked_continuous = raw_data.continuous.mask(raw_data.continuous < 0)
    masked_categorical = raw_data.categorical.mask(
        ~raw_data.categorical.isin(["A", "B", "C"])
    )
    return raw_data.assign(
        continuous=masked_continuous,
        categorical=masked_categorical,
    )
clean(raw_data)
continuous | categorical | |
---|---|---|
0 | NaN | A |
1 | 4.00 | B |
2 | 10.25 | C |
3 | NaN | NaN |
4 | 5.20 | NaN |
test_data_cleaner.py
def test_clean():
    """With pandera enforcing schemas, valid-data checks reduce to execution."""
    # assumptions about valid data
    mock_raw_data = pd.DataFrame({"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]})
    # the assertions about the resulting data reduces to an execution test!
    clean(mock_raw_data)
    # assumptions about invalid data
    with pytest.raises(pa.errors.SchemaError):
        invalid_mock_raw_data = pd.DataFrame({"categorical": ["A"]})
        clean(invalid_mock_raw_data)
    print("tests pass! ✅")
test_clean()
tests pass! ✅
Once you've defined a schema, you can import it in other parts of your code base, like your test suite!
# data_cleaner.py
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Mask invalid values; schemas document the input/output contracts."""
    allowed = ["A", "B", "C"]
    nonneg_continuous = raw_data["continuous"].mask(raw_data["continuous"].lt(0))
    known_categorical = raw_data["categorical"].mask(~raw_data["categorical"].isin(allowed))
    return raw_data.assign(
        continuous=nonneg_continuous,
        categorical=known_categorical,
    )
# test_data_cleaner.py
def test_clean():
    """Validate explicitly with the schemas imported from data_cleaner."""
    # assumptions about valid data: RawData(...) coerces/validates the input
    mock_raw_data = RawData(pd.DataFrame({"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]}))
    # the assertions about the resulting data reduces to an execution test!
    CleanData(clean(mock_raw_data))
    # assumptions about invalid data
    with pytest.raises(pa.errors.SchemaError):
        invalid_mock_raw_data = RawData(pd.DataFrame({"categorical": ["A"]}))
        clean(invalid_mock_raw_data)
    print("tests pass! ✅")
test_clean()
tests pass! ✅
You can even represent dataframe joins!
class CleanData(RawData):
    """Schema for cleaned data (repeated here for the join example)."""

    continuous = pa.Field(ge=0, nullable=True)
    categorical = pa.Field(isin=[*"ABC"], nullable=True)
class SupplementaryData(pa.SchemaModel):
    """Schema for supplementary data to be joined onto the cleaned data."""

    # nullable, non-negative integer column
    discrete: Series[int] = pa.Field(ge=0, nullable=True)
class JoinedData(CleanData, SupplementaryData): pass
# Validate the result of a dataframe join against the combined schema.
clean_data = pd.DataFrame({"continuous": ["1.0"], "categorical": ["A"]})
supplementary_data = pd.DataFrame({"discrete": [1]})
JoinedData(clean_data.join(supplementary_data))
continuous | categorical | discrete | |
---|---|---|---|
0 | 1.0 | A | 1 |
# Bootstrap a schema from existing data, then inspect it.
clean_data = pd.DataFrame({
    "continuous": range(100),
    "categorical": [*"ABCAB" * 20]
})
schema = pa.infer_schema(clean_data)
print(schema)
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(int64))> 'categorical': <Schema Column(name=categorical, type=DataType(object))> }, checks=[], coerce=True, dtype=None, index=<Schema Index(name=None, type=DataType(int64))>, strict=False name=None, ordered=False )>
yaml_schema = schema.to_yaml()
print(yaml_schema)
schema_type: dataframe version: 0.9.0 columns: continuous: dtype: int64 nullable: false checks: greater_than_or_equal_to: 0.0 less_than_or_equal_to: 99.0 unique: false coerce: false required: true regex: false categorical: dtype: object nullable: false checks: null unique: false coerce: false required: true regex: false checks: null index: - dtype: int64 nullable: false checks: greater_than_or_equal_to: 0.0 less_than_or_equal_to: 99.0 name: null coerce: false coerce: true strict: false unique: null
print(schema.from_yaml(yaml_schema))
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(int64))> 'categorical': <Schema Column(name=categorical, type=DataType(object))> }, checks=[], coerce=True, dtype=None, index=<Schema Index(name=None, type=DataType(int64))>, strict=False name=None, ordered=False )>
schema.to_script()
from pandera import DataFrameSchema, Column, Check, Index, MultiIndex
schema = DataFrameSchema(
columns={
"continuous": Column(
dtype=pandera.engines.numpy_engine.Int64,
checks=[
Check.greater_than_or_equal_to(min_value=0.0),
Check.less_than_or_equal_to(max_value=99.0),
],
nullable=False,
unique=False,
coerce=False,
required=True,
regex=False,
),
"categorical": Column(
dtype=pandera.engines.numpy_engine.Object,
checks=None,
nullable=False,
unique=False,
coerce=False,
required=True,
regex=False,
),
},
index=Index(
dtype=pandera.engines.numpy_engine.Int64,
checks=[
Check.greater_than_or_equal_to(min_value=0.0),
Check.less_than_or_equal_to(max_value=99.0),
],
nullable=False,
coerce=False,
name=None,
),
coerce=True,
strict=False,
name=None,
)
frictionless table schema
from pandera.io import from_frictionless_schema
# A frictionless table-schema spec: field types plus value constraints.
frictionless_schema = {
    "fields": [
        {
            "name": "continuous",
            "type": "number",
            "constraints": {"minimum": 0}
        },
        {
            "name": "categorical",
            "type": "string",
            "constraints": {"isin": ["A", "B", "C"]}
        },
    ],
}

# Convert the frictionless spec into a pandera DataFrameSchema.
schema = from_frictionless_schema(frictionless_schema)
print(schema)
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(float64))> 'categorical': <Schema Column(name=categorical, type=DataType(string[python]))> }, checks=[], coerce=True, dtype=None, index=None, strict=True name=None, ordered=False )>
Generate valid examples under the schema's constraints
RawData.example(size=3)
continuous | categorical | |
---|---|---|
0 | 0.0 | |
1 | 0.0 | |
2 | 0.0 |
CleanData.example(size=3)
continuous | categorical | |
---|---|---|
0 | 8.778791e-69 | C |
1 | 2.000010e+00 | B |
2 | 3.402823e+38 | A |
# Transform your unit test suite!
# data_cleaner.py
@pa.check_types
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Mask invalid values; pandera enforces the input/output schemas."""
    return raw_data.assign(
        continuous=lambda df: df.continuous.mask(df.continuous < 0),
        categorical=lambda df: df.categorical.mask(~df.categorical.isin(["A", "B", "C"]))
    )
# test_data_cleaner.py
from hypothesis import given

# Property-based test: hypothesis draws valid inputs from the RawData schema,
# so the test body only needs to execute clean() without error.
@given(RawData.strategy(size=5))
def test_clean(mock_raw_data):
    clean(mock_raw_data)
class InvalidData(pa.SchemaModel):
    """A schema deliberately incompatible with RawData (wrong column)."""

    foo: Series[int]
# Generated invalid inputs should always fail schema validation inside clean().
@given(InvalidData.strategy(size=5))
def test_clean_errors(mock_invalid_data):
    with pytest.raises(pa.errors.SchemaError):
        clean(mock_invalid_data)
def run_test_suite():
    """Execute the data-cleaning test functions and report success."""
    for test_fn in (test_clean, test_clean_errors):
        test_fn()
    print("tests pass! ✅")
run_test_suite()
tests pass! ✅
In 0.8.0, pandera supports dask, modin, and koalas dataframes to scale data validation to big data.
display(raw_data)
continuous | categorical | |
---|---|---|
0 | -1.1 | A |
1 | 4.0 | B |
2 | 10.25 | C |
3 | -0.1 | Z |
4 | 5.2 | X |
import dask.dataframe as dd

# Validation on a dask dataframe is lazy; .compute() triggers it.
dask_dataframe = dd.from_pandas(raw_data, npartitions=1)
try:
    CleanData(dask_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)
schema_context | column | check | check_number | failure_case | index | |
---|---|---|---|---|---|---|
0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
2 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | Z | 3 |
3 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | X | 4 |
import modin.pandas as mpd

# Same validation applied to a modin dataframe (eager; no .compute() needed).
modin_dataframe = mpd.DataFrame(raw_data)
try:
    CleanData(modin_dataframe, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)
schema_context | column | check | check_number | failure_case | index | |
---|---|---|---|---|---|---|
0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
2 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | Z | 3 |
3 | Column | categorical | isin({'B', 'A', 'C'}) | 0 | X | 4 |
from typing import Union

# Primitive type annotations describe scalar values only.
Number = Union[int, float]

def add_and_double(x: Number, y: Number) -> Number:
    ...

add_and_double(5, 2)        # typechecks
add_and_double(5, "hello")  # rejected by a static type checker
add_and_double(11.5, -1.5)  # typechecks
import pandera as pa
from pandera.typing import DataFrame, Series
class Inputs(pa.SchemaModel):
    """Statistical type of the function's input dataframe."""

    x: Series[int]
    y: Series[int]

    class Config:
        # cast incoming columns to the annotated dtypes on validation
        coerce = True
class Outputs(Inputs):
    """Statistical type of the output: the inputs plus a derived column z."""

    z: Series[int]

    # every row must satisfy z == (x + y) * 2
    @pa.dataframe_check
    def custom_check(cls, df: DataFrame) -> Series:
        return df["z"] == (df["x"] + df["y"]) * 2
@pa.check_types
def add_and_double(raw_data: DataFrame[Inputs]) -> DataFrame[Outputs]:
    """Stub: the contract is fully expressed by the schema annotations."""
    ...
Statistical typing extends primitive data types with additional semantics about the *properties held by a collection of data points*.
# A single data point: primitive types suffice to describe it.
data_point = {"square_footage": 700, "nbedrooms": 1, "price": 500_000}

# A collection of data points: statistical properties of the collection
# (ranges, distributions) become meaningful.
data_points = [
    {"square_footage": 700, "nbedrooms": 1, "price": 500_000},
    {"square_footage": 1000, "nbedrooms": 2, "price": 750_000},
    {"square_footage": 3000, "nbedrooms": 4, "price": 1_000_000},
    ...
]
"The height of group A is greater than that of group B"
df = pd.DataFrame({
    "height_in_feet": [6.5, 7, 7.1, 6.1, 5.1, 4],
    "group": ["A", "A", "A", "B", "B", "B"]
})

# Hypothesis check: one-sided two-sample t-test asserting group A's mean
# height exceeds group B's at significance level 0.05.
schema = pa.DataFrameSchema({
    "height_in_feet": pa.Column(
        float, [
            pa.Hypothesis.two_sample_ttest(
                sample1="A",
                relationship="greater_than",
                sample2="B",
                groupby="group",
                alpha=0.05,
                equal_var=True,
            ),
        ]
    ),
    "group": pa.Column(str, pa.Check.isin(["A", "B"])),
})

display(schema(df))
height_in_feet | group | |
---|---|---|
0 | 6.5 | A |
1 | 7.0 | A |
2 | 7.1 | A |
3 | 6.1 | B |
4 | 5.1 | B |
5 | 4.0 | B |
from scipy import stats

def two_sample_ttest(array1, array2):
    """Run an independent two-sample t-test; returns (statistic, pvalue)."""
    result = stats.ttest_ind(array1, array2)
    return result
def greater_than(stat, pvalue, alpha=0.01):
    """One-sided "greater than" decision from a two-sided t-test result."""
    if stat <= 0:
        # wrong direction: sample 1's mean is not larger
        return False
    # halve the two-sided p-value to get the one-sided p-value
    return pvalue / 2 < alpha
# Custom hypothesis: plug user-defined test and relationship functions
# into pa.Hypothesis.
schema = pa.DataFrameSchema({
    "height_in_feet": pa.Column(
        float, [
            pa.Hypothesis(
                name="two_sample_test[A > B; alpha=0.05]",
                test=two_sample_ttest,
                samples=["A", "B"],
                groupby="group",
                relationship=greater_than,
                relationship_kwargs={"alpha": 0.05}
            )
        ]),
    "group": pa.Column(str, checks=pa.Check.isin(["A", "B"]))
})

# Groups are swapped relative to the earlier example, so "A > B" should fail.
df = pd.DataFrame({
    "height_in_feet": [6.5, 7, 7.1, 6.1, 5.1, 4],
    "group": ["B", "B", "B", "A", "A", "A"]
})

try:
    schema(df, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)
schema_context | column | check | check_number | failure_case | index | |
---|---|---|---|---|---|---|
0 | Column | height_in_feet | two_sample_test[A > B; alpha=0.05] | 0 | False | None |
xarray, jsonschema, and an extension API for arbitrary data containers. It also integrates with fastapi, pydantic, and pytest.