WARNING:root:Found pyspark version "3.2.0" installed. The pyspark version 3.2 and above has a built-in "pandas APIs on Spark" module ported from Koalas. Try `import pyspark.pandas as ps` instead.
Data validation is the act of asking the question "are my data as I expect them to be?"
It is the act of writing programs that assert properties not only about the data themselves, but also about the functions that produce them.
data_cleaner.py
import pandas as pd
# Example "messy" input: numeric values arrive as strings (some negative), and
# the categorical column contains labels ("Z", "X") outside the expected set.
raw_data = pd.DataFrame({
"continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
"categorical": ["A", "B", "C", "Z", "X"],
})
def clean(raw_data):
    """Placeholder for a data-cleaning function; the body is elided (`...`).

    A concrete implementation is given further down in the article.
    """
    # do some cleaning 🧹✨
    clean_data = ...
    return clean_data
test_data_cleaner.py
import pytest
def test_clean():
    """Exercise `clean` against hand-written assumptions about its output."""
    # valid input: one in-range and one out-of-range value per column
    valid_input = pd.DataFrame(
        {"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]}
    )
    cleaned = clean(valid_input)

    # every row should carry at least one null once invalid values are masked
    assert cleaned.isna().any(axis="columns").all()

    # dtype expectations for each column
    assert cleaned["continuous"].dtype == float
    assert cleaned["categorical"].dtype == object

    # surviving (non-null) values satisfy the domain constraints
    assert cleaned["continuous"].dropna().ge(0).all()
    assert cleaned["categorical"].dropna().isin(["A", "B", "C"]).all()

    # invalid input: a frame missing the `continuous` column should raise
    with pytest.raises(KeyError):
        clean(pd.DataFrame({"categorical": ["A"]}))
    print("tests pass! ✅")
Let's implement the `clean` function:
def clean(raw_data):
    """Coerce raw columns to their expected dtypes and null out invalid values.

    ``continuous`` is cast to float and negative entries become NaN;
    ``categorical`` is cast to str and labels outside {"A", "B", "C"}
    become NaN.  Returns a new DataFrame; the input is not mutated.
    """
    frame = pd.DataFrame(raw_data).astype({"continuous": float, "categorical": str})
    # `where(pred)` keeps entries where pred holds and nulls the rest —
    # equivalent to `mask` of the negated predicate.
    frame["continuous"] = frame["continuous"].where(frame["continuous"] >= 0)
    frame["categorical"] = frame["categorical"].where(
        frame["categorical"].isin(["A", "B", "C"])
    )
    return frame
clean(raw_data)
continuous | categorical | |
---|---|---|
0 | NaN | A |
1 | 4.00 | B |
2 | 10.25 | C |
3 | NaN | NaN |
4 | 5.20 | NaN |
test_clean()
tests pass! ✅
An expressive and light-weight statistical validation tool for dataframes
Defining a schema looks and feels like defining a pandas dataframe
import pandera as pa
# Object-based schema: one Column entry per dataframe column.
clean_data_schema = pa.DataFrameSchema(
    columns={
        # floats that must be >= 0
        "continuous": pa.Column(float, pa.Check.ge(0)),
        # strings restricted to the known label vocabulary
        "categorical": pa.Column(str, pa.Check.isin(["A", "B", "C"])),
    },
    coerce=True,  # cast raw values to the declared dtypes before checking
)
Know Exactly What Went Wrong with Your Data
# Re-create the messy example data: two negative continuous values and
# two out-of-vocabulary category labels.
raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})
try:
    # Fix: validate against the schema object defined above
    # (`clean_data_schema`); `CleanData` is not defined at this point.
    # `lazy=True` collects every failure instead of stopping at the first.
    clean_data_schema.validate(raw_data, lazy=True)
except pa.errors.SchemaErrors as exc:
    # `display` is the notebook rich-output helper
    display(exc.failure_cases)
schema_context | column | check | check_number | failure_case | index | |
---|---|---|---|---|---|---|
0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
2 | Column | categorical | isin({'A', 'B', 'C'}) | 0 | Z | 3 |
3 | Column | categorical | isin({'A', 'B', 'C'}) | 0 | X | 4 |
Here's data_cleaner.py
again:
import pandera as pa
from pandera.typing import DataFrame, Series
class RawData(pa.SchemaModel):
    """Schema for incoming data: dtypes only, no value constraints yet."""
    continuous: Series[float]
    categorical: Series[str]
    class Config:
        # cast input columns to the declared dtypes during validation
        coerce = True
class CleanData(RawData):
    """Schema for cleaned data: inherits dtypes/coercion from RawData and
    adds value constraints, with nulls allowed for masked-out entries."""
    continuous = pa.Field(ge=0, nullable=True)
    categorical = pa.Field(isin=[*"ABC"], nullable=True)  # [*"ABC"] == ["A", "B", "C"]
Pandera guarantees that input and output dataframes fulfill the types and constraints as defined by type annotations
@pa.check_types
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Null out values that violate CleanData's constraints.

    Input/output schema enforcement is handled by ``@pa.check_types``.
    """
    # `where(pred)` keeps entries where pred holds and nulls the rest,
    # equivalent to `mask` of the negated predicate.
    return raw_data.assign(
        continuous=lambda df: df.continuous.where(df.continuous >= 0),
        categorical=lambda df: df.categorical.where(df.categorical.isin(["A", "B", "C"])),
    )
clean(raw_data)
continuous | categorical | |
---|---|---|
0 | NaN | A |
1 | 4.00 | B |
2 | 10.25 | C |
3 | NaN | NaN |
4 | 5.20 | NaN |
test_data_cleaner.py
def test_clean():
    """With pandera enforcing the schemas, testing reduces to executing `clean`."""
    # valid data: if `clean` returns without raising, the output schema held
    valid_frame = pd.DataFrame(
        {"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]}
    )
    clean(valid_frame)
    # invalid data: a frame missing a required column must trigger a SchemaError
    with pytest.raises(pa.errors.SchemaError):
        clean(pd.DataFrame({"categorical": ["A"]}))
    print("tests pass! ✅")
test_clean()
tests pass! ✅
Once you've defined a schema, you can import it in other parts of your code base, like your test suite!
# data_cleaner.py
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Replace out-of-domain values with NaN so the frame satisfies CleanData."""
    # `where(pred)` keeps entries where pred holds and nulls the rest,
    # equivalent to `mask` of the negated predicate.
    return raw_data.assign(
        continuous=lambda df: df.continuous.where(df.continuous >= 0),
        categorical=lambda df: df.categorical.where(df.categorical.isin(["A", "B", "C"])),
    )
# test_data_cleaner.py
def test_clean():
    """Validate the fixture with RawData and the result with CleanData explicitly."""
    # valid data: validated on the way in, and again on the way out
    fixture = RawData(
        pd.DataFrame({"continuous": ["1.0", "-5.1"], "categorical": ["X", "A"]})
    )
    CleanData(clean(fixture))
    # invalid data: RawData rejects a frame missing the `continuous` column
    with pytest.raises(pa.errors.SchemaError):
        clean(RawData(pd.DataFrame({"categorical": ["A"]})))
    print("tests pass! ✅")
test_clean()
tests pass! ✅
You can even represent dataframe joins!
class CleanData(RawData):
    """Cleaned-data schema: non-negative floats and known labels, nulls allowed."""
    continuous = pa.Field(ge=0, nullable=True)
    categorical = pa.Field(isin=[*"ABC"], nullable=True)
class SupplementaryData(pa.SchemaModel):
    """Schema for the extra column contributed by the join."""
    discrete: Series[int] = pa.Field(ge=0, nullable=True)
class JoinedData(CleanData, SupplementaryData): pass  # union of both schemas via inheritance
# Demo: validate the result of a join against the combined schema.
clean_data = pd.DataFrame({"continuous": ["1.0"], "categorical": ["A"]})
supplementary_data = pd.DataFrame({"discrete": [1]})
JoinedData(clean_data.join(supplementary_data))  # calling a SchemaModel validates the frame
continuous | categorical | discrete | |
---|---|---|---|
0 | 1.0 | A | 1 |
# Bootstrap a schema from data you already trust, then refine it by hand.
clean_data = pd.DataFrame({
    "continuous": range(100),
    "categorical": [*"ABCAB" * 20]  # 100 labels cycling through A, B, C, A, B
})
schema = pa.infer_schema(clean_data)
print(schema)
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(int64))> 'categorical': <Schema Column(name=categorical, type=DataType(object))> }, checks=[], coerce=True, dtype=None, index=<Schema Index(name=None, type=DataType(int64))>, strict=False name=None, ordered=False )>
# Schemas round-trip through YAML for sharing/versioning outside Python.
yaml_schema = schema.to_yaml()
print(yaml_schema)
schema_type: dataframe version: 0.8.0 columns: continuous: dtype: int64 nullable: false checks: greater_than_or_equal_to: 0.0 less_than_or_equal_to: 99.0 unique: false coerce: false required: true regex: false categorical: dtype: object nullable: false checks: null unique: false coerce: false required: true regex: false checks: null index: - dtype: int64 nullable: false checks: greater_than_or_equal_to: 0.0 less_than_or_equal_to: 99.0 name: null coerce: false coerce: true strict: false unique: null
print(schema.from_yaml(yaml_schema))
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(int64))> 'categorical': <Schema Column(name=categorical, type=DataType(object))> }, checks=[], coerce=True, dtype=None, index=<Schema Index(name=None, type=DataType(int64))>, strict=False name=None, ordered=False )>
schema.to_script()
from pandera import DataFrameSchema, Column, Check, Index, MultiIndex
# Output of `schema.to_script()` above: a standalone, editable definition of
# the inferred schema (presumably assumes `pandera` is importable — note the
# fully qualified `pandera.engines...` dtype references).
schema = DataFrameSchema(
    columns={
        "continuous": Column(
            dtype=pandera.engines.numpy_engine.Int64,
            # min/max checks inferred from the observed value range 0..99
            checks=[
                Check.greater_than_or_equal_to(min_value=0.0),
                Check.less_than_or_equal_to(max_value=99.0),
            ],
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            regex=False,
        ),
        "categorical": Column(
            dtype=pandera.engines.numpy_engine.Object,
            checks=None,  # no value checks inferred for object columns
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            regex=False,
        ),
    },
    # the default RangeIndex is also captured as part of the schema
    index=Index(
        dtype=pandera.engines.numpy_engine.Int64,
        checks=[
            Check.greater_than_or_equal_to(min_value=0.0),
            Check.less_than_or_equal_to(max_value=99.0),
        ],
        nullable=False,
        coerce=False,
        name=None,
    ),
    coerce=True,
    strict=False,
    name=None,
)
frictionless table schema
from pandera.io import from_frictionless_schema
# A frictionless table schema expressed as a plain dict.
frictionless_schema = {
    "fields": [
        {
            "name": "continuous",
            "type": "number",
            "constraints": {"minimum": 0}  # maps to a >= 0 check
        },
        {
            "name": "categorical",
            "type": "string",
            "constraints": {"isin": ["A", "B", "C"]}
        },
    ],
}
# Convert the frictionless definition into a pandera DataFrameSchema.
schema = from_frictionless_schema(frictionless_schema)
print(schema)
<Schema DataFrameSchema( columns={ 'continuous': <Schema Column(name=continuous, type=DataType(float64))> 'categorical': <Schema Column(name=categorical, type=DataType(string[python]))> }, checks=[], coerce=True, dtype=None, index=None, strict=True name=None, ordered=False )>
Generate valid examples under the schema's constraints
RawData.example(size=3)
continuous | categorical | |
---|---|---|
0 | 0.0 | |
1 | 0.0 | |
2 | 0.0 |
CleanData.example(size=3)
continuous | categorical | |
---|---|---|
0 | 0.0 | A |
1 | 0.0 | A |
2 | 0.0 | A |
# Transform your unit test suite!
from hypothesis import given
@pa.check_types
def clean(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    """Same cleaning function as above; schemas enforced by @pa.check_types."""
    return raw_data.assign(
        continuous=lambda _: _.continuous.mask(_.continuous < 0),
        categorical=lambda _: _.categorical.mask(~_.categorical.isin(["A", "B", "C"]))
    )
# Property-based test: hypothesis draws valid RawData frames from the schema.
@given(RawData.strategy(size=5))
def test_clean(mock_raw_data):
    clean(mock_raw_data)  # passing schemas make this a pure execution test
class InvalidData(pa.SchemaModel):
    # a schema that deliberately lacks the columns `clean` requires
    foo: Series[int]
@given(InvalidData.strategy(size=5))
def test_clean_errors(mock_invalid_data):
    # any InvalidData frame must be rejected by `clean`'s input schema
    with pytest.raises(pa.errors.SchemaError):
        clean(mock_invalid_data)
def run_test_suite():
    """Run both hypothesis-driven tests and report success."""
    test_clean()
    test_clean_errors()
    print("tests pass! ✅")
run_test_suite()
tests pass! ✅
display(raw_data)
continuous | categorical | |
---|---|---|
0 | -1.1 | A |
1 | 4.0 | B |
2 | 10.25 | C |
3 | -0.1 | Z |
4 | 5.2 | X |
import dask.dataframe as dd
# The same pandera schema validates a dask dataframe; validation is deferred
# until .compute() materializes the result.
dask_dataframe = dd.from_pandas(raw_data, npartitions=1)
try:
    CleanData(dask_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)
schema_context column check check_number \ 0 Column continuous greater_than_or_equal_to(0) 0 1 Column continuous greater_than_or_equal_to(0) 0 2 Column categorical isin({'A', 'B', 'C'}) 0 3 Column categorical isin({'A', 'B', 'C'}) 0 failure_case index 0 -1.1 0 1 -0.1 3 2 Z 3 3 X 4
import databricks.koalas as ks
# Same pattern on a koalas (pandas-on-Spark) dataframe.
koalas_dataframe = ks.DataFrame(raw_data)
try:
    CleanData(koalas_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    # NOTE(review): the captured output shows Spark AnalysisExceptions as
    # failure cases here, not value-level failures — behavior differs from dask.
    print(exc.failure_cases)
schema_context column check check_number \ 0 Column continuous greater_than_or_equal_to(0) 0 1 Column categorical isin({'A', 'B', 'C'}) 0 failure_case index 0 AnalysisException("Resolved attribute(s) conti... None 1 AnalysisException("Resolved attribute(s) categ... None
import modin.pandas as mpd
# Same pattern on a modin dataframe.
modin_dataframe = mpd.DataFrame(raw_data)
try:
    CleanData(modin_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)
schema_context column check check_number \ 0 Column continuous greater_than_or_equal_to(0) 0 1 Column continuous greater_than_or_equal_to(0) 0 2 Column categorical isin({'A', 'B', 'C'}) 0 3 Column categorical isin({'A', 'B', 'C'}) 0 failure_case index 0 -1.1 0 1 -0.1 3 2 Z 3 3 X 4
from typing import Union
# Scalar type annotations document the contract; a static type checker enforces it.
Number = Union[int, float]
def add_and_double(x: Number, y: Number) -> Number:
    ...
add_and_double(5, 2)  # ok
add_and_double(5, "hello")  # intentionally invalid: a type checker flags str
add_and_double(11.5, -1.5)  # ok
import pandera as pa
from pandera.typing import DataFrame, Series
class Inputs(pa.SchemaModel):
    """Input schema: two integer columns, coerced on validation."""
    x: Series[int]
    y: Series[int]
    class Config:
        coerce = True
class Outputs(Inputs):
    """Output schema: the inputs plus the computed column `z`."""
    z: Series[int]
    @pa.dataframe_check
    def custom_check(cls, df: DataFrame) -> Series:
        # row-wise invariant tying the output column to its inputs
        return df["z"] == (df["x"] + df["y"]) * 2
@pa.check_types
def add_and_double(raw_data: DataFrame[Inputs]) -> DataFrame[Outputs]:
    ...
It provides a flexible and expressive API for defining types for dataframes.
This enables a more intuitive way of validating not only data, but also the functions that produce those data.
Dataframe libraries: pyarrow, xarray, polars, vaex, cudf, etc.
Tooling integrations: pandas-profiling, fastapi, json-schema.
pytest-pandera plugin for data pipeline profiling and reporting.