from typing import Union

# A Number is either an int or a float.
Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    """Add ``x`` and ``y`` and double the result (implementation elided on this slide)."""
    ...
Can you predict the outcome of these function calls?
add_and_double(5, 2)
add_and_double(5, "hello")
add_and_double(11.5, -1.5)
pandera
Especially if they're in ML models that took a lot of ⏰ to train
model ~ data
Δdata -> Δmodel
model
as a function f(x) -> y
f
is working as intended? Catches certain type errors before running code
from typing import Union

# A Number is either an int or a float.
Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    """Return twice the sum of ``x`` and ``y``."""
    total = x + y
    return 2 * total
add_and_double(5, 2) # ✅
add_and_double(5, "hello") # ❌
add_and_double(11.5, -1.5) # ✅
Problem: What if the underlying implementation is wrong?
from typing import Union

Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    # NOTE(slide): deliberately wrong implementation — the static types still
    # check, but the behavior is (x - y) * 4 instead of (x + y) * 2. This
    # demonstrates that type checking alone does not verify the implementation.
    return (x - y) * 4
add_and_double(5, 2) # output: 12
add_and_double(5, "hello") # raises: TypeError
add_and_double(11.5, -1.5) # output: 52
Unit tests verify the behavior of isolated pieces of functionality and let you know when changes cause breakages or regressions.
import pytest


def test_add_and_double():
    """Happy-path unit tests for add_and_double."""
    # 🙂 path
    assert add_and_double(5, 2) == 14
    # BUG fixed: the operand was -15, making the expected value -7.0, not 20.0;
    # the examples elsewhere on these slides use (11.5, -1.5) -> 20.0.
    assert add_and_double(11.5, -1.5) == 20.0
    assert add_and_double(-10, 1.0) == -18.0


def test_add_and_double_exceptions():
    """Mixing str with a number must raise TypeError."""
    # 😞 path
    with pytest.raises(TypeError):
        add_and_double(5, "hello")
    with pytest.raises(TypeError):
        add_and_double("world", 32.5)
Property-based testing alleviates the burden of explicitly writing test cases
from hypothesis import given
from hypothesis.strategies import integers, floats, one_of, text

# NaN breaks the equality property (nan != nan) and inf + -inf is nan,
# so exclude them from the generated floats.
numbers = one_of(integers(), floats(allow_nan=False, allow_infinity=False))


@given(x=numbers, y=numbers)
def test_add_and_double(x, y):
    """Property: the result always equals (x + y) * 2."""
    assert add_and_double(x, y) == (x + y) * 2


@given(x=numbers, y=text())
def test_add_and_double_exceptions(x, y):
    # BUG fixed: the parameters (x, y) were missing from the signature, so the
    # body referenced undefined names instead of the generated values.
    with pytest.raises(TypeError):
        add_and_double(x, y)
from typing import List, Tuple, TypedDict

# One survey response: two integer answers and one free-text answer.
Response = TypedDict("Response", q1=int, q2=int, q3=str)
# A training example: feature vector plus a boolean target.
Example = Tuple[List[float], bool]


def store_data(raw_response: str) -> Response:
    """Parse and store a single raw response (atomic: one data point)."""
    ...


def create_dataset(raw_responses: List[Response], target: List[bool]) -> List[Example]:
    """Build training examples from a whole sample of responses."""
    ...


def train_model(survey_responses: List[Example]) -> str:
    """Train a model on the examples (returns a str per the annotation — presumably an artifact reference; confirm with implementation)."""
    ...
store_data's scope of concern is atomic, i.e. it only operates on a single data point 🧘⚛
create_dataset needs to worry about the statistical patterns of a sample of data points 😓📊
How would you test create_dataset on plausible example data?
.... it's not fun 😭
import string
import pandas as pd  # BUG fixed: pd.DataFrame is used below but pandas was never imported
import pandera as pa
from pandera.typing import Series


class SurveySchema(pa.SchemaModel):
    """Statistical type for survey data: two 1–5 answers and alphanumeric text."""

    q1: Series[int] = pa.Field(isin=[1, 2, 3, 4, 5])
    q2: Series[int] = pa.Field(isin=[1, 2, 3, 4, 5])
    q3: Series[str] = pa.Field(str_matches="[a-zA-Z0-9 ]+")


# q1 = -1 is outside the declared support, so validation fails.
data = pd.DataFrame({"q1": [-1], "q2": [5], "q3": ["hello"]})
try:
    SurveySchema.validate(data)
except Exception as e:
    print(e)
<Schema Column(name=q1, type=<class 'int'>)> failed element-wise validator 0: <Check isin: isin({1, 2, 3, 4, 5})> failure cases: index failure_case 0 0 -1
# Generate valid synthetic rows directly from the schema (hypothesis-backed).
sample_data = SurveySchema.example(size=3)
display(sample_data)
q1 | q2 | q3 | |
---|---|---|---|
0 | 1 | 1 | W |
1 | 1 | 1 | 0 |
2 | 1 | 1 | 0 |
Statistical typing extends primitive data types with additional semantics about the properties held by a collection of data points
Boolean → Bernoulli
# (pseudocode) Statistical typing: a Boolean extends to a Bernoulli distribution.
x1 = True
x2 = False

support: Set[bool] = {x1, x2}
# BUG fixed: "{True: 0.5, False, 0.5}" mixed dict and set syntax (SyntaxError);
# the second entry must be the dict item False: 0.5. Keys are bools, not strs.
probability_distribution: Dict[bool, float] = {True: 0.5, False: 0.5}

FairCoin = Bernoulli(support, probability_distribution)

data: FairCoin = [1, 0, 0, 1, 1, 0]
mean(data)
mode(data)
Enum → Categorical
# (pseudocode) An Enum extends to a Categorical distribution.
# Fixed: a stray "¶" extraction artifact was glued to the class keyword.
class Animal(Enum):
    CAT = 1
    DOG = 2
    COW = 3


FarmAnimals = Categorical(
    Animal,
    probabilities={
        Animal.CAT: 0.01,
        Animal.DOG: 0.04,
        Animal.COW: 0.95,
    },
    ordered=False,
)

# 50/50 cats and dogs does not match the declared probabilities above.
data: FarmAnimals = [Animal.CAT] * 50 + [Animal.DOG] * 50
check_type(data) # raise a RuntimeError
Float → Gaussian
# (pseudocode) A Float extends to a Gaussian; sampling it yields test fixtures.
# Fixed: a stray "¶" extraction artifact was glued to the first line.
TreeHeight = Gaussian(mean=10, standard_deviation=1)


def test_process_data():
    data: List[float] = sample(TreeHeight)
    result = process_data(data)
    assert ...
import math
from typing import List


def normalize(x: List[float]) -> List[float]:
    """Mean-center ``x`` and scale it by its (population) standard deviation.

    Args:
        x: a non-empty, non-constant sample of numbers.

    Returns:
        The standardized sample (mean 0, standard deviation 1).

    Raises:
        ValueError: if ``x`` is empty or has zero variance — previously these
            surfaced as an uninformative ZeroDivisionError.
    """
    if not x:
        raise ValueError("cannot normalize an empty sample")
    mean = sum(x) / len(x)
    std = math.sqrt(sum((i - mean) ** 2 for i in x) / len(x))
    if std == 0:
        raise ValueError("cannot normalize a zero-variance sample")
    x_norm = [(i - mean) / std for i in x]
    # runtime assertions: a mean-centered, non-constant sample must straddle 0
    assert any(i < 0 for i in x_norm)
    assert any(i > 0 for i in x_norm)
    return x_norm
For each variable in my dataset, define:
int
, float
, bool
, str
, etc.x >= 0
mean
and standard deviation
Some statistical properties can be checked statically, e.g. the mean operation cannot be applied to categorical data
mean(categorical) ❌
Other properties can only be checked at runtime, e.g. this sample of data is drawn from a Gaussian
scipy.stats.normaltest(normalize(raw_data))
Schemas can be implemented as generative data contracts that can be used for type checking and sampling
pandera
Suppose we're building a predictive model of house prices given features about different houses:
# Toy training data: house features and sale prices (CSV text, parsed later).
raw_data = """
square_footage,n_bedrooms,property_type,price
750,1,condo,200000
900,2,condo,400000
1200,2,house,500000
1100,3,house,450000
1000,2,condo,300000
1000,2,townhouse,300000
1200,2,townhouse,350000
"""
square_footage: positive integer
n_bedrooms: positive integer
property_type: categorical
price: positive real number
def process_data(raw_data): # step 1: prepare data for model training
...
def train_model(processed_data): # step 2: fit a model on processed data
...
pandera
# Fixed: a stray "¶" extraction artifact was glued to the import line.
import pandera as pa
from pandera.typing import Series, DataFrame

PROPERTY_TYPES = ["condo", "townhouse", "house"]


class BaseSchema(pa.SchemaModel):
    """Columns shared by both the raw and the processed data."""

    square_footage: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 3000})
    n_bedrooms: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 10})
    price: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 1000000})

    class Config:
        coerce = True  # cast columns to the declared dtypes during validation


class RawData(BaseSchema):
    """Raw input: property_type is a categorical string column."""

    property_type: Series[str] = pa.Field(isin=PROPERTY_TYPES)


class ProcessedData(BaseSchema):
    """Model-ready data: property_type one-hot encoded into 0/1 columns."""

    property_type_condo: Series[int] = pa.Field(isin=[0, 1])
    property_type_house: Series[int] = pa.Field(isin=[0, 1])
    property_type_townhouse: Series[int] = pa.Field(isin=[0, 1])
With Type Annotations
@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    # The decorator validates input against RawData and output against
    # ProcessedData on every call (implementation elided on this slide).
    ...


@pa.check_types
def train_model(processed_data: DataFrame[ProcessedData]):
    # Input validated against ProcessedData (implementation elided on this slide).
    ...
With Implementation
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression


@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    """One-hot encode property_type into the ProcessedData layout."""
    # CategoricalDtype pins the full category set so every dummy column is
    # emitted even when a level is absent from this particular sample.
    return pd.get_dummies(
        raw_data.astype({"property_type": pd.CategoricalDtype(PROPERTY_TYPES)})
    )


@pa.check_types
def train_model(processed_data: DataFrame[ProcessedData]) -> BaseEstimator:
    """Fit a linear regression of price on all remaining feature columns."""
    return LinearRegression().fit(
        X=processed_data.drop("price", axis=1),
        y=processed_data["price"],
    )
Validate the statistical type of raw and processed data every time we run our pipeline.
from io import StringIO


def run_pipeline(raw_data):
    """End-to-end pipeline; schema validation happens inside the @pa.check_types-decorated steps."""
    processed_data = process_data(raw_data)
    estimator = train_model(processed_data)
    # evaluate model, save artifacts, etc...
    print("✅ model training successful!")


run_pipeline(pd.read_csv(StringIO(raw_data.strip())))
✅ model training successful!
# "unknown" is not in PROPERTY_TYPES, so the RawData check inside
# process_data rejects this frame at pipeline entry.
invalid_data = """
square_footage,n_bedrooms,property_type,price
750,1,unknown,200000
900,2,condo,400000
1200,2,house,500000
"""
try:
    run_pipeline(pd.read_csv(StringIO(invalid_data.strip())))
except Exception as e:
    print(e)
error in check_types decorator of function 'process_data': <Schema Column(name=property_type, type=<class 'str'>)> failed element-wise validator 0: <Check isin: isin({'condo', 'house', 'townhouse'})> failure cases: index failure_case 0 0 unknown
Define property-based unit tests with hypothesis
from hypothesis import given


@given(RawData.strategy(size=3))
def test_process_data(raw_data):
    # Schema-driven generation: pandera synthesizes valid RawData frames,
    # and the @pa.check_types decorator validates the output schema.
    process_data(raw_data)


@given(ProcessedData.strategy(size=3))
def test_train_model(processed_data):
    estimator = train_model(processed_data)
    predictions = estimator.predict(processed_data.drop("price", axis=1))
    # One prediction per input row.
    assert len(predictions) == processed_data.shape[0]


def run_test_suite():
    """Run both property-based tests; @given-decorated tests take no arguments."""
    test_process_data()
    test_train_model()
    print("✅ tests successful!")


run_test_suite()
✅ tests successful!
Define property-based unit tests with hypothesis
@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    # NOTE(slide): deliberately broken — the raw frame is returned unchanged,
    # so the ProcessedData output check fails and hypothesis reports a
    # falsifying example when the test suite reruns.
    return raw_data


try:
    run_test_suite()
except Exception as e:
    print(e)
Falsifying example: test_process_data( raw_data= square_footage n_bedrooms price property_type 0 0 0 0 condo 1 0 0 0 condo 2 0 0 0 condo, ) error in check_types decorator of function 'process_data': column 'property_type_condo' not in dataframe square_footage n_bedrooms price property_type 0 0 0 0 condo 1 0 0 0 condo 2 0 0 0 condo
For some datasets, it might make sense to infer a schema from a sample of data and go from there:
# Parse the CSV sample so a schema can be inferred from observed data.
raw_df = pd.read_csv(StringIO(raw_data.strip()))
display(raw_df.head(3))
square_footage | n_bedrooms | property_type | price | |
---|---|---|---|---|
0 | 750 | 1 | condo | 200000 |
1 | 900 | 2 | condo | 400000 |
2 | 1200 | 2 | house | 500000 |
# Bootstrap a schema from the sample, then refine it by hand.
schema = pa.infer_schema(raw_df)
schema.to_yaml()    # serialize the inferred schema to YAML
schema.to_script()  # emit a python script that defines the schema
print(schema)
<Schema DataFrameSchema( columns={ 'square_footage': <Schema Column(name=square_footage, type=int64)> 'n_bedrooms': <Schema Column(name=n_bedrooms, type=int64)> 'property_type': <Schema Column(name=property_type, type=str)> 'price': <Schema Column(name=price, type=int64)> }, checks=[], coerce=True, pandas_dtype=None, index=<Schema Index(name=None, type=int64)>, strict=False name=None, ordered=False )>
# (pseudocode) Recap: a Categorical statistical type carries its distribution,
# so invalid operations can be rejected statically.
FarmAnimals = Categorical(
    Animal,
    probabilities={
        Animal.CAT: 0.01,
        Animal.DOG: 0.04,
        Animal.COW: 0.95,
    },
    ordered=False,
)

data: FarmAnimals = [Animal.CAT] * 50 + [Animal.DOG] * 50
mean(data) # ❌ cannot apply mean to Categorical


def model(input_data: Gaussian) -> Bernoulli:
    # A model typed by the distributions it maps between.
    ...


# The "type" of such a model would be the family of compatible estimators.
type(model)
# [LogisticRegression, RandomForestClassifier, ...]
Model-based statistical types
In theory, a generative adversarial network can be used as a schema to validate real-world data and generate synthetic data
The discriminator, which is typically discarded after training, can validate real or upstream synthetic data.
category | images | |
---|---|---|
0 | cat | image1.jpeg |
1 | dog | image2.jpeg |
2 | cow | image3.jpeg |
3 | horse | image4.jpeg |
4 | ... | ... |
# (speculative pseudocode) A trained GAN discriminator acting as a validator
# for an image column; Image and GenerativeAdversarialNetwork are hypothetical.
class ImageSchema(pa.SchemaModel):
    category: Series[str] = pa.Field(isin=["cat", "dog", "cow", "horse", "..."])
    images: Series[Image] = pa.Field(drawn_from=GenerativeAdversarialNetwork("weights.pt"))
Statistical typing extends primitive data types into the statistical domain, opening up a bunch of testing capabilities that make statistical code more robust and easier to reason about.
niels.bantilan@gmail.com
@cosmicbboy
cosmicBboy