from typing import Union

# A Number is either an int or a float.
Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    """Add ``x`` and ``y`` and double the result (implementation elided on this slide)."""
    ...
Can you predict the outcome of these function calls?
add_and_double(5, 2)
add_and_double(5, "hello")
add_and_double(11.5, -1.5)
pandera
Especially if they're in ML models that took a lot of ⏰ to train
model ~ data
Δdata -> Δmodel
model
as a function f(x) -> y
f
is working as intended? Catches certain type errors before running code
from typing import Union

# A Number is either an int or a float.
Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    """Return twice the sum of ``x`` and ``y``."""
    total = x + y
    return 2 * total
add_and_double(5, 2) # ✅
add_and_double(5, "hello") # ❌
add_and_double(11.5, -1.5) # ✅
Problem: What if the underlying implementation is wrong?
from typing import Union

Number = Union[int, float]


def add_and_double(x: Number, y: Number) -> Number:
    # NOTE(slide): deliberately wrong implementation — the static types still
    # check, but the behavior is (x - y) * 4 instead of (x + y) * 2. This
    # demonstrates that type checking alone does not verify the implementation.
    return (x - y) * 4
add_and_double(5, 2) # output: 12
add_and_double(5, "hello") # raises: TypeError
add_and_double(11.5, -1.5) # output: 52
Unit tests verify the behavior of isolated pieces of functionality and let you know when changes cause breakages or regressions.
import pytest


def test_add_and_double():
    """Happy-path unit tests for add_and_double."""
    # 🙂 path
    assert add_and_double(5, 2) == 14
    # BUG fixed: the operand was -15, making the expected value -7.0, not 20.0;
    # the examples elsewhere on these slides use (11.5, -1.5) -> 20.0.
    assert add_and_double(11.5, -1.5) == 20.0
    assert add_and_double(-10, 1.0) == -18.0


def test_add_and_double_exceptions():
    """Mixing str with a number must raise TypeError."""
    # 😞 path
    with pytest.raises(TypeError):
        add_and_double(5, "hello")
    with pytest.raises(TypeError):
        add_and_double("world", 32.5)
Property-based testing alleviates the burden of explicitly writing test cases
from hypothesis import given
from hypothesis.strategies import integers, floats, one_of, text

# NaN breaks the equality property (nan != nan) and inf + -inf is nan,
# so exclude them from the generated floats.
numbers = one_of(integers(), floats(allow_nan=False, allow_infinity=False))


@given(x=numbers, y=numbers)
def test_add_and_double(x, y):
    """Property: the result always equals (x + y) * 2."""
    assert add_and_double(x, y) == (x + y) * 2


@given(x=numbers, y=text())
def test_add_and_double_exceptions(x, y):
    # BUG fixed: the parameters (x, y) were missing from the signature, so the
    # body referenced undefined names instead of the generated values.
    with pytest.raises(TypeError):
        add_and_double(x, y)
from typing import List, Tuple, TypedDict

# One survey response: two integer answers and one free-text answer.
Response = TypedDict("Response", q1=int, q2=int, q3=str)
# A training example: feature vector plus a boolean target.
Example = Tuple[List[float], bool]


def store_data(raw_response: str) -> Response:
    """Parse and store a single raw response (atomic: one data point)."""
    ...


def create_dataset(raw_responses: List[Response], target: List[bool]) -> List[Example]:
    """Build training examples from a whole sample of responses."""
    ...


def train_model(survey_responses: List[Example]) -> str:
    """Train a model on the examples (returns a str per the annotation — presumably an artifact reference; confirm with implementation)."""
    ...
store_data's scope of concern is atomic, i.e. it only operates on a single data point 🧘⚛
create_dataset needs to worry about the statistical patterns of a sample of data points 😓📊
How would you test create_dataset on plausible example data?
.... it's not fun 😭
import string
import pandas as pd  # BUG fixed: pd.DataFrame is used below but pandas was never imported
import pandera as pa
from pandera.typing import Series


class SurveySchema(pa.SchemaModel):
    """Statistical type for survey data: two 1–5 answers and alphanumeric text."""

    q1: Series[int] = pa.Field(isin=[1, 2, 3, 4, 5])
    q2: Series[int] = pa.Field(isin=[1, 2, 3, 4, 5])
    q3: Series[str] = pa.Field(str_matches="[a-zA-Z0-9 ]+")


# q1 = -1 is outside the declared support, so validation fails.
data = pd.DataFrame({"q1": [-1], "q2": [5], "q3": ["hello"]})
try:
    SurveySchema.validate(data)
except Exception as e:
    print(e)
<Schema Column(name=q1, type=<class 'int'>)> failed element-wise validator 0: <Check isin: isin({1, 2, 3, 4, 5})> failure cases: index failure_case 0 0 -1
# Generate valid synthetic rows directly from the schema (hypothesis-backed).
sample_data = SurveySchema.example(size=3)
display(sample_data)
q1 | q2 | q3 | |
---|---|---|---|
0 | 1 | 1 | W |
1 | 1 | 1 | 0 |
2 | 1 | 1 | 0 |
Statistical typing extends primitive data types with additional semantics about the properties held by a collection of data points
Boolean → Bernoulli
# (pseudocode) Statistical typing: a Boolean extends to a Bernoulli distribution.
x1 = True
x2 = False

support: Set[bool] = {x1, x2}
# BUG fixed: "{True: 0.5, False, 0.5}" mixed dict and set syntax (SyntaxError);
# the second entry must be the dict item False: 0.5. Keys are bools, not strs.
probability_distribution: Dict[bool, float] = {True: 0.5, False: 0.5}

FairCoin = Bernoulli(support, probability_distribution)

data: FairCoin = [1, 0, 0, 1, 1, 0]
mean(data)
mode(data)
Enum → Categorical
# (pseudocode) An Enum extends to a Categorical distribution.
# Fixed: a stray "¶" extraction artifact was glued to the class keyword.
class Animal(Enum):
    CAT = 1
    DOG = 2
    COW = 3


FarmAnimals = Categorical(
    Animal,
    probabilities={
        Animal.CAT: 0.01,
        Animal.DOG: 0.04,
        Animal.COW: 0.95,
    },
    ordered=False,
)

# 50/50 cats and dogs does not match the declared probabilities above.
data: FarmAnimals = [Animal.CAT] * 50 + [Animal.DOG] * 50
check_type(data) # raise a RuntimeError
Float → Gaussian
# (pseudocode) A Float extends to a Gaussian; sampling it yields test fixtures.
# Fixed: a stray "¶" extraction artifact was glued to the first line.
TreeHeight = Gaussian(mean=10, standard_deviation=1)


def test_process_data():
    data: List[float] = sample(TreeHeight)
    result = process_data(data)
    assert ...
import math
from typing import List


def normalize(x: List[float]) -> List[float]:
    """Mean-center ``x`` and scale it by its (population) standard deviation.

    Args:
        x: a non-empty, non-constant sample of numbers.

    Returns:
        The standardized sample (mean 0, standard deviation 1).

    Raises:
        ValueError: if ``x`` is empty or has zero variance — previously these
            surfaced as an uninformative ZeroDivisionError.
    """
    if not x:
        raise ValueError("cannot normalize an empty sample")
    mean = sum(x) / len(x)
    std = math.sqrt(sum((i - mean) ** 2 for i in x) / len(x))
    if std == 0:
        raise ValueError("cannot normalize a zero-variance sample")
    x_norm = [(i - mean) / std for i in x]
    # runtime assertions: a mean-centered, non-constant sample must straddle 0
    assert any(i < 0 for i in x_norm)
    assert any(i > 0 for i in x_norm)
    return x_norm
For each variable in my dataset, define:
int
, float
, bool
, str
, etc.x >= 0
mean
and standard deviation
Some statistical properties can be checked statically, e.g. the mean operation cannot be applied to categorical data
mean(categorical) ❌
Other properties can only be checked at runtime, e.g. this sample of data is drawn from a Gaussian
scipy.stats.normaltest(normalize(raw_data))
Schemas can be implemented as generative data contracts that can be used for type checking and sampling
pandera
Suppose we're building a predictive model of house prices given features about different houses:
# Toy training data: house features and sale prices (CSV text, parsed later).
raw_data = """
square_footage,n_bedrooms,property_type,price
750,1,condo,200000
900,2,condo,400000
1200,2,house,500000
1100,3,house,450000
1000,2,condo,300000
1000,2,townhouse,300000
1200,2,townhouse,350000
"""
square_footage: positive integer
n_bedrooms: positive integer
property_type: categorical
price: positive real number
def process_data(raw_data): # step 1: prepare data for model training
...
def train_model(processed_data): # step 2: fit a model on processed data
...
pandera
# Fixed: a stray "¶" extraction artifact was glued to the import line.
import pandera as pa
from pandera.typing import Series, DataFrame

PROPERTY_TYPES = ["condo", "townhouse", "house"]


class BaseSchema(pa.SchemaModel):
    """Columns shared by both the raw and the processed data."""

    square_footage: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 3000})
    n_bedrooms: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 10})
    price: Series[int] = pa.Field(in_range={"min_value": 0, "max_value": 1000000})

    class Config:
        coerce = True  # cast columns to the declared dtypes during validation


class RawData(BaseSchema):
    """Raw input: property_type is a categorical string column."""

    property_type: Series[str] = pa.Field(isin=PROPERTY_TYPES)


class ProcessedData(BaseSchema):
    """Model-ready data: property_type one-hot encoded into 0/1 columns."""

    property_type_condo: Series[int] = pa.Field(isin=[0, 1])
    property_type_house: Series[int] = pa.Field(isin=[0, 1])
    property_type_townhouse: Series[int] = pa.Field(isin=[0, 1])
With Type Annotations
@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    # The decorator validates input against RawData and output against
    # ProcessedData on every call (implementation elided on this slide).
    ...


@pa.check_types
def train_model(processed_data: DataFrame[ProcessedData]):
    # Input validated against ProcessedData (implementation elided on this slide).
    ...
With Implementation
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression


@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    """One-hot encode property_type into the ProcessedData layout."""
    # CategoricalDtype pins the full category set so every dummy column is
    # emitted even when a level is absent from this particular sample.
    return pd.get_dummies(
        raw_data.astype({"property_type": pd.CategoricalDtype(PROPERTY_TYPES)})
    )


@pa.check_types
def train_model(processed_data: DataFrame[ProcessedData]) -> BaseEstimator:
    """Fit a linear regression of price on all remaining feature columns."""
    return LinearRegression().fit(
        X=processed_data.drop("price", axis=1),
        y=processed_data["price"],
    )
Validate the statistical type of raw and processed data every time we run our pipeline.
from io import StringIO


def run_pipeline(raw_data):
    """End-to-end pipeline; schema validation happens inside the @pa.check_types-decorated steps."""
    processed_data = process_data(raw_data)
    estimator = train_model(processed_data)
    # evaluate model, save artifacts, etc...
    print("✅ model training successful!")


run_pipeline(pd.read_csv(StringIO(raw_data.strip())))
✅ model training successful!
# "unknown" is not in PROPERTY_TYPES, so the RawData check inside
# process_data rejects this frame at pipeline entry.
invalid_data = """
square_footage,n_bedrooms,property_type,price
750,1,unknown,200000
900,2,condo,400000
1200,2,house,500000
"""
try:
    run_pipeline(pd.read_csv(StringIO(invalid_data.strip())))
except Exception as e:
    print(e)
error in check_types decorator of function 'process_data': <Schema Column(name=property_type, type=<class 'str'>)> failed element-wise validator 0: <Check isin: isin({'condo', 'house', 'townhouse'})> failure cases: index failure_case 0 0 unknown
Define property-based unit tests with hypothesis
from hypothesis import given


@given(RawData.strategy(size=3))
def test_process_data(raw_data):
    # Schema-driven generation: pandera synthesizes valid RawData frames,
    # and the @pa.check_types decorator validates the output schema.
    process_data(raw_data)


@given(ProcessedData.strategy(size=3))
def test_train_model(processed_data):
    estimator = train_model(processed_data)
    predictions = estimator.predict(processed_data.drop("price", axis=1))
    # One prediction per input row.
    assert len(predictions) == processed_data.shape[0]


def run_test_suite():
    """Run both property-based tests; @given-decorated tests take no arguments."""
    test_process_data()
    test_train_model()
    print("✅ tests successful!")


run_test_suite()
✅ tests successful!
Define property-based unit tests with hypothesis
@pa.check_types
def process_data(raw_data: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    # NOTE(slide): deliberately broken — the raw frame is returned unchanged,
    # so the ProcessedData output check fails and hypothesis reports a
    # falsifying example when the test suite reruns.
    return raw_data


try:
    run_test_suite()
except Exception as e:
    print(e)
Falsifying example: test_process_data( raw_data= square_footage n_bedrooms price property_type 0 0 0 0 condo 1 0 0 0 condo 2 0 0 0 condo, ) error in check_types decorator of function 'process_data': column 'property_type_condo' not in dataframe square_footage n_bedrooms price property_type 0 0 0 0 condo 1 0 0 0 condo 2 0 0 0 condo
For some datasets, it might make sense to infer a schema from a sample of data and go from there:
# Parse the CSV sample so a schema can be inferred from observed data.
raw_df = pd.read_csv(StringIO(raw_data.strip()))
display(raw_df.head(3))
square_footage | n_bedrooms | property_type | price | |
---|---|---|---|---|
0 | 750 | 1 | condo | 200000 |
1 | 900 | 2 | condo | 400000 |
2 | 1200 | 2 | house | 500000 |
# Bootstrap a schema from the sample, then refine it by hand.
schema = pa.infer_schema(raw_df)
schema.to_yaml()    # serialize the inferred schema to YAML
schema.to_script()  # emit a python script that defines the schema
print(schema)
<Schema DataFrameSchema( columns={ 'square_footage': <Schema Column(name=square_footage, type=int64)> 'n_bedrooms': <Schema Column(name=n_bedrooms, type=int64)> 'property_type': <Schema Column(name=property_type, type=str)> 'price': <Schema Column(name=price, type=int64)> }, checks=[], coerce=True, pandas_dtype=None, index=<Schema Index(name=None, type=int64)>, strict=False name=None, ordered=False )>
# (pseudocode) Recap: a Categorical statistical type carries its distribution,
# so invalid operations can be rejected statically.
FarmAnimals = Categorical(
    Animal,
    probabilities={
        Animal.CAT: 0.01,
        Animal.DOG: 0.04,
        Animal.COW: 0.95,
    },
    ordered=False,
)

data: FarmAnimals = [Animal.CAT] * 50 + [Animal.DOG] * 50
mean(data) # ❌ cannot apply mean to Categorical


def model(input_data: Gaussian) -> Bernoulli:
    # A model typed by the distributions it maps between.
    ...


# The "type" of such a model would be the family of compatible estimators.
type(model)
# [LogisticRegression, RandomForestClassifier, ...]
Model-based statistical types
In theory, a generative adversarial network can be used as a schema to validate real-world data and generate synthetic data
The discriminator, which is typically discarded after training, can validate real or upstream synthetic data.
category | images | |
---|---|---|
0 | cat | image1.jpeg |
1 | dog | image2.jpeg |
2 | cow | image3.jpeg |
3 | horse | image4.jpeg |
4 | ... | ... |
# (speculative pseudocode) A trained GAN discriminator acting as a validator
# for an image column; Image and GenerativeAdversarialNetwork are hypothetical.
class ImageSchema(pa.SchemaModel):
    category: Series[str] = pa.Field(isin=["cat", "dog", "cow", "horse", "..."])
    images: Series[Image] = pa.Field(drawn_from=GenerativeAdversarialNetwork("weights.pt"))
Statistical typing extends primitive data types into the statistical domain, opening up a bunch of testing capabilities that make statistical code more robust and easier to reason about.
niels.bantilan@gmail.com
@cosmicbboy
cosmicBboy