DataFrame?

dataframe.head()
| person_id | hours_worked | wage_per_hour |
|---|---|---|
| cad24fe | 38.5 | 15.1 |
| fe709bf | 41.25 | 15.0 |
| 4c977a9 | 35.0 | 21.3 |
| d5abdae | 27.75 | 17.5 |
| 4c1c3b8 | 22.25 | 19.5 |
Data validation is the act of falsifying data against explicit assumptions for some downstream purpose, like analysis, modeling, and visualization.
"All swans are white"
/usr/local/miniconda3/envs/pandera-presentations/lib/python3.7/site-packages/pandas/core/ops/__init__.py in masked_arith_op(x, y, op)
445 if mask.any():
446 with np.errstate(all="ignore"):
--> 447 result[mask] = op(xrav[mask], com.values_from_object(yrav[mask]))
448
449 else:
TypeError: can't multiply sequence by non-int of type 'float'
def process_data(df):
    ...

def process_data(df):
    return df.assign(weekly_income=lambda x: x.hours_worked * x.wage_per_hour)

def process_data(df):
    import pdb; pdb.set_trace()  # <- insert breakpoint
    return df.assign(weekly_income=lambda x: x.hours_worked * x.wage_per_hour)
print(df)
           hours_worked  wage_per_hour
person_id
cad24fe            38.5           15.1
fe709bf           41.25           15.0
4c977a9            35.0           21.3
d5abdae           27.75           17.5
4c1c3b8           22.25           19.5
bacdade           -20.5           25.5
df.dtypes
hours_worked      object
wage_per_hour    float64
dtype: object
df.hours_worked.map(type)
person_id
cad24fe    <class 'float'>
fe709bf    <class 'float'>
4c977a9      <class 'str'>
d5abdae    <class 'float'>
4c1c3b8    <class 'float'>
bacdade    <class 'float'>
Name: hours_worked, dtype: object
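So the root cause of the earlier TypeError is a single string value hiding in an otherwise float column: multiplying a str by a float falls back to sequence-repetition semantics, which only accepts an int.

"35.0" * 21.3  # TypeError: can't multiply sequence by non-int of type 'float'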
import numpy as np

def process_data(df):
    return (
        df
        # make sure columns are floats
        .astype({"hours_worked": float, "wage_per_hour": float})
        # replace negative values with nans
        .assign(hours_worked=lambda x: x.hours_worked.where(x.hours_worked >= 0, np.nan))
        # compute weekly income
        .assign(weekly_income=lambda x: x.hours_worked * x.wage_per_hour)
    )
process_data(df)
| person_id | hours_worked | wage_per_hour | weekly_income |
|---|---|---|---|
| cad24fe | 38.50 | 15.1 | 581.350 |
| fe709bf | 41.25 | 15.0 | 618.750 |
| 4c977a9 | 35.00 | 21.3 | 745.500 |
| d5abdae | 27.75 | 17.5 | 485.625 |
| 4c1c3b8 | 22.25 | 19.5 | 433.875 |
| bacdade | NaN | 25.5 | NaN |
@pa.check_types
def process_data(df: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    return (
        # replace negative values with nans
        df.assign(hours_worked=lambda x: x.hours_worked.where(x.hours_worked >= 0, np.nan))
        # compute weekly income
        .assign(weekly_income=lambda x: x.hours_worked * x.wage_per_hour)
    )
You wonder what RawData and ProcessedData are, finding a NOTE that a fellow traveler has left for you.

import pandera as pa
from pandera.typing import DataFrame, Series
# NOTE: this is what's supposed to be in `df` going into `process_data`
class RawData(pa.SchemaModel):
    hours_worked: Series[float] = pa.Field(coerce=True, nullable=True)
    wage_per_hour: Series[float] = pa.Field(coerce=True, nullable=True)

# ... and this is what `process_data` is supposed to return.
class ProcessedData(RawData):
    hours_worked: Series[float] = pa.Field(ge=0, coerce=True, nullable=True)
    weekly_income: Series[float] = pa.Field(nullable=True)

@pa.check_types
def process_data(df: DataFrame[RawData]) -> DataFrame[ProcessedData]:
    ...
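As a sketch of what this contract buys you (the bad input below is hypothetical): a value that can't be coerced to float is rejected at the function boundary with an error that names the offending column, instead of surfacing as a cryptic TypeError deep inside pandas.

import pandas as pd

bad_df = pd.DataFrame({
    "hours_worked": [38.5, "not a number"],
    "wage_per_hour": [15.1, 15.0],
})

try:
    process_data(bad_df)
except pa.errors.SchemaError as exc:
    print(exc)  # points at `hours_worked` and the value that failed coercion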
The better you can reason about the contents of a dataframe, the faster you can debug.

The faster you can debug, the sooner you can focus on downstream tasks that you care about.

By validating data through explicit contracts, you also create data documentation for the rest of your team.
Data validation: The act of falsifying data against explicit assumptions for some downstream purpose, like analysis, modeling, and visualization.
Data Testing: Validating not only real data, but also the functions that produce them.
Defining a schema looks and feels like defining a pandas dataframe
import pandera as pa
clean_data_schema = pa.DataFrameSchema(
    columns={
        "continuous": pa.Column(float, pa.Check.ge(0), nullable=True),
        "categorical": pa.Column(str, pa.Check.isin(["A", "B", "C"]), nullable=True),
    },
    coerce=True,
)
from pandera.typing import DataFrame, Series
class CleanData(pa.SchemaModel):
    continuous: Series[float] = pa.Field(ge=0, nullable=True)
    categorical: Series[str] = pa.Field(isin=["A", "B", "C"], nullable=True)

    class Config:
        coerce = True
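Both flavors validate the same way; a quick sketch with a small, made-up dataframe that satisfies the schema:

import pandas as pd

valid_df = pd.DataFrame({"continuous": [1.0, 2.5], "categorical": ["A", "B"]})

clean_data_schema.validate(valid_df)  # object-based API
CleanData.validate(valid_df)          # class-based API; both return the validated (coerced) dataframe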
Know Exactly What Went Wrong with Your Data
import pandas as pd

raw_data = pd.DataFrame({
    "continuous": ["-1.1", "4.0", "10.25", "-0.1", "5.2"],
    "categorical": ["A", "B", "C", "Z", "X"],
})

try:
    CleanData.validate(raw_data, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases)
| | schema_context | column | check | check_number | failure_case | index |
|---|---|---|---|---|---|---|
| 0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
| 1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
| 2 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | Z | 3 |
| 3 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | X | 4 |
raw_data_schema = pa.DataFrameSchema(
    columns={
        "continuous": pa.Column(float),
        "categorical": pa.Column(str),
    },
    coerce=True,
)

clean_data_schema.update_columns({
    "continuous": {"nullable": True},
    "categorical": {"checks": pa.Check.isin(["A", "B", "C"]), "nullable": True},
});
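Note that update_columns returns a new DataFrameSchema rather than mutating the original, so another way to use it (variable name mine) is to derive the clean schema from the raw one and assign the result:

derived_clean_schema = raw_data_schema.update_columns({
    "continuous": {"checks": pa.Check.ge(0), "nullable": True},
    "categorical": {"checks": pa.Check.isin(["A", "B", "C"]), "nullable": True},
})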
Inherit from pandera.SchemaModel to Define Type Hierarchies
class RawData(pa.SchemaModel):
    continuous: Series[float]
    categorical: Series[str]

    class Config:
        coerce = True

class CleanData(RawData):
    continuous = pa.Field(ge=0, nullable=True)
    categorical = pa.Field(isin=["A", "B", "C"], nullable=True)
Use decorators to add IO checkpoints to the critical functions in your pipeline
@pa.check_types
def fn(raw_data: DataFrame[RawData]) -> DataFrame[CleanData]:
    return raw_data.assign(
        continuous=lambda df: df["continuous"].where(lambda x: x > 0, np.nan),
        categorical=lambda df: df["categorical"].where(lambda x: x.isin(["A", "B", "C"]), np.nan),
    )
fn(raw_data)
| | continuous | categorical |
|---|---|---|
| 0 | NaN | A |
| 1 | 4.00 | B |
| 2 | 10.25 | C |
| 3 | NaN | NaN |
| 4 | 5.20 | NaN |
Schemas that synthesize valid data under their constraints
CleanData.example(size=5)
| | continuous | categorical |
|---|---|---|
| 0 | 0.0 | A |
| 1 | 0.0 | A |
| 2 | 0.0 | A |
| 3 | 0.0 | A |
| 4 | 0.0 | A |
Data Testing: Test the functions that produce clean data
from hypothesis import given

@given(RawData.strategy(size=5))
def test_fn(raw_data):
    fn(raw_data)

def run_test_suite():
    test_fn()
    print("tests passed ✅")

run_test_suite()
tests passed ✅
Pandera supports dask, modin, and pyspark.pandas dataframes to scale data validation to big data.
display(raw_data)
| | continuous | categorical |
|---|---|---|
| 0 | -1.1 | A |
| 1 | 4.0 | B |
| 2 | 10.25 | C |
| 3 | -0.1 | Z |
| 4 | 5.2 | X |
import dask.dataframe as dd

dask_dataframe = dd.from_pandas(raw_data, npartitions=1)

try:
    CleanData(dask_dataframe, lazy=True).compute()
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases.sort_index())
| | schema_context | column | check | check_number | failure_case | index |
|---|---|---|---|---|---|---|
| 0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
| 1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
| 2 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | Z | 3 |
| 3 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | X | 4 |
import modin.pandas as mpd

modin_dataframe = mpd.DataFrame(raw_data)

try:
    CleanData(modin_dataframe, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases.sort_index())
| | schema_context | column | check | check_number | failure_case | index |
|---|---|---|---|---|---|---|
| 0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
| 1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
| 2 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | Z | 3 |
| 3 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | X | 4 |
import pyspark.pandas as ps

pyspark_pd_dataframe = ps.DataFrame(raw_data)

try:
    CleanData(pyspark_pd_dataframe, lazy=True)
except pa.errors.SchemaErrors as exc:
    display(exc.failure_cases.sort_index())
WARNING:root:'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
| | schema_context | column | check | check_number | failure_case | index |
|---|---|---|---|---|---|---|
| 0 | Column | continuous | greater_than_or_equal_to(0) | 0 | -1.1 | 0 |
| 1 | Column | continuous | greater_than_or_equal_to(0) | 0 | -0.1 | 3 |
| 2 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | Z | 3 |
| 3 | Column | categorical | isin({'A', 'C', 'B'}) | 0 | X | 4 |
from typing import Union

Number = Union[int, float]

def add_and_double(x: Number, y: Number) -> Number:
    ...

add_and_double(5, 2)          # ✅ valid inputs
add_and_double(5, "hello")    # ❌ "hello" is not a Number; a static type checker flags this
add_and_double(11.5, -1.5)    # ✅ valid inputs
import pandera as pa
from pandera.typing import DataFrame, Series

class Inputs(pa.SchemaModel):
    x: Series[int]
    y: Series[int]

    class Config:
        coerce = True

class Outputs(Inputs):
    z: Series[int]

    @pa.dataframe_check
    def custom_check(cls, df: DataFrame) -> Series:
        return df["z"] == (df["x"] + df["y"]) * 2

@pa.check_types
def add_and_double(raw_data: DataFrame[Inputs]) -> DataFrame[Outputs]:
    ...
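To make the parallel with the scalar version concrete, here is a minimal sketch of the body implied by the stub above, plus a call that exercises the contract (the input dataframe is made up):

import pandas as pd

@pa.check_types
def add_and_double(raw_data: DataFrame[Inputs]) -> DataFrame[Outputs]:
    return raw_data.assign(z=lambda df: (df["x"] + df["y"]) * 2)

add_and_double(pd.DataFrame({"x": [1, 2], "y": [3, 4]}))  # passes Outputs, including custom_check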
Statistical typing extends primitive data types with additional semantics about the properties held by a collection of data points.
data_point = {"square_footage": 700, "nbedrooms": 1, "price": 500_000}

data_points = [
    {"square_footage": 700, "nbedrooms": 1, "price": 500_000},
    {"square_footage": 1000, "nbedrooms": 2, "price": 750_000},
    {"square_footage": 3000, "nbedrooms": 4, "price": 1_000_000},
    ...
]
Statistical types are defined with multiple layers 🧅 (sketched in pandera terms below):

- primitive data types: int, float, bool, str, etc.
- deterministic properties: domain of possible values, e.g. x >= 0
- probabilistic properties: distributions that apply to the variable and their sufficient statistics, e.g. mean, standard deviation
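As a rough illustration (not from the original slides), the three layers can be expressed on a single pandera column; the class and check names here are made up, and the mean check is just one crude way to encode a probabilistic property:

class StatisticalType(pa.SchemaModel):
    # layers 1 + 2: primitive data type plus a deterministic property (x >= 0)
    x: Series[float] = pa.Field(ge=0)

    # layer 3: a probabilistic property, here a check on the sample mean
    @pa.check("x")
    def mean_is_plausible(cls, x: Series[float]) -> bool:
        return 0 < x.mean() < 1_000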
A dataset containing ~20,000 samples where each row is a California district and each column is an aggregate statistic about that district.
from sklearn.datasets import fetch_california_housing
housing_data = fetch_california_housing(as_frame=True).frame
housing_data.describe()
| | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | 3.870671 | 28.639486 | 5.429000 | 1.096675 | 1425.476744 | 3.070655 | 35.631861 | -119.569704 | 2.068558 |
| std | 1.899822 | 12.585558 | 2.474173 | 0.473911 | 1132.462122 | 10.386050 | 2.135952 | 2.003532 | 1.153956 |
| min | 0.499900 | 1.000000 | 0.846154 | 0.333333 | 3.000000 | 0.692308 | 32.540000 | -124.350000 | 0.149990 |
| 25% | 2.563400 | 18.000000 | 4.440716 | 1.006079 | 787.000000 | 2.429741 | 33.930000 | -121.800000 | 1.196000 |
| 50% | 3.534800 | 29.000000 | 5.229129 | 1.048780 | 1166.000000 | 2.818116 | 34.260000 | -118.490000 | 1.797000 |
| 75% | 4.743250 | 37.000000 | 6.052381 | 1.099526 | 1725.000000 | 3.282261 | 37.710000 | -118.010000 | 2.647250 |
| max | 15.000100 | 52.000000 | 141.909091 | 34.066667 | 35682.000000 | 1243.333333 | 41.950000 | -114.310000 | 5.000010 |
housing_data.head(5)
| | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
class HousingData(pa.SchemaModel):
    # features
    MedInc: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 100})
    HouseAge: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 100})
    AveRooms: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 1_000})
    AveBedrms: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 100})
    Population: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 100_000})
    AveOccup: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 10_000})
    Latitude: Series[float] = pa.Field(in_range={"min_value": -90, "max_value": 90})
    Longitude: Series[float] = pa.Field(in_range={"min_value": -180, "max_value": 180})

    # target variable! 🎯
    MedHouseVal: Series[float] = pa.Field(in_range={"min_value": 0, "max_value": 100})

    class Config:
        coerce = True

@pa.check_types
def read_data() -> DataFrame[HousingData]:
    return fetch_california_housing(as_frame=True).frame

housing_data = read_data()
print("validation passed ✅")
validation passed ✅
Hypothesis: Median income is positively correlated with Median House Value
from IPython.display import Markdown, display

def analyze_data(housing_data, var1, var2):
    correlation_coef = housing_data[[var1, var2]].corr().at[var1, var2]
    display(Markdown(f"Pearson correlation coefficient = {correlation_coef:0.06f}"))
    housing_data.plot.scatter(var1, var2, s=1, alpha=0.5)

analyze_data(housing_data, "MedInc", "MedHouseVal")
Pearson correlation coefficient = 0.688075
Easily create re-usable custom checks
from scipy.stats import pearsonr

import pandera.extensions as extensions

@extensions.register_check_method(
    statistics=["var1", "var2", "alpha"],
    supported_types=[pd.DataFrame],
)
def is_positively_correlated(
    df: pd.DataFrame,
    *,
    var1: str,
    var2: str,
    alpha: float = 0.01,
):
    """Perform a Pearson correlation hypothesis test."""
    r, pvalue = pearsonr(df[var1], df[var2])
    passed = r > 0 and pvalue <= alpha

    pretty_pvalue = np.format_float_scientific(pvalue)
    if passed:
        print(f"✅ {var1} is positively correlated with {var2} with r = {r:0.04f}; pvalue = {pretty_pvalue}")
    else:
        print(f"❌ {var1} is not positively correlated with {var2} with r = {r:0.04f}; pvalue = {pretty_pvalue}")

    return passed
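Once registered, the check method also becomes available under the pa.Check namespace, so the same rule can be attached to an object-based schema; a usage sketch (the schema variable name is mine):

housing_corr_schema = pa.DataFrameSchema(
    checks=pa.Check.is_positively_correlated(var1="MedInc", var2="MedHouseVal", alpha=0.01)
)
housing_corr_schema.validate(housing_data);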
def analyze_data(housing_data, var1: str, var2: str):

    class HousingDataHypothesis(HousingData):
        class Config:
            coerce = True
            is_positively_correlated = {
                "var1": var1,
                "var2": var2,
                "alpha": 0.01,
            }

    housing_data = HousingDataHypothesis.validate(housing_data)
    correlation_coef = housing_data[[var1, var2]].corr().at[var1, var2]
    display(Markdown(f"Pearson correlation coefficient = {correlation_coef:0.06f}"))
    housing_data.plot.scatter(var1, var2, s=1, alpha=0.5)

analyze_data(housing_data, "MedInc", "MedHouseVal")
✅ MedInc is positively correlated with MedHouseVal with r = 0.6881; pvalue = 0.e+00
Pearson correlation coefficient = 0.688075
Every time this runs, pandera makes sure all the assumptions encoded in the schemas hold true.
def run_analysis_pipeline(var1: str, var2: str):
    data = read_data()
    analyze_data(data, var1, var2)

run_analysis_pipeline("MedInc", "MedHouseVal")
✅ MedInc is positively correlated with MedHouseVal with r = 0.6881; pvalue = 0.e+00
Pearson correlation coefficient = 0.688075
From all the features, predict the median house value target MedHouseVal.
from typing import Tuple

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# ⚠️ This is the most critical part to check
@pa.check_types
def split_data(
    data: DataFrame[HousingData],
    test_size: float = 0.2,
) -> Tuple[DataFrame[HousingData], DataFrame[HousingData]]:
    return train_test_split(data, test_size=test_size)

# 👉 Notice that I don't use @pa.check_types here
def parse_data(data: DataFrame[HousingData], target: str) -> Tuple[DataFrame[HousingData], pd.Series]:
    features = [column for column in data if column != target]
    return data[features], data[target]

# 🔽 From this point onward the type annotations are for type linters like mypy
def train(features: pd.DataFrame, target: pd.Series) -> LinearRegression:
    model = LinearRegression()
    return model.fit(features, target)

def evaluate(model: LinearRegression, features: pd.DataFrame, target: pd.Series) -> float:
    prediction = model.predict(features)
    return r2_score(target, prediction)

def run_training_pipeline(data: pd.DataFrame, target: str):
    train_data, test_data = split_data(data)
    train_features, train_target = parse_data(train_data, target)

    # train a model
    model = train(train_features, train_target)

    # evaluate
    train_r2 = evaluate(model, train_features, train_target)
    test_r2 = evaluate(model, *parse_data(test_data, target))

    return model, train_r2, test_r2

model, train_r2, test_r2 = run_training_pipeline(read_data(), "MedHouseVal")
print(f"🏋️♂️ Train R^2 score: {train_r2:0.6f}")
print(f"📝 Test R^2 score: {test_r2:0.6f}")
model
🏋️♂️ Train R^2 score: 0.610440
📝 Test R^2 score: 0.588109
LinearRegression()
Synthesize mock training data so that you don't have to hand-craft dataframes 🤯
from hypothesis import settings

prediction_schema = pa.SeriesSchema(
    float,
    # in-line custom checks
    pa.Check(lambda s: (s >= 0).mean() > 0.05, name="predictions are mostly positive"),
    nullable=False,
)

@given(HousingData.strategy(size=20))
@settings(max_examples=3)
def test_run_training_pipeline(data):
    target = "MedHouseVal"
    model, *_ = run_training_pipeline(data, target)
    features, _ = parse_data(data, target)
    predictions = pd.Series(model.predict(features))
    # validate predictions
    prediction_schema(predictions)

def run_test_suite():
    test_run_training_pipeline()
    print("✅ training pipeline test suite passed!")

run_test_suite()
✅ training pipeline test suite passed!
Integrate pandera into your workflow

🧠 → 📝 Encode the domain knowledge that you build up during the development and exploration process into schemas.
🥾✨ If you're in a hurry, use pandera.infer_schema to bootstrap a schema and refine it over time (see the sketch after this list).
❗️ Identify the critical functions in your data processing pipeline and add @pa.check_types decorators as checkpoints.
🔩 Codify data quality checks that are specific to your problem domain by creating reusable custom validation rules via @pandera.extensions.register_check_method.
🔄 Reuse schemas for runtime validation or test-time validation.
🤩 Be more confident in the correctness of your analysis/model with programmatically enforced, self-documenting code.
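A minimal sketch of the schema-bootstrapping step mentioned above (the dataframe is illustrative, not from the talk): infer a draft schema from data you trust, emit it as code, then refine the checks by hand.

import pandas as pd
import pandera as pa

trusted_df = pd.DataFrame({"continuous": [1.0, 2.5, 3.0], "categorical": ["A", "B", "C"]})

draft_schema = pa.infer_schema(trusted_df)
print(draft_schema.to_script())  # emits a DataFrameSchema definition you can paste into your codebase and edit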
On the roadmap: support for xarray, jsonschema, cudf, pyarrow, and an extension API for arbitrary data containers. Pandera also integrates with fastapi, pydantic, and pytest.