Chaining in Pandas
Pandas chaining is an alternative to variable assignment when transforming data. Those in favor of chaining argue that the code is easier to read because it lays out the execution of the transformation like a recipe. In this post we will explore transforming the Titanic dataset to prepare it for machine learning using both chaining and variable assignment.
- Extracting Data
- Transformation #1
- Transformation #2
- Transformation #3
- Transformation #4
- Transformation #5
- Transformation Final
- Performance
- Summary
import pandas as pd
import numpy as np
# Download the Titanic sample data once; `raw_df` is kept untouched so that
# every example below can start again from the same input.
raw_df = pd.read_csv(
    'https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv'
)
# Work on a copy and take a first look at the rows and the column dtypes.
df = raw_df.copy()
df.head()
df.info()
# Transformation #1, variable-assignment style: every step rebinds or
# mutates `df`.
df = raw_df.copy()
# Index the rows by passenger id instead of the default positional index.
df = df.set_index('PassengerId')
# Store the two label-like columns with the categorical dtype.
df['Sex'] = df['Sex'].astype('category')
df['Cabin'] = df['Cabin'].astype('category')
The first big difference in chaining is that we need to wrap our transformation in `()`. The next difference is using a new method called `assign` to create new columns or change existing ones.
# Transformation #1, chained style.
df = raw_df.copy()
df = (
    df
    .set_index('PassengerId')
    # Pass callables to `assign` so each column is computed from the frame as
    # it exists at this point in the chain (indexed by PassengerId).
    # Referring to the outer `df` here would supply values that still carry
    # the original 0-based index; assignment aligns on index labels, so every
    # value would land on the wrong passenger.
    .assign(Sex=lambda d: d.Sex.astype('category'),
            Cabin=lambda d: d.Cabin.astype('category'))
)
df.head()
# Transformation #2, variable-assignment style: adds a `Title` column.
df = raw_df.copy()
df = df.set_index('PassengerId')
df['Sex'] = df.Sex.astype('category')
df['Cabin'] = df.Cabin.astype('category')
## NEW SECTION ##
# Extract the first run of letters that is immediately followed by a period
# (e.g. "Mr" in "Braund, Mr. Owen").  The pattern is a raw string so that
# `\.` reaches the regex engine unchanged instead of being treated as an
# invalid string escape; `expand=False` returns a Series rather than a
# one-column DataFrame.
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Collapse the extracted titles into a small set of labels.  Conditions are
# checked in order and anything that matches none of them becomes 'Rare'.
# NOTE(review): 'Miss' and 'Master' are not listed in any condition, so they
# also end up as 'Rare' - confirm that this is intended.
df['Title'] = np.select(
    condlist=(df.Title.isin(['Mlle', 'Ms']),
              df.Title.isin(['Mme', 'Mrs']),
              df.Title.isin(['Mr'])),
    choicelist=('Miss', 'Mrs', 'Mr'),
    default='Rare')
We are still using `assign` to create the new columns, but there are a few items of note. We are unable to use the same column name twice within a single `assign` call, so we need to add a second `assign` call. Also, because `Title` does not exist in the original dataframe, we have to reference the current object with a `lambda`.
# Transformation #2, chained style: adds a `Title` column.
df = raw_df.copy()
df = (
    df
    .set_index('PassengerId')
    # Every value is a callable so that it is computed from the frame at this
    # step of the chain (indexed by PassengerId).  Values taken from the outer
    # `df` would still carry the 0-based index and be aligned onto the wrong
    # passengers.
    .assign(Sex=lambda d: d.Sex.astype('category'),
            Cabin=lambda d: d.Cabin.astype('category'),
            ## NEW SECTION ##
            # Raw string keeps `\.` intact for the regex engine;
            # expand=False yields a Series instead of a one-column DataFrame.
            Title=lambda d: d.Name.str.extract(r'([A-Za-z]+)\.', expand=False))
    # A keyword can appear only once per call, so re-mapping `Title` needs a
    # second `assign`.  Unmatched titles become 'Rare'.
    .assign(Title=lambda d:
            np.select(
                condlist=(d.Title.isin(['Mlle', 'Ms']),
                          d.Title.isin(['Mme', 'Mrs']),
                          d.Title.isin(['Mr'])),
                choicelist=('Miss', 'Mrs', 'Mr'),
                default='Rare'))
)
df.head()
# Transformation #3, variable-assignment style: bins Age and Fare and drops
# the columns that are no longer needed.
df = raw_df.copy()
df = df.set_index('PassengerId')
df['Sex'] = df.Sex.astype('category')
df['Cabin'] = df.Cabin.astype('category')
# Raw string keeps `\.` intact for the regex engine (a plain string treats it
# as an invalid escape); expand=False returns a Series.
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Conditions are checked in order; titles matching none of them become 'Rare'.
df['Title'] = np.select(
    condlist=(df.Title.isin(['Mlle', 'Ms']),
              df.Title.isin(['Mme', 'Mrs']),
              df.Title.isin(['Mr'])),
    choicelist=('Miss', 'Mrs', 'Mr'),
    default='Rare')
## NEW SECTION ##
# Bucket the numeric columns into labelled ranges.
df['Age_bin'] = pd.cut(
    df['Age'],
    bins=[0, 12, 20, 40, 120],
    labels=['Children', 'Teenage', 'Adult', 'Elder'])
df['Fare_bin'] = pd.cut(
    df['Fare'],
    bins=[0, 7.91, 14.45, 31, 120],
    labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
df['Age_bin'] = df.Age_bin.astype('category')
df['Fare_bin'] = df.Fare_bin.astype('category')
# The binned and extracted columns replace the raw ones.
df = df.drop(['Age', 'Fare', 'Name', 'Ticket'], axis=1)
We continue to use `assign` to create our columns. Note how we have to introduce a third `assign` call to convert `Age_bin` and `Fare_bin` to the category dtype, since those columns are already assigned in the previous `assign`.
# Transformation #3, chained style: bins Age and Fare and drops the columns
# that are no longer needed.
df = raw_df.copy()
df = (
    df
    .set_index('PassengerId')
    # All values are callables so they are evaluated against the frame at
    # this step of the chain (indexed by PassengerId).  Values built from the
    # outer `df` keep its 0-based index and would be aligned onto the wrong
    # passengers.
    .assign(
        # Kept so the result matches the variable-assignment version.
        Sex=lambda d: d.Sex.astype('category'),
        Cabin=lambda d: d.Cabin.astype('category'),
        # Raw string keeps `\.` intact for the regex engine; expand=False
        # yields a Series instead of a one-column DataFrame.
        Title=lambda d: d.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
    )
    .assign(Title=lambda d:
            np.select(
                condlist=(d.Title.isin(['Mlle', 'Ms']),
                          d.Title.isin(['Mme', 'Mrs']),
                          d.Title.isin(['Mr'])),
                choicelist=('Miss', 'Mrs', 'Mr'),
                default='Rare'),
            ## NEW SECTION ##
            Age_bin=lambda d: pd.cut(
                d['Age'],
                bins=[0, 12, 20, 40, 120],
                labels=['Children', 'Teenage', 'Adult', 'Elder']),
            Fare_bin=lambda d: pd.cut(
                d['Fare'],
                bins=[0, 7.91, 14.45, 31, 120],
                labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
            )
    # Same column names again, so the dtype conversion needs its own call.
    .assign(
        Age_bin=lambda d: d.Age_bin.astype('category'),
        Fare_bin=lambda d: d.Fare_bin.astype('category')
    )
    .drop(['Age', 'Fare', 'Name', 'Ticket'], axis=1)
)
df.head()
# Transformation #4, variable-assignment style: additionally drops Cabin and
# keeps only the rows where Survived is 0.
df = raw_df.copy()
df = df.set_index('PassengerId')
df['Sex'] = df.Sex.astype('category')
# Raw string keeps `\.` intact for the regex engine (a plain string treats it
# as an invalid escape); expand=False returns a Series.
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Conditions are checked in order; titles matching none of them become 'Rare'.
df['Title'] = np.select(
    condlist=(df.Title.isin(['Mlle', 'Ms']),
              df.Title.isin(['Mme', 'Mrs']),
              df.Title.isin(['Mr'])),
    choicelist=('Miss', 'Mrs', 'Mr'),
    default='Rare')
df['Age_bin'] = pd.cut(
    df['Age'],
    bins=[0, 12, 20, 40, 120],
    labels=['Children', 'Teenage', 'Adult', 'Elder'])
df['Fare_bin'] = pd.cut(
    df['Fare'],
    bins=[0, 7.91, 14.45, 31, 120],
    labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
df['Age_bin'] = df.Age_bin.astype('category')
df['Fare_bin'] = df.Fare_bin.astype('category')
# Cabin is removed here, so it is not converted to a category beforehand.
df = df.drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
## NEW SECTION ##
df = df[df.Survived.eq(0)]  # or df.Survived == 0
# Transformation #4, chained style: additionally drops Cabin and keeps only
# the rows where Survived is 0.
df = raw_df.copy()
df = (
    df
    .set_index('PassengerId')
    # All values are callables so they are evaluated against the frame at
    # this step of the chain (indexed by PassengerId).  Values built from the
    # outer `df` keep its 0-based index and would be aligned onto the wrong
    # passengers.
    .assign(
        # Kept so the result matches the variable-assignment version.
        Sex=lambda d: d.Sex.astype('category'),
        # Raw string keeps `\.` intact for the regex engine; expand=False
        # yields a Series instead of a one-column DataFrame.
        Title=lambda d: d.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
    )
    .assign(Title=lambda d:
            np.select(
                condlist=(d.Title.isin(['Mlle', 'Ms']),
                          d.Title.isin(['Mme', 'Mrs']),
                          d.Title.isin(['Mr'])),
                choicelist=('Miss', 'Mrs', 'Mr'),
                default='Rare'),
            Age_bin=lambda d: pd.cut(
                d['Age'],
                bins=[0, 12, 20, 40, 120],
                labels=['Children', 'Teenage', 'Adult', 'Elder']),
            Fare_bin=lambda d: pd.cut(
                d['Fare'],
                bins=[0, 7.91, 14.45, 31, 120],
                labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
            )
    .assign(
        Age_bin=lambda d: d.Age_bin.astype('category'),
        Fare_bin=lambda d: d.Fare_bin.astype('category')
    )
    .drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
    ## NEW SECTION ##
    .query('Survived == 0')  # .loc/iloc is also an option
)
df.head()
# Transformation #5, variable-assignment style: one-hot encodes the
# categorical columns as the final step.
df = raw_df.copy()
df = df.set_index('PassengerId')
# Raw string keeps `\.` intact for the regex engine (a plain string treats it
# as an invalid escape); expand=False returns a Series.
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Conditions are checked in order; titles matching none of them become 'Rare'.
df['Title'] = np.select(
    condlist=(df.Title.isin(['Mlle', 'Ms']),
              df.Title.isin(['Mme', 'Mrs']),
              df.Title.isin(['Mr'])),
    choicelist=('Miss', 'Mrs', 'Mr'),
    default='Rare')
df['Age_bin'] = pd.cut(
    df['Age'],
    bins=[0, 12, 20, 40, 120],
    labels=['Children', 'Teenage', 'Adult', 'Elder'])
df['Fare_bin'] = pd.cut(
    df['Fare'],
    bins=[0, 7.91, 14.45, 31, 120],
    labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
df['Age_bin'] = df.Age_bin.astype('category')
df['Fare_bin'] = df.Fare_bin.astype('category')
# Cabin is removed here, so it is not converted to a category beforehand.
df = df.drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
df = df[df.Survived.eq(0)]  # or df.Survived == 0
## NEW SECTION ##
# Replace each listed column with one indicator column per value.
df = pd.get_dummies(
    df, columns=["Sex", "Title", "Age_bin", "Embarked", "Fare_bin"])
The `pipe` method is used when you have a function that takes your dataframe as an argument, such as `pd.get_dummies`.
# Final transformation, chained style: one-hot encodes the categorical
# columns as the last step.
df = raw_df.copy()
df = (
    df
    .set_index('PassengerId')
    # All values are callables so they are evaluated against the frame at
    # this step of the chain (indexed by PassengerId).  Values built from the
    # outer `df` keep its 0-based index and would be aligned onto the wrong
    # passengers.
    .assign(
        # Raw string keeps `\.` intact for the regex engine; expand=False
        # yields a Series instead of a one-column DataFrame.
        Title=lambda d: d.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
    )
    .assign(Title=lambda d:
            np.select(
                condlist=(d.Title.isin(['Mlle', 'Ms']),
                          d.Title.isin(['Mme', 'Mrs']),
                          d.Title.isin(['Mr'])),
                choicelist=('Miss', 'Mrs', 'Mr'),
                default='Rare'),
            Age_bin=lambda d: pd.cut(
                d['Age'],
                bins=[0, 12, 20, 40, 120],
                labels=['Children', 'Teenage', 'Adult', 'Elder']),
            Fare_bin=lambda d: pd.cut(
                d['Fare'],
                bins=[0, 7.91, 14.45, 31, 120],
                labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
            )
    .assign(
        Age_bin=lambda d: d.Age_bin.astype('category'),
        Fare_bin=lambda d: d.Fare_bin.astype('category')
    )
    .drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
    .query('Survived == 0')
    ## NEW SECTION ##
    # `pipe` passes the current frame as the first argument of the function.
    .pipe(pd.get_dummies,
          columns=["Sex", "Title", "Age_bin", "Embarked", "Fare_bin"])
)
df.head()
def variable_transform(raw_df):
    """Prepare the Titanic frame using step-by-step variable assignment.

    The input is copied first, so ``raw_df`` itself is not modified.

    Args:
        raw_df: DataFrame providing the columns read below: PassengerId,
            Survived, Name, Sex, Age, Fare, Ticket, Cabin and Embarked.

    Returns:
        A DataFrame indexed by PassengerId that holds only the rows where
        Survived is 0, with Sex, Title, Age_bin, Embarked and Fare_bin
        replaced by indicator columns and Age, Fare, Name, Ticket and Cabin
        removed.
    """
    df = raw_df.copy()
    df = df.set_index('PassengerId')

    # First run of letters immediately followed by a period, e.g. "Mr".
    # The pattern is a raw string so `\.` reaches the regex engine unchanged
    # instead of being treated as an invalid string escape; expand=False
    # returns a Series rather than a one-column DataFrame.
    df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Conditions are checked in order; anything unmatched becomes 'Rare'.
    # NOTE(review): 'Miss' and 'Master' are not listed in any condition, so
    # they also end up as 'Rare' - confirm that this is intended.
    df['Title'] = np.select(
        condlist=(df.Title.isin(['Mlle', 'Ms']),
                  df.Title.isin(['Mme', 'Mrs']),
                  df.Title.isin(['Mr'])),
        choicelist=('Miss', 'Mrs', 'Mr'),
        default='Rare')

    # NOTE(review): values outside the outer bin edges (for example a fare of
    # 0 or above 120) receive no bin - confirm that this is acceptable.
    df['Age_bin'] = pd.cut(
        df['Age'],
        bins=[0, 12, 20, 40, 120],
        labels=['Children', 'Teenage', 'Adult', 'Elder'])
    df['Fare_bin'] = pd.cut(
        df['Fare'],
        bins=[0, 7.91, 14.45, 31, 120],
        labels=['Low_fare', 'median_fare', 'Average_fare', 'high_fare'])
    df['Age_bin'] = df.Age_bin.astype('category')
    df['Fare_bin'] = df.Fare_bin.astype('category')

    # Cabin is dropped here, so it is not cast to a category beforehand.
    df = df.drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
    df = df[df.Survived.eq(0)]  # or df.Survived == 0
    df = pd.get_dummies(
        df, columns=["Sex", "Title", "Age_bin", "Embarked", "Fare_bin"])
    return df
def chain_transform(raw_df):
    """Prepare the Titanic frame using a single method chain.

    The input is copied first, so ``raw_df`` itself is not modified.

    Args:
        raw_df: DataFrame providing the columns read below: PassengerId,
            Survived, Name, Sex, Age, Fare, Ticket, Cabin and Embarked.

    Returns:
        A DataFrame indexed by PassengerId that holds only the rows where
        Survived is 0, with Sex, Title, Age_bin, Embarked and Fare_bin
        replaced by indicator columns and Age, Fare, Name, Ticket and Cabin
        removed.
    """
    df = raw_df.copy()
    df = (
        df
        .set_index('PassengerId')
        # Every new column is given as a callable so it is computed from the
        # frame at that step of the chain (indexed by PassengerId).  Values
        # built from the outer `df` keep its original index; assignment
        # aligns on index labels, which would attach each value to the
        # wrong passenger.
        .assign(
            # Raw string keeps `\.` intact for the regex engine;
            # expand=False yields a Series, not a one-column DataFrame.
            Title=lambda d: d['Name'].str.extract(
                r'([A-Za-z]+)\.', expand=False),
        )
        .assign(
            # Conditions are checked in order; unmatched titles -> 'Rare'.
            # NOTE(review): 'Miss' and 'Master' are not listed in any
            # condition, so they also end up as 'Rare' - confirm intent.
            Title=lambda d: np.select(
                condlist=(d.Title.isin(['Mlle', 'Ms']),
                          d.Title.isin(['Mme', 'Mrs']),
                          d.Title.isin(['Mr'])),
                choicelist=('Miss', 'Mrs', 'Mr'),
                default='Rare'),
            Age_bin=lambda d: pd.cut(
                d['Age'],
                bins=[0, 12, 20, 40, 120],
                labels=['Children', 'Teenage', 'Adult', 'Elder']),
            Fare_bin=lambda d: pd.cut(
                d['Fare'],
                bins=[0, 7.91, 14.45, 31, 120],
                labels=['Low_fare', 'median_fare', 'Average_fare',
                        'high_fare']),
        )
        # A keyword may appear only once per `assign`, hence a further call.
        .assign(
            Age_bin=lambda d: d.Age_bin.astype('category'),
            Fare_bin=lambda d: d.Fare_bin.astype('category'),
        )
        .drop(['Age', 'Fare', 'Name', 'Ticket', 'Cabin'], axis=1)
        .query('Survived == 0')
        # `pipe` passes the current frame as the function's first argument.
        .pipe(pd.get_dummies,
              columns=["Sex", "Title", "Age_bin", "Embarked", "Fare_bin"])
    )
    return df
import timeit
def wrapper(func, *args, **kwargs):
    """Return a zero-argument callable that runs ``func(*args, **kwargs)``.

    Used to hand a function together with its arguments to ``timeit``.
    Adapted from https://www.pythoncentral.io/time-a-python-function/

    Args:
        func: The callable to invoke.
        *args: Positional arguments forwarded to ``func`` on every call.
        **kwargs: Keyword arguments forwarded to ``func`` on every call.

    Returns:
        A callable taking no arguments that returns whatever ``func`` returns.
    """
    return lambda: func(*args, **kwargs)
# Bind each transform to the raw data so that timeit can call it with no
# arguments.
variable = wrapper(variable_transform, raw_df)
chain = wrapper(chain_transform, raw_df)
# Time 100 runs of each approach, variable assignment first, and report the
# total elapsed seconds.
for label, timed in (('Variables', variable), ('Chaining', chain)):
    print(f'{label} took {timeit.timeit(timed, number=100):.2f} seconds')