DonaldRauscher.com

A Blog About D4T4 & M47H

High Cardinality Categoricals with Sklearn

18 December ’17

I used a Bayesian approach to encode high cardinality categorical variables in a Kaggle competition a few months back. My original implementation was in R, but since I have recently been doing most of my modeling in sklearn, I decided to implement the approach there as well.

This approach lends itself very well to the sklearn framework! The fit method uses maximum likelihood estimation (MLE) to estimate the a and b parameters of the beta prior and calculates descriptive statistics for each level. The transform method calculates posterior probabilities for what is assumed to be out-of-sample data. And the fit_transform method runs fit, then calculates leave-one-out posterior probabilities for the training data, optionally applying multiplicative noise to deter overfitting in tree-based models.
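Concretely, if a level has k positive outcomes in n observations, the encoded value is the posterior mean of a Beta(a, b) prior updated with those counts; the leave-one-out variant simply drops the current observation's target y from the counts. This is exactly what transform_one and transform_one_loo compute below:

$$\hat{p} = \frac{k + a}{n + a + b} \qquad\qquad \hat{p}_{loo} = \frac{(k - y) + a}{(n - 1) + a + b}$$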

You can see this method implemented in a familiar friend: my hospital readmission model. Cheers!

import pandas as pd
import numpy as np

from scipy.optimize import minimize
from scipy.special import beta

from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted, column_or_1d
from sklearn.utils.multiclass import type_of_target

# beta-binomial marginal likelihood (the binomial coefficient is omitted;
# it is constant in a and b, so it doesn't affect the MLE)
@np.vectorize
def dbetabinom(a, b, k, n):
    # scale counts down to n <= 100 (preserving k/n) to avoid overflow in beta
    n2 = np.clip(n, 0, 100)
    k2 = round(k * n2 / n)
    return beta(k2 + a, n2 - k2 + b) / beta(a, b)

# beta-binomial negative log-likelihood, summed over levels
def betabinom_ll(par, arg):
    return np.sum(-np.log(dbetabinom(par[0], par[1], arg[0], arg[1])))

# default params for MLE
mle_param_defaults = dict(method = "L-BFGS-B", x0 = [1,1], bounds = [(0.5, 500), (0.5, 500)])

# encodes single high cardinality categorical variable
class SingleHCCEncoder(BaseEstimator):

    def __init__(self, add_noise = True, noise_sd = 0.05, mle_params = mle_param_defaults):
        self.add_noise = add_noise
        self.noise_sd = noise_sd
        self.mle_params = mle_params
        # fitted attributes (a, b, df, df_dict) are set in fit, not here,
        # so that check_is_fitted can detect an unfitted encoder

    # calibrate a and b of the beta prior via MLE over each level's (k, n)
    def fit_beta(self):
        check_is_fitted(self, 'df')
        k, n = self.df.k, self.df.n
        mle = minimize(fun = betabinom_ll, args = ((k, n),), **self.mle_params)
        self.a, self.b = mle.x

    # descriptive stats for each level: k = positives, n = observations, p = k / n
    def fit_df(self, x, y):
        df = pd.DataFrame(data = dict(x = x, y = y))
        df = df.groupby(['x']).agg(['sum', 'count', 'mean'])
        df.columns = ['k', 'n', 'p']
        self.df = df
        self.df_dict = df.to_dict(orient = "index")

    # leave-one-out posterior mean, excluding the current observation's y
    # (np.vectorize doesn't bind methods, so self is passed explicitly at the call sites)
    @np.vectorize
    def transform_one_loo(self, x, y):
        xval = self.df_dict.get(x, dict(k = 0, n = 0))
        return (xval['k'] + self.a - y) * 1.0 / (xval['n'] + self.a + self.b - 1)

    # posterior mean; unseen levels fall back to the prior mean a / (a + b)
    @np.vectorize
    def transform_one(self, x):
        xval = self.df_dict.get(x, dict(k = 0, n = 0))
        return (xval['k'] + self.a) * 1.0 / (xval['n'] + self.a + self.b)

    def fit(self, x, y):
        assert(type_of_target(y) == "binary")
        x = column_or_1d(x)
        y = column_or_1d(y)
        self.fit_df(x, y)
        self.fit_beta()
        return self

    def fit_transform(self, x, y):
        self.fit(x, y)
        # multiplicative noise centered at 1 deters overfitting in tree-based models
        if self.add_noise:
            noise = self.noise_sd * np.random.randn(len(x)) + 1
        else:
            noise = np.ones(len(x))
        return self.transform_one_loo(self, x, y) * noise

    def transform(self, x):
        check_is_fitted(self, 'df_dict')
        x = column_or_1d(x)
        return self.transform_one(self, x)

# encodes multiple high cardinality categorical variables
class HCCEncoder(BaseEstimator):

    def __init__(self, columns, hcc_encode_params = {}, seed = 1):
        self.columns = columns
        self.hcc_encode_params = hcc_encode_params
        self.seed = seed
        self.labellers = {}

    def fit(self, df, y):
        for c in self.columns:
            hcc_encode_params = self.hcc_encode_params.get(c, {})
            labeller = SingleHCCEncoder(**hcc_encode_params)
            labeller.fit(df[c], y)
            self.labellers[c] = labeller
        return self

    def fit_transform(self, df, y):
        np.random.seed(self.seed)
        df = df.copy()
        for c in self.columns:
            hcc_encode_params = self.hcc_encode_params.get(c, {})
            labeller = SingleHCCEncoder(**hcc_encode_params)
            df[c] = labeller.fit_transform(df[c], y)
            self.labellers[c] = labeller
        return df

    def transform(self, df):
        df = df.copy()
        for c in self.columns:
            df[c] = self.labellers[c].transform(df[c])
        return df
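For completeness, here's a minimal usage sketch on made-up data (the zip_code column, its levels, and the target are all hypothetical):

# hypothetical example: one high cardinality categorical and a binary target
np.random.seed(0)
df = pd.DataFrame({'zip_code': np.random.choice([str(z) for z in range(90000, 90050)], size = 1000)})
y = np.random.binomial(1, 0.3, size = 1000)

encoder = HCCEncoder(columns = ['zip_code'])
df_train = encoder.fit_transform(df, y)  # leave-one-out posterior means + noise
df_test = encoder.transform(df)          # plain posterior means, for out-of-sample data
print(df_train['zip_code'].head())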