Here is my target encoder.
Unfortunately, it seems that `fit` is only called with `X` (no `y` is passed), so this will not work.
import numpy as np
import pandas as pd
import sklearn
class TargetEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Smoothed target (mean) encoder for a single ``pandas.Series``.

    Every category is replaced by a blend of that category's target mean and
    the global target mean (the *prior*).  The blend weight is a sigmoid of
    the category count::

        weight  = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoded = prior * (1 - weight) + category_mean * weight

    so the larger a category's count, the less the prior contributes.

    Parameters
    ----------
    min_samples_leaf : int, default=1
        Category count at which the category mean and the prior are weighted
        equally (the sigmoid's midpoint).
    smoothing : float, default=1
        Divisor applied to ``count - min_samples_leaf`` inside the sigmoid.
    noise_level : float, default=0
        Stored on the instance only.
        NOTE(review): no method of this class reads it, so it currently has
        no effect on the encoded output.

    Attributes
    ----------
    dict_averages : dict
        Maps each fitted column name to a single-column ``DataFrame`` of
        encoded values, indexed by category.
    dict_priors : dict
        Maps each fitted column name to the global target mean seen in
        ``fit``.
    y_col : hashable or None
        Name of the target passed to the most recent ``fit`` call.
    """

    def __init__(self, min_samples_leaf=1, smoothing=1, noise_level=0):
        # Fitted state is keyed by column name, so one instance can be fitted
        # on several columns in turn.
        self.dict_averages = {}
        self.dict_priors = {}
        self.min_samples_leaf = min_samples_leaf
        self.smoothing = smoothing
        self.noise_level = noise_level

    def fit(self, X, y=None):
        """Learn the smoothed per-category target means for the column ``X``.

        Parameters
        ----------
        X : pandas.Series
            Categorical feature; its ``name`` is the key under which the
            fitted statistics are stored.
        y : pandas.Series or array-like
            Target values.  A ``Series`` is aligned with ``X`` by index; any
            other array-like is aligned with ``X`` by position.

        Returns
        -------
        TargetEncoder
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``y`` is not supplied.
        """
        if y is None:
            # An explicit exception survives ``python -O``; ``assert`` does not.
            raise ValueError(
                "TargetEncoder.fit requires the target `y`, but got y=None."
            )
        if not isinstance(y, pd.Series):
            # Plain arrays carry no index, so line them up with X by position.
            y = pd.Series(np.asarray(y), index=X.index)

        self.y_col = y.name
        col = X.name

        # Group the target by the category values directly instead of going
        # through column names, so unnamed or identically named X / y work.
        stats = y.groupby(X).agg(["mean", "count"])

        # Sigmoid weight: the bigger the count, the less the prior matters.
        weight = 1 / (
            1 + np.exp(-(stats["count"] - self.min_samples_leaf) / self.smoothing)
        )
        prior = y.mean()
        encoded = prior * (1 - weight) + stats["mean"] * weight

        value_name = y.name if y.name is not None else "average"
        self.dict_averages[col] = encoded.to_frame(name=value_name)
        self.dict_priors[col] = prior
        return self

    def transform(self, X):
        """Replace each category in ``X`` with its fitted encoded value.

        Categories that were not seen in ``fit`` (including missing values)
        receive the prior stored for this column.

        Parameters
        ----------
        X : pandas.Series
            Column to encode; ``X.name`` selects the fitted statistics.

        Returns
        -------
        pandas.Series
            Encoded values carrying the same index and name as ``X``.

        Raises
        ------
        KeyError
            If no statistics were fitted for a column named ``X.name``.
        """
        col = X.name
        if col not in self.dict_averages:
            raise KeyError(
                f"TargetEncoder has not been fitted on a column named {col!r}"
            )

        # Read the encoded values positionally: the stored column is named
        # after whichever target was used when *this* column was fitted, which
        # need not match the target of the most recent fit().
        fitted = self.dict_averages[col].iloc[:, 0]
        lookup = pd.DataFrame(
            {"__category__": fitted.index, "__encoded__": fitted.to_numpy()}
        )

        # Private column names keep the merge independent of X.name.
        merged = pd.merge(
            X.to_frame("__category__"), lookup, on="__category__", how="left"
        )
        result = merged["__encoded__"].rename(col).fillna(self.dict_priors[col])

        # pd.merge does not keep the left index, so restore it.
        result.index = X.index
        return result
# Module-level encoder instance; the keyword values are the constructor defaults.
processor = TargetEncoder(min_samples_leaf=1, smoothing=1, noise_level=0)