Source code for autopandas.utils.encoding

# Functions for encoding

# Imports
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import bisect
import copy
# cat2vec
#from gensim.models.word2vec import Word2Vec
from random import shuffle

def none(data, column):
    """ Remove column from data.
    """
    data.drop([column], axis=1, inplace=True)
    return data
def label(data, column):
    """ Performs label encoding.

        Example:
          Color: ['blue', 'green', 'blue', 'pink']
          is encoded by
          Color: [0, 1, 0, 2]

        :param data: Data
        :param column: Column to encode
        :return: Encoded data
        :rtype: pd.DataFrame
    """
    data[column] = data[column].astype('category').cat.codes
    return data
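# A minimal usage sketch for label(); the toy DataFrame below is illustrative,
# not part of the library:
#
#   df = pd.DataFrame({'Color': ['blue', 'green', 'blue', 'pink']})
#   df = label(df, 'Color')
#   print(df['Color'].tolist())  # [0, 1, 0, 2] -- codes follow alphabetical category order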
def one_hot(data, column, rare=False, coeff=0.1):
    """ Performs one-hot encoding.

        Example:
          Color: ['black', 'white', 'white']
          is encoded by
          Color_black: [1, 0, 0]
          Color_white: [0, 1, 1]

        :param data: Data
        :param column: Column to encode
        :param rare: If True, rare categories are merged into one
        :param coeff: Coefficient defining rare values. A category is rare if it occurs
                      less than (average number of occurrences * coeff) times.
        :return: Encoded data
        :rtype: pd.DataFrame
    """
    # Rare values management
    if rare:
        average = len(data[column]) / len(data[column].unique())  # train/test bias?
        threshold = np.ceil(average * coeff)
        # Rows whose category occurs fewer than `threshold` times are relabeled
        data.loc[data[column].value_counts()[data[column]].values < threshold, column] = 'RARE_VALUE'
    # Usual one-hot encoding
    data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
    data.drop([column], axis=1, inplace=True)
    return data
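# Illustrative sketch for one_hot() (made-up data; the dummy columns hold 0/1
# integers or booleans depending on the pandas version):
#
#   df = pd.DataFrame({'Color': ['black', 'white', 'white']})
#   df = one_hot(df, 'Color')
#   # df now has columns Color_black = [1, 0, 0] and Color_white = [0, 1, 1]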
def likelihood(data, column, mapping=None, return_param=False):
    """ Performs likelihood encoding.

        Each category is replaced by the mean, over its rows, of the projection
        of the numerical variables onto their first principal component.

        :param data: Data
        :param column: Column to encode
        :param mapping: Dictionary {category: value}
        :param return_param: If True, the mapping is returned
        :return: Encoded data
        :rtype: pd.DataFrame
    """
    numericals = data.indexes['numerical']  # numerical variables
    categories = data[column].unique()
    if mapping is None:
        mapping = dict()
        try:
            # TODO: NOT OPTIMIZED: PCA will be computed for every variable
            pca = PCA()
            principal_axe = pca.fit(data[numericals].values).components_[0, :]  # First principal component
            pc1 = (principal_axe * data[numericals]).sum(axis=1)  # Projection onto the first component
        except Exception:
            raise Exception('No numerical columns found, cannot apply likelihood encoding.')
        for category in categories:
            mapping[category] = np.mean(pc1[data[column] == category])
    else:
        for category in categories:
            if category not in mapping:
                mapping[category] = 0
    data[column] = data[column].map(mapping)
    if return_param:
        return data, mapping
    return data
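# Sketch of likelihood() usage. It assumes an autopandas data object whose
# indexes['numerical'] entry lists the numerical columns (a plain DataFrame
# lacks this attribute); `ad` and the 'city' column are hypothetical:
#
#   ad, mapping = likelihood(ad, 'city', return_param=True)
#   # the fitted mapping can then be reused on held-out data:
#   ad_test = likelihood(ad_test, 'city', mapping=mapping)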
def count(data, column, mapping=None, probability=False, return_param=False):
    """ Performs frequency encoding.

        Categories are replaced by their number of occurrences
        (or by their empirical probability if probability=True).

        :param data: Data
        :param column: Column to encode
        :param mapping: Dictionary {category: value}
        :param probability: If True, use probabilities instead of raw counts
        :param return_param: If True, the mapping is returned
        :return: Encoded data
        :rtype: pd.DataFrame
    """
    categories = data[column].unique()
    if mapping is None:
        mapping = dict()
        for e in data[column]:
            if e in mapping:
                mapping[e] += 1
            else:
                mapping[e] = 1
    else:
        for category in categories:
            if category not in mapping:
                mapping[category] = 0  # TODO
    if probability:
        factor = 1.0 / sum(mapping.values())
        for k in mapping:
            mapping[k] = float(format(mapping[k] * factor, '.3f'))
    data[column] = data[column].map(mapping)
    if return_param:
        return data, mapping
    return data
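# Minimal sketch for count() on made-up data:
#
#   df = pd.DataFrame({'Animal': ['cat', 'dog', 'cat', 'cat']})
#   df = count(df, 'Animal')
#   print(df['Animal'].tolist())  # [3, 1, 3, 3]
#   # with probability=True the same column would read [0.75, 0.25, 0.75, 0.75]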
def target(data, column, target, mapping=None, return_param=False):
    """ Performs target encoding.

        Each category is replaced by the mean of the target variable over
        the rows belonging to that category.

        :param data: Data
        :param column: Column to encode
        :param target: Target column name
        :param mapping: Dictionary {category: value}
        :param return_param: If True, the mapping is returned
        :return: Encoded data
        :rtype: pd.DataFrame
    """
    target_values = data[target]
    categories = data[column].unique()
    if mapping is None:
        mapping = dict()
        for category in categories:
            mapping[category] = np.mean(target_values[data[column] == category]).round(3)
    else:
        for category in categories:
            if category not in mapping:
                mapping[category] = 0
    data[column] = data[column].map(mapping)
    if return_param:
        return data, mapping
    return data
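# Illustrative sketch for target() (toy data, not from the library):
#
#   df = pd.DataFrame({'City': ['a', 'b', 'a'], 'Price': [10.0, 40.0, 30.0]})
#   df = target(df, 'City', 'Price')
#   print(df['City'].tolist())  # [20.0, 40.0, 20.0] -- per-category means of Price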
def frequency(columns, probability=False):
    r""" /!\ Warning: takes only column(s), not a DataFrame /!\

        Frequency encoding: Pandas series to frequency/probability distribution.
        If there are several series, the outputs share the same format
        (same categories, in the same order).

        Example:
          C1: ['b', 'a', 'a', 'b', 'b']
          C2: ['b', 'b', 'b', 'c', 'b']
          f1: {'a': 2, 'b': 3, 'c': 0}
          f2: {'a': 0, 'b': 4, 'c': 1}
          Output: [[2, 3, 0], [0, 4, 1]] (with probability=False)

        :param probability: True for probabilities, False for frequencies.
        :return: Frequency/probability distribution.
        :rtype: list
    """
    # TODO: error if several columns have the same header
    # If there is only one column, just return frequencies
    if not isinstance(columns[0], (list, np.ndarray, pd.Series)):
        return columns.value_counts(normalize=probability).values
    frequencies = []
    # Compute frequencies for each column
    for column in columns:
        f = dict()
        for e in column:
            if e in f:
                f[e] += 1
            else:
                f[e] = 1
        frequencies.append(f)
    # Add keys from other columns in every dictionary with a frequency of 0
    # We want the same format
    for i, f in enumerate(frequencies):
        for k in f.keys():
            for other_f in frequencies[:i] + frequencies[i+1:]:
                if k not in other_f:
                    other_f[k] = 0
    # Convert to frequency/probability distributions, reading every dictionary
    # in the same key order (the dicts hold the same keys but in different
    # insertion orders, so a shared ordering is needed)
    keys = sorted(frequencies[0].keys(), key=str)
    res = []
    for f in frequencies:
        values = [f[k] for k in keys]
        if probability:
            # Normalize between 0 and 1 with a sum of 1
            values = normalize(values)
        res.append(values)
    return res
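# Sketch of frequency() on the docstring example (plain lists work for the
# multi-column case, a pd.Series for the single-column case):
#
#   c1 = ['b', 'a', 'a', 'b', 'b']
#   c2 = ['b', 'b', 'b', 'c', 'b']
#   frequency([c1, c2])                    # [[2, 3, 0], [0, 4, 1]]
#   frequency([c1, c2], probability=True)  # [[0.4, 0.6, 0.0], [0.0, 0.8, 0.2]]
#   frequency(pd.Series(c1))               # array([3, 2]), sorted by count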
# Deep category embedding
#def cat2vec(data, size=6, window=8, verbose=True):
#    """ TODO. Based on Yonatan Hadar's implementation.
#    """
#    x_w2v = copy.deepcopy(data)
#    names = list(x_w2v.columns.values)
#    for i in names:
#        x_w2v[i] = x_w2v[i].astype('category')
#        x_w2v[i].cat.categories = ["Feature %s %s" % (i, g) for g in x_w2v[i].cat.categories]
#    x_w2v = x_w2v.values.tolist()
#    for i in x_w2v:
#        shuffle(i)
#    w2v = Word2Vec(x_w2v, size=size, window=window)
#    data_w2v = copy.copy(data)
#    for i in names:
#        data_w2v[i] = data_w2v[i].astype('category')
#        data_w2v[i].cat.categories = ["Feature %s %s" % (i, g) for g in data_w2v[i].cat.categories]
#    data_w2v = data_w2v.values
#    x_w2v_train = np.random.random((len(data_w2v), size * data_w2v.shape[1]))
#    for j in range(data_w2v.shape[1]):
#        for i in range(data_w2v.shape[0]):
#            if data_w2v[i, j] in w2v:
#                x_w2v_train[i, j*size:(j+1)*size] = w2v[data_w2v[i, j]]
#    return pd.DataFrame(x_w2v_train)
def normalize(l, normalization='probability'):
    """ Return a normalized list.

        :param normalization: 'probability': values between 0 and 1 with a sum equal to 1,
                              or 'min-max': the min becomes 0 and the max becomes 1.
    """
    if normalization == 'probability':
        return [float(i) / sum(l) for i in l]
    elif normalization == 'min-max':
        return [(float(i) - min(l)) / (max(l) - min(l)) for i in l]
    else:  # mean/std?
        raise ValueError('Argument normalization is invalid.')
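# Quick sketch of normalize() behavior:
#
#   normalize([2, 3, 5])                           # [0.2, 0.3, 0.5]
#   normalize([2, 3, 5], normalization='min-max')  # [0.0, 0.333..., 1.0]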