Source code for autopandas.utils.reduction

# Dimensionality reduction functions

from warnings import warn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction import FeatureHasher
import pandas as pd
import autopandas
import numpy as np
import matplotlib.pyplot as plt  # used by pca() when verbose=True

def pca(data, key=None, return_param=False, verbose=False, model=None, **kwargs):
    """ Compute Principal Components Analysis.
        Use kwargs for additional PCA parameters (cf. sklearn doc).

        :param key: Indexes key to select data.
        :param return_param: If True, returns a tuple (X, pca) to store PCA parameters and apply them later.
        :param model: Use this argument to pass an already trained PCA model.
        :param verbose: Display additional information during run.
        :rtype: autopandas.AutoData
        :return: Transformed data
    """
    X = data.get_data(key)
    pca = model
    if model is None:
        # initialize and fit PCA
        pca = PCA(**kwargs)
        pca.fit(X)
    X = pca.transform(X)
    if verbose:
        print('Explained variance ratio of the {} components:\n {}'.format(
            pca.n_components_, pca.explained_variance_ratio_))
        # x positions are passed positionally ('left' keyword no longer exists in plt.bar)
        plt.bar(range(pca.n_components_), pca.explained_variance_ratio_,
                width=0.3, tick_label=range(pca.n_components_))
        plt.title('Explained variance ratio by principal component')
        plt.show()
    if return_param:
        return X, pca
    return X
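# Usage sketch (illustrative, not part of the original module). `ad_train` and
# `ad_test` are assumed to be autopandas.AutoData objects; extra kwargs are
# forwarded to sklearn's PCA.
#
#   reduced_train, fitted = pca(ad_train, n_components=2, return_param=True)
#   reduced_test = pca(ad_test, model=fitted)  # project new data with the same fitted PCA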
def tsne(data, key=None, verbose=False, **kwargs):
    """ Compute T-SNE.
        Use kwargs for additional T-SNE parameters (cf. sklearn doc).

        Parameters
        ----------
        key : Indexes key to select data.
        verbose : Display additional information during run.

        Returns
        -------
        AutoData
            Transformed data
    """
    tsne = TSNE(**kwargs)
    X = tsne.fit_transform(data.get_data(key))
    return X
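# Usage sketch (illustrative). kwargs are forwarded to sklearn's TSNE, e.g.:
#
#   embedding = tsne(ad, n_components=2, perplexity=30)  # `ad`: assumed AutoData object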
def lda(data, key=None, verbose=False, **kwargs):
    """ Compute Linear Discriminant Analysis.
        Use kwargs for additional LDA parameters (cf. sklearn doc).

        Parameters
        ----------
        key : Indexes key to select data.
        verbose : Display additional information during run.

        Returns
        -------
        AutoData
            Transformed data
    """
    lda = LinearDiscriminantAnalysis(**kwargs)
    X = data.get_data('X').get_data(key)
    y = data.get_data('y').get_data(key)
    if y.shape[1] > 1:
        warn("LDA can't handle multi-output targets. Only the first column will be used.\n"
             "Use the set_class method to define another target before calling lda.")
        y = y[y.columns[0]]
    X = lda.fit_transform(X, np.array(y).ravel())  # ravel to avoid warnings
    return X
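# Usage sketch (illustrative). LDA is supervised, so `ad` (assumed AutoData
# object) must expose both 'X' and 'y' sets, e.g. after defining a target with
# the set_class method mentioned in the warning above.
#
#   projected = lda(ad)  # kwargs (e.g. n_components <= n_classes - 1) go to sklearn's LDA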
def feature_hashing(data, key=None, n_features=10, **kwargs):
    """ Feature hashing.

        :param key: Indexes key to select data.
        :param n_features: Desired number of features after feature hashing.
    """
    data = data.get_data(key)
    h = FeatureHasher(n_features=n_features, **kwargs)
    data = data.to_dict('records')
    data = h.transform(data)
    data = data.toarray()       # sparse matrix to array
    data = pd.DataFrame(data)   # array to DataFrame
    return data
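# Usage sketch (illustrative). Note that, unlike the functions above, the
# result comes back as a plain pandas.DataFrame rather than an AutoData object.
#
#   hashed = feature_hashing(ad, n_features=10)  # `ad`: assumed AutoData object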