Source code for autopandas.generators.copula

# Copula generator

from scipy.stats import gaussian_kde, norm
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KernelDensity
import numpy as np
import autopandas

def vector_to_rank(x, reverse=False):
    """Return the dense rank (starting at 1) of every element of ``x``.

    Equal values share a single rank and distinct values receive
    consecutive ranks, e.g. ``[10, 30, 30]`` -> ``[1, 2, 2]``.

    :param x: One-dimensional sequence or array of sortable values.
    :param reverse: If True, ranks are assigned in descending order, so the
                    largest value gets rank 1.
    :return: List of integer ranks, aligned element by element with ``x``.
    """
    values = np.asarray(x)
    # ``inverse[k]`` is the position of ``values[k]`` among the sorted
    # distinct values, i.e. a 0-based dense rank. Deriving both orderings
    # from it keeps tied values consistent; the previous dict-based lookup
    # let ties overwrite each other when ``reverse`` was set.
    distinct, inverse = np.unique(values, return_inverse=True)
    if reverse:
        ranks = len(distinct) - inverse
    else:
        ranks = inverse + 1
    return list(ranks)
def matrix_to_rank(X):
    """Replace every column of a 2-D matrix by the ranks of its values.

    The input is left untouched: the ranks are written into a copy made
    with ``np.copy``, so the result keeps the dtype of that copy.

    :param X: Two-dimensional array-like, one feature per column.
    :return: Array of the same shape as ``X`` holding per-column ranks as
             computed by ``vector_to_rank``.
    """
    ranked = np.copy(X)
    n_columns = ranked.shape[1]
    for j in range(n_columns):
        # The right-hand side is fully computed before the column is overwritten.
        ranked[:, j] = vector_to_rank(ranked[:, j])
    return ranked
def rank_vector_to_inverse(x):
    """Map a vector of ranks to standard normal quantiles.

    The ranks are first divided by ``max(rank) + 1``, which keeps every
    scaled value strictly below 1, and then passed through the inverse CDF
    of the standard normal distribution (``norm.ppf``).

    :param x: Array of ranks; it must support ``.max()`` and element-wise
              division (e.g. a numpy array).
    :return: Array of normal quantiles with the same shape as ``x``.
    """
    scale = x.max() + 1
    return norm.ppf(x / scale)
def rank_matrix_to_inverse(X):
    """Map every column of a rank matrix to standard normal quantiles.

    Each column is transformed independently with
    ``rank_vector_to_inverse``.

    :param X: Two-dimensional array-like of ranks, one feature per column.
    :return: Float array of the same shape as ``X`` containing the normal
             quantiles of the ranks.
    """
    ranks = np.asarray(X)
    # Write into a float buffer. Copying ``X`` would inherit its dtype, and
    # an integer rank matrix would silently truncate the quantiles to whole
    # numbers on assignment.
    gaussian = np.empty(ranks.shape, dtype=float)
    for i, column in enumerate(ranks.T):
        gaussian[:, i] = rank_vector_to_inverse(column)
    return gaussian
def marginal_retrofit(Xartif, Xreal):
    """Retrofit the marginal distributions of the features in Xartif to those in Xreal.

    For every column, the value of rank k in ``Xartif`` is replaced by the
    value of rank k in the corresponding column of ``Xreal``. The result
    keeps the row ordering (rank structure) of the artificial data while
    using only values observed in the real data.

    When the two matrices have a different number of rows, whole rows of
    ``Xreal`` are drawn at random (through the global ``np.random`` state)
    until it has as many rows as ``Xartif``: with replacement when more rows
    are needed than available, without replacement otherwise.

    Neither input is modified.

    :param Xartif: Two-dimensional array of artificial samples.
    :param Xreal: Two-dimensional array of real samples with the same
                  number of columns as ``Xartif``.
    :return: Float array with the shape of ``Xartif``.
    :raises AssertionError: If the number of columns differs.
    """
    Xartif = np.asarray(Xartif)
    Xreal = np.asarray(Xreal)
    pa, n = Xartif.shape
    pr, nr = Xreal.shape
    assert n == nr, "Xartif and Xreal must have the same number of columns"

    # Adjust the number of rows of Xreal to match Xartif.
    if pa != pr:
        # Oversampling (pa > pr) needs replacement; subsampling does not.
        rows = np.random.choice(pr, size=pa, replace=pa > pr)
        Xreal = Xreal[rows]

    Xretro = np.zeros(Xartif.shape)
    for i in range(n):
        # Row indices of the artificial column, from smallest to largest value.
        order = np.argsort(Xartif[:, i])
        # Give the k-th smallest artificial entry the k-th smallest real value.
        # Writing into Xretro leaves the caller's Xartif untouched.
        Xretro[order, i] = np.sort(Xreal[:, i])
    return Xretro
def copula_generate(X, generator=None, n=None):
    """Generate artificial data from X using the copula trick.

    The columns of ``X`` are converted to ranks and then to normal
    quantiles, a generative model is fitted on that representation and
    sampled, and the samples are finally given the marginal distributions
    of the original data again.

    :param X: Data to imitate. It must expose ``indexes``, ``columns`` and
              ``shape`` and be convertible with ``np.array`` (presumably an
              ``autopandas.AutoData`` -- confirm against callers).
    :param generator: Model to fit and sample from; it must provide
                      ``fit(X)`` and ``sample(n)``. KDE by default.
    :param n: Number of examples to generate. By default it is the number
              of observations in X.
    :return: ``autopandas.AutoData`` holding the generated examples, with
             the ``indexes`` and ``columns`` of ``X``.
    """
    # Remember the metadata that has to be put back on the result.
    saved_indexes, saved_columns = X.indexes, X.columns
    model = KernelDensity() if generator is None else generator
    n_samples = X.shape[0] if n is None else n
    real_values = np.array(X)

    # Marginals -> ranks -> inverse Gaussian CDF.
    gaussianized = rank_matrix_to_inverse(matrix_to_rank(X))

    # Fit the model on the transformed data and draw artificial examples.
    model.fit(gaussianized)
    synthetic = model.sample(n_samples)

    # Give the artificial examples the marginals of the real data.
    result = autopandas.AutoData(marginal_retrofit(synthetic, real_values))
    result.indexes = saved_indexes
    result.columns = saved_columns
    return result
class Copula():
    """Generator that produces artificial data with the copula trick."""

    def __init__(self):
        """Create an untrained generator; ``data`` is None until ``fit`` is called."""
        self.data = None

    def fit(self, data):
        """Train the generator by running ``copula_generate`` on ``data``.

        The generated data set is stored in ``self.data``.

        :param data: Data frame to use as training set.
        """
        self.data = copula_generate(data)

    def sample(self, n=1, replace=False):
        """Draw examples from the data produced by ``fit``.

        :param n: Number of examples to sample.
        :param replace: If True, sample with replacement.
        :return: Whatever ``self.data.sample(n, replace=replace)`` returns.
        :raises Exception: If ``fit`` has not been called yet.
        """
        if self.data is not None:
            return self.data.sample(n, replace=replace)
        raise Exception('You firstly need to train the copula generator before sampling. Please use fit method.')