# Source code for autopandas.generators.anm

# Additive Noise Model

# Imports
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

class ANM():
    """ Additive Noise Model data generator.

        One predictive model is fitted per column (each column is predicted
        from all the other columns). New rows are produced by copying rows of
        the training data and replacing some of their cells by model
        predictions, optionally perturbed by Gaussian noise.

        The training data must expose ``data.indexes['numerical']`` (the
        collection of numerical column names) on top of the DataFrame API.
    """

    def __init__(self, model=None):
        """ Data generator using multiple imputations with random forest (or another model).

            :param model: Model used for imputations. If None, a
                          RandomForestRegressor is used for numerical columns
                          and a RandomForestClassifier for the other ones.
                          A custom model is used for both kinds of columns.
        """
        # One fitted model per column, in column order (filled by fit)
        self.models = []
        if model is None: # Default Random Forest
            self.regressor = RandomForestRegressor()
            self.classifier = RandomForestClassifier()
        else: # Custom model
            self.regressor = model
            self.classifier = model
        # Store data to be able to sample from original data
        self.data = None
        # For noise behaviour (filled by fit(noise=True))
        # predicted_matrix[r, i]: prediction of column i for the r-th row (positional)
        self.predicted_matrix = None
        # var_vector[i]: mean squared residual of column i
        self.var_vector = None

    @staticmethod
    def _first_value(prediction):
        """ Return the first element of an ndarray prediction, or the prediction itself otherwise. """
        if isinstance(prediction, np.ndarray):
            return prediction[0]
        return prediction

    def fit(self, data, noise=False):
        """ Fit one random forest (or another model) for each column, given the others.

            Any state left by a previous call to fit is discarded.

            :param data: Training data; must provide ``data.indexes['numerical']``
            :param noise: If True, also compute the prediction matrix and the
                          per-column mean squared residual, which are required
                          by ``sample(noise=True)``
        """
        self.data = data
        # Reset previous state: without this, a second fit would append its
        # models after the stale ones and sample would keep using the old ones.
        self.models = []
        self.predicted_matrix = None
        self.var_vector = None
        if noise:
            predicted = np.zeros(data.shape)
            residuals = np.zeros(data.shape)
        numerical = data.indexes['numerical']
        for i, column in enumerate(data.columns):
            # May bug with duplicate names in columns
            y = data[column]
            X = data.drop(column, axis=1)
            # Regressor or classifier
            model = clone(self.regressor if column in numerical else self.classifier)
            # Fit one predictive model for each variable
            model.fit(X, y)
            self.models.append(model)
            # NOISE BEHAVIOUR
            if noise:
                # One batch prediction per column; rows are addressed by
                # position so that any kind of index is supported.
                predicted[:, i] = np.ravel(model.predict(X))
                residuals[:, i] = (predicted[:, i] - np.asarray(y)) ** 2
        if noise:
            self.predicted_matrix = predicted
            self.var_vector = np.mean(residuals, axis=0)

    def partial_fit_generate(self, n=1, p=0.8, replace=True, noise=False):
        """ Fit and generate for high dimensional case.
            To avoid memory error, features are trained and generated one by one.
            The models are fitted on the sampled rows, not on the whole data.

            :param n: Number of examples to sample
            :param p: The probability of changing a value
                        if p=0, the generated dataset will be equals to the original
                        if p=1, the generated dataset will contains only new values
            :param replace: If True, sample the original data with replacement before the imputations
            :param noise: If True, add noise relative to the residual matrix. NOT IMPLEMENTED (not possible?)

            :return: Generated data, indexed by the labels of the sampled rows
            :rtype: pd.DataFrame
            :raises Exception: If noise is True, or if no data has been stored yet
        """
        if noise:
            raise Exception('noise argument is not compatible with partial_fit_generate. Please use fit and then sample methods.')
        if self.data is None:
            raise Exception('The ANM generator has no data to sample from. Please use fit method first.')
        sampled = self.data.sample(n=n, replace=replace)
        numerical = sampled.indexes['numerical']
        # Sampling with replacement duplicates index labels; work on a 0..n-1
        # index so that every sampled row is read and written individually.
        data = sampled.reset_index(drop=True)
        gen_data = data.copy()
        # Features are trained and generated one by one
        for column in data.columns:
            # May bug with duplicate names in columns
            Y = data[column]
            X = data.drop(column, axis=1)
            # Regressor or classifier (the same estimator is refitted for each column)
            model = self.regressor if column in numerical else self.classifier
            # FIT
            model.fit(X, Y)
            # GENERATE
            for r in range(len(data)): # Loop over rows
                if np.random.random() < p:
                    prediction = model.predict(X.iloc[[r]])
                    gen_data.at[r, column] = self._first_value(prediction)
        # Give back the labels of the sampled rows, as callers received before
        gen_data.index = sampled.index
        return gen_data

    def sample(self, n=1, p=0.8, replace=True, noise=False):
        """ Generate n rows by copying data and then do values imputations.

            :param n: Number of examples to sample
            :param p: The probability of changing a value
                        if p=0, the generated dataset will be equals to the original
                        if p=1, the generated dataset will contains only new values
            :param replace: If True, sample the original data with replacement before the imputations
            :param noise: If True, add noise relative to the residual matrix

            :return: Generated data, with a fresh 0..n-1 index
            :rtype: pd.DataFrame
            :raises Exception: If fit has not been called, or if noise is True
                               but fit was not called with noise=True
        """
        if self.data is None:
            raise Exception('The ANM generator needs to be trained before you can sample from it. Please use fit method.')
        if noise and self.var_vector is None:
            raise Exception('You must call fit method with noise=True before calling sample method with noise=True.')
        data = self.data
        # Sample on a positional index: the labels of the sampled frame are
        # then the row positions in the training data, which is what
        # predicted_matrix is indexed by, whatever the original index is.
        source = data.reset_index(drop=True)
        sampled = source.sample(n=n, replace=replace)
        positions = sampled.index.values
        # Unique labels: a row sampled several times gets independent imputations
        gen_data = sampled.reset_index(drop=True)
        columns = list(data.columns)
        # Loop over examples
        for r, pos in enumerate(positions):
            if noise:
                predicted_row = self.predicted_matrix[pos, :]
            # Loop over features
            for i, column in enumerate(columns):
                if np.random.random() >= p: # keep the value with probability 1 - p
                    continue
                if noise:
                    # NOISE BEHAVIOUR: stored prediction + Gaussian noise scaled by the column residual
                    value = predicted_row[i] + np.random.normal(loc=0, scale=np.sqrt(self.var_vector[i]))
                else:
                    # CLASSICAL BEHAVIOUR: predict from the original (unmodified) row
                    row = source.iloc[[pos]].drop(column, axis=1)
                    value = self._first_value(self.models[i].predict(row))
                gen_data.at[r, column] = value
        return gen_data