Source code for autopandas.utils.metric

# Distance metric functions

import numpy as np
import scipy as sp
import random
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.stats import ks_2samp
from sklearn.utils import resample, shuffle
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import itertools
#from .nn_adversarial_accuracy import NearestNeighborMetrics
#from .nnaa import nnaa

# Between Points/Columns (1D)
#############################
# A lot of functions are still in utilities.py (not imported module)
# TODO: Find distances that work well with binary/categorical data

def distance(x, y, axis=None, norm='euclidean', method=None):
    """ Compute the distance between x and y (data points).

        Default behaviour: flatten multi-dimensional arrays.

        :param x: First point: a scalar, a list or a numpy array.
        :param y: Second point, of the same Python type as x.
        :param axis: Forwarded to ``numpy.linalg.norm``. The difference vector
                     is always 1-D at that point, so only None, 0 or -1 are
                     meaningful.
        :param norm: 'l0', 'manhattan' (or 'l1'), 'euclidean' (or 'l2'),
                     'minimum' or 'maximum'.
        :param method: Alias for norm parameter; takes precedence when given.
        :return: Distance value
        :rtype: float
        :raises Exception: If x and y do not have the same type.
        :raises ValueError: If the norm name is not recognised.
    """
    if type(x) != type(y):
        raise Exception('x type is {} and y type is {}. Please pass two arguments with the same type.'.format(type(x), type(y)))
    if method is not None: # Alias
        norm = method
    if isinstance(x, (np.ndarray, list)):
        # Python lists do not support subtraction, so they are converted to
        # arrays first; every array-like input is flattened to 1-D.
        z = np.asarray(x).ravel() - np.asarray(y).ravel()
    else:
        # x and y are single values
        z = [x - y]
    # Map each accepted norm name to the `ord` argument of numpy.linalg.norm.
    orders = {
        'manhattan': 1,
        'l1': 1,
        'euclidean': 2,
        'l2': 2,
        'minimum': -np.inf,
        'maximum': np.inf,
        'l0': 0,
    }
    if not isinstance(norm, str) or norm not in orders:
        raise ValueError('Unknwon norm: {}.'.format(norm))
    return np.linalg.norm(z, ord=orders[norm], axis=axis)

def acc_stat(solution, prediction):
    """ Return accuracy statistics TN, FP, TP, FN.

        Assumes that solution and prediction are binary 0/1 vectors.

        :param solution: Array-like of ground-truth 0/1 labels.
        :param prediction: Array-like of predicted 0/1 labels, same shape.
        :return: Tuple (TN, FP, TP, FN). Counts are summed along the first
                 axis, so 2-D inputs give one count per column.
        :rtype: tuple
    """
    # Convert up front so plain Python lists are accepted as well as arrays.
    solution = np.asarray(solution)
    prediction = np.asarray(prediction)
    negative_truth = 1 - solution
    negative_pred = 1 - prediction
    # Each product is 1 exactly where both conditions hold; summing counts them.
    TN = np.sum(negative_truth * negative_pred, axis=0)
    FN = np.sum(solution * negative_pred, axis=0)
    TP = np.sum(solution * prediction, axis=0)
    FP = np.sum(negative_truth * prediction, axis=0)
    return (TN, FP, TP, FN)

def bac_metric(solution, prediction):
    """ Compute the balanced accuracy for binary classification.

        The balanced accuracy is the mean of the true positive rate and the
        true negative rate. Counts are clipped to a tiny epsilon to avoid a
        division by zero; as a consequence a class without any sample gets a
        rate of 1.

        :param solution: Binary 0/1 ground-truth labels.
        :param prediction: Binary 0/1 predicted labels.
        :return: Balanced accuracy.
    """
    tn, fp, tp, fn = acc_stat(solution, prediction)
    # Bounding to avoid division by 0
    eps = 1e-15
    # numpy.maximum replaces the NumPy alias formerly reached through the
    # scipy root namespace (scipy.maximum), which SciPy has deprecated.
    tp = np.maximum(eps, tp)
    pos_num = np.maximum(eps, tp + fn)
    tpr = tp / pos_num # true positive rate (sensitivity)
    tn = np.maximum(eps, tn)
    neg_num = np.maximum(eps, tn + fp)
    tnr = tn / neg_num # true negative rate (specificity)
    return 0.5 * (tpr + tnr)

# Between Distributions (2D)
############################

def nn_discrepancy(X1, X2):
    """ Use 1 nearest neighbor method to determine discrepancy between X1 and X2.

        Every point is labelled with the label of its leave-one-out nearest
        neighbor in the pooled data. If X1 and X2 are very different, it is
        easy to classify them thus bac > 0.5. Otherwise, if they are similar,
        bac ~ 0.5.

        :param X1: Array of samples (one row per sample), labelled 1.
        :param X2: Array of samples (one row per sample), labelled 0.
        :return: max(0, 2 * bac - 1), i.e. 0 when the sets are
                 indistinguishable and up to 1 when perfectly separable.
    """
    n1 = X1.shape[0]
    n2 = X2.shape[0]
    X = np.concatenate((X1, X2))
    Y = np.concatenate((np.ones(n1), np.zeros(n2)))
    X, Y = shuffle(X, Y)
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='brute').fit(X)
    # Calling kneighbors() without a query makes scikit-learn exclude each
    # point from its own neighbors. Picking "the second neighbor" of a query
    # on X instead is wrong when duplicated points exist, because the point
    # itself is then not guaranteed to come first.
    _, indices = nbrs.kneighbors()
    Ypred = Y[indices[:, 0]]
    return max(0, 2 * bac_metric(Y, Ypred) - 1)

def discriminant(data1, data2, model=None, metric=None, name1='Dataset 1', name2='Dataset 2', same_size=False, verbose=False):
    """ Return the score of a classifier trained to differentiate data1 and data2.

        If the distributions are similar and the model can't distinguish then the score will be ~ 0.5 (depending on the metric of course).

        :param data1: First dataset (labelled 0). Must provide has_split(),
                      train_test_split() and get_data('train'/'test'); the
                      returned splits must expose .shape, .values and
                      .sample(n=...) (e.g. a pandas DataFrame).
        :param data2: Second dataset (labelled 1), same interface as data1.
        :param model: The classifier. It has to have fit(X, y) and predict(X) methods. Logistic regression by default.
        :param metric: The scoring metric, called as metric(y_true, y_pred). Accuracy by default.
        :param name1: Display name of data1 in the verbose classification report.
        :param name2: Display name of data2 in the verbose classification report.
        :param same_size: If True, down-sample the larger training set so that both training sets have the same number of rows.
        :param verbose: If True, print the model, a classification report (precision, recall, f1-score) and the metric.
        :return: Value of metric computed on the test set.
    """
    if model is None:
        model = LogisticRegression()
    if metric is None:
        metric = accuracy_score
    # check if train/test split already exists or do it
    for data in (data1, data2):
        if not data.has_split():
            data.train_test_split()
    ds1_train, ds1_test = data1.get_data('train'), data1.get_data('test')
    ds2_train, ds2_test = data2.get_data('train'), data2.get_data('test')
    if same_size:
        # Down-sample whichever training set is larger to the size of the
        # smaller one, so that both classes are equally represented.
        n_common = min(ds1_train.shape[0], ds2_train.shape[0])
        if ds1_train.shape[0] > n_common:
            ds1_train = ds1_train.sample(n=n_common)
        if ds2_train.shape[0] > n_common:
            ds2_train = ds2_train.sample(n=n_common)
    # Train set: rows of data1 are labelled 0, rows of data2 are labelled 1
    X1_train, X2_train = list(ds1_train.values), list(ds2_train.values)
    X_train = X1_train + X2_train
    y_train = [0] * len(X1_train) + [1] * len(X2_train)
    # Shuffle samples and labels together
    combined = list(zip(X_train, y_train))
    random.shuffle(combined)
    if combined:
        X_train, y_train = (list(part) for part in zip(*combined))
    # Test set
    X1_test, X2_test = list(ds1_test.values), list(ds2_test.values)
    X_test = X1_test + X2_test
    y_test = [0] * len(X1_test) + [1] * len(X2_test)
    # Training
    model.fit(X_train, y_train)
    # Predict once and reuse the predictions for the report and the score
    y_pred = model.predict(X_test)
    # If verbose, more information
    if verbose:
        # classification_report expects the ground truth first
        report = classification_report(y_test, y_pred, target_names=[name1, name2])
        print(str(model))
        print(report)
        print('Metric: {}'.format(metric))
    # Scoring
    return metric(y_test, y_pred)

def distance_matrix(data1, data2, distance_func=None):
    """ Build the matrix of pairwise distances between two sets of points.

        Entry (i, j) holds distance_func(data1[i], data2[j]). The two inputs
        may have different lengths.
        TODO: parallelization.

        :param data1: Distribution (indexable, with a length).
        :param data2: Distribution (indexable, with a length).
        :param distance_func: Distance metric function to use to compare data points. Euclidean distance by default.
        :return: Array of shape (len(data1), len(data2)).
    """
    # fall back on the module-level point distance when none is supplied
    metric = distance if distance_func is None else distance_func
    n_rows, n_cols = len(data1), len(data2)
    pairwise = np.empty((n_rows, n_cols))
    # visit every (i, j) cell in row-major order
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        pairwise[i, j] = metric(data1[i], data2[j])
    return pairwise

def nnaa(data_s, data_t, distance_func=None, detailed_results=False):
    """ Compute nearest neighbors adversarial accuracy between data_s and data_t.

        For each point, the distance to its nearest neighbor in the other set
        is compared with the distance to its nearest neighbor in its own set
        (itself excluded). The score is the average, over both sets, of the
        proportion of points whose nearest neighbor lies in their own set.
        It can also be seen as the binary classification score of a 1NN trying to tell if a point is from data_s or data_t, in a leave one out setting.
        If data_s and data_t follow the same distribution, the score should be near 0.5:
        * nnaa > 0.5 -> underfitting
        * nnaa ~ 0.5 -> cool
        * nnaa < 0.5 -> overfitting

        From "Privacy Preserving Synthetic Health Data" by Andrew Yale et al.

        :param data_s: 2D distribution (s for "source").
        :param data_t: 2D distribution (t for "target").
        :param distance_func: Distance metric function to use to compare data points. Euclidean distance by default.
        :param detailed_results: If True, return score but also score for ST and TS (the 2 components of the score).
        :return: score, or the tuple (score, nnaa_st, nnaa_ts) if detailed_results is True.
        :raises Exception: If data_s and data_t do not have the same shape.
        :raises ValueError: If the datasets are empty.
    """
    data_s, data_t = np.array(data_s), np.array(data_t)
    if data_s.shape != data_t.shape:
        raise Exception('The two data frames must have the same shapes but {} != {} got passed.'.format(data_s.shape, data_t.shape))
    if len(data_s) == 0:
        raise ValueError('Cannot compute nearest neighbors on empty datasets.')
    # matrixes with distances between each points (m_ij is distance between i and j)
    if distance_func is None:
        # Default Euclidean metric: one vectorised cdist call per matrix
        # instead of a Python-level loop over every pair of points. Each
        # sample is flattened to a row vector first.
        flat_s = data_s.reshape(len(data_s), -1)
        flat_t = data_t.reshape(len(data_t), -1)
        distances_st = cdist(flat_s, flat_t)
        distances_ss = cdist(flat_s, flat_s)
        distances_tt = cdist(flat_t, flat_t)
    else:
        distances_st = distance_matrix(data_s, data_t, distance_func=distance_func)
        distances_ss = distance_matrix(data_s, data_s, distance_func=distance_func)
        distances_tt = distance_matrix(data_t, data_t, distance_func=distance_func)
    # fill the diagonal to avoid considering the points themselves as their nearest neighbors
    np.fill_diagonal(distances_ss, np.inf)
    np.fill_diagonal(distances_tt, np.inf)
    # distance to nearest neighbors: rows of distances_st index the source
    # points and columns index the target points, so reducing along axis 1
    # gives one value per source point and along axis 0 one per target point.
    min_st = distances_st.min(axis=1) # nearest target of each source point
    min_ts = distances_st.min(axis=0) # nearest source of each target point
    min_ss = distances_ss.min(axis=1) # nearest other source of each source point
    min_tt = distances_tt.min(axis=1) # nearest other target of each target point
    nnaa_st = np.sum(min_st > min_ss) / len(data_s) # proportion of nearest neighbors of s in s
    nnaa_ts = np.sum(min_ts > min_tt) / len(data_t) # proportion of nearest neighbors of t in t
    score = (nnaa_st + nnaa_ts) / 2
    if detailed_results:
        return score, nnaa_st, nnaa_ts
    return score