# Source code for autopandas.utils.automl

# Functions to read and write the AutoML format.
# TODO: the reader (from_automl) is not implemented yet.

import pandas as pd
import os

def read_automl(path):
    """ Alias for from_automl.

        Read files in AutoML format.

        :param path: forwarded unchanged to from_automl.
        :return: whatever from_automl(path) returns.
    """
    return from_automl(path)

def from_automl(path):
    """ Read files in AutoML format.

        Not implemented yet: this placeholder ignores ``path`` and always
        returns ``None``.

        :param path: presumably the location of the AutoML dataset to read
                     (currently unused -- to be confirmed when implemented).
        :return: None.
    """
    # TODO: implement the reader. The draft below is kept as comments only:
    # it relies on names that are not defined in this function (input_dir,
    # basename, filepath, df, AutoData) and was never executed.
    #
    # detect files, abort if conflicts (several datasets)
    #
    # feat_name_file = os.path.join(input_dir, basename + '_feat.name')
    # feat_name = pd.read_csv(feat_name_file, header=None).values.ravel() if os.path.exists(feat_name_file) else None
    # label_name_file = os.path.join(input_dir, basename + '_label.name')
    # label_name = pd.read_csv(label_name_file, header=None).values.ravel() if os.path.exists(label_name_file) else None
    # # if exists
    # if os.path.exists(os.path.join(input_dir, basename + '.data')):
    #     # read .data and .solution
    #     pd.read_csv(filepath, sep=' ', header=None)
    # # create AutoData object
    # data = AutoData(df)
    # # class ?
    # data.set_class()
    # # train/valid/test ?
    # data.indexes['train'] = [0]
    return None

def _automl_write_table(frame, out_dir, filename):
    """ Write frame as a space-separated file, without index nor header. """
    frame.to_csv(os.path.join(out_dir, filename), sep=' ', index=False, header=False)


def _automl_write_names(names, out_dir, filename):
    """ Write one name per line, without index nor header. """
    pd.DataFrame(names).to_csv(os.path.join(out_dir, filename), index=False, header=False)


def to_automl(data, path='.', name='autodata'):
    """ Write files in AutoML format.
        AutoML format is ideal to create a Codalab competition.

        Files are written into the folder ``<path>/<name>_automl``, which is
        created (together with any missing parent folder) if needed:

        * ``<name>.info``: output of ``data.descriptors()``.
        * ``<name>_feat.name`` (and ``<name>_label.name`` when the data has
          a class): column names, one per line.
        * ``.data`` / ``.solution`` files, prefixed with ``_train`` / ``_test``
          when ``'train'`` is a key of ``data.indexes``.

        :param data: AutoData frame to format.
        :param path: where to save the dataset
        :param name: name of the dataset to put into filenames
    """
    out_dir = os.path.join(path, name + '_automl')
    # makedirs with exist_ok avoids the exists()/mkdir() race and also works
    # when `path` itself does not exist yet.
    os.makedirs(out_dir, exist_ok=True)

    data.descriptors().to_csv(os.path.join(out_dir, name + '.info'), header=True)  # some information

    if data.has_class():
        _automl_write_names(data.indexes['X'], out_dir, name + '_feat.name')
        _automl_write_names(data.indexes['y'], out_dir, name + '_label.name')
        if 'train' in data.indexes:  # train/test and X/y splits
            outputs = [
                ('X_train', '_train.data'),
                ('X_test', '_test.data'),
                ('y_train', '_train.solution'),
                ('y_test', '_test.solution'),
            ]
        else:  # only X/y split
            outputs = [('X', '.data'), ('y', '.solution')]
        for key, suffix in outputs:
            _automl_write_table(data.get_data(key), out_dir, name + suffix)
        return

    _automl_write_names(data.columns, out_dir, name + '_feat.name')
    if 'train' in data.indexes:  # only train/test split
        for key in ('train', 'test'):
            _automl_write_table(data.get_data(key), out_dir, name + '_' + key + '.data')
    else:  # no split at all
        _automl_write_table(data, out_dir, name + '.data')