Source code for autopandas.utils.automl

# Functions to read and write datasets in AutoML format
# TODO

import pandas as pd
import os

def read_automl(path):
    """Read files in AutoML format.

    Alias for from_automl: the argument is forwarded unchanged and the
    result is handed back as-is.

    :param path: passed through to from_automl.
    :return: whatever from_automl returns for ``path``.
    """
    loaded = from_automl(path)
    return loaded
def from_automl(path):
    """Read files in AutoML format.

    TODO: not implemented yet. ``path`` is currently ignored and the
    function always returns None.

    :param path: unused for now.
    :return: None.
    """
    # detect files, abort if conflicts (several datasets)
    #
    # Draft of the intended implementation, kept for reference only. It is
    # never executed and relies on names (input_dir, basename, filepath, df,
    # AutoData) that are not defined in this function.
    #
    #   feat_name_file = os.path.join(input_dir, basename + '_feat.name')
    #   feat_name = pd.read_csv(feat_name_file, header=None).values.ravel() if os.path.exists(feat_name_file) else None
    #   label_name_file = os.path.join(input_dir, basename + '_label.name')
    #   label_name = pd.read_csv(label_name_file, header=None).values.ravel() if os.path.exists(label_name_file) else None
    #   # if exists
    #   if os.path.exists(os.path.join(input_dir, basename + '.data')):
    #       # read .data and .solution
    #       pd.read_csv(filepath, sep=' ', header=None)
    #   # create AutoData object
    #   data = AutoData(df)
    #   # class ?
    #   data.set_class()
    #   # train/valid/test ?
    #   data.indexes['train'] = [0]
    return None
def _write_automl_table(frame, target):
    """Write ``frame`` to ``target`` space-separated, without index or header."""
    frame.to_csv(target, sep=' ', index=False, header=False)


def to_automl(data, path='.', name='autodata'):
    """Write files in AutoML format.

    AutoML format is ideal to create a Codalab competition.

    All files are written into the folder ``<path>/<name>_automl``, which is
    created (together with any missing parent folders) when it does not
    exist yet. Files already present in that folder are overwritten.

    Files produced:

    * ``<name>.info``: output of ``data.descriptors()``, always written.
    * ``<name>_feat.name``: ``data.indexes['X']`` when ``data.has_class()``
      is true, ``data.columns`` otherwise.
    * ``<name>_label.name``: ``data.indexes['y']``, only when
      ``data.has_class()`` is true.
    * ``<name>_train.data`` / ``<name>_test.data`` when ``'train'`` is a key
      of ``data.indexes``, otherwise a single ``<name>.data``.
    * ``<name>_train.solution`` / ``<name>_test.solution`` (or a single
      ``<name>.solution``), only when ``data.has_class()`` is true.

    :param data: AutoData frame to format.
    :param path: where to save the dataset
    :param name: name of the dataset to put into filenames
    :raises OSError: if the output folder cannot be created or written to.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm against the original file.
    out_dir = os.path.join(path, name + '_automl')
    # makedirs with exist_ok avoids the exists()/mkdir() race and also works
    # when ``path`` itself has not been created yet.
    os.makedirs(out_dir, exist_ok=True)

    def target(suffix):
        # Full path of the output file "<name><suffix>" in the output folder.
        return os.path.join(out_dir, name + suffix)

    # some information
    data.descriptors().to_csv(target('.info'), header=True)

    if data.has_class():
        # feature and label names
        pd.DataFrame(data.indexes['X']).to_csv(target('_feat.name'), index=False, header=False)
        pd.DataFrame(data.indexes['y']).to_csv(target('_label.name'), index=False, header=False)
        if 'train' in data.indexes:
            # train/test and X/y splits
            _write_automl_table(data.get_data('X_train'), target('_train.data'))
            _write_automl_table(data.get_data('X_test'), target('_test.data'))
            _write_automl_table(data.get_data('y_train'), target('_train.solution'))
            _write_automl_table(data.get_data('y_test'), target('_test.solution'))
        else:
            # only X/y split
            _write_automl_table(data.get_data('X'), target('.data'))
            _write_automl_table(data.get_data('y'), target('.solution'))
    else:
        # no class: every column is a feature
        pd.DataFrame(data.columns).to_csv(target('_feat.name'), index=False, header=False)
        if 'train' in data.indexes:
            # only train/test split
            _write_automl_table(data.get_data('train'), target('_train.data'))
            _write_automl_table(data.get_data('test'), target('_test.data'))
        else:
            # no split at all
            _write_automl_table(data, target('.data'))