From 9cba46da4ca360299a606200be17171fe12588e1 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 15:52:26 +0100 Subject: [PATCH 01/58] Migration of file used in tallamjr/plasticc repo This baseline commit brings in the file that has been used in the exploratory repo of https://github.com/tallamjr/plasticc/pipeline.py --- utils/run_plasticc_pipeline.py | 326 +++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 utils/run_plasticc_pipeline.py diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py new file mode 100644 index 00000000..9069b3bd --- /dev/null +++ b/utils/run_plasticc_pipeline.py @@ -0,0 +1,326 @@ +# snmachine machine learning pipeline for the PLAsTiCC competition. + +## IMPORTS +import numpy as np +import pandas as pd +import sys +import os +import subprocess +import multiprocessing +import glob +from astropy.table import Table,join,vstack +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +import pickle +from argparse import ArgumentParser +import yaml +import multiprocessing +import warnings +warnings.filterwarnings("ignore") +try: + import cPickle as pickle +except ModuleNotFoundError: + import pickle +try: + from snmachine import snfeatures, sndata, snaugment, gps +except ImportError: + print("Unable to import snmachine. Check environment set correctly") + +util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) +if util_module_path not in sys.path: + sys.path.append(util_module_path) +from plasticc_utils import plasticcLogLoss, plotConfusionMatrix + + +def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): + + method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) + features_dir = os.path.join(method_dir, 'wavelet_features') + classif_dir = os.path.join(method_dir, 'classifications') + interm_dir = os.path.join(method_dir, 'intermediate') + plots_dir = os.path.join(method_dir, 'plots') + + dirs = {"method_dir" : method_dir, "features_dir" : features_dir, + "classif_dir" : classif_dir, "interm_dir" : interm_dir, + "plots_dir" : plots_dir} + + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) + + return dirs + + +def saveConfigurationFile(dirs): + + METHOD_DIR = dirs.get("method_dir", None) + with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: + yaml.dump(params, config, default_flow_style=False) + + +def loadDataset(DATA_PATH): + + try: + if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): + with open(DATA_PATH, 'rb') as input: + print("Opening from binary pickle") + dat = pickle.load(input) + print("Dataset loaded from pickle file as: {}".format(dat)) + else: + + folder, data_file = os.path.split(DATA_PATH) + print(folder, data_file) + meta_file = "_metadata.".join(data_file.split(".")) + + print("Opening from CSV") + dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, + from_pickle=False) + print("Dataset loaded from csv file as: {}".format(dat)) + print("Saving {} object to pickle binary".format(dat)) + + dat_binary = os.path.splitext(data_file)[0]+".pckl" + print(os.path.join(folder, dat_binary)) + with open(os.path.join(folder, dat_binary), 'wb') as f: + pickle.dump(dat, f, pickle.HIGHEST_PROTOCOL) + except FileNotFoundError: + print("Oii, load something !!") + + return dat + + +def reduceDataset(dat, dirs, subset_size, SEED): + + METHOD_DIR = dirs.get("method_dir", None) + subset_file = '/{}/subset.list'.format(METHOD_DIR) + if os.path.exists(subset_file): 
+ rand_objs = np.genfromtxt(subset_file, dtype='U') + else: + np.random.seed(SEED) + rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) + rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) + rand_objs = rand_objs_sorted_int.astype('>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? + try: + with open(arguments.configuration) as f: + params = yaml.load(f) + except IOError: + print("Invalid yaml file provided") + exit() + + print("The PARAMS are:\n {}".format(params)) + + # GLOBAL SETTINGS + RANDOM_STATE = params.get("RANDOM_STATE", None) + print("RANDOM_STATE:\n{}".format(RANDOM_STATE)) + SEED = params.get("SEED", None) + DATA_PATH = params.get("DATA_PATH", None) + ANALYSIS_DIR = params.get("ANALYSIS_DIR", None) + ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) + + # Set the number of processes you want to use throughout the notebook + nprocesses = multiprocessing.cpu_count() + print("Running with {} cores".format(nprocesses)) + + # SNMACHINE PARAMETERS + ngp = params.get("ngp", None) + initheta = params.get("initheta", None) + + dirs = createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME) + saveConfigurationFile(dirs) + + # RUN PIPELINE + if (arguments.restart.lower() == "wavelets"): + + wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) + + elif (arguments.restart.lower() == "gps"): + print("Hello") + else: + print("Running full pipeline .. ") + + dat = loadDataset(DATA_PATH) + # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) + fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + + waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + + wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) + # snmachine.utils.fit_gaussian_process.extract_GP() + # check for wavelets, if so restartFromWavelets() + # else, check for gp's, if so restartFromGPs() + # otherwise runFullPipeline() From dd8277d21ef119fdb84d9d3d783d3bf4e95ec7d8 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 16:13:52 +0100 Subject: [PATCH 02/58] Minor linting improvements + comments Although one expects the code in this file to change a lot, PEP8 linting was carried out to encourage the consistent style. Comments added to areas of code which need further discussion or will indeed be adapted further --- utils/run_plasticc_pipeline.py | 93 +++++++++++++++------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 9069b3bd..5ec04762 100644 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -1,20 +1,18 @@ -# snmachine machine learning pipeline for the PLAsTiCC competition. 
+""" +Machine learning pipeline for the PLAsTiCC competition using snmachine codebase +""" -## IMPORTS import numpy as np import pandas as pd import sys import os import subprocess import multiprocessing -import glob -from astropy.table import Table,join,vstack +from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -import pickle from argparse import ArgumentParser import yaml -import multiprocessing import warnings warnings.filterwarnings("ignore") try: @@ -34,15 +32,15 @@ def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): - method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) + method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) features_dir = os.path.join(method_dir, 'wavelet_features') - classif_dir = os.path.join(method_dir, 'classifications') - interm_dir = os.path.join(method_dir, 'intermediate') - plots_dir = os.path.join(method_dir, 'plots') + classif_dir = os.path.join(method_dir, 'classifications') + interm_dir = os.path.join(method_dir, 'intermediate') + plots_dir = os.path.join(method_dir, 'plots') - dirs = {"method_dir" : method_dir, "features_dir" : features_dir, - "classif_dir" : classif_dir, "interm_dir" : interm_dir, - "plots_dir" : plots_dir} + dirs = {"method_dir": method_dir, "features_dir": features_dir, + "classif_dir": classif_dir, "interm_dir": interm_dir, + "plots_dir": plots_dir} for key, value in dirs.items(): subprocess.call(['mkdir', value]) @@ -72,8 +70,7 @@ def loadDataset(DATA_PATH): meta_file = "_metadata.".join(data_file.split(".")) print("Opening from CSV") - dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, - from_pickle=False) + dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, from_pickle=False) print("Dataset loaded from csv file as: {}".format(dat)) print("Saving {} object to pickle binary".format(dat)) @@ -101,39 +98,29 @@ def reduceDataset(dat, dirs, subset_size, SEED): np.savetxt(subset_file, rand_objs, fmt='%s') dat.object_names = rand_objs - dat.data = {objects:dat.data[objects] for objects in dat.object_names} # erase the data we are not using + dat.data = {objects: dat.data[objects] for objects in dat.object_names} # Erase the data we are not using print("Dataset reduced to {} objects".format(dat.object_names.shape[0])) - return dat # Cat: I don't think we need to return anything + return dat # Cat: I don't think we need to return anything def augmentData(dat, number_per_type): - - def print_stats_by_type(dat): - print('total obj in dataset: %d'%len(dat.data)) - types=dat.get_types() - t_unique=np.unique(types['Type']) - - for t in t_unique: - thistype=types[types['Type']==t] - print('type: %d - %d obj in dataset'%(t,len(thistype))) - return t_unique - - t_unique=print_stats_by_type(dat) - aug=snaugment.GPAugment(dat) - numbers={types:number_per_type for types in t_unique} - res=aug.augment(numbers) - t_unique_new=print_stats_by_type(dat) + # Tarek: This might be removed as a function call and replaced with calls to + # functions inside snmachine.snaugment + pass -def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? +def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? 
+ # Tarek: Now that this file lives in snmachine and with the extensive + # refactoring this is no longer necessary I believe - extract_GP(dat, **kwargs) + # extract_GP(dat, **kwargs) # snfeatures.WaveletFeatures.extract_GP(dat, **kwargs) + pass -def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok +def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) print("WAV = {}\n".format(wavelet_object.wav)) @@ -155,7 +142,7 @@ def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, toleran return wavelet_features, eigenvalues, eigenvectors, means -def getMeta(dat): # including mjd +def getMeta(dat): # including mjd object_names = dat.object_names meta_df = pd.DataFrame(index=object_names, columns=dat.data[object_names[0]].meta.keys()) mjd_diff = np.zeros_like(object_names) @@ -174,7 +161,7 @@ def getMeta(dat): # including mjd meta_df.at[obj, key] = meta_key try: meta_df.drop(['distmod','mwebv', 'stencil', 'augment_algo'] , axis=1, inplace=True) - except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata + except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata meta_df.drop(['distmod','mwebv'] , axis=1, inplace=True) meta_df.rename(index=str, columns={"name": "Object", "type":"target"}, inplace=True) meta_df['mjd_diff'] = mjd_diff @@ -210,13 +197,11 @@ def createClassififer(combined_features, RANDOM_STATE): print("X = \n{}".format(X)) print("y = \n{}".format(y)) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=RANDOM_STATE) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) - - clf = RandomForestClassifier(n_estimators=700, criterion='entropy',\ - oob_score=True, n_jobs=-1, - random_state=RANDOM_STATE) + clf = RandomForestClassifier(n_estimators=700, criterion='entropy', + oob_score=True, n_jobs=-1, + random_state=RANDOM_STATE) clf.fit(X_train, y_train) @@ -252,20 +237,24 @@ def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): # RETURN SUBMISSION_FILE_WITHOUT_99 pass + def runFullPipeline(): pass + def restartFromGPs(): pass + def restartFromWavelets(): pass + if __name__ == "__main__": parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') - parser.add_argument('--restart', '-r', default="full") + parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") arguments = parser.parse_args() # LOAD CONFIGURATION FILE --->>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? 
@@ -287,7 +276,7 @@ def restartFromWavelets(): ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() + nprocesses = multiprocessing.cpu_count() print("Running with {} cores".format(nprocesses)) # SNMACHINE PARAMETERS @@ -300,9 +289,9 @@ def restartFromWavelets(): # RUN PIPELINE if (arguments.restart.lower() == "wavelets"): - wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) elif (arguments.restart.lower() == "gps"): print("Hello") @@ -312,14 +301,14 @@ def restartFromWavelets(): dat = loadDataset(DATA_PATH) # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) # snmachine.utils.fit_gaussian_process.extract_GP() # check for wavelets, if so restartFromWavelets() # else, check for gp's, if so restartFromGPs() From 520150d1ce2adc26e8b99d60af87355ed5a41d02 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 17:27:19 +0100 Subject: [PATCH 03/58] Changing mode of file File mode changed to 644 from 755. This puts all files in the same permissions bracket to allow for consistency across the files. 
--- snmachine/snclassifier.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 snmachine/snclassifier.py diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py old mode 100755 new mode 100644 From 2acd69acaaaa7af9859c5f73338219f5d478e3b7 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 10 May 2019 18:44:33 +0100 Subject: [PATCH 04/58] Renaming functions to be inline with code style --- utils/run_plasticc_pipeline.py | 122 ++++++++++++++------------------- 1 file changed, 50 insertions(+), 72 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 5ec04762..6f783b05 100644 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -30,7 +30,7 @@ from plasticc_utils import plasticcLogLoss, plotConfusionMatrix -def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): +def create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME): method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) features_dir = os.path.join(method_dir, 'wavelet_features') @@ -48,14 +48,14 @@ def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): return dirs -def saveConfigurationFile(dirs): +def save_configuration_file(dirs): METHOD_DIR = dirs.get("method_dir", None) with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: yaml.dump(params, config, default_flow_style=False) -def loadDataset(DATA_PATH): +def load_dataset(DATA_PATH): try: if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): @@ -84,7 +84,7 @@ def loadDataset(DATA_PATH): return dat -def reduceDataset(dat, dirs, subset_size, SEED): +def reduce_dataset(dat, dirs, subset_size, SEED): METHOD_DIR = dirs.get("method_dir", None) subset_file = '/{}/subset.list'.format(METHOD_DIR) @@ -105,13 +105,7 @@ def reduceDataset(dat, dirs, subset_size, SEED): return dat # Cat: I don't think we need to return anything -def augmentData(dat, number_per_type): - # Tarek: This might be removed as a function call and replaced with calls to - # functions inside snmachine.snaugment - pass - - -def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? +def fit_gaussian_process(dat, **kwargs): # Cat: Do we really want a mask funtion? # Tarek: Now that this file lives in snmachine and with the extensive # refactoring this is no longer necessary I believe @@ -120,7 +114,7 @@ def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? 
pass -def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok +def wavelet_decomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) print("WAV = {}\n".format(wavelet_object.wav)) @@ -130,7 +124,7 @@ def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as inpu return waveout, waveout_err, wavelet_object -def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance +def dimentionality_reduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance # check if reduced wavelet features already exist wavelet_features, eigenvalues, eigenvectors, means, num_feats = wavelet_object.extract_pca(object_names, waveout, **kwargs) @@ -142,33 +136,7 @@ def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, toleran return wavelet_features, eigenvalues, eigenvectors, means -def getMeta(dat): # including mjd - object_names = dat.object_names - meta_df = pd.DataFrame(index=object_names, columns=dat.data[object_names[0]].meta.keys()) - mjd_diff = np.zeros_like(object_names) - for i in np.arange(len(object_names)): - obj = object_names[i] - obj_data = dat.data[obj] - obj_meta = obj_data.meta - mjd_diff[i] = np.max(obj_data['mjd'])-np.min(obj_data['mjd']) - for key in obj_meta.keys(): - meta_key = obj_meta[key] - try: - assert type(meta_key) == np.ndarray - meta_key = meta_key[0] - except: - pass - meta_df.at[obj, key] = meta_key - try: - meta_df.drop(['distmod','mwebv', 'stencil', 'augment_algo'] , axis=1, inplace=True) - except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata - meta_df.drop(['distmod','mwebv'] , axis=1, inplace=True) - meta_df.rename(index=str, columns={"name": "Object", "type":"target"}, inplace=True) - meta_df['mjd_diff'] = mjd_diff - return meta_df - - -def mergeFeatures(some_features, other_features): +def merge_features(some_features, other_features): if type(some_features) != pd.core.frame.DataFrame: some_features = some_features.to_pandas() if type(other_features) != pd.core.frame.DataFrame: @@ -178,13 +146,15 @@ def mergeFeatures(some_features, other_features): return merged_df -def combineAdditionalFeatures(wavelet_features, dat): - meta_df = getMeta(dat) - combined_features = mergeFeatures(wavelet_features, meta_df) +def combine_additional_features(wavelet_features, dat): + # Combine snmachine wavelet features with PLASTICC features. 
Allow user to + # define the dataframe they would like to merge + meta_df = dat.metadata + combined_features = merge_features(wavelet_features, meta_df) return combined_features -def createClassififer(combined_features, RANDOM_STATE): +def create_classififer(combined_features, RANDOM_STATE): X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -230,7 +200,7 @@ def createClassififer(combined_features, RANDOM_STATE): return clf -def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): +def make_predictions(LOCATION_OF_TEST_DATA, CLASSIFIER): # LOAD TEST SET AT THIS POINT # USE CLASFFIFER FROM createClassififer, BY USING THAT WE THEN # clf.predict(test_set) @@ -238,25 +208,19 @@ def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): pass -def runFullPipeline(): +def run_full_pipeline(): pass -def restartFromGPs(): +def restart_from_saved_gps(): pass -def restartFromWavelets(): +def restart_from_save_wavelets(): pass -if __name__ == "__main__": - - parser = ArgumentParser(description="Run pipeline end to end") - parser.add_argument('--configuration', '-c') - parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") - arguments = parser.parse_args() - +def load_configuration_file(path_to_configuration_file): # LOAD CONFIGURATION FILE --->>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? try: with open(arguments.configuration) as f: @@ -267,48 +231,62 @@ def restartFromWavelets(): print("The PARAMS are:\n {}".format(params)) - # GLOBAL SETTINGS + return params + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Run pipeline end to end") + parser.add_argument('--configuration', '-c') + parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") + arguments = parser.parse_args() + + params = load_configuration_file(arguments.configuration) + + # global settings RANDOM_STATE = params.get("RANDOM_STATE", None) - print("RANDOM_STATE:\n{}".format(RANDOM_STATE)) + # Tarek: maybe remove this completely and + # set inside a function call itself, i.e. have a default which can be + # overridden SEED = params.get("SEED", None) DATA_PATH = params.get("DATA_PATH", None) ANALYSIS_DIR = params.get("ANALYSIS_DIR", None) ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) + # snmachine parameters + ngp = params.get("ngp", None) + initheta = params.get("initheta", None) + # Set the number of processes you want to use throughout the notebook nprocesses = multiprocessing.cpu_count() print("Running with {} cores".format(nprocesses)) - # SNMACHINE PARAMETERS - ngp = params.get("ngp", None) - initheta = params.get("initheta", None) - - dirs = createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME) - saveConfigurationFile(dirs) + dirs = create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME) + save_configuration_file(dirs) # RUN PIPELINE if (arguments.restart.lower() == "wavelets"): wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combine_additional_features(wavelet_features, DATA_PATH) + classifer = create_classififer(combined_features) elif (arguments.restart.lower() == "gps"): print("Hello") else: print("Running full pipeline .. 
") - dat = loadDataset(DATA_PATH) + dat = load_dataset(DATA_PATH) # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) - fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) - waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combine_additional_features(wavelet_features, DATA_PATH) + classifer = create_classififer(combined_features) # snmachine.utils.fit_gaussian_process.extract_GP() # check for wavelets, if so restartFromWavelets() # else, check for gp's, if so restartFromGPs() From c898f963998a306bf68faab71f1c09d1115231fe Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:07:17 +0100 Subject: [PATCH 05/58] Tidying up file and renaming function names Renaming to be in line with code style conventions --- utils/plasticc_utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index 7b1b45f5..c17a2d5b 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -2,21 +2,23 @@ Utility script for calculating the log loss """ +from sklearn.metrics import confusion_matrix import sys import numpy as np import matplotlib.pyplot as plt import seaborn as sns -from sklearn.metrics import auc, roc_curve, confusion_matrix -def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames): + +def plot_confusion_matrix(yTrue, yPredict, dataName, targetNames): cm = confusion_matrix(yTrue, yPredict, labels=targetNames) cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] annot = np.around(cm, 2) - fig, ax = plt.subplots(figsize=(9,7)) + fig, ax = plt.subplots(figsize=(9, 7)) sns.heatmap(cm, xticklabels=targetNames, - yticklabels=targetNames, cmap='Blues', - annot=annot, lw=0.5) + yticklabels=targetNames, cmap='Blues', + annot=annot, lw=0.5) + ax.set_xlabel('Predicted Label') ax.set_ylabel('True Label') ax.set_aspect('equal') @@ -24,14 +26,15 @@ def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames): return cm -def plasticcLogLoss(y_true, y_pred, relative_class_weights=None): + +def plasticc_log_loss(y_true, y_pred, relative_class_weights=None): """ Implementation of weighted log loss used for the Kaggle challenge """ predictions = y_pred.copy() # sanitize predictions - epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0) + epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0) predictions = np.clip(predictions, epsilon, 1.0 - epsilon) 
predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis] From 660519a37b8fc3587f4df4d82f194f86241eae3e Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:46:03 +0100 Subject: [PATCH 06/58] Change mode of run_plasticc_pipeline file to 744 --- utils/run_plasticc_pipeline.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 utils/run_plasticc_pipeline.py diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py old mode 100644 new mode 100755 From 906d95e1a56fd600992446d91541337012451f27 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:46:37 +0100 Subject: [PATCH 07/58] Updating create_folder_structure function Updating with doctrings and examples. Also including helper function to obtain git revision hash to include in analysis folder name --- utils/run_plasticc_pipeline.py | 68 +++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 6f783b05..2cbbaa38 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -27,20 +27,60 @@ util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) if util_module_path not in sys.path: sys.path.append(util_module_path) -from plasticc_utils import plasticcLogLoss, plotConfusionMatrix - - -def create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME): - - method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) - features_dir = os.path.join(method_dir, 'wavelet_features') - classif_dir = os.path.join(method_dir, 'classifications') - interm_dir = os.path.join(method_dir, 'intermediate') - plots_dir = os.path.join(method_dir, 'plots') - - dirs = {"method_dir": method_dir, "features_dir": features_dir, - "classif_dir": classif_dir, "interm_dir": interm_dir, - "plots_dir": plots_dir} +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix + + +def get_git_revision_short_hash(): + """ Helper function to obtain current version control hash value + + Returns + ------- + _hash : str + Short representation of current version control hash value + + Examples + -------- + >>> sha = get_git_revision_short_hash() + >>> print(sha) + 'ede068e' + """ + _hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']) + return _hash.decode("utf-8").rstrip() + + +def create_folder_structure(analysis_directory, analysis_name): + """ Make directories that will be used for analysis + + Parameters + ---------- + analysis_directory : str + System path to where the user would like to contain + a run of the analysis + analysis_name : str + Given name of analysis run. This is appended with the current git hash + the code has been run with. + + Returns + ------- + dirs: dict + Dictionary containing the mapping of folders that have been created. 
+ + Examples + -------- + Each folder name can then be accessed with dictionary methods: + + >>> analysis_directory = params.get("analysis_directory", None) + >>> analysis_name = params.get("analysis_name", None) + """ + method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) + features_directory = os.path.join(method_directory, 'wavelet_features') + classifications_directory = os.path.join(method_directory, 'classifications') + intermediate_files_directory = os.path.join(method_directory, 'intermediate') + plots_directory = os.path.join(method_directory, 'plots') + + dirs = {"method_directory": method_directory, "features_directory": features_directory, + "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, + "plots_directory": plots_directory} for key, value in dirs.items(): subprocess.call(['mkdir', value]) From e40a785f707834e6947e5108e8290318eb5bc38b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:48:19 +0100 Subject: [PATCH 08/58] Removing options in config to be in script instead Certain options would be better served as defaults in the script and the user can change these as they wish in the source file --- utils/config.yml | 12 ++++++++++++ utils/run_plasticc_pipeline.py | 20 ++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 utils/config.yml diff --git a/utils/config.yml b/utils/config.yml new file mode 100644 index 00000000..f5949880 --- /dev/null +++ b/utils/config.yml @@ -0,0 +1,12 @@ +# +# +# GENERAL PARAMS +SEED : 1234 +REPO_DIR : "/share/hypatia/snmachine_resources/data/plasticc/" +ANALYSIS_DIR : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +ANALYSIS_NAME : "test-analysis" +DATA_PATH : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set.pkl" + +# SNMACHINE_PARAMS +ngp : 1100 +initheta : [500, 20] diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 2cbbaa38..24f87e24 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -124,14 +124,14 @@ def load_dataset(DATA_PATH): return dat -def reduce_dataset(dat, dirs, subset_size, SEED): +def reduce_dataset(dat, dirs, subset_size, seed=1234): METHOD_DIR = dirs.get("method_dir", None) subset_file = '/{}/subset.list'.format(METHOD_DIR) if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') else: - np.random.seed(SEED) + np.random.seed(seed) rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) rand_objs = rand_objs_sorted_int.astype(' Date: Tue, 14 May 2019 13:52:06 +0100 Subject: [PATCH 09/58] Moving old utils files to an archival folder These files may still have merit for processing the data but as the pipeline is being developed it is felt they are better served in a seperate folder --- utils/{ => archive}/collect.pbs | 0 utils/{ => archive}/collect.py | 0 utils/{ => archive}/conquer.pbs | 0 utils/{ => archive}/conquer.py | 10 +++++----- utils/{ => archive}/create_jobs.py | 0 utils/{ => archive}/divide.pbs | 0 utils/{ => archive}/divide.py | 0 utils/{ => archive}/plasticc_extract_gp.pbs | 0 utils/{ => archive}/plasticc_extract_gp.py | 1 - utils/{ => archive}/post_process.py | 1 - 10 files changed, 5 insertions(+), 7 deletions(-) rename utils/{ => archive}/collect.pbs (100%) rename utils/{ => archive}/collect.py (100%) rename utils/{ => archive}/conquer.pbs 
(100%) rename utils/{ => archive}/conquer.py (96%) rename utils/{ => archive}/create_jobs.py (100%) rename utils/{ => archive}/divide.pbs (100%) rename utils/{ => archive}/divide.py (100%) rename utils/{ => archive}/plasticc_extract_gp.pbs (100%) rename utils/{ => archive}/plasticc_extract_gp.py (99%) rename utils/{ => archive}/post_process.py (99%) diff --git a/utils/collect.pbs b/utils/archive/collect.pbs similarity index 100% rename from utils/collect.pbs rename to utils/archive/collect.pbs diff --git a/utils/collect.py b/utils/archive/collect.py similarity index 100% rename from utils/collect.py rename to utils/archive/collect.py diff --git a/utils/conquer.pbs b/utils/archive/conquer.pbs similarity index 100% rename from utils/conquer.pbs rename to utils/archive/conquer.pbs diff --git a/utils/conquer.py b/utils/archive/conquer.py similarity index 96% rename from utils/conquer.py rename to utils/archive/conquer.py index 218c5949..40d85d33 100644 --- a/utils/conquer.py +++ b/utils/archive/conquer.py @@ -88,10 +88,11 @@ tab.meta['z']=tab.meta['hostgal_specz'] #insert into data set - d.insert_lightcurve(tab) + d.insert_lightcurve(tab) + with open(os.path.join(out_folder,'dataset_%d.pickle'%index),'wb') as f: - pickle.dump(d,f) + pickle.dump(d,f) ''' wf=snfeatures.WaveletFeatures() @@ -99,9 +100,9 @@ feats.write(os.path.join(feats_folder, 'wavelet_features.fits'),overwrite=True) with open(os.path.join(feats_folder,'PCA_mean.pickle'),'wb') as f1: - pickle.dump(wf.PCA_mean,f1) + pickle.dump(wf.PCA_mean,f1) with open(os.path.join(feats_folder,'PCA_eigenvals.pickle'),'wb') as f2: - pickle.dump(wf.PCA_eigenvals,f2) + pickle.dump(wf.PCA_eigenvals,f2) with open(os.path.join(feats_folder,'PCA_eigenvectors.pickle'),'wb') as f3: pickle.dump(wf.PCA_eigenvectors,f3) @@ -110,4 +111,3 @@ np.savetxt(os.path.join(feats_folder,'PCA_eigenvals.txt'),wf.PCA_eigenvals) np.savetxt(os.path.join(feats_folder,'PCA_eigenvectors.txt'),wf.PCA_eigenvectors) ''' - diff --git a/utils/create_jobs.py b/utils/archive/create_jobs.py similarity index 100% rename from utils/create_jobs.py rename to utils/archive/create_jobs.py diff --git a/utils/divide.pbs b/utils/archive/divide.pbs similarity index 100% rename from utils/divide.pbs rename to utils/archive/divide.pbs diff --git a/utils/divide.py b/utils/archive/divide.py similarity index 100% rename from utils/divide.py rename to utils/archive/divide.py diff --git a/utils/plasticc_extract_gp.pbs b/utils/archive/plasticc_extract_gp.pbs similarity index 100% rename from utils/plasticc_extract_gp.pbs rename to utils/archive/plasticc_extract_gp.pbs diff --git a/utils/plasticc_extract_gp.py b/utils/archive/plasticc_extract_gp.py similarity index 99% rename from utils/plasticc_extract_gp.py rename to utils/archive/plasticc_extract_gp.py index fe30ddef..3a1bb8ae 100644 --- a/utils/plasticc_extract_gp.py +++ b/utils/archive/plasticc_extract_gp.py @@ -59,4 +59,3 @@ np.savetxt(os.path.join(feats_folder,'PCA_eigenvals.txt'),wf.PCA_eigenvals) np.savetxt(os.path.join(feats_folder,'PCA_eigenvectors.txt'),wf.PCA_eigenvectors) ''' - diff --git a/utils/post_process.py b/utils/archive/post_process.py similarity index 99% rename from utils/post_process.py rename to utils/archive/post_process.py index c0b2487d..797a34c7 100644 --- a/utils/post_process.py +++ b/utils/archive/post_process.py @@ -67,4 +67,3 @@ np.savetxt(flname, not_done, fmt='%s') else: print 'All objects accounted for' - From edfd84d594e395b91dfe0aa05460cd8a8bf1d532 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 
13:54:32 +0100 Subject: [PATCH 10/58] Tidy up import block Removed code to add to sys path as this is no longer necessary as pipeline script now resides in snmachine main repo --- utils/run_plasticc_pipeline.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 24f87e24..f444d42a 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -1,10 +1,9 @@ """ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase """ - +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix import numpy as np import pandas as pd -import sys import os import subprocess import multiprocessing @@ -24,11 +23,6 @@ except ImportError: print("Unable to import snmachine. Check environment set correctly") -util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) -if util_module_path not in sys.path: - sys.path.append(util_module_path) -from plasticc_utils import plasticc_log_loss, plot_confusion_matrix - def get_git_revision_short_hash(): """ Helper function to obtain current version control hash value From 98e88005574e261b47f6af55632d906693fe43c0 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 16:56:05 +0100 Subject: [PATCH 11/58] [WIP] Updating functions in pipeline script Several functions have been updated with doctrings and examples for how to run such functions --- utils/run_plasticc_pipeline.py | 354 ++++++++++++++++++++------------- 1 file changed, 212 insertions(+), 142 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index f444d42a..40513533 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -2,15 +2,15 @@ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase """ from plasticc_utils import plasticc_log_loss, plot_confusion_matrix +from astropy.table import Table +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from argparse import ArgumentParser import numpy as np import pandas as pd import os import subprocess import multiprocessing -from astropy.table import Table -from sklearn.model_selection import train_test_split -from sklearn.ensemble import RandomForestClassifier -from argparse import ArgumentParser import yaml import warnings warnings.filterwarnings("ignore") @@ -34,6 +34,7 @@ def get_git_revision_short_hash(): Examples -------- + >>> ... >>> sha = get_git_revision_short_hash() >>> print(sha) 'ede068e' @@ -62,7 +63,7 @@ def create_folder_structure(analysis_directory, analysis_name): Examples -------- Each folder name can then be accessed with dictionary methods: - + >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) """ @@ -82,105 +83,203 @@ def create_folder_structure(analysis_directory, analysis_name): return dirs +def load_configuration_file(path_to_configuration_file): + # TODO: Finish doctring examples + """ Load from disk the configuration file that is to be used + + Parameters + ---------- + path_to_configuration_file : str + System path to where the configuration file is located + + Returns + ------- + params : dict + Dictionary of parameters contained inside the configuration file + + Examples + -------- + Each item inside the configuration file can be accessed like so: + >>> ... 
+ >>> params = load_configuration_file(path_to_configuration_file) + >>> data_path = params.get("data_path", None) + >>> print(data_path) + >>> ngp = params.get("ngp", None) + >>> print(ngp) + """ + try: + with open(path_to_configuration_file) as f: + params = yaml.load(f) + except IOError: + print("Invalid yaml file provided") + exit() + print("The PARAMS are:\n {}".format(params)) + return params + + def save_configuration_file(dirs): + # TODO: Provide a doctring example + """ Make a copy of the configuration file that has been used inside the + analysis directory + + Parameters + ---------- + dirs : dict + Dictionary containing the names of the folder paths used in this analysis + + Returns + ------- + None - METHOD_DIR = dirs.get("method_dir", None) - with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: + """ + method_directory = dirs.get("method_directory", None) + with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) -def load_dataset(DATA_PATH): +def load_training_data(data_path): + # TODO: Finish doctring examples + """ Load from disk the training data one will use for this analysis + + Parameters + ---------- + params : dict + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + + Returns + ------- + training_data : snmachine.PlasticcData + snmachine.PlasticcData instance of the training data + + Examples + -------- + >>> ... + >>> training_data = load_training_data(params) + >>> print(training_data) + """ try: - if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): - with open(DATA_PATH, 'rb') as input: + if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): + with open(data_path, 'rb') as input: print("Opening from binary pickle") - dat = pickle.load(input) - print("Dataset loaded from pickle file as: {}".format(dat)) + training_data = pickle.load(input) + print("Dataset loaded from pickle file as: {}".format(training_data)) else: - - folder, data_file = os.path.split(DATA_PATH) - print(folder, data_file) - meta_file = "_metadata.".join(data_file.split(".")) + folder_path, train_data_file_name = os.path.split(data_path) + print(folder_path, train_data_file_name) + meta_data_file_name = "_metadata.".join(train_data_file_name.split(".")) print("Opening from CSV") - dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, from_pickle=False) - print("Dataset loaded from csv file as: {}".format(dat)) - print("Saving {} object to pickle binary".format(dat)) - - dat_binary = os.path.splitext(data_file)[0]+".pckl" - print(os.path.join(folder, dat_binary)) - with open(os.path.join(folder, dat_binary), 'wb') as f: - pickle.dump(dat, f, pickle.HIGHEST_PROTOCOL) + training_data = sndata.PlasticcData(folder=folder_path, data_file=train_data_file_name, + metadata_file=meta_data_file_name, cut_non_detections=False) + print("Dataset loaded from csv file as: {}".format(training_data)) + print("Saving {} object to pickle binary".format(training_data)) + + dat_binary = os.path.splitext(train_data_file_name)[0] + ".pckl" + print(os.path.join(folder_path, dat_binary)) + with open(os.path.join(folder_path, dat_binary), 'wb') as f: + pickle.dump(training_data, f, pickle.HIGHEST_PROTOCOL) except FileNotFoundError: - print("Oii, load something !!") + print("No file found to load") + exit() + + return training_data - return dat +def reduce_size_of_training_data(training_data, dirs, 
subset_size, seed=1234): + # TODO: Incorpate further doctrings and finish examples. Tarek: Catarina and I need to + # discuss this further. There is some overlap between this and + # sndata.PlasticcData.update_data() and it would be good to comebine this. + """ Load from disk the training data one will use for this analysis -def reduce_dataset(dat, dirs, subset_size, seed=1234): + Parameters + ---------- + training_data : snmachine.PlasticcData + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + dirs : dict + Dictionary containing + subset_size : int + Number of objects the user would like to reduce the training data to + seed : int + Default set to 1234. This can be overridden by the user to check for + consistancy of results - METHOD_DIR = dirs.get("method_dir", None) - subset_file = '/{}/subset.list'.format(METHOD_DIR) + Returns + ------- + None + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ + + method_directory = dirs.get("method_directory", None) + subset_file = os.path.join(method_directory, "subset.list") if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') else: np.random.seed(seed) - rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) + rand_objs = np.random.choice(training_data.object_names, replace=False, size=subset_size) rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) rand_objs = rand_objs_sorted_int.astype('>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? - try: - with open(arguments.configuration) as f: - params = yaml.load(f) - except IOError: - print("Invalid yaml file provided") - exit() - - print("The PARAMS are:\n {}".format(params)) - - return params - - if __name__ == "__main__": + # Set the number of processes you want to use throughout the notebook + nprocesses = multiprocessing.cpu_count() + print("Running with {} cores".format(nprocesses)) + parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") @@ -277,13 +346,7 @@ def load_configuration_file(path_to_configuration_file): params = load_configuration_file(arguments.configuration) - # global settings - RANDOM_STATE = params.get("RANDOM_STATE", None) - # Tarek: maybe remove this completely and - # set inside a function call itself, i.e. have a default which can be - # overridden - SEED = params.get("SEED", None) - DATA_PATH = params.get("DATA_PATH", None) + data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) @@ -291,37 +354,44 @@ def load_configuration_file(path_to_configuration_file): ngp = params.get("ngp", None) initheta = params.get("initheta", None) - # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() - print("Running with {} cores".format(nprocesses)) - + # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) + # Step 2. Save configuration file used for this analysis save_configuration_file(dirs) - - # RUN PIPELINE + # Step 3. Check at which point the user would like to run the analysis from. 
+ # If elements already saved, these will be used but this can be overriden + # with command line argument if (arguments.restart.lower() == "wavelets"): - - wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combine_additional_features(wavelet_features, DATA_PATH) + # Restart from saved uncompressed wavelets. + wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") + combined_features = combine_all_features(wavelet_features, data_path) classifer = create_classififer(combined_features) - elif (arguments.restart.lower() == "gps"): - print("Hello") + # Restart from saved GPs. + pass else: - print("Running full pipeline .. ") + # Run full pipeline but still do checks to see if elements from GPs or + # wavelets already exist on disk; the first check should be for: + # 1. Saved PCA files + # 2. Saved uncompressed wavelets + # 3. Saved GPs - dat = load_dataset(DATA_PATH) - # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) - fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + training_data = load_training_data(data_path) + gps.compute_gps() + wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) + # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # combined_features = combine_all_features(wavelet_features, DATA_PATH) + # classifer = create_classififer(combined_features) - waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, + # nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) - combined_features = combine_additional_features(wavelet_features, DATA_PATH) - classifer = create_classififer(combined_features) - # snmachine.utils.fit_gaussian_process.extract_GP() - # check for wavelets, if so restartFromWavelets() - # else, check for gp's, if so restartFromGPs() - # otherwise runFullPipeline() + # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + + # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + + # combined_features = combine_all_features(wavelet_features, DATA_PATH) + # classifer = create_classififer(combined_features) From 2992782c73db0ffc9a79e2732659dcf343b9fc25 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 17:25:30 +0100 Subject: [PATCH 12/58] [WIP] Further updates to pipeline script --- utils/run_plasticc_pipeline.py | 56 ++++++++++++++++++++++++++++++++-- 1 file 
changed, 53 insertions(+), 3 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 40513533..6f26efad 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -113,7 +113,7 @@ def load_configuration_file(path_to_configuration_file): except IOError: print("Invalid yaml file provided") exit() - print("The PARAMS are:\n {}".format(params)) + print("The parameters are:\n {}".format(params)) return params @@ -280,14 +280,62 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): def combine_all_features(reduced_wavelet_features, dataframe): - # Combine snmachine wavelet features with PLASTICC features. Allow user to - # define the dataframe they would like to merge + # TODO: Improve docstrings. Discuss whether the user should pass in a CSV + # instead? + """ Combine snmachine wavelet features with PLASTICC features. The + user should define a dataframe they would like to merge. + + Parameters + ---------- + reduced_wavelet_features : numpy.ndarray + These are the N principle components from the uncompressed wavelets + dataframe : pandas.DataFrame + Dataframe + + Returns + ------- + combined_features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ meta_df = dat.metadata combined_features = merge_features(wavelet_features, meta_df) return combined_features def create_classififer(combined_features, random_state=42): + # TODO: Improve docstrings. Discuss whether the user should pass in a CSV + # instead? + """ Combine snmachine wavelet features with PLASTICC features. The + user should define a dataframe they would like to merge. + + Parameters + ---------- + reduced_wavelet_features : numpy.ndarray + These are the N principle components from the uncompressed wavelets + dataframe : pandas.DataFrame + Dataframe + + Returns + ------- + combined_features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -323,6 +371,7 @@ def create_classififer(combined_features, random_state=42): weights = np.array([1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/18, 1/19]) + # weights[:-1] to ignore last class, the anomaly class log_loss = plasticc_log_loss(sklearn_truth, y_probs, relative_class_weights=weights[:-1]) print("LogLoss: {:.3f}\nBest Params: {}".format(log_loss, classifer.get_params)) @@ -330,6 +379,7 @@ def create_classififer(combined_features, random_state=42): def make_predictions(location_of_test_data, classifier): + # TODO: Move to a seperate make_predictions file pass From 8bd050ff1962f4c5f2122e4c243be4d22a2f27ad Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 17:36:15 +0100 Subject: [PATCH 13/58] Modifying file structure inside utils directory Renaming of files to make it easier to follow how the modern workflow takes place. 
Put old run_pipeline.py file in archive as this is no longer used --- utils/{ => archive}/run_pipeline.py | 0 utils/plasticc_feature_engineering.py | 0 utils/plasticc_make_predictions.py | 0 ...sticc_pipeline.py => plasticc_pipeline.py} | 112 ++++++++++-------- 4 files changed, 62 insertions(+), 50 deletions(-) rename utils/{ => archive}/run_pipeline.py (100%) create mode 100644 utils/plasticc_feature_engineering.py create mode 100644 utils/plasticc_make_predictions.py rename utils/{run_plasticc_pipeline.py => plasticc_pipeline.py} (79%) diff --git a/utils/run_pipeline.py b/utils/archive/run_pipeline.py similarity index 100% rename from utils/run_pipeline.py rename to utils/archive/run_pipeline.py diff --git a/utils/plasticc_feature_engineering.py b/utils/plasticc_feature_engineering.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/plasticc_make_predictions.py b/utils/plasticc_make_predictions.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/run_plasticc_pipeline.py b/utils/plasticc_pipeline.py similarity index 79% rename from utils/run_plasticc_pipeline.py rename to utils/plasticc_pipeline.py index 6f26efad..b0fc3211 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -1,5 +1,5 @@ """ -Machine learning pipeline for the PLAsTiCC competition using snmachine codebase +Machine learning pipeline for the PLAsTiCC competition using snmachine codebase. """ from plasticc_utils import plasticc_log_loss, plot_confusion_matrix from astropy.table import Table @@ -237,36 +237,43 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): training_data.data = {objects: training_data.data[objects] for objects in training_data.object_names} print("Dataset reduced to {} objects".format(training_data.object_names.shape[0])) -# def fit_gaussian_process(dat, **kwargs): # Cat: Do we really want a mask funtion? -# # Tarek: Now that this file lives in snmachine and with the extensive -# # refactoring this is no longer necessary I believe -# # extract_GP(dat, **kwargs) -# # snfeatures.WaveletFeatures.extract_GP(dat, **kwargs) -# pass - - -# def wavelet_decomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok - -# wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) -# print("WAV = {}\n".format(wavelet_object.wav)) -# print("MLEV = {}\n".format(wavelet_object.mlev)) -# print("NGP = {}\n".format(ngp)) -# waveout, waveout_err = wavelet_object.extract_wavelets(dat, wavelet_object.wav, wavelet_object.mlev, **kwargs) -# return waveout, waveout_err, wavelet_object +def wavelet_decomposition(training_data, ngp, **kwargs): + """ Load from disk the training data one will use for this analysis + Parameters + ---------- + training_data : snmachine.PlasticcData + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + dirs : dict + Dictionary containing + subset_size : int + Number of objects the user would like to reduce the training data to + seed : int + Default set to 1234. 
This can be overridden by the user to check for + consistancy of results -# def dimentionality_reduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance + Returns + ------- + None -# # check if reduced wavelet features already exist -# wavelet_features, eigenvalues, eigenvectors, means, num_feats = wavelet_object.extract_pca(object_names, waveout, **kwargs) + Examples + -------- + >>> ... + >>> print(shape.training_data) -# output_root = dirs.get("features_dir") -# print("Inside dimRedux: {}\n".format(output_root)) -# wavelet_features.write('{}/wavelet_features_{}.fits'.format(output_root, str(tolerance)[2:])) + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) -# return wavelet_features, eigenvalues, eigenvectors, means + """ + wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + print("WAV = {}\n".format(wavelet_object.wav)) + print("MLEV = {}\n".format(wavelet_object.mlev)) + print("NGP = {}\n".format(ngp)) + waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) + return waveout, waveout_err, wavelet_object # def merge_features(some_features, other_features): # # TODO: Move this to a data processing file @@ -311,21 +318,19 @@ def combine_all_features(reduced_wavelet_features, dataframe): def create_classififer(combined_features, random_state=42): - # TODO: Improve docstrings. Discuss whether the user should pass in a CSV - # instead? - """ Combine snmachine wavelet features with PLASTICC features. The - user should define a dataframe they would like to merge. + # TODO: Improve docstrings. + """ Creation of an optimised Random Forest classifier. Parameters ---------- - reduced_wavelet_features : numpy.ndarray - These are the N principle components from the uncompressed wavelets - dataframe : pandas.DataFrame - Dataframe + combined_features : pandas.DataFrame + This contains. Index on objects + random_state : int + To allow for reproducible... Returns ------- - combined_features : pandas.DataFrame + classifer : sklearn.RandomForestClassifier object Examples -------- @@ -403,6 +408,7 @@ def make_predictions(location_of_test_data, classifier): # snmachine parameters ngp = params.get("ngp", None) initheta = params.get("initheta", None) + number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) @@ -422,26 +428,32 @@ def make_predictions(location_of_test_data, classifier): else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: - # 1. Saved PCA files - # 2. Saved uncompressed wavelets - # 3. Saved GPs - + # a. Saved PCA files + # path_saved_reduced_wavelets = dirs.get("intermediate_files_directory") + # eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy')) + # means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy')) + # b. Saved uncompressed wavelets + # c. Saved GPs + + # Step 4. 
Load in training data training_data = load_training_data(data_path) - gps.compute_gps() - wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) - waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) - # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - # combined_features = combine_all_features(wavelet_features, DATA_PATH) - # classifer = create_classififer(combined_features) + # Step 5. Compute GPs + gps.compute_gps(training_data, number_gp=100, t_min=0, t_max=1100, + kernel_param=[500., 20.], + output_root=dirs['intermediate_files_directory'], + number_processes=nprocesses) - # fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, - # nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + # Step 6. Extract wavelet coeffiencts + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + save_output='all', output_root=dirs.get("intermediate_files_directory")) - # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + # Step 7. Reduce dimensionality of wavelets by using only N principle components + wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, + tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) - # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # Step 8. TODO Combine snmachine features with user defined features + # Step 9. TODO Create a Random Forest classifier; need to fit model and + # save it. - # combined_features = combine_all_features(wavelet_features, DATA_PATH) - # classifer = create_classififer(combined_features) + # Step 10. TODO Use saved classifier to make predictions. 
This can occur using a seperate file From 3e85f5b79c4b0178219ba25243f7c3ff6f78ebb3 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 18:40:31 +0100 Subject: [PATCH 14/58] Updating configuration file --- utils/config.yml | 18 +++++++--------- utils/plasticc_pipeline.py | 43 ++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index f5949880..be92e897 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,12 +1,8 @@ -# -# -# GENERAL PARAMS -SEED : 1234 -REPO_DIR : "/share/hypatia/snmachine_resources/data/plasticc/" -ANALYSIS_DIR : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -ANALYSIS_NAME : "test-analysis" -DATA_PATH : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set.pkl" - -# SNMACHINE_PARAMS +# Global settings +analysis_dir : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +analysis_name : "pipeline-test" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +# snmachine parameters ngp : 1100 -initheta : [500, 20] +initheta : [500., 20.] +number_of_principle_components : 200 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index b0fc3211..ef89a07f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -66,6 +66,9 @@ def create_folder_structure(analysis_directory, analysis_name): >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) + >>> directories = create_folder_structure(analysis_directory, analysis_name) + >>> print(directories.get("method_directory")) + """ method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -104,8 +107,10 @@ def load_configuration_file(path_to_configuration_file): >>> params = load_configuration_file(path_to_configuration_file) >>> data_path = params.get("data_path", None) >>> print(data_path) + >>> ngp = params.get("ngp", None) >>> print(ngp) + """ try: with open(path_to_configuration_file) as f: @@ -117,22 +122,27 @@ def load_configuration_file(path_to_configuration_file): return params -def save_configuration_file(dirs): +def save_configuration_file(method_directory): # TODO: Provide a doctring example """ Make a copy of the configuration file that has been used inside the analysis directory Parameters ---------- - dirs : dict - Dictionary containing the names of the folder paths used in this analysis + method_directory : string + The folder path used for this analysis Returns ------- None + Examples + -------- + >>> ... + >>> save_configuration_file(method_directory) + >>> print() + """ - method_directory = dirs.get("method_directory", None) with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) @@ -261,10 +271,9 @@ def wavelet_decomposition(training_data, ngp, **kwargs): Examples -------- >>> ... 
- >>> print(shape.training_data) - - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + save_output='all', output_root=dirs.get("intermediate_files_directory")) + >>> print() """ @@ -306,10 +315,12 @@ def combine_all_features(reduced_wavelet_features, dataframe): Examples -------- >>> ... - >>> print(shape.training_data) + >>> print(shape.reduced_wavelet_features) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> print(shape.dataframe) + + >>> combined_features = combine_all_features(reduced_wavelet_features, dataframe) + >>> print(shape.combined_features) """ meta_df = dat.metadata @@ -335,10 +346,10 @@ def create_classififer(combined_features, random_state=42): Examples -------- >>> ... - >>> print(shape.training_data) + >>> classifier, confusion_matrix = create_classififer(combined_features) + >>> print(classifier) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> plot_confusion_matrix(confusion_matrix) """ @@ -439,8 +450,8 @@ def make_predictions(location_of_test_data, classifier): training_data = load_training_data(data_path) # Step 5. Compute GPs - gps.compute_gps(training_data, number_gp=100, t_min=0, t_max=1100, - kernel_param=[500., 20.], + gps.compute_gps(training_data, number_gp=ngp, t_min=0, t_max=1100, + kernel_param=initheta, output_root=dirs['intermediate_files_directory'], number_processes=nprocesses) From 4179b2c287c2c076d882708fad0a0c030ecce30b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 15 May 2019 14:15:18 +0100 Subject: [PATCH 15/58] Append git has to analysis name --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index be92e897..7acfaa16 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,5 +1,5 @@ # Global settings -analysis_dir : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index ef89a07f..5b18e45e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -70,7 +70,10 @@ def create_folder_structure(analysis_directory, analysis_name): >>> print(directories.get("method_directory")) """ - method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) + # Append Git has to analysis name + analysis_name = analysis_name + "-" + get_git_revision_short_hash() + + method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') classifications_directory = os.path.join(method_directory, 'classifications') intermediate_files_directory = os.path.join(method_directory, 'intermediate') @@ -409,8 +412,9 @@ def make_predictions(location_of_test_data, classifier): parser.add_argument('--configuration', '-c') parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") arguments = parser.parse_args() + 
arguments = vars(arguments) - params = load_configuration_file(arguments.configuration) + params = load_configuration_file(arguments['configuration']) data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) @@ -424,16 +428,16 @@ def make_predictions(location_of_test_data, classifier): # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis - save_configuration_file(dirs) + save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument - if (arguments.restart.lower() == "wavelets"): + if (arguments['restart_from'].lower() == "wavelets"): # Restart from saved uncompressed wavelets. wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") combined_features = combine_all_features(wavelet_features, data_path) classifer = create_classififer(combined_features) - elif (arguments.restart.lower() == "gps"): + elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass else: From 32eb2ebb1e106024def138620c8e5a8518f517bf Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 17 May 2019 08:59:53 +0100 Subject: [PATCH 16/58] Updating variable names to be consistent gps.py defines variable names for the kernel parameters and number of points for the GPs. This change updates the configuration file and pipeline to be in line with that file --- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 7acfaa16..34a726e0 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -3,6 +3,6 @@ analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name : "pipeline-test" data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters -ngp : 1100 -initheta : [500., 20.] +number_gp : 1100 +kernel_param : [500., 20.] number_of_principle_components : 200 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 5b18e45e..f58217fd 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -111,8 +111,8 @@ def load_configuration_file(path_to_configuration_file): >>> data_path = params.get("data_path", None) >>> print(data_path) - >>> ngp = params.get("ngp", None) - >>> print(ngp) + >>> number_gp = params.get("number_gp", None) + >>> print(number_gp) """ try: @@ -251,7 +251,7 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): print("Dataset reduced to {} objects".format(training_data.object_names.shape[0])) -def wavelet_decomposition(training_data, ngp, **kwargs): +def wavelet_decomposition(training_data, number_gp, **kwargs): """ Load from disk the training data one will use for this analysis Parameters @@ -274,16 +274,17 @@ def wavelet_decomposition(training_data, ngp, **kwargs): Examples -------- >>> ... 
- >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + >>> waveout, waveout_err, wavelet_object = + wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() """ - wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp) print("WAV = {}\n".format(wavelet_object.wav)) print("MLEV = {}\n".format(wavelet_object.mlev)) - print("NGP = {}\n".format(ngp)) + print("number_gp = {}\n".format(number_gp)) waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) return waveout, waveout_err, wavelet_object @@ -421,8 +422,8 @@ def make_predictions(location_of_test_data, classifier): analysis_name = params.get("analysis_name", None) # snmachine parameters - ngp = params.get("ngp", None) - initheta = params.get("initheta", None) + number_gp = params.get("number_gp", None) + kernel_param = params.get("kernel_param", None) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -454,13 +455,13 @@ def make_predictions(location_of_test_data, classifier): training_data = load_training_data(data_path) # Step 5. Compute GPs - gps.compute_gps(training_data, number_gp=ngp, t_min=0, t_max=1100, - kernel_param=initheta, + gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, + kernel_param=kernel_param, output_root=dirs['intermediate_files_directory'], number_processes=nprocesses) # Step 6. Extract wavelet coeffiencts - waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("intermediate_files_directory")) # Step 7. Reduce dimensionality of wavelets by using only N principle components From cdf659d94dc601394a9260b6e32e9f42eee49c94 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 17 May 2019 19:56:59 +0100 Subject: [PATCH 17/58] Updating new var name to be consistant with gps.py --- snmachine/gps.py | 6 +-- snmachine/snaugment.py | 2 +- snmachine/snclassifier.py | 10 ++--- snmachine/snfeatures.py | 85 +++++++++++++++++++------------------- test/snclassifier_test.py | 2 +- test/snfeatures_test.py | 32 +++++++------- utils/plasticc_pipeline.py | 10 ++--- 7 files changed, 74 insertions(+), 73 deletions(-) diff --git a/snmachine/gps.py b/snmachine/gps.py index 5b965756..3dec7868 100644 --- a/snmachine/gps.py +++ b/snmachine/gps.py @@ -56,7 +56,7 @@ def compute_gps(dataset, number_gp, t_min, t_max, kernel_param=[500., 20.], outp output_root : {None, str}, optional If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there. number_processes : int, optional - Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1. + Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1. gp_algo : str, optional which gp package is used for the Gaussian Process Regression, GaPP or george """ @@ -148,7 +148,7 @@ def _compute_gps_parallel(dataset, number_gp, t_min, t_max, kernel_param, output output_root : {None, str}, optional If None, don't save anything. 
If str, it is the output directory, so save the flux and error estimates and used kernels there. number_processes : int, optional - Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1. + Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1. gp_algo : str, optional which gp package is used for the Gaussian Process Regression, GaPP or george """ @@ -413,4 +413,4 @@ def get_kernel(kernel_name, kernel_param): elif kernel_name == 'ExpSquared+ExpSine2': kExpSine2 = kernel_param[4]*george.kernels.ExpSine2Kernel(gamma=kernel_param[5],log_period=kernel_param[6]) kernel = kExpSquared + kExpSine2 - return kernel \ No newline at end of file + return kernel diff --git a/snmachine/snaugment.py b/snmachine/snaugment.py index 4e582cd8..564a6f96 100644 --- a/snmachine/snaugment.py +++ b/snmachine/snaugment.py @@ -123,7 +123,7 @@ def extract_proxy_features(self,peak_filter='desr',nproc=1,fit_salt2=False,salt2 #tf=snfeatures.TemplateFeatures(sampler='leastsq') tf=snfeatures.TemplateFeatures(sampler=sampler) if salt2feats is None: - salt2feats=tf.extract_features(self.dataset,nprocesses=nproc,use_redshift=fix_redshift) + salt2feats=tf.extract_features(self.dataset,number_processes=nproc,use_redshift=fix_redshift) #fit models and extract r-peakmags peaklogflux=[] diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py index 0e3a4cd2..185356a1 100644 --- a/snmachine/snclassifier.py +++ b/snmachine/snclassifier.py @@ -608,7 +608,7 @@ def __call_classifier(classifier, X_train, y_train, X_test, param_dict, return_c def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', 'knn', 'svm', 'neural_network', 'boost_dt'], - training_set=0.3, param_dict={}, nprocesses=1, scale=True, + training_set=0.3, param_dict={}, number_processes=1, scale=True, plot_roc_curve=True, return_classifier=False, classifiers_for_cm_plots=[], type_dict=None, seed=1234): """ @@ -632,7 +632,7 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', the ID's of the objects to be used param_dict : dict, optional Use to run different ranges of hyperparameters for the classifiers when optimising - nprocesses : int, optional + number_processes : int, optional Number of processors for multiprocessing (shared memory only). Each classifier will then be run in parallel. scale : bool, optional Rescale features using sklearn's preprocessing Scalar class (highly recommended this is True) @@ -707,15 +707,15 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', probabilities = {} classifier_objects = {} - if nprocesses > 1 and return_classifier: + if number_processes > 1 and return_classifier: print("Due to limitations with python's multiprocessing module, classifier objects cannot be returned if " \ "multiple processors are used. 
Continuing serially...") print() - if nprocesses > 1 and not return_classifier: + if number_processes > 1 and not return_classifier: partial_func=partial(__call_classifier, X_train=X_train, y_train=y_train, X_test=X_test, param_dict=param_dict, return_classifier=False) - p = Pool(nprocesses, maxtasksperchild=1) + p = Pool(number_processes, maxtasksperchild=1) result = p.map(partial_func, classifiers) for i in range(len(result)): diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index d1b6ee39..6e903c21 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -686,7 +686,7 @@ def __init__(self, model=['Ia'], sampler='leastsq',lsst_bands=False,lsst_dir='.. 'nugent-sn2l':{'z':(0.01, 1.5)}, 'nugent-sn1bc':{'z':(0.01, 1.5)}} - def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, nprocesses=1, restart=False, seed=-1): + def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, number_processes=1, restart=False, seed=-1): """ Extract template features for a dataset. @@ -700,7 +700,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r Where to save the chains use_redshift : bool Whether or not to use provided redshift when fitting objects - nprocesses : int, optional + number_processes : int, optional Number of processors to use for parallelisation (shared memory only) restart : bool Whether or not to restart from multinest chains @@ -736,7 +736,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r output = Table(names=labels, dtype=['U32'] + ['f'] * (len(labels) - 1)) k=0 - if nprocesses<2: + if number_processes<2: for obj in d.object_names: if k%100==0: print (k, 'objects fitted') @@ -779,7 +779,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r else: if self.sampler=='leastsq': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_leastsq_templates, d=d, model_name=self.templates[mod_name], use_redshift=use_redshift, bounds=self.bounds[self.templates[mod_name]]) out=p.map(partial_func, d.object_names) output=out[0] @@ -790,7 +790,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r else: all_output=vstack((all_output, output)) elif self.sampler=='nested': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_multinest_templates, d=d, model_name=self.templates[mod_name], bounds=self.bounds[self.templates[mod_name]], chain_directory=chain_directory, nlp=1000, convert_to_binary=True, use_redshift=use_redshift, short_name=self.short_names[mod_name], restart=restart, seed=seed) out=p.map(partial_func, d.object_names) @@ -914,7 +914,7 @@ def __init__(self, model_choice, sampler='leastsq', limits=None): - def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, nprocesses=1, n_walkers=100, + def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, number_processes=1, n_walkers=100, n_steps=500, walker_spread=0.1, burn=50, nlp=1000, starting_point=None, convert_to_binary=True, n_iter=0, restart=False, seed=-1): """ Fit parametric models and return best-fitting parameters as features. @@ -930,7 +930,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte n_attempts : int Allow the minimiser to start in new random locations if the fit is bad. 
Put n_attempts=1 to fit only once with the default starting position. - nprocesses : int, optional + number_processes : int, optional Number of processors to use for parallelisation (shared memory only) n_walkers : int emcee parameter - number of walkers to use @@ -963,7 +963,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte output=[] #obj=d.object_names[0] - if nprocesses<2: + if number_processes<2: k=0 for obj in d.object_names: if k%100==0: @@ -984,14 +984,14 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte k+=1 else: if self.sampler=='leastsq': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_leastsq, d=d, model=self.model, n_attempts=n_attempts, seed=seed) out=p.map(partial_func, d.object_names) output=out[0] for i in range(1, len(out)): output=vstack((output, out[i])) elif self.sampler=='nested': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_multinest, d=d, model=self.model,chain_directory=chain_directory, nlp=nlp, convert_to_binary=convert_to_binary, n_iter=n_iter, restart=restart, seed=seed) #Pool starts a number of threads, all of which may try to tackle all of the data. Better to take it in chunks @@ -999,7 +999,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte k=0 objs=d.object_names while k>> ... >>> waveout, waveout_err, wavelet_object = - wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, + wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() @@ -406,8 +406,8 @@ def make_predictions(location_of_test_data, classifier): if __name__ == "__main__": # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() - print("Running with {} cores".format(nprocesses)) + number_processes = multiprocessing.cpu_count() + print("Running with {} cores".format(number_processes)) parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') @@ -458,10 +458,10 @@ def make_predictions(location_of_test_data, classifier): gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, kernel_param=kernel_param, output_root=dirs['intermediate_files_directory'], - number_processes=nprocesses) + number_processes=number_processes) # Step 6. Extract wavelet coeffiencts - waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) # Step 7. Reduce dimensionality of wavelets by using only N principle components From 5fb499e434e815db8c1e2df39a54ec1c7d713561 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 19:03:51 +0100 Subject: [PATCH 18/58] Reducing number of PCA components Reducing the number of PCA components from 200 to 10 as it is required that number of components be less than or equal to the number of objects. Thus for the dataset used here "training_set_snia.pickle" 10 is appropriate. This should fix this error: Running PCA... 
The condition number in the SVD is 1.02688179587e+23 and the normalized one is 5.00036575467e+22 Traceback (most recent call last): File "plasticc_pipeline.py", line 469, in tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 2005, in extract_pca normalize_variance=normalize_variance) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 1873, in _pca return self.pca_SVD(dataMatrix, ncomp, tol, normalize_variance) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 1714, in pca_SVD assert isinstance(tol, np.float) AssertionError --- utils/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/config.yml b/utils/config.yml index 34a726e0..66715648 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -5,4 +5,4 @@ data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/trai # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] -number_of_principle_components : 200 +number_of_principle_components : 10 From 8bb380c990679ed391fdbfea114309f3cb3152c5 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 09:03:03 +0100 Subject: [PATCH 19/58] Adding None return if key-value not found If one attempts to call the "method_directory" parameter from the dictionary but it does not exist, a None type return will occur --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 1fbc2628..336eeea7 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -67,7 +67,7 @@ def create_folder_structure(analysis_directory, analysis_name): >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) >>> directories = create_folder_structure(analysis_directory, analysis_name) - >>> print(directories.get("method_directory")) + >>> print(directories.get("method_directory", None)) """ # Append Git has to analysis name From 429355bd8fda2a848da99624af9cf65f13924f49 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 09:08:59 +0100 Subject: [PATCH 20/58] Removing unnecessary print statements --- utils/plasticc_pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 336eeea7..10f569ef 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -180,7 +180,6 @@ def load_training_data(data_path): print("Dataset loaded from pickle file as: {}".format(training_data)) else: folder_path, train_data_file_name = os.path.split(data_path) - print(folder_path, train_data_file_name) meta_data_file_name = "_metadata.".join(train_data_file_name.split(".")) print("Opening from CSV") From f84d22bd986e3cf2ee643483c9a5423b72393d18 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 21:12:51 +0100 Subject: [PATCH 21/58] Adding timestamp helper function This function is used to determine the last modified time of the configuration file that is being used and to place this in the name of the analysis run. 
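The traceback quoted above accompanies the change that drops number_of_principle_components from 200 to 10, since the number of components must not exceed the number of objects in the training set. A defensive guard of the following form is purely illustrative and not part of the patch; it assumes training_data exposes object_names as elsewhere in the pipeline.

    # Illustrative guard: never request more principal components than objects.
    number_of_objects = len(training_data.object_names)
    if number_of_principal_components > number_of_objects:
        raise ValueError("Requested {} principal components but only {} objects "
                         "are available".format(number_of_principal_components,
                                                number_of_objects))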
--- utils/plasticc_pipeline.py | 41 +++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 10f569ef..0c020415 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -43,7 +43,32 @@ def get_git_revision_short_hash(): return _hash.decode("utf-8").rstrip() -def create_folder_structure(analysis_directory, analysis_name): +def get_timestamp(path_to_configuration_file): + """ Helper function to obtain latest modified time of the configuration file + + Parameters + ---------- + path_to_configuration_file : str + System path to where the configuration file is located + + Returns + ------- + timestamp : str + Short representation of last modified time for the configuration file used. + 'YYYY-MM-DD-HOURMINUTE' + + Examples + -------- + >>> ... + >>> timestamp = get_timestamp(path_to_configuration_file) + >>> print(timestamp) + '2019-05-18-2100' + """ + _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) + return _timestamp + + +def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): """ Make directories that will be used for analysis Parameters @@ -66,17 +91,17 @@ def create_folder_structure(analysis_directory, analysis_name): >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) - >>> directories = create_folder_structure(analysis_directory, analysis_name) + >>> directories = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) >>> print(directories.get("method_directory", None)) """ - # Append Git has to analysis name - analysis_name = analysis_name + "-" + get_git_revision_short_hash() + # Prepend last modified time of configuration file and git SHA to analysis name + analysis_name = get_timestamp(path_to_configuration_file) + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') classifications_directory = os.path.join(method_directory, 'classifications') - intermediate_files_directory = os.path.join(method_directory, 'intermediate') + intermediate_files_directory = os.path.join(method_directory, 'intermediate_files') plots_directory = os.path.join(method_directory, 'plots') dirs = {"method_directory": method_directory, "features_directory": features_directory, @@ -414,7 +439,9 @@ def make_predictions(location_of_test_data, classifier): arguments = parser.parse_args() arguments = vars(arguments) - params = load_configuration_file(arguments['configuration']) + path_to_configuration_file = arguments['configuration'] + + params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) @@ -426,7 +453,7 @@ def make_predictions(location_of_test_data, classifier): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name) + dirs = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) # Step 2. Save configuration file used for this analysis save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. 
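The get_timestamp helper introduced in the patch above shells out to `date +%Y-%m-%d-%H%M -r <config file>`. A pure-Python equivalent based on the file's modification time would avoid both the subprocess call and the bytes-versus-str decoding fixed in the following patch; the sketch below is an alternative, not what the series implements, and the helper name is hypothetical.

    import os
    from datetime import datetime

    def get_timestamp_from_mtime(path_to_configuration_file):
        # Last-modified time of the configuration file, formatted as
        # 'YYYY-MM-DD-HHMM' to match the output of get_timestamp() above.
        mtime = os.path.getmtime(path_to_configuration_file)
        return datetime.fromtimestamp(mtime).strftime("%Y-%m-%d-%H%M")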
From be45b5957187e8d632e382a10eb8883e11342917 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 21:15:45 +0100 Subject: [PATCH 22/58] Fixes Type error: can't concat str to bytes --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 0c020415..be9532ff 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -65,7 +65,7 @@ def get_timestamp(path_to_configuration_file): '2019-05-18-2100' """ _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) - return _timestamp + return _timestamp.decode("utf-8").rstrip() def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): From fa0373eb9066db1ccc2b82e80ffeff8b0e37094f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 12:03:01 +0100 Subject: [PATCH 23/58] Updating path to features directory for wavelets --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 66715648..1a25addd 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_train_data.pckl" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index be9532ff..52bdba6e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -492,7 +492,7 @@ def make_predictions(location_of_test_data, classifier): # Step 7. Reduce dimensionality of wavelets by using only N principle components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, - tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) + tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) # Step 8. TODO Combine snmachine features with user defined features # Step 9. TODO Create a Random Forest classifier; need to fit model and From d16bc3ef06db6d52a48dbc36ebc1781498c3868e Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 12:14:12 +0100 Subject: [PATCH 24/58] Fixing spelling error for 'Principal' in PCA --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 1a25addd..ab511dc2 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -5,4 +5,4 @@ data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_ # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] 
-number_of_principle_components : 10 +number_of_principal_components : 10 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 52bdba6e..0320f271 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -332,7 +332,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): Parameters ---------- reduced_wavelet_features : numpy.ndarray - These are the N principle components from the uncompressed wavelets + These are the N principal components from the uncompressed wavelets dataframe : pandas.DataFrame Dataframe @@ -490,7 +490,7 @@ def make_predictions(location_of_test_data, classifier): waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) - # Step 7. Reduce dimensionality of wavelets by using only N principle components + # Step 7. Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) From 699b9028d4810930c3b974f963be40c40b7fea83 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 14:01:19 +0100 Subject: [PATCH 25/58] Converting wavelet features to pandas dataframe --- utils/plasticc_pipeline.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 0320f271..eb093e60 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -356,7 +356,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): return combined_features -def create_classififer(combined_features, random_state=42): +def create_classifier(combined_features, training_data, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -374,12 +374,16 @@ def create_classififer(combined_features, random_state=42): Examples -------- >>> ... - >>> classifier, confusion_matrix = create_classififer(combined_features) + >>> classifier, confusion_matrix = create_classifier(combined_features) >>> print(classifier) >>> plot_confusion_matrix(confusion_matrix) """ + # TODO: This is temporary while the pipeline is tested. + if isinstance(combined_features, np.ndarray): + features_pd = pd.DataFrame(combined_features, index=training_data.object_names) + features_pd['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -463,7 +467,7 @@ def make_predictions(location_of_test_data, classifier): # Restart from saved uncompressed wavelets. wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") combined_features = combine_all_features(wavelet_features, data_path) - classifer = create_classififer(combined_features) + classifer = create_classifier(combined_features) elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass @@ -495,7 +499,8 @@ def make_predictions(location_of_test_data, classifier): tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) # Step 8. TODO Combine snmachine features with user defined features - # Step 9. TODO Create a Random Forest classifier; need to fit model and - # save it. + # Step 9. 
TODO Create a Random Forest classifier; need to fit model and save it. + combined_features = wavelet_features # For running tests for now + create_classifier(combined_features, training_data) # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From 27a0f6e894882a8078e955b3dd30a48091355893 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 15:17:25 +0100 Subject: [PATCH 26/58] Updating confusion matrix functions This function now displays the confusion matrix as ASCII table in console as well as returning seaborn figure --- utils/plasticc_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index c17a2d5b..e56d1852 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -9,20 +9,26 @@ import seaborn as sns -def plot_confusion_matrix(yTrue, yPredict, dataName, targetNames): - cm = confusion_matrix(yTrue, yPredict, labels=targetNames) - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] +def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False): + cm = confusion_matrix(y_true, y_pred, labels=target_names) + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + print(cm) + annot = np.around(cm, 2) fig, ax = plt.subplots(figsize=(9, 7)) - sns.heatmap(cm, xticklabels=targetNames, - yticklabels=targetNames, cmap='Blues', + sns.heatmap(cm, xticklabels=target_names, + yticklabels=target_names, cmap='Blues', annot=annot, lw=0.5) ax.set_xlabel('Predicted Label') ax.set_ylabel('True Label') ax.set_aspect('equal') - plt.title(dataName) + plt.title(title) return cm From 2b19babacd027c078dfdfc2ab384fda9b4b89659 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 15:18:43 +0100 Subject: [PATCH 27/58] Updates made to 'create_classifier' functions Removal of Log Loss function call as well as stripping unused functions within 'create_classifier' --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 79 ++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index ab511dc2..84456cb7 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_train_data.pckl" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index eb093e60..479fcf0b 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -293,18 +293,20 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- - None + waveout: + + waveout_err: + + wavelet_object: Examples -------- >>> ... 
- >>> waveout, waveout_err, wavelet_object = - wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, + >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() """ - wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp) print("WAV = {}\n".format(wavelet_object.wav)) print("MLEV = {}\n".format(wavelet_object.mlev)) @@ -312,16 +314,6 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) return waveout, waveout_err, wavelet_object -# def merge_features(some_features, other_features): -# # TODO: Move this to a data processing file -# if type(some_features) != pd.core.frame.DataFrame: -# some_features = some_features.to_pandas() -# if type(other_features) != pd.core.frame.DataFrame: -# other_features = other_features.to_pandas() -# merged_df = pd.merge(some_features, other_features) -# merged_df.set_index("Object", inplace=True) -# return merged_df - def combine_all_features(reduced_wavelet_features, dataframe): # TODO: Improve docstrings. Discuss whether the user should pass in a CSV @@ -351,8 +343,18 @@ def combine_all_features(reduced_wavelet_features, dataframe): >>> print(shape.combined_features) """ - meta_df = dat.metadata - combined_features = merge_features(wavelet_features, meta_df) +# def merge_features(some_features, other_features): +# # TODO: Move this to a data processing file +# if type(some_features) != pd.core.frame.DataFrame: +# some_features = some_features.to_pandas() +# if type(other_features) != pd.core.frame.DataFrame: +# other_features = other_features.to_pandas() +# merged_df = pd.merge(some_features, other_features) +# merged_df.set_index("Object", inplace=True) +# return merged_df + +# meta_df = dat.metadata +# combined_features = merge_features(wavelet_features, meta_df) return combined_features @@ -381,9 +383,13 @@ def create_classifier(combined_features, training_data, random_state=42): """ # TODO: This is temporary while the pipeline is tested. 
+ print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) if isinstance(combined_features, np.ndarray): - features_pd = pd.DataFrame(combined_features, index=training_data.object_names) - features_pd['target'] = training_data.labels.values + combined_features = pd.DataFrame(combined_features, index=training_data.object_names) + combined_features['target'] = training_data.labels.values + else: + combined_features = combined_features.to_pandas() + combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -398,31 +404,14 @@ def create_classifier(combined_features, training_data, random_state=42): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) - classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', - oob_score=True, n_jobs=-1, - random_state=random_state) - + classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) classifer.fit(X_train, y_train) y_preds = classifer.predict(X_test) - - confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names) + confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) y_probs = classifer.predict_proba(X_test) - nlines = len(target_names) - # we also need to express the truth table as a matrix - sklearn_truth = np.zeros((len(y_test), nlines)) - label_index_map = dict(zip(classifer.classes_, np.arange(nlines))) - for i, x in enumerate(y_test): - sklearn_truth[i][label_index_map[y_test[i]]] = 1 - - weights = np.array([1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/18, 1/19]) - - # weights[:-1] to ignore last class, the anomaly class - log_loss = plasticc_log_loss(sklearn_truth, y_probs, relative_class_weights=weights[:-1]) - print("LogLoss: {:.3f}\nBest Params: {}".format(log_loss, classifer.get_params)) - return classifer, confusion_matrix @@ -448,12 +437,15 @@ def make_predictions(location_of_test_data, classifier): params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) + print(data_path) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) # snmachine parameters number_gp = params.get("number_gp", None) + print(number_gp) kernel_param = params.get("kernel_param", None) + print(kernel_param) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -483,6 +475,7 @@ def make_predictions(location_of_test_data, classifier): # Step 4. Load in training data training_data = load_training_data(data_path) + print(training_data) # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, @@ -493,14 +486,24 @@ def make_predictions(location_of_test_data, classifier): # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) + print(waveout) + print(type(waveout)) + print(waveout_err) + print(type(waveout_err)) + print(wavelet_object) + print(type(wavelet_object)) # Step 7. 
Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) + print(wavelet_features) + print(type(wavelet_features)) # Step 8. TODO Combine snmachine features with user defined features + # Step 9. TODO Create a Random Forest classifier; need to fit model and save it. combined_features = wavelet_features # For running tests for now - create_classifier(combined_features, training_data) + classifer = create_classifier(combined_features, training_data) + print(classifer.best_params_) # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From a267363c4a0954e00d77c9702351458cf111a07d Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 10:18:20 +0100 Subject: [PATCH 28/58] Save SHA and timestamp inside copy of config file Previously one would prepend the hash and timestamp to the folder, but this became too verbose Adding checks if analysis name already created This should see if the user has already defined an existing analysis name already and check to see if they want to overwrite the results in that folder, or create a new one. --- utils/plasticc_pipeline.py | 46 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 479fcf0b..69528c7a 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import os +import sys import subprocess import multiprocessing import yaml @@ -60,15 +61,15 @@ def get_timestamp(path_to_configuration_file): Examples -------- >>> ... - >>> timestamp = get_timestamp(path_to_configuration_file) + >>> timestamp = get_timestamp() >>> print(timestamp) '2019-05-18-2100' """ - _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) + _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M']) return _timestamp.decode("utf-8").rstrip() -def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): +def create_folder_structure(analysis_directory, analysis_name): """ Make directories that will be used for analysis Parameters @@ -91,12 +92,12 @@ def create_folder_structure(analysis_directory, analysis_name, path_to_configura >>> ... 
>>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) - >>> directories = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) + >>> directories = create_folder_structure(analysis_directory, analysis_name) >>> print(directories.get("method_directory", None)) """ # Prepend last modified time of configuration file and git SHA to analysis name - analysis_name = get_timestamp(path_to_configuration_file) + "-" + get_git_revision_short_hash() + "-" + analysis_name + # analysis_name = get_timestamp() + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -108,8 +109,29 @@ def create_folder_structure(analysis_directory, analysis_name, path_to_configura "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, "plots_directory": plots_directory} - for key, value in dirs.items(): - subprocess.call(['mkdir', value]) + if os.path.isdir(method_directory): + errmsg = """ + Folders already exist with this analysis name. + + Are you sure you would like to proceed, this will overwrite the + {} folder [Y/n] + """.format(analysis_name) + raise OSError(errmsg) + + _yes = ["yes", "y", "ye"] + _no = ["no", "n"] + + choice = input().lower() + + if choice in _yes: + print("I am sure") + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) + elif choice in _no: + print("I am NOT sure") + sys.exit() + else: + sys.stdout.write("Please respond with 'yes' or 'no'") return dirs @@ -145,7 +167,7 @@ def load_configuration_file(path_to_configuration_file): params = yaml.load(f) except IOError: print("Invalid yaml file provided") - exit() + sys.exit() print("The parameters are:\n {}".format(params)) return params @@ -171,6 +193,12 @@ def save_configuration_file(method_directory): >>> print() """ + git_hash = {"git_hash": get_git_revision_short_hash()} + timestamp = {"timestamp": get_timestamp(path_to_configuration_file)} + + params.update(git_hash) + params.update(timestamp) + with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) @@ -449,7 +477,7 @@ def make_predictions(location_of_test_data, classifier): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) + dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. 
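The overwrite check added in the patch above prompts only once and, if the reply is not recognised, prints a reminder without asking again. A common refinement is to keep prompting until a valid answer is given; the helper below is an illustrative sketch with a hypothetical name, not part of the patch.

    def confirm_overwrite(analysis_name):
        # Keep asking until the user answers yes or no, mirroring the _yes/_no
        # lists used in create_folder_structure above.
        _yes = {"yes", "y", "ye"}
        _no = {"no", "n"}
        while True:
            choice = input("Overwrite existing '{}' analysis? [y/n] ".format(analysis_name)).lower()
            if choice in _yes:
                return True
            if choice in _no:
                return False
            print("Please respond with 'yes' or 'no'")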
From 3ad79b49fc5b08827434b886a5007bfb91161be7 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 11:27:15 +0100 Subject: [PATCH 29/58] Remove unused function argument Sending stderr to /dev/null if folder overwritten --- utils/plasticc_pipeline.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 69528c7a..a796b4d9 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -44,14 +44,9 @@ def get_git_revision_short_hash(): return _hash.decode("utf-8").rstrip() -def get_timestamp(path_to_configuration_file): +def get_timestamp(): """ Helper function to obtain latest modified time of the configuration file - Parameters - ---------- - path_to_configuration_file : str - System path to where the configuration file is located - Returns ------- timestamp : str @@ -116,7 +111,7 @@ def create_folder_structure(analysis_directory, analysis_name): Are you sure you would like to proceed, this will overwrite the {} folder [Y/n] """.format(analysis_name) - raise OSError(errmsg) + print(errmsg) _yes = ["yes", "y", "ye"] _no = ["no", "n"] @@ -124,9 +119,9 @@ def create_folder_structure(analysis_directory, analysis_name): choice = input().lower() if choice in _yes: - print("I am sure") + print("Overwriting existing folder..") for key, value in dirs.items(): - subprocess.call(['mkdir', value]) + subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL) elif choice in _no: print("I am NOT sure") sys.exit() @@ -194,7 +189,7 @@ def save_configuration_file(method_directory): """ git_hash = {"git_hash": get_git_revision_short_hash()} - timestamp = {"timestamp": get_timestamp(path_to_configuration_file)} + timestamp = {"timestamp": get_timestamp()} params.update(git_hash) params.update(timestamp) From 9c8d870cbba2023555ebc18dfcbd549d69d96c8f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 12:50:33 +0100 Subject: [PATCH 30/58] Updating docstrings --- utils/plasticc_pipeline.py | 81 ++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index a796b4d9..7b2750ac 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -91,8 +91,6 @@ def create_folder_structure(analysis_directory, analysis_name): >>> print(directories.get("method_directory", None)) """ - # Prepend last modified time of configuration file and git SHA to analysis name - # analysis_name = get_timestamp() + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -132,7 +130,6 @@ def create_folder_structure(analysis_directory, analysis_name): def load_configuration_file(path_to_configuration_file): - # TODO: Finish doctring examples """ Load from disk the configuration file that is to be used Parameters @@ -150,12 +147,12 @@ def load_configuration_file(path_to_configuration_file): Each item inside the configuration file can be accessed like so: >>> ... 
>>> params = load_configuration_file(path_to_configuration_file) - >>> data_path = params.get("data_path", None) - >>> print(data_path) - + >>> kernel_param = params.get("kernel_param", None) + >>> print(kernel_param) + [500.0, 20.0] >>> number_gp = params.get("number_gp", None) >>> print(number_gp) - + '1100' """ try: with open(path_to_configuration_file) as f: @@ -168,7 +165,6 @@ def load_configuration_file(path_to_configuration_file): def save_configuration_file(method_directory): - # TODO: Provide a doctring example """ Make a copy of the configuration file that has been used inside the analysis directory @@ -185,9 +181,19 @@ def save_configuration_file(method_directory): -------- >>> ... >>> save_configuration_file(method_directory) - >>> print() - + >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) + analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ + analysis_name: pipeline-test + data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle + git_hash: 916eaec + kernel_param: + - 500.0 + - 20.0 + number_gp: 1100 + number_of_principal_components: 10 + timestamp: 2019-05-21-1204 """ + git_hash = {"git_hash": get_git_revision_short_hash()} timestamp = {"timestamp": get_timestamp()} @@ -199,7 +205,6 @@ def save_configuration_file(method_directory): def load_training_data(data_path): - # TODO: Finish doctring examples """ Load from disk the training data one will use for this analysis Parameters @@ -218,8 +223,9 @@ def load_training_data(data_path): >>> ... >>> training_data = load_training_data(params) >>> print(training_data) - + """ + try: if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): with open(data_path, 'rb') as input: @@ -299,20 +305,27 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): def wavelet_decomposition(training_data, number_gp, **kwargs): - """ Load from disk the training data one will use for this analysis + """ Wrapper function for `snmachine.snfeatures.WaveletFeatures`. This + performs a wavelet decomposition on training data evaluated at 'number_gp' + points on a light curve Parameters ---------- training_data : snmachine.PlasticcData Dictionary containing the parameters that reside in the configuration file. This will be used to obtain the path to the training data. - dirs : dict - Dictionary containing - subset_size : int - Number of objects the user would like to reduce the training data to - seed : int - Default set to 1234. This can be overridden by the user to check for - consistancy of results + number_gp : int + Number of points on the light curve to do wavelet analysis. Note, this + should be an even number for the wavelet decomposition to be able to be + performed. + number_processes : int + Number CPU cores avaiable to the user, this is how many cores the + decomposition will take place over + save_output : string + String defining what should be saved. See docs in + `snmachine.snfeatures.extract_wavelets` for more details on options. 
+ output_root : string + Path to where one would like the uncompressed wavelet files to be stored Returns ------- @@ -456,19 +469,15 @@ def make_predictions(location_of_test_data, classifier): arguments = vars(arguments) path_to_configuration_file = arguments['configuration'] - params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) - print(data_path) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) # snmachine parameters number_gp = params.get("number_gp", None) - print(number_gp) kernel_param = params.get("kernel_param", None) - print(kernel_param) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -498,7 +507,7 @@ def make_predictions(location_of_test_data, classifier): # Step 4. Load in training data training_data = load_training_data(data_path) - print(training_data) + print("training_data = {}".format(training_data)) # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, @@ -509,24 +518,26 @@ def make_predictions(location_of_test_data, classifier): # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) - print(waveout) - print(type(waveout)) - print(waveout_err) - print(type(waveout_err)) - print(wavelet_object) - print(type(wavelet_object)) + print("waveout = {}".format(waveout)) + print("waveout, type = {}".format(type(waveout))) + + print("waveout_err = {}".format(waveout_err)) + print("waveout_err, type = {}".format(type(waveout_err))) + + print("wavelet_object = {}".format(wavelet_object)) + print("wavelet_object, type = {}".format(type(wavelet_object))) # Step 7. Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) - print(wavelet_features) - print(type(wavelet_features)) + print("wavelet_features = {}".format(wavelet_features)) + print("wavelet_features, type = {}".format(type(wavelet_features))) # Step 8. TODO Combine snmachine features with user defined features # Step 9. TODO Create a Random Forest classifier; need to fit model and save it. combined_features = wavelet_features # For running tests for now - classifer = create_classifier(combined_features, training_data) - print(classifer.best_params_) + classifier = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From d2dd8435b2835c4267391daf69a3986f3617e043 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 13:46:33 +0100 Subject: [PATCH 31/58] Adding _to_pandas() helper functions This function should be able to convert to either numpy or astropy.Table to a pandas.DataFrame. 
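As a rough standalone illustration of that conversion (not the patch's _to_pandas helper itself), either input type can be turned into a DataFrame as below; the column names, object names and values are made up.

    import numpy as np
    import pandas as pd
    from astropy.table import Table

    def to_pandas_sketch(features, object_names=None):
        # numpy arrays: rows are objects, columns are features
        if isinstance(features, np.ndarray):
            return pd.DataFrame(features, index=object_names)
        # astropy Tables ship their own converter
        if isinstance(features, Table):
            return features.to_pandas()
        return pd.DataFrame(features)

    # Toy usage:
    table_features = Table({"C0": [0.1, 0.2], "C1": [1.3, 0.7]})
    print(type(to_pandas_sketch(table_features)))                      # pandas DataFrame
    print(type(to_pandas_sketch(np.eye(2), object_names=["a", "b"])))  # pandas DataFrame
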
Also updating docstrings --- utils/plasticc_pipeline.py | 66 +++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 7b2750ac..81025568 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -329,11 +329,12 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- - waveout: + waveout: numpy.ndarray - waveout_err: + waveout_err: numpy.ndarray + + wavelet_object: snmachine.snfeatures.WaveletFeatures object - wavelet_object: Examples -------- @@ -352,14 +353,13 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): def combine_all_features(reduced_wavelet_features, dataframe): - # TODO: Improve docstrings. Discuss whether the user should pass in a CSV - # instead? + # TODO: Improve docstrings. """ Combine snmachine wavelet features with PLASTICC features. The user should define a dataframe they would like to merge. Parameters ---------- - reduced_wavelet_features : numpy.ndarray + reduced_wavelet_features : astropy.table.table.Table These are the N principal components from the uncompressed wavelets dataframe : pandas.DataFrame Dataframe @@ -379,6 +379,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): >>> print(shape.combined_features) """ + # def merge_features(some_features, other_features): # # TODO: Move this to a data processing file # if type(some_features) != pd.core.frame.DataFrame: @@ -394,6 +395,39 @@ def combine_all_features(reduced_wavelet_features, dataframe): return combined_features +def _to_pandas(features): + # TODO: Improve docstrings. + """ Helper function to take either an astropy Table + or numpy ndarray and convert to a pandas DataFrame representation + + Parameters + ---------- + features: astropy.table.table.Table OR numpy.ndarray + This parameter can be either an astropy Table or numpy ndarray + representation of the wavelet features + + Returns + ------- + features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(type(features)) + + >>> features = _to_pandas(features) + >>> print(type(features)) + + """ + + if isinstance(features, np.ndarray): + features = pd.DataFrame(features, index=training_data.object_names) + else: + features = features.to_pandas() + + return features + + def create_classifier(combined_features, training_data, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -414,18 +448,18 @@ def create_classifier(combined_features, training_data, random_state=42): >>> ... >>> classifier, confusion_matrix = create_classifier(combined_features) >>> print(classifier) - - >>> plot_confusion_matrix(confusion_matrix) - + (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy', + max_depth=None, max_features='auto', max_leaf_nodes=None, + min_impurity_split=1e-07, min_samples_leaf=1, + min_samples_split=2, min_weight_fraction_leaf=0.0, + n_estimators=700, n_jobs=-1, oob_score=True, random_state=42, + verbose=0, warm_start=False), array([[ 1.]])) """ - # TODO: This is temporary while the pipeline is tested. 
+ print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) - if isinstance(combined_features, np.ndarray): - combined_features = pd.DataFrame(combined_features, index=training_data.object_names) - combined_features['target'] = training_data.labels.values - else: - combined_features = combined_features.to_pandas() - combined_features['target'] = training_data.labels.values + combined_features = _to_pandas(combined_features) + print("COMBINED_FEATURES_TYPE, after _to_pandas(): {}".format(type(combined_features))) + combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values From 0e4fe568095078eab22344987fbc598cc7311b66 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 13:53:39 +0100 Subject: [PATCH 32/58] Adding roc/auc metrics to create_classifier() --- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 84456cb7..43695d63 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +analysis_name : "pipeline-extragal" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_extragal.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 81025568..5b1742fe 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -5,6 +5,7 @@ from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import roc_curve, auc from argparse import ArgumentParser import numpy as np import pandas as pd @@ -414,10 +415,10 @@ def _to_pandas(features): -------- >>> ... >>> print(type(features)) - + >>> features = _to_pandas(features) >>> print(type(features)) - + """ if isinstance(features, np.ndarray): @@ -480,7 +481,12 @@ def create_classifier(combined_features, training_data, random_state=42): y_preds = classifer.predict(X_test) confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_preds) + roc_auc = auc(false_positive_rate, true_positive_rate) + print(F"ROC {roc_auc}") + y_probs = classifer.predict_proba(X_test) + print(y_probs) return classifer, confusion_matrix From 83f91f4778c5c0bc06331071fa3f91721a65be35 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 14:26:23 +0100 Subject: [PATCH 33/58] Fixing error of now new folder being created Due to a conditional check if a directory exists or not, it became apparent that if there was not a directory, no new ones were being created. This change fixes that. 
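The intended control flow reads roughly as below. This is only a sketch of the behaviour described here: os.makedirs stands in for the pipeline's subprocess mkdir calls, the prompt string is illustrative, and any answer other than yes simply aborts (a simplification of the patch's prompt handling).

    import os
    import sys

    def ensure_directories(directories, analysis_name):
        method_directory = directories["method_directory"]
        if os.path.isdir(method_directory):
            # Existing analysis: ask before reusing the folders
            choice = input("Overwrite the existing '{}' analysis? [y/n] ".format(analysis_name)).lower()
            if choice not in ("y", "ye", "yes"):
                sys.exit("Not overwriting existing analysis")
        # Create (or re-create) the full folder tree
        for path in directories.values():
            os.makedirs(path, exist_ok=True)

    # Toy usage:
    # ensure_directories({"method_directory": "/tmp/demo",
    #                     "plots_directory": "/tmp/demo/plots"}, "demo")
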
--- utils/plasticc_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 5b1742fe..3192b253 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -126,6 +126,9 @@ def create_folder_structure(analysis_directory, analysis_name): sys.exit() else: sys.stdout.write("Please respond with 'yes' or 'no'") + else: + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) return dirs From c5593d50ac49d47534a7d506ac06ecdb8858edac Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 15:41:13 +0100 Subject: [PATCH 34/58] Updating gitignore Do not track log files in utils folder --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 6875f042..0797322a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ test/* !test/*.py +# Do not track log files in utils +utils/*stdout.txt + ## Python.gitignore from Github. ## # Byte-compiled / optimized / DLL files From 382a251ea10826e221c47ddb704ef8038b3ec05f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:27:35 +0100 Subject: [PATCH 35/58] Updating save_configuration_file function Removing ROC and AUC metrics --- utils/plasticc_pipeline.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3192b253..2fbf4d6d 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -5,7 +5,6 @@ from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import roc_curve, auc from argparse import ArgumentParser import numpy as np import pandas as pd @@ -168,14 +167,16 @@ def load_configuration_file(path_to_configuration_file): return params -def save_configuration_file(method_directory): +def save_configuration_file(params, method_directory): """ Make a copy of the configuration file that has been used inside the analysis directory Parameters ---------- + params : dict + Dictionary containing the parameters used for this analysis method_directory : string - The folder path used for this analysis + Folder where this analysis is taking place Returns ------- @@ -184,7 +185,7 @@ def save_configuration_file(method_directory): Examples -------- >>> ... - >>> save_configuration_file(method_directory) + >>> save_configuration_file(params, method_directory) >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name: pipeline-test @@ -484,10 +485,6 @@ def create_classifier(combined_features, training_data, random_state=42): y_preds = classifer.predict(X_test) confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) - false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_preds) - roc_auc = auc(false_positive_rate, true_positive_rate) - print(F"ROC {roc_auc}") - y_probs = classifer.predict_proba(X_test) print(y_probs) @@ -526,7 +523,7 @@ def make_predictions(location_of_test_data, classifier): # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis - save_configuration_file(dirs.get("method_directory")) + save_configuration_file(params, dirs.get("method_directory")) # Step 3. 
Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument From 8ad52aaf3b7dd84fe517bc9cb3ed487937c035db Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:42:30 +0100 Subject: [PATCH 36/58] Adding option to save wavelet features to disk --- snmachine/snfeatures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 6e903c21..db28c0d2 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2025,6 +2025,8 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) + # Write the astropy table containing the wavelet features to disk + wavs.write(os.path.join(output_root, 'reduced_wavelet_features'), format='fits',overwrite=True) return wavs, vals, vec, M, s From 98867b928bd29cd2994c54dd82e42b6fb4123a1a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:50:49 +0100 Subject: [PATCH 37/58] Adding option to restart from saved wavelets --- utils/plasticc_pipeline.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 2fbf4d6d..e8ef8326 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -529,12 +529,16 @@ def make_predictions(location_of_test_data, classifier): # with command line argument if (arguments['restart_from'].lower() == "wavelets"): # Restart from saved uncompressed wavelets. - wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") - combined_features = combine_all_features(wavelet_features, data_path) - classifer = create_classifier(combined_features) + wavelet_features = Table.read(os.path.join(dirs.get("features_dir"), "reduced_wavelet_features.fits")) + combined_features = wavelet_features # For running tests for now + classifier, confusion_matrix = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. 
pass + elif (arguments['restart_from'].lower() == "pca"): + # Restart from saved PCA components + pass else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 80971f205ac94d82c8450347dfd21c79c70fdb8a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 14:15:20 +0100 Subject: [PATCH 38/58] Moving restart option to its own function call --- utils/plasticc_pipeline.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index e8ef8326..e5562c71 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -496,6 +496,22 @@ def make_predictions(location_of_test_data, classifier): pass +def restart_from_saved_gps(dirs): + pass + + +def restart_from_saved_wavelets(dirs): + pass + + +def restart_from_saved_pca(dirs): + # TODO: Write docstrings + wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_features.fits")) + combined_features = wavelet_features # For running tests for now + classifier, confusion_matrix = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") + + if __name__ == "__main__": # Set the number of processes you want to use throughout the notebook @@ -527,18 +543,15 @@ def make_predictions(location_of_test_data, classifier): # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument - if (arguments['restart_from'].lower() == "wavelets"): - # Restart from saved uncompressed wavelets. - wavelet_features = Table.read(os.path.join(dirs.get("features_dir"), "reduced_wavelet_features.fits")) - combined_features = wavelet_features # For running tests for now - classifier, confusion_matrix = create_classifier(combined_features, training_data) - print(F"classifier = {classifier}") - elif (arguments['restart_from'].lower() == "gps"): + if (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass + elif (arguments['restart_from'].lower() == "wavelets"): + # Restart from saved uncompressed wavelets. 
+ pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - pass + restart_from_saved_pca(dirs) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 42486c81d0e28b4bb200d6c6cea3ac8c8e054477 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 16:12:52 +0100 Subject: [PATCH 39/58] Return wavelet_components as a pandas DataFrame Instead of an Astropy Table, return as a pandas DataFrame to processing later in the pipeline --- snmachine/snfeatures.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index db28c0d2..71d4dd94 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2012,11 +2012,11 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, print('finish projecting PCA') # Now reformat the components as a table - labels = ['C%d' %i for i in range(number_comp)] - wavs = Table(comps, names=labels) + labels = ['C%d' %i for i in range(ncomp)] + wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) - wavs = hstack((objnames, wavs)) + wavelet_components = hstack((objnames, wavelet_components)) print('Time for PCA', time.time() - t1) if save_output: @@ -2025,10 +2025,11 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) - # Write the astropy table containing the wavelet features to disk - wavs.write(os.path.join(output_root, 'reduced_wavelet_features'), format='fits',overwrite=True) + # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe + wavelet_components = wavelet_components.to_pandas() + wavelet_components.pickle(os.path.join(output_root, 'wavelet_components_{}.pickle'.format(ncomp))) - return wavs, vals, vec, M, s + return wavelet_components, vals, vec, M, s def iswt(self, coefficients, wavelet): """ From d50ec445d88af21f571ddc74df2c8acfcd0f764c Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 16:13:34 +0100 Subject: [PATCH 40/58] Rearrange imports to be PEP8 compliant --- utils/plasticc_pipeline.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index e5562c71..48945a0e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -1,20 +1,15 @@ """ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase. """ -from plasticc_utils import plasticc_log_loss, plot_confusion_matrix -from astropy.table import Table -from sklearn.model_selection import train_test_split -from sklearn.ensemble import RandomForestClassifier -from argparse import ArgumentParser -import numpy as np -import pandas as pd +import multiprocessing import os -import sys import subprocess -import multiprocessing -import yaml +import sys import warnings -warnings.filterwarnings("ignore") + +import numpy as np +import pandas as pd +import yaml try: import cPickle as pickle except ModuleNotFoundError: @@ -24,6 +19,14 @@ except ImportError: print("Unable to import snmachine. 
Check environment set correctly") +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix +from astropy.table import Table +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from argparse import ArgumentParser + +warnings.filterwarnings("ignore") + def get_git_revision_short_hash(): """ Helper function to obtain current version control hash value @@ -461,9 +464,6 @@ def create_classifier(combined_features, training_data, random_state=42): verbose=0, warm_start=False), array([[ 1.]])) """ - print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) - combined_features = _to_pandas(combined_features) - print("COMBINED_FEATURES_TYPE, after _to_pandas(): {}".format(type(combined_features))) combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) From ae8d50e547b5ac9c0efe53b0851bc2f811fd4d67 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:34:34 +0100 Subject: [PATCH 41/58] Updating variable name --- snmachine/snfeatures.py | 10 +++++----- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 71d4dd94..0965dfc3 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2013,10 +2013,10 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, # Now reformat the components as a table labels = ['C%d' %i for i in range(ncomp)] - wavelet_components = Table(comps, names=labels) + reduced_wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) - wavelet_components = hstack((objnames, wavelet_components)) + reduced_wavelet_components = hstack((objnames, reduced_wavelet_components)) print('Time for PCA', time.time() - t1) if save_output: @@ -2026,10 +2026,10 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe - wavelet_components = wavelet_components.to_pandas() - wavelet_components.pickle(os.path.join(output_root, 'wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components = reduced_wavelet_components.to_pandas() + reduced_wavelet_components.pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) - return wavelet_components, vals, vec, M, s + return reduced_wavelet_components, vals, vec, M, s def iswt(self, coefficients, wavelet): """ diff --git a/utils/config.yml b/utils/config.yml index 43695d63..80951185 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -analysis_name : "pipeline-extragal" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_extragal.pickle" +analysis_name : "pipeline-sniabcii" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_sniabcii.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] 
diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 48945a0e..bc3de1e4 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -504,9 +504,9 @@ def restart_from_saved_wavelets(dirs): pass -def restart_from_saved_pca(dirs): +def restart_from_saved_pca(dirs, number_of_principal_components): # TODO: Write docstrings - wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_features.fits")) + wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") @@ -551,7 +551,7 @@ def restart_from_saved_pca(dirs): pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - restart_from_saved_pca(dirs) + restart_from_saved_pca(dirs, number_of_principal_components) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 89e3bf5bf3ba6b016ef2d2a015618b92d82f7034 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:40:30 +0100 Subject: [PATCH 42/58] Chaning file that logs parameters to be appending --- utils/plasticc_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index bc3de1e4..01310cd5 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -189,7 +189,7 @@ def save_configuration_file(params, method_directory): -------- >>> ... >>> save_configuration_file(params, method_directory) - >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) + >>> subprocess.call(['cat', os.path.join(method_directory, "logs.yml")]) analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name: pipeline-test data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle @@ -208,7 +208,7 @@ def save_configuration_file(params, method_directory): params.update(git_hash) params.update(timestamp) - with open(os.path.join(method_directory, "config.yml"), 'w') as config: + with open(os.path.join(method_directory, "logs.yml"), 'a') as config: yaml.dump(params, config, default_flow_style=False) From 91d84a65deb7ebd66c8b31331a3c277040e0b249 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:42:56 +0100 Subject: [PATCH 43/58] This will open file for reading/writing (updating) --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 01310cd5..4485bd83 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -208,7 +208,7 @@ def save_configuration_file(params, method_directory): params.update(git_hash) params.update(timestamp) - with open(os.path.join(method_directory, "logs.yml"), 'a') as config: + with open(os.path.join(method_directory, "logs.yml"), 'a+') as config: yaml.dump(params, config, default_flow_style=False) From 7e281e25dd5862551a35e81ae44559ce99fa553a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 18:19:18 +0100 Subject: [PATCH 44/58] Fixing typo in saving and reading pickled df --- snmachine/snfeatures.py | 2 +- utils/plasticc_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 0965dfc3..6970ad4f 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2027,7 +2027,7 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe reduced_wavelet_components = reduced_wavelet_components.to_pandas() - reduced_wavelet_components.pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) return reduced_wavelet_components, vals, vec, M, s diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 4485bd83..f54a85cf 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -506,7 +506,7 @@ def restart_from_saved_wavelets(dirs): def restart_from_saved_pca(dirs, number_of_principal_components): # TODO: Write docstrings - wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) + wavelet_features = pd.read_pickle(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") From e32f65e537d875049305e6f1fa2979849a808489 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:37:17 +0100 Subject: [PATCH 45/58] Including 'imbalanced-learn' package as dependency Required updating sklearn version --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 997fc7e1..aeb4e79e 100644 --- a/environment.yml +++ b/environment.yml @@ -11,12 +11,13 @@ dependencies: - jupyter>=1.0.0 - matplotlib>=1.5.1 - numpy=1.12.0 - - scikit-learn=0.18.1 + - scikit-learn>=0.20 - scipy>=0.17.0 - george>=0.3.0 - iminuit>=1.2 - pandas>=0.23.0 - extinction>=0.3.0 + - imbalanced-learn>=0.4.3 - pip: - emcee>=2.1.0 From 47f6125c9cbbeaf588847becd5a5992f9cf1bed2 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:38:28 +0100 Subject: [PATCH 46/58] Return figure aswell as confusion matrix from func --- utils/plasticc_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index e56d1852..3659cd4b 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -30,7 +30,7 @@ def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False): ax.set_aspect('equal') plt.title(title) - return cm + return cm, fig def plasticc_log_loss(y_true, y_pred, relative_class_weights=None): From 3242bf36033c50d118b57debe135308ac5f8157b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:39:06 +0100 Subject: [PATCH 47/58] Adding functionality to rebalance classes Also functionality save classifier and confusion matrix plot --- utils/plasticc_pipeline.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index f54a85cf..f8ede560 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -23,6 +23,9 @@ from astropy.table import 
Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier +from imblearn.metrics import classification_report_imbalanced +from imblearn.pipeline import make_pipeline +from imblearn.over_sampling import SMOTE from argparse import ArgumentParser warnings.filterwarnings("ignore") @@ -436,7 +439,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, random_state=42): +def create_classifier(combined_features, training_data, dirs, augmentation_method=None, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -469,24 +472,30 @@ def create_classifier(combined_features, training_data, random_state=42): X = combined_features.drop('target', axis=1) y = combined_features['target'].values - print("X SHAPE = {}\n".format(X.shape)) - print("y SHAPE = {}\n".format(y.shape)) - target_names = combined_features['target'].unique() - print("X = \n{}".format(X)) - print("y = \n{}".format(y)) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) + + if augmentation_method in ['SMOTE']: + classifer = make_pipeline(augmentation_method(sampling_strategy='not majority'), classifer) + else: + print("No augmentation selected, proceeding without resampling of classes") + classifer.fit(X_train, y_train) + # Classify and report the results + print(classification_report_imbalanced(y_test, classifer.predict(X_test))) + y_preds = classifer.predict(X_test) - confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + + timestamp = get_timestamp() + with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + pickle.dump(classifer, clf) - y_probs = classifer.predict_proba(X_test) - print(y_probs) + figure.savefig(os.join.path(dirs.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix From cecb4ac1ad29d713fc007f5985145b7be759c66d Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Wed, 29 May 2019 18:49:37 +0100 Subject: [PATCH 48/58] Fix a path bug --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index f8ede560..fab3c99f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -495,7 +495,7 @@ def create_classifier(combined_features, training_data, dirs, augmentation_metho with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.join.path(dirs.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(dirs.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix From 080434b4e7c0443abafff8d15bad4a7dc0f1857a Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Wed, 29 May 2019 19:04:21 +0100 Subject: [PATCH 49/58] Fix a method call --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index fab3c99f..3c5c5224 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -479,7 +479,7 @@ def 
create_classifier(combined_features, training_data, dirs, augmentation_metho classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) if augmentation_method in ['SMOTE']: - classifer = make_pipeline(augmentation_method(sampling_strategy='not majority'), classifer) + classifer = make_pipeline(eval(augmentation_method)(sampling_strategy='not majority'), classifer) else: print("No augmentation selected, proceeding without resampling of classes") From 2c86dc79f73ccd506b5a7a7a40ef3129be4bc672 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:08:21 +0100 Subject: [PATCH 50/58] Updating variable name, ncomp --> number_comp --- snmachine/snfeatures.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 6970ad4f..04df6668 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2012,7 +2012,7 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, print('finish projecting PCA') # Now reformat the components as a table - labels = ['C%d' %i for i in range(ncomp)] + labels = ['C%d' %i for i in range(number_comp)] reduced_wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) @@ -2021,13 +2021,13 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, if save_output: # We need to change the output to make it consistent with new code - np.save(os.path.join(output_root,'eigenvalues_{}.npy'.format(ncomp)),vals) - np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) - np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) - np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) + np.save(os.path.join(output_root,'eigenvalues_{}.npy'.format(number_comp)),vals) + np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(number_comp)),vec) + np.save(os.path.join(output_root,'comps_{}.npy'.format(number_comp)),comps) + np.save(os.path.join(output_root,'means_{}.npy'.format(number_comp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe reduced_wavelet_components = reduced_wavelet_components.to_pandas() - reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(number_comp))) return reduced_wavelet_components, vals, vec, M, s From 0eb5572dc1396a0c949fe7761ecf394adee81636 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:09:20 +0100 Subject: [PATCH 51/58] [FIXUP] Updating variable name, ncomp --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3c5c5224..2c93f90b 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -594,7 +594,7 @@ def restart_from_saved_pca(dirs, number_of_principal_components): print("wavelet_object, type = {}".format(type(wavelet_object))) # Step 7. 
Reduce dimensionality of wavelets by using only N principal components - wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, + wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) print("wavelet_features = {}".format(wavelet_features)) print("wavelet_features, type = {}".format(type(wavelet_features))) From d07bbdc5c7e561e3c3ef31662e837d7925135f16 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:26:17 +0100 Subject: [PATCH 52/58] Adding 'get_directories()' function Fixes #149 --- utils/plasticc_pipeline.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 2c93f90b..3234ff5e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -138,6 +138,38 @@ def create_folder_structure(analysis_directory, analysis_name): return dirs +def get_directories(analyses_directory, analysis_name): + """Returns the folder directories inside of a given analysis. + + # TODO [Add a link to the place where we have an explanation of the folder structure] + + Parameters + ---------- + analyses_directory : str + System path to where the user stores all analysis. + analysis_name : str + Name of the analysis we want. + + Returns + ------- + directories : dict + Dictionary containing the mapping of folders inside of `analysis_name`. + """ + analysis_directory = os.path.join(analyses_directory, analysis_name) + features_directory = os.path.join(analysis_directory, 'wavelet_features') + classifications_directory = os.path.join(analysis_directory, 'classifications') + intermediate_files_directory = os.path.join(analysis_directory, 'intermediate_files') + plots_directory = os.path.join(analysis_directory, 'plots') + + directories = {"analysis_directory": analysis_directory, + "features_directory": features_directory, + "classifications_directory": classifications_directory, + "intermediate_files_directory": intermediate_files_directory, + "plots_directory": plots_directory} + + return directories + + def load_configuration_file(path_to_configuration_file): """ Load from disk the configuration file that is to be used From 854ebc25d7118ce2a37e6a1bff31b4952f161ec6 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 15:39:02 +0100 Subject: [PATCH 53/58] [FIXUP] Adding debug print statement --- snmachine/snfeatures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 04df6668..b7e6f930 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -721,6 +721,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r self.model=sncosmo.Model(self.templates[mod_name],effects=[dust],effect_names=['host'], effect_frames=['rest']) else: self.model=sncosmo.Model(self.templates[mod_name]) + print(F'MODEL-NAME: {mod_name}') params=['['+mod_name+']'+pname for pname in self.model.param_names] # err_plus=[pname+'_err+' for pname in params] # err_minus=[pname+'_err-' for pname in params] From d32ca95fd21dc3c2e06e31cc06f36d92914baf2f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 
16:23:53 +0100 Subject: [PATCH 54/58] Updating docstrings --- utils/plasticc_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3234ff5e..9cb249c7 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -373,12 +373,12 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- waveout: numpy.ndarray - + Numpy array of the wavelet coefficients where each row is an object and + each column a different coefficient waveout_err: numpy.ndarray - + Numpy array storing the (assuming Gaussian) error on each coefficient. wavelet_object: snmachine.snfeatures.WaveletFeatures object - Examples -------- >>> ... From 89caf3c9f3912689266ae712a83f397660f43afd Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 16:26:10 +0100 Subject: [PATCH 55/58] Updating variable name, dirs --> directories --- utils/plasticc_pipeline.py | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 9cb249c7..8811d08e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -84,7 +84,7 @@ def create_folder_structure(analysis_directory, analysis_name): Returns ------- - dirs: dict + directories: dict Dictionary containing the mapping of folders that have been created. Examples @@ -104,7 +104,7 @@ def create_folder_structure(analysis_directory, analysis_name): intermediate_files_directory = os.path.join(method_directory, 'intermediate_files') plots_directory = os.path.join(method_directory, 'plots') - dirs = {"method_directory": method_directory, "features_directory": features_directory, + directories = {"method_directory": method_directory, "features_directory": features_directory, "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, "plots_directory": plots_directory} @@ -124,7 +124,7 @@ def create_folder_structure(analysis_directory, analysis_name): if choice in _yes: print("Overwriting existing folder..") - for key, value in dirs.items(): + for key, value in directories.items(): subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL) elif choice in _no: print("I am NOT sure") @@ -132,10 +132,10 @@ def create_folder_structure(analysis_directory, analysis_name): else: sys.stdout.write("Please respond with 'yes' or 'no'") else: - for key, value in dirs.items(): + for key, value in directories.items(): subprocess.call(['mkdir', value]) - return dirs + return directories def get_directories(analyses_directory, analysis_name): @@ -296,7 +296,7 @@ def load_training_data(data_path): return training_data -def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): +def reduce_size_of_training_data(training_data, directories, subset_size, seed=1234): # TODO: Incorpate further doctrings and finish examples. Tarek: Catarina and I need to # discuss this further. There is some overlap between this and # sndata.PlasticcData.update_data() and it would be good to comebine this. @@ -307,7 +307,7 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): training_data : snmachine.PlasticcData Dictionary containing the parameters that reside in the configuration file. This will be used to obtain the path to the training data. 
- dirs : dict + directories : dict Dictionary containing subset_size : int Number of objects the user would like to reduce the training data to @@ -324,12 +324,13 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): >>> ... >>> print(shape.training_data) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> new_training_data = reduce_size_of_training_data(training_data, + directories, 1000)) >>> print(shape.new_training_data) """ - method_directory = dirs.get("method_directory", None) + method_directory = directories.get("method_directory", None) subset_file = os.path.join(method_directory, "subset.list") if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') @@ -383,7 +384,8 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): -------- >>> ... >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, - save_output='all', output_root=dirs.get("intermediate_files_directory")) + save_output='all', + output_root=directories.get("intermediate_files_directory")) >>> print() """ @@ -471,7 +473,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, dirs, augmentation_method=None, random_state=42): +def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -524,10 +526,10 @@ def create_classifier(combined_features, training_data, dirs, augmentation_metho confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) timestamp = get_timestamp() - with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + with open(os.path.join(directories.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.path.join(dirs.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(directories.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix @@ -537,17 +539,17 @@ def make_predictions(location_of_test_data, classifier): pass -def restart_from_saved_gps(dirs): +def restart_from_saved_gps(directories): pass -def restart_from_saved_wavelets(dirs): +def restart_from_saved_wavelets(directories): pass -def restart_from_saved_pca(dirs, number_of_principal_components): +def restart_from_saved_pca(directories, number_of_principal_components): # TODO: Write docstrings - wavelet_features = pd.read_pickle(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) + wavelet_features = pd.read_pickle(os.path.join(directories.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") @@ -578,9 +580,9 @@ def restart_from_saved_pca(dirs, number_of_principal_components): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name) + directories = create_folder_structure(analysis_directory, analysis_name) # Step 2. 
Save configuration file used for this analysis - save_configuration_file(params, dirs.get("method_directory")) + save_configuration_file(params, directories.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument @@ -592,12 +594,12 @@ def restart_from_saved_pca(dirs, number_of_principal_components): pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - restart_from_saved_pca(dirs, number_of_principal_components) + restart_from_saved_pca(directories, number_of_principal_components) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: # a. Saved PCA files - # path_saved_reduced_wavelets = dirs.get("intermediate_files_directory") + # path_saved_reduced_wavelets = directories.get("intermediate_files_directory") # eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy')) # means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy')) # b. Saved uncompressed wavelets @@ -610,12 +612,13 @@ def restart_from_saved_pca(dirs, number_of_principal_components): # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, kernel_param=kernel_param, - output_root=dirs['intermediate_files_directory'], + output_root=directories['intermediate_files_directory'], number_processes=number_processes) # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, - save_output='all', output_root=dirs.get("intermediate_files_directory")) + save_output='all', + output_root=directories.get("intermediate_files_directory")) print("waveout = {}".format(waveout)) print("waveout, type = {}".format(type(waveout))) @@ -627,7 +630,10 @@ def restart_from_saved_pca(dirs, number_of_principal_components): # Step 7. 
Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components, - tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) + tol=None, + pca_path=None, + save_output=True, + output_root=directories.get("features_directory")) print("wavelet_features = {}".format(wavelet_features)) print("wavelet_features, type = {}".format(type(wavelet_features))) From 33cffea91614126d2574d4080e29e61e48488925 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 4 Jun 2019 12:52:50 +0100 Subject: [PATCH 56/58] Fixing version of sncosmo for debug checks The recent HTTP 404 error discovered in the CI suggests that a recent change to sncosmo might be the reason for failing to find salt2 models Latest version = 1.8.0, which is where the error occurs, bumping down to 1.7.1 (previous release) to test outcome --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index aeb4e79e..23422a70 100644 --- a/environment.yml +++ b/environment.yml @@ -23,6 +23,6 @@ dependencies: - emcee>=2.1.0 - numpydoc>=0.6.0 - pywavelets>=0.4.0 - - sncosmo>=1.3.0 + - sncosmo==1.7.1 - nose>=1.3.7 - future>=0.16 From 13fb8b638ae83039646ac5e36d10139318fa9116 Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Tue, 4 Jun 2019 12:57:53 +0100 Subject: [PATCH 57/58] Save the balancing method and the number of PCA components used for the classifier and confusion matrix --- utils/plasticc_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 8811d08e..796af43f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -473,7 +473,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42): +def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42, number_comps=''): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -526,10 +526,10 @@ def create_classifier(combined_features, training_data, directories, augmentatio confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) timestamp = get_timestamp() - with open(os.path.join(directories.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + with open(os.path.join(directories.get("classifications_directory"), F'classifer_{number_comps}_{augmentation_method}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.path.join(directories.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(directories.get("plots_directory"), F'confusion_matrix_{number_comps}_{augmentation_method}.pdf')) return classifer, confusion_matrix From c2513377420ad679af3bb0bda3eb4401a58a495f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 4 Jun 2019 15:11:42 +0100 Subject: [PATCH 58/58] Bump version 1.3.2 --> 1.4.0 With the inclusion of this feature set, although not fully complete, a MINOR bump is felt necessary. 
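For reference, the class-rebalancing option introduced in patches 45, 47 and 49 amounts to chaining SMOTE oversampling in front of the random forest through an imbalanced-learn pipeline and scoring with classification_report_imbalanced. The sketch below runs on synthetic data with toy parameters; it is not the pipeline's create_classifier().

    from imblearn.metrics import classification_report_imbalanced
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import make_pipeline
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # Synthetic, deliberately imbalanced three-class problem
    X, y = make_classification(n_samples=300, n_classes=3, n_informative=4,
                               weights=[0.7, 0.2, 0.1], random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Oversample every class except the majority before fitting the forest
    classifier = make_pipeline(
        SMOTE(sampling_strategy="not majority", random_state=42),
        RandomForestClassifier(n_estimators=700, criterion="entropy",
                               oob_score=True, n_jobs=-1, random_state=42))
    classifier.fit(X_train, y_train)
    print(classification_report_imbalanced(y_test, classifier.predict(X_test)))
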
--- snmachine/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snmachine/version.py b/snmachine/version.py index 922bcbf7..b2c6a8de 100644 --- a/snmachine/version.py +++ b/snmachine/version.py @@ -1 +1 @@ -__VERSION__ = "1.3.2" +__VERSION__ = "1.4.0"
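
For reference, the restart path built up in patches 37 to 44 reduces to persisting the reduced wavelet components as a pickled pandas DataFrame and reading them back on a later run instead of recomputing the GPs, wavelets and PCA. A minimal sketch with made-up objects and values follows; only the 'C%d' column labels and the reduced_wavelet_components_<N>.pickle naming convention follow the pipeline, everything else is illustrative.

    import os

    import numpy as np
    import pandas as pd

    number_of_principal_components = 10
    features_directory = "/tmp/wavelet_features"   # stand-in for directories["features_directory"]
    os.makedirs(features_directory, exist_ok=True)

    # One row per object, one column per principal component (C0, C1, ...)
    labels = ["C%d" % i for i in range(number_of_principal_components)]
    reduced = pd.DataFrame(
        np.random.RandomState(0).normal(size=(3, number_of_principal_components)),
        columns=labels, index=["obj_1", "obj_2", "obj_3"])
    pickle_path = os.path.join(
        features_directory,
        "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))
    reduced.to_pickle(pickle_path)

    # A later run restarting from the saved PCA components would begin here
    wavelet_features = pd.read_pickle(pickle_path)
    print(wavelet_features.shape)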