From 9cba46da4ca360299a606200be17171fe12588e1 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 15:52:26 +0100 Subject: [PATCH 01/58] Migration of file used in tallamjr/plasticc repo This baseline commit brings in the file that has been used in the exploratory repo of https://github.com/tallamjr/plasticc/pipeline.py --- utils/run_plasticc_pipeline.py | 326 +++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 utils/run_plasticc_pipeline.py diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py new file mode 100644 index 00000000..9069b3bd --- /dev/null +++ b/utils/run_plasticc_pipeline.py @@ -0,0 +1,326 @@ +# snmachine machine learning pipeline for the PLAsTiCC competition. + +## IMPORTS +import numpy as np +import pandas as pd +import sys +import os +import subprocess +import multiprocessing +import glob +from astropy.table import Table,join,vstack +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +import pickle +from argparse import ArgumentParser +import yaml +import multiprocessing +import warnings +warnings.filterwarnings("ignore") +try: + import cPickle as pickle +except ModuleNotFoundError: + import pickle +try: + from snmachine import snfeatures, sndata, snaugment, gps +except ImportError: + print("Unable to import snmachine. Check environment set correctly") + +util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) +if util_module_path not in sys.path: + sys.path.append(util_module_path) +from plasticc_utils import plasticcLogLoss, plotConfusionMatrix + + +def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): + + method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) + features_dir = os.path.join(method_dir, 'wavelet_features') + classif_dir = os.path.join(method_dir, 'classifications') + interm_dir = os.path.join(method_dir, 'intermediate') + plots_dir = os.path.join(method_dir, 'plots') + + dirs = {"method_dir" : method_dir, "features_dir" : features_dir, + "classif_dir" : classif_dir, "interm_dir" : interm_dir, + "plots_dir" : plots_dir} + + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) + + return dirs + + +def saveConfigurationFile(dirs): + + METHOD_DIR = dirs.get("method_dir", None) + with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: + yaml.dump(params, config, default_flow_style=False) + + +def loadDataset(DATA_PATH): + + try: + if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): + with open(DATA_PATH, 'rb') as input: + print("Opening from binary pickle") + dat = pickle.load(input) + print("Dataset loaded from pickle file as: {}".format(dat)) + else: + + folder, data_file = os.path.split(DATA_PATH) + print(folder, data_file) + meta_file = "_metadata.".join(data_file.split(".")) + + print("Opening from CSV") + dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, + from_pickle=False) + print("Dataset loaded from csv file as: {}".format(dat)) + print("Saving {} object to pickle binary".format(dat)) + + dat_binary = os.path.splitext(data_file)[0]+".pckl" + print(os.path.join(folder, dat_binary)) + with open(os.path.join(folder, dat_binary), 'wb') as f: + pickle.dump(dat, f, pickle.HIGHEST_PROTOCOL) + except FileNotFoundError: + print("Oii, load something !!") + + return dat + + +def reduceDataset(dat, dirs, subset_size, SEED): + + METHOD_DIR = dirs.get("method_dir", None) + subset_file = '/{}/subset.list'.format(METHOD_DIR) + if os.path.exists(subset_file): 
+ rand_objs = np.genfromtxt(subset_file, dtype='U') + else: + np.random.seed(SEED) + rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) + rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) + rand_objs = rand_objs_sorted_int.astype('>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? + try: + with open(arguments.configuration) as f: + params = yaml.load(f) + except IOError: + print("Invalid yaml file provided") + exit() + + print("The PARAMS are:\n {}".format(params)) + + # GLOBAL SETTINGS + RANDOM_STATE = params.get("RANDOM_STATE", None) + print("RANDOM_STATE:\n{}".format(RANDOM_STATE)) + SEED = params.get("SEED", None) + DATA_PATH = params.get("DATA_PATH", None) + ANALYSIS_DIR = params.get("ANALYSIS_DIR", None) + ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) + + # Set the number of processes you want to use throughout the notebook + nprocesses = multiprocessing.cpu_count() + print("Running with {} cores".format(nprocesses)) + + # SNMACHINE PARAMETERS + ngp = params.get("ngp", None) + initheta = params.get("initheta", None) + + dirs = createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME) + saveConfigurationFile(dirs) + + # RUN PIPELINE + if (arguments.restart.lower() == "wavelets"): + + wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) + + elif (arguments.restart.lower() == "gps"): + print("Hello") + else: + print("Running full pipeline .. ") + + dat = loadDataset(DATA_PATH) + # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) + fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + + waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + + wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) + # snmachine.utils.fit_gaussian_process.extract_GP() + # check for wavelets, if so restartFromWavelets() + # else, check for gp's, if so restartFromGPs() + # otherwise runFullPipeline() From dd8277d21ef119fdb84d9d3d783d3bf4e95ec7d8 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 16:13:52 +0100 Subject: [PATCH 02/58] Minor linting improvements + comments Although one expects the code in this file to change a lot, PEP8 linting was carried out to encourage the consistent style. Comments added to areas of code which need further discussion or will indeed be adapted further --- utils/run_plasticc_pipeline.py | 93 +++++++++++++++------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 9069b3bd..5ec04762 100644 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -1,20 +1,18 @@ -# snmachine machine learning pipeline for the PLAsTiCC competition. 
+""" +Machine learning pipeline for the PLAsTiCC competition using snmachine codebase +""" -## IMPORTS import numpy as np import pandas as pd import sys import os import subprocess import multiprocessing -import glob -from astropy.table import Table,join,vstack +from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -import pickle from argparse import ArgumentParser import yaml -import multiprocessing import warnings warnings.filterwarnings("ignore") try: @@ -34,15 +32,15 @@ def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): - method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) + method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) features_dir = os.path.join(method_dir, 'wavelet_features') - classif_dir = os.path.join(method_dir, 'classifications') - interm_dir = os.path.join(method_dir, 'intermediate') - plots_dir = os.path.join(method_dir, 'plots') + classif_dir = os.path.join(method_dir, 'classifications') + interm_dir = os.path.join(method_dir, 'intermediate') + plots_dir = os.path.join(method_dir, 'plots') - dirs = {"method_dir" : method_dir, "features_dir" : features_dir, - "classif_dir" : classif_dir, "interm_dir" : interm_dir, - "plots_dir" : plots_dir} + dirs = {"method_dir": method_dir, "features_dir": features_dir, + "classif_dir": classif_dir, "interm_dir": interm_dir, + "plots_dir": plots_dir} for key, value in dirs.items(): subprocess.call(['mkdir', value]) @@ -72,8 +70,7 @@ def loadDataset(DATA_PATH): meta_file = "_metadata.".join(data_file.split(".")) print("Opening from CSV") - dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, - from_pickle=False) + dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, from_pickle=False) print("Dataset loaded from csv file as: {}".format(dat)) print("Saving {} object to pickle binary".format(dat)) @@ -101,39 +98,29 @@ def reduceDataset(dat, dirs, subset_size, SEED): np.savetxt(subset_file, rand_objs, fmt='%s') dat.object_names = rand_objs - dat.data = {objects:dat.data[objects] for objects in dat.object_names} # erase the data we are not using + dat.data = {objects: dat.data[objects] for objects in dat.object_names} # Erase the data we are not using print("Dataset reduced to {} objects".format(dat.object_names.shape[0])) - return dat # Cat: I don't think we need to return anything + return dat # Cat: I don't think we need to return anything def augmentData(dat, number_per_type): - - def print_stats_by_type(dat): - print('total obj in dataset: %d'%len(dat.data)) - types=dat.get_types() - t_unique=np.unique(types['Type']) - - for t in t_unique: - thistype=types[types['Type']==t] - print('type: %d - %d obj in dataset'%(t,len(thistype))) - return t_unique - - t_unique=print_stats_by_type(dat) - aug=snaugment.GPAugment(dat) - numbers={types:number_per_type for types in t_unique} - res=aug.augment(numbers) - t_unique_new=print_stats_by_type(dat) + # Tarek: This might be removed as a function call and replaced with calls to + # functions inside snmachine.snaugment + pass -def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? +def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? 
+ # Tarek: Now that this file lives in snmachine and with the extensive + # refactoring this is no longer necessary I believe - extract_GP(dat, **kwargs) + # extract_GP(dat, **kwargs) # snfeatures.WaveletFeatures.extract_GP(dat, **kwargs) + pass -def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok +def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) print("WAV = {}\n".format(wavelet_object.wav)) @@ -155,7 +142,7 @@ def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, toleran return wavelet_features, eigenvalues, eigenvectors, means -def getMeta(dat): # including mjd +def getMeta(dat): # including mjd object_names = dat.object_names meta_df = pd.DataFrame(index=object_names, columns=dat.data[object_names[0]].meta.keys()) mjd_diff = np.zeros_like(object_names) @@ -174,7 +161,7 @@ def getMeta(dat): # including mjd meta_df.at[obj, key] = meta_key try: meta_df.drop(['distmod','mwebv', 'stencil', 'augment_algo'] , axis=1, inplace=True) - except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata + except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata meta_df.drop(['distmod','mwebv'] , axis=1, inplace=True) meta_df.rename(index=str, columns={"name": "Object", "type":"target"}, inplace=True) meta_df['mjd_diff'] = mjd_diff @@ -210,13 +197,11 @@ def createClassififer(combined_features, RANDOM_STATE): print("X = \n{}".format(X)) print("y = \n{}".format(y)) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=RANDOM_STATE) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE) - - clf = RandomForestClassifier(n_estimators=700, criterion='entropy',\ - oob_score=True, n_jobs=-1, - random_state=RANDOM_STATE) + clf = RandomForestClassifier(n_estimators=700, criterion='entropy', + oob_score=True, n_jobs=-1, + random_state=RANDOM_STATE) clf.fit(X_train, y_train) @@ -252,20 +237,24 @@ def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): # RETURN SUBMISSION_FILE_WITHOUT_99 pass + def runFullPipeline(): pass + def restartFromGPs(): pass + def restartFromWavelets(): pass + if __name__ == "__main__": parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') - parser.add_argument('--restart', '-r', default="full") + parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") arguments = parser.parse_args() # LOAD CONFIGURATION FILE --->>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? 
@@ -287,7 +276,7 @@ def restartFromWavelets(): ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() + nprocesses = multiprocessing.cpu_count() print("Running with {} cores".format(nprocesses)) # SNMACHINE PARAMETERS @@ -300,9 +289,9 @@ def restartFromWavelets(): # RUN PIPELINE if (arguments.restart.lower() == "wavelets"): - wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) elif (arguments.restart.lower() == "gps"): print("Hello") @@ -312,14 +301,14 @@ def restartFromWavelets(): dat = loadDataset(DATA_PATH) # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) + classifer = createClassififer(combined_features) # snmachine.utils.fit_gaussian_process.extract_GP() # check for wavelets, if so restartFromWavelets() # else, check for gp's, if so restartFromGPs() From 520150d1ce2adc26e8b99d60af87355ed5a41d02 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 8 May 2019 17:27:19 +0100 Subject: [PATCH 03/58] Changing mode of file File mode changed to 644 from 755. This puts all files in the same permissions bracket to allow for consistency across the files. 
--- snmachine/snclassifier.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 snmachine/snclassifier.py diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py old mode 100755 new mode 100644 From 2acd69acaaaa7af9859c5f73338219f5d478e3b7 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 10 May 2019 18:44:33 +0100 Subject: [PATCH 04/58] Renaming functions to be inline with code style --- utils/run_plasticc_pipeline.py | 122 ++++++++++++++------------------- 1 file changed, 50 insertions(+), 72 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 5ec04762..6f783b05 100644 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -30,7 +30,7 @@ from plasticc_utils import plasticcLogLoss, plotConfusionMatrix -def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): +def create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME): method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) features_dir = os.path.join(method_dir, 'wavelet_features') @@ -48,14 +48,14 @@ def createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME): return dirs -def saveConfigurationFile(dirs): +def save_configuration_file(dirs): METHOD_DIR = dirs.get("method_dir", None) with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: yaml.dump(params, config, default_flow_style=False) -def loadDataset(DATA_PATH): +def load_dataset(DATA_PATH): try: if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): @@ -84,7 +84,7 @@ def loadDataset(DATA_PATH): return dat -def reduceDataset(dat, dirs, subset_size, SEED): +def reduce_dataset(dat, dirs, subset_size, SEED): METHOD_DIR = dirs.get("method_dir", None) subset_file = '/{}/subset.list'.format(METHOD_DIR) @@ -105,13 +105,7 @@ def reduceDataset(dat, dirs, subset_size, SEED): return dat # Cat: I don't think we need to return anything -def augmentData(dat, number_per_type): - # Tarek: This might be removed as a function call and replaced with calls to - # functions inside snmachine.snaugment - pass - - -def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? +def fit_gaussian_process(dat, **kwargs): # Cat: Do we really want a mask funtion? # Tarek: Now that this file lives in snmachine and with the extensive # refactoring this is no longer necessary I believe @@ -120,7 +114,7 @@ def fitGaussianProcess(dat, **kwargs): # Cat: Do we really want a mask funtion? 
pass -def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok +def wavelet_decomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) print("WAV = {}\n".format(wavelet_object.wav)) @@ -130,7 +124,7 @@ def waveletDecomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as inpu return waveout, waveout_err, wavelet_object -def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance +def dimentionality_reduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance # check if reduced wavelet features already exist wavelet_features, eigenvalues, eigenvectors, means, num_feats = wavelet_object.extract_pca(object_names, waveout, **kwargs) @@ -142,33 +136,7 @@ def dimentionalityReduction(wavelet_object, dirs, object_names, waveout, toleran return wavelet_features, eigenvalues, eigenvectors, means -def getMeta(dat): # including mjd - object_names = dat.object_names - meta_df = pd.DataFrame(index=object_names, columns=dat.data[object_names[0]].meta.keys()) - mjd_diff = np.zeros_like(object_names) - for i in np.arange(len(object_names)): - obj = object_names[i] - obj_data = dat.data[obj] - obj_meta = obj_data.meta - mjd_diff[i] = np.max(obj_data['mjd'])-np.min(obj_data['mjd']) - for key in obj_meta.keys(): - meta_key = obj_meta[key] - try: - assert type(meta_key) == np.ndarray - meta_key = meta_key[0] - except: - pass - meta_df.at[obj, key] = meta_key - try: - meta_df.drop(['distmod','mwebv', 'stencil', 'augment_algo'] , axis=1, inplace=True) - except KeyError: # if we are only using the original objects, 'stencil', 'augment_algo' aren't part of the metadata - meta_df.drop(['distmod','mwebv'] , axis=1, inplace=True) - meta_df.rename(index=str, columns={"name": "Object", "type":"target"}, inplace=True) - meta_df['mjd_diff'] = mjd_diff - return meta_df - - -def mergeFeatures(some_features, other_features): +def merge_features(some_features, other_features): if type(some_features) != pd.core.frame.DataFrame: some_features = some_features.to_pandas() if type(other_features) != pd.core.frame.DataFrame: @@ -178,13 +146,15 @@ def mergeFeatures(some_features, other_features): return merged_df -def combineAdditionalFeatures(wavelet_features, dat): - meta_df = getMeta(dat) - combined_features = mergeFeatures(wavelet_features, meta_df) +def combine_additional_features(wavelet_features, dat): + # Combine snmachine wavelet features with PLASTICC features. 
Allow user to + # define the dataframe they would like to merge + meta_df = dat.metadata + combined_features = merge_features(wavelet_features, meta_df) return combined_features -def createClassififer(combined_features, RANDOM_STATE): +def create_classififer(combined_features, RANDOM_STATE): X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -230,7 +200,7 @@ def createClassififer(combined_features, RANDOM_STATE): return clf -def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): +def make_predictions(LOCATION_OF_TEST_DATA, CLASSIFIER): # LOAD TEST SET AT THIS POINT # USE CLASFFIFER FROM createClassififer, BY USING THAT WE THEN # clf.predict(test_set) @@ -238,25 +208,19 @@ def makePredictions(LOCATION_OF_TEST_DATA, CLASSIFIER): pass -def runFullPipeline(): +def run_full_pipeline(): pass -def restartFromGPs(): +def restart_from_saved_gps(): pass -def restartFromWavelets(): +def restart_from_save_wavelets(): pass -if __name__ == "__main__": - - parser = ArgumentParser(description="Run pipeline end to end") - parser.add_argument('--configuration', '-c') - parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") - arguments = parser.parse_args() - +def load_configuration_file(path_to_configuration_file): # LOAD CONFIGURATION FILE --->>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? try: with open(arguments.configuration) as f: @@ -267,48 +231,62 @@ def restartFromWavelets(): print("The PARAMS are:\n {}".format(params)) - # GLOBAL SETTINGS + return params + + +if __name__ == "__main__": + + parser = ArgumentParser(description="Run pipeline end to end") + parser.add_argument('--configuration', '-c') + parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") + arguments = parser.parse_args() + + params = load_configuration_file(arguments.configuration) + + # global settings RANDOM_STATE = params.get("RANDOM_STATE", None) - print("RANDOM_STATE:\n{}".format(RANDOM_STATE)) + # Tarek: maybe remove this completely and + # set inside a function call itself, i.e. have a default which can be + # overridden SEED = params.get("SEED", None) DATA_PATH = params.get("DATA_PATH", None) ANALYSIS_DIR = params.get("ANALYSIS_DIR", None) ANALYSIS_NAME = params.get("ANALYSIS_NAME", None) + # snmachine parameters + ngp = params.get("ngp", None) + initheta = params.get("initheta", None) + # Set the number of processes you want to use throughout the notebook nprocesses = multiprocessing.cpu_count() print("Running with {} cores".format(nprocesses)) - # SNMACHINE PARAMETERS - ngp = params.get("ngp", None) - initheta = params.get("initheta", None) - - dirs = createFolderStructure(ANALYSIS_DIR, ANALYSIS_NAME) - saveConfigurationFile(dirs) + dirs = create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME) + save_configuration_file(dirs) # RUN PIPELINE if (arguments.restart.lower() == "wavelets"): wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combine_additional_features(wavelet_features, DATA_PATH) + classifer = create_classififer(combined_features) elif (arguments.restart.lower() == "gps"): print("Hello") else: print("Running full pipeline .. 
") - dat = loadDataset(DATA_PATH) + dat = load_dataset(DATA_PATH) # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) - fitGaussianProcess(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, + nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) - waveout, waveout_err, wavelet_object = waveletDecomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - wavelet_features, eigenvalues, eigenvectors, means = dimentionalityReduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - combined_features = combineAdditionalFeatures(wavelet_features, DATA_PATH) - classifer = createClassififer(combined_features) + combined_features = combine_additional_features(wavelet_features, DATA_PATH) + classifer = create_classififer(combined_features) # snmachine.utils.fit_gaussian_process.extract_GP() # check for wavelets, if so restartFromWavelets() # else, check for gp's, if so restartFromGPs() From c898f963998a306bf68faab71f1c09d1115231fe Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:07:17 +0100 Subject: [PATCH 05/58] Tidying up file and renaming function names Renaming to be in line with code style conventions --- utils/plasticc_utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index 7b1b45f5..c17a2d5b 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -2,21 +2,23 @@ Utility script for calculating the log loss """ +from sklearn.metrics import confusion_matrix import sys import numpy as np import matplotlib.pyplot as plt import seaborn as sns -from sklearn.metrics import auc, roc_curve, confusion_matrix -def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames): + +def plot_confusion_matrix(yTrue, yPredict, dataName, targetNames): cm = confusion_matrix(yTrue, yPredict, labels=targetNames) cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] annot = np.around(cm, 2) - fig, ax = plt.subplots(figsize=(9,7)) + fig, ax = plt.subplots(figsize=(9, 7)) sns.heatmap(cm, xticklabels=targetNames, - yticklabels=targetNames, cmap='Blues', - annot=annot, lw=0.5) + yticklabels=targetNames, cmap='Blues', + annot=annot, lw=0.5) + ax.set_xlabel('Predicted Label') ax.set_ylabel('True Label') ax.set_aspect('equal') @@ -24,14 +26,15 @@ def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames): return cm -def plasticcLogLoss(y_true, y_pred, relative_class_weights=None): + +def plasticc_log_loss(y_true, y_pred, relative_class_weights=None): """ Implementation of weighted log loss used for the Kaggle challenge """ predictions = y_pred.copy() # sanitize predictions - epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0) + epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0) predictions = np.clip(predictions, epsilon, 1.0 - epsilon) 
predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis] From 660519a37b8fc3587f4df4d82f194f86241eae3e Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:46:03 +0100 Subject: [PATCH 06/58] Change mode of run_plasticc_pipeline file to 744 --- utils/run_plasticc_pipeline.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 utils/run_plasticc_pipeline.py diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py old mode 100644 new mode 100755 From 906d95e1a56fd600992446d91541337012451f27 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:46:37 +0100 Subject: [PATCH 07/58] Updating create_folder_structure function Updating with doctrings and examples. Also including helper function to obtain git revision hash to include in analysis folder name --- utils/run_plasticc_pipeline.py | 68 +++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 6f783b05..2cbbaa38 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -27,20 +27,60 @@ util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) if util_module_path not in sys.path: sys.path.append(util_module_path) -from plasticc_utils import plasticcLogLoss, plotConfusionMatrix - - -def create_folder_structure(ANALYSIS_DIR, ANALYSIS_NAME): - - method_dir = os.path.join(ANALYSIS_DIR, ANALYSIS_NAME) - features_dir = os.path.join(method_dir, 'wavelet_features') - classif_dir = os.path.join(method_dir, 'classifications') - interm_dir = os.path.join(method_dir, 'intermediate') - plots_dir = os.path.join(method_dir, 'plots') - - dirs = {"method_dir": method_dir, "features_dir": features_dir, - "classif_dir": classif_dir, "interm_dir": interm_dir, - "plots_dir": plots_dir} +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix + + +def get_git_revision_short_hash(): + """ Helper function to obtain current version control hash value + + Returns + ------- + _hash : str + Short representation of current version control hash value + + Examples + -------- + >>> sha = get_git_revision_short_hash() + >>> print(sha) + 'ede068e' + """ + _hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']) + return _hash.decode("utf-8").rstrip() + + +def create_folder_structure(analysis_directory, analysis_name): + """ Make directories that will be used for analysis + + Parameters + ---------- + analysis_directory : str + System path to where the user would like to contain + a run of the analysis + analysis_name : str + Given name of analysis run. This is appended with the current git hash + the code has been run with. + + Returns + ------- + dirs: dict + Dictionary containing the mapping of folders that have been created. 
+ + Examples + -------- + Each folder name can then be accessed with dictionary methods: + + >>> analysis_directory = params.get("analysis_directory", None) + >>> analysis_name = params.get("analysis_name", None) + """ + method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) + features_directory = os.path.join(method_directory, 'wavelet_features') + classifications_directory = os.path.join(method_directory, 'classifications') + intermediate_files_directory = os.path.join(method_directory, 'intermediate') + plots_directory = os.path.join(method_directory, 'plots') + + dirs = {"method_directory": method_directory, "features_directory": features_directory, + "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, + "plots_directory": plots_directory} for key, value in dirs.items(): subprocess.call(['mkdir', value]) From e40a785f707834e6947e5108e8290318eb5bc38b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 13:48:19 +0100 Subject: [PATCH 08/58] Removing options in config to be in script instead Certain options would be better served as defaults in the script and the user can change these as they wish in the source file --- utils/config.yml | 12 ++++++++++++ utils/run_plasticc_pipeline.py | 20 ++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 utils/config.yml diff --git a/utils/config.yml b/utils/config.yml new file mode 100644 index 00000000..f5949880 --- /dev/null +++ b/utils/config.yml @@ -0,0 +1,12 @@ +# +# +# GENERAL PARAMS +SEED : 1234 +REPO_DIR : "/share/hypatia/snmachine_resources/data/plasticc/" +ANALYSIS_DIR : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +ANALYSIS_NAME : "test-analysis" +DATA_PATH : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set.pkl" + +# SNMACHINE_PARAMS +ngp : 1100 +initheta : [500, 20] diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 2cbbaa38..24f87e24 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -124,14 +124,14 @@ def load_dataset(DATA_PATH): return dat -def reduce_dataset(dat, dirs, subset_size, SEED): +def reduce_dataset(dat, dirs, subset_size, seed=1234): METHOD_DIR = dirs.get("method_dir", None) subset_file = '/{}/subset.list'.format(METHOD_DIR) if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') else: - np.random.seed(SEED) + np.random.seed(seed) rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) rand_objs = rand_objs_sorted_int.astype(' Date: Tue, 14 May 2019 13:52:06 +0100 Subject: [PATCH 09/58] Moving old utils files to an archival folder These files may still have merit for processing the data but as the pipeline is being developed it is felt they are better served in a seperate folder --- utils/{ => archive}/collect.pbs | 0 utils/{ => archive}/collect.py | 0 utils/{ => archive}/conquer.pbs | 0 utils/{ => archive}/conquer.py | 10 +++++----- utils/{ => archive}/create_jobs.py | 0 utils/{ => archive}/divide.pbs | 0 utils/{ => archive}/divide.py | 0 utils/{ => archive}/plasticc_extract_gp.pbs | 0 utils/{ => archive}/plasticc_extract_gp.py | 1 - utils/{ => archive}/post_process.py | 1 - 10 files changed, 5 insertions(+), 7 deletions(-) rename utils/{ => archive}/collect.pbs (100%) rename utils/{ => archive}/collect.py (100%) rename utils/{ => archive}/conquer.pbs 
(100%) rename utils/{ => archive}/conquer.py (96%) rename utils/{ => archive}/create_jobs.py (100%) rename utils/{ => archive}/divide.pbs (100%) rename utils/{ => archive}/divide.py (100%) rename utils/{ => archive}/plasticc_extract_gp.pbs (100%) rename utils/{ => archive}/plasticc_extract_gp.py (99%) rename utils/{ => archive}/post_process.py (99%) diff --git a/utils/collect.pbs b/utils/archive/collect.pbs similarity index 100% rename from utils/collect.pbs rename to utils/archive/collect.pbs diff --git a/utils/collect.py b/utils/archive/collect.py similarity index 100% rename from utils/collect.py rename to utils/archive/collect.py diff --git a/utils/conquer.pbs b/utils/archive/conquer.pbs similarity index 100% rename from utils/conquer.pbs rename to utils/archive/conquer.pbs diff --git a/utils/conquer.py b/utils/archive/conquer.py similarity index 96% rename from utils/conquer.py rename to utils/archive/conquer.py index 218c5949..40d85d33 100644 --- a/utils/conquer.py +++ b/utils/archive/conquer.py @@ -88,10 +88,11 @@ tab.meta['z']=tab.meta['hostgal_specz'] #insert into data set - d.insert_lightcurve(tab) + d.insert_lightcurve(tab) + with open(os.path.join(out_folder,'dataset_%d.pickle'%index),'wb') as f: - pickle.dump(d,f) + pickle.dump(d,f) ''' wf=snfeatures.WaveletFeatures() @@ -99,9 +100,9 @@ feats.write(os.path.join(feats_folder, 'wavelet_features.fits'),overwrite=True) with open(os.path.join(feats_folder,'PCA_mean.pickle'),'wb') as f1: - pickle.dump(wf.PCA_mean,f1) + pickle.dump(wf.PCA_mean,f1) with open(os.path.join(feats_folder,'PCA_eigenvals.pickle'),'wb') as f2: - pickle.dump(wf.PCA_eigenvals,f2) + pickle.dump(wf.PCA_eigenvals,f2) with open(os.path.join(feats_folder,'PCA_eigenvectors.pickle'),'wb') as f3: pickle.dump(wf.PCA_eigenvectors,f3) @@ -110,4 +111,3 @@ np.savetxt(os.path.join(feats_folder,'PCA_eigenvals.txt'),wf.PCA_eigenvals) np.savetxt(os.path.join(feats_folder,'PCA_eigenvectors.txt'),wf.PCA_eigenvectors) ''' - diff --git a/utils/create_jobs.py b/utils/archive/create_jobs.py similarity index 100% rename from utils/create_jobs.py rename to utils/archive/create_jobs.py diff --git a/utils/divide.pbs b/utils/archive/divide.pbs similarity index 100% rename from utils/divide.pbs rename to utils/archive/divide.pbs diff --git a/utils/divide.py b/utils/archive/divide.py similarity index 100% rename from utils/divide.py rename to utils/archive/divide.py diff --git a/utils/plasticc_extract_gp.pbs b/utils/archive/plasticc_extract_gp.pbs similarity index 100% rename from utils/plasticc_extract_gp.pbs rename to utils/archive/plasticc_extract_gp.pbs diff --git a/utils/plasticc_extract_gp.py b/utils/archive/plasticc_extract_gp.py similarity index 99% rename from utils/plasticc_extract_gp.py rename to utils/archive/plasticc_extract_gp.py index fe30ddef..3a1bb8ae 100644 --- a/utils/plasticc_extract_gp.py +++ b/utils/archive/plasticc_extract_gp.py @@ -59,4 +59,3 @@ np.savetxt(os.path.join(feats_folder,'PCA_eigenvals.txt'),wf.PCA_eigenvals) np.savetxt(os.path.join(feats_folder,'PCA_eigenvectors.txt'),wf.PCA_eigenvectors) ''' - diff --git a/utils/post_process.py b/utils/archive/post_process.py similarity index 99% rename from utils/post_process.py rename to utils/archive/post_process.py index c0b2487d..797a34c7 100644 --- a/utils/post_process.py +++ b/utils/archive/post_process.py @@ -67,4 +67,3 @@ np.savetxt(flname, not_done, fmt='%s') else: print 'All objects accounted for' - From edfd84d594e395b91dfe0aa05460cd8a8bf1d532 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 
13:54:32 +0100 Subject: [PATCH 10/58] Tidy up import block Removed code to add to sys path as this is no longer necessary as pipeline script now resides in snmachine main repo --- utils/run_plasticc_pipeline.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 24f87e24..f444d42a 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -1,10 +1,9 @@ """ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase """ - +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix import numpy as np import pandas as pd -import sys import os import subprocess import multiprocessing @@ -24,11 +23,6 @@ except ImportError: print("Unable to import snmachine. Check environment set correctly") -util_module_path = os.path.abspath(os.path.join('snmachine', 'utils')) -if util_module_path not in sys.path: - sys.path.append(util_module_path) -from plasticc_utils import plasticc_log_loss, plot_confusion_matrix - def get_git_revision_short_hash(): """ Helper function to obtain current version control hash value From 98e88005574e261b47f6af55632d906693fe43c0 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 16:56:05 +0100 Subject: [PATCH 11/58] [WIP] Updating functions in pipeline script Several functions have been updated with doctrings and examples for how to run such functions --- utils/run_plasticc_pipeline.py | 354 ++++++++++++++++++++------------- 1 file changed, 212 insertions(+), 142 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index f444d42a..40513533 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -2,15 +2,15 @@ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase """ from plasticc_utils import plasticc_log_loss, plot_confusion_matrix +from astropy.table import Table +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from argparse import ArgumentParser import numpy as np import pandas as pd import os import subprocess import multiprocessing -from astropy.table import Table -from sklearn.model_selection import train_test_split -from sklearn.ensemble import RandomForestClassifier -from argparse import ArgumentParser import yaml import warnings warnings.filterwarnings("ignore") @@ -34,6 +34,7 @@ def get_git_revision_short_hash(): Examples -------- + >>> ... >>> sha = get_git_revision_short_hash() >>> print(sha) 'ede068e' @@ -62,7 +63,7 @@ def create_folder_structure(analysis_directory, analysis_name): Examples -------- Each folder name can then be accessed with dictionary methods: - + >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) """ @@ -82,105 +83,203 @@ def create_folder_structure(analysis_directory, analysis_name): return dirs +def load_configuration_file(path_to_configuration_file): + # TODO: Finish doctring examples + """ Load from disk the configuration file that is to be used + + Parameters + ---------- + path_to_configuration_file : str + System path to where the configuration file is located + + Returns + ------- + params : dict + Dictionary of parameters contained inside the configuration file + + Examples + -------- + Each item inside the configuration file can be accessed like so: + >>> ... 
+ >>> params = load_configuration_file(path_to_configuration_file) + >>> data_path = params.get("data_path", None) + >>> print(data_path) + >>> ngp = params.get("ngp", None) + >>> print(ngp) + """ + try: + with open(path_to_configuration_file) as f: + params = yaml.load(f) + except IOError: + print("Invalid yaml file provided") + exit() + print("The PARAMS are:\n {}".format(params)) + return params + + def save_configuration_file(dirs): + # TODO: Provide a doctring example + """ Make a copy of the configuration file that has been used inside the + analysis directory + + Parameters + ---------- + dirs : dict + Dictionary containing the names of the folder paths used in this analysis + + Returns + ------- + None - METHOD_DIR = dirs.get("method_dir", None) - with open('/{}/config.yaml'.format(METHOD_DIR), 'w') as config: + """ + method_directory = dirs.get("method_directory", None) + with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) -def load_dataset(DATA_PATH): +def load_training_data(data_path): + # TODO: Finish doctring examples + """ Load from disk the training data one will use for this analysis + + Parameters + ---------- + params : dict + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + + Returns + ------- + training_data : snmachine.PlasticcData + snmachine.PlasticcData instance of the training data + + Examples + -------- + >>> ... + >>> training_data = load_training_data(params) + >>> print(training_data) + """ try: - if DATA_PATH.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): - with open(DATA_PATH, 'rb') as input: + if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): + with open(data_path, 'rb') as input: print("Opening from binary pickle") - dat = pickle.load(input) - print("Dataset loaded from pickle file as: {}".format(dat)) + training_data = pickle.load(input) + print("Dataset loaded from pickle file as: {}".format(training_data)) else: - - folder, data_file = os.path.split(DATA_PATH) - print(folder, data_file) - meta_file = "_metadata.".join(data_file.split(".")) + folder_path, train_data_file_name = os.path.split(data_path) + print(folder_path, train_data_file_name) + meta_data_file_name = "_metadata.".join(train_data_file_name.split(".")) print("Opening from CSV") - dat = sndata.PlasticcData(folder=folder, data_file=data_file, meta_file=meta_file, from_pickle=False) - print("Dataset loaded from csv file as: {}".format(dat)) - print("Saving {} object to pickle binary".format(dat)) - - dat_binary = os.path.splitext(data_file)[0]+".pckl" - print(os.path.join(folder, dat_binary)) - with open(os.path.join(folder, dat_binary), 'wb') as f: - pickle.dump(dat, f, pickle.HIGHEST_PROTOCOL) + training_data = sndata.PlasticcData(folder=folder_path, data_file=train_data_file_name, + metadata_file=meta_data_file_name, cut_non_detections=False) + print("Dataset loaded from csv file as: {}".format(training_data)) + print("Saving {} object to pickle binary".format(training_data)) + + dat_binary = os.path.splitext(train_data_file_name)[0] + ".pckl" + print(os.path.join(folder_path, dat_binary)) + with open(os.path.join(folder_path, dat_binary), 'wb') as f: + pickle.dump(training_data, f, pickle.HIGHEST_PROTOCOL) except FileNotFoundError: - print("Oii, load something !!") + print("No file found to load") + exit() + + return training_data - return dat +def reduce_size_of_training_data(training_data, dirs, 
subset_size, seed=1234): + # TODO: Incorpate further doctrings and finish examples. Tarek: Catarina and I need to + # discuss this further. There is some overlap between this and + # sndata.PlasticcData.update_data() and it would be good to comebine this. + """ Load from disk the training data one will use for this analysis -def reduce_dataset(dat, dirs, subset_size, seed=1234): + Parameters + ---------- + training_data : snmachine.PlasticcData + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + dirs : dict + Dictionary containing + subset_size : int + Number of objects the user would like to reduce the training data to + seed : int + Default set to 1234. This can be overridden by the user to check for + consistancy of results - METHOD_DIR = dirs.get("method_dir", None) - subset_file = '/{}/subset.list'.format(METHOD_DIR) + Returns + ------- + None + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ + + method_directory = dirs.get("method_directory", None) + subset_file = os.path.join(method_directory, "subset.list") if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') else: np.random.seed(seed) - rand_objs = np.random.choice(dat.object_names, replace=False, size=subset_size) + rand_objs = np.random.choice(training_data.object_names, replace=False, size=subset_size) rand_objs_sorted_int = np.sort(rand_objs.astype(np.int)) rand_objs = rand_objs_sorted_int.astype('>>> COULD BE ITS OWN LOAD CONFIGURATION FUNCTION? - try: - with open(arguments.configuration) as f: - params = yaml.load(f) - except IOError: - print("Invalid yaml file provided") - exit() - - print("The PARAMS are:\n {}".format(params)) - - return params - - if __name__ == "__main__": + # Set the number of processes you want to use throughout the notebook + nprocesses = multiprocessing.cpu_count() + print("Running with {} cores".format(nprocesses)) + parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") @@ -277,13 +346,7 @@ def load_configuration_file(path_to_configuration_file): params = load_configuration_file(arguments.configuration) - # global settings - RANDOM_STATE = params.get("RANDOM_STATE", None) - # Tarek: maybe remove this completely and - # set inside a function call itself, i.e. have a default which can be - # overridden - SEED = params.get("SEED", None) - DATA_PATH = params.get("DATA_PATH", None) + data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) @@ -291,37 +354,44 @@ def load_configuration_file(path_to_configuration_file): ngp = params.get("ngp", None) initheta = params.get("initheta", None) - # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() - print("Running with {} cores".format(nprocesses)) - + # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) + # Step 2. Save configuration file used for this analysis save_configuration_file(dirs) - - # RUN PIPELINE + # Step 3. Check at which point the user would like to run the analysis from. 
+ # If elements already saved, these will be used but this can be overriden + # with command line argument if (arguments.restart.lower() == "wavelets"): - - wavelet_features = Table.read(dirs.get("features_dir")+"/wavelet_features.fits") - combined_features = combine_additional_features(wavelet_features, DATA_PATH) + # Restart from saved uncompressed wavelets. + wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") + combined_features = combine_all_features(wavelet_features, data_path) classifer = create_classififer(combined_features) - elif (arguments.restart.lower() == "gps"): - print("Hello") + # Restart from saved GPs. + pass else: - print("Running full pipeline .. ") + # Run full pipeline but still do checks to see if elements from GPs or + # wavelets already exist on disk; the first check should be for: + # 1. Saved PCA files + # 2. Saved uncompressed wavelets + # 3. Saved GPs - dat = load_dataset(DATA_PATH) - # dat = reduceDataset(dat, dirs, subset_size=10, SEED=SEED) - fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, - nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + training_data = load_training_data(data_path) + gps.compute_gps() + wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) + # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # combined_features = combine_all_features(wavelet_features, DATA_PATH) + # classifer = create_classififer(combined_features) - waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, + # nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) - combined_features = combine_additional_features(wavelet_features, DATA_PATH) - classifer = create_classififer(combined_features) - # snmachine.utils.fit_gaussian_process.extract_GP() - # check for wavelets, if so restartFromWavelets() - # else, check for gp's, if so restartFromGPs() - # otherwise runFullPipeline() + # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + + # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + + # combined_features = combine_all_features(wavelet_features, DATA_PATH) + # classifer = create_classififer(combined_features) From 2992782c73db0ffc9a79e2732659dcf343b9fc25 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 17:25:30 +0100 Subject: [PATCH 12/58] [WIP] Further updates to pipeline script --- utils/run_plasticc_pipeline.py | 56 ++++++++++++++++++++++++++++++++-- 1 file 
changed, 53 insertions(+), 3 deletions(-) diff --git a/utils/run_plasticc_pipeline.py b/utils/run_plasticc_pipeline.py index 40513533..6f26efad 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/run_plasticc_pipeline.py @@ -113,7 +113,7 @@ def load_configuration_file(path_to_configuration_file): except IOError: print("Invalid yaml file provided") exit() - print("The PARAMS are:\n {}".format(params)) + print("The parameters are:\n {}".format(params)) return params @@ -280,14 +280,62 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): def combine_all_features(reduced_wavelet_features, dataframe): - # Combine snmachine wavelet features with PLASTICC features. Allow user to - # define the dataframe they would like to merge + # TODO: Improve docstrings. Discuss whether the user should pass in a CSV + # instead? + """ Combine snmachine wavelet features with PLASTICC features. The + user should define a dataframe they would like to merge. + + Parameters + ---------- + reduced_wavelet_features : numpy.ndarray + These are the N principle components from the uncompressed wavelets + dataframe : pandas.DataFrame + Dataframe + + Returns + ------- + combined_features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ meta_df = dat.metadata combined_features = merge_features(wavelet_features, meta_df) return combined_features def create_classififer(combined_features, random_state=42): + # TODO: Improve docstrings. Discuss whether the user should pass in a CSV + # instead? + """ Combine snmachine wavelet features with PLASTICC features. The + user should define a dataframe they would like to merge. + + Parameters + ---------- + reduced_wavelet_features : numpy.ndarray + These are the N principle components from the uncompressed wavelets + dataframe : pandas.DataFrame + Dataframe + + Returns + ------- + combined_features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(shape.training_data) + + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) + + """ X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -323,6 +371,7 @@ def create_classififer(combined_features, random_state=42): weights = np.array([1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/18, 1/19]) + # weights[:-1] to ignore last class, the anomaly class log_loss = plasticc_log_loss(sklearn_truth, y_probs, relative_class_weights=weights[:-1]) print("LogLoss: {:.3f}\nBest Params: {}".format(log_loss, classifer.get_params)) @@ -330,6 +379,7 @@ def create_classififer(combined_features, random_state=42): def make_predictions(location_of_test_data, classifier): + # TODO: Move to a seperate make_predictions file pass From 8bd050ff1962f4c5f2122e4c243be4d22a2f27ad Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 17:36:15 +0100 Subject: [PATCH 13/58] Modifying file structure inside utils directory Renaming of files to make it easier to follow how the modern workflow takes place. 
Put old run_pipeline.py file in archive as this is no longer used --- utils/{ => archive}/run_pipeline.py | 0 utils/plasticc_feature_engineering.py | 0 utils/plasticc_make_predictions.py | 0 ...sticc_pipeline.py => plasticc_pipeline.py} | 112 ++++++++++-------- 4 files changed, 62 insertions(+), 50 deletions(-) rename utils/{ => archive}/run_pipeline.py (100%) create mode 100644 utils/plasticc_feature_engineering.py create mode 100644 utils/plasticc_make_predictions.py rename utils/{run_plasticc_pipeline.py => plasticc_pipeline.py} (79%) diff --git a/utils/run_pipeline.py b/utils/archive/run_pipeline.py similarity index 100% rename from utils/run_pipeline.py rename to utils/archive/run_pipeline.py diff --git a/utils/plasticc_feature_engineering.py b/utils/plasticc_feature_engineering.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/plasticc_make_predictions.py b/utils/plasticc_make_predictions.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/run_plasticc_pipeline.py b/utils/plasticc_pipeline.py similarity index 79% rename from utils/run_plasticc_pipeline.py rename to utils/plasticc_pipeline.py index 6f26efad..b0fc3211 100755 --- a/utils/run_plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -1,5 +1,5 @@ """ -Machine learning pipeline for the PLAsTiCC competition using snmachine codebase +Machine learning pipeline for the PLAsTiCC competition using snmachine codebase. """ from plasticc_utils import plasticc_log_loss, plot_confusion_matrix from astropy.table import Table @@ -237,36 +237,43 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): training_data.data = {objects: training_data.data[objects] for objects in training_data.object_names} print("Dataset reduced to {} objects".format(training_data.object_names.shape[0])) -# def fit_gaussian_process(dat, **kwargs): # Cat: Do we really want a mask funtion? -# # Tarek: Now that this file lives in snmachine and with the extensive -# # refactoring this is no longer necessary I believe -# # extract_GP(dat, **kwargs) -# # snfeatures.WaveletFeatures.extract_GP(dat, **kwargs) -# pass - - -# def wavelet_decomposition(dat, ngp, **kwargs): # Cat: we need to add ngp as input otherwise it doesn't run on the notebbok - -# wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) -# print("WAV = {}\n".format(wavelet_object.wav)) -# print("MLEV = {}\n".format(wavelet_object.mlev)) -# print("NGP = {}\n".format(ngp)) -# waveout, waveout_err = wavelet_object.extract_wavelets(dat, wavelet_object.wav, wavelet_object.mlev, **kwargs) -# return waveout, waveout_err, wavelet_object +def wavelet_decomposition(training_data, ngp, **kwargs): + """ Load from disk the training data one will use for this analysis + Parameters + ---------- + training_data : snmachine.PlasticcData + Dictionary containing the parameters that reside in the configuration + file. This will be used to obtain the path to the training data. + dirs : dict + Dictionary containing + subset_size : int + Number of objects the user would like to reduce the training data to + seed : int + Default set to 1234. 
This can be overridden by the user to check for + consistancy of results -# def dimentionality_reduction(wavelet_object, dirs, object_names, waveout, tolerance, **kwargs): # Cat: we need to add tolerance + Returns + ------- + None -# # check if reduced wavelet features already exist -# wavelet_features, eigenvalues, eigenvectors, means, num_feats = wavelet_object.extract_pca(object_names, waveout, **kwargs) + Examples + -------- + >>> ... + >>> print(shape.training_data) -# output_root = dirs.get("features_dir") -# print("Inside dimRedux: {}\n".format(output_root)) -# wavelet_features.write('{}/wavelet_features_{}.fits'.format(output_root, str(tolerance)[2:])) + >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> print(shape.new_training_data) -# return wavelet_features, eigenvalues, eigenvectors, means + """ + wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + print("WAV = {}\n".format(wavelet_object.wav)) + print("MLEV = {}\n".format(wavelet_object.mlev)) + print("NGP = {}\n".format(ngp)) + waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) + return waveout, waveout_err, wavelet_object # def merge_features(some_features, other_features): # # TODO: Move this to a data processing file @@ -311,21 +318,19 @@ def combine_all_features(reduced_wavelet_features, dataframe): def create_classififer(combined_features, random_state=42): - # TODO: Improve docstrings. Discuss whether the user should pass in a CSV - # instead? - """ Combine snmachine wavelet features with PLASTICC features. The - user should define a dataframe they would like to merge. + # TODO: Improve docstrings. + """ Creation of an optimised Random Forest classifier. Parameters ---------- - reduced_wavelet_features : numpy.ndarray - These are the N principle components from the uncompressed wavelets - dataframe : pandas.DataFrame - Dataframe + combined_features : pandas.DataFrame + This contains. Index on objects + random_state : int + To allow for reproducible... Returns ------- - combined_features : pandas.DataFrame + classifer : sklearn.RandomForestClassifier object Examples -------- @@ -403,6 +408,7 @@ def make_predictions(location_of_test_data, classifier): # snmachine parameters ngp = params.get("ngp", None) initheta = params.get("initheta", None) + number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) @@ -422,26 +428,32 @@ def make_predictions(location_of_test_data, classifier): else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: - # 1. Saved PCA files - # 2. Saved uncompressed wavelets - # 3. Saved GPs - + # a. Saved PCA files + # path_saved_reduced_wavelets = dirs.get("intermediate_files_directory") + # eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy')) + # means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy')) + # b. Saved uncompressed wavelets + # c. Saved GPs + + # Step 4. 
Load in training data training_data = load_training_data(data_path) - gps.compute_gps() - wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) - waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) - # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) - # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) - # combined_features = combine_all_features(wavelet_features, DATA_PATH) - # classifer = create_classififer(combined_features) + # Step 5. Compute GPs + gps.compute_gps(training_data, number_gp=100, t_min=0, t_max=1100, + kernel_param=[500., 20.], + output_root=dirs['intermediate_files_directory'], + number_processes=nprocesses) - # fit_gaussian_process(dat, ngp=ngp, t_min=0, initheta=initheta, - # nprocesses=nprocesses, output_root=dirs.get("interm_dir"), t_max=1100) + # Step 6. Extract wavelet coeffiencts + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + save_output='all', output_root=dirs.get("intermediate_files_directory")) - # waveout, waveout_err, wavelet_object = wavelet_decomposition(dat, ngp=ngp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("interm_dir")) + # Step 7. Reduce dimensionality of wavelets by using only N principle components + wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, + tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) - # wavelet_features, eigenvalues, eigenvectors, means = dimentionality_reduction(wavelet_object, dirs, dat.object_names.copy(), waveout, tolerance=0.99, save_output=True, recompute_pca=True, output_root=dirs.get("features_dir")) + # Step 8. TODO Combine snmachine features with user defined features + # Step 9. TODO Create a Random Forest classifier; need to fit model and + # save it. - # combined_features = combine_all_features(wavelet_features, DATA_PATH) - # classifer = create_classififer(combined_features) + # Step 10. TODO Use saved classifier to make predictions. 
This can occur using a seperate file From 3e85f5b79c4b0178219ba25243f7c3ff6f78ebb3 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 14 May 2019 18:40:31 +0100 Subject: [PATCH 14/58] Updating configuration file --- utils/config.yml | 18 +++++++--------- utils/plasticc_pipeline.py | 43 ++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index f5949880..be92e897 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,12 +1,8 @@ -# -# -# GENERAL PARAMS -SEED : 1234 -REPO_DIR : "/share/hypatia/snmachine_resources/data/plasticc/" -ANALYSIS_DIR : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -ANALYSIS_NAME : "test-analysis" -DATA_PATH : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set.pkl" - -# SNMACHINE_PARAMS +# Global settings +analysis_dir : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +analysis_name : "pipeline-test" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +# snmachine parameters ngp : 1100 -initheta : [500, 20] +initheta : [500., 20.] +number_of_principle_components : 200 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index b0fc3211..ef89a07f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -66,6 +66,9 @@ def create_folder_structure(analysis_directory, analysis_name): >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) + >>> directories = create_folder_structure(analysis_directory, analysis_name) + >>> print(directories.get("method_directory")) + """ method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -104,8 +107,10 @@ def load_configuration_file(path_to_configuration_file): >>> params = load_configuration_file(path_to_configuration_file) >>> data_path = params.get("data_path", None) >>> print(data_path) + >>> ngp = params.get("ngp", None) >>> print(ngp) + """ try: with open(path_to_configuration_file) as f: @@ -117,22 +122,27 @@ def load_configuration_file(path_to_configuration_file): return params -def save_configuration_file(dirs): +def save_configuration_file(method_directory): # TODO: Provide a doctring example """ Make a copy of the configuration file that has been used inside the analysis directory Parameters ---------- - dirs : dict - Dictionary containing the names of the folder paths used in this analysis + method_directory : string + The folder path used for this analysis Returns ------- None + Examples + -------- + >>> ... + >>> save_configuration_file(method_directory) + >>> print() + """ - method_directory = dirs.get("method_directory", None) with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) @@ -261,10 +271,9 @@ def wavelet_decomposition(training_data, ngp, **kwargs): Examples -------- >>> ... 
- >>> print(shape.training_data) - - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + save_output='all', output_root=dirs.get("intermediate_files_directory")) + >>> print() """ @@ -306,10 +315,12 @@ def combine_all_features(reduced_wavelet_features, dataframe): Examples -------- >>> ... - >>> print(shape.training_data) + >>> print(shape.reduced_wavelet_features) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> print(shape.dataframe) + + >>> combined_features = combine_all_features(reduced_wavelet_features, dataframe) + >>> print(shape.combined_features) """ meta_df = dat.metadata @@ -335,10 +346,10 @@ def create_classififer(combined_features, random_state=42): Examples -------- >>> ... - >>> print(shape.training_data) + >>> classifier, confusion_matrix = create_classififer(combined_features) + >>> print(classifier) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) - >>> print(shape.new_training_data) + >>> plot_confusion_matrix(confusion_matrix) """ @@ -439,8 +450,8 @@ def make_predictions(location_of_test_data, classifier): training_data = load_training_data(data_path) # Step 5. Compute GPs - gps.compute_gps(training_data, number_gp=100, t_min=0, t_max=1100, - kernel_param=[500., 20.], + gps.compute_gps(training_data, number_gp=ngp, t_min=0, t_max=1100, + kernel_param=initheta, output_root=dirs['intermediate_files_directory'], number_processes=nprocesses) From 4179b2c287c2c076d882708fad0a0c030ecce30b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 15 May 2019 14:15:18 +0100 Subject: [PATCH 15/58] Append git has to analysis name --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index be92e897..7acfaa16 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,5 +1,5 @@ # Global settings -analysis_dir : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" +analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index ef89a07f..5b18e45e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -70,7 +70,10 @@ def create_folder_structure(analysis_directory, analysis_name): >>> print(directories.get("method_directory")) """ - method_directory = os.path.join(analysis_directory, analysis_name + get_git_revision_short_hash()) + # Append Git has to analysis name + analysis_name = analysis_name + "-" + get_git_revision_short_hash() + + method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') classifications_directory = os.path.join(method_directory, 'classifications') intermediate_files_directory = os.path.join(method_directory, 'intermediate') @@ -409,8 +412,9 @@ def make_predictions(location_of_test_data, classifier): parser.add_argument('--configuration', '-c') parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full") arguments = parser.parse_args() + 
arguments = vars(arguments) - params = load_configuration_file(arguments.configuration) + params = load_configuration_file(arguments['configuration']) data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) @@ -424,16 +428,16 @@ def make_predictions(location_of_test_data, classifier): # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis - save_configuration_file(dirs) + save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument - if (arguments.restart.lower() == "wavelets"): + if (arguments['restart_from'].lower() == "wavelets"): # Restart from saved uncompressed wavelets. wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") combined_features = combine_all_features(wavelet_features, data_path) classifer = create_classififer(combined_features) - elif (arguments.restart.lower() == "gps"): + elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass else: From 32eb2ebb1e106024def138620c8e5a8518f517bf Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 17 May 2019 08:59:53 +0100 Subject: [PATCH 16/58] Updating variable names to be consistent gps.py defines variable names for the kernel parameters and number of points for the GPs. This change updates the configuration file and pipeline to be in line with that file --- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 7acfaa16..34a726e0 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -3,6 +3,6 @@ analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name : "pipeline-test" data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters -ngp : 1100 -initheta : [500., 20.] +number_gp : 1100 +kernel_param : [500., 20.] number_of_principle_components : 200 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 5b18e45e..f58217fd 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -111,8 +111,8 @@ def load_configuration_file(path_to_configuration_file): >>> data_path = params.get("data_path", None) >>> print(data_path) - >>> ngp = params.get("ngp", None) - >>> print(ngp) + >>> number_gp = params.get("number_gp", None) + >>> print(number_gp) """ try: @@ -251,7 +251,7 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): print("Dataset reduced to {} objects".format(training_data.object_names.shape[0])) -def wavelet_decomposition(training_data, ngp, **kwargs): +def wavelet_decomposition(training_data, number_gp, **kwargs): """ Load from disk the training data one will use for this analysis Parameters @@ -274,16 +274,17 @@ def wavelet_decomposition(training_data, ngp, **kwargs): Examples -------- >>> ... 
- >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + >>> waveout, waveout_err, wavelet_object = + wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() """ - wavelet_object = snfeatures.WaveletFeatures(ngp=ngp) + wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp) print("WAV = {}\n".format(wavelet_object.wav)) print("MLEV = {}\n".format(wavelet_object.mlev)) - print("NGP = {}\n".format(ngp)) + print("number_gp = {}\n".format(number_gp)) waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) return waveout, waveout_err, wavelet_object @@ -421,8 +422,8 @@ def make_predictions(location_of_test_data, classifier): analysis_name = params.get("analysis_name", None) # snmachine parameters - ngp = params.get("ngp", None) - initheta = params.get("initheta", None) + number_gp = params.get("number_gp", None) + kernel_param = params.get("kernel_param", None) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -454,13 +455,13 @@ def make_predictions(location_of_test_data, classifier): training_data = load_training_data(data_path) # Step 5. Compute GPs - gps.compute_gps(training_data, number_gp=ngp, t_min=0, t_max=1100, - kernel_param=initheta, + gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, + kernel_param=kernel_param, output_root=dirs['intermediate_files_directory'], number_processes=nprocesses) # Step 6. Extract wavelet coeffiencts - waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, ngp=ngp, nprocesses=nprocesses, + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, save_output='all', output_root=dirs.get("intermediate_files_directory")) # Step 7. Reduce dimensionality of wavelets by using only N principle components From cdf659d94dc601394a9260b6e32e9f42eee49c94 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Fri, 17 May 2019 19:56:59 +0100 Subject: [PATCH 17/58] Updating new var name to be consistant with gps.py --- snmachine/gps.py | 6 +-- snmachine/snaugment.py | 2 +- snmachine/snclassifier.py | 10 ++--- snmachine/snfeatures.py | 85 +++++++++++++++++++------------------- test/snclassifier_test.py | 2 +- test/snfeatures_test.py | 32 +++++++------- utils/plasticc_pipeline.py | 10 ++--- 7 files changed, 74 insertions(+), 73 deletions(-) diff --git a/snmachine/gps.py b/snmachine/gps.py index 5b965756..3dec7868 100644 --- a/snmachine/gps.py +++ b/snmachine/gps.py @@ -56,7 +56,7 @@ def compute_gps(dataset, number_gp, t_min, t_max, kernel_param=[500., 20.], outp output_root : {None, str}, optional If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there. number_processes : int, optional - Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1. + Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1. gp_algo : str, optional which gp package is used for the Gaussian Process Regression, GaPP or george """ @@ -148,7 +148,7 @@ def _compute_gps_parallel(dataset, number_gp, t_min, t_max, kernel_param, output output_root : {None, str}, optional If None, don't save anything. 
If str, it is the output directory, so save the flux and error estimates and used kernels there. number_processes : int, optional - Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1. + Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1. gp_algo : str, optional which gp package is used for the Gaussian Process Regression, GaPP or george """ @@ -413,4 +413,4 @@ def get_kernel(kernel_name, kernel_param): elif kernel_name == 'ExpSquared+ExpSine2': kExpSine2 = kernel_param[4]*george.kernels.ExpSine2Kernel(gamma=kernel_param[5],log_period=kernel_param[6]) kernel = kExpSquared + kExpSine2 - return kernel \ No newline at end of file + return kernel diff --git a/snmachine/snaugment.py b/snmachine/snaugment.py index 4e582cd8..564a6f96 100644 --- a/snmachine/snaugment.py +++ b/snmachine/snaugment.py @@ -123,7 +123,7 @@ def extract_proxy_features(self,peak_filter='desr',nproc=1,fit_salt2=False,salt2 #tf=snfeatures.TemplateFeatures(sampler='leastsq') tf=snfeatures.TemplateFeatures(sampler=sampler) if salt2feats is None: - salt2feats=tf.extract_features(self.dataset,nprocesses=nproc,use_redshift=fix_redshift) + salt2feats=tf.extract_features(self.dataset,number_processes=nproc,use_redshift=fix_redshift) #fit models and extract r-peakmags peaklogflux=[] diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py index 0e3a4cd2..185356a1 100644 --- a/snmachine/snclassifier.py +++ b/snmachine/snclassifier.py @@ -608,7 +608,7 @@ def __call_classifier(classifier, X_train, y_train, X_test, param_dict, return_c def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', 'knn', 'svm', 'neural_network', 'boost_dt'], - training_set=0.3, param_dict={}, nprocesses=1, scale=True, + training_set=0.3, param_dict={}, number_processes=1, scale=True, plot_roc_curve=True, return_classifier=False, classifiers_for_cm_plots=[], type_dict=None, seed=1234): """ @@ -632,7 +632,7 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', the ID's of the objects to be used param_dict : dict, optional Use to run different ranges of hyperparameters for the classifiers when optimising - nprocesses : int, optional + number_processes : int, optional Number of processors for multiprocessing (shared memory only). Each classifier will then be run in parallel. scale : bool, optional Rescale features using sklearn's preprocessing Scalar class (highly recommended this is True) @@ -707,15 +707,15 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', probabilities = {} classifier_objects = {} - if nprocesses > 1 and return_classifier: + if number_processes > 1 and return_classifier: print("Due to limitations with python's multiprocessing module, classifier objects cannot be returned if " \ "multiple processors are used. 
Continuing serially...") print() - if nprocesses > 1 and not return_classifier: + if number_processes > 1 and not return_classifier: partial_func=partial(__call_classifier, X_train=X_train, y_train=y_train, X_test=X_test, param_dict=param_dict, return_classifier=False) - p = Pool(nprocesses, maxtasksperchild=1) + p = Pool(number_processes, maxtasksperchild=1) result = p.map(partial_func, classifiers) for i in range(len(result)): diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index d1b6ee39..6e903c21 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -686,7 +686,7 @@ def __init__(self, model=['Ia'], sampler='leastsq',lsst_bands=False,lsst_dir='.. 'nugent-sn2l':{'z':(0.01, 1.5)}, 'nugent-sn1bc':{'z':(0.01, 1.5)}} - def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, nprocesses=1, restart=False, seed=-1): + def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, number_processes=1, restart=False, seed=-1): """ Extract template features for a dataset. @@ -700,7 +700,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r Where to save the chains use_redshift : bool Whether or not to use provided redshift when fitting objects - nprocesses : int, optional + number_processes : int, optional Number of processors to use for parallelisation (shared memory only) restart : bool Whether or not to restart from multinest chains @@ -736,7 +736,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r output = Table(names=labels, dtype=['U32'] + ['f'] * (len(labels) - 1)) k=0 - if nprocesses<2: + if number_processes<2: for obj in d.object_names: if k%100==0: print (k, 'objects fitted') @@ -779,7 +779,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r else: if self.sampler=='leastsq': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_leastsq_templates, d=d, model_name=self.templates[mod_name], use_redshift=use_redshift, bounds=self.bounds[self.templates[mod_name]]) out=p.map(partial_func, d.object_names) output=out[0] @@ -790,7 +790,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r else: all_output=vstack((all_output, output)) elif self.sampler=='nested': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_multinest_templates, d=d, model_name=self.templates[mod_name], bounds=self.bounds[self.templates[mod_name]], chain_directory=chain_directory, nlp=1000, convert_to_binary=True, use_redshift=use_redshift, short_name=self.short_names[mod_name], restart=restart, seed=seed) out=p.map(partial_func, d.object_names) @@ -914,7 +914,7 @@ def __init__(self, model_choice, sampler='leastsq', limits=None): - def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, nprocesses=1, n_walkers=100, + def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, number_processes=1, n_walkers=100, n_steps=500, walker_spread=0.1, burn=50, nlp=1000, starting_point=None, convert_to_binary=True, n_iter=0, restart=False, seed=-1): """ Fit parametric models and return best-fitting parameters as features. @@ -930,7 +930,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte n_attempts : int Allow the minimiser to start in new random locations if the fit is bad. 
Put n_attempts=1 to fit only once with the default starting position. - nprocesses : int, optional + number_processes : int, optional Number of processors to use for parallelisation (shared memory only) n_walkers : int emcee parameter - number of walkers to use @@ -963,7 +963,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte output=[] #obj=d.object_names[0] - if nprocesses<2: + if number_processes<2: k=0 for obj in d.object_names: if k%100==0: @@ -984,14 +984,14 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte k+=1 else: if self.sampler=='leastsq': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_leastsq, d=d, model=self.model, n_attempts=n_attempts, seed=seed) out=p.map(partial_func, d.object_names) output=out[0] for i in range(1, len(out)): output=vstack((output, out[i])) elif self.sampler=='nested': - p=Pool(nprocesses, maxtasksperchild=1) + p=Pool(number_processes, maxtasksperchild=1) partial_func=partial(_run_multinest, d=d, model=self.model,chain_directory=chain_directory, nlp=nlp, convert_to_binary=convert_to_binary, n_iter=n_iter, restart=restart, seed=seed) #Pool starts a number of threads, all of which may try to tackle all of the data. Better to take it in chunks @@ -999,7 +999,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte k=0 objs=d.object_names while k>> ... >>> waveout, waveout_err, wavelet_object = - wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, + wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() @@ -406,8 +406,8 @@ def make_predictions(location_of_test_data, classifier): if __name__ == "__main__": # Set the number of processes you want to use throughout the notebook - nprocesses = multiprocessing.cpu_count() - print("Running with {} cores".format(nprocesses)) + number_processes = multiprocessing.cpu_count() + print("Running with {} cores".format(number_processes)) parser = ArgumentParser(description="Run pipeline end to end") parser.add_argument('--configuration', '-c') @@ -458,10 +458,10 @@ def make_predictions(location_of_test_data, classifier): gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, kernel_param=kernel_param, output_root=dirs['intermediate_files_directory'], - number_processes=nprocesses) + number_processes=number_processes) # Step 6. Extract wavelet coeffiencts - waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, nprocesses=nprocesses, + waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) # Step 7. Reduce dimensionality of wavelets by using only N principle components From 5fb499e434e815db8c1e2df39a54ec1c7d713561 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 19:03:51 +0100 Subject: [PATCH 18/58] Reducing number of PCA components Reducing the number of PCA components from 200 to 10 as it is required that number of components be less than or equal to the number of objects. Thus for the dataset used here "training_set_snia.pickle" 10 is appropriate. This should fix this error: Running PCA... 
The condition number in the SVD is 1.02688179587e+23 and the normalized one is 5.00036575467e+22 Traceback (most recent call last): File "plasticc_pipeline.py", line 469, in tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 2005, in extract_pca normalize_variance=normalize_variance) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 1873, in _pca return self.pca_SVD(dataMatrix, ncomp, tol, normalize_variance) File "/home/tallam/.conda/envs/snmachine/lib/python3.6/site-packages/snmachine/snfeatures.py", line 1714, in pca_SVD assert isinstance(tol, np.float) AssertionError --- utils/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/config.yml b/utils/config.yml index 34a726e0..66715648 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -5,4 +5,4 @@ data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/trai # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] -number_of_principle_components : 200 +number_of_principle_components : 10 From 8bb380c990679ed391fdbfea114309f3cb3152c5 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 09:03:03 +0100 Subject: [PATCH 19/58] Adding None return if key-value not found If one attempts to call the "method_directory" parameter from the dictionary but it does not exist, a None type return will occur --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 1fbc2628..336eeea7 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -67,7 +67,7 @@ def create_folder_structure(analysis_directory, analysis_name): >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) >>> directories = create_folder_structure(analysis_directory, analysis_name) - >>> print(directories.get("method_directory")) + >>> print(directories.get("method_directory", None)) """ # Append Git has to analysis name From 429355bd8fda2a848da99624af9cf65f13924f49 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 09:08:59 +0100 Subject: [PATCH 20/58] Removing unnecessary print statements --- utils/plasticc_pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 336eeea7..10f569ef 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -180,7 +180,6 @@ def load_training_data(data_path): print("Dataset loaded from pickle file as: {}".format(training_data)) else: folder_path, train_data_file_name = os.path.split(data_path) - print(folder_path, train_data_file_name) meta_data_file_name = "_metadata.".join(train_data_file_name.split(".")) print("Opening from CSV") From f84d22bd986e3cf2ee643483c9a5423b72393d18 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 21:12:51 +0100 Subject: [PATCH 21/58] Adding timestamp helper function This function is used to determine the last modified time of the configuration file that is being used and to place this in the name of the analysis run. 
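The traceback quoted above accompanies the change that drops number_of_principle_components from 200 to 10, since the number of components must not exceed the number of objects in the training set. A defensive guard of the following form is purely illustrative and not part of the patch; it assumes training_data exposes object_names as elsewhere in the pipeline.

    # Illustrative guard: never request more principal components than objects.
    number_of_objects = len(training_data.object_names)
    if number_of_principal_components > number_of_objects:
        raise ValueError("Requested {} principal components but only {} objects "
                         "are available".format(number_of_principal_components,
                                                number_of_objects))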
--- utils/plasticc_pipeline.py | 41 +++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 10f569ef..0c020415 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -43,7 +43,32 @@ def get_git_revision_short_hash(): return _hash.decode("utf-8").rstrip() -def create_folder_structure(analysis_directory, analysis_name): +def get_timestamp(path_to_configuration_file): + """ Helper function to obtain latest modified time of the configuration file + + Parameters + ---------- + path_to_configuration_file : str + System path to where the configuration file is located + + Returns + ------- + timestamp : str + Short representation of last modified time for the configuration file used. + 'YYYY-MM-DD-HOURMINUTE' + + Examples + -------- + >>> ... + >>> timestamp = get_timestamp(path_to_configuration_file) + >>> print(timestamp) + '2019-05-18-2100' + """ + _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) + return _timestamp + + +def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): """ Make directories that will be used for analysis Parameters @@ -66,17 +91,17 @@ def create_folder_structure(analysis_directory, analysis_name): >>> ... >>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) - >>> directories = create_folder_structure(analysis_directory, analysis_name) + >>> directories = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) >>> print(directories.get("method_directory", None)) """ - # Append Git has to analysis name - analysis_name = analysis_name + "-" + get_git_revision_short_hash() + # Prepend last modified time of configuration file and git SHA to analysis name + analysis_name = get_timestamp(path_to_configuration_file) + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') classifications_directory = os.path.join(method_directory, 'classifications') - intermediate_files_directory = os.path.join(method_directory, 'intermediate') + intermediate_files_directory = os.path.join(method_directory, 'intermediate_files') plots_directory = os.path.join(method_directory, 'plots') dirs = {"method_directory": method_directory, "features_directory": features_directory, @@ -414,7 +439,9 @@ def make_predictions(location_of_test_data, classifier): arguments = parser.parse_args() arguments = vars(arguments) - params = load_configuration_file(arguments['configuration']) + path_to_configuration_file = arguments['configuration'] + + params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) analysis_directory = params.get("analysis_directory", None) @@ -426,7 +453,7 @@ def make_predictions(location_of_test_data, classifier): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name) + dirs = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) # Step 2. Save configuration file used for this analysis save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. 
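The get_timestamp helper introduced in the patch above shells out to `date +%Y-%m-%d-%H%M -r <config file>`. A pure-Python equivalent based on the file's modification time would avoid both the subprocess call and the bytes-versus-str decoding fixed in the following patch; the sketch below is an alternative, not what the series implements, and the helper name is hypothetical.

    import os
    from datetime import datetime

    def get_timestamp_from_mtime(path_to_configuration_file):
        # Last-modified time of the configuration file, formatted as
        # 'YYYY-MM-DD-HHMM' to match the output of get_timestamp() above.
        mtime = os.path.getmtime(path_to_configuration_file)
        return datetime.fromtimestamp(mtime).strftime("%Y-%m-%d-%H%M")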
From be45b5957187e8d632e382a10eb8883e11342917 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Sat, 18 May 2019 21:15:45 +0100 Subject: [PATCH 22/58] Fixes Type error: can't concat str to bytes --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 0c020415..be9532ff 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -65,7 +65,7 @@ def get_timestamp(path_to_configuration_file): '2019-05-18-2100' """ _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) - return _timestamp + return _timestamp.decode("utf-8").rstrip() def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): From fa0373eb9066db1ccc2b82e80ffeff8b0e37094f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 12:03:01 +0100 Subject: [PATCH 23/58] Updating path to features directory for wavelets --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 66715648..1a25addd 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_train_data.pckl" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index be9532ff..52bdba6e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -492,7 +492,7 @@ def make_predictions(location_of_test_data, classifier): # Step 7. Reduce dimensionality of wavelets by using only N principle components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, - tol=None, pca_path=None, save_output=True, output_root=dirs.get("intermediate_files_directory")) + tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) # Step 8. TODO Combine snmachine features with user defined features # Step 9. TODO Create a Random Forest classifier; need to fit model and From d16bc3ef06db6d52a48dbc36ebc1781498c3868e Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 12:14:12 +0100 Subject: [PATCH 24/58] Fixing spelling error for 'Principal' in PCA --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 1a25addd..ab511dc2 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -5,4 +5,4 @@ data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_ # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] 
-number_of_principle_components : 10 +number_of_principal_components : 10 diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 52bdba6e..0320f271 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -332,7 +332,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): Parameters ---------- reduced_wavelet_features : numpy.ndarray - These are the N principle components from the uncompressed wavelets + These are the N principal components from the uncompressed wavelets dataframe : pandas.DataFrame Dataframe @@ -490,7 +490,7 @@ def make_predictions(location_of_test_data, classifier): waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) - # Step 7. Reduce dimensionality of wavelets by using only N principle components + # Step 7. Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) From 699b9028d4810930c3b974f963be40c40b7fea83 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 14:01:19 +0100 Subject: [PATCH 25/58] Converting wavelet features to pandas dataframe --- utils/plasticc_pipeline.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 0320f271..eb093e60 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -356,7 +356,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): return combined_features -def create_classififer(combined_features, random_state=42): +def create_classifier(combined_features, training_data, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -374,12 +374,16 @@ def create_classififer(combined_features, random_state=42): Examples -------- >>> ... - >>> classifier, confusion_matrix = create_classififer(combined_features) + >>> classifier, confusion_matrix = create_classifier(combined_features) >>> print(classifier) >>> plot_confusion_matrix(confusion_matrix) """ + # TODO: This is temporary while the pipeline is tested. + if isinstance(combined_features, np.ndarray): + features_pd = pd.DataFrame(combined_features, index=training_data.object_names) + features_pd['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -463,7 +467,7 @@ def make_predictions(location_of_test_data, classifier): # Restart from saved uncompressed wavelets. wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") combined_features = combine_all_features(wavelet_features, data_path) - classifer = create_classififer(combined_features) + classifer = create_classifier(combined_features) elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass @@ -495,7 +499,8 @@ def make_predictions(location_of_test_data, classifier): tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) # Step 8. TODO Combine snmachine features with user defined features - # Step 9. TODO Create a Random Forest classifier; need to fit model and - # save it. + # Step 9. 
TODO Create a Random Forest classifier; need to fit model and save it. + combined_features = wavelet_features # For running tests for now + create_classifier(combined_features, training_data) # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From 27a0f6e894882a8078e955b3dd30a48091355893 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 15:17:25 +0100 Subject: [PATCH 26/58] Updating confusion matrix functions This function now displays the confusion matrix as ASCII table in console as well as returning seaborn figure --- utils/plasticc_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index c17a2d5b..e56d1852 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -9,20 +9,26 @@ import seaborn as sns -def plot_confusion_matrix(yTrue, yPredict, dataName, targetNames): - cm = confusion_matrix(yTrue, yPredict, labels=targetNames) - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] +def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False): + cm = confusion_matrix(y_true, y_pred, labels=target_names) + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + print(cm) + annot = np.around(cm, 2) fig, ax = plt.subplots(figsize=(9, 7)) - sns.heatmap(cm, xticklabels=targetNames, - yticklabels=targetNames, cmap='Blues', + sns.heatmap(cm, xticklabels=target_names, + yticklabels=target_names, cmap='Blues', annot=annot, lw=0.5) ax.set_xlabel('Predicted Label') ax.set_ylabel('True Label') ax.set_aspect('equal') - plt.title(dataName) + plt.title(title) return cm From 2b19babacd027c078dfdfc2ab384fda9b4b89659 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 20 May 2019 15:18:43 +0100 Subject: [PATCH 27/58] Updates made to 'create_classifier' functions Removal of Log Loss function call as well as stripping unused functions within 'create_classifier' --- utils/config.yml | 2 +- utils/plasticc_pipeline.py | 79 ++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index ab511dc2..84456cb7 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/new_train_data.pckl" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index eb093e60..479fcf0b 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -293,18 +293,20 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- - None + waveout: + + waveout_err: + + wavelet_object: Examples -------- >>> ... 
- >>> waveout, waveout_err, wavelet_object = - wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, + >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) >>> print() """ - wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp) print("WAV = {}\n".format(wavelet_object.wav)) print("MLEV = {}\n".format(wavelet_object.mlev)) @@ -312,16 +314,6 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs) return waveout, waveout_err, wavelet_object -# def merge_features(some_features, other_features): -# # TODO: Move this to a data processing file -# if type(some_features) != pd.core.frame.DataFrame: -# some_features = some_features.to_pandas() -# if type(other_features) != pd.core.frame.DataFrame: -# other_features = other_features.to_pandas() -# merged_df = pd.merge(some_features, other_features) -# merged_df.set_index("Object", inplace=True) -# return merged_df - def combine_all_features(reduced_wavelet_features, dataframe): # TODO: Improve docstrings. Discuss whether the user should pass in a CSV @@ -351,8 +343,18 @@ def combine_all_features(reduced_wavelet_features, dataframe): >>> print(shape.combined_features) """ - meta_df = dat.metadata - combined_features = merge_features(wavelet_features, meta_df) +# def merge_features(some_features, other_features): +# # TODO: Move this to a data processing file +# if type(some_features) != pd.core.frame.DataFrame: +# some_features = some_features.to_pandas() +# if type(other_features) != pd.core.frame.DataFrame: +# other_features = other_features.to_pandas() +# merged_df = pd.merge(some_features, other_features) +# merged_df.set_index("Object", inplace=True) +# return merged_df + +# meta_df = dat.metadata +# combined_features = merge_features(wavelet_features, meta_df) return combined_features @@ -381,9 +383,13 @@ def create_classifier(combined_features, training_data, random_state=42): """ # TODO: This is temporary while the pipeline is tested. 
+ print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) if isinstance(combined_features, np.ndarray): - features_pd = pd.DataFrame(combined_features, index=training_data.object_names) - features_pd['target'] = training_data.labels.values + combined_features = pd.DataFrame(combined_features, index=training_data.object_names) + combined_features['target'] = training_data.labels.values + else: + combined_features = combined_features.to_pandas() + combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values @@ -398,31 +404,14 @@ def create_classifier(combined_features, training_data, random_state=42): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) - classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', - oob_score=True, n_jobs=-1, - random_state=random_state) - + classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) classifer.fit(X_train, y_train) y_preds = classifer.predict(X_test) - - confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names) + confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) y_probs = classifer.predict_proba(X_test) - nlines = len(target_names) - # we also need to express the truth table as a matrix - sklearn_truth = np.zeros((len(y_test), nlines)) - label_index_map = dict(zip(classifer.classes_, np.arange(nlines))) - for i, x in enumerate(y_test): - sklearn_truth[i][label_index_map[y_test[i]]] = 1 - - weights = np.array([1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/9, 1/18, 1/18, 1/18, 1/18, 1/18, 1/18, 1/19]) - - # weights[:-1] to ignore last class, the anomaly class - log_loss = plasticc_log_loss(sklearn_truth, y_probs, relative_class_weights=weights[:-1]) - print("LogLoss: {:.3f}\nBest Params: {}".format(log_loss, classifer.get_params)) - return classifer, confusion_matrix @@ -448,12 +437,15 @@ def make_predictions(location_of_test_data, classifier): params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) + print(data_path) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) # snmachine parameters number_gp = params.get("number_gp", None) + print(number_gp) kernel_param = params.get("kernel_param", None) + print(kernel_param) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -483,6 +475,7 @@ def make_predictions(location_of_test_data, classifier): # Step 4. Load in training data training_data = load_training_data(data_path) + print(training_data) # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, @@ -493,14 +486,24 @@ def make_predictions(location_of_test_data, classifier): # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) + print(waveout) + print(type(waveout)) + print(waveout_err) + print(type(waveout_err)) + print(wavelet_object) + print(type(wavelet_object)) # Step 7. 
Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) + print(wavelet_features) + print(type(wavelet_features)) # Step 8. TODO Combine snmachine features with user defined features + # Step 9. TODO Create a Random Forest classifier; need to fit model and save it. combined_features = wavelet_features # For running tests for now - create_classifier(combined_features, training_data) + classifer = create_classifier(combined_features, training_data) + print(classifer.best_params_) # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From a267363c4a0954e00d77c9702351458cf111a07d Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 10:18:20 +0100 Subject: [PATCH 28/58] Save SHA and timestamp inside copy of config file Previously one would prepend the hash and timestamp to the folder, but this became too verbose Adding checks if analysis name already created This should see if the user has already defined an existing analysis name already and check to see if they want to overwrite the results in that folder, or create a new one. --- utils/plasticc_pipeline.py | 46 ++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 479fcf0b..69528c7a 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import os +import sys import subprocess import multiprocessing import yaml @@ -60,15 +61,15 @@ def get_timestamp(path_to_configuration_file): Examples -------- >>> ... - >>> timestamp = get_timestamp(path_to_configuration_file) + >>> timestamp = get_timestamp() >>> print(timestamp) '2019-05-18-2100' """ - _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M', '-r', path_to_configuration_file]) + _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M']) return _timestamp.decode("utf-8").rstrip() -def create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file): +def create_folder_structure(analysis_directory, analysis_name): """ Make directories that will be used for analysis Parameters @@ -91,12 +92,12 @@ def create_folder_structure(analysis_directory, analysis_name, path_to_configura >>> ... 
>>> analysis_directory = params.get("analysis_directory", None) >>> analysis_name = params.get("analysis_name", None) - >>> directories = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) + >>> directories = create_folder_structure(analysis_directory, analysis_name) >>> print(directories.get("method_directory", None)) """ # Prepend last modified time of configuration file and git SHA to analysis name - analysis_name = get_timestamp(path_to_configuration_file) + "-" + get_git_revision_short_hash() + "-" + analysis_name + # analysis_name = get_timestamp() + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -108,8 +109,29 @@ def create_folder_structure(analysis_directory, analysis_name, path_to_configura "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, "plots_directory": plots_directory} - for key, value in dirs.items(): - subprocess.call(['mkdir', value]) + if os.path.isdir(method_directory): + errmsg = """ + Folders already exist with this analysis name. + + Are you sure you would like to proceed, this will overwrite the + {} folder [Y/n] + """.format(analysis_name) + raise OSError(errmsg) + + _yes = ["yes", "y", "ye"] + _no = ["no", "n"] + + choice = input().lower() + + if choice in _yes: + print("I am sure") + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) + elif choice in _no: + print("I am NOT sure") + sys.exit() + else: + sys.stdout.write("Please respond with 'yes' or 'no'") return dirs @@ -145,7 +167,7 @@ def load_configuration_file(path_to_configuration_file): params = yaml.load(f) except IOError: print("Invalid yaml file provided") - exit() + sys.exit() print("The parameters are:\n {}".format(params)) return params @@ -171,6 +193,12 @@ def save_configuration_file(method_directory): >>> print() """ + git_hash = {"git_hash": get_git_revision_short_hash()} + timestamp = {"timestamp": get_timestamp(path_to_configuration_file)} + + params.update(git_hash) + params.update(timestamp) + with open(os.path.join(method_directory, "config.yml"), 'w') as config: yaml.dump(params, config, default_flow_style=False) @@ -449,7 +477,7 @@ def make_predictions(location_of_test_data, classifier): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name, path_to_configuration_file) + dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis save_configuration_file(dirs.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. 
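The overwrite check added in the patch above prompts only once and, if the reply is not recognised, prints a reminder without asking again. A common refinement is to keep prompting until a valid answer is given; the helper below is an illustrative sketch with a hypothetical name, not part of the patch.

    def confirm_overwrite(analysis_name):
        # Keep asking until the user answers yes or no, mirroring the _yes/_no
        # lists used in create_folder_structure above.
        _yes = {"yes", "y", "ye"}
        _no = {"no", "n"}
        while True:
            choice = input("Overwrite existing '{}' analysis? [y/n] ".format(analysis_name)).lower()
            if choice in _yes:
                return True
            if choice in _no:
                return False
            print("Please respond with 'yes' or 'no'")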
From 3ad79b49fc5b08827434b886a5007bfb91161be7 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 11:27:15 +0100 Subject: [PATCH 29/58] Remove unused function argument Sending stderr to /dev/null if folder overwritten --- utils/plasticc_pipeline.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 69528c7a..a796b4d9 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -44,14 +44,9 @@ def get_git_revision_short_hash(): return _hash.decode("utf-8").rstrip() -def get_timestamp(path_to_configuration_file): +def get_timestamp(): """ Helper function to obtain latest modified time of the configuration file - Parameters - ---------- - path_to_configuration_file : str - System path to where the configuration file is located - Returns ------- timestamp : str @@ -116,7 +111,7 @@ def create_folder_structure(analysis_directory, analysis_name): Are you sure you would like to proceed, this will overwrite the {} folder [Y/n] """.format(analysis_name) - raise OSError(errmsg) + print(errmsg) _yes = ["yes", "y", "ye"] _no = ["no", "n"] @@ -124,9 +119,9 @@ def create_folder_structure(analysis_directory, analysis_name): choice = input().lower() if choice in _yes: - print("I am sure") + print("Overwriting existing folder..") for key, value in dirs.items(): - subprocess.call(['mkdir', value]) + subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL) elif choice in _no: print("I am NOT sure") sys.exit() @@ -194,7 +189,7 @@ def save_configuration_file(method_directory): """ git_hash = {"git_hash": get_git_revision_short_hash()} - timestamp = {"timestamp": get_timestamp(path_to_configuration_file)} + timestamp = {"timestamp": get_timestamp()} params.update(git_hash) params.update(timestamp) From 9c8d870cbba2023555ebc18dfcbd549d69d96c8f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 12:50:33 +0100 Subject: [PATCH 30/58] Updating docstrings --- utils/plasticc_pipeline.py | 81 ++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index a796b4d9..7b2750ac 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -91,8 +91,6 @@ def create_folder_structure(analysis_directory, analysis_name): >>> print(directories.get("method_directory", None)) """ - # Prepend last modified time of configuration file and git SHA to analysis name - # analysis_name = get_timestamp() + "-" + get_git_revision_short_hash() + "-" + analysis_name method_directory = os.path.join(analysis_directory, analysis_name) features_directory = os.path.join(method_directory, 'wavelet_features') @@ -132,7 +130,6 @@ def create_folder_structure(analysis_directory, analysis_name): def load_configuration_file(path_to_configuration_file): - # TODO: Finish doctring examples """ Load from disk the configuration file that is to be used Parameters @@ -150,12 +147,12 @@ def load_configuration_file(path_to_configuration_file): Each item inside the configuration file can be accessed like so: >>> ... 
>>> params = load_configuration_file(path_to_configuration_file) - >>> data_path = params.get("data_path", None) - >>> print(data_path) - + >>> kernel_param = params.get("kernel_param", None) + >>> print(kernel_param) + [500.0, 20.0] >>> number_gp = params.get("number_gp", None) >>> print(number_gp) - + '1100' """ try: with open(path_to_configuration_file) as f: @@ -168,7 +165,6 @@ def load_configuration_file(path_to_configuration_file): def save_configuration_file(method_directory): - # TODO: Provide a doctring example """ Make a copy of the configuration file that has been used inside the analysis directory @@ -185,9 +181,19 @@ def save_configuration_file(method_directory): -------- >>> ... >>> save_configuration_file(method_directory) - >>> print() - + >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) + analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ + analysis_name: pipeline-test + data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle + git_hash: 916eaec + kernel_param: + - 500.0 + - 20.0 + number_gp: 1100 + number_of_principal_components: 10 + timestamp: 2019-05-21-1204 """ + git_hash = {"git_hash": get_git_revision_short_hash()} timestamp = {"timestamp": get_timestamp()} @@ -199,7 +205,6 @@ def save_configuration_file(method_directory): def load_training_data(data_path): - # TODO: Finish doctring examples """ Load from disk the training data one will use for this analysis Parameters @@ -218,8 +223,9 @@ def load_training_data(data_path): >>> ... >>> training_data = load_training_data(params) >>> print(training_data) - + """ + try: if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")): with open(data_path, 'rb') as input: @@ -299,20 +305,27 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): def wavelet_decomposition(training_data, number_gp, **kwargs): - """ Load from disk the training data one will use for this analysis + """ Wrapper function for `snmachine.snfeatures.WaveletFeatures`. This + performs a wavelet decomposition on training data evaluated at 'number_gp' + points on a light curve Parameters ---------- training_data : snmachine.PlasticcData Dictionary containing the parameters that reside in the configuration file. This will be used to obtain the path to the training data. - dirs : dict - Dictionary containing - subset_size : int - Number of objects the user would like to reduce the training data to - seed : int - Default set to 1234. This can be overridden by the user to check for - consistancy of results + number_gp : int + Number of points on the light curve to do wavelet analysis. Note, this + should be an even number for the wavelet decomposition to be able to be + performed. + number_processes : int + Number CPU cores avaiable to the user, this is how many cores the + decomposition will take place over + save_output : string + String defining what should be saved. See docs in + `snmachine.snfeatures.extract_wavelets` for more details on options. 
+ output_root : string + Path to where one would like the uncompressed wavelet files to be stored Returns ------- @@ -456,19 +469,15 @@ def make_predictions(location_of_test_data, classifier): arguments = vars(arguments) path_to_configuration_file = arguments['configuration'] - params = load_configuration_file(path_to_configuration_file) data_path = params.get("data_path", None) - print(data_path) analysis_directory = params.get("analysis_directory", None) analysis_name = params.get("analysis_name", None) # snmachine parameters number_gp = params.get("number_gp", None) - print(number_gp) kernel_param = params.get("kernel_param", None) - print(kernel_param) number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis @@ -498,7 +507,7 @@ def make_predictions(location_of_test_data, classifier): # Step 4. Load in training data training_data = load_training_data(data_path) - print(training_data) + print("training_data = {}".format(training_data)) # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, @@ -509,24 +518,26 @@ def make_predictions(location_of_test_data, classifier): # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, save_output='all', output_root=dirs.get("intermediate_files_directory")) - print(waveout) - print(type(waveout)) - print(waveout_err) - print(type(waveout_err)) - print(wavelet_object) - print(type(wavelet_object)) + print("waveout = {}".format(waveout)) + print("waveout, type = {}".format(type(waveout))) + + print("waveout_err = {}".format(waveout_err)) + print("waveout_err, type = {}".format(type(waveout_err))) + + print("wavelet_object = {}".format(wavelet_object)) + print("wavelet_object, type = {}".format(type(wavelet_object))) # Step 7. Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) - print(wavelet_features) - print(type(wavelet_features)) + print("wavelet_features = {}".format(wavelet_features)) + print("wavelet_features, type = {}".format(type(wavelet_features))) # Step 8. TODO Combine snmachine features with user defined features # Step 9. TODO Create a Random Forest classifier; need to fit model and save it. combined_features = wavelet_features # For running tests for now - classifer = create_classifier(combined_features, training_data) - print(classifer.best_params_) + classifier = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") # Step 10. TODO Use saved classifier to make predictions. This can occur using a seperate file From d2dd8435b2835c4267391daf69a3986f3617e043 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 13:46:33 +0100 Subject: [PATCH 31/58] Adding _to_pandas() helper functions This function should be able to convert to either numpy or astropy.Table to a pandas.DataFrame. 
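As a rough standalone illustration of that conversion (not the patch's _to_pandas helper itself), either input type can be turned into a DataFrame as below; the column names, object names and values are made up.

    import numpy as np
    import pandas as pd
    from astropy.table import Table

    def to_pandas_sketch(features, object_names=None):
        # numpy arrays: rows are objects, columns are features
        if isinstance(features, np.ndarray):
            return pd.DataFrame(features, index=object_names)
        # astropy Tables ship their own converter
        if isinstance(features, Table):
            return features.to_pandas()
        return pd.DataFrame(features)

    # Toy usage:
    table_features = Table({"C0": [0.1, 0.2], "C1": [1.3, 0.7]})
    print(type(to_pandas_sketch(table_features)))                      # pandas DataFrame
    print(type(to_pandas_sketch(np.eye(2), object_names=["a", "b"])))  # pandas DataFrame
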
Also updating docstrings --- utils/plasticc_pipeline.py | 66 +++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 7b2750ac..81025568 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -329,11 +329,12 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- - waveout: + waveout: numpy.ndarray - waveout_err: + waveout_err: numpy.ndarray + + wavelet_object: snmachine.snfeatures.WaveletFeatures object - wavelet_object: Examples -------- @@ -352,14 +353,13 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): def combine_all_features(reduced_wavelet_features, dataframe): - # TODO: Improve docstrings. Discuss whether the user should pass in a CSV - # instead? + # TODO: Improve docstrings. """ Combine snmachine wavelet features with PLASTICC features. The user should define a dataframe they would like to merge. Parameters ---------- - reduced_wavelet_features : numpy.ndarray + reduced_wavelet_features : astropy.table.table.Table These are the N principal components from the uncompressed wavelets dataframe : pandas.DataFrame Dataframe @@ -379,6 +379,7 @@ def combine_all_features(reduced_wavelet_features, dataframe): >>> print(shape.combined_features) """ + # def merge_features(some_features, other_features): # # TODO: Move this to a data processing file # if type(some_features) != pd.core.frame.DataFrame: @@ -394,6 +395,39 @@ def combine_all_features(reduced_wavelet_features, dataframe): return combined_features +def _to_pandas(features): + # TODO: Improve docstrings. + """ Helper function to take either an astropy Table + or numpy ndarray and convert to a pandas DataFrame representation + + Parameters + ---------- + features: astropy.table.table.Table OR numpy.ndarray + This parameter can be either an astropy Table or numpy ndarray + representation of the wavelet features + + Returns + ------- + features : pandas.DataFrame + + Examples + -------- + >>> ... + >>> print(type(features)) + + >>> features = _to_pandas(features) + >>> print(type(features)) + + """ + + if isinstance(features, np.ndarray): + features = pd.DataFrame(features, index=training_data.object_names) + else: + features = features.to_pandas() + + return features + + def create_classifier(combined_features, training_data, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -414,18 +448,18 @@ def create_classifier(combined_features, training_data, random_state=42): >>> ... >>> classifier, confusion_matrix = create_classifier(combined_features) >>> print(classifier) - - >>> plot_confusion_matrix(confusion_matrix) - + (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy', + max_depth=None, max_features='auto', max_leaf_nodes=None, + min_impurity_split=1e-07, min_samples_leaf=1, + min_samples_split=2, min_weight_fraction_leaf=0.0, + n_estimators=700, n_jobs=-1, oob_score=True, random_state=42, + verbose=0, warm_start=False), array([[ 1.]])) """ - # TODO: This is temporary while the pipeline is tested. 
+ print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) - if isinstance(combined_features, np.ndarray): - combined_features = pd.DataFrame(combined_features, index=training_data.object_names) - combined_features['target'] = training_data.labels.values - else: - combined_features = combined_features.to_pandas() - combined_features['target'] = training_data.labels.values + combined_features = _to_pandas(combined_features) + print("COMBINED_FEATURES_TYPE, after _to_pandas(): {}".format(type(combined_features))) + combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) y = combined_features['target'].values From 0e4fe568095078eab22344987fbc598cc7311b66 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 13:53:39 +0100 Subject: [PATCH 32/58] Adding roc/auc metrics to create_classifier() --- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/utils/config.yml b/utils/config.yml index 84456cb7..43695d63 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -analysis_name : "pipeline-test" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle" +analysis_name : "pipeline-extragal" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_extragal.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 81025568..5b1742fe 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -5,6 +5,7 @@ from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import roc_curve, auc from argparse import ArgumentParser import numpy as np import pandas as pd @@ -414,10 +415,10 @@ def _to_pandas(features): -------- >>> ... >>> print(type(features)) - + >>> features = _to_pandas(features) >>> print(type(features)) - + """ if isinstance(features, np.ndarray): @@ -480,7 +481,12 @@ def create_classifier(combined_features, training_data, random_state=42): y_preds = classifer.predict(X_test) confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_preds) + roc_auc = auc(false_positive_rate, true_positive_rate) + print(F"ROC {roc_auc}") + y_probs = classifer.predict_proba(X_test) + print(y_probs) return classifer, confusion_matrix From 83f91f4778c5c0bc06331071fa3f91721a65be35 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 14:26:23 +0100 Subject: [PATCH 33/58] Fixing error of now new folder being created Due to a conditional check if a directory exists or not, it became apparent that if there was not a directory, no new ones were being created. This change fixes that. 
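The intended control flow reads roughly as below. This is only a sketch of the behaviour described here: os.makedirs stands in for the pipeline's subprocess mkdir calls, the prompt string is illustrative, and any answer other than yes simply aborts (a simplification of the patch's prompt handling).

    import os
    import sys

    def ensure_directories(directories, analysis_name):
        method_directory = directories["method_directory"]
        if os.path.isdir(method_directory):
            # Existing analysis: ask before reusing the folders
            choice = input("Overwrite the existing '{}' analysis? [y/n] ".format(analysis_name)).lower()
            if choice not in ("y", "ye", "yes"):
                sys.exit("Not overwriting existing analysis")
        # Create (or re-create) the full folder tree
        for path in directories.values():
            os.makedirs(path, exist_ok=True)

    # Toy usage:
    # ensure_directories({"method_directory": "/tmp/demo",
    #                     "plots_directory": "/tmp/demo/plots"}, "demo")
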
--- utils/plasticc_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 5b1742fe..3192b253 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -126,6 +126,9 @@ def create_folder_structure(analysis_directory, analysis_name): sys.exit() else: sys.stdout.write("Please respond with 'yes' or 'no'") + else: + for key, value in dirs.items(): + subprocess.call(['mkdir', value]) return dirs From c5593d50ac49d47534a7d506ac06ecdb8858edac Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 21 May 2019 15:41:13 +0100 Subject: [PATCH 34/58] Updating gitignore Do not track log files in utils folder --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 6875f042..0797322a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ test/* !test/*.py +# Do not track log files in utils +utils/*stdout.txt + ## Python.gitignore from Github. ## # Byte-compiled / optimized / DLL files From 382a251ea10826e221c47ddb704ef8038b3ec05f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:27:35 +0100 Subject: [PATCH 35/58] Updating save_configuration_file function Removing ROC and AUC metrics --- utils/plasticc_pipeline.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3192b253..2fbf4d6d 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -5,7 +5,6 @@ from astropy.table import Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import roc_curve, auc from argparse import ArgumentParser import numpy as np import pandas as pd @@ -168,14 +167,16 @@ def load_configuration_file(path_to_configuration_file): return params -def save_configuration_file(method_directory): +def save_configuration_file(params, method_directory): """ Make a copy of the configuration file that has been used inside the analysis directory Parameters ---------- + params : dict + Dictionary containing the parameters used for this analysis method_directory : string - The folder path used for this analysis + Folder where this analysis is taking place Returns ------- @@ -184,7 +185,7 @@ def save_configuration_file(method_directory): Examples -------- >>> ... - >>> save_configuration_file(method_directory) + >>> save_configuration_file(params, method_directory) >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name: pipeline-test @@ -484,10 +485,6 @@ def create_classifier(combined_features, training_data, random_state=42): y_preds = classifer.predict(X_test) confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) - false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_preds) - roc_auc = auc(false_positive_rate, true_positive_rate) - print(F"ROC {roc_auc}") - y_probs = classifer.predict_proba(X_test) print(y_probs) @@ -526,7 +523,7 @@ def make_predictions(location_of_test_data, classifier): # Step 1. Creat folders that contain analysis dirs = create_folder_structure(analysis_directory, analysis_name) # Step 2. Save configuration file used for this analysis - save_configuration_file(dirs.get("method_directory")) + save_configuration_file(params, dirs.get("method_directory")) # Step 3. 
Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument From 8ad52aaf3b7dd84fe517bc9cb3ed487937c035db Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:42:30 +0100 Subject: [PATCH 36/58] Adding option to save wavelet features to disk --- snmachine/snfeatures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 6e903c21..db28c0d2 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2025,6 +2025,8 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) + # Write the astropy table containing the wavelet features to disk + wavs.write(os.path.join(output_root, 'reduced_wavelet_features'), format='fits',overwrite=True) return wavs, vals, vec, M, s From 98867b928bd29cd2994c54dd82e42b6fb4123a1a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 13:50:49 +0100 Subject: [PATCH 37/58] Adding option to restart from saved wavelets --- utils/plasticc_pipeline.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 2fbf4d6d..e8ef8326 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -529,12 +529,16 @@ def make_predictions(location_of_test_data, classifier): # with command line argument if (arguments['restart_from'].lower() == "wavelets"): # Restart from saved uncompressed wavelets. - wavelet_features = Table.read(dirs.get("features_dir") + "/wavelet_features.fits") - combined_features = combine_all_features(wavelet_features, data_path) - classifer = create_classifier(combined_features) + wavelet_features = Table.read(os.path.join(dirs.get("features_dir"), "reduced_wavelet_features.fits")) + combined_features = wavelet_features # For running tests for now + classifier, confusion_matrix = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") elif (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. 
pass + elif (arguments['restart_from'].lower() == "pca"): + # Restart from saved PCA components + pass else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 80971f205ac94d82c8450347dfd21c79c70fdb8a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 14:15:20 +0100 Subject: [PATCH 38/58] Moving restart option to its own function call --- utils/plasticc_pipeline.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index e8ef8326..e5562c71 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -496,6 +496,22 @@ def make_predictions(location_of_test_data, classifier): pass +def restart_from_saved_gps(dirs): + pass + + +def restart_from_saved_wavelets(dirs): + pass + + +def restart_from_saved_pca(dirs): + # TODO: Write docstrings + wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_features.fits")) + combined_features = wavelet_features # For running tests for now + classifier, confusion_matrix = create_classifier(combined_features, training_data) + print(F"classifier = {classifier}") + + if __name__ == "__main__": # Set the number of processes you want to use throughout the notebook @@ -527,18 +543,15 @@ def make_predictions(location_of_test_data, classifier): # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument - if (arguments['restart_from'].lower() == "wavelets"): - # Restart from saved uncompressed wavelets. - wavelet_features = Table.read(os.path.join(dirs.get("features_dir"), "reduced_wavelet_features.fits")) - combined_features = wavelet_features # For running tests for now - classifier, confusion_matrix = create_classifier(combined_features, training_data) - print(F"classifier = {classifier}") - elif (arguments['restart_from'].lower() == "gps"): + if (arguments['restart_from'].lower() == "gps"): # Restart from saved GPs. pass + elif (arguments['restart_from'].lower() == "wavelets"): + # Restart from saved uncompressed wavelets. 
+ pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - pass + restart_from_saved_pca(dirs) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 42486c81d0e28b4bb200d6c6cea3ac8c8e054477 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 16:12:52 +0100 Subject: [PATCH 39/58] Return wavelet_components as a pandas DataFrame Instead of an Astropy Table, return as a pandas DataFrame to processing later in the pipeline --- snmachine/snfeatures.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index db28c0d2..71d4dd94 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2012,11 +2012,11 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, print('finish projecting PCA') # Now reformat the components as a table - labels = ['C%d' %i for i in range(number_comp)] - wavs = Table(comps, names=labels) + labels = ['C%d' %i for i in range(ncomp)] + wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) - wavs = hstack((objnames, wavs)) + wavelet_components = hstack((objnames, wavelet_components)) print('Time for PCA', time.time() - t1) if save_output: @@ -2025,10 +2025,11 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) - # Write the astropy table containing the wavelet features to disk - wavs.write(os.path.join(output_root, 'reduced_wavelet_features'), format='fits',overwrite=True) + # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe + wavelet_components = wavelet_components.to_pandas() + wavelet_components.pickle(os.path.join(output_root, 'wavelet_components_{}.pickle'.format(ncomp))) - return wavs, vals, vec, M, s + return wavelet_components, vals, vec, M, s def iswt(self, coefficients, wavelet): """ From d50ec445d88af21f571ddc74df2c8acfcd0f764c Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 16:13:34 +0100 Subject: [PATCH 40/58] Rearrange imports to be PEP8 compliant --- utils/plasticc_pipeline.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index e5562c71..48945a0e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -1,20 +1,15 @@ """ Machine learning pipeline for the PLAsTiCC competition using snmachine codebase. """ -from plasticc_utils import plasticc_log_loss, plot_confusion_matrix -from astropy.table import Table -from sklearn.model_selection import train_test_split -from sklearn.ensemble import RandomForestClassifier -from argparse import ArgumentParser -import numpy as np -import pandas as pd +import multiprocessing import os -import sys import subprocess -import multiprocessing -import yaml +import sys import warnings -warnings.filterwarnings("ignore") + +import numpy as np +import pandas as pd +import yaml try: import cPickle as pickle except ModuleNotFoundError: @@ -24,6 +19,14 @@ except ImportError: print("Unable to import snmachine. 
Check environment set correctly") +from plasticc_utils import plasticc_log_loss, plot_confusion_matrix +from astropy.table import Table +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from argparse import ArgumentParser + +warnings.filterwarnings("ignore") + def get_git_revision_short_hash(): """ Helper function to obtain current version control hash value @@ -461,9 +464,6 @@ def create_classifier(combined_features, training_data, random_state=42): verbose=0, warm_start=False), array([[ 1.]])) """ - print("COMBINED_FEATURES_TYPE: {}".format(type(combined_features))) - combined_features = _to_pandas(combined_features) - print("COMBINED_FEATURES_TYPE, after _to_pandas(): {}".format(type(combined_features))) combined_features['target'] = training_data.labels.values X = combined_features.drop('target', axis=1) From ae8d50e547b5ac9c0efe53b0851bc2f811fd4d67 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:34:34 +0100 Subject: [PATCH 41/58] Updating variable name --- snmachine/snfeatures.py | 10 +++++----- utils/config.yml | 4 ++-- utils/plasticc_pipeline.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 71d4dd94..0965dfc3 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2013,10 +2013,10 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, # Now reformat the components as a table labels = ['C%d' %i for i in range(ncomp)] - wavelet_components = Table(comps, names=labels) + reduced_wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) - wavelet_components = hstack((objnames, wavelet_components)) + reduced_wavelet_components = hstack((objnames, reduced_wavelet_components)) print('Time for PCA', time.time() - t1) if save_output: @@ -2026,10 +2026,10 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe - wavelet_components = wavelet_components.to_pandas() - wavelet_components.pickle(os.path.join(output_root, 'wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components = reduced_wavelet_components.to_pandas() + reduced_wavelet_components.pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) - return wavelet_components, vals, vec, M, s + return reduced_wavelet_components, vals, vec, M, s def iswt(self, coefficients, wavelet): """ diff --git a/utils/config.yml b/utils/config.yml index 43695d63..80951185 100644 --- a/utils/config.yml +++ b/utils/config.yml @@ -1,7 +1,7 @@ # Global settings analysis_directory : "/share/hypatia/snmachine_resources/data/plasticc/analysis/" -analysis_name : "pipeline-extragal" -data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_extragal.pickle" +analysis_name : "pipeline-sniabcii" +data_path : "/share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_sniabcii.pickle" # snmachine parameters number_gp : 1100 kernel_param : [500., 20.] 
diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 48945a0e..bc3de1e4 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -504,9 +504,9 @@ def restart_from_saved_wavelets(dirs): pass -def restart_from_saved_pca(dirs): +def restart_from_saved_pca(dirs, number_of_principal_components): # TODO: Write docstrings - wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_features.fits")) + wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") @@ -551,7 +551,7 @@ def restart_from_saved_pca(dirs): pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - restart_from_saved_pca(dirs) + restart_from_saved_pca(dirs, number_of_principal_components) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: From 89e3bf5bf3ba6b016ef2d2a015618b92d82f7034 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:40:30 +0100 Subject: [PATCH 42/58] Chaning file that logs parameters to be appending --- utils/plasticc_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index bc3de1e4..01310cd5 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -189,7 +189,7 @@ def save_configuration_file(params, method_directory): -------- >>> ... >>> save_configuration_file(params, method_directory) - >>> subprocess.call(['cat', os.path.join(method_directory, "config.yml")]) + >>> subprocess.call(['cat', os.path.join(method_directory, "logs.yml")]) analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/ analysis_name: pipeline-test data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle @@ -208,7 +208,7 @@ def save_configuration_file(params, method_directory): params.update(git_hash) params.update(timestamp) - with open(os.path.join(method_directory, "config.yml"), 'w') as config: + with open(os.path.join(method_directory, "logs.yml"), 'a') as config: yaml.dump(params, config, default_flow_style=False) From 91d84a65deb7ebd66c8b31331a3c277040e0b249 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 17:42:56 +0100 Subject: [PATCH 43/58] This will open file for reading/writing (updating) --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 01310cd5..4485bd83 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -208,7 +208,7 @@ def save_configuration_file(params, method_directory): params.update(git_hash) params.update(timestamp) - with open(os.path.join(method_directory, "logs.yml"), 'a') as config: + with open(os.path.join(method_directory, "logs.yml"), 'a+') as config: yaml.dump(params, config, default_flow_style=False) From 7e281e25dd5862551a35e81ae44559ce99fa553a Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 22 May 2019 18:19:18 +0100 Subject: [PATCH 44/58] Fixing typo in saving and reading pickled df --- snmachine/snfeatures.py | 2 +- utils/plasticc_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 0965dfc3..6970ad4f 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2027,7 +2027,7 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe reduced_wavelet_components = reduced_wavelet_components.to_pandas() - reduced_wavelet_components.pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) return reduced_wavelet_components, vals, vec, M, s diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 4485bd83..f54a85cf 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -506,7 +506,7 @@ def restart_from_saved_wavelets(dirs): def restart_from_saved_pca(dirs, number_of_principal_components): # TODO: Write docstrings - wavelet_features = Table.read(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) + wavelet_features = pd.read_pickle(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") From e32f65e537d875049305e6f1fa2979849a808489 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:37:17 +0100 Subject: [PATCH 45/58] Including 'imbalanced-learn' package as dependency Required updating sklearn version --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 997fc7e1..aeb4e79e 100644 --- a/environment.yml +++ b/environment.yml @@ -11,12 +11,13 @@ dependencies: - jupyter>=1.0.0 - matplotlib>=1.5.1 - numpy=1.12.0 - - scikit-learn=0.18.1 + - scikit-learn>=0.20 - scipy>=0.17.0 - george>=0.3.0 - iminuit>=1.2 - pandas>=0.23.0 - extinction>=0.3.0 + - imbalanced-learn>=0.4.3 - pip: - emcee>=2.1.0 From 47f6125c9cbbeaf588847becd5a5992f9cf1bed2 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:38:28 +0100 Subject: [PATCH 46/58] Return figure aswell as confusion matrix from func --- utils/plasticc_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py index e56d1852..3659cd4b 100644 --- a/utils/plasticc_utils.py +++ b/utils/plasticc_utils.py @@ -30,7 +30,7 @@ def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False): ax.set_aspect('equal') plt.title(title) - return cm + return cm, fig def plasticc_log_loss(y_true, y_pred, relative_class_weights=None): From 3242bf36033c50d118b57debe135308ac5f8157b Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Wed, 29 May 2019 15:39:06 +0100 Subject: [PATCH 47/58] Adding functionality to rebalance classes Also functionality save classifier and confusion matrix plot --- utils/plasticc_pipeline.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index f54a85cf..f8ede560 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -23,6 +23,9 @@ from astropy.table import 
Table from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier +from imblearn.metrics import classification_report_imbalanced +from imblearn.pipeline import make_pipeline +from imblearn.over_sampling import SMOTE from argparse import ArgumentParser warnings.filterwarnings("ignore") @@ -436,7 +439,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, random_state=42): +def create_classifier(combined_features, training_data, dirs, augmentation_method=None, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -469,24 +472,30 @@ def create_classifier(combined_features, training_data, random_state=42): X = combined_features.drop('target', axis=1) y = combined_features['target'].values - print("X SHAPE = {}\n".format(X.shape)) - print("y SHAPE = {}\n".format(y.shape)) - target_names = combined_features['target'].unique() - print("X = \n{}".format(X)) - print("y = \n{}".format(y)) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) + + if augmentation_method in ['SMOTE']: + classifer = make_pipeline(augmentation_method(sampling_strategy='not majority'), classifer) + else: + print("No augmentation selected, proceeding without resampling of classes") + classifer.fit(X_train, y_train) + # Classify and report the results + print(classification_report_imbalanced(y_test, classifer.predict(X_test))) + y_preds = classifer.predict(X_test) - confusion_matrix = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) + + timestamp = get_timestamp() + with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + pickle.dump(classifer, clf) - y_probs = classifer.predict_proba(X_test) - print(y_probs) + figure.savefig(os.join.path(dirs.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix From cecb4ac1ad29d713fc007f5985145b7be759c66d Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Wed, 29 May 2019 18:49:37 +0100 Subject: [PATCH 48/58] Fix a path bug --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index f8ede560..fab3c99f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -495,7 +495,7 @@ def create_classifier(combined_features, training_data, dirs, augmentation_metho with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.join.path(dirs.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(dirs.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix From 080434b4e7c0443abafff8d15bad4a7dc0f1857a Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Wed, 29 May 2019 19:04:21 +0100 Subject: [PATCH 49/58] Fix a method call --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index fab3c99f..3c5c5224 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -479,7 +479,7 @@ def 
create_classifier(combined_features, training_data, dirs, augmentation_metho classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state) if augmentation_method in ['SMOTE']: - classifer = make_pipeline(augmentation_method(sampling_strategy='not majority'), classifer) + classifer = make_pipeline(eval(augmentation_method)(sampling_strategy='not majority'), classifer) else: print("No augmentation selected, proceeding without resampling of classes") From 2c86dc79f73ccd506b5a7a7a40ef3129be4bc672 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:08:21 +0100 Subject: [PATCH 50/58] Updating variable name, ncomp --> number_comp --- snmachine/snfeatures.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 6970ad4f..04df6668 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -2012,7 +2012,7 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, print('finish projecting PCA') # Now reformat the components as a table - labels = ['C%d' %i for i in range(ncomp)] + labels = ['C%d' %i for i in range(number_comp)] reduced_wavelet_components = Table(comps, names=labels) objnames = Table(object_names.reshape(len(object_names), 1), names=['Object']) @@ -2021,13 +2021,13 @@ def extract_pca(self, object_names, wavout, recompute_pca=True, if save_output: # We need to change the output to make it consistent with new code - np.save(os.path.join(output_root,'eigenvalues_{}.npy'.format(ncomp)),vals) - np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(ncomp)),vec) - np.save(os.path.join(output_root,'comps_{}.npy'.format(ncomp)),comps) - np.save(os.path.join(output_root,'means_{}.npy'.format(ncomp)),M) + np.save(os.path.join(output_root,'eigenvalues_{}.npy'.format(number_comp)),vals) + np.save(os.path.join(output_root,'eigenvectors_{}.npy'.format(number_comp)),vec) + np.save(os.path.join(output_root,'comps_{}.npy'.format(number_comp)),comps) + np.save(os.path.join(output_root,'means_{}.npy'.format(number_comp)),M) # Write the astropy table containing the wavelet features to disk after converting to pandas dataframe reduced_wavelet_components = reduced_wavelet_components.to_pandas() - reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(ncomp))) + reduced_wavelet_components.to_pickle(os.path.join(output_root, 'reduced_wavelet_components_{}.pickle'.format(number_comp))) return reduced_wavelet_components, vals, vec, M, s From 0eb5572dc1396a0c949fe7761ecf394adee81636 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:09:20 +0100 Subject: [PATCH 51/58] [FIXUP] Updating variable name, ncomp --- utils/plasticc_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3c5c5224..2c93f90b 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -594,7 +594,7 @@ def restart_from_saved_pca(dirs, number_of_principal_components): print("wavelet_object, type = {}".format(type(wavelet_object))) # Step 7. 
Reduce dimensionality of wavelets by using only N principal components - wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', ncomp=number_of_principal_components, + wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components, tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) print("wavelet_features = {}".format(wavelet_features)) print("wavelet_features, type = {}".format(type(wavelet_features))) From d07bbdc5c7e561e3c3ef31662e837d7925135f16 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 00:26:17 +0100 Subject: [PATCH 52/58] Adding 'get_directories()' function Fixes #149 --- utils/plasticc_pipeline.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 2c93f90b..3234ff5e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -138,6 +138,38 @@ def create_folder_structure(analysis_directory, analysis_name): return dirs +def get_directories(analyses_directory, analysis_name): + """Returns the folder directories inside of a given analysis. + + # TODO [Add a link to the place where we have an explanation of the folder structure] + + Parameters + ---------- + analyses_directory : str + System path to where the user stores all analysis. + analysis_name : str + Name of the analysis we want. + + Returns + ------- + directories : dict + Dictionary containing the mapping of folders inside of `analysis_name`. + """ + analysis_directory = os.path.join(analyses_directory, analysis_name) + features_directory = os.path.join(analysis_directory, 'wavelet_features') + classifications_directory = os.path.join(analysis_directory, 'classifications') + intermediate_files_directory = os.path.join(analysis_directory, 'intermediate_files') + plots_directory = os.path.join(analysis_directory, 'plots') + + directories = {"analysis_directory": analysis_directory, + "features_directory": features_directory, + "classifications_directory": classifications_directory, + "intermediate_files_directory": intermediate_files_directory, + "plots_directory": plots_directory} + + return directories + + def load_configuration_file(path_to_configuration_file): """ Load from disk the configuration file that is to be used From 854ebc25d7118ce2a37e6a1bff31b4952f161ec6 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 15:39:02 +0100 Subject: [PATCH 53/58] [FIXUP] Adding debug print statement --- snmachine/snfeatures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py index 04df6668..b7e6f930 100644 --- a/snmachine/snfeatures.py +++ b/snmachine/snfeatures.py @@ -721,6 +721,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r self.model=sncosmo.Model(self.templates[mod_name],effects=[dust],effect_names=['host'], effect_frames=['rest']) else: self.model=sncosmo.Model(self.templates[mod_name]) + print(F'MODEL-NAME: {mod_name}') params=['['+mod_name+']'+pname for pname in self.model.param_names] # err_plus=[pname+'_err+' for pname in params] # err_minus=[pname+'_err-' for pname in params] From d32ca95fd21dc3c2e06e31cc06f36d92914baf2f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 
16:23:53 +0100 Subject: [PATCH 54/58] Updating docstrings --- utils/plasticc_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 3234ff5e..9cb249c7 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -373,12 +373,12 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): Returns ------- waveout: numpy.ndarray - + Numpy array of the wavelet coefficients where each row is an object and + each column a different coefficient waveout_err: numpy.ndarray - + Numpy array storing the (assuming Gaussian) error on each coefficient. wavelet_object: snmachine.snfeatures.WaveletFeatures object - Examples -------- >>> ... From 89caf3c9f3912689266ae712a83f397660f43afd Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Mon, 3 Jun 2019 16:26:10 +0100 Subject: [PATCH 55/58] Updating variable name, dirs --> directories --- utils/plasticc_pipeline.py | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 9cb249c7..8811d08e 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -84,7 +84,7 @@ def create_folder_structure(analysis_directory, analysis_name): Returns ------- - dirs: dict + directories: dict Dictionary containing the mapping of folders that have been created. Examples @@ -104,7 +104,7 @@ def create_folder_structure(analysis_directory, analysis_name): intermediate_files_directory = os.path.join(method_directory, 'intermediate_files') plots_directory = os.path.join(method_directory, 'plots') - dirs = {"method_directory": method_directory, "features_directory": features_directory, + directories = {"method_directory": method_directory, "features_directory": features_directory, "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory, "plots_directory": plots_directory} @@ -124,7 +124,7 @@ def create_folder_structure(analysis_directory, analysis_name): if choice in _yes: print("Overwriting existing folder..") - for key, value in dirs.items(): + for key, value in directories.items(): subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL) elif choice in _no: print("I am NOT sure") @@ -132,10 +132,10 @@ def create_folder_structure(analysis_directory, analysis_name): else: sys.stdout.write("Please respond with 'yes' or 'no'") else: - for key, value in dirs.items(): + for key, value in directories.items(): subprocess.call(['mkdir', value]) - return dirs + return directories def get_directories(analyses_directory, analysis_name): @@ -296,7 +296,7 @@ def load_training_data(data_path): return training_data -def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): +def reduce_size_of_training_data(training_data, directories, subset_size, seed=1234): # TODO: Incorpate further doctrings and finish examples. Tarek: Catarina and I need to # discuss this further. There is some overlap between this and # sndata.PlasticcData.update_data() and it would be good to comebine this. @@ -307,7 +307,7 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): training_data : snmachine.PlasticcData Dictionary containing the parameters that reside in the configuration file. This will be used to obtain the path to the training data. 
- dirs : dict + directories : dict Dictionary containing subset_size : int Number of objects the user would like to reduce the training data to @@ -324,12 +324,13 @@ def reduce_size_of_training_data(training_data, dirs, subset_size, seed=1234): >>> ... >>> print(shape.training_data) - >>> new_training_data = reduce_size_of_training_data(training_data, dirs, 1000)) + >>> new_training_data = reduce_size_of_training_data(training_data, + directories, 1000)) >>> print(shape.new_training_data) """ - method_directory = dirs.get("method_directory", None) + method_directory = directories.get("method_directory", None) subset_file = os.path.join(method_directory, "subset.list") if os.path.exists(subset_file): rand_objs = np.genfromtxt(subset_file, dtype='U') @@ -383,7 +384,8 @@ def wavelet_decomposition(training_data, number_gp, **kwargs): -------- >>> ... >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, - save_output='all', output_root=dirs.get("intermediate_files_directory")) + save_output='all', + output_root=directories.get("intermediate_files_directory")) >>> print() """ @@ -471,7 +473,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, dirs, augmentation_method=None, random_state=42): +def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -524,10 +526,10 @@ def create_classifier(combined_features, training_data, dirs, augmentation_metho confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) timestamp = get_timestamp() - with open(os.path.join(dirs.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + with open(os.path.join(directories.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.path.join(dirs.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(directories.get("plots_directory"), F'plot_{timestamp}.png')) return classifer, confusion_matrix @@ -537,17 +539,17 @@ def make_predictions(location_of_test_data, classifier): pass -def restart_from_saved_gps(dirs): +def restart_from_saved_gps(directories): pass -def restart_from_saved_wavelets(dirs): +def restart_from_saved_wavelets(directories): pass -def restart_from_saved_pca(dirs, number_of_principal_components): +def restart_from_saved_pca(directories, number_of_principal_components): # TODO: Write docstrings - wavelet_features = pd.read_pickle(os.path.join(dirs.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) + wavelet_features = pd.read_pickle(os.path.join(directories.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))) combined_features = wavelet_features # For running tests for now classifier, confusion_matrix = create_classifier(combined_features, training_data) print(F"classifier = {classifier}") @@ -578,9 +580,9 @@ def restart_from_saved_pca(dirs, number_of_principal_components): number_of_principal_components = params.get("number_of_principal_components", None) # Step 1. Creat folders that contain analysis - dirs = create_folder_structure(analysis_directory, analysis_name) + directories = create_folder_structure(analysis_directory, analysis_name) # Step 2. 
Save configuration file used for this analysis - save_configuration_file(params, dirs.get("method_directory")) + save_configuration_file(params, directories.get("method_directory")) # Step 3. Check at which point the user would like to run the analysis from. # If elements already saved, these will be used but this can be overriden # with command line argument @@ -592,12 +594,12 @@ def restart_from_saved_pca(dirs, number_of_principal_components): pass elif (arguments['restart_from'].lower() == "pca"): # Restart from saved PCA components - restart_from_saved_pca(dirs, number_of_principal_components) + restart_from_saved_pca(directories, number_of_principal_components) else: # Run full pipeline but still do checks to see if elements from GPs or # wavelets already exist on disk; the first check should be for: # a. Saved PCA files - # path_saved_reduced_wavelets = dirs.get("intermediate_files_directory") + # path_saved_reduced_wavelets = directories.get("intermediate_files_directory") # eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy')) # means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy')) # b. Saved uncompressed wavelets @@ -610,12 +612,13 @@ def restart_from_saved_pca(dirs, number_of_principal_components): # Step 5. Compute GPs gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100, kernel_param=kernel_param, - output_root=dirs['intermediate_files_directory'], + output_root=directories['intermediate_files_directory'], number_processes=number_processes) # Step 6. Extract wavelet coeffiencts waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes, - save_output='all', output_root=dirs.get("intermediate_files_directory")) + save_output='all', + output_root=directories.get("intermediate_files_directory")) print("waveout = {}".format(waveout)) print("waveout, type = {}".format(type(waveout))) @@ -627,7 +630,10 @@ def restart_from_saved_pca(dirs, number_of_principal_components): # Step 7. 
Reduce dimensionality of wavelets by using only N principal components wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components, - tol=None, pca_path=None, save_output=True, output_root=dirs.get("features_directory")) + tol=None, + pca_path=None, + save_output=True, + output_root=directories.get("features_directory")) print("wavelet_features = {}".format(wavelet_features)) print("wavelet_features, type = {}".format(type(wavelet_features))) From 33cffea91614126d2574d4080e29e61e48488925 Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 4 Jun 2019 12:52:50 +0100 Subject: [PATCH 56/58] Fixing version of sncosmo for debug checks The recent HTTP 404 error discovered in the CI suggests that a recent change to sncosmo might be the reason for failing to find salt2 models Latest version = 1.8.0, which is where the error occurs, bumping down to 1.7.1 (previous release) to test outcome --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index aeb4e79e..23422a70 100644 --- a/environment.yml +++ b/environment.yml @@ -23,6 +23,6 @@ dependencies: - emcee>=2.1.0 - numpydoc>=0.6.0 - pywavelets>=0.4.0 - - sncosmo>=1.3.0 + - sncosmo==1.7.1 - nose>=1.3.7 - future>=0.16 From 13fb8b638ae83039646ac5e36d10139318fa9116 Mon Sep 17 00:00:00 2001 From: Catarina Alves Date: Tue, 4 Jun 2019 12:57:53 +0100 Subject: [PATCH 57/58] Save the balancing method and the number of PCA components used for the classifier and confusion matrix --- utils/plasticc_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/plasticc_pipeline.py b/utils/plasticc_pipeline.py index 8811d08e..796af43f 100755 --- a/utils/plasticc_pipeline.py +++ b/utils/plasticc_pipeline.py @@ -473,7 +473,7 @@ def _to_pandas(features): return features -def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42): +def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42, number_comps=''): # TODO: Improve docstrings. """ Creation of an optimised Random Forest classifier. @@ -526,10 +526,10 @@ def create_classifier(combined_features, training_data, directories, augmentatio confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True) timestamp = get_timestamp() - with open(os.path.join(directories.get("classifications_directory"), F'classifer_{timestamp}.pkl'), 'wb') as clf: + with open(os.path.join(directories.get("classifications_directory"), F'classifer_{number_comps}_{augmentation_method}.pkl'), 'wb') as clf: pickle.dump(classifer, clf) - figure.savefig(os.path.join(directories.get("plots_directory"), F'plot_{timestamp}.png')) + figure.savefig(os.path.join(directories.get("plots_directory"), F'confusion_matrix_{number_comps}_{augmentation_method}.pdf')) return classifer, confusion_matrix From c2513377420ad679af3bb0bda3eb4401a58a495f Mon Sep 17 00:00:00 2001 From: Tarek Allam Date: Tue, 4 Jun 2019 15:11:42 +0100 Subject: [PATCH 58/58] Bump version 1.3.2 --> 1.4.0 With the inclusion of this feature set, although not fully complete, a MINOR bump is felt necessary. 
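For reference, the class-rebalancing option introduced in patches 45, 47 and 49 amounts to chaining SMOTE oversampling in front of the random forest through an imbalanced-learn pipeline and scoring with classification_report_imbalanced. The sketch below runs on synthetic data with toy parameters; it is not the pipeline's create_classifier().

    from imblearn.metrics import classification_report_imbalanced
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import make_pipeline
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # Synthetic, deliberately imbalanced three-class problem
    X, y = make_classification(n_samples=300, n_classes=3, n_informative=4,
                               weights=[0.7, 0.2, 0.1], random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Oversample every class except the majority before fitting the forest
    classifier = make_pipeline(
        SMOTE(sampling_strategy="not majority", random_state=42),
        RandomForestClassifier(n_estimators=700, criterion="entropy",
                               oob_score=True, n_jobs=-1, random_state=42))
    classifier.fit(X_train, y_train)
    print(classification_report_imbalanced(y_test, classifier.predict(X_test)))
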
--- snmachine/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snmachine/version.py b/snmachine/version.py index 922bcbf7..b2c6a8de 100644 --- a/snmachine/version.py +++ b/snmachine/version.py @@ -1 +1 @@ -__VERSION__ = "1.3.2" +__VERSION__ = "1.4.0"
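
For reference, the restart path built up in patches 37 to 44 reduces to persisting the reduced wavelet components as a pickled pandas DataFrame and reading them back on a later run instead of recomputing the GPs, wavelets and PCA. A minimal sketch with made-up objects and values follows; only the 'C%d' column labels and the reduced_wavelet_components_<N>.pickle naming convention follow the pipeline, everything else is illustrative.

    import os

    import numpy as np
    import pandas as pd

    number_of_principal_components = 10
    features_directory = "/tmp/wavelet_features"   # stand-in for directories["features_directory"]
    os.makedirs(features_directory, exist_ok=True)

    # One row per object, one column per principal component (C0, C1, ...)
    labels = ["C%d" % i for i in range(number_of_principal_components)]
    reduced = pd.DataFrame(
        np.random.RandomState(0).normal(size=(3, number_of_principal_components)),
        columns=labels, index=["obj_1", "obj_2", "obj_3"])
    pickle_path = os.path.join(
        features_directory,
        "reduced_wavelet_components_{}.pickle".format(number_of_principal_components))
    reduced.to_pickle(pickle_path)

    # A later run restarting from the saved PCA components would begin here
    wavelet_features = pd.read_pickle(pickle_path)
    print(wavelet_features.shape)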