diff --git a/.gitignore b/.gitignore
index 6875f042..0797322a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 test/*
 !test/*.py
 
+# Do not track log files in utils
+utils/*stdout.txt
+
 ## Python.gitignore from Github. ##
 
 # Byte-compiled / optimized / DLL files
diff --git a/environment.yml b/environment.yml
index 997fc7e1..23422a70 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,17 +11,18 @@ dependencies:
   - jupyter>=1.0.0
   - matplotlib>=1.5.1
   - numpy=1.12.0
-  - scikit-learn=0.18.1
+  - scikit-learn>=0.20
   - scipy>=0.17.0
   - george>=0.3.0
   - iminuit>=1.2
   - pandas>=0.23.0
   - extinction>=0.3.0
+  - imbalanced-learn>=0.4.3
   - pip:
     - emcee>=2.1.0
     - numpydoc>=0.6.0
     - pywavelets>=0.4.0
-    - sncosmo>=1.3.0
+    - sncosmo==1.7.1
     - nose>=1.3.7
     - future>=0.16
diff --git a/snmachine/gps.py b/snmachine/gps.py
index 5b965756..3dec7868 100644
--- a/snmachine/gps.py
+++ b/snmachine/gps.py
@@ -56,7 +56,7 @@ def compute_gps(dataset, number_gp, t_min, t_max, kernel_param=[500., 20.], outp
     output_root : {None, str}, optional
         If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there.
     number_processes : int, optional
-        Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1.
+        Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1.
     gp_algo : str, optional
         which gp package is used for the Gaussian Process Regression, GaPP or george
     """
@@ -148,7 +148,7 @@ def _compute_gps_parallel(dataset, number_gp, t_min, t_max, kernel_param, output
     output_root : {None, str}, optional
         If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there.
     number_processes : int, optional
-        Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1.
+        Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1.
     gp_algo : str, optional
         which gp package is used for the Gaussian Process Regression, GaPP or george
     """
@@ -413,4 +413,4 @@ def get_kernel(kernel_name, kernel_param):
     elif kernel_name == 'ExpSquared+ExpSine2':
         kExpSine2 = kernel_param[4]*george.kernels.ExpSine2Kernel(gamma=kernel_param[5],log_period=kernel_param[6])
         kernel = kExpSquared + kExpSine2
-    return kernel
\ No newline at end of file
+    return kernel
diff --git a/snmachine/snaugment.py b/snmachine/snaugment.py
index 4e582cd8..564a6f96 100644
--- a/snmachine/snaugment.py
+++ b/snmachine/snaugment.py
@@ -123,7 +123,7 @@ def extract_proxy_features(self,peak_filter='desr',nproc=1,fit_salt2=False,salt2
         #tf=snfeatures.TemplateFeatures(sampler='leastsq')
         tf=snfeatures.TemplateFeatures(sampler=sampler)
         if salt2feats is None:
-            salt2feats=tf.extract_features(self.dataset,nprocesses=nproc,use_redshift=fix_redshift)
+            salt2feats=tf.extract_features(self.dataset,number_processes=nproc,use_redshift=fix_redshift)
 
         #fit models and extract r-peakmags
         peaklogflux=[]
diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py
old mode 100755
new mode 100644
index 0e3a4cd2..185356a1
--- a/snmachine/snclassifier.py
+++ b/snmachine/snclassifier.py
@@ -608,7 +608,7 @@ def __call_classifier(classifier, X_train, y_train, X_test, param_dict, return_c
 
 def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', 'knn', 'svm', 'neural_network', 'boost_dt'],
-                 training_set=0.3, param_dict={}, nprocesses=1, scale=True,
+                 training_set=0.3, param_dict={}, number_processes=1, scale=True,
                  plot_roc_curve=True, return_classifier=False, classifiers_for_cm_plots=[], type_dict=None, seed=1234):
     """
@@ -632,7 +632,7 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb',
         the ID's of the objects to be used
     param_dict : dict, optional
         Use to run different ranges of hyperparameters for the classifiers when optimising
-    nprocesses : int, optional
+    number_processes : int, optional
        Number of processors for multiprocessing (shared memory only). Each classifier will then be run in parallel.
     scale : bool, optional
        Rescale features using sklearn's preprocessing Scalar class (highly recommended this is True)
@@ -707,15 +707,15 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb',
     probabilities = {}
     classifier_objects = {}
 
-    if nprocesses > 1 and return_classifier:
+    if number_processes > 1 and return_classifier:
         print("Due to limitations with python's multiprocessing module, classifier objects cannot be returned if " \
               "multiple processors are used. Continuing serially...")
         print()
 
-    if nprocesses > 1 and not return_classifier:
+    if number_processes > 1 and not return_classifier:
         partial_func=partial(__call_classifier, X_train=X_train, y_train=y_train, X_test=X_test, param_dict=param_dict,
                              return_classifier=False)
-        p = Pool(nprocesses, maxtasksperchild=1)
+        p = Pool(number_processes, maxtasksperchild=1)
         result = p.map(partial_func, classifiers)
 
         for i in range(len(result)):
diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py
index d1b6ee39..b7e6f930 100644
--- a/snmachine/snfeatures.py
+++ b/snmachine/snfeatures.py
@@ -686,7 +686,7 @@ def __init__(self, model=['Ia'], sampler='leastsq',lsst_bands=False,lsst_dir='..
                          'nugent-sn2l':{'z':(0.01, 1.5)},
                          'nugent-sn1bc':{'z':(0.01, 1.5)}}
 
-    def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, nprocesses=1, restart=False, seed=-1):
+    def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, number_processes=1, restart=False, seed=-1):
         """
         Extract template features for a dataset.
@@ -700,7 +700,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             Where to save the chains
         use_redshift : bool
             Whether or not to use provided redshift when fitting objects
-        nprocesses : int, optional
+        number_processes : int, optional
             Number of processors to use for parallelisation (shared memory only)
         restart : bool
             Whether or not to restart from multinest chains
@@ -721,6 +721,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
                 self.model=sncosmo.Model(self.templates[mod_name],effects=[dust],effect_names=['host'], effect_frames=['rest'])
             else:
                 self.model=sncosmo.Model(self.templates[mod_name])
+            print(F'MODEL-NAME: {mod_name}')
             params=['['+mod_name+']'+pname for pname in self.model.param_names]
             # err_plus=[pname+'_err+' for pname in params]
             # err_minus=[pname+'_err-' for pname in params]
@@ -736,7 +737,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             output = Table(names=labels, dtype=['U32'] + ['f'] * (len(labels) - 1))
             k=0
-            if nprocesses<2:
+            if number_processes<2:
                 for obj in d.object_names:
                     if k%100==0:
                         print (k, 'objects fitted')
@@ -779,7 +780,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             else:
                 if self.sampler=='leastsq':
-                    p=Pool(nprocesses, maxtasksperchild=1)
+                    p=Pool(number_processes, maxtasksperchild=1)
                     partial_func=partial(_run_leastsq_templates, d=d, model_name=self.templates[mod_name], use_redshift=use_redshift, bounds=self.bounds[self.templates[mod_name]])
                     out=p.map(partial_func, d.object_names)
                     output=out[0]
@@ -790,7 +791,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
                     else:
                         all_output=vstack((all_output, output))
                 elif self.sampler=='nested':
-                    p=Pool(nprocesses, maxtasksperchild=1)
+                    p=Pool(number_processes, maxtasksperchild=1)
                     partial_func=partial(_run_multinest_templates, d=d, model_name=self.templates[mod_name], bounds=self.bounds[self.templates[mod_name]], chain_directory=chain_directory,
                                          nlp=1000, convert_to_binary=True, use_redshift=use_redshift, short_name=self.short_names[mod_name], restart=restart, seed=seed)
                     out=p.map(partial_func, d.object_names)
@@ -914,7 +915,7 @@ def __init__(self, model_choice, sampler='leastsq', limits=None):
 
-    def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, nprocesses=1, n_walkers=100,
+    def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, number_processes=1, n_walkers=100,
                          n_steps=500, walker_spread=0.1, burn=50, nlp=1000, starting_point=None, convert_to_binary=True, n_iter=0, restart=False, seed=-1):
         """
         Fit parametric models and return best-fitting parameters as features.
@@ -930,7 +931,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
         n_attempts : int
             Allow the minimiser to start in new random locations if the fit is bad. Put n_attempts=1 to fit only once with the default starting position.
-        nprocesses : int, optional
+        number_processes : int, optional
             Number of processors to use for parallelisation (shared memory only)
         n_walkers : int
             emcee parameter - number of walkers to use
@@ -963,7 +964,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
         output=[]
 
         #obj=d.object_names[0]
-        if nprocesses<2:
+        if number_processes<2:
             k=0
             for obj in d.object_names:
                 if k%100==0:
@@ -984,14 +985,14 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
                 k+=1
         else:
             if self.sampler=='leastsq':
-                p=Pool(nprocesses, maxtasksperchild=1)
+                p=Pool(number_processes, maxtasksperchild=1)
                 partial_func=partial(_run_leastsq, d=d, model=self.model, n_attempts=n_attempts, seed=seed)
                 out=p.map(partial_func, d.object_names)
                 output=out[0]
                 for i in range(1, len(out)):
                     output=vstack((output, out[i]))
             elif self.sampler=='nested':
-                p=Pool(nprocesses, maxtasksperchild=1)
+                p=Pool(number_processes, maxtasksperchild=1)
                 partial_func=partial(_run_multinest, d=d, model=self.model,chain_directory=chain_directory,
                                      nlp=nlp, convert_to_binary=convert_to_binary, n_iter=n_iter, restart=restart, seed=seed)
 
                 #Pool starts a number of threads, all of which may try to tackle all of the data. Better to take it in chunks
@@ -999,7 +1000,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
                 k=0
                 objs=d.object_names
                 while k
+    >>> ...
+    >>> sha = get_git_revision_short_hash()
+    >>> print(sha)
+    'ede068e'
+    """
+    _hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])
+    return _hash.decode("utf-8").rstrip()
+
+
+def get_timestamp():
+    """ Helper function to obtain a timestamp for the current analysis run
+
+    Returns
+    -------
+    timestamp : str
+        Short representation of the current date and time,
+        'YYYY-MM-DD-HOURMINUTE'
+
+    Examples
+    --------
+    >>> ...
+    >>> timestamp = get_timestamp()
+    >>> print(timestamp)
+    '2019-05-18-2100'
+    """
+    _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M'])
+    return _timestamp.decode("utf-8").rstrip()
+
+
+def create_folder_structure(analysis_directory, analysis_name):
+    """ Make directories that will be used for analysis
+
+    Parameters
+    ----------
+    analysis_directory : str
+        System path to the directory that will contain this run of the
+        analysis
+    analysis_name : str
+        Given name of analysis run. This is appended with the current git hash
+        the code has been run with.
+
+    Returns
+    -------
+    directories : dict
+        Dictionary containing the mapping of folders that have been created.
+
+    Examples
+    --------
+    Each folder name can then be accessed with dictionary methods:
+    >>> ...
+    >>> analysis_directory = params.get("analysis_directory", None)
+    >>> analysis_name = params.get("analysis_name", None)
+    >>> directories = create_folder_structure(analysis_directory, analysis_name)
+    >>> print(directories.get("method_directory", None))
+
+    """
+
+    method_directory = os.path.join(analysis_directory, analysis_name)
+    features_directory = os.path.join(method_directory, 'wavelet_features')
+    classifications_directory = os.path.join(method_directory, 'classifications')
+    intermediate_files_directory = os.path.join(method_directory, 'intermediate_files')
+    plots_directory = os.path.join(method_directory, 'plots')
+
+    directories = {"method_directory": method_directory, "features_directory": features_directory,
+                   "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory,
+                   "plots_directory": plots_directory}
+
+    if os.path.isdir(method_directory):
+        errmsg = """
+        Folders already exist with this analysis name.
+
+        Are you sure you would like to proceed? This will overwrite the
+        {} folder [Y/n]
+        """.format(analysis_name)
+        print(errmsg)
+
+        _yes = ["yes", "y", "ye"]
+        _no = ["no", "n"]
+
+        choice = input().lower()
+
+        if choice in _yes:
+            print("Overwriting existing folder..")
+            for key, value in directories.items():
+                subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL)
+        elif choice in _no:
+            print("Exiting without overwriting..")
+            sys.exit()
+        else:
+            sys.stdout.write("Please respond with 'yes' or 'no'")
+    else:
+        for key, value in directories.items():
+            subprocess.call(['mkdir', value])
+
+    return directories
+
+
+def get_directories(analyses_directory, analysis_name):
+    """Returns the folder directories inside of a given analysis.
+
+    # TODO [Add a link to the place where we have an explanation of the folder structure]
+
+    Parameters
+    ----------
+    analyses_directory : str
+        System path to where the user stores all analyses.
+    analysis_name : str
+        Name of the analysis we want.
+
+    Returns
+    -------
+    directories : dict
+        Dictionary containing the mapping of folders inside of `analysis_name`.
+    """
+    analysis_directory = os.path.join(analyses_directory, analysis_name)
+    features_directory = os.path.join(analysis_directory, 'wavelet_features')
+    classifications_directory = os.path.join(analysis_directory, 'classifications')
+    intermediate_files_directory = os.path.join(analysis_directory, 'intermediate_files')
+    plots_directory = os.path.join(analysis_directory, 'plots')
+
+    directories = {"analysis_directory": analysis_directory,
+                   "features_directory": features_directory,
+                   "classifications_directory": classifications_directory,
+                   "intermediate_files_directory": intermediate_files_directory,
+                   "plots_directory": plots_directory}
+
+    return directories
+
+
+def load_configuration_file(path_to_configuration_file):
+    """ Load from disk the configuration file that is to be used
+
+    Parameters
+    ----------
+    path_to_configuration_file : str
+        System path to where the configuration file is located
+
+    Returns
+    -------
+    params : dict
+        Dictionary of parameters contained inside the configuration file
+
+    Examples
+    --------
+    Each item inside the configuration file can be accessed like so:
+    >>> ...
+    >>> params = load_configuration_file(path_to_configuration_file)
+    >>> kernel_param = params.get("kernel_param", None)
+    >>> print(kernel_param)
+    [500.0, 20.0]
+    >>> number_gp = params.get("number_gp", None)
+    >>> print(number_gp)
+    '1100'
+    """
+    try:
+        with open(path_to_configuration_file) as f:
+            params = yaml.load(f)
+    except IOError:
+        print("Unable to open the configuration file provided")
+        sys.exit()
+    print("The parameters are:\n {}".format(params))
+    return params
+
+
+def save_configuration_file(params, method_directory):
+    """ Make a copy of the configuration file that has been used inside the
+    analysis directory
+
+    Parameters
+    ----------
+    params : dict
+        Dictionary containing the parameters used for this analysis
+    method_directory : string
+        Folder where this analysis is taking place
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> ...
+    >>> save_configuration_file(params, method_directory)
+    >>> subprocess.call(['cat', os.path.join(method_directory, "logs.yml")])
+    analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/
+    analysis_name: pipeline-test
+    data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle
+    git_hash: 916eaec
+    kernel_param:
+    - 500.0
+    - 20.0
+    number_gp: 1100
+    number_of_principal_components: 10
+    timestamp: 2019-05-21-1204
+    """
+
+    git_hash = {"git_hash": get_git_revision_short_hash()}
+    timestamp = {"timestamp": get_timestamp()}
+
+    params.update(git_hash)
+    params.update(timestamp)
+
+    with open(os.path.join(method_directory, "logs.yml"), 'a+') as config:
+        yaml.dump(params, config, default_flow_style=False)
+
+
+def load_training_data(data_path):
+    """ Load from disk the training data one will use for this analysis
+
+    Parameters
+    ----------
+    data_path : str
+        System path to the training data, either a pickled `snmachine` dataset
+        or the raw CSV files.
+
+    Returns
+    -------
+    training_data : snmachine.PlasticcData
+        snmachine.PlasticcData instance of the training data
+
+    Examples
+    --------
+    >>> ...
+    >>> training_data = load_training_data(data_path)
+    >>> print(training_data)
+
+    """
+
+    try:
+        if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")):
+            with open(data_path, 'rb') as input:
+                print("Opening from binary pickle")
+                training_data = pickle.load(input)
+                print("Dataset loaded from pickle file as: {}".format(training_data))
+        else:
+            folder_path, train_data_file_name = os.path.split(data_path)
+            meta_data_file_name = "_metadata.".join(train_data_file_name.split("."))
+
+            print("Opening from CSV")
+            training_data = sndata.PlasticcData(folder=folder_path, data_file=train_data_file_name,
+                                                metadata_file=meta_data_file_name, cut_non_detections=False)
+            print("Dataset loaded from csv file as: {}".format(training_data))
+            print("Saving {} object to pickle binary".format(training_data))
+
+            dat_binary = os.path.splitext(train_data_file_name)[0] + ".pckl"
+            print(os.path.join(folder_path, dat_binary))
+            with open(os.path.join(folder_path, dat_binary), 'wb') as f:
+                pickle.dump(training_data, f, pickle.HIGHEST_PROTOCOL)
+    except FileNotFoundError:
+        print("No file found to load")
+        exit()
+
+    return training_data
+
+
+def reduce_size_of_training_data(training_data, directories, subset_size, seed=1234):
+    # TODO: Incorporate further docstrings and finish examples. Tarek: Catarina and I need to
+    # discuss this further. There is some overlap between this and
+    # sndata.PlasticcData.update_data() and it would be good to combine this.
+    """ Reduce the training data to a random subset of objects
+
+    Parameters
+    ----------
+    training_data : snmachine.PlasticcData
+        Dataset whose number of objects will be reduced.
+    directories : dict
+        Dictionary containing the mapping of folders used for this analysis.
+    subset_size : int
+        Number of objects the user would like to reduce the training data to
+    seed : int
+        Default set to 1234. This can be overridden by the user to check for
+        consistency of results
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> ...
+    >>> print(shape.training_data)
+
+    >>> new_training_data = reduce_size_of_training_data(training_data,
+                                                         directories, 1000)
+    >>> print(shape.new_training_data)
+
+    """
+
+    method_directory = directories.get("method_directory", None)
+    subset_file = os.path.join(method_directory, "subset.list")
+    if os.path.exists(subset_file):
+        rand_objs = np.genfromtxt(subset_file, dtype='U')
+    else:
+        np.random.seed(seed)
+        rand_objs = np.random.choice(training_data.object_names, replace=False, size=subset_size)
+        rand_objs_sorted_int = np.sort(rand_objs.astype(np.int))
+        rand_objs = rand_objs_sorted_int.astype('
+    >>> ...
+    >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes,
+                                                                     save_output='all',
+                                                                     output_root=directories.get("intermediate_files_directory"))
+    >>> print()
+
+    """
+    wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp)
+    print("WAV = {}\n".format(wavelet_object.wav))
+    print("MLEV = {}\n".format(wavelet_object.mlev))
+    print("number_gp = {}\n".format(number_gp))
+    waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs)
+    return waveout, waveout_err, wavelet_object
+
+
+def combine_all_features(reduced_wavelet_features, dataframe):
+    # TODO: Improve docstrings.
+    """ Combine snmachine wavelet features with PLASTICC features. The
+    user should define a dataframe they would like to merge.
+
+    Parameters
+    ----------
+    reduced_wavelet_features : astropy.table.table.Table
+        These are the N principal components from the uncompressed wavelets
+    dataframe : pandas.DataFrame
+        DataFrame of additional features, e.g. the PLASTICC metadata, to merge
+        with the wavelet features.
+
+    Returns
+    -------
+    combined_features : pandas.DataFrame
+
+    Examples
+    --------
+    >>> ...
+    >>> print(shape.reduced_wavelet_features)
+
+    >>> print(shape.dataframe)
+
+    >>> combined_features = combine_all_features(reduced_wavelet_features, dataframe)
+    >>> print(shape.combined_features)
+
+    """
+
+    def merge_features(some_features, other_features):
+        # TODO: Move this to a data processing file
+        if type(some_features) != pd.core.frame.DataFrame:
+            some_features = some_features.to_pandas()
+        if type(other_features) != pd.core.frame.DataFrame:
+            other_features = other_features.to_pandas()
+        merged_df = pd.merge(some_features, other_features)
+        merged_df.set_index("Object", inplace=True)
+        return merged_df
+
+    combined_features = merge_features(reduced_wavelet_features, dataframe)
+    return combined_features
+
+
+def _to_pandas(features):
+    # TODO: Improve docstrings.
+    """ Helper function to take either an astropy Table
+    or numpy ndarray and convert to a pandas DataFrame representation
+
+    Parameters
+    ----------
+    features : astropy.table.table.Table OR numpy.ndarray
+        This parameter can be either an astropy Table or numpy ndarray
+        representation of the wavelet features
+
+    Returns
+    -------
+    features : pandas.DataFrame
+
+    Examples
+    --------
+    >>> ...
+    >>> print(type(features))
+
+    >>> features = _to_pandas(features)
+    >>> print(type(features))
+
+    """
+
+    if isinstance(features, np.ndarray):
+        # NOTE: relies on the module-level `training_data` defined in __main__
+        features = pd.DataFrame(features, index=training_data.object_names)
+    else:
+        features = features.to_pandas()
+
+    return features
+
+
+def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42, number_comps=''):
+    # TODO: Improve docstrings.
+    """ Creation of an optimised Random Forest classifier.
+
+    Parameters
+    ----------
+    combined_features : pandas.DataFrame
+        DataFrame of the features to train on, indexed on objects.
+    training_data : snmachine.PlasticcData
+        Training dataset; its labels are used as the classification targets.
+    directories : dict
+        Dictionary containing the mapping of folders for this analysis, used
+        to save the classifier and the confusion matrix plot.
+    augmentation_method : {None, 'SMOTE'}, optional
+        If 'SMOTE', resample the classes before fitting the classifier.
+    random_state : int
+        To allow for reproducible results.
+    number_comps : int or str, optional
+        Number of principal components used; only used to name the saved files.
+
+    Returns
+    -------
+    classifier : sklearn.ensemble.RandomForestClassifier
+    confusion_matrix : numpy.ndarray
+
+    Examples
+    --------
+    >>> ...
+    >>> classifier, confusion_matrix = create_classifier(combined_features)
+    >>> print(classifier)
+    (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
+            max_depth=None, max_features='auto', max_leaf_nodes=None,
+            min_impurity_split=1e-07, min_samples_leaf=1,
+            min_samples_split=2, min_weight_fraction_leaf=0.0,
+            n_estimators=700, n_jobs=-1, oob_score=True, random_state=42,
+            verbose=0, warm_start=False), array([[ 1.]]))
+    """
+
+    combined_features['target'] = training_data.labels.values
+
+    X = combined_features.drop('target', axis=1)
+    y = combined_features['target'].values
+
+    target_names = combined_features['target'].unique()
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
+
+    classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state)
+
+    if augmentation_method in ['SMOTE']:
+        classifer = make_pipeline(eval(augmentation_method)(sampling_strategy='not majority'), classifer)
+    else:
+        print("No augmentation selected, proceeding without resampling of classes")
+
+    classifer.fit(X_train, y_train)
+
+    # Classify and report the results
+    print(classification_report_imbalanced(y_test, classifer.predict(X_test)))
+
+    y_preds = classifer.predict(X_test)
+    confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True)
+
+    timestamp = get_timestamp()
+    with open(os.path.join(directories.get("classifications_directory"), F'classifer_{number_comps}_{augmentation_method}.pkl'), 'wb') as clf:
+        pickle.dump(classifer, clf)
+
+    figure.savefig(os.path.join(directories.get("plots_directory"), F'confusion_matrix_{number_comps}_{augmentation_method}.pdf'))
+
+    return classifer, confusion_matrix
+
+
+def make_predictions(location_of_test_data, classifier):
+    # TODO: Move to a separate make_predictions file
+    pass
+
+
+def restart_from_saved_gps(directories):
+    pass
+
+
+def restart_from_saved_wavelets(directories):
+    pass
+
+
+def restart_from_saved_pca(directories, number_of_principal_components):
+    # TODO: Write docstrings
+    wavelet_features = pd.read_pickle(os.path.join(directories.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components)))
+    combined_features = wavelet_features  # For running tests for now
+    # TODO: the training data must be loaded before this restart path can run
+    classifier, confusion_matrix = create_classifier(combined_features, training_data, directories)
+    print(F"classifier = {classifier}")
+
+
+if __name__ == "__main__":
+
+    # Set the number of processes you want to use throughout the pipeline
+    number_processes = multiprocessing.cpu_count()
+    print("Running with {} cores".format(number_processes))
+
+    parser = ArgumentParser(description="Run pipeline end to end")
+    parser.add_argument('--configuration', '-c')
+    parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full")
+    arguments = parser.parse_args()
+    arguments = vars(arguments)
+
+    path_to_configuration_file = arguments['configuration']
+    params = load_configuration_file(path_to_configuration_file)
+
+    data_path = params.get("data_path", None)
+    analysis_directory = params.get("analysis_directory", None)
+    analysis_name = params.get("analysis_name", None)
+
+    # snmachine parameters
+    number_gp = params.get("number_gp", None)
+    kernel_param = params.get("kernel_param", None)
+    number_of_principal_components = params.get("number_of_principal_components", None)
+
+    # Step 1. Create folders that will contain the analysis
+    directories = create_folder_structure(analysis_directory, analysis_name)
+    # Step 2. Save configuration file used for this analysis
+    save_configuration_file(params, directories.get("method_directory"))
+    # Step 3. Check at which point the user would like to run the analysis from.
+    # If elements already saved, these will be used but this can be overridden
+    # with the command line argument
+    if (arguments['restart_from'].lower() == "gps"):
+        # Restart from saved GPs.
+        pass
+    elif (arguments['restart_from'].lower() == "wavelets"):
+        # Restart from saved uncompressed wavelets.
+        pass
+    elif (arguments['restart_from'].lower() == "pca"):
+        # Restart from saved PCA components
+        restart_from_saved_pca(directories, number_of_principal_components)
+    else:
+        # Run full pipeline but still do checks to see if elements from GPs or
+        # wavelets already exist on disk; the first check should be for:
+        # a. Saved PCA files
+        #    path_saved_reduced_wavelets = directories.get("intermediate_files_directory")
+        #    eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy'))
+        #    means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy'))
+        # b. Saved uncompressed wavelets
+        # c. Saved GPs
+
+        # Step 4. Load in training data
+        training_data = load_training_data(data_path)
+        print("training_data = {}".format(training_data))
+
+        # Step 5. Compute GPs
+        gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100,
+                        kernel_param=kernel_param,
+                        output_root=directories['intermediate_files_directory'],
+                        number_processes=number_processes)
+
+        # Step 6. Extract wavelet coefficients
+        waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes,
+                                                                     save_output='all',
+                                                                     output_root=directories.get("intermediate_files_directory"))
+        print("waveout = {}".format(waveout))
+        print("waveout, type = {}".format(type(waveout)))
+
+        print("waveout_err = {}".format(waveout_err))
+        print("waveout_err, type = {}".format(type(waveout_err)))
+
+        print("wavelet_object = {}".format(wavelet_object))
+        print("wavelet_object, type = {}".format(type(wavelet_object)))
+
+        # Step 7. Reduce dimensionality of wavelets by using only N principal components
+        wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components,
+                                                                                              tol=None,
+                                                                                              pca_path=None,
+                                                                                              save_output=True,
+                                                                                              output_root=directories.get("features_directory"))
+        print("wavelet_features = {}".format(wavelet_features))
+        print("wavelet_features, type = {}".format(type(wavelet_features)))
+
+        # Step 8. TODO Combine snmachine features with user defined features
+
+        # Step 9. TODO Create a Random Forest classifier; need to fit model and save it.
+        combined_features = wavelet_features  # For running tests for now
+        classifier, confusion_matrix = create_classifier(combined_features, training_data, directories)
+        print(F"classifier = {classifier}")
+
+        # Step 10. TODO Use saved classifier to make predictions. This can occur using a separate file
diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py
index 7b1b45f5..3659cd4b 100644
--- a/utils/plasticc_utils.py
+++ b/utils/plasticc_utils.py
@@ -2,36 +2,45 @@
 Utility script for calculating the log loss
 """
+from sklearn.metrics import confusion_matrix
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.metrics import auc, roc_curve, confusion_matrix
 
-def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames):
-    cm = confusion_matrix(yTrue, yPredict, labels=targetNames)
-    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False):
+    cm = confusion_matrix(y_true, y_pred, labels=target_names)
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
     annot = np.around(cm, 2)
-    fig, ax = plt.subplots(figsize=(9,7))
-    sns.heatmap(cm, xticklabels=targetNames,
-                yticklabels=targetNames, cmap='Blues',
-                annot=annot, lw=0.5)
+    fig, ax = plt.subplots(figsize=(9, 7))
+    sns.heatmap(cm, xticklabels=target_names,
+                yticklabels=target_names, cmap='Blues',
+                annot=annot, lw=0.5)
+
     ax.set_xlabel('Predicted Label')
     ax.set_ylabel('True Label')
     ax.set_aspect('equal')
-    plt.title(dataName)
+    plt.title(title)
+
+    return cm, fig
 
-    return cm
 
-def plasticcLogLoss(y_true, y_pred, relative_class_weights=None):
+def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):
     """
     Implementation of weighted log loss used for the Kaggle challenge
     """
     predictions = y_pred.copy()
 
     # sanitize predictions
-    epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0)
+    epsilon = sys.float_info.epsilon  # this is machine dependent but essentially prevents log(0)
     predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
     predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
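
Not part of the patch above — a minimal, self-contained sketch of the resampling-plus-classifier pattern that `create_classifier` assembles when `augmentation_method == 'SMOTE'`, which is why `imbalanced-learn>=0.4.3` was added to `environment.yml`. The synthetic data, variable names and everything not visible in the diff (the toy class fractions, sample counts) are illustrative assumptions, not pipeline defaults.

```python
# Standalone sketch (assumptions noted above): SMOTE oversampling wrapped
# around the same RandomForest configuration used in create_classifier().
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

# Imbalanced toy problem standing in for the wavelet feature matrix:
# three classes with roughly 80/15/5 per cent class fractions.
X, y = make_classification(n_samples=500, n_classes=3, n_informative=6,
                           weights=[0.80, 0.15, 0.05], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Oversample every class except the majority one, then fit the forest.
classifier = make_pipeline(
    SMOTE(sampling_strategy='not majority'),
    RandomForestClassifier(n_estimators=700, criterion='entropy',
                           oob_score=True, n_jobs=-1, random_state=42))
classifier.fit(X_train, y_train)

# Same per-class report create_classifier() prints after fitting.
print(classification_report_imbalanced(y_test, classifier.predict(X_test)))
```

With imblearn's `make_pipeline`, the resampling happens only during `fit`, so the held-out scores are computed on untouched test data — the property the pipeline script relies on by wrapping the classifier rather than oversampling the whole feature table up front.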