diff --git a/.gitignore b/.gitignore
index 6875f042..0797322a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 test/*
 !test/*.py
 
+# Do not track log files in utils
+utils/*stdout.txt
+
 ## Python.gitignore from Github. ##
 
 # Byte-compiled / optimized / DLL files
diff --git a/environment.yml b/environment.yml
index 997fc7e1..23422a70 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,17 +11,18 @@ dependencies:
   - jupyter>=1.0.0
   - matplotlib>=1.5.1
   - numpy=1.12.0
-  - scikit-learn=0.18.1
+  - scikit-learn>=0.20
   - scipy>=0.17.0
   - george>=0.3.0
   - iminuit>=1.2
   - pandas>=0.23.0
   - extinction>=0.3.0
+  - imbalanced-learn>=0.4.3
   - pip:
     - emcee>=2.1.0
     - numpydoc>=0.6.0
     - pywavelets>=0.4.0
-    - sncosmo>=1.3.0
+    - sncosmo==1.7.1
     - nose>=1.3.7
     - future>=0.16
diff --git a/snmachine/gps.py b/snmachine/gps.py
index 5b965756..3dec7868 100644
--- a/snmachine/gps.py
+++ b/snmachine/gps.py
@@ -56,7 +56,7 @@ def compute_gps(dataset, number_gp, t_min, t_max, kernel_param=[500., 20.], outp
     output_root : {None, str}, optional
         If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there.
     number_processes : int, optional
-        Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1.
+        Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1.
     gp_algo : str, optional
         which gp package is used for the Gaussian Process Regression, GaPP or george
     """
@@ -148,7 +148,7 @@ def _compute_gps_parallel(dataset, number_gp, t_min, t_max, kernel_param, output
     output_root : {None, str}, optional
         If None, don't save anything. If str, it is the output directory, so save the flux and error estimates and used kernels there.
     number_processes : int, optional
-        Number of processors to use for parallelisation (shared memory only). By default `nprocesses` = 1.
+        Number of processors to use for parallelisation (shared memory only). By default `number_processes` = 1.
     gp_algo : str, optional
         which gp package is used for the Gaussian Process Regression, GaPP or george
     """
@@ -413,4 +413,4 @@ def get_kernel(kernel_name, kernel_param):
     elif kernel_name == 'ExpSquared+ExpSine2':
         kExpSine2 = kernel_param[4]*george.kernels.ExpSine2Kernel(gamma=kernel_param[5],log_period=kernel_param[6])
         kernel = kExpSquared + kExpSine2
-    return kernel
\ No newline at end of file
+    return kernel
diff --git a/snmachine/snaugment.py b/snmachine/snaugment.py
index 4e582cd8..564a6f96 100644
--- a/snmachine/snaugment.py
+++ b/snmachine/snaugment.py
@@ -123,7 +123,7 @@ def extract_proxy_features(self,peak_filter='desr',nproc=1,fit_salt2=False,salt2
         #tf=snfeatures.TemplateFeatures(sampler='leastsq')
         tf=snfeatures.TemplateFeatures(sampler=sampler)
         if salt2feats is None:
-            salt2feats=tf.extract_features(self.dataset,nprocesses=nproc,use_redshift=fix_redshift)
+            salt2feats=tf.extract_features(self.dataset,number_processes=nproc,use_redshift=fix_redshift)
 
         #fit models and extract r-peakmags
         peaklogflux=[]
diff --git a/snmachine/snclassifier.py b/snmachine/snclassifier.py
old mode 100755
new mode 100644
index 0e3a4cd2..185356a1
--- a/snmachine/snclassifier.py
+++ b/snmachine/snclassifier.py
@@ -608,7 +608,7 @@ def __call_classifier(classifier, X_train, y_train, X_test, param_dict, return_c
 
 def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb', 'knn', 'svm', 'neural_network', 'boost_dt'],
-                 training_set=0.3, param_dict={}, nprocesses=1, scale=True,
+                 training_set=0.3, param_dict={}, number_processes=1, scale=True,
                  plot_roc_curve=True, return_classifier=False, classifiers_for_cm_plots=[], type_dict=None, seed=1234):
     """
@@ -632,7 +632,7 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb',
         the ID's of the objects to be used
     param_dict : dict, optional
         Use to run different ranges of hyperparameters for the classifiers when optimising
-    nprocesses : int, optional
+    number_processes : int, optional
        Number of processors for multiprocessing (shared memory only). Each classifier will then be run in parallel.
     scale : bool, optional
        Rescale features using sklearn's preprocessing Scalar class (highly recommended this is True)
@@ -707,15 +707,15 @@ def run_pipeline(features, types, output_name='', columns=[], classifiers=['nb',
     probabilities = {}
     classifier_objects = {}
 
-    if nprocesses > 1 and return_classifier:
+    if number_processes > 1 and return_classifier:
         print("Due to limitations with python's multiprocessing module, classifier objects cannot be returned if " \
               "multiple processors are used. Continuing serially...")
         print()
 
-    if nprocesses > 1 and not return_classifier:
+    if number_processes > 1 and not return_classifier:
         partial_func=partial(__call_classifier, X_train=X_train, y_train=y_train, X_test=X_test, param_dict=param_dict,
                              return_classifier=False)
-        p = Pool(nprocesses, maxtasksperchild=1)
+        p = Pool(number_processes, maxtasksperchild=1)
         result = p.map(partial_func, classifiers)
 
         for i in range(len(result)):
diff --git a/snmachine/snfeatures.py b/snmachine/snfeatures.py
index d1b6ee39..b7e6f930 100644
--- a/snmachine/snfeatures.py
+++ b/snmachine/snfeatures.py
@@ -686,7 +686,7 @@ def __init__(self, model=['Ia'], sampler='leastsq',lsst_bands=False,lsst_dir='..
                          'nugent-sn2l':{'z':(0.01, 1.5)},
                          'nugent-sn1bc':{'z':(0.01, 1.5)}}
 
-    def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, nprocesses=1, restart=False, seed=-1):
+    def extract_features(self, d, save_output=False, chain_directory='chains', use_redshift=False, number_processes=1, restart=False, seed=-1):
         """
         Extract template features for a dataset.
@@ -700,7 +700,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             Where to save the chains
         use_redshift : bool
             Whether or not to use provided redshift when fitting objects
-        nprocesses : int, optional
+        number_processes : int, optional
             Number of processors to use for parallelisation (shared memory only)
         restart : bool
             Whether or not to restart from multinest chains
@@ -721,6 +721,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
                 self.model=sncosmo.Model(self.templates[mod_name],effects=[dust],effect_names=['host'], effect_frames=['rest'])
             else:
                 self.model=sncosmo.Model(self.templates[mod_name])
+            print(F'MODEL-NAME: {mod_name}')
             params=['['+mod_name+']'+pname for pname in self.model.param_names]
             # err_plus=[pname+'_err+' for pname in params]
             # err_minus=[pname+'_err-' for pname in params]
@@ -736,7 +737,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             output = Table(names=labels, dtype=['U32'] + ['f'] * (len(labels) - 1))
             k=0
-            if nprocesses<2:
+            if number_processes<2:
                 for obj in d.object_names:
                     if k%100==0:
                         print (k, 'objects fitted')
@@ -779,7 +780,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
             else:
                 if self.sampler=='leastsq':
-                    p=Pool(nprocesses, maxtasksperchild=1)
+                    p=Pool(number_processes, maxtasksperchild=1)
                     partial_func=partial(_run_leastsq_templates, d=d, model_name=self.templates[mod_name], use_redshift=use_redshift, bounds=self.bounds[self.templates[mod_name]])
                     out=p.map(partial_func, d.object_names)
                     output=out[0]
@@ -790,7 +791,7 @@ def extract_features(self, d, save_output=False, chain_directory='chains', use_r
                     else:
                         all_output=vstack((all_output, output))
                 elif self.sampler=='nested':
-                    p=Pool(nprocesses, maxtasksperchild=1)
+                    p=Pool(number_processes, maxtasksperchild=1)
                     partial_func=partial(_run_multinest_templates, d=d, model_name=self.templates[mod_name], bounds=self.bounds[self.templates[mod_name]], chain_directory=chain_directory,
                                          nlp=1000, convert_to_binary=True, use_redshift=use_redshift, short_name=self.short_names[mod_name], restart=restart, seed=seed)
                     out=p.map(partial_func, d.object_names)
@@ -914,7 +915,7 @@ def __init__(self, model_choice, sampler='leastsq', limits=None):
 
-    def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, nprocesses=1, n_walkers=100,
+    def extract_features(self, d, chain_directory='chains', save_output=True, n_attempts=20, number_processes=1, n_walkers=100,
                          n_steps=500, walker_spread=0.1, burn=50, nlp=1000, starting_point=None, convert_to_binary=True, n_iter=0, restart=False, seed=-1):
         """
         Fit parametric models and return best-fitting parameters as features.
@@ -930,7 +931,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
         n_attempts : int
             Allow the minimiser to start in new random locations if the fit is bad. Put n_attempts=1 to fit only once with the default starting position.
-        nprocesses : int, optional
+        number_processes : int, optional
             Number of processors to use for parallelisation (shared memory only)
         n_walkers : int
             emcee parameter - number of walkers to use
@@ -963,7 +964,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
         output=[]
 
         #obj=d.object_names[0]
-        if nprocesses<2:
+        if number_processes<2:
             k=0
             for obj in d.object_names:
                 if k%100==0:
@@ -984,14 +985,14 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
                 k+=1
         else:
             if self.sampler=='leastsq':
-                p=Pool(nprocesses, maxtasksperchild=1)
+                p=Pool(number_processes, maxtasksperchild=1)
                 partial_func=partial(_run_leastsq, d=d, model=self.model, n_attempts=n_attempts, seed=seed)
                 out=p.map(partial_func, d.object_names)
                 output=out[0]
                 for i in range(1, len(out)):
                     output=vstack((output, out[i]))
             elif self.sampler=='nested':
-                p=Pool(nprocesses, maxtasksperchild=1)
+                p=Pool(number_processes, maxtasksperchild=1)
                 partial_func=partial(_run_multinest, d=d, model=self.model,chain_directory=chain_directory,
                                      nlp=nlp, convert_to_binary=convert_to_binary, n_iter=n_iter, restart=restart, seed=seed)
 
                 #Pool starts a number of threads, all of which may try to tackle all of the data. Better to take it in chunks
@@ -999,7 +1000,7 @@ def extract_features(self, d, chain_directory='chains', save_output=True, n_atte
                 k=0
                 objs=d.object_names
                 while k
+    >>> ...
+    >>> sha = get_git_revision_short_hash()
+    >>> print(sha)
+    'ede068e'
+    """
+    _hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])
+    return _hash.decode("utf-8").rstrip()
+
+
+def get_timestamp():
+    """ Helper function to obtain a timestamp for the current analysis run
+
+    Returns
+    -------
+    timestamp : str
+        Short representation of the current date and time,
+        'YYYY-MM-DD-HOURMINUTE'
+
+    Examples
+    --------
+    >>> ...
+    >>> timestamp = get_timestamp()
+    >>> print(timestamp)
+    '2019-05-18-2100'
+    """
+    _timestamp = subprocess.check_output(['date', '+%Y-%m-%d-%H%M'])
+    return _timestamp.decode("utf-8").rstrip()
+
+
+def create_folder_structure(analysis_directory, analysis_name):
+    """ Make directories that will be used for analysis
+
+    Parameters
+    ----------
+    analysis_directory : str
+        System path to the directory that will contain this run of the
+        analysis
+    analysis_name : str
+        Given name of analysis run. This is appended with the current git hash
+        the code has been run with.
+
+    Returns
+    -------
+    directories : dict
+        Dictionary containing the mapping of folders that have been created.
+
+    Examples
+    --------
+    Each folder name can then be accessed with dictionary methods:
+    >>> ...
+    >>> analysis_directory = params.get("analysis_directory", None)
+    >>> analysis_name = params.get("analysis_name", None)
+    >>> directories = create_folder_structure(analysis_directory, analysis_name)
+    >>> print(directories.get("method_directory", None))
+
+    """
+
+    method_directory = os.path.join(analysis_directory, analysis_name)
+    features_directory = os.path.join(method_directory, 'wavelet_features')
+    classifications_directory = os.path.join(method_directory, 'classifications')
+    intermediate_files_directory = os.path.join(method_directory, 'intermediate_files')
+    plots_directory = os.path.join(method_directory, 'plots')
+
+    directories = {"method_directory": method_directory, "features_directory": features_directory,
+                   "classifications_directory": classifications_directory, "intermediate_files_directory": intermediate_files_directory,
+                   "plots_directory": plots_directory}
+
+    if os.path.isdir(method_directory):
+        errmsg = """
+        Folders already exist with this analysis name.
+
+        Are you sure you would like to proceed? This will overwrite the
+        {} folder [Y/n]
+        """.format(analysis_name)
+        print(errmsg)
+
+        _yes = ["yes", "y", "ye"]
+        _no = ["no", "n"]
+
+        choice = input().lower()
+
+        if choice in _yes:
+            print("Overwriting existing folder..")
+            for key, value in directories.items():
+                subprocess.call(['mkdir', value], stderr=subprocess.DEVNULL)
+        elif choice in _no:
+            print("Exiting without overwriting..")
+            sys.exit()
+        else:
+            sys.stdout.write("Please respond with 'yes' or 'no'")
+    else:
+        for key, value in directories.items():
+            subprocess.call(['mkdir', value])
+
+    return directories
+
+
+def get_directories(analyses_directory, analysis_name):
+    """Returns the folder directories inside of a given analysis.
+
+    # TODO [Add a link to the place where we have an explanation of the folder structure]
+
+    Parameters
+    ----------
+    analyses_directory : str
+        System path to where the user stores all analyses.
+    analysis_name : str
+        Name of the analysis we want.
+
+    Returns
+    -------
+    directories : dict
+        Dictionary containing the mapping of folders inside of `analysis_name`.
+    """
+    analysis_directory = os.path.join(analyses_directory, analysis_name)
+    features_directory = os.path.join(analysis_directory, 'wavelet_features')
+    classifications_directory = os.path.join(analysis_directory, 'classifications')
+    intermediate_files_directory = os.path.join(analysis_directory, 'intermediate_files')
+    plots_directory = os.path.join(analysis_directory, 'plots')
+
+    directories = {"analysis_directory": analysis_directory,
+                   "features_directory": features_directory,
+                   "classifications_directory": classifications_directory,
+                   "intermediate_files_directory": intermediate_files_directory,
+                   "plots_directory": plots_directory}
+
+    return directories
+
+
+def load_configuration_file(path_to_configuration_file):
+    """ Load from disk the configuration file that is to be used
+
+    Parameters
+    ----------
+    path_to_configuration_file : str
+        System path to where the configuration file is located
+
+    Returns
+    -------
+    params : dict
+        Dictionary of parameters contained inside the configuration file
+
+    Examples
+    --------
+    Each item inside the configuration file can be accessed like so:
+    >>> ...
+    >>> params = load_configuration_file(path_to_configuration_file)
+    >>> kernel_param = params.get("kernel_param", None)
+    >>> print(kernel_param)
+    [500.0, 20.0]
+    >>> number_gp = params.get("number_gp", None)
+    >>> print(number_gp)
+    '1100'
+    """
+    try:
+        with open(path_to_configuration_file) as f:
+            params = yaml.load(f)
+    except IOError:
+        print("Unable to open the configuration file provided")
+        sys.exit()
+    print("The parameters are:\n {}".format(params))
+    return params
+
+
+def save_configuration_file(params, method_directory):
+    """ Make a copy of the configuration file that has been used inside the
+    analysis directory
+
+    Parameters
+    ----------
+    params : dict
+        Dictionary containing the parameters used for this analysis
+    method_directory : string
+        Folder where this analysis is taking place
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> ...
+    >>> save_configuration_file(params, method_directory)
+    >>> subprocess.call(['cat', os.path.join(method_directory, "logs.yml")])
+    analysis_directory: /share/hypatia/snmachine_resources/data/plasticc/analysis/
+    analysis_name: pipeline-test
+    data_path: /share/hypatia/snmachine_resources/data/plasticc/data/raw_data/training_set_snia.pickle
+    git_hash: 916eaec
+    kernel_param:
+    - 500.0
+    - 20.0
+    number_gp: 1100
+    number_of_principal_components: 10
+    timestamp: 2019-05-21-1204
+    """
+
+    git_hash = {"git_hash": get_git_revision_short_hash()}
+    timestamp = {"timestamp": get_timestamp()}
+
+    params.update(git_hash)
+    params.update(timestamp)
+
+    with open(os.path.join(method_directory, "logs.yml"), 'a+') as config:
+        yaml.dump(params, config, default_flow_style=False)
+
+
+def load_training_data(data_path):
+    """ Load from disk the training data one will use for this analysis
+
+    Parameters
+    ----------
+    data_path : str
+        System path to the training data, either a pickled `snmachine` dataset
+        or the raw CSV files.
+
+    Returns
+    -------
+    training_data : snmachine.PlasticcData
+        snmachine.PlasticcData instance of the training data
+
+    Examples
+    --------
+    >>> ...
+    >>> training_data = load_training_data(data_path)
+    >>> print(training_data)
+
+    """
+
+    try:
+        if data_path.lower().endswith((".pickle", ".pkl", ".p", ".pckl")):
+            with open(data_path, 'rb') as input:
+                print("Opening from binary pickle")
+                training_data = pickle.load(input)
+                print("Dataset loaded from pickle file as: {}".format(training_data))
+        else:
+            folder_path, train_data_file_name = os.path.split(data_path)
+            meta_data_file_name = "_metadata.".join(train_data_file_name.split("."))
+
+            print("Opening from CSV")
+            training_data = sndata.PlasticcData(folder=folder_path, data_file=train_data_file_name,
+                                                metadata_file=meta_data_file_name, cut_non_detections=False)
+            print("Dataset loaded from csv file as: {}".format(training_data))
+            print("Saving {} object to pickle binary".format(training_data))
+
+            dat_binary = os.path.splitext(train_data_file_name)[0] + ".pckl"
+            print(os.path.join(folder_path, dat_binary))
+            with open(os.path.join(folder_path, dat_binary), 'wb') as f:
+                pickle.dump(training_data, f, pickle.HIGHEST_PROTOCOL)
+    except FileNotFoundError:
+        print("No file found to load")
+        exit()
+
+    return training_data
+
+
+def reduce_size_of_training_data(training_data, directories, subset_size, seed=1234):
+    # TODO: Incorporate further docstrings and finish examples. Tarek: Catarina and I need to
+    # discuss this further. There is some overlap between this and
+    # sndata.PlasticcData.update_data() and it would be good to combine this.
+    """ Reduce the training data to a random subset of objects
+
+    Parameters
+    ----------
+    training_data : snmachine.PlasticcData
+        Dataset whose number of objects will be reduced.
+    directories : dict
+        Dictionary containing the mapping of folders used for this analysis.
+    subset_size : int
+        Number of objects the user would like to reduce the training data to
+    seed : int
+        Default set to 1234. This can be overridden by the user to check for
+        consistency of results
+
+    Returns
+    -------
+    None
+
+    Examples
+    --------
+    >>> ...
+    >>> print(shape.training_data)
+
+    >>> new_training_data = reduce_size_of_training_data(training_data,
+                                                         directories, 1000)
+    >>> print(shape.new_training_data)
+
+    """
+
+    method_directory = directories.get("method_directory", None)
+    subset_file = os.path.join(method_directory, "subset.list")
+    if os.path.exists(subset_file):
+        rand_objs = np.genfromtxt(subset_file, dtype='U')
+    else:
+        np.random.seed(seed)
+        rand_objs = np.random.choice(training_data.object_names, replace=False, size=subset_size)
+        rand_objs_sorted_int = np.sort(rand_objs.astype(np.int))
+        rand_objs = rand_objs_sorted_int.astype('
+    >>> ...
+    >>> waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes,
+                                                                     save_output='all',
+                                                                     output_root=directories.get("intermediate_files_directory"))
+    >>> print()
+
+    """
+    wavelet_object = snfeatures.WaveletFeatures(number_gp=number_gp)
+    print("WAV = {}\n".format(wavelet_object.wav))
+    print("MLEV = {}\n".format(wavelet_object.mlev))
+    print("number_gp = {}\n".format(number_gp))
+    waveout, waveout_err = wavelet_object.extract_wavelets(training_data, wavelet_object.wav, wavelet_object.mlev, **kwargs)
+    return waveout, waveout_err, wavelet_object
+
+
+def combine_all_features(reduced_wavelet_features, dataframe):
+    # TODO: Improve docstrings.
+    """ Combine snmachine wavelet features with PLASTICC features. The
+    user should define a dataframe they would like to merge.
+
+    Parameters
+    ----------
+    reduced_wavelet_features : astropy.table.table.Table
+        These are the N principal components from the uncompressed wavelets
+    dataframe : pandas.DataFrame
+        DataFrame of additional features, e.g. the PLASTICC metadata, to merge
+        with the wavelet features.
+
+    Returns
+    -------
+    combined_features : pandas.DataFrame
+
+    Examples
+    --------
+    >>> ...
+    >>> print(shape.reduced_wavelet_features)
+
+    >>> print(shape.dataframe)
+
+    >>> combined_features = combine_all_features(reduced_wavelet_features, dataframe)
+    >>> print(shape.combined_features)
+
+    """
+
+    def merge_features(some_features, other_features):
+        # TODO: Move this to a data processing file
+        if type(some_features) != pd.core.frame.DataFrame:
+            some_features = some_features.to_pandas()
+        if type(other_features) != pd.core.frame.DataFrame:
+            other_features = other_features.to_pandas()
+        merged_df = pd.merge(some_features, other_features)
+        merged_df.set_index("Object", inplace=True)
+        return merged_df
+
+    combined_features = merge_features(reduced_wavelet_features, dataframe)
+    return combined_features
+
+
+def _to_pandas(features):
+    # TODO: Improve docstrings.
+    """ Helper function to take either an astropy Table
+    or numpy ndarray and convert to a pandas DataFrame representation
+
+    Parameters
+    ----------
+    features : astropy.table.table.Table OR numpy.ndarray
+        This parameter can be either an astropy Table or numpy ndarray
+        representation of the wavelet features
+
+    Returns
+    -------
+    features : pandas.DataFrame
+
+    Examples
+    --------
+    >>> ...
+    >>> print(type(features))
+
+    >>> features = _to_pandas(features)
+    >>> print(type(features))
+
+    """
+
+    if isinstance(features, np.ndarray):
+        # NOTE: relies on the module-level `training_data` defined in __main__
+        features = pd.DataFrame(features, index=training_data.object_names)
+    else:
+        features = features.to_pandas()
+
+    return features
+
+
+def create_classifier(combined_features, training_data, directories, augmentation_method=None, random_state=42, number_comps=''):
+    # TODO: Improve docstrings.
+    """ Creation of an optimised Random Forest classifier.
+
+    Parameters
+    ----------
+    combined_features : pandas.DataFrame
+        DataFrame of the features to train on, indexed on objects.
+    training_data : snmachine.PlasticcData
+        Training dataset; its labels are used as the classification targets.
+    directories : dict
+        Dictionary containing the mapping of folders for this analysis, used
+        to save the classifier and the confusion matrix plot.
+    augmentation_method : {None, 'SMOTE'}, optional
+        If 'SMOTE', resample the classes before fitting the classifier.
+    random_state : int
+        To allow for reproducible results.
+    number_comps : int or str, optional
+        Number of principal components used; only used to name the saved files.
+
+    Returns
+    -------
+    classifier : sklearn.ensemble.RandomForestClassifier
+    confusion_matrix : numpy.ndarray
+
+    Examples
+    --------
+    >>> ...
+    >>> classifier, confusion_matrix = create_classifier(combined_features)
+    >>> print(classifier)
+    (RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
+            max_depth=None, max_features='auto', max_leaf_nodes=None,
+            min_impurity_split=1e-07, min_samples_leaf=1,
+            min_samples_split=2, min_weight_fraction_leaf=0.0,
+            n_estimators=700, n_jobs=-1, oob_score=True, random_state=42,
+            verbose=0, warm_start=False), array([[ 1.]]))
+    """
+
+    combined_features['target'] = training_data.labels.values
+
+    X = combined_features.drop('target', axis=1)
+    y = combined_features['target'].values
+
+    target_names = combined_features['target'].unique()
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
+
+    classifer = RandomForestClassifier(n_estimators=700, criterion='entropy', oob_score=True, n_jobs=-1, random_state=random_state)
+
+    if augmentation_method in ['SMOTE']:
+        classifer = make_pipeline(eval(augmentation_method)(sampling_strategy='not majority'), classifer)
+    else:
+        print("No augmentation selected, proceeding without resampling of classes")
+
+    classifer.fit(X_train, y_train)
+
+    # Classify and report the results
+    print(classification_report_imbalanced(y_test, classifer.predict(X_test)))
+
+    y_preds = classifer.predict(X_test)
+    confusion_matrix, figure = plot_confusion_matrix(y_test, y_preds, 'Validation data', target_names, normalize=True)
+
+    timestamp = get_timestamp()
+    with open(os.path.join(directories.get("classifications_directory"), F'classifer_{number_comps}_{augmentation_method}.pkl'), 'wb') as clf:
+        pickle.dump(classifer, clf)
+
+    figure.savefig(os.path.join(directories.get("plots_directory"), F'confusion_matrix_{number_comps}_{augmentation_method}.pdf'))
+
+    return classifer, confusion_matrix
+
+
+def make_predictions(location_of_test_data, classifier):
+    # TODO: Move to a separate make_predictions file
+    pass
+
+
+def restart_from_saved_gps(directories):
+    pass
+
+
+def restart_from_saved_wavelets(directories):
+    pass
+
+
+def restart_from_saved_pca(directories, number_of_principal_components):
+    # TODO: Write docstrings
+    wavelet_features = pd.read_pickle(os.path.join(directories.get("features_directory"), "reduced_wavelet_components_{}.pickle".format(number_of_principal_components)))
+    combined_features = wavelet_features  # For running tests for now
+    # TODO: the training data must be loaded before this restart path can run
+    classifier, confusion_matrix = create_classifier(combined_features, training_data, directories)
+    print(F"classifier = {classifier}")
+
+
+if __name__ == "__main__":
+
+    # Set the number of processes you want to use throughout the pipeline
+    number_processes = multiprocessing.cpu_count()
+    print("Running with {} cores".format(number_processes))
+
+    parser = ArgumentParser(description="Run pipeline end to end")
+    parser.add_argument('--configuration', '-c')
+    parser.add_argument('--restart-from', '-r', help='Either restart from saved "GPs" or from saved "Wavelets"', default="full")
+    arguments = parser.parse_args()
+    arguments = vars(arguments)
+
+    path_to_configuration_file = arguments['configuration']
+    params = load_configuration_file(path_to_configuration_file)
+
+    data_path = params.get("data_path", None)
+    analysis_directory = params.get("analysis_directory", None)
+    analysis_name = params.get("analysis_name", None)
+
+    # snmachine parameters
+    number_gp = params.get("number_gp", None)
+    kernel_param = params.get("kernel_param", None)
+    number_of_principal_components = params.get("number_of_principal_components", None)
+
+    # Step 1. Create folders that will contain the analysis
+    directories = create_folder_structure(analysis_directory, analysis_name)
+    # Step 2. Save configuration file used for this analysis
+    save_configuration_file(params, directories.get("method_directory"))
+    # Step 3. Check at which point the user would like to run the analysis from.
+    # If elements already saved, these will be used but this can be overridden
+    # with the command line argument
+    if (arguments['restart_from'].lower() == "gps"):
+        # Restart from saved GPs.
+        pass
+    elif (arguments['restart_from'].lower() == "wavelets"):
+        # Restart from saved uncompressed wavelets.
+        pass
+    elif (arguments['restart_from'].lower() == "pca"):
+        # Restart from saved PCA components
+        restart_from_saved_pca(directories, number_of_principal_components)
+    else:
+        # Run full pipeline but still do checks to see if elements from GPs or
+        # wavelets already exist on disk; the first check should be for:
+        # a. Saved PCA files
+        #    path_saved_reduced_wavelets = directories.get("intermediate_files_directory")
+        #    eigenvectors_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'eigenvectors_' + str(number_of_principal_components) + '.npy'))
+        #    means_saved_file = np.load(os.path.join(path_saved_reduced_wavelets, 'means_' + str(number_of_principal_components) + '.npy'))
+        # b. Saved uncompressed wavelets
+        # c. Saved GPs
+
+        # Step 4. Load in training data
+        training_data = load_training_data(data_path)
+        print("training_data = {}".format(training_data))
+
+        # Step 5. Compute GPs
+        gps.compute_gps(training_data, number_gp=number_gp, t_min=0, t_max=1100,
+                        kernel_param=kernel_param,
+                        output_root=directories['intermediate_files_directory'],
+                        number_processes=number_processes)
+
+        # Step 6. Extract wavelet coefficients
+        waveout, waveout_err, wavelet_object = wavelet_decomposition(training_data, number_gp=number_gp, number_processes=number_processes,
+                                                                     save_output='all',
+                                                                     output_root=directories.get("intermediate_files_directory"))
+        print("waveout = {}".format(waveout))
+        print("waveout, type = {}".format(type(waveout)))
+
+        print("waveout_err = {}".format(waveout_err))
+        print("waveout_err, type = {}".format(type(waveout_err)))
+
+        print("wavelet_object = {}".format(wavelet_object))
+        print("wavelet_object, type = {}".format(type(wavelet_object)))
+
+        # Step 7. Reduce dimensionality of wavelets by using only N principal components
+        wavelet_features, eigenvals, eigenvecs, means, num_feats = wavelet_object.extract_pca(object_names=training_data.object_names, wavout=waveout, recompute_pca=True, method='svd', number_comp=number_of_principal_components,
+                                                                                              tol=None,
+                                                                                              pca_path=None,
+                                                                                              save_output=True,
+                                                                                              output_root=directories.get("features_directory"))
+        print("wavelet_features = {}".format(wavelet_features))
+        print("wavelet_features, type = {}".format(type(wavelet_features)))
+
+        # Step 8. TODO Combine snmachine features with user defined features
+
+        # Step 9. TODO Create a Random Forest classifier; need to fit model and save it.
+        combined_features = wavelet_features  # For running tests for now
+        classifier, confusion_matrix = create_classifier(combined_features, training_data, directories)
+        print(F"classifier = {classifier}")
+
+        # Step 10. TODO Use saved classifier to make predictions. This can occur using a separate file
diff --git a/utils/plasticc_utils.py b/utils/plasticc_utils.py
index 7b1b45f5..3659cd4b 100644
--- a/utils/plasticc_utils.py
+++ b/utils/plasticc_utils.py
@@ -2,36 +2,45 @@
 Utility script for calculating the log loss
 """
+from sklearn.metrics import confusion_matrix
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.metrics import auc, roc_curve, confusion_matrix
 
-def plotConfusionMatrix(yTrue, yPredict, dataName, targetNames):
-    cm = confusion_matrix(yTrue, yPredict, labels=targetNames)
-    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
+def plot_confusion_matrix(y_true, y_pred, title, target_names, normalize=False):
+    cm = confusion_matrix(y_true, y_pred, labels=target_names)
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
     annot = np.around(cm, 2)
-    fig, ax = plt.subplots(figsize=(9,7))
-    sns.heatmap(cm, xticklabels=targetNames,
-                yticklabels=targetNames, cmap='Blues',
-                annot=annot, lw=0.5)
+    fig, ax = plt.subplots(figsize=(9, 7))
+    sns.heatmap(cm, xticklabels=target_names,
+                yticklabels=target_names, cmap='Blues',
+                annot=annot, lw=0.5)
+
     ax.set_xlabel('Predicted Label')
     ax.set_ylabel('True Label')
     ax.set_aspect('equal')
-    plt.title(dataName)
+    plt.title(title)
+
+    return cm, fig
 
-    return cm
 
-def plasticcLogLoss(y_true, y_pred, relative_class_weights=None):
+def plasticc_log_loss(y_true, y_pred, relative_class_weights=None):
     """
     Implementation of weighted log loss used for the Kaggle challenge
     """
     predictions = y_pred.copy()
 
     # sanitize predictions
-    epsilon = sys.float_info.epsilon # this is machine dependent but essentially prevents log(0)
+    epsilon = sys.float_info.epsilon  # this is machine dependent but essentially prevents log(0)
     predictions = np.clip(predictions, epsilon, 1.0 - epsilon)
     predictions = predictions / np.sum(predictions, axis=1)[:, np.newaxis]
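
Not part of the patch above — a minimal, self-contained sketch of the resampling-plus-classifier pattern that `create_classifier` assembles when `augmentation_method == 'SMOTE'`, which is why `imbalanced-learn>=0.4.3` was added to `environment.yml`. The synthetic data, variable names and everything not visible in the diff (the toy class fractions, sample counts) are illustrative assumptions, not pipeline defaults.

```python
# Standalone sketch (assumptions noted above): SMOTE oversampling wrapped
# around the same RandomForest configuration used in create_classifier().
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

# Imbalanced toy problem standing in for the wavelet feature matrix:
# three classes with roughly 80/15/5 per cent class fractions.
X, y = make_classification(n_samples=500, n_classes=3, n_informative=6,
                           weights=[0.80, 0.15, 0.05], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Oversample every class except the majority one, then fit the forest.
classifier = make_pipeline(
    SMOTE(sampling_strategy='not majority'),
    RandomForestClassifier(n_estimators=700, criterion='entropy',
                           oob_score=True, n_jobs=-1, random_state=42))
classifier.fit(X_train, y_train)

# Same per-class report create_classifier() prints after fitting.
print(classification_report_imbalanced(y_test, classifier.predict(X_test)))
```

With imblearn's `make_pipeline`, the resampling happens only during `fit`, so the held-out scores are computed on untouched test data — the property the pipeline script relies on by wrapping the classifier rather than oversampling the whole feature table up front.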