Adding greater freedoms for the user to modify the data preprocessing (#3)

* initial work to increase user freedoms

* initial commit to preprocess kwargs branch

* updated gui to work with new gui_configuration files

* adding preprocessing kwarg files to released models

* flake8 fix

* bug fix to pull gui_config file

* removing preprocess files

* replacement preprocessing files

* updating gui config files

* updated tests and temporary modification to downloads code

* updated testing

* bug fix in preprocessing

* updated testing

* updated tests and improved coverage

* increasing test coverage

* ensuring output of eval is always an array

* bumping version number and minor changes to README text

* updating tutorial documentation

* restoring downloads code before merge to master
htjb authored Jul 6, 2021
1 parent 783856e commit d21955e
Showing 20 changed files with 211 additions and 190 deletions.
11 changes: 5 additions & 6 deletions README.rst
@@ -7,7 +7,7 @@ Introduction

 :globalemu: Robust Global 21-cm Signal Emulation
 :Author: Harry Thomas Jones Bevins
-:Version: 1.3.1
+:Version: 1.4.0
 :Homepage: https://github.com/htjb/globalemu
 :Documentation: https://globalemu.readthedocs.io/
@@ -100,11 +100,9 @@ Results are accessed via 'res.z' and 'res.signal'.
 The code can also be used to train a network on your own Global 21-cm signal
 or neutral fraction simulations using the built in ``globalemu`` pre-processing
 techniques. There is some flexibility on the required astrophysical input
-parameters but the models are required to subscribe to the astrophysics free
-baseline calculation detailed in the ``globalemu`` paper (see below for a reference).
+parameters and the pre-processing steps, which are detailed in the documentation.
 More details about training your own network can be found in the documentation.

-
 ``globalemu`` GUI
 -----------------
@@ -144,8 +142,9 @@ An image of the GUI is shown below.
     :alt: graphical user interface

 The GUI can also be used to investigate the physics of the neutral fraction
-history by generating a configuration file for the released trained model and
-setting the kwarg ``xHI=True`` in gui_config.config().
+history by generating a configuration file for the released trained model.
+There is no need to specify that the configuration file is for a neutral
+fraction emulator.

 Configuration files for the released models are provided.
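For orientation, a minimal sketch of evaluating the released model under the new interface. The parameter values below are purely illustrative, and the tuple return follows evaluate.__call__() in globalemu/eval.py as changed in this commit:

    from globalemu.eval import evaluate

    # Illustrative parameter values, ordered as in
    # T_release/gui_configuration.csv: f_*, V_c, f_X, tau, alpha,
    # nu_min, R_mfp.
    params = [1e-3, 46.5, 1e-2, 0.0775, 1.25, 1.5, 30.0]

    predictor = evaluate(base_dir='T_release/')  # no xHI kwarg needed
    signal, z = predictor(params)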
16 changes: 8 additions & 8 deletions T_release/gui_configuration.csv
@@ -1,8 +1,8 @@
-names,mins,maxs,label_min,label_max,logs,xHI
-$\log(f_*)$,-3.4579971262630043,-0.3010299956639812,-246.84562,32.171596,0,False
-$\log(V_c)$,0.6232492903979004,1.8836614351536176,,,1,
-$\log(f_X)$,-6.0,0.9977593286204041,,,2,
-$\tau$,0.05550117,0.09999531,,,--,
-$\alpha$,1.0,1.5,,,--,
-$\nu_\mathrm{min}$,0.1,3.0,,,--,
-$R_\mathrm{mfp}$,10.0,50.0,,,--,
+names,mins,maxs,label_min,label_max,logs,ylabel
+$\log(f_*)$,-3.4579971262630043,-0.3010299956639812,-246.84562,32.171596,0,$\delta T$ [mK]
+$\log(V_c)$,0.6232492903979004,1.8836614351536176,,,1,$\delta T$ [mK]
+$\log(f_X)$,-6.0,0.9977593286204041,,,2,$\delta T$ [mK]
+$\tau$,0.05550117,0.09999531,,,--,$\delta T$ [mK]
+$\alpha$,1.0,1.5,,,--,$\delta T$ [mK]
+$\nu_\mathrm{min}$,0.1,3.0,,,--,$\delta T$ [mK]
+$R_\mathrm{mfp}$,10.0,50.0,,,--,$\delta T$ [mK]
Binary file added T_release/preprocess_settings.pkl
Binary file not shown.
9 changes: 8 additions & 1 deletion docs/source/tutorial.rst
@@ -9,7 +9,8 @@ of ``globalemu``. If you are just interested in evaluating the released models
 then take a look at the second part towards the bottom of the page.
 If you are intending to work with neutral fraction histories then the
 framework for training and evaluating models is identical; you just need to pass
-the kwarg ``xHI=True`` to all of the ``globalemu`` functions.
+the kwarg ``xHI=True`` to the pre-processing function, ``process()``,
+and model building function, ``nn()``, discussed below.

 The tutorial can also be found as a Jupyter notebook
 `here <https://mybinder.org/v2/gh/htjb/globalemu/master?filepath=notebooks%2F>`__.
@@ -70,6 +71,12 @@ for the neural network. It also saves some files used for normalisation in
 the ``base_dir`` so that when evaluating the network the inputs and outputs
 can be properly dealt with.

+By default the pre-processing subtracts an astrophysics free baseline from
+the models and resamples the signals at a higher rate in regions of high
+variation across the training data. Both of these techniques are detailed in
+the ``globalemu`` MNRAS preprint. Users can prevent this behaviour by passing
+the kwargs ``AFB=False`` and ``resampling=False`` to ``process()`` if required.
+
 Once pre-processing has been performed we can train our network with the
 ``nn()`` function in ``globalemu.network``.
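To make the new options concrete, a hedged sketch of the training workflow described above; the data location, redshift grid and training size are placeholders, and process() is assumed to live in globalemu.preprocess as in the repository layout:

    import numpy as np
    from globalemu.preprocess import process
    from globalemu.network import nn

    z_grid = np.arange(5, 50.1, 0.1)  # placeholder redshift grid

    # Pre-process 1000 hypothetical training signals stored in 'data/'.
    # AFB subtraction and resampling are on by default; the new kwargs
    # switch them off. Pass xHI=True here for neutral fraction histories.
    process(1000, z_grid, data_location='data/', base_dir='model_dir/',
            AFB=False, resampling=False)

    # Train the network on the pre-processed data (likewise xHI=True if
    # emulating neutral fraction histories).
    nn(batch_size=451, epochs=10, base_dir='model_dir/')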
5 changes: 3 additions & 2 deletions globalemu/downloads.py
@@ -36,7 +36,8 @@ def model(self):

         files = [
             'model.h5', 'data_mins.txt', 'data_maxs.txt', 'samples.txt',
-            'cdf.txt', 'z.txt', 'kwargs.txt',
+            'cdf.txt', 'z.txt', 'kwargs.txt', 'preprocess_settings.pkl',
+            'gui_configuration.csv',
             'AFB_norm_factor.npy', 'labels_stds.npy', 'AFB.txt']

         if self.xHI is False:
@@ -49,7 +50,7 @@ def model(self):
                 'htjb/globalemu/master/xHI_release/'

         for i in range(len(files)):
-            if i > 6 and self.xHI is True:
+            if i > 8 and self.xHI is True:
                 break
             r = requests.get(base_url + files[i])
             open(base_dir + files[i], 'wb').write(r.content)
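For context, a hedged sketch of how this download routine is typically invoked; the download class name is an assumption based on the repository's globalemu.downloads module and the xHI flag shown above:

    from globalemu.downloads import download

    # Fetch the released 21-cm signal model files listed above into
    # T_release/.
    download().model()

    # Fetch the neutral fraction release; the loop above stops after the
    # first nine files (i > 8) because the xHI release omits the AFB and
    # standard deviation files.
    download(xHI=True).model()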
34 changes: 21 additions & 13 deletions globalemu/eval.py
@@ -14,6 +14,7 @@
 from tensorflow import keras
 from tensorflow.keras import backend as K
 import gc
+import pickle


 class evaluate():
@@ -29,10 +30,6 @@ class evaluate():

     **kwargs:**

-        xHI: **Bool / default: False**
-            | If True then ``globalemu`` will act as if it is evaluating a
-              neutral fraction history emulator.
-
         base_dir: **string / default: 'model_dir/'**
             | The ``base_dir`` is where the trained model is saved.
@@ -120,20 +117,22 @@ def __init__(self, **kwargs):

         for key, values in kwargs.items():
             if key not in set(
-                    ['xHI', 'base_dir', 'model', 'logs', 'gc', 'z']):
+                    ['base_dir', 'model', 'logs', 'gc', 'z']):
                 raise KeyError("Unexpected keyword argument in evaluate()")

-        self.xHI = kwargs.pop('xHI', False)
-
         self.base_dir = kwargs.pop('base_dir', 'model_dir/')
         if type(self.base_dir) is not str:
             raise TypeError("'base_dir' must be a string.")
         elif self.base_dir.endswith('/') is False:
             raise KeyError("'base_dir' must end with '/'.")

+        file = open(self.base_dir + "preprocess_settings.pkl", "rb")
+        self.preprocess_settings = pickle.load(file)
+
         self.data_mins = np.loadtxt(self.base_dir + 'data_mins.txt')
         self.data_maxs = np.loadtxt(self.base_dir + 'data_maxs.txt')
-        self.cdf = np.loadtxt(self.base_dir + 'cdf.txt')
+        if self.preprocess_settings['resampling'] is True:
+            self.cdf = np.loadtxt(self.base_dir + 'cdf.txt')

         self.model = kwargs.pop('model', None)
         if self.model is None:
@@ -146,14 +145,15 @@ def __init__(self, **kwargs):
                 raise TypeError("'logs' must be a list.")
         self.garbage_collection = kwargs.pop('gc', False)

-        boolean_kwargs = [self.garbage_collection, self.xHI]
-        boolean_strings = ['gc', 'xHI']
+        boolean_kwargs = [self.garbage_collection]
+        boolean_strings = ['gc']
         for i in range(len(boolean_kwargs)):
             if type(boolean_kwargs[i]) is not bool:
                 raise TypeError("'" + boolean_strings[i] + "' must be a bool.")

-        if self.xHI is False:
+        if self.preprocess_settings['AFB'] is True:
             self.AFB = np.loadtxt(self.base_dir + 'AFB.txt')
+        if self.preprocess_settings['std_division'] is True:
             self.label_stds = np.load(self.base_dir + 'labels_stds.npy')

         self.original_z = np.loadtxt(self.base_dir + 'z.txt')
@@ -200,7 +200,11 @@ def __call__(self, parameters):
                 (self.data_maxs[i] - self.data_mins[i])
                 for i in range(params.shape[1])]).T

-        norm_z = np.interp(self.z, self.original_z, self.cdf)
+        if self.preprocess_settings['resampling'] is True:
+            norm_z = np.interp(self.z, self.original_z, self.cdf)
+        else:
+            norm_z = (self.z - self.original_z.min()) / \
+                (self.original_z.max() - self.original_z.min())
         if isinstance(norm_z, np.ndarray):
             if len(normalised_params.shape) == 1:
                 x = np.tile(normalised_params, (len(norm_z), 1))
@@ -229,14 +233,18 @@ def __call__(self, parameters):
             result = self.model(x[np.newaxis, :], training=False).numpy()
             evaluation = result[0][0]

-        if self.xHI is False:
+        if self.preprocess_settings['std_division'] is True:
             if isinstance(evaluation, np.ndarray):
                 evaluation = [
                     evaluation[i]*self.label_stds
                     for i in range(evaluation.shape[0])]
             else:
                 evaluation *= self.label_stds
+
+        if self.preprocess_settings['AFB'] is True:
             evaluation += np.interp(self.z, self.original_z, self.AFB)

+        if type(evaluation) is not np.ndarray:
+            evaluation = np.array(evaluation)
+
         return evaluation, self.z
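For reference, a short sketch of the preprocess_settings.pkl dictionary this class unpickles; the three keys are taken from the lookups above, while the values and the writing side are assumptions about what the pre-processing stage saves:

    import pickle

    # Assumed contents of base_dir + 'preprocess_settings.pkl'; the keys
    # match the lookups in evaluate() above.
    settings = {'AFB': True, 'resampling': True, 'std_division': True}

    with open('model_dir/preprocess_settings.pkl', 'wb') as f:
        pickle.dump(settings, f)

    # evaluate() then reads the settings back at construction time.
    with open('model_dir/preprocess_settings.pkl', 'rb') as f:
        preprocess_settings = pickle.load(f)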
21 changes: 10 additions & 11 deletions globalemu/gui_config.py
@@ -16,6 +16,7 @@

 import numpy as np
 import pandas as pd
+import pickle


 class config():
@@ -51,10 +52,6 @@ class config():

     **Kwargs:**

-        xHI: **Bool / default: False**
-            | If True then ``globalemu`` will act as if it is evaluating a
-              neutral fraction history emulator.
-
         logs: **list / default: [0, 1, 2]**
             | The indices corresponding to the astrophysical
                 parameters that
@@ -64,20 +61,23 @@ class config():
                 :math:`{V_c}` (minimum virial circular velocity) and
                 :math:`{f_x}` (X-ray efficiency).

+        ylabel: **string / default: 'y'**
+            | y-axis label for the gui plot.
+
     """

     def __init__(self, base_dir, paramnames, data_dir, **kwargs):

         for key, values in kwargs.items():
             if key not in set(
-                    ['xHI', 'logs']):
+                    ['logs', 'ylabel']):
                 raise KeyError("Unexpected keyword argument in config()")

         self.base_dir = base_dir
         self.paramnames = paramnames
         self.data_dir = data_dir
         self.logs = kwargs.pop('logs', [0, 1, 2])
-        self.xHI = kwargs.pop('xHI', False)
+        self.ylabel = kwargs.pop('ylabel', 'y')

         file_kwargs = [self.base_dir, self.data_dir]
         file_strings = ['base_dir', 'data_dir']
@@ -87,12 +87,12 @@ def __init__(self, base_dir, paramnames, data_dir, **kwargs):
             elif file_kwargs[i].endswith('/') is False:
                 raise KeyError("'" + file_strings[i] + "' must end with '/'.")

+        file = open(self.base_dir + "preprocess_settings.pkl", "rb")
+        self.preprocess_settings = pickle.load(file)
+
         if type(self.paramnames) is not list:
             raise TypeError("'paramnames' must be a list of strings.")

-        if type(self.xHI) is not bool:
-            raise TypeError("'xHI' must be a bool.")
-
         if type(self.logs) is not list:
             raise TypeError("'logs' must be a list.")

@@ -123,7 +123,6 @@ def __init__(self, base_dir, paramnames, data_dir, **kwargs):
             'label_max':
                 [test_labels.max()] + ['']*(len(data_maxs)-1),
             'logs': full_logs,
-            'xHI':
-                [self.xHI] + ['']*(len(data_maxs)-1)})
+            'ylabel': self.ylabel})

         df.to_csv(base_dir + 'gui_configuration.csv', index=False)
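A hedged example of generating a GUI configuration file with the new ylabel kwarg in place of xHI; the directories below are illustrative and the parameter names mirror T_release/gui_configuration.csv:

    from globalemu.gui_config import config

    paramnames = [r'$\log(f_*)$', r'$\log(V_c)$', r'$\log(f_X)$',
                  r'$\tau$', r'$\alpha$', r'$\nu_\mathrm{min}$',
                  r'$R_\mathrm{mfp}$']

    # data_dir points at a hypothetical directory holding the training data.
    config('T_release/', paramnames, '21cmGEM_data/',
           ylabel=r'$\delta T$ [mK]')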
22 changes: 15 additions & 7 deletions globalemu/network.py
@@ -91,6 +91,15 @@ class nn():
             | If True then ``globalemu`` will act as if it is training a
               neutral fraction history emulator.

+        output_activation: **string / default: 'linear'**
+            | Determines the output activation function for the network.
+              Modifying this is useful if the emulator output is required
+              to be positive or negative etc. If ``xHI`` is True then the
+              output activation is set to 'relu', else the function is
+              'linear'. See the tensorflow documentation for more details
+              on the types of activation functions available.
+
         resume: **Bool / default: False**
             | If set to ``True`` then ``globalemu`` will look in the
               ``base_dir`` for a trained model and ``loss_history.txt``
@@ -123,7 +132,7 @@ def __init__(self, **kwargs):
                     'lr', 'dropout', 'input_shape',
                     'output_shape', 'layer_sizes', 'base_dir',
                     'early_stop', 'early_stop_lim', 'xHI', 'resume',
-                    'random_seed']):
+                    'random_seed', 'output_activation']):
                 raise KeyError("Unexpected keyword argument in nn()")

         self.resume = kwargs.pop('resume', False)

train_dataset = train_dataset.map(pack_features_vector)

self.output_activation = kwargs.pop('output_activation', 'linear')
if self.xHI is True:
self.output_activation = 'relu'

if self.resume is True:
model = keras.models.load_model(
self.base_dir + 'model.h5',
compile=False)
elif self.xHI is False:
model = network_models().basic_model(
self.input_shape, self.output_shape,
self.layer_sizes, self.activation, self.drop_val,
'linear')
else:
model = network_models().basic_model(
self.input_shape, self.output_shape,
self.layer_sizes, self.activation, self.drop_val,
'relu')
self.output_activation)

def loss(model, x, y, training):
y_ = tf.transpose(model(x, training=training))[0]
Expand Down
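A brief hedged example of the new kwarg; the batch size, epoch count and base_dir are placeholders:

    from globalemu.network import nn

    # Constrain the emulator output to be positive via a relu output layer;
    # output_activation is the kwarg introduced in this commit (passing
    # xHI=True would force 'relu' regardless).
    nn(batch_size=451, epochs=10, base_dir='model_dir/',
       output_activation='relu')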