Commit

regression dataset
j-bl committed Aug 28, 2023
1 parent 901cc81 commit f203613
Showing 3 changed files with 339 additions and 0 deletions.
52 changes: 52 additions & 0 deletions algebra.py
@@ -0,0 +1,52 @@
import numpy as np

def has_leftinverse(matrix):
    """Returns whether the matrix is left-invertible, i.e., whether a matrix
    A' exists for the given matrix A such that A'Ax=x for all x.
    :param matrix: Matrix as np.array.
    :return: Whether the given matrix has a left inverse.
    :rtype: boolean.
    """

    # A matrix can only have a left inverse if it is of full column rank.
    m, n = matrix.shape  # rows, columns
    _, s, _ = np.linalg.svd(matrix)
    rank = np.sum(s > np.finfo(matrix.dtype).eps)

    return rank == n and n <= m

def random_orthogonal_matrix(n, complex=False, seed=None):
    """Returns a random orthogonal matrix distributed with Haar measure.
    To ensure randomness, we have to draw from the distribution induced by the
    Haar measure. The calculation follows the results of:
    Mezzadri, Francesco: How to generate random matrices from the classical
    compact groups. In: Notices of the AMS, Vol. 54, No. 5 (2007).
    URL: https://arxiv.org/pdf/math-ph/0609050.
    :param n: The returned matrix has dimensions nxn.
    :param complex: Whether or not the returned matrix contains complex numbers.
    :param seed: If int, the seed used to generate reproducible results.
    :return: Random matrix distributed with Haar measure.
    :rtype: np.array containing floats.
    """

    if seed is not None:
        np.random.seed(seed)

    # The original algorithm provided by Mezzadri is only defined for complex
    # initialization.
    if complex:
        z = (np.random.randn(n, n) + 1j * np.random.randn(n, n)) / np.lib.scimath.sqrt(2.0)
    else:
        z = np.random.randn(n, n)
    q, r = np.linalg.qr(z)
    d = np.diagonal(r)
    ph = d / np.absolute(d)
    q = np.multiply(q, ph, q)

    return q
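
A quick usage sketch (not part of the commit), assuming only numpy and the two helpers above:

# Draw a Haar-distributed orthogonal matrix and check its properties.
q = random_orthogonal_matrix(4, seed=0)
assert np.allclose(q @ q.T, np.eye(4))  # orthogonality: Q Q^T = I
assert has_leftinverse(q)               # square, full rank => left-invertible
# A tall submatrix keeps full column rank, so it is still left-invertible.
assert has_leftinverse(q[:, :2])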
108 changes: 108 additions & 0 deletions make_regression.py
@@ -0,0 +1,108 @@
"""
This file contains a modified version of the make_regression() function of
scikit-learn.
We modify the function in order to create a regression dataset with
standard-uniform coefficients. The original scikit-learn implementation scales
the standard-uniformly generated coefficients by a factor of 100.
scikit-learn was published under the following license:
BSD 3-Clause License
Copyright (c) 2007-2023 The scikit-learn developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import numpy as np
from sklearn.utils import check_random_state
from sklearn.utils import shuffle as util_shuffle
from sklearn.datasets import make_low_rank_matrix

def make_regression(
    n_samples=100,
    n_features=100,
    *,
    n_informative=10,
    n_targets=1,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=0.0,
    shuffle=True,
    coef=False,
    random_state=None,
):
    n_informative = min(n_features, n_informative)
    generator = check_random_state(random_state)

    if effective_rank is None:
        # Randomly generate a well-conditioned input set
        X = generator.standard_normal(size=(n_samples, n_features))

    else:
        # Randomly generate a low-rank, fat-tail input set
        X = make_low_rank_matrix(
            n_samples=n_samples,
            n_features=n_features,
            effective_rank=effective_rank,
            tail_strength=tail_strength,
            random_state=generator,
        )

    # Generate a ground-truth model with only n_informative features being
    # non-zero (the other features are not correlated to y and should be
    # ignored by sparsifying regularizers such as L1 or elastic net).
    # CHANGES: scikit-learn multiplies the uniformly drawn coefficients by a
    # factor of 100. We avoid that.
    ground_truth = np.zeros((n_features, n_targets))
    ground_truth[:n_informative, :] = generator.uniform(
        size=(n_informative, n_targets)
    )

    y = np.dot(X, ground_truth) + bias

    # Add noise
    if noise > 0.0:
        y += generator.normal(scale=noise, size=y.shape)

    # Randomly permute samples and features
    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

        indices = np.arange(n_features)
        generator.shuffle(indices)
        X[:, :] = X[:, indices]
        ground_truth = ground_truth[indices]

    y = np.squeeze(y)

    if coef:
        return X, y, np.squeeze(ground_truth)

    else:
        return X, y
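
A brief sketch (not part of the commit) of calling the modified generator; the final check reflects the change described above, i.e. the coefficients stay standard-uniform instead of being scaled by 100:

X, y, coef = make_regression(
    n_samples=200,
    n_features=10,
    n_informative=4,
    noise=0.1,
    coef=True,
    random_state=0,
)
# Would almost surely fail for stock scikit-learn, whose coefficients
# lie in [0, 100).
assert ((coef >= 0.0) & (coef < 1.0)).all()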
179 changes: 179 additions & 0 deletions regression_dataset.py
@@ -0,0 +1,179 @@
import os
import numpy as np
from toy_data.algebra import random_orthogonal_matrix
from toy_data.sklearn_adaptions import make_regression
from dataset_utils import get_dataset_from_arrays

def make_regression_dataset(high_dim_transform=True, n_features_low_dim=4, n_uninformative_low_dim=0, n_high_dim=128,
                            noise_on_high_dim_snrdb=None, noise_on_output=0.0, n_train=50000, n_test=10000,
                            n_validation=10000, normalize=False, seed=None, batch_size=10, log_coefs=False):
"""
The input variables are standard normal distributed and the coefficients
of the regression task follow a standard uniform distribution. The bias
of the regression problem is set to zero.
Since a linear combination of normal distributed random variables is
again normal distributed, the target variable of the regresssion is also
normal distributed.
The random variables are distributed similarly to the input variables.
high_dim_transform:
Whether a high-dimensional transformation shall be performed. In case
no high-dimensional transformation is desired, all inputs like
n_high_dim, ... are ignored and instead the identity transformation
is used.
noise_on_high_dim_snrdb is still used to add noise after the identity
transformation.
n_low_dim:
Number of informative low-dimensional variables.
n_informative_low_dim:
Number of uninformative low-dimensional variables.
noise_on_high_dim_snrdb:
Additive gaussian noise is applied to the regression input variables after transformation into
a higher dimension. Here, the variance of each input variable is determined and the noise is
added so that the SNR corresponds to the given value (in dB). A reasonable value can for instance
be 10 or 40.
noise_on_output:
Standard deviation of the gaussian noise (zero mean) applied to the output (before normalization).
"""

    # Generate random regression problem.
    n_samples = n_train + n_validation + n_test
    features, output, coefficients = _regression_dataset(n_features=n_features_low_dim,
                                                         n_uninformative=n_uninformative_low_dim,
                                                         n_samples=n_samples,
                                                         noise_on_output=noise_on_output,
                                                         seed=seed)
    if log_coefs:
        # coefficients is an np.array of floats; convert to strings before
        # writing (f.writelines() does not accept floats).
        with open(os.path.join(os.getcwd(), "lowdim_regression_coefficients.list"), 'w') as f:
            f.writelines(str(c) + "\n" for c in coefficients)

    # Normalization.
    # Normalize outputs to mean zero and unit variance.
    if normalize:
        output = output - np.mean(output)
        output = output / np.std(output)

    # Expand to a high-dimensional problem.
    # If not desired, we apply the identity transformation.
    if high_dim_transform:
        features, transformation_matrix = _inverse_pca_dataset(features, n_high_dim, seed=seed)
    else:
        if n_high_dim is not None:
            raise ValueError("When no dimensionality expansion is performed, the number of high dimensional features should not be set.")
        transformation_matrix = np.identity(n=n_features_low_dim)

    # Add noise if specified.
    if noise_on_high_dim_snrdb is not None:
        # Calculation of the noise variance from the signal SNR in dB:
        # SNR = E(S^2) / E(N^2)
        # <-> E(N^2) = E(S^2) / SNR
        # Our noise is mean-zero:
        # <-> Noise Variance = E(S^2) / SNR
        # With SNR = 10^(SNR in dB / 10):
        # <-> Noise Variance = E(S^2) / (10^(SNR in dB / 10))
        #
        if noise_on_high_dim_snrdb == 0.0:
            raise ValueError("An SNR of zero equals infinite noise. For no noise, specify 'None'.")
        signal_second_moments = np.mean(features ** 2, axis=0)
        noise_variances = signal_second_moments / (10 ** (noise_on_high_dim_snrdb / 10))
        noise_stds = np.sqrt(noise_variances)

        np.random.seed(seed=seed)
        # Add the noise once ("features += features + noise" would double the
        # signal on top of adding the noise).
        features = features + np.random.normal(loc=0.0, scale=noise_stds, size=features.shape)

    # Divide into train, test, validation.
    _, train_dataloader, _, test_dataloader, _, validation_dataloader = get_dataset_from_arrays(
        train_features=features[:n_train],
        train_outputs=output[:n_train],
        test_features=features[n_train:n_train + n_test],
        test_outputs=output[n_train:n_train + n_test],
        validation_features=features[n_train + n_test:],
        validation_outputs=output[n_train + n_test:],
        batch_size=batch_size)

    return (train_dataloader, test_dataloader, validation_dataloader, transformation_matrix)
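
For intuition (not part of the commit), a tiny numeric check of the SNR conversion used above: a unit-power signal at 10 dB yields a noise variance of 0.1.

# Hypothetical sketch of the SNR-to-noise-variance conversion.
signal_second_moment = 1.0  # E(S^2)
snr_db = 10.0
noise_variance = signal_second_moment / (10 ** (snr_db / 10))
assert abs(noise_variance - 0.1) < 1e-12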

def _regression_dataset(n_features, n_samples, n_uninformative=0, noise_on_output=0.0, seed=None):
    """Generates a random regression dataset with n_features parameters.
    :param n_features: Number of coefficients of the regression problem /
    dimensions of the input.
    :param n_uninformative: Number of noise variables (uninformative
    variables). The uninformative variables are distributed like the input
    variables.
    :param n_samples: Number of samples generated for the regression problem.
    :param seed: If int, the seed used to generate a reproducible regression
    dataset.
    :return: Generated samples and the coefficients of the regression problem.
    :rtype: Tuple of (inputs of the generated samples, outputs of the generated
    samples, coefficients of the regression problem used to generate the
    samples).
    """

    # n_samples: Number of observations to estimate the coefficients
    # n_features: Total number of features, potentially not all relevant for
    #     the regression (number of dimensions of the input vector)
    # n_informative: Number of informative features (=coefficients), that is,
    #     the features that are used to build the linear model
    # n_targets: Number of dimensions of the output vector
    # noise: Standard deviation of the Gaussian noise applied to the output
    features, output, coef = make_regression(n_samples=n_samples,
                                             n_features=n_features + n_uninformative,
                                             n_informative=n_features,
                                             n_targets=1,
                                             noise=noise_on_output,
                                             coef=True,
                                             random_state=seed)

    return (features, output, coef)
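
A quick sketch (not part of the commit) of what _regression_dataset returns; with 3 informative and 2 uninformative variables the feature matrix has 5 columns, and (almost surely) exactly the two uninformative coefficients are zero:

features, output, coef = _regression_dataset(n_features=3, n_samples=100,
                                             n_uninformative=2, seed=0)
assert features.shape == (100, 5)
assert (coef == 0.0).sum() == 2  # the uninformative coefficients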

def _inverse_pca_dataset(features, n_high_dim_features, seed=None):
    """Interprets the input data as the output of a random PCA and transforms
    it with the inverse transformation of that random PCA.
    :param features: Dataset to transform to a higher dimension. Rows are
    interpreted as observations and columns as input features.
    :param n_high_dim_features: Number of output features per observation.
    :param seed: If int, the seed used to generate a reproducible dataset.
    :return: Transformed dataset and matrix used for transformation.
    :rtype: Tuple of (transformed dataset, transformation matrix).
    """
    # Generate the transformation to the higher dimension.
    # For theoretical reasons the change-of-basis matrix has to have a left
    # inverse, but when doing PCA this is guaranteed even if we only care
    # about the first k eigenvectors - in this case, the first k columns.
    n_low_dim_features = features.shape[1]
    change_of_basis_matrix = random_orthogonal_matrix(n_high_dim_features, seed=seed)
    transformation_matrix = change_of_basis_matrix[:, :n_low_dim_features]

    # Apply the transformation to the observational data.
    high_dim_features = _matrix_transform_dataset(features, transformation_matrix)

    # Standardize the high-dimensional data.
    # high_dim_features = StandardScaler().fit_transform(high_dim_features)

    return (high_dim_features, transformation_matrix)

def _matrix_transform_dataset(features, transformation_matrix):
    """Applies a given transformation matrix A to a dataset.
    The transformation matrix is applied to the rows x of the dataset via Ax=x'.
    :param features: Dataset to apply the transformation to. Rows are
    interpreted as observations and columns as input features.
    :param transformation_matrix: Matrix used for transformation.
    :return: Transformed input dataset.
    :rtype: np.array
    """

    # features: rows are observations, columns are features.
    n_dim_features = features.shape[1]

    assert transformation_matrix.shape[1] == n_dim_features

    # Instead of individually applying the change-of-basis matrix via B*x=x',
    # we can apply it to all observations at once via X*B^T=X'.
    transformed_features = features @ transformation_matrix.T

    return transformed_features
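
A small sketch (not part of the commit) of the batched change-of-basis identity used above, i.e. that X @ B.T applies B to every row of X at once:

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3))  # 5 observations, 3 features
B = rng.standard_normal((8, 3))  # maps 3 -> 8 dimensions
batched = X @ B.T
rowwise = np.stack([B @ x for x in X])
assert np.allclose(batched, rowwise)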
