Skip to content

Commit 0aaf2c6

Browse files
committed
Correct PCA component counting logic
BREAKING CHANGE: The change in component counting directly affects the number of clusters requested in the `main` auto-scaling function. For the same input data, this version may produce a different number of clusters and therefore different final scaling factors compared to previous versions. The new behavior is considered more accurate.
1 parent 96b70cc commit 0aaf2c6

File tree

2 files changed

+148
-48
lines changed

2 files changed

+148
-48
lines changed

src/ert/analysis/misfit_preprocessor.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,19 @@ def get_nr_primary_components(
3232
responses: npt.NDArray[np.float64], threshold: float
3333
) -> int:
3434
"""
35-
Calculate the number of principal components needed to achieve a cumulative
36-
variance less than a specified threshold using Singular Value Decomposition (SVD).
35+
Calculate the number of principal components required
36+
to explain a given amount of variance in the responses.
3737
38-
responses should be on form (n_realizations, n_observations)
38+
Args:
39+
responses: A 2D array of data with shape
40+
(n_realizations, n_observations).
41+
threshold: The cumulative variance threshold to meet or exceed.
42+
For example, a value of 0.95 will find the number of
43+
components needed to explain at least 95% of the total variance.
44+
45+
Returns:
46+
The minimum number of principal components required to meet or exceed
47+
the specified variance threshold.
3948
"""
4049
data_matrix = responses - responses.mean(axis=0)
4150
_, singulars, _ = np.linalg.svd(data_matrix.astype(float), full_matrices=False)
@@ -45,7 +54,10 @@ def get_nr_primary_components(
4554
# sum to get the cumulative proportion of variance explained by each successive
4655
# component.
4756
variance_ratio = np.cumsum(singulars**2) / np.sum(singulars**2)
48-
return max(len([1 for i in variance_ratio[:-1] if i < threshold]), 1)
57+
58+
num_components = np.searchsorted(variance_ratio, threshold, side="left") + 1
59+
60+
return int(num_components)
4961

5062

5163
def cluster_responses(

tests/ert/unit_tests/analysis/test_misfit_preprocessor.py

Lines changed: 132 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,12 @@ def test_that_get_nr_primary_components_is_according_to_theory(p, rho, seed):
5959
X = rng.standard_normal(size=(p, N))
6060
Y = (np.linalg.cholesky(Sigma) @ X).T
6161

62-
# Adding a bit to the thresholds because of numerical accuracy.
63-
assert get_nr_primary_components(Y, threshold_1 + 0.01) == 1
64-
assert get_nr_primary_components(Y, threshold_2 + 0.01) == 2
65-
assert get_nr_primary_components(Y, threshold_3 + 0.01) == 3
62+
# To get 1 component, the threshold must be <= the variance
63+
# of the 1st component.
64+
# Same for the other components.
65+
assert get_nr_primary_components(Y, threshold_1 - 0.01) == 1
66+
assert get_nr_primary_components(Y, threshold_2 - 0.01) == 2
67+
assert get_nr_primary_components(Y, threshold_3 - 0.01) == 3
6668

6769

6870
@pytest.mark.parametrize("nr_observations", [4, 7, 12])
@@ -71,24 +73,19 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
7173
nr_observations,
7274
):
7375
"""
74-
Test the preprocessor's ability to cluster correlated observations.
76+
Test the preprocessor's ability to cluster correlated observations
77+
separately from multiple independent observations.
7578
7679
We create a response matrix with `nr_observations` rows, where the
7780
first `nr_observations - 2` rows are strongly correlated, while the
78-
last two are independent.
79-
80-
We expect the correlated observations to be scaled in one group,
81-
and perhaps surprisingly, the two independent observations to be
82-
scaled together in a second group.
83-
The reason that the two independent responses end up in the same group
84-
is due to the way get_nr_primary_components counts PCA components,
85-
and the fact that the number of PCA components is used as the number
86-
of clusters.
87-
It returns the number of components that explain **less** than the
88-
variance specified as the threshold.
89-
With a threshold of 0.95, the expression used is as follows:
90-
91-
max(len([1 for i in variance_ratio[:-1] if i < 0.95]), 1),
81+
last two are independent of both the main group and each other.
82+
83+
This will result in a request for 3 clusters, correctly separating the
84+
data into its three natural groups:
85+
86+
1. The main group of correlated observations.
87+
2. The first independent observation.
88+
3. The second independent observation.
9289
"""
9390
rng = np.random.default_rng(1234)
9491
nr_realizations = 1000
@@ -99,44 +96,49 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
9996
parameters_b = rng.standard_normal(nr_realizations)
10097
parameters_c = rng.standard_normal(nr_realizations)
10198

102-
Y = np.ones((nr_observations, nr_realizations))
99+
Y = np.zeros((nr_observations, nr_realizations))
103100
for i in range(nr_correlated_obs):
104-
Y[i] = (1 + i) * parameters_a
105-
Y[-1] = 5 + parameters_b
106-
Y[-2] = 10 + parameters_c
101+
Y[i] = (i + 1) * parameters_a
102+
# The last two observations are independent
103+
Y[-2] = 10 + parameters_b
104+
Y[-1] = 5 + parameters_c
107105

108106
obs_errors = Y.std(axis=1)
109107
Y_original = Y.copy()
110108
obs_error_copy = obs_errors.copy()
111109

112110
scale_factors, clusters, nr_components = main(Y, obs_errors)
113111

114-
# Since the first nr_correlated_obs rows of Y are perfectly correlated,
115-
# we only need one principal component to describe all variance.
116-
nr_components_correlated_group = 1
117-
118-
# Because of the way we calculate the number of components
119-
# (see docstring for details), the two undependent responses
120-
# are represented by a single component.
121-
nr_components_uncorrelated_group = 1
112+
# We expect three distinct clusters now.
113+
cluster_label_correlated = clusters[0]
114+
cluster_label_independent_1 = clusters[-2]
115+
cluster_label_independent_2 = clusters[-1]
122116

123-
np.testing.assert_equal(
124-
scale_factors,
125-
np.array(
126-
nr_correlated_obs
127-
* [np.sqrt(nr_correlated_obs / nr_components_correlated_group)]
128-
+ nr_uncorrelated_obs
129-
* [np.sqrt(nr_uncorrelated_obs / nr_components_uncorrelated_group)]
130-
),
131-
)
117+
# Check that the three labels are all different
118+
assert cluster_label_correlated != cluster_label_independent_1
119+
assert cluster_label_correlated != cluster_label_independent_2
120+
assert cluster_label_independent_1 != cluster_label_independent_2
132121

133-
expected_clusters = np.array(nr_correlated_obs * [1] + nr_uncorrelated_obs * [2])
134-
np.testing.assert_equal(clusters, expected_clusters)
122+
# Check that the main group is clustered together
123+
for i in range(nr_correlated_obs):
124+
assert clusters[i] == cluster_label_correlated
135125

136-
expected_nr_components = (nr_uncorrelated_obs + nr_correlated_obs) * [1]
126+
# Correlated group cluster has 1 component.
127+
# The two independent clusters have size 1, so they also get 1 component.
128+
# Therefore, all observations should be associated with 1 component.
129+
expected_nr_components = np.ones(nr_observations, dtype=int)
137130
np.testing.assert_equal(nr_components, expected_nr_components)
138131

139-
# Check that we don`t modify the input data
132+
# For the correlated group: sqrt(num_obs / num_components)
133+
sf_correlated = np.sqrt(nr_correlated_obs / 1.0)
134+
# For the independent groups (size 1): sqrt(1 / 1)
135+
sf_independent = np.sqrt(1.0 / 1.0)
136+
137+
expected_scale_factors = np.array(
138+
[sf_correlated] * nr_correlated_obs + [sf_independent] * nr_uncorrelated_obs
139+
)
140+
np.testing.assert_allclose(scale_factors, expected_scale_factors)
141+
140142
np.testing.assert_equal(Y, Y_original)
141143
np.testing.assert_equal(obs_errors, obs_error_copy)
142144

@@ -168,3 +170,89 @@ def test_that_perfectly_correlated_responses_are_not_scaled(nr_observations):
168170
nr_components,
169171
np.array(nr_observations * [1.0]),
170172
)
173+
174+
175+
@pytest.mark.parametrize(
    "nr_obs_group_a, nr_obs_group_b",
    [
        (3, 2),
        (5, 5),
        (4, 6),
    ],
)
@pytest.mark.integration_test
def test_main_correctly_separates_distinct_correlation_groups(
    nr_obs_group_a, nr_obs_group_b
):
    """
    Build a response matrix holding two mutually independent groups of
    internally correlated observations and verify that the auto-scaling
    `main` function assigns each group to its own cluster.

    - Group A: `nr_obs_group_a` responses that are all scalar multiples
      of one random signal (perfectly correlated with each other).
    - Group B: `nr_obs_group_b` responses built the same way from a
      second signal that is independent of Group A's.
    """
    rng = np.random.default_rng(seed=12345)
    nr_realizations = 1000
    nr_observations = nr_obs_group_a + nr_obs_group_b

    # Two independent base signals, one per correlation group.
    signal_a = rng.standard_normal(nr_realizations)
    signal_b = rng.standard_normal(nr_realizations)

    # Row i within a group is (i + 1) times that group's base signal,
    # so rows inside a group are perfectly correlated.
    Y = np.zeros((nr_observations, nr_realizations))
    Y[:nr_obs_group_a] = np.outer(np.arange(1, nr_obs_group_a + 1), signal_a)
    Y[nr_obs_group_a:] = np.outer(np.arange(1, nr_obs_group_b + 1), signal_b)

    # Per-observation errors are required as input to `main`.
    obs_errors = Y.std(axis=1)

    scale_factors, clusters, nr_components = main(Y, obs_errors)

    # Cluster labels themselves are arbitrary (e.g. 1 vs 2 may swap
    # between runs), so compare the grouping structure dynamically.
    label_a = clusters[0]
    label_b = clusters[nr_obs_group_a]
    assert label_a != label_b, (
        "The two distinct correlation groups should be in different clusters."
    )

    # Every member of a group carries its group's label.
    np.testing.assert_array_equal(
        clusters[:nr_obs_group_a], np.full(nr_obs_group_a, label_a)
    )
    np.testing.assert_array_equal(
        clusters[nr_obs_group_a:], np.full(nr_obs_group_b, label_b)
    )

    # Each cluster is perfectly correlated internally, so the PCA run on
    # each of them should need exactly one principal component.
    np.testing.assert_array_equal(
        nr_components, np.ones(nr_observations, dtype=int)
    )

    # Scale factor per cluster is sqrt(cluster size / nr components),
    # with one component per cluster here.
    expected_scale_factors = np.array(
        [np.sqrt(nr_obs_group_a / 1.0)] * nr_obs_group_a
        + [np.sqrt(nr_obs_group_b / 1.0)] * nr_obs_group_b
    )
    np.testing.assert_allclose(scale_factors, expected_scale_factors)

0 commit comments

Comments
 (0)