Skip to content

Commit 0aaf2c6

Browse files
committed
Correct PCA component counting logic
BREAKING CHANGE: The change in component counting directly affects the number of clusters requested in the `main` auto-scaling function. For the same input data, this version may produce a different number of clusters and therefore different final scaling factors compared to previous versions. The new behavior is considered more accurate.
1 parent 96b70cc commit 0aaf2c6

File tree

2 files changed

+148
-48
lines changed

2 files changed

+148
-48
lines changed

src/ert/analysis/misfit_preprocessor.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,19 @@ def get_nr_primary_components(
3232
responses: npt.NDArray[np.float64], threshold: float
3333
) -> int:
3434
"""
35-
Calculate the number of principal components needed to achieve a cumulative
36-
variance less than a specified threshold using Singular Value Decomposition (SVD).
35+
Calculate the number of principal components required
36+
to explain a given amount of variance in the responses.
3737
38-
responses should be on form (n_realizations, n_observations)
38+
Args:
39+
responses: A 2D array of data with shape
40+
(n_realizations, n_observations).
41+
threshold: The cumulative variance threshold to meet or exceed.
42+
For example, a value of 0.95 will find the number of
43+
components needed to explain at least 95% of the total variance.
44+
45+
Returns:
46+
The minimum number of principal components required to meet or exceed
47+
the specified variance threshold.
3948
"""
4049
data_matrix = responses - responses.mean(axis=0)
4150
_, singulars, _ = np.linalg.svd(data_matrix.astype(float), full_matrices=False)
@@ -45,7 +54,10 @@ def get_nr_primary_components(
4554
# sum to get the cumulative proportion of variance explained by each successive
4655
# component.
4756
variance_ratio = np.cumsum(singulars**2) / np.sum(singulars**2)
48-
return max(len([1 for i in variance_ratio[:-1] if i < threshold]), 1)
57+
58+
num_components = np.searchsorted(variance_ratio, threshold, side="left") + 1
59+
60+
return int(num_components)
4961

5062

5163
def cluster_responses(

tests/ert/unit_tests/analysis/test_misfit_preprocessor.py

Lines changed: 132 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,12 @@ def test_that_get_nr_primary_components_is_according_to_theory(p, rho, seed):
5959
X = rng.standard_normal(size=(p, N))
6060
Y = (np.linalg.cholesky(Sigma) @ X).T
6161

62-
# Adding a bit to the thresholds because of numerical accuracy.
63-
assert get_nr_primary_components(Y, threshold_1 + 0.01) == 1
64-
assert get_nr_primary_components(Y, threshold_2 + 0.01) == 2
65-
assert get_nr_primary_components(Y, threshold_3 + 0.01) == 3
62+
# To get 1 component, the threshold must be <= the variance
63+
# of the 1st component.
64+
# Same for the other components.
65+
assert get_nr_primary_components(Y, threshold_1 - 0.01) == 1
66+
assert get_nr_primary_components(Y, threshold_2 - 0.01) == 2
67+
assert get_nr_primary_components(Y, threshold_3 - 0.01) == 3
6668

6769

6870
@pytest.mark.parametrize("nr_observations", [4, 7, 12])
@@ -71,24 +73,19 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
7173
nr_observations,
7274
):
7375
"""
74-
Test the preprocessor's ability to cluster correlated observations.
76+
Test the preprocessor's ability to cluster correlated observations
77+
separately from multiple independent observations.
7578
7679
We create a response matrix with `nr_observations` rows, where the
7780
first `nr_observations - 2` rows are strongly correlated, while the
78-
last two are independent.
79-
80-
We expect the correlated observations to be scaled in one group,
81-
and perhaps surprisingly, the two independent observations to be
82-
scaled together in a second group.
83-
The reason that the two independent responses end up in the same group
84-
is due to the way get_nr_primary_components counts PCA components,
85-
and the fact that the number of PCA components is used as the number
86-
of clusters.
87-
It returns the number of components that explain **less** than the
88-
variance specified as the threshold.
89-
With a threshold of 0.95, the expression used is as follows:
90-
91-
max(len([1 for i in variance_ratio[:-1] if i < 0.95]), 1),
81+
last two are independent of both the main group and each other.
82+
83+
This will result in a request for 3 clusters, correctly separating the
84+
data into its three natural groups:
85+
86+
1. The main group of correlated observations.
87+
2. The first independent observation.
88+
3. The second independent observation.
9289
"""
9390
rng = np.random.default_rng(1234)
9491
nr_realizations = 1000
@@ -99,44 +96,49 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
9996
parameters_b = rng.standard_normal(nr_realizations)
10097
parameters_c = rng.standard_normal(nr_realizations)
10198

102-
Y = np.ones((nr_observations, nr_realizations))
99+
Y = np.zeros((nr_observations, nr_realizations))
103100
for i in range(nr_correlated_obs):
104-
Y[i] = (1 + i) * parameters_a
105-
Y[-1] = 5 + parameters_b
106-
Y[-2] = 10 + parameters_c
101+
Y[i] = (i + 1) * parameters_a
102+
# The last two observations are independent
103+
Y[-2] = 10 + parameters_b
104+
Y[-1] = 5 + parameters_c
107105

108106
obs_errors = Y.std(axis=1)
109107
Y_original = Y.copy()
110108
obs_error_copy = obs_errors.copy()
111109

112110
scale_factors, clusters, nr_components = main(Y, obs_errors)
113111

114-
# Since the first nr_correlated_obs rows of Y are perfectly correlated,
115-
# we only need one principal component to describe all variance.
116-
nr_components_correlated_group = 1
117-
118-
# Because of the way we calculate the number of components
119-
# (see docstring for details), the two undependent responses
120-
# are represented by a single component.
121-
nr_components_uncorrelated_group = 1
112+
# We expect three distinct clusters now.
113+
cluster_label_correlated = clusters[0]
114+
cluster_label_independent_1 = clusters[-2]
115+
cluster_label_independent_2 = clusters[-1]
122116

123-
np.testing.assert_equal(
124-
scale_factors,
125-
np.array(
126-
nr_correlated_obs
127-
* [np.sqrt(nr_correlated_obs / nr_components_correlated_group)]
128-
+ nr_uncorrelated_obs
129-
* [np.sqrt(nr_uncorrelated_obs / nr_components_uncorrelated_group)]
130-
),
131-
)
117+
# Check that the three labels are all different
118+
assert cluster_label_correlated != cluster_label_independent_1
119+
assert cluster_label_correlated != cluster_label_independent_2
120+
assert cluster_label_independent_1 != cluster_label_independent_2
132121

133-
expected_clusters = np.array(nr_correlated_obs * [1] + nr_uncorrelated_obs * [2])
134-
np.testing.assert_equal(clusters, expected_clusters)
122+
# Check that the main group is clustered together
123+
for i in range(nr_correlated_obs):
124+
assert clusters[i] == cluster_label_correlated
135125

136-
expected_nr_components = (nr_uncorrelated_obs + nr_correlated_obs) * [1]
126+
# Correlated group cluster has 1 component.
127+
# The two independent clusters have size 1, so they also get 1 component.
128+
# Therefore, all observations should be associated with 1 component.
129+
expected_nr_components = np.ones(nr_observations, dtype=int)
137130
np.testing.assert_equal(nr_components, expected_nr_components)
138131

139-
# Check that we don`t modify the input data
132+
# For the correlated group: sqrt(num_obs / num_components)
133+
sf_correlated = np.sqrt(nr_correlated_obs / 1.0)
134+
# For the independent groups (size 1): sqrt(1 / 1)
135+
sf_independent = np.sqrt(1.0 / 1.0)
136+
137+
expected_scale_factors = np.array(
138+
[sf_correlated] * nr_correlated_obs + [sf_independent] * nr_uncorrelated_obs
139+
)
140+
np.testing.assert_allclose(scale_factors, expected_scale_factors)
141+
140142
np.testing.assert_equal(Y, Y_original)
141143
np.testing.assert_equal(obs_errors, obs_error_copy)
142144

@@ -168,3 +170,89 @@ def test_that_perfectly_correlated_responses_are_not_scaled(nr_observations):
168170
nr_components,
169171
np.array(nr_observations * [1.0]),
170172
)
173+
174+
175+
@pytest.mark.parametrize(
    "nr_obs_group_a, nr_obs_group_b",
    [
        (3, 2),
        (5, 5),
        (4, 6),
    ],
)
@pytest.mark.integration_test
def test_main_correctly_separates_distinct_correlation_groups(
    nr_obs_group_a, nr_obs_group_b
):
    """
    Build a response matrix holding two mutually independent groups of
    internally correlated observations and verify that the auto-scaling
    `main` function assigns each group to its own cluster.

    - Group A: `nr_obs_group_a` responses that are all scalar multiples
      of one random signal (perfectly correlated with each other).
    - Group B: `nr_obs_group_b` responses built the same way from a
      second signal that is independent of Group A's.
    """
    rng = np.random.default_rng(seed=12345)
    nr_realizations = 1000
    nr_observations = nr_obs_group_a + nr_obs_group_b

    # Two independent base signals, one per correlation group.
    signal_a = rng.standard_normal(nr_realizations)
    signal_b = rng.standard_normal(nr_realizations)

    # Row i within a group is (i + 1) times that group's base signal,
    # so rows inside a group are perfectly correlated.
    Y = np.zeros((nr_observations, nr_realizations))
    Y[:nr_obs_group_a] = np.outer(np.arange(1, nr_obs_group_a + 1), signal_a)
    Y[nr_obs_group_a:] = np.outer(np.arange(1, nr_obs_group_b + 1), signal_b)

    # Per-observation errors are required as input to `main`.
    obs_errors = Y.std(axis=1)

    scale_factors, clusters, nr_components = main(Y, obs_errors)

    # Cluster labels themselves are arbitrary (e.g. 1 vs 2 may swap
    # between runs), so compare the grouping structure dynamically.
    label_a = clusters[0]
    label_b = clusters[nr_obs_group_a]
    assert label_a != label_b, (
        "The two distinct correlation groups should be in different clusters."
    )

    # Every member of a group carries its group's label.
    np.testing.assert_array_equal(
        clusters[:nr_obs_group_a], np.full(nr_obs_group_a, label_a)
    )
    np.testing.assert_array_equal(
        clusters[nr_obs_group_a:], np.full(nr_obs_group_b, label_b)
    )

    # Each cluster is perfectly correlated internally, so the PCA run on
    # each of them should need exactly one principal component.
    np.testing.assert_array_equal(
        nr_components, np.ones(nr_observations, dtype=int)
    )

    # Scale factor per cluster is sqrt(cluster size / nr components),
    # with one component per cluster here.
    expected_scale_factors = np.array(
        [np.sqrt(nr_obs_group_a / 1.0)] * nr_obs_group_a
        + [np.sqrt(nr_obs_group_b / 1.0)] * nr_obs_group_b
    )
    np.testing.assert_allclose(scale_factors, expected_scale_factors)

0 commit comments

Comments
 (0)