Skip to content

Commit 50f34ad

Browse files
committed
Correct PCA component counting logic
BREAKING CHANGE: The change in component counting directly affects the number of clusters requested in the `main` auto-scaling function. For the same input data, this version may produce a different number of clusters and therefore different final scaling factors compared to previous versions. The new behavior is considered more accurate.
1 parent 7ae6920 commit 50f34ad

File tree

2 files changed

+264
-48
lines changed

2 files changed

+264
-48
lines changed

src/ert/analysis/misfit_preprocessor.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,19 @@ def get_nr_primary_components(
3232
responses: npt.NDArray[np.float64], threshold: float
3333
) -> int:
3434
"""
35-
Calculate the number of principal components needed to achieve a cumulative
36-
variance less than a specified threshold using Singular Value Decomposition (SVD).
35+
Calculate the number of principal components required
36+
to explain a given amount of variance in the responses.
3737
38-
responses should be on form (n_realizations, n_observations)
38+
Args:
39+
responses: A 2D array of data with shape
40+
(n_realizations, n_observations).
41+
threshold: The cumulative variance threshold to meet or exceed.
42+
For example, a value of 0.95 will find the number of
43+
components needed to explain at least 95% of the total variance.
44+
45+
Returns:
46+
The minimum number of principal components required to meet or exceed
47+
the specified variance threshold.
3948
"""
4049
data_matrix = responses - responses.mean(axis=0)
4150
_, singulars, _ = np.linalg.svd(data_matrix.astype(float), full_matrices=False)
@@ -45,7 +54,10 @@ def get_nr_primary_components(
4554
# sum to get the cumulative proportion of variance explained by each successive
4655
# component.
4756
variance_ratio = np.cumsum(singulars**2) / np.sum(singulars**2)
48-
return max(len([1 for i in variance_ratio[:-1] if i < threshold]), 1)
57+
58+
num_components = np.searchsorted(variance_ratio, threshold, side="left") + 1
59+
60+
return int(num_components)
4961

5062

5163
def cluster_responses(

tests/ert/unit_tests/analysis/test_misfit_preprocessor.py

Lines changed: 248 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,12 @@ def test_that_get_nr_primary_components_is_according_to_theory(p, rho, seed):
5959
X = rng.standard_normal(size=(p, N))
6060
Y = (np.linalg.cholesky(Sigma) @ X).T
6161

62-
# Adding a bit to the thresholds because of numerical accuracy.
63-
assert get_nr_primary_components(Y, threshold_1 + 0.01) == 1
64-
assert get_nr_primary_components(Y, threshold_2 + 0.01) == 2
65-
assert get_nr_primary_components(Y, threshold_3 + 0.01) == 3
62+
# To get 1 component, the threshold must be <= the variance
63+
# of the 1st component.
64+
# Same for the other components.
65+
assert get_nr_primary_components(Y, threshold_1 - 0.01) == 1
66+
assert get_nr_primary_components(Y, threshold_2 - 0.01) == 2
67+
assert get_nr_primary_components(Y, threshold_3 - 0.01) == 3
6668

6769

6870
@pytest.mark.parametrize("nr_observations", [4, 7, 12])
@@ -71,24 +73,19 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
7173
nr_observations,
7274
):
7375
"""
74-
Test the preprocessor's ability to cluster correlated observations.
76+
Test the preprocessor's ability to cluster correlated observations
77+
separately from multiple independent observations.
7578
7679
We create a response matrix with `nr_observations` rows, where the
7780
first `nr_observations - 2` rows are strongly correlated, while the
78-
last two are independent.
79-
80-
We expect the correlated observations to be scaled in one group,
81-
and perhaps surprisingly, the two independent observations to be
82-
scaled together in a second group.
83-
The reason that the two independent responses end up in the same group
84-
is due to the way get_nr_primary_components counts PCA components,
85-
and the fact that the number of PCA components is used as the number
86-
of clusters.
87-
It returns the number of components that explain **less** than the
88-
variance specified as the threshold.
89-
With a threshold of 0.95, the expression used is as follows:
90-
91-
max(len([1 for i in variance_ratio[:-1] if i < 0.95]), 1),
81+
last two are independent of both the main group and each other.
82+
83+
This will result in a request for 3 clusters, correctly separating the
84+
data into its three natural groups:
85+
86+
1. The main group of correlated observations.
87+
2. The first independent observation.
88+
3. The second independent observation.
9289
"""
9390
rng = np.random.default_rng(1234)
9491
nr_realizations = 1000
@@ -99,44 +96,49 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
9996
parameters_b = rng.standard_normal(nr_realizations)
10097
parameters_c = rng.standard_normal(nr_realizations)
10198

102-
Y = np.ones((nr_observations, nr_realizations))
99+
Y = np.zeros((nr_observations, nr_realizations))
103100
for i in range(nr_correlated_obs):
104-
Y[i] = (1 + i) * parameters_a
105-
Y[-1] = 5 + parameters_b
106-
Y[-2] = 10 + parameters_c
101+
Y[i] = (i + 1) * parameters_a
102+
# The last two observations are independent
103+
Y[-2] = 10 + parameters_b
104+
Y[-1] = 5 + parameters_c
107105

108106
obs_errors = Y.std(axis=1)
109107
Y_original = Y.copy()
110108
obs_error_copy = obs_errors.copy()
111109

112110
scale_factors, clusters, nr_components = main(Y, obs_errors)
113111

114-
# Since the first nr_correlated_obs rows of Y are perfectly correlated,
115-
# we only need one principal component to describe all variance.
116-
nr_components_correlated_group = 1
112+
# We expect three distinct clusters now.
113+
cluster_label_correlated = clusters[0]
114+
cluster_label_independent_1 = clusters[-2]
115+
cluster_label_independent_2 = clusters[-1]
117116

118-
# Because of the way we calculate the number of components
119-
# (see docstring for details), the two independent responses
120-
# are represented by a single component.
121-
nr_components_uncorrelated_group = 1
122-
123-
np.testing.assert_equal(
124-
scale_factors,
125-
np.array(
126-
nr_correlated_obs
127-
* [np.sqrt(nr_correlated_obs / nr_components_correlated_group)]
128-
+ nr_uncorrelated_obs
129-
* [np.sqrt(nr_uncorrelated_obs / nr_components_uncorrelated_group)]
130-
),
131-
)
117+
# Check that the three labels are all different
118+
assert cluster_label_correlated != cluster_label_independent_1
119+
assert cluster_label_correlated != cluster_label_independent_2
120+
assert cluster_label_independent_1 != cluster_label_independent_2
132121

133-
expected_clusters = np.array(nr_correlated_obs * [1] + nr_uncorrelated_obs * [2])
134-
np.testing.assert_equal(clusters, expected_clusters)
122+
# Check that the main group is clustered together
123+
for i in range(nr_correlated_obs):
124+
assert clusters[i] == cluster_label_correlated
135125

136-
expected_nr_components = (nr_uncorrelated_obs + nr_correlated_obs) * [1]
126+
# Correlated group cluster has 1 component.
127+
# The two independent clusters have size 1, so they also get 1 component.
128+
# Therefore, all observations should be associated with 1 component.
129+
expected_nr_components = np.ones(nr_observations, dtype=int)
137130
np.testing.assert_equal(nr_components, expected_nr_components)
138131

139-
# Check that we don't modify the input data
132+
# For the correlated group: sqrt(num_obs / num_components)
133+
sf_correlated = np.sqrt(nr_correlated_obs / 1.0)
134+
# For the independent groups (size 1): sqrt(1 / 1)
135+
sf_independent = np.sqrt(1.0 / 1.0)
136+
137+
expected_scale_factors = np.array(
138+
[sf_correlated] * nr_correlated_obs + [sf_independent] * nr_uncorrelated_obs
139+
)
140+
np.testing.assert_allclose(scale_factors, expected_scale_factors)
141+
140142
np.testing.assert_equal(Y, Y_original)
141143
np.testing.assert_equal(obs_errors, obs_error_copy)
142144

@@ -171,3 +173,205 @@ def test_edge_cases_with_few_observations_return_default_values(nr_observations)
171173
nr_components,
172174
np.array(nr_observations * [1.0]),
173175
)
176+
177+
178+
@pytest.mark.parametrize(
179+
"nr_obs_group_a, nr_obs_group_b",
180+
[
181+
(3, 2),
182+
(5, 5),
183+
(4, 6),
184+
],
185+
)
186+
@pytest.mark.integration_test
187+
def test_main_correctly_separates_distinct_correlation_groups(
188+
nr_obs_group_a, nr_obs_group_b
189+
):
190+
"""
191+
Creates a response matrix with two distinct and independent groups of
192+
correlated observations.
193+
- Group A contains `nr_obs_group_a` responses that are all correlated
194+
with each other.
195+
- Group B contains `nr_obs_group_b` responses that are also correlated
196+
with each other, but are independent of Group A.
197+
198+
This test asserts that the algorithm places
199+
the two groups into two separate clusters.
200+
"""
201+
rng = np.random.default_rng(seed=12345)
202+
nr_realizations = 1000
203+
nr_observations = nr_obs_group_a + nr_obs_group_b
204+
205+
# Create two independent random signals that will form the
206+
# basis for the two correlation groups.
207+
params_a = rng.standard_normal(nr_realizations)
208+
params_b = rng.standard_normal(nr_realizations)
209+
210+
# Create the final response matrix Y
211+
Y = np.zeros((nr_observations, nr_realizations))
212+
213+
# Create Group A: `nr_obs_group_a` perfectly correlated
214+
# responses based on `params_a`
215+
for i in range(nr_obs_group_a):
216+
Y[i] = (i + 1) * params_a
217+
218+
# Create Group B: `nr_obs_group_b` perfectly correlated
219+
# responses based on `params_b`
220+
for i in range(nr_obs_group_b):
221+
Y[nr_obs_group_a + i] = (i + 1) * params_b
222+
223+
# Calculate observation errors,
224+
# required as input for the main function
225+
obs_errors = Y.std(axis=1)
226+
227+
scale_factors, clusters, nr_components = main(Y, obs_errors)
228+
229+
# Assert that the two groups were placed in different clusters.
230+
# The absolute cluster labels (e.g., 1 vs 2) can change between runs,
231+
# so we check the grouping structure dynamically.
232+
cluster_label_group_a = clusters[0]
233+
cluster_label_group_b = clusters[nr_obs_group_a]
234+
235+
assert cluster_label_group_a != cluster_label_group_b, (
236+
"The two distinct correlation groups should be in different clusters."
237+
)
238+
239+
# Assert that all members of Group A are in the same cluster
240+
expected_clusters_a = np.full(nr_obs_group_a, cluster_label_group_a)
241+
np.testing.assert_array_equal(clusters[:nr_obs_group_a], expected_clusters_a)
242+
243+
# Assert that all members of Group B are in the same cluster
244+
expected_clusters_b = np.full(nr_obs_group_b, cluster_label_group_b)
245+
np.testing.assert_array_equal(clusters[nr_obs_group_a:], expected_clusters_b)
246+
247+
# Assert the number of components for each observation.
248+
# Since each group is perfectly correlated internally, the PCA performed
249+
# on each cluster should find that only 1 principal component is needed.
250+
expected_nr_components = np.ones(nr_observations, dtype=int)
251+
np.testing.assert_array_equal(nr_components, expected_nr_components)
252+
253+
# Assert the calculated scaling factors.
254+
# The scaling factor is sqrt(num_observations_in_cluster / num_components).
255+
sf_group_a = np.sqrt(nr_obs_group_a / 1.0)
256+
sf_group_b = np.sqrt(nr_obs_group_b / 1.0)
257+
258+
expected_scale_factors = np.array(
259+
[sf_group_a] * nr_obs_group_a + [sf_group_b] * nr_obs_group_b
260+
)
261+
np.testing.assert_allclose(scale_factors, expected_scale_factors)
262+
263+
264+
@pytest.mark.parametrize(
265+
"nr_obs_group_a, nr_obs_group_b",
266+
[
267+
(3, 2),
268+
(5, 5),
269+
(4, 6),
270+
],
271+
)
272+
@pytest.mark.integration_test
273+
def test_autoscale_clusters_by_correlation_sign_not_signal_source(
274+
nr_obs_group_a, nr_obs_group_b
275+
):
276+
"""
277+
Creates a response matrix with two distinct and independent groups:
278+
279+
- Group A contains `nr_obs_group_a` responses that are all positively
280+
correlated with each other, following the pattern (a+i)*X_1.
281+
- Group B contains `nr_obs_group_b` responses with alternating signs
282+
following the pattern (b+j)*(-1)^j*X_2, creating a checkerboard
283+
correlation pattern within the group, but independent of Group A.
284+
285+
This test asserts that the algorithm correctly identifies the correlation
286+
patterns and clusters responses based on their correlation structure.
287+
"""
288+
rng = np.random.default_rng(seed=12345)
289+
nr_realizations = 1000
290+
nr_observations = nr_obs_group_a + nr_obs_group_b
291+
292+
# Create two independent random signals
293+
X_1 = rng.standard_normal(nr_realizations)
294+
X_2 = rng.standard_normal(nr_realizations)
295+
296+
Y = np.zeros((nr_observations, nr_realizations))
297+
298+
# Create Group A: (a+i)*X_1 pattern - all positively correlated
299+
a = 1 # base scaling factor for group A
300+
for i in range(nr_obs_group_a):
301+
Y[i] = (a + i) * X_1
302+
303+
# Create Group B: (b+j)*(-1)^j*X_2 pattern - checkerboard correlation
304+
b = 1 # base scaling factor for group B
305+
for j in range(nr_obs_group_b):
306+
sign = (-1) ** j # alternates: +1, -1, +1, -1, ...
307+
Y[nr_obs_group_a + j] = (b + j) * sign * X_2
308+
309+
obs_errors = Y.std(axis=1)
310+
scale_factors, clusters, nr_components = main(Y, obs_errors)
311+
312+
# NB!
313+
# It's not distinguishing based on which underlying signal
314+
# (X_1 vs X_2) the responses come from,
315+
# but rather on their correlation patterns.
316+
# From the algorithm's perspective,
317+
# X_1 and 3*X_2 look very similar (both positive scaling),
318+
# while -2*X_2 looks different (negative scaling).
319+
320+
# Assert that all members of Group A are in the same cluster
321+
group_a_clusters = clusters[:nr_obs_group_a]
322+
assert len(np.unique(group_a_clusters)) == 1, (
323+
"All Group A responses should be in the same cluster"
324+
)
325+
326+
# Identify Group B even and odd indexed responses
327+
group_b_clusters = clusters[nr_obs_group_a:]
328+
group_b_even_clusters = group_b_clusters[::2] # indices 0, 2, 4, ...
329+
group_b_odd_clusters = group_b_clusters[1::2] # indices 1, 3, 5, ...
330+
331+
# Assert that Group B even-indexed responses are clustered together
332+
if len(group_b_even_clusters) > 1:
333+
assert len(np.unique(group_b_even_clusters)) == 1, (
334+
"All Group B even-indexed responses should be in the same cluster"
335+
)
336+
337+
# Assert that Group B odd-indexed responses are clustered together
338+
if len(group_b_odd_clusters) > 1:
339+
assert len(np.unique(group_b_odd_clusters)) == 1, (
340+
"All Group B odd-indexed responses should be in the same cluster"
341+
)
342+
343+
# Assert that Group B even and odd responses are in different clusters
344+
# (only if both even and odd responses exist)
345+
if len(group_b_even_clusters) > 0 and len(group_b_odd_clusters) > 0:
346+
assert group_b_even_clusters[0] != group_b_odd_clusters[0], (
347+
"Group B even and odd indexed responses should be in different clusters"
348+
)
349+
350+
# Assert that Group A responses are clustered with Group B even-indexed responses
351+
# This documents the algorithm's behavior: it clusters based on correlation patterns
352+
# (positive vs negative scaling) rather than underlying signal source (X_1 vs X_2)
353+
group_a_cluster = group_a_clusters[0]
354+
if len(group_b_even_clusters) > 0:
355+
group_b_even_cluster = group_b_even_clusters[0]
356+
assert group_a_cluster == group_b_even_cluster
357+
358+
# Assert that nr_components are consistent within clusters
359+
# (all members of the same cluster should have the same nr_components)
360+
unique_clusters = np.unique(clusters)
361+
for cluster_id in unique_clusters:
362+
cluster_indices = np.where(clusters == cluster_id)[0]
363+
cluster_components = nr_components[cluster_indices]
364+
assert len(np.unique(cluster_components)) == 1
365+
366+
# Assert the calculated scaling factors based on actual cluster assignments
367+
expected_scale_factors = np.zeros(nr_observations)
368+
369+
for cluster_id in unique_clusters:
370+
cluster_indices = np.where(clusters == cluster_id)[0]
371+
cluster_size = len(cluster_indices)
372+
# Use the actual number of components for this cluster
373+
cluster_nr_components = nr_components[cluster_indices[0]]
374+
expected_sf = np.sqrt(cluster_size / float(cluster_nr_components))
375+
expected_scale_factors[cluster_indices] = expected_sf
376+
377+
np.testing.assert_allclose(scale_factors, expected_scale_factors)

0 commit comments

Comments
 (0)