@@ -59,10 +59,12 @@ def test_that_get_nr_primary_components_is_according_to_theory(p, rho, seed):
    X = rng.standard_normal(size=(p, N))
    Y = (np.linalg.cholesky(Sigma) @ X).T

-    # Adding a bit to the thresholds because of numerical accuracy.
-    assert get_nr_primary_components(Y, threshold_1 + 0.01) == 1
-    assert get_nr_primary_components(Y, threshold_2 + 0.01) == 2
-    assert get_nr_primary_components(Y, threshold_3 + 0.01) == 3
+    # To get 1 component, the threshold must be <= the variance
+    # of the 1st component.
+    # Same for the other components.
+    assert get_nr_primary_components(Y, threshold_1 - 0.01) == 1
+    assert get_nr_primary_components(Y, threshold_2 - 0.01) == 2
+    assert get_nr_primary_components(Y, threshold_3 - 0.01) == 3


@pytest.mark.parametrize("nr_observations", [4, 7, 12])
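Note: the updated assertions assume `get_nr_primary_components` returns the number of leading PCA components whose cumulative explained variance reaches the given threshold, with a minimum of one. A minimal sketch of that assumed behavior, for reference only (the helper name and the SVD-based implementation below are illustrative, not the actual function from this repository):

import numpy as np

def nr_components_sketch(Y, threshold):
    # Hypothetical stand-in for get_nr_primary_components: count leading
    # principal components until the cumulative explained variance reaches
    # `threshold`, returning at least one component.
    centered = Y - Y.mean(axis=0)
    _, singular_values, _ = np.linalg.svd(centered, full_matrices=False)
    explained = np.cumsum(singular_values**2) / np.sum(singular_values**2)
    return max(int(np.searchsorted(explained, threshold)) + 1, 1)

Under this reading, passing `threshold_1 - 0.01` keeps the threshold at or below the variance explained by the first component, so exactly one component is counted.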
@@ -71,24 +73,19 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
    nr_observations,
):
    """
-    Test the preprocessor's ability to cluster correlated observations.
+    Test the preprocessor's ability to cluster correlated observations
+    separately from multiple independent observations.

    We create a response matrix with `nr_observations` rows, where the
    first `nr_observations - 2` rows are strongly correlated, while the
-    last two are independent.
-
-    We expect the correlated observations to be scaled in one group,
-    and perhaps surprisingly, the two independent observations to be
-    scaled together in a second group.
-    The reason that the two independent responses end up in the same group
-    is due to the way get_nr_primary_components counts PCA components,
-    and the fact that the number of PCA components is used as the number
-    of clusters.
-    It returns the number of components that explain **less** than the
-    variance specified as the threshold.
-    With a threshold of 0.95, the expression used is as follows:
-
-        max(len([1 for i in variance_ratio[:-1] if i < 0.95]), 1),
+    last two are independent of both the main group and each other.
+
+    This will result in a request for 3 clusters, correctly separating the
+    data into its three natural groups:
+
+    1. The main group of correlated observations.
+    2. The first independent observation.
+    3. The second independent observation.
    """
    rng = np.random.default_rng(1234)
    nr_realizations = 1000
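For intuition, the three "natural groups" described in the new docstring are visible directly in the correlation matrix of such a response matrix. A small standalone illustration (the variable names below exist only in this sketch):

import numpy as np

rng = np.random.default_rng(1234)
a, b, c = rng.standard_normal((3, 1000))
# Two rows driven by the same signal, two rows driven by independent signals.
Y_demo = np.vstack([1 * a, 2 * a, 10 + b, 5 + c])
# Off-diagonal correlation is ~1 inside the correlated pair and ~0 elsewhere,
# i.e. one correlated block plus two unrelated rows -> three natural groups.
print(np.round(np.corrcoef(Y_demo), 2))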
@@ -99,44 +96,49 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
    parameters_b = rng.standard_normal(nr_realizations)
    parameters_c = rng.standard_normal(nr_realizations)

-    Y = np.ones((nr_observations, nr_realizations))
+    Y = np.zeros((nr_observations, nr_realizations))
    for i in range(nr_correlated_obs):
-        Y[i] = (1 + i) * parameters_a
-    Y[-1] = 5 + parameters_b
-    Y[-2] = 10 + parameters_c
+        Y[i] = (i + 1) * parameters_a
+    # The last two observations are independent
+    Y[-2] = 10 + parameters_b
+    Y[-1] = 5 + parameters_c

    obs_errors = Y.std(axis=1)
    Y_original = Y.copy()
    obs_error_copy = obs_errors.copy()

    scale_factors, clusters, nr_components = main(Y, obs_errors)

-    # Since the first nr_correlated_obs rows of Y are perfectly correlated,
-    # we only need one principal component to describe all variance.
-    nr_components_correlated_group = 1
+    # We expect three distinct clusters now.
+    cluster_label_correlated = clusters[0]
+    cluster_label_independent_1 = clusters[-2]
+    cluster_label_independent_2 = clusters[-1]

-    # Because of the way we calculate the number of components
-    # (see docstring for details), the two undependent responses
-    # are represented by a single component.
-    nr_components_uncorrelated_group = 1
-
-    np.testing.assert_equal(
-        scale_factors,
-        np.array(
-            nr_correlated_obs
-            * [np.sqrt(nr_correlated_obs / nr_components_correlated_group)]
-            + nr_uncorrelated_obs
-            * [np.sqrt(nr_uncorrelated_obs / nr_components_uncorrelated_group)]
-        ),
-    )
+    # Check that the three labels are all different
+    assert cluster_label_correlated != cluster_label_independent_1
+    assert cluster_label_correlated != cluster_label_independent_2
+    assert cluster_label_independent_1 != cluster_label_independent_2

-    expected_clusters = np.array(nr_correlated_obs * [1] + nr_uncorrelated_obs * [2])
-    np.testing.assert_equal(clusters, expected_clusters)
+    # Check that the main group is clustered together
+    for i in range(nr_correlated_obs):
+        assert clusters[i] == cluster_label_correlated

-    expected_nr_components = (nr_uncorrelated_obs + nr_correlated_obs) * [1]
+    # The correlated group's cluster needs only 1 component.
+    # The two independent clusters have size 1, so they also get 1 component.
+    # Therefore, all observations should be associated with 1 component.
+    expected_nr_components = np.ones(nr_observations, dtype=int)
    np.testing.assert_equal(nr_components, expected_nr_components)

-    # Check that we don`t modify the input data
+    # For the correlated group: sqrt(num_obs / num_components)
+    sf_correlated = np.sqrt(nr_correlated_obs / 1.0)
+    # For the independent groups (size 1): sqrt(1 / 1)
+    sf_independent = np.sqrt(1.0 / 1.0)
+
+    expected_scale_factors = np.array(
+        [sf_correlated] * nr_correlated_obs + [sf_independent] * nr_uncorrelated_obs
+    )
+    np.testing.assert_allclose(scale_factors, expected_scale_factors)
+
    np.testing.assert_equal(Y, Y_original)
    np.testing.assert_equal(obs_errors, obs_error_copy)

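The expected scale factors above follow the sqrt(cluster_size / nr_components) rule that the comments reference. A short worked example for the `nr_observations = 7` case (illustrative numbers, assuming the clustering described in the docstring):

import numpy as np

# 5 correlated rows share one cluster described by a single component;
# each of the 2 independent rows forms its own single-member cluster.
cluster_sizes = [5, 1, 1]
components_per_cluster = [1, 1, 1]
scale_factors = [
    np.sqrt(size / nr) for size, nr in zip(cluster_sizes, components_per_cluster)
]
print(scale_factors)  # [2.236..., 1.0, 1.0]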
@@ -171,3 +173,205 @@ def test_edge_cases_with_few_observations_return_default_values(nr_observations)
        nr_components,
        np.array(nr_observations * [1.0]),
    )
+
+
+@pytest.mark.parametrize(
+    "nr_obs_group_a, nr_obs_group_b",
+    [
+        (3, 2),
+        (5, 5),
+        (4, 6),
+    ],
+)
+@pytest.mark.integration_test
+def test_main_correctly_separates_distinct_correlation_groups(
+    nr_obs_group_a, nr_obs_group_b
+):
+    """
+    Creates a response matrix with two distinct and independent groups of
+    correlated observations.
+    - Group A contains `nr_obs_group_a` responses that are all correlated
+      with each other.
+    - Group B contains `nr_obs_group_b` responses that are also correlated
+      with each other, but are independent of Group A.
+
+    This test asserts that the algorithm places
+    the two groups into two separate clusters.
+    """
+    rng = np.random.default_rng(seed=12345)
+    nr_realizations = 1000
+    nr_observations = nr_obs_group_a + nr_obs_group_b
+
+    # Create two independent random signals that will form the
+    # basis for the two correlation groups.
+    params_a = rng.standard_normal(nr_realizations)
+    params_b = rng.standard_normal(nr_realizations)
+
+    # Create the final response matrix Y
+    Y = np.zeros((nr_observations, nr_realizations))
+
+    # Create Group A: `nr_obs_group_a` perfectly correlated
+    # responses based on `params_a`
+    for i in range(nr_obs_group_a):
+        Y[i] = (i + 1) * params_a
+
+    # Create Group B: `nr_obs_group_b` perfectly correlated
+    # responses based on `params_b`
+    for i in range(nr_obs_group_b):
+        Y[nr_obs_group_a + i] = (i + 1) * params_b
+
+    # Calculate observation errors,
+    # required as input for the main function
+    obs_errors = Y.std(axis=1)
+
+    scale_factors, clusters, nr_components = main(Y, obs_errors)
+
+    # Assert that the two groups were placed in different clusters.
+    # The absolute cluster labels (e.g., 1 vs 2) can change between runs,
+    # so we check the grouping structure dynamically.
+    cluster_label_group_a = clusters[0]
+    cluster_label_group_b = clusters[nr_obs_group_a]
+
+    assert cluster_label_group_a != cluster_label_group_b, (
+        "The two distinct correlation groups should be in different clusters."
+    )
+
+    # Assert that all members of Group A are in the same cluster
+    expected_clusters_a = np.full(nr_obs_group_a, cluster_label_group_a)
+    np.testing.assert_array_equal(clusters[:nr_obs_group_a], expected_clusters_a)
+
+    # Assert that all members of Group B are in the same cluster
+    expected_clusters_b = np.full(nr_obs_group_b, cluster_label_group_b)
+    np.testing.assert_array_equal(clusters[nr_obs_group_a:], expected_clusters_b)
+
+    # Assert the number of components for each observation.
+    # Since each group is perfectly correlated internally, the PCA performed
+    # on each cluster should find that only 1 principal component is needed.
+    expected_nr_components = np.ones(nr_observations, dtype=int)
+    np.testing.assert_array_equal(nr_components, expected_nr_components)
+
+    # Assert the calculated scaling factors.
+    # The scaling factor is sqrt(num_observations_in_cluster / num_components).
+    sf_group_a = np.sqrt(nr_obs_group_a / 1.0)
+    sf_group_b = np.sqrt(nr_obs_group_b / 1.0)
+
+    expected_scale_factors = np.array(
+        [sf_group_a] * nr_obs_group_a + [sf_group_b] * nr_obs_group_b
+    )
+    np.testing.assert_allclose(scale_factors, expected_scale_factors)
+
+
+@pytest.mark.parametrize(
+    "nr_obs_group_a, nr_obs_group_b",
+    [
+        (3, 2),
+        (5, 5),
+        (4, 6),
+    ],
+)
+@pytest.mark.integration_test
+def test_autoscale_clusters_by_correlation_sign_not_signal_source(
+    nr_obs_group_a, nr_obs_group_b
+):
+    """
+    Creates a response matrix with two distinct and independent groups:
+
+    - Group A contains `nr_obs_group_a` responses that are all positively
+      correlated with each other, following the pattern (a+i)*X_1.
+    - Group B contains `nr_obs_group_b` responses with alternating signs
+      following the pattern (b+j)*(-1)^j*X_2, creating a checkerboard
+      correlation pattern within the group, but independent of Group A.
+
+    This test asserts that the algorithm correctly identifies the correlation
+    patterns and clusters responses based on their correlation structure.
+    """
+    rng = np.random.default_rng(seed=12345)
+    nr_realizations = 1000
+    nr_observations = nr_obs_group_a + nr_obs_group_b
+
+    # Create two independent random signals
+    X_1 = rng.standard_normal(nr_realizations)
+    X_2 = rng.standard_normal(nr_realizations)
+
+    Y = np.zeros((nr_observations, nr_realizations))
+
+    # Create Group A: (a+i)*X_1 pattern - all positively correlated
+    a = 1  # base scaling factor for group A
+    for i in range(nr_obs_group_a):
+        Y[i] = (a + i) * X_1
+
+    # Create Group B: (b+j)*(-1)^j*X_2 pattern - checkerboard correlation
+    b = 1  # base scaling factor for group B
+    for j in range(nr_obs_group_b):
+        sign = (-1) ** j  # alternates: +1, -1, +1, -1, ...
+        Y[nr_obs_group_a + j] = (b + j) * sign * X_2
+
+    obs_errors = Y.std(axis=1)
+    scale_factors, clusters, nr_components = main(Y, obs_errors)
+
+    # NB!
+    # It's not distinguishing based on which underlying signal
+    # (X_1 vs X_2) the responses come from,
+    # but rather on their correlation patterns.
+    # From the algorithm's perspective,
+    # X_1 and 3*X_2 look very similar (both positive scaling),
+    # while -2*X_2 looks different (negative scaling).

+    # Assert that all members of Group A are in the same cluster
+    group_a_clusters = clusters[:nr_obs_group_a]
+    assert len(np.unique(group_a_clusters)) == 1, (
+        "All Group A responses should be in the same cluster"
+    )
+
+    # Identify Group B even and odd indexed responses
+    group_b_clusters = clusters[nr_obs_group_a:]
+    group_b_even_clusters = group_b_clusters[::2]  # indices 0, 2, 4, ...
+    group_b_odd_clusters = group_b_clusters[1::2]  # indices 1, 3, 5, ...
+
+    # Assert that Group B even-indexed responses are clustered together
+    if len(group_b_even_clusters) > 1:
+        assert len(np.unique(group_b_even_clusters)) == 1, (
+            "All Group B even-indexed responses should be in the same cluster"
+        )
+
+    # Assert that Group B odd-indexed responses are clustered together
+    if len(group_b_odd_clusters) > 1:
+        assert len(np.unique(group_b_odd_clusters)) == 1, (
+            "All Group B odd-indexed responses should be in the same cluster"
+        )
+
+    # Assert that Group B even and odd responses are in different clusters
+    # (only if both even and odd responses exist)
+    if len(group_b_even_clusters) > 0 and len(group_b_odd_clusters) > 0:
+        assert group_b_even_clusters[0] != group_b_odd_clusters[0], (
+            "Group B even and odd indexed responses should be in different clusters"
+        )
+
+    # Assert that Group A responses are clustered with Group B even-indexed responses
+    # This documents the algorithm's behavior: it clusters based on correlation patterns
+    # (positive vs negative scaling) rather than underlying signal source (X_1 vs X_2)
+    group_a_cluster = group_a_clusters[0]
+    if len(group_b_even_clusters) > 0:
+        group_b_even_cluster = group_b_even_clusters[0]
+        assert group_a_cluster == group_b_even_cluster
+
+    # Assert that nr_components are consistent within clusters
+    # (all members of the same cluster should have the same nr_components)
+    unique_clusters = np.unique(clusters)
+    for cluster_id in unique_clusters:
+        cluster_indices = np.where(clusters == cluster_id)[0]
+        cluster_components = nr_components[cluster_indices]
+        assert len(np.unique(cluster_components)) == 1
+
+    # Assert the calculated scaling factors based on actual cluster assignments
+    expected_scale_factors = np.zeros(nr_observations)
+
+    for cluster_id in unique_clusters:
+        cluster_indices = np.where(clusters == cluster_id)[0]
+        cluster_size = len(cluster_indices)
+        # Use the actual number of components for this cluster
+        cluster_nr_components = nr_components[cluster_indices[0]]
+        expected_sf = np.sqrt(cluster_size / float(cluster_nr_components))
+        expected_scale_factors[cluster_indices] = expected_sf
+
+    np.testing.assert_allclose(scale_factors, expected_scale_factors)
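The "checkerboard" sign structure that the last docstring describes can be checked independently of the autoscaling code. A standalone sketch (names are illustrative):

import numpy as np

rng = np.random.default_rng(0)
x2 = rng.standard_normal(1000)
# Alternating-sign multiples of the same signal, as in Group B above.
group_b = np.vstack([(1 + j) * (-1) ** j * x2 for j in range(4)])
# Rows with the same parity correlate at +1 and opposite parity at -1,
# which is the checkerboard pattern that splits Group B into two clusters.
print(np.round(np.corrcoef(group_b)))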