@@ -59,10 +59,12 @@ def test_that_get_nr_primary_components_is_according_to_theory(p, rho, seed):
59
59
X = rng .standard_normal (size = (p , N ))
60
60
Y = (np .linalg .cholesky (Sigma ) @ X ).T
61
61
62
- # Adding a bit to the thresholds because of numerical accuracy.
63
- assert get_nr_primary_components (Y , threshold_1 + 0.01 ) == 1
64
- assert get_nr_primary_components (Y , threshold_2 + 0.01 ) == 2
65
- assert get_nr_primary_components (Y , threshold_3 + 0.01 ) == 3
62
+ # To get 1 component, the threshold must be <= the variance
63
+ # of the 1st component.
64
+ # Same for the other components.
65
+ assert get_nr_primary_components (Y , threshold_1 - 0.01 ) == 1
66
+ assert get_nr_primary_components (Y , threshold_2 - 0.01 ) == 2
67
+ assert get_nr_primary_components (Y , threshold_3 - 0.01 ) == 3
66
68
67
69
68
70
@pytest .mark .parametrize ("nr_observations" , [4 , 7 , 12 ])
@@ -71,24 +73,19 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
71
73
nr_observations ,
72
74
):
73
75
"""
74
- Test the preprocessor's ability to cluster correlated observations.
76
+ Test the preprocessor's ability to cluster correlated observations
77
+ separately from multiple independent observations.
75
78
76
79
We create a response matrix with `nr_observations` rows, where the
77
80
first `nr_observations - 2` rows are strongly correlated, while the
78
- last two are independent.
79
-
80
- We expect the correlated observations to be scaled in one group,
81
- and perhaps surprisingly, the two independent observations to be
82
- scaled together in a second group.
83
- The reason that the two independent responses end up in the same group
84
- is due to the way get_nr_primary_components counts PCA components,
85
- and the fact that the number of PCA components is used as the number
86
- of clusters.
87
- It returns the number of components that explain **less** than the
88
- variance specified as the threshold.
89
- With a threshold of 0.95, the expression used is as follows:
90
-
91
- max(len([1 for i in variance_ratio[:-1] if i < 0.95]), 1),
81
+ last two are independent of both the main group and each other.
82
+
83
+ This will result in a request for 3 clusters, correctly separating the
84
+ data into its three natural groups:
85
+
86
+ 1. The main group of correlated observations.
87
+ 2. The first independent observation.
88
+ 3. The second independent observation.
92
89
"""
93
90
rng = np .random .default_rng (1234 )
94
91
nr_realizations = 1000
@@ -99,44 +96,49 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
99
96
parameters_b = rng .standard_normal (nr_realizations )
100
97
parameters_c = rng .standard_normal (nr_realizations )
101
98
102
- Y = np .ones ((nr_observations , nr_realizations ))
99
+ Y = np .zeros ((nr_observations , nr_realizations ))
103
100
for i in range (nr_correlated_obs ):
104
- Y [i ] = (1 + i ) * parameters_a
105
- Y [- 1 ] = 5 + parameters_b
106
- Y [- 2 ] = 10 + parameters_c
101
+ Y [i ] = (i + 1 ) * parameters_a
102
+ # The last two observations are independent
103
+ Y [- 2 ] = 10 + parameters_b
104
+ Y [- 1 ] = 5 + parameters_c
107
105
108
106
obs_errors = Y .std (axis = 1 )
109
107
Y_original = Y .copy ()
110
108
obs_error_copy = obs_errors .copy ()
111
109
112
110
scale_factors , clusters , nr_components = main (Y , obs_errors )
113
111
114
- # Since the first nr_correlated_obs rows of Y are perfectly correlated,
115
- # we only need one principal component to describe all variance.
116
- nr_components_correlated_group = 1
117
-
118
- # Because of the way we calculate the number of components
119
- # (see docstring for details), the two undependent responses
120
- # are represented by a single component.
121
- nr_components_uncorrelated_group = 1
112
+ # We expect three distinct clusters now.
113
+ cluster_label_correlated = clusters [0 ]
114
+ cluster_label_independent_1 = clusters [- 2 ]
115
+ cluster_label_independent_2 = clusters [- 1 ]
122
116
123
- np .testing .assert_equal (
124
- scale_factors ,
125
- np .array (
126
- nr_correlated_obs
127
- * [np .sqrt (nr_correlated_obs / nr_components_correlated_group )]
128
- + nr_uncorrelated_obs
129
- * [np .sqrt (nr_uncorrelated_obs / nr_components_uncorrelated_group )]
130
- ),
131
- )
117
+ # Check that the three labels are all different
118
+ assert cluster_label_correlated != cluster_label_independent_1
119
+ assert cluster_label_correlated != cluster_label_independent_2
120
+ assert cluster_label_independent_1 != cluster_label_independent_2
132
121
133
- expected_clusters = np .array (nr_correlated_obs * [1 ] + nr_uncorrelated_obs * [2 ])
134
- np .testing .assert_equal (clusters , expected_clusters )
122
+ # Check that the main group is clustered together
123
+ for i in range (nr_correlated_obs ):
124
+ assert clusters [i ] == cluster_label_correlated
135
125
136
- expected_nr_components = (nr_uncorrelated_obs + nr_correlated_obs ) * [1 ]
126
+ # Correlated group cluster has 1 component.
127
+ # The two independent clusters have size 1, so they also get 1 component.
128
+ # Therefore, all observations should be associated with 1 component.
129
+ expected_nr_components = np .ones (nr_observations , dtype = int )
137
130
np .testing .assert_equal (nr_components , expected_nr_components )
138
131
139
- # Check that we don`t modify the input data
132
+ # For the correlated group: sqrt(num_obs / num_components)
133
+ sf_correlated = np .sqrt (nr_correlated_obs / 1.0 )
134
+ # For the independent groups (size 1): sqrt(1 / 1)
135
+ sf_independent = np .sqrt (1.0 / 1.0 )
136
+
137
+ expected_scale_factors = np .array (
138
+ [sf_correlated ] * nr_correlated_obs + [sf_independent ] * nr_uncorrelated_obs
139
+ )
140
+ np .testing .assert_allclose (scale_factors , expected_scale_factors )
141
+
140
142
np .testing .assert_equal (Y , Y_original )
141
143
np .testing .assert_equal (obs_errors , obs_error_copy )
142
144
@@ -168,3 +170,89 @@ def test_that_perfectly_correlated_responses_are_not_scaled(nr_observations):
168
170
nr_components ,
169
171
np .array (nr_observations * [1.0 ]),
170
172
)
173
+
174
+
175
+ @pytest .mark .parametrize (
176
+ "nr_obs_group_a, nr_obs_group_b" ,
177
+ [
178
+ (3 , 2 ),
179
+ (5 , 5 ),
180
+ (4 , 6 ),
181
+ ],
182
+ )
183
+ @pytest .mark .integration_test
184
+ def test_main_correctly_separates_distinct_correlation_groups (
185
+ nr_obs_group_a , nr_obs_group_b
186
+ ):
187
+ """
188
+ Creates a response matrix with two distinct and independent groups of
189
+ correlated observations.
190
+ - Group A contains `nr_obs_group_a` responses that are all correlated
191
+ with each other.
192
+ - Group B contains `nr_obs_group_b` responses that are also correlated
193
+ with each other, but are independent of Group A.
194
+
195
+ This test asserts that the algorithm places
196
+ the two groups into two separate clusters.
197
+ """
198
+ rng = np .random .default_rng (seed = 12345 )
199
+ nr_realizations = 1000
200
+ nr_observations = nr_obs_group_a + nr_obs_group_b
201
+
202
+ # Create two independent random signals that will form the
203
+ # basis for the two correlation groups.
204
+ params_a = rng .standard_normal (nr_realizations )
205
+ params_b = rng .standard_normal (nr_realizations )
206
+
207
+ # Create the final response matrix Y
208
+ Y = np .zeros ((nr_observations , nr_realizations ))
209
+
210
+ # Create Group A: `nr_obs_group_a` perfectly correlated
211
+ # responses based on `params_a`
212
+ for i in range (nr_obs_group_a ):
213
+ Y [i ] = (i + 1 ) * params_a
214
+
215
+ # Create Group B: `nr_obs_group_b` perfectly correlated
216
+ # responses based on `params_b`
217
+ for i in range (nr_obs_group_b ):
218
+ Y [nr_obs_group_a + i ] = (i + 1 ) * params_b
219
+
220
+ # Calculate observation errors,
221
+ # required as input for the main function
222
+ obs_errors = Y .std (axis = 1 )
223
+
224
+ scale_factors , clusters , nr_components = main (Y , obs_errors )
225
+
226
+ # Assert that the two groups were placed in different clusters.
227
+ # The absolute cluster labels (e.g., 1 vs 2) can change between runs,
228
+ # so we check the grouping structure dynamically.
229
+ cluster_label_group_a = clusters [0 ]
230
+ cluster_label_group_b = clusters [nr_obs_group_a ]
231
+
232
+ assert cluster_label_group_a != cluster_label_group_b , (
233
+ "The two distinct correlation groups should be in different clusters."
234
+ )
235
+
236
+ # Assert that all members of Group A are in the same cluster
237
+ expected_clusters_a = np .full (nr_obs_group_a , cluster_label_group_a )
238
+ np .testing .assert_array_equal (clusters [:nr_obs_group_a ], expected_clusters_a )
239
+
240
+ # Assert that all members of Group B are in the same cluster
241
+ expected_clusters_b = np .full (nr_obs_group_b , cluster_label_group_b )
242
+ np .testing .assert_array_equal (clusters [nr_obs_group_a :], expected_clusters_b )
243
+
244
+ # Assert the number of components for each observation.
245
+ # Since each group is perfectly correlated internally, the PCA performed
246
+ # on each cluster should find that only 1 principal component is needed.
247
+ expected_nr_components = np .ones (nr_observations , dtype = int )
248
+ np .testing .assert_array_equal (nr_components , expected_nr_components )
249
+
250
+ # Assert the calculated scaling factors.
251
+ # The scaling factor is sqrt(num_observations_in_cluster / num_components).
252
+ sf_group_a = np .sqrt (nr_obs_group_a / 1.0 )
253
+ sf_group_b = np .sqrt (nr_obs_group_b / 1.0 )
254
+
255
+ expected_scale_factors = np .array (
256
+ [sf_group_a ] * nr_obs_group_a + [sf_group_b ] * nr_obs_group_b
257
+ )
258
+ np .testing .assert_allclose (scale_factors , expected_scale_factors )
0 commit comments