@@ -9,8 +9,8 @@
 from sgkit_bgen.bgen_reader import (
     GT_DATA_VARS,
     BgenReader,
-    rechunk_from_zarr,
-    rechunk_to_zarr,
+    bgen_to_zarr,
+    rechunk_bgen,
     unpack_variables,
 )
 
@@ -44,19 +44,27 @@
     [np.nan, 1.018, 0.010, 0.160, 0.991]  # Generated using bgen-reader directly
 )
 
+EXPECTED_DIMS = dict(variants=199, samples=500, genotypes=3, alleles=2)
+
+
+def _shape(*dims: str) -> Tuple[int, ...]:
+    return tuple(EXPECTED_DIMS[d] for d in dims)
+
 
 @pytest.mark.parametrize("chunks", CHUNKS)
 def test_read_bgen(shared_datadir, chunks):
     path = shared_datadir / "example.bgen"
     ds = read_bgen(path, chunks=chunks)
 
     # check some of the data (in different chunks)
-    assert ds["call_dosage"].shape == (199, 500)
+    assert ds["call_dosage"].shape == _shape("variants", "samples")
     npt.assert_almost_equal(ds["call_dosage"].values[1][0], 1.987, decimal=3)
     npt.assert_almost_equal(ds["call_dosage"].values[100][0], 0.160, decimal=3)
     npt.assert_array_equal(ds["call_dosage_mask"].values[0, 0], [True])
     npt.assert_array_equal(ds["call_dosage_mask"].values[0, 1], [False])
-    assert ds["call_genotype_probability"].shape == (199, 500, 3)
+    assert ds["call_genotype_probability"].shape == _shape(
+        "variants", "samples", "genotypes"
+    )
     npt.assert_almost_equal(
         ds["call_genotype_probability"].values[1][0], [0.005, 0.002, 0.992], decimal=3
     )
@@ -137,39 +145,45 @@ def test_read_bgen__raise_on_invalid_indexers(shared_datadir):
         reader[([0], [0], [0])]
 
 
-def _rechunk_to_zarr(
+def _rechunk_bgen(
     shared_datadir: Path, tmp_path: Path, **kwargs: Any
-) -> Tuple[xr.Dataset, str]:
+) -> Tuple[xr.Dataset, xr.Dataset, str]:
     path = shared_datadir / "example.bgen"
     ds = read_bgen(path, chunks=(10, -1, -1))
     store = tmp_path / "example.zarr"
-    rechunk_to_zarr(ds, store, **kwargs)
-    return ds, str(store)
+    dsr = rechunk_bgen(ds, store, **kwargs)
+    return ds, dsr, str(store)
 
 
 def _open_zarr(store: str, **kwargs: Any) -> xr.Dataset:
     # Force concat_characters False to avoid https://github.com/pydata/xarray/issues/4405
     return xr.open_zarr(store, concat_characters=False, **kwargs)  # type: ignore[no-any-return,no-untyped-call]
 
 
-@pytest.mark.parametrize("chunk_width", [10, 50, 500])
-def test_rechunk_to_zarr__chunk_size(shared_datadir, tmp_path, chunk_width):
-    _, store = _rechunk_to_zarr(
-        shared_datadir, tmp_path, chunk_width=chunk_width, pack=False
+@pytest.mark.parametrize("target_chunks", [(10, 10), (50, 50), (100, 50), (50, 100)])
+def test_rechunk_bgen__target_chunks(shared_datadir, tmp_path, target_chunks):
+    _, dsr, store = _rechunk_bgen(
+        shared_datadir,
+        tmp_path,
+        chunk_length=target_chunks[0],
+        chunk_width=target_chunks[1],
+        pack=False,
     )
-    dsr = _open_zarr(store)
     for v in GT_DATA_VARS:
-        # Chunks shape should equal (
-        #   length of chunks on read,
-        #   width of chunks on rechunk
-        # )
-        assert dsr[v].data.chunksize[0] == 10
-        assert dsr[v].data.chunksize[1] == chunk_width
+        assert dsr[v].data.chunksize[:2] == target_chunks
+
+
+def test_rechunk_from_zarr__self_consistent(shared_datadir, tmp_path):
+    # With no probability dtype or packing, rechunk_{to,from}_zarr is a noop
+    ds, dsr, store = _rechunk_bgen(
+        shared_datadir, tmp_path, probability_dtype=None, pack=False
+    )
+    xr.testing.assert_allclose(ds.compute(), dsr.compute())  # type: ignore[no-untyped-call]
 
 
 @pytest.mark.parametrize("dtype", ["uint8", "uint16"])
-def test_rechunk_to_zarr__probability_encoding(shared_datadir, tmp_path, dtype):
-    ds, store = _rechunk_to_zarr(
+def test_rechunk_bgen__probability_encoding(shared_datadir, tmp_path, dtype):
+    ds, _, store = _rechunk_bgen(
         shared_datadir, tmp_path, probability_dtype=dtype, pack=False
     )
     dsr = _open_zarr(store, mask_and_scale=False)
@@ -184,61 +198,42 @@ def test_rechunk_to_zarr__probability_encoding(shared_datadir, tmp_path, dtype):
     np.testing.assert_allclose(ds[v], dsr[v], atol=tolerance)
 
 
-def test_rechunk_to_zarr__variable_packing(shared_datadir, tmp_path):
-    ds, store = _rechunk_to_zarr(
+def test_rechunk_bgen__variable_packing(shared_datadir, tmp_path):
+    ds, dsr, store = _rechunk_bgen(
         shared_datadir, tmp_path, probability_dtype=None, pack=True
     )
-    dsr = _open_zarr(store, mask_and_scale=True)
-    dsr = unpack_variables(dsr)
     # A minor tolerance is necessary here when packing is enabled
     # because one of the genotype probabilities is constructed from the others
     xr.testing.assert_allclose(ds.compute(), dsr.compute(), atol=1e-6)  # type: ignore[no-untyped-call]
 
 
-def test_rechunk_to_zarr__raise_on_invalid_chunk_length(shared_datadir, tmp_path):
-    with pytest.raises(
-        ValueError,
-        match="Chunk size in variant dimension for variable .* must evenly divide target chunk size",
-    ):
-        _rechunk_to_zarr(shared_datadir, tmp_path, chunk_length=11)
-
-
-@pytest.mark.parametrize("chunks", [(10, 10), (50, 50), (100, 50), (50, 100)])
-def test_rechunk_from_zarr__target_chunks(shared_datadir, tmp_path, chunks):
-    ds, store = _rechunk_to_zarr(
-        shared_datadir,
-        tmp_path,
-        chunk_length=chunks[0],
-        chunk_width=chunks[1],
-        pack=False,
-    )
-    ds = rechunk_from_zarr(store, chunk_length=chunks[0], chunk_width=chunks[1])
-    for v in GT_DATA_VARS:
-        assert ds[v].data.chunksize[:2] == chunks
-
-
 @pytest.mark.parametrize("dtype", ["uint32", "int8", "float32"])
-def test_rechunk_from_zarr__invalid_probability_type(shared_datadir, tmp_path, dtype):
+def test_rechunk_bgen__invalid_probability_type(shared_datadir, tmp_path, dtype):
     with pytest.raises(ValueError, match="Probability integer dtype invalid"):
-        _rechunk_to_zarr(shared_datadir, tmp_path, probability_dtype=dtype)
+        _rechunk_bgen(shared_datadir, tmp_path, probability_dtype=dtype)
 
 
 def test_unpack_variables__invalid_gp_dims(shared_datadir, tmp_path):
     # Validate that an error is thrown when variables are
     # unpacked without being packed in the first place
-    _, store = _rechunk_to_zarr(shared_datadir, tmp_path, pack=False)
-    dsr = _open_zarr(store, mask_and_scale=True)
+    _, dsr, store = _rechunk_bgen(shared_datadir, tmp_path, pack=False)
     with pytest.raises(
         ValueError,
         match="Expecting variable 'call_genotype_probability' to have genotypes dimension of size 2",
     ):
         unpack_variables(dsr)
 
 
-def test_rechunk_from_zarr__self_consistent(shared_datadir, tmp_path):
-    # With no probability dtype or packing, rechunk_{to,from}_zarr is a noop
-    ds, store = _rechunk_to_zarr(
-        shared_datadir, tmp_path, probability_dtype=None, pack=False
-    )
-    dsr = rechunk_from_zarr(store)
-    xr.testing.assert_allclose(ds.compute(), dsr.compute())  # type: ignore[no-untyped-call]
+@pytest.mark.parametrize("region", [None, dict(variants=slice(0, 100))])
+def test_bgen_to_zarr(shared_datadir, tmp_path, region):
+    input = shared_datadir / "example.bgen"
+    output = tmp_path / "example.zarr"
+    ds = bgen_to_zarr(input, output, region=region)
+    expected_dims = {
+        k: EXPECTED_DIMS[k]
+        if region is None or k not in region
+        else region[k].stop - region[k].start
+        for k in EXPECTED_DIMS
+    }
+    actual_dims = {k: v for k, v in ds.dims.items() if k in expected_dims}
+    assert actual_dims == expected_dims