diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index 6b94b53b0..3a3c02ab0 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -240,7 +240,7 @@ def for_field( dims.append(dimension) chunksize += (size,) - array = np.full(chunksize, fill_value, dtype=dtype) + array = np.full(chunksize, missing_value, dtype=dtype) return InfoAndFormatFieldHandler( category, @@ -304,7 +304,7 @@ def add_variant(self, i: int, variant: Any) -> None: val if val is not None else self.missing_value ) else: - self.array[i] = self.fill_value + self.array[i] = self.missing_value elif self.category == "FORMAT": val = variant.format(self.key) if val is not None: @@ -327,7 +327,7 @@ def add_variant(self, i: int, variant: Any) -> None: a = a[..., : self.array.shape[-1]] # trim to fit self.array[i, ..., : a.shape[-1]] = a else: - self.array[i] = self.fill_value + self.array[i] = self.missing_value def truncate_array(self, length: int) -> None: self.array = self.array[:length] diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py index a841ca29b..7d7efefc8 100644 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ b/sgkit/tests/io/vcf/test_vcf_reader.py @@ -9,10 +9,10 @@ import xarray as xr import zarr from numcodecs import Blosc, Delta, FixedScaleOffset, PackBits, VLenUTF8 -from numpy.testing import assert_allclose, assert_array_equal +from numpy.testing import assert_allclose, assert_array_equal, assert_array_almost_equal from sgkit import load_dataset, save_dataset -from sgkit.io.utils import FLOAT32_FILL, INT_FILL, INT_MISSING +from sgkit.io.utils import FLOAT32_MISSING, FLOAT32_FILL, INT_FILL, INT_MISSING from sgkit.io.vcf import ( MaxAltAllelesExceededWarning, partition_into_regions, @@ -46,27 +46,133 @@ def test_vcf_to_zarr__small_vcf( path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) output = tmp_path.joinpath("vcf.zarr").as_posix() + fields = [ + "INFO/NS", + "INFO/AN", + "INFO/AA", + "INFO/DB", + "INFO/AC", + "INFO/AF", + "FORMAT/GT", + "FORMAT/DP", + "FORMAT/HQ", + ] + field_defs = { + "FORMAT/HQ": {"dimension": "ploidy"}, + "INFO/AF": {"Number": "2", "dimension": "AF"}, + "INFO/AC": {"Number": "2", "dimension": "AC"}, + } if to_zarr: vcf_to_zarr( path, output, + max_alt_alleles=3, chunk_length=5, chunk_width=2, read_chunk_length=read_chunk_length, + fields=fields, + field_defs=field_defs, ) ds = xr.open_zarr(output) else: - ds = read_vcf(path, chunk_length=5, chunk_width=2) + ds = read_vcf( + path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs + ) + assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"]) + assert_array_equal( + ds["variant_filter"], + [ + [False, False, False], + [False, False, False], + [True, False, False], + [False, False, True], + [True, False, False], + [True, False, False], + [True, False, False], + [False, False, False], + [True, False, False], + ], + ) assert_array_equal(ds["contig_id"], ["19", "20", "X"]) assert "contig_length" not in ds assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) assert ds["variant_contig"].chunks[0][0] == 5 + assert_array_equal( ds["variant_position"], [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], ) assert ds["variant_position"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_NS"], + [-1, -1, 3, 3, 2, 3, 3, -1, -1], + ) + assert ds["variant_NS"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AN"], + [-1, -1, -1, -1, -1, -1, 6, -1, -1], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AA"], + [ + ".", + ".", + ".", + ".", + "T", + "T", + "G", + ".", + ".", + ], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_DB"], + [ + False, + False, + True, + False, + True, + False, + False, + False, + False, + ], + ) + assert ds["variant_AN"].chunks[0][0] == 5 + + variant_AF = np.full((9, 2), FLOAT32_MISSING, dtype=np.float32) + variant_AF[2, 0] = 0.5 + variant_AF[3, 0] = 0.017 + variant_AF[4, 0] = 0.333 + variant_AF[4, 1] = 0.667 + assert_array_almost_equal(ds["variant_AF"], variant_AF, 3) + assert ds["variant_AF"].chunks[0][0] == 5 + + assert_array_equal( + ds["variant_AC"], + [ + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [3, 1], + [-1, -1], + [-1, -1], + ], + ) + assert ds["variant_AC"].chunks[0][0] == 5 + assert_array_equal( ds["variant_allele"].values.tolist(), [ @@ -84,7 +190,7 @@ def test_vcf_to_zarr__small_vcf( assert ds["variant_allele"].chunks[0][0] == 5 assert ds["variant_allele"].dtype == "O" assert_array_equal( - ds["variant_id"], + ds["variant_id"].values.tolist(), [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], ) assert ds["variant_id"].chunks[0][0] == 5 @@ -108,6 +214,9 @@ def test_vcf_to_zarr__small_vcf( [[0, 0], [0, 0], [0, 0]], [[0, 1], [0, 2], [-1, -1]], [[0, 0], [0, 0], [-1, -1]], + # NOTE: inconsistency here on pad vs missing. I think this is a + # pad value as it's a mixed ploidy call: + # GT 0 0/1 0|2 [[0, -1], [0, 1], [0, 2]], ], dtype="i1", @@ -126,18 +235,43 @@ def test_vcf_to_zarr__small_vcf( ], dtype=bool, ) + call_DP = [ + [-1, -1, -1], + [-1, -1, -1], + [1, 8, 5], + [3, 5, 3], + [6, 0, 4], + [-1, 4, 2], + [4, 2, 3], + [-1, -1, -1], + [-1, -1, -1], + ] + call_HQ = [ + [[10, 15], [10, 10], [3, 3]], + [[10, 10], [10, 10], [3, 3]], + [[51, 51], [51, 51], [-1, -1]], + [[58, 50], [65, 3], [-1, -1]], + [[23, 27], [18, 2], [-1, -1]], + [[56, 60], [51, 51], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + ] + + # print(np.array2string(ds["call_HQ"].values, separator=",")) + # print(np.array2string(ds["call_genotype"].values < 0, separator=",")) + assert_array_equal(ds["call_genotype"], call_genotype) - assert ds["call_genotype"].chunks[0][0] == 5 - assert ds["call_genotype"].chunks[1][0] == 2 - assert ds["call_genotype"].chunks[2][0] == 2 assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - assert ds["call_genotype_mask"].chunks[0][0] == 5 - assert ds["call_genotype_mask"].chunks[1][0] == 2 - assert ds["call_genotype_mask"].chunks[2][0] == 2 assert_array_equal(ds["call_genotype_phased"], call_genotype_phased) - assert ds["call_genotype_mask"].chunks[0][0] == 5 - assert ds["call_genotype_mask"].chunks[1][0] == 2 - assert ds["call_genotype_mask"].chunks[2][0] == 2 + assert_array_equal(ds["call_DP"], call_DP) + assert_array_equal(ds["call_HQ"], call_HQ) + + for name in ["call_genotype", "call_genotype_mask", "call_HQ"]: + assert ds[name].chunks == ((5, 4), (2, 1), (2,)) + + for name in ["call_genotype_phased", "call_DP"]: + assert ds[name].chunks == ((5, 4), (2, 1)) # save and load again to test https://github.com/pydata/xarray/issues/3476 path2 = tmp_path / "ds2.zarr"