Skip to content

Commit

Permalink
Errors in missing vs pad values in VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Feb 8, 2024
1 parent a386541 commit c77857c
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 16 deletions.
6 changes: 3 additions & 3 deletions sgkit/io/vcf/vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def for_field(
dims.append(dimension)
chunksize += (size,)

array = np.full(chunksize, fill_value, dtype=dtype)
array = np.full(chunksize, missing_value, dtype=dtype)

return InfoAndFormatFieldHandler(
category,
Expand Down Expand Up @@ -304,7 +304,7 @@ def add_variant(self, i: int, variant: Any) -> None:
val if val is not None else self.missing_value
)
else:
self.array[i] = self.fill_value
self.array[i] = self.missing_value
elif self.category == "FORMAT":
val = variant.format(self.key)
if val is not None:
Expand All @@ -327,7 +327,7 @@ def add_variant(self, i: int, variant: Any) -> None:
a = a[..., : self.array.shape[-1]] # trim to fit
self.array[i, ..., : a.shape[-1]] = a
else:
self.array[i] = self.fill_value
self.array[i] = self.missing_value

def truncate_array(self, length: int) -> None:
self.array = self.array[:length]
Expand Down
160 changes: 147 additions & 13 deletions sgkit/tests/io/vcf/test_vcf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
import xarray as xr
import zarr
from numcodecs import Blosc, Delta, FixedScaleOffset, PackBits, VLenUTF8
from numpy.testing import assert_allclose, assert_array_equal
from numpy.testing import assert_allclose, assert_array_equal, assert_array_almost_equal

from sgkit import load_dataset, save_dataset
from sgkit.io.utils import FLOAT32_FILL, INT_FILL, INT_MISSING
from sgkit.io.utils import FLOAT32_MISSING, FLOAT32_FILL, INT_FILL, INT_MISSING
from sgkit.io.vcf import (
MaxAltAllelesExceededWarning,
partition_into_regions,
Expand Down Expand Up @@ -46,27 +46,133 @@ def test_vcf_to_zarr__small_vcf(
path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
output = tmp_path.joinpath("vcf.zarr").as_posix()

fields = [
"INFO/NS",
"INFO/AN",
"INFO/AA",
"INFO/DB",
"INFO/AC",
"INFO/AF",
"FORMAT/GT",
"FORMAT/DP",
"FORMAT/HQ",
]
field_defs = {
"FORMAT/HQ": {"dimension": "ploidy"},
"INFO/AF": {"Number": "2", "dimension": "AF"},
"INFO/AC": {"Number": "2", "dimension": "AC"},
}
if to_zarr:
vcf_to_zarr(
path,
output,
max_alt_alleles=3,
chunk_length=5,
chunk_width=2,
read_chunk_length=read_chunk_length,
fields=fields,
field_defs=field_defs,
)
ds = xr.open_zarr(output)
else:
ds = read_vcf(path, chunk_length=5, chunk_width=2)
ds = read_vcf(
path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs
)

assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"])
assert_array_equal(
ds["variant_filter"],
[
[False, False, False],
[False, False, False],
[True, False, False],
[False, False, True],
[True, False, False],
[True, False, False],
[True, False, False],
[False, False, False],
[True, False, False],
],
)
assert_array_equal(ds["contig_id"], ["19", "20", "X"])
assert "contig_length" not in ds
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
assert ds["variant_contig"].chunks[0][0] == 5

assert_array_equal(
ds["variant_position"],
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
)
assert ds["variant_position"].chunks[0][0] == 5

assert_array_equal(
ds["variant_NS"],
[-1, -1, 3, 3, 2, 3, 3, -1, -1],
)
assert ds["variant_NS"].chunks[0][0] == 5

assert_array_equal(
ds["variant_AN"],
[-1, -1, -1, -1, -1, -1, 6, -1, -1],
)
assert ds["variant_AN"].chunks[0][0] == 5

assert_array_equal(
ds["variant_AA"],
[
".",
".",
".",
".",
"T",
"T",
"G",
".",
".",
],
)
assert ds["variant_AN"].chunks[0][0] == 5

assert_array_equal(
ds["variant_DB"],
[
False,
False,
True,
False,
True,
False,
False,
False,
False,
],
)
assert ds["variant_AN"].chunks[0][0] == 5

variant_AF = np.full((9, 2), FLOAT32_MISSING, dtype=np.float32)
variant_AF[2, 0] = 0.5
variant_AF[3, 0] = 0.017
variant_AF[4, 0] = 0.333
variant_AF[4, 1] = 0.667
assert_array_almost_equal(ds["variant_AF"], variant_AF, 3)
assert ds["variant_AF"].chunks[0][0] == 5

assert_array_equal(
ds["variant_AC"],
[
[-1, -1],
[-1, -1],
[-1, -1],
[-1, -1],
[-1, -1],
[-1, -1],
[3, 1],
[-1, -1],
[-1, -1],
],
)
assert ds["variant_AC"].chunks[0][0] == 5

assert_array_equal(
ds["variant_allele"].values.tolist(),
[
Expand All @@ -84,7 +190,7 @@ def test_vcf_to_zarr__small_vcf(
assert ds["variant_allele"].chunks[0][0] == 5
assert ds["variant_allele"].dtype == "O"
assert_array_equal(
ds["variant_id"],
ds["variant_id"].values.tolist(),
[".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"],
)
assert ds["variant_id"].chunks[0][0] == 5
Expand All @@ -108,6 +214,9 @@ def test_vcf_to_zarr__small_vcf(
[[0, 0], [0, 0], [0, 0]],
[[0, 1], [0, 2], [-1, -1]],
[[0, 0], [0, 0], [-1, -1]],
# NOTE: inconsistency here on pad vs missing. I think this is a
# pad value as it's a mixed ploidy call:
# GT 0 0/1 0|2
[[0, -1], [0, 1], [0, 2]],
],
dtype="i1",
Expand All @@ -126,18 +235,43 @@ def test_vcf_to_zarr__small_vcf(
],
dtype=bool,
)
call_DP = [
[-1, -1, -1],
[-1, -1, -1],
[1, 8, 5],
[3, 5, 3],
[6, 0, 4],
[-1, 4, 2],
[4, 2, 3],
[-1, -1, -1],
[-1, -1, -1],
]
call_HQ = [
[[10, 15], [10, 10], [3, 3]],
[[10, 10], [10, 10], [3, 3]],
[[51, 51], [51, 51], [-1, -1]],
[[58, 50], [65, 3], [-1, -1]],
[[23, 27], [18, 2], [-1, -1]],
[[56, 60], [51, 51], [-1, -1]],
[[-1, -1], [-1, -1], [-1, -1]],
[[-1, -1], [-1, -1], [-1, -1]],
[[-1, -1], [-1, -1], [-1, -1]],
]

# print(np.array2string(ds["call_HQ"].values, separator=","))
# print(np.array2string(ds["call_genotype"].values < 0, separator=","))

assert_array_equal(ds["call_genotype"], call_genotype)
assert ds["call_genotype"].chunks[0][0] == 5
assert ds["call_genotype"].chunks[1][0] == 2
assert ds["call_genotype"].chunks[2][0] == 2
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
assert ds["call_genotype_mask"].chunks[0][0] == 5
assert ds["call_genotype_mask"].chunks[1][0] == 2
assert ds["call_genotype_mask"].chunks[2][0] == 2
assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
assert ds["call_genotype_mask"].chunks[0][0] == 5
assert ds["call_genotype_mask"].chunks[1][0] == 2
assert ds["call_genotype_mask"].chunks[2][0] == 2
assert_array_equal(ds["call_DP"], call_DP)
assert_array_equal(ds["call_HQ"], call_HQ)

for name in ["call_genotype", "call_genotype_mask", "call_HQ"]:
assert ds[name].chunks == ((5, 4), (2, 1), (2,))

for name in ["call_genotype_phased", "call_DP"]:
assert ds[name].chunks == ((5, 4), (2, 1))

# save and load again to test https://github.com/pydata/xarray/issues/3476
path2 = tmp_path / "ds2.zarr"
Expand Down

0 comments on commit c77857c

Please sign in to comment.