Skip to content
This repository has been archived by the owner on Oct 15, 2020. It is now read-only.

Commit

Permalink
Use encode_array from sgkit.
Browse files Browse the repository at this point in the history
Make coverage 100%.
Add GH Action to run test and build.
  • Loading branch information
tomwhite committed Jul 21, 2020
1 parent a014507 commit 3748617
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 8 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ dask[array]
dask[dataframe]
fsspec
numpy
scipy
xarray
bgen_reader
git+https://github.com/tomwhite/sgkit@dosages
git+https://github.com/pystatgen/sgkit
15 changes: 8 additions & 7 deletions sgkit_bgen/bgen_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from xarray import Dataset

from sgkit import create_genotype_dosage_dataset
from sgkit.utils import encode_array

PathType = Union[str, Path]

Expand Down Expand Up @@ -88,9 +89,11 @@ def split(allele_row):

def __getitem__(self, idx):
if not isinstance(idx, tuple):
raise IndexError(f"Indexer must be tuple (received {type(idx)})")
raise IndexError( # pragma: no cover
f"Indexer must be tuple (received {type(idx)})"
)
if len(idx) != self.ndim:
raise IndexError(
raise IndexError( # pragma: no cover
f"Indexer must be two-item tuple (received {len(idx)} slices)"
)

Expand Down Expand Up @@ -138,9 +141,9 @@ def read_bgen(
path : PathType
Path to BGEN file.
chunks : Union[str, int, tuple], optional
Chunk size for genotype (i.e. `.bed`) data, by default "auto"
Chunk size for genotype data, by default "auto"
lock : bool, optional
Whether or not to synchronize concurrent reads of `.bed`
Whether or not to synchronize concurrent reads of
file blocks, by default False. This is passed through to
[dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
persist : bool, optional
Expand All @@ -152,9 +155,7 @@ def read_bgen(

bgen_reader = BgenReader(path, persist)

variant_contig_names, variant_contig = np.unique(
np.array(bgen_reader.contig, dtype=str), return_inverse=True
)
variant_contig, variant_contig_names = encode_array(bgen_reader.contig.compute())
variant_contig_names = list(variant_contig_names)
variant_contig = variant_contig.astype("int16")

Expand Down
Binary file added sgkit_bgen/tests/data/complex.23bits.no.samples.bgen
Binary file not shown.
12 changes: 12 additions & 0 deletions sgkit_bgen/tests/test_bgen_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,15 @@ def test_read_bgen_with_sample_file(shared_datadir):
ds = read_bgen(path)
# Check the sample IDs are the ones from the .sample file
assert ds["sample/id"].values.tolist() == ["s0", "s1", "s2", "s3"]


def test_read_bgen_with_no_samples(shared_datadir):
    """BGEN file with no embedded sample IDs: reader should synthesize
    sequential ``sample_<i>`` identifiers."""
    dataset = read_bgen(shared_datadir / "complex.23bits.no.samples.bgen")
    # Four samples in the fixture file -> four generated IDs.
    expected = [f"sample_{i}" for i in range(4)]
    assert dataset["sample/id"].values.tolist() == expected

0 comments on commit 3748617

Please sign in to comment.