Skip to content
This repository was archived by the owner on Oct 15, 2020. It is now read-only.

Commit 47a7db7

Browse files
authored
BGEN reader implementation using bgen_reader (#1)
* BGEN reader implementation using bgen_reader
* Use encode_array from sgkit. Make coverage 100%. Add GH Action to run test and build.
* Add package metadata
* Doc bi-allelic, diploid limitation.
* Test a variety of chunk sizes
* Check size of array in dosage calculation
* Fail if not bi-allelic
* Rework tests for bi-allelic examples with separate/missing sample IDs
1 parent 256ce6a commit 47a7db7

13 files changed

+373
-0
lines changed

.pre-commit-config.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v2.5.0
4+
hooks:
5+
- id: check-merge-conflict
6+
- id: debug-statements
7+
- id: mixed-line-ending
8+
- id: check-case-conflict
9+
- id: check-yaml
10+
- repo: https://github.com/asottile/seed-isort-config
11+
rev: v2.2.0
12+
hooks:
13+
- id: seed-isort-config
14+
- repo: https://github.com/timothycrosley/isort
15+
rev: 4.3.21-2 # pick the isort version you'd like to use from https://github.com/timothycrosley/isort/releases
16+
hooks:
17+
- id: isort
18+
- repo: https://github.com/python/black
19+
rev: 19.10b0
20+
hooks:
21+
- id: black
22+
language_version: python3
23+
- repo: https://gitlab.com/pycqa/flake8
24+
rev: 3.7.9
25+
hooks:
26+
- id: flake8
27+
language_version: python3

requirements-dev.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
black
flake8
isort
mypy
pre-commit
pytest
pytest-cov
pytest-datadir

requirements.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
dask[array]
dask[dataframe]
fsspec
numpy
scipy
xarray
bgen_reader
git+https://github.com/pystatgen/sgkit

setup.cfg

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
[metadata]
2+
name = sgkit-bgen
3+
author = sgkit Developers
4+
license = Apache
5+
description = BGEN IO implementations for sgkit
6+
long_description_content_type=text/x-rst
7+
long_description =
8+
**sgkit-bgen** contains block array readers for large-scale genomic data stored as BGEN
9+
url = https://github.com/pystatgen/sgkit
10+
classifiers =
11+
Development Status :: 2 - Pre-Alpha
12+
License :: OSI Approved :: Apache Software License
13+
Operating System :: OS Independent
14+
Intended Audience :: Science/Research
15+
Programming Language :: Python
16+
Programming Language :: Python :: 3
17+
Programming Language :: Python :: 3.7
18+
Programming Language :: Python :: 3.8
19+
Topic :: Scientific/Engineering
20+
21+
[options]
22+
packages = sgkit_bgen
23+
zip_safe = False # https://mypy.readthedocs.io/en/latest/installed_packages.html
24+
include_package_data = True
25+
python_requires = >=3.7
26+
install_requires =
27+
dask[array]
28+
dask[dataframe]
29+
fsspec
30+
numpy
31+
scipy
32+
xarray
33+
bgen_reader
34+
setuptools >= 41.2 # For pkg_resources
35+
setup_requires =
36+
setuptools >= 41.2
37+
setuptools_scm
38+
39+
[coverage:report]
40+
fail_under = 100
41+
42+
[flake8]
43+
ignore =
44+
# whitespace before ':' - doesn't work well with black
45+
E203
46+
E402
47+
# line too long - let black worry about that
48+
E501
49+
# do not assign a lambda expression, use a def
50+
E731
51+
# line break before binary operator
52+
W503
53+
54+
[isort]
55+
default_section = THIRDPARTY
56+
known_first_party = sgkit
57+
known_third_party = bgen_reader,dask,numpy,pytest,xarray
58+
multi_line_output = 3
59+
include_trailing_comma = True
60+
force_grid_wrap = 0
61+
use_parentheses = True
62+
line_length = 88
63+
64+
[mypy-numpy.*]
65+
ignore_missing_imports = True
66+
67+
[mypy-sgkit_bgen.tests.*]
68+
disallow_untyped_defs = False

sgkit_bgen/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .bgen_reader import read_bgen # noqa: F401
2+
3+
__all__ = ["read_bgen"]

sgkit_bgen/bgen_reader.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
"""BGEN reader implementation (using bgen_reader)"""
2+
from pathlib import Path
3+
from typing import Any, Union
4+
5+
import dask.array as da
6+
import numpy as np
7+
from bgen_reader._bgen_file import bgen_file
8+
from bgen_reader._bgen_metafile import bgen_metafile
9+
from bgen_reader._metafile import create_metafile
10+
from bgen_reader._reader import _infer_metafile_filepath
11+
from bgen_reader._samples import generate_samples, read_samples_file
12+
from xarray import Dataset
13+
14+
from sgkit import create_genotype_dosage_dataset
15+
from sgkit.typing import ArrayLike
16+
from sgkit.utils import encode_array
17+
18+
PathType = Union[str, Path]
19+
20+
21+
def _to_dict(df, dtype=None):
    """Convert a dask dataframe into a dict of named dask arrays.

    When *dtype* is provided it maps column name -> target dtype and every
    column is cast through it; otherwise each column keeps its dataframe
    dtype. ``lengths=True`` forces known chunk sizes on the arrays.
    """
    arrays = {}
    for col in df:
        target = dtype[col] if dtype else df[col].dtype
        arrays[col] = df[col].to_dask_array(lengths=True).astype(target)
    return arrays
26+
27+
28+
# Variant metadata columns read from the bgen_reader metafile, as triples of
# (column name, dataframe dtype, dask-array dtype). Note the dataframe dtype
# is ``str`` for every column; numeric columns are only cast to their final
# dtype when the dataframe is converted to dask arrays (see ``_to_dict``).
VARIANT_FIELDS = [
    ("id", str, "U"),
    ("rsid", str, "U"),
    ("chrom", str, "U"),
    ("pos", str, "int32"),
    ("nalleles", str, "int8"),
    ("allele_ids", str, "U"),
    ("vaddr", str, "int64"),
]
# Column name -> dataframe dtype (dict comprehension instead of
# dict([...]) — flake8-comprehensions C404).
VARIANT_DF_DTYPE = {name: df_dtype for name, df_dtype, _ in VARIANT_FIELDS}
# Column name -> dtype used for the dask arrays handed to callers.
VARIANT_ARRAY_DTYPE = {name: arr_dtype for name, _, arr_dtype in VARIANT_FIELDS}
39+
40+
41+
class BgenReader:
    """Array-like view of a BGEN file, suitable for ``dask.array.from_array``.

    Exposes ``shape``, ``dtype`` and ``ndim`` plus 2-D slicing via
    ``__getitem__`` that returns alt-allele dosages computed from the stored
    genotype probabilities. Only bi-allelic variants are supported (asserted
    in ``split_alleles`` below).
    """

    name = "bgen_reader"

    def __init__(self, path, persist=True, dtype=np.float32):
        # path: location of the .bgen file.
        # persist: keep the variant metadata dataframe in memory (see
        #   read_bgen docstring for the performance trade-off).
        # dtype: element type of the dosage array this object presents.
        self.path = Path(path)

        # bgen_reader caches variant metadata in a sidecar "metafile";
        # create it on first use so subsequent partition reads are cheap.
        self.metafile_filepath = _infer_metafile_filepath(Path(self.path))
        if not self.metafile_filepath.exists():
            create_metafile(path, self.metafile_filepath, verbose=False)

        with bgen_metafile(self.metafile_filepath) as mf:
            self.n_variants = mf.nvariants
            self.npartitions = mf.npartitions
            # Number of variants per metafile partition; used by
            # __getitem__ to translate row indexes into partition reads.
            self.partition_size = mf.partition_size

            df = mf.create_variants()
            if persist:
                df = df.persist()
            # One dask array per metadata column, cast per VARIANT_ARRAY_DTYPE.
            variant_arrs = _to_dict(df, dtype=VARIANT_ARRAY_DTYPE)

            self.variant_id = variant_arrs["id"]
            self.contig = variant_arrs["chrom"]
            self.pos = variant_arrs["pos"]

            def split_alleles(alleles, block_info=None):
                # map_blocks first calls this with no block_info to infer
                # metadata; pass the input through unchanged in that case.
                if block_info is None or len(block_info) == 0:
                    return alleles

                def split(allele_row):
                    # "A,G" -> ["A", "G"]; exactly two alleles expected.
                    alleles_list = allele_row[0].split(",")
                    assert len(alleles_list) == 2  # bi-allelic
                    return np.array(alleles_list)

                return np.apply_along_axis(split, 1, alleles[:, np.newaxis])

            # (n_variants,) comma-joined strings -> (n_variants, 2) array.
            variant_alleles = variant_arrs["allele_ids"].map_blocks(split_alleles)

            def max_str_len(arr: ArrayLike) -> Any:
                # Lazy per-element string lengths, reduced with .max().
                return arr.map_blocks(
                    lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
                ).max()

            max_allele_length = max(max_str_len(variant_alleles).compute())
            # Fixed-width bytes dtype sized to the longest allele string.
            self.variant_alleles = variant_alleles.astype(f"S{max_allele_length}")

        with bgen_file(self.path) as bgen:
            # Sample IDs: a sidecar .sample file takes precedence, then IDs
            # embedded in the BGEN file, else synthetic names are generated.
            sample_path = self.path.with_suffix(".sample")
            if sample_path.exists():
                self.sample_id = read_samples_file(sample_path, verbose=False)
            else:
                if bgen.contain_samples:
                    self.sample_id = bgen.read_samples()
                else:
                    self.sample_id = generate_samples(bgen.nsamples)

        # Duck-typed ndarray attributes consumed by dask.array.from_array.
        self.shape = (self.n_variants, len(self.sample_id))
        self.dtype = dtype
        self.ndim = 2

    def __getitem__(self, idx):
        """Read dosages for ``idx = (variant_slice, sample_indexer)``.

        Only the two-item tuple-of-slices form produced by
        ``dask.array.from_array`` is supported.
        """
        if not isinstance(idx, tuple):
            raise IndexError(  # pragma: no cover
                f"Indexer must be tuple (received {type(idx)})"
            )
        if len(idx) != self.ndim:
            raise IndexError(  # pragma: no cover
                f"Indexer must be two-item tuple (received {len(idx)} slices)"
            )

        # Empty variant slice: nothing to read from disk.
        if idx[0].start == idx[0].stop:
            return np.empty((0, 0), dtype=self.dtype)

        # Map the requested variant range onto metafile partitions.
        start_partition = idx[0].start // self.partition_size
        start_partition_offset = idx[0].start % self.partition_size
        end_partition = (idx[0].stop - 1) // self.partition_size
        end_partition_offset = (idx[0].stop - 1) % self.partition_size

        # Collect the in-file address (vaddr) of every requested variant,
        # trimming the first and last partitions to the slice bounds.
        all_vaddr = []
        with bgen_metafile(self.metafile_filepath) as mf:
            for i in range(start_partition, end_partition + 1):
                partition = mf.read_partition(i)
                start_offset = start_partition_offset if i == start_partition else 0
                end_offset = (
                    end_partition_offset + 1
                    if i == end_partition
                    else self.partition_size
                )
                vaddr = partition["vaddr"].tolist()
                all_vaddr.extend(vaddr[start_offset:end_offset])

        # Read each variant's genotype probabilities and reduce to dosages,
        # then apply the sample indexer to the stacked result.
        with bgen_file(self.path) as bgen:
            genotypes = [bgen.read_genotype(vaddr) for vaddr in all_vaddr]
            all_probs = [genotype["probs"] for genotype in genotypes]
            d = [_to_dosage(probs) for probs in all_probs]
            return np.stack(d)[:, idx[1]]
137+
138+
139+
def _to_dosage(probs: ArrayLike):
140+
"""Calculate the dosage from genotype likelihoods (probabilities)"""
141+
assert len(probs.shape) == 2 and probs.shape[1] == 3
142+
return 2 * probs[:, -1] + probs[:, 1]
143+
144+
145+
def read_bgen(
    path: PathType,
    chunks: Union[str, int, tuple] = "auto",
    lock: bool = False,
    persist: bool = True,
) -> Dataset:
    """Read a single BGEN file into a Dataset of dask arrays.

    Parameters
    ----------
    path : PathType
        Path to BGEN file.
    chunks : Union[str, int, tuple], optional
        Chunk size for genotype data, by default "auto"
    lock : bool, optional
        Whether or not to synchronize concurrent reads of
        file blocks, by default False. This is passed through to
        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
    persist : bool, optional
        Whether or not to persist variant information in
        memory, by default True. This is an important performance
        consideration as the metadata file for this data will
        be read multiple times when False.

    Warnings
    --------
    Only bi-allelic, diploid BGEN files are currently supported.
    """
    reader = BgenReader(path, persist)

    # Integer-encode the contig (chromosome) strings as int16 codes plus a
    # lookup table of names.
    contig, contig_names = encode_array(reader.contig.compute())
    contig_names = list(contig_names)
    contig = contig.astype("int16")

    # Materialize the remaining variant/sample metadata as numpy arrays.
    position = np.array(reader.pos, dtype=int)
    alleles = np.array(reader.variant_alleles, dtype="S1")
    ids = np.array(reader.variant_id, dtype=str)
    samples = np.array(reader.sample_id, dtype=str)

    # Wrap the reader so dosages are pulled from disk lazily, block by block.
    dosage = da.from_array(
        reader,
        chunks=chunks,
        lock=lock,
        asarray=False,
        name=f"{reader.name}:read_bgen:{path}",
    )

    return create_genotype_dosage_dataset(
        variant_contig_names=contig_names,
        variant_contig=contig,
        variant_position=position,
        variant_alleles=alleles,
        sample_id=samples,
        call_dosage=dosage,
        variant_id=ids,
    )

sgkit_bgen/tests/__init__.py

Whitespace-only changes.
16.9 KB
Binary file not shown.
Binary file not shown.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
ID
0
s1
s2
s3
s4
s5

0 commit comments

Comments
 (0)