Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial version of num records #295

Merged
merged 7 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions cyvcf2/cyvcf2.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from libc.stdint cimport int64_t, int32_t, uint32_t, int8_t, int16_t, uint8_t
from libc.stdint cimport int64_t, uint64_t, int32_t, uint32_t, int8_t, int16_t, uint8_t
import numpy as np
cimport numpy as np
np.import_array()
Expand Down Expand Up @@ -70,15 +70,23 @@ cdef extern from "htslib/hts.h":

hts_idx_t *bcf_index_load(char *fn)
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx);
int hts_idx_nseq(const hts_idx_t *idx);
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped,
uint64_t* unmapped);

#int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data);
void hts_itr_destroy(hts_itr_t *iter);
void hts_idx_destroy(hts_idx_t *idx);

cdef extern from "htslib/tbx.h":
ctypedef struct tbx_conf_t:
pass

# Expose details of tbx_t so that we can access the idx field
ctypedef struct tbx_t:
pass
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this so that Cython "knows about" the idx member of this struct. However, it has two other members - I'm not sure if it's safe to just ignore them? Originally I copied the full struct definition in here, but that seems worse, if it's not needed.

Copy link
Owner

@brentp brentp Jan 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to find documentation of this in cython, but failed. My memory is that you should either declare none of the fields or all of the fields. The definition of tbx_t is simple enough that it is at least it is easy to explicitly list all of them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

tbx_conf_t conf
hts_idx_t *idx
void *dict

tbx_t *tbx_index_load(const char *fn);
tbx_t *tbx_index_load2(const char *fn, const char *fnidx);
Expand Down
69 changes: 61 additions & 8 deletions cyvcf2/cyvcf2.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,14 @@ cdef class VCF(HTSFile):
raise Exception("unable to update to header")

def set_index(self, index_path=""):
# Clear any existing indexes
if self.idx != NULL:
tbx_destroy(self.idx)
self.idx = NULL
if self.hidx != NULL:
hts_idx_destroy(self.hidx)
self.hidx = NULL

if index_path.endswith(".tbi"):
self.idx = tbx_index_load2(to_bytes(self.fname), to_bytes(index_path))
if self.idx != NULL:
Expand Down Expand Up @@ -648,28 +656,73 @@ cdef class VCF(HTSFile):
stdlib.free(sls)
return self._seqlens

cdef _open_index(self):
"""
Try to open an index, if not open already.
"""
if self.hidx == NULL and self.idx == NULL:
if self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
self.hidx = bcf_index_load(self.fname)
else:
self.idx = tbx_index_load(to_bytes(self.fname))

property seqnames:
"list of chromosomes in the VCF"
def __get__(self):
if len(self._seqnames) > 0: return self._seqnames
cdef char **cnames
cdef const char **cnames
cdef int i, n = 0
cnames = bcf_hdr_seqnames(self.hdr, &n)
if n == 0 and self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
if self.hidx == NULL:
self.hidx = bcf_index_load(self.fname)
if n == 0:
self._open_index()
if self.hidx != NULL:
cnames = bcf_index_seqnames(self.hidx, self.hdr, &n)
elif n == 0:
if self.idx == NULL:
self.idx = tbx_index_load(to_bytes(self.fname))
if self.idx !=NULL:
cnames = tbx_seqnames(self.idx, &n)

self._seqnames = [cnames[i].decode() for i in range(n)]
stdlib.free(cnames)
return self._seqnames

cdef _num_records(self):
cdef uint64_t total, records, v;
cdef int ret, tid, nseq;
cdef hts_idx_t *idx = NULL;

self._open_index()
if self.hidx != NULL:
idx = self.hidx
assert self.idx == NULL
if self.idx != NULL:
idx = self.idx.idx
assert self.hidx == NULL

if idx == NULL:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought a bit about this, and since you need an index to compute this, it seems best to make it a hard error so that the user is alerted. Happy to discuss of course (also the type of the exception is debateable)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine with me. You could just document in num_records docstring that is will raise a ValueError if index is not found.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

raise ValueError(
"File must be indexed to compute num_records (tip: use bcftools index)")

nseq = hts_idx_nseq(idx)
total = 0;
for tid in range(nseq):
# NOTE: the return value here doesn't seem to indicate an error
# condition, and correct values are computed when it returns < 0.
# bcftools index -n doesn't strictly check the output.
hts_idx_get_stat(idx, tid, &records, &v);
total += records
return total

property num_records:
"""
The number of VCF records in the file, computed from the index.
If the file is not indexed (or an index has not been set using
``set_index``) a ValueError is raised.

Note that incorrect values may be returned if a mismatched
index file (i.e., the index for a different VCF file) is used.
This is not detected as an error condition.
"""
def __get__(self):
return self._num_records()

def plot_relatedness(self, riter):
import pandas as pd
from matplotlib import pyplot as plt
Expand Down
3 changes: 1 addition & 2 deletions cyvcf2/helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@

int as_gts(int32_t *gts, int num_samples, int ploidy, int strict_gt, int HOM_ALT, int UNKNOWN) {
int j = 0, i, k;
int missing= 0, found=0;
int missing= 0;
for (i = 0; i < ploidy * num_samples; i += ploidy){
missing = 0;
found = 0;
for (k = 0; k < ploidy; k++) {
if bcf_gt_is_missing(gts[i+k]) {
missing += 1;
Expand Down
Binary file added cyvcf2/tests/multi-contig.bcf
Binary file not shown.
Binary file added cyvcf2/tests/multi-contig.bcf.csi
Binary file not shown.
Binary file added cyvcf2/tests/multi-contig.vcf.gz
Binary file not shown.
Binary file added cyvcf2/tests/multi-contig.vcf.gz.csi
Binary file not shown.
Binary file added cyvcf2/tests/multi-contig.vcf.gz.tbi
Binary file not shown.
76 changes: 75 additions & 1 deletion cyvcf2/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,7 @@ def test_genotypes():
[0, 0, 1, 1],
[1, 1, 0, 0],
[1, 1, 0, 0],
]
]

strict_exp_num = [x[:] for x in non_strict_exp_num]
strict_exp_num[1] = [0, 0, 2, 0] # both unknown
Expand Down Expand Up @@ -1336,3 +1336,77 @@ def test_issue17_no_gt():
with pytest.raises(Exception):
for v in vcf:
v.num_called # Used to give segmentation fault


@pytest.mark.parametrize("path", [
"test.vcf.gz",
"test-multiallelic-homozygous-alt.vcf.gz",
"test-strict-gt-option-flag.vcf.gz",
"test-strict-gt-option-flag.vcf.gz",
"multi-contig.vcf.gz",
"multi-contig.bcf",
"test.snpeff.bcf",
])
def test_num_records_indexed(path):
vcf = VCF(os.path.join(HERE, path))
n = len(list(vcf))
assert n == vcf.num_records
vcf = VCF(os.path.join(HERE, path))
assert n == vcf.num_records

@pytest.mark.parametrize("suffix", ["csi", "tbi"])
def test_num_records_indexed_csi_tabix(suffix):
path = "multi-contig.vcf.gz"
index_file = os.path.join(HERE, "multi-contig.vcf.gz.{}".format(suffix))
vcf = VCF(os.path.join(HERE, path))
n = len(list(vcf))
# Explicitly set the index
vcf.set_index(index_file)
assert n == vcf.num_records
vcf = VCF(os.path.join(HERE, path))
vcf.set_index(index_file)
assert n == vcf.num_records

def test_num_records_set_index_multiple_times():
path = os.path.join(HERE, "multi-contig.vcf.gz")
csi_index = path + ".csi"
tbi_index = path + ".tbi"
vcf = VCF(path)
n = len(list(vcf))
assert n == vcf.num_records
vcf.set_index(csi_index)
assert n == vcf.num_records

vcf = VCF(path)
assert n == vcf.num_records
vcf.set_index(tbi_index)
assert n == vcf.num_records

vcf = VCF(path)
vcf.set_index(csi_index)
assert n == vcf.num_records

vcf = VCF(path)
for _ in range(10):
vcf.set_index(csi_index)
assert n == vcf.num_records
vcf.set_index(tbi_index)
assert n == vcf.num_records

def test_num_records_set_wrong_index():
path = os.path.join(HERE, "multi-contig.vcf.gz")
index = os.path.join(HERE, "test.vcf.gz.tbi")
vcf = VCF(path)
vcf.set_index(index)
# We compute the number of records from the index, and don't report an
# error
assert vcf.num_records == 115
assert vcf.num_records != len(list(vcf))

@pytest.mark.parametrize("path", [
"test-genotypes.vcf",
])
def test_num_records_no_index(path):
vcf = VCF(os.path.join(HERE, path))
with pytest.raises(ValueError, match="must be indexed"):
vcf.num_records