-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial version of num records #295
Changes from all commits
191654e
3b12e43
df1b90b
417bc9b
01288c8
c78c869
e7ed941
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -405,6 +405,14 @@ cdef class VCF(HTSFile): | |
raise Exception("unable to update to header") | ||
|
||
def set_index(self, index_path=""): | ||
# Clear any existing indexes | ||
if self.idx != NULL: | ||
tbx_destroy(self.idx) | ||
self.idx = NULL | ||
if self.hidx != NULL: | ||
hts_idx_destroy(self.hidx) | ||
self.hidx = NULL | ||
|
||
if index_path.endswith(".tbi"): | ||
self.idx = tbx_index_load2(to_bytes(self.fname), to_bytes(index_path)) | ||
if self.idx != NULL: | ||
|
@@ -648,28 +656,73 @@ cdef class VCF(HTSFile): | |
stdlib.free(sls) | ||
return self._seqlens | ||
|
||
cdef _open_index(self): | ||
""" | ||
Try to open an index, if not open already. | ||
""" | ||
if self.hidx == NULL and self.idx == NULL: | ||
if self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')): | ||
self.hidx = bcf_index_load(self.fname) | ||
else: | ||
self.idx = tbx_index_load(to_bytes(self.fname)) | ||
|
||
property seqnames: | ||
"list of chromosomes in the VCF" | ||
def __get__(self): | ||
if len(self._seqnames) > 0: return self._seqnames | ||
cdef char **cnames | ||
cdef const char **cnames | ||
cdef int i, n = 0 | ||
cnames = bcf_hdr_seqnames(self.hdr, &n) | ||
if n == 0 and self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')): | ||
if self.hidx == NULL: | ||
self.hidx = bcf_index_load(self.fname) | ||
if n == 0: | ||
self._open_index() | ||
if self.hidx != NULL: | ||
cnames = bcf_index_seqnames(self.hidx, self.hdr, &n) | ||
elif n == 0: | ||
if self.idx == NULL: | ||
self.idx = tbx_index_load(to_bytes(self.fname)) | ||
if self.idx !=NULL: | ||
cnames = tbx_seqnames(self.idx, &n) | ||
|
||
self._seqnames = [cnames[i].decode() for i in range(n)] | ||
stdlib.free(cnames) | ||
return self._seqnames | ||
|
||
cdef _num_records(self): | ||
cdef uint64_t total, records, v; | ||
cdef int ret, tid, nseq; | ||
cdef hts_idx_t *idx = NULL; | ||
|
||
self._open_index() | ||
if self.hidx != NULL: | ||
idx = self.hidx | ||
assert self.idx == NULL | ||
if self.idx != NULL: | ||
idx = self.idx.idx | ||
assert self.hidx == NULL | ||
|
||
if idx == NULL: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought a bit about this, and since you need an index to compute this, it seems best to make it a hard error so that the user is alerted. Happy to discuss of course (also the type of the exception is debateable) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fine with me. You could just document in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
raise ValueError( | ||
"File must be indexed to compute num_records (tip: use bcftools index)") | ||
|
||
nseq = hts_idx_nseq(idx) | ||
total = 0; | ||
for tid in range(nseq): | ||
# NOTE: the return value here doesn't seem to indicate an error | ||
# condition, and correct values are computed when it returns < 0. | ||
# bcftools index -n doesn't strictly check the output. | ||
hts_idx_get_stat(idx, tid, &records, &v); | ||
total += records | ||
return total | ||
|
||
property num_records: | ||
""" | ||
The number of VCF records in the file, computed from the index. | ||
If the file is not indexed (or an index has not been set using | ||
``set_index``) a ValueError is raised. | ||
|
||
Note that incorrect values may be returned if a mismatched | ||
index file (i.e., the index for a different VCF file) is used. | ||
This is not detected as an error condition. | ||
""" | ||
def __get__(self): | ||
return self._num_records() | ||
|
||
def plot_relatedness(self, riter): | ||
import pandas as pd | ||
from matplotlib import pyplot as plt | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added this so that Cython "knows about" the idx member of this struct. However, it has two other members - I'm not sure if it's safe to just ignore them? Originally I copied the full struct definition in here, but that seems worse, if it's not needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried to find documentation of this in cython, but failed. My memory is that you should either declare none of the fields or all of the fields. The definition of
tbx_t
is simple enough that it is at least it is easy to explicitly list all of them.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed