brentp · brentp · Jan 19, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 19, 2024
diff --git a/cyvcf2/cyvcf2.pxd b/cyvcf2/cyvcf2.pxd
@@ -1,4 +1,4 @@
-from libc.stdint cimport int64_t, int32_t, uint32_t, int8_t, int16_t, uint8_t
+from libc.stdint cimport int64_t, uint64_t, int32_t, uint32_t, int8_t, int16_t, uint8_t
 import numpy as np
 cimport numpy as np
 np.import_array()
@@ -70,15 +70,23 @@ cdef extern from "htslib/hts.h":
 
     hts_idx_t *bcf_index_load(char *fn)
     hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx);
+    int hts_idx_nseq(const hts_idx_t *idx);
+    int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped,
+            uint64_t* unmapped);
 
     #int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data);
     void hts_itr_destroy(hts_itr_t *iter);
     void hts_idx_destroy(hts_idx_t *idx);
 
 cdef extern from "htslib/tbx.h":
+    ctypedef struct tbx_conf_t:
+        pass
 
+    # Expose details of tbx_t so that we can access the idx field
     ctypedef struct tbx_t:
-        pass
+        tbx_conf_t conf
+        hts_idx_t *idx
+        void *dict
 
     tbx_t *tbx_index_load(const char *fn);
     tbx_t *tbx_index_load2(const char *fn, const char *fnidx);

diff --git a/cyvcf2/cyvcf2.pyx b/cyvcf2/cyvcf2.pyx
@@ -405,6 +405,14 @@ cdef class VCF(HTSFile):
             raise Exception("unable to update to header")
 
     def set_index(self, index_path=""):
+        # Clear any existing indexes
+        if self.idx != NULL:
+            tbx_destroy(self.idx)
+            self.idx = NULL
+        if self.hidx != NULL:
+            hts_idx_destroy(self.hidx)
+            self.hidx = NULL
+
         if index_path.endswith(".tbi"):
             self.idx = tbx_index_load2(to_bytes(self.fname), to_bytes(index_path))
             if self.idx != NULL:
@@ -648,28 +656,73 @@ cdef class VCF(HTSFile):
             stdlib.free(sls)
             return self._seqlens
 
+    cdef _open_index(self):
+        """Try to load an index for ``self.fname`` unless one is already open."""
+        if self.hidx != NULL or self.idx != NULL:
+            return
+        # .bcf/.bcf.gz names go through bcf_index_load, anything else via tabix.
+        if self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
+            self.hidx = bcf_index_load(self.fname)
+        else:
+            self.idx = tbx_index_load(to_bytes(self.fname))
+
     property seqnames:
         "list of chromosomes in the VCF"
         def __get__(self):
             if len(self._seqnames) > 0: return self._seqnames
-            cdef char **cnames
+            cdef const char **cnames
             cdef int i, n = 0
             cnames = bcf_hdr_seqnames(self.hdr, &n)
-            if n == 0 and self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
-                if self.hidx == NULL:
-                    self.hidx = bcf_index_load(self.fname)
+            if n == 0:
+                self._open_index()
                 if self.hidx != NULL:
                     cnames = bcf_index_seqnames(self.hidx, self.hdr, &n)
-            elif n == 0:
-                if self.idx == NULL:
-                    self.idx = tbx_index_load(to_bytes(self.fname))
                 if self.idx !=NULL:
                     cnames = tbx_seqnames(self.idx, &n)
-
             self._seqnames = [cnames[i].decode() for i in range(n)]
             stdlib.free(cnames)
             return self._seqnames
 
+    cdef _num_records(self):
+        """Sum the index's per-sequence record counts; ValueError if unindexed."""
+        cdef uint64_t records, v, total = 0
+        cdef int tid, nseq
+        cdef hts_idx_t *idx = NULL
+
+        self._open_index()
+        if self.hidx != NULL:
+            idx = self.hidx
+            assert self.idx == NULL
+        if self.idx != NULL:
+            idx = self.idx.idx
+            assert self.hidx == NULL
+
+        if idx == NULL:
+            raise ValueError(
+                "File must be indexed to compute num_records (tip: use bcftools index)")
+
+        nseq = hts_idx_nseq(idx)
+        for tid in range(nseq):
+            # NOTE: the return value is not checked (it can be < 0 alongside a
+            # correct count); zero records first in case nothing gets written.
+            records = 0
+            hts_idx_get_stat(idx, tid, &records, &v)
+            total += records
+        return total
+
+    property num_records:
+        """
+        The number of VCF records in the file, computed from the index.
+        If the file is not indexed (or an index has not been set using
+        ``set_index``), a ValueError is raised.
+
+        Note that incorrect values may be returned if a mismatched
+        index file (i.e., the index for a different VCF file) is used.
+        This is not detected as an error condition.
+        """
+        def __get__(self):
+            return self._num_records()
+
     def plot_relatedness(self, riter):
         import pandas as pd
         from matplotlib import pyplot as plt

diff --git a/cyvcf2/helpers.c b/cyvcf2/helpers.c
@@ -4,10 +4,9 @@
 
 int as_gts(int32_t *gts, int num_samples, int ploidy, int strict_gt, int HOM_ALT, int UNKNOWN) {
     int j = 0, i, k;
-    int missing= 0, found=0;
+    int missing= 0;
     for (i = 0; i < ploidy * num_samples; i += ploidy){
         missing = 0;
-    found = 0;
         for (k = 0; k < ploidy; k++) {
             if bcf_gt_is_missing(gts[i+k])  {
                 missing += 1;

diff --git a/cyvcf2/tests/multi-contig.bcf b/cyvcf2/tests/multi-contig.bcf
diff --git a/cyvcf2/tests/multi-contig.bcf.csi b/cyvcf2/tests/multi-contig.bcf.csi
diff --git a/cyvcf2/tests/multi-contig.vcf.gz b/cyvcf2/tests/multi-contig.vcf.gz
diff --git a/cyvcf2/tests/multi-contig.vcf.gz.csi b/cyvcf2/tests/multi-contig.vcf.gz.csi
diff --git a/cyvcf2/tests/multi-contig.vcf.gz.tbi b/cyvcf2/tests/multi-contig.vcf.gz.tbi
diff --git a/cyvcf2/tests/test_reader.py b/cyvcf2/tests/test_reader.py
@@ -1296,7 +1296,7 @@ def test_genotypes():
     [0, 0, 1, 1],
     [1, 1, 0, 0],
     [1, 1, 0, 0],
-    ] 
+    ]
 
     strict_exp_num = [x[:] for x in non_strict_exp_num]
     strict_exp_num[1] = [0, 0, 2, 0] # both unknown
@@ -1336,3 +1336,77 @@ def test_issue17_no_gt():
     with pytest.raises(Exception):
         for v in vcf:
             v.num_called  # Used to give segmentation fault
+
+
+@pytest.mark.parametrize("path", [
+    "test.vcf.gz",
+    "test-multiallelic-homozygous-alt.vcf.gz",
+    "test-strict-gt-option-flag.vcf.gz",
+    "test-strict-gt-option-flag.vcf.gz",
+    "multi-contig.vcf.gz",
+    "multi-contig.bcf",
+    "test.snpeff.bcf",
+    ])
+def test_num_records_indexed(path):
+    vcf = VCF(os.path.join(HERE, path))
+    n = len(list(vcf))
+    assert n == vcf.num_records
+    vcf = VCF(os.path.join(HERE, path))
+    assert n == vcf.num_records
+
+@pytest.mark.parametrize("suffix", ["csi", "tbi"])
+def test_num_records_indexed_csi_tabix(suffix):
+    path = "multi-contig.vcf.gz"
+    index_file = os.path.join(HERE, "multi-contig.vcf.gz.{}".format(suffix))
+    vcf = VCF(os.path.join(HERE, path))
+    n = len(list(vcf))
+    # Explicitly set the index
+    vcf.set_index(index_file)
+    assert n == vcf.num_records
+    vcf = VCF(os.path.join(HERE, path))
+    vcf.set_index(index_file)
+    assert n == vcf.num_records
+
+def test_num_records_set_index_multiple_times():
+    path = os.path.join(HERE, "multi-contig.vcf.gz")
+    csi_index = path + ".csi"
+    tbi_index = path + ".tbi"
+    vcf = VCF(path)
+    n = len(list(vcf))
+    assert n == vcf.num_records
+    vcf.set_index(csi_index)
+    assert n == vcf.num_records
+
+    vcf = VCF(path)
+    assert n == vcf.num_records
+    vcf.set_index(tbi_index)
+    assert n == vcf.num_records
+
+    vcf = VCF(path)
+    vcf.set_index(csi_index)
+    assert n == vcf.num_records
+
+    vcf = VCF(path)
+    for _ in range(10):
+        vcf.set_index(csi_index)
+        assert n == vcf.num_records
+        vcf.set_index(tbi_index)
+        assert n == vcf.num_records
+
+def test_num_records_set_wrong_index():
+    path = os.path.join(HERE, "multi-contig.vcf.gz")
+    index = os.path.join(HERE, "test.vcf.gz.tbi")
+    vcf = VCF(path)
+    vcf.set_index(index)
+    # We compute the number of records from the index, and don't report an
+    # error
+    assert vcf.num_records == 115
+    assert vcf.num_records != len(list(vcf))
+
+@pytest.mark.parametrize("path", [
+    "test-genotypes.vcf",
+    ])
+def test_num_records_no_index(path):
+    vcf = VCF(os.path.join(HERE, path))
+    with pytest.raises(ValueError, match="must be indexed"):
+        vcf.num_records