Skip to content

Commit 78f789e

Browse files
authored
Merge pull request #160 from hammerlab/fix-issue-158
Fix load_vcf_fast for sample names containing a space character
2 parents 4bb441f + b965d71 commit 78f789e

File tree

4 files changed

+40
-5
lines changed

4 files changed

+40
-5
lines changed

test/data/multiallelic.vcf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212
##contig=<ID=chr15,length=102531392>
1313
##contig=<ID=chr16,length=90354753>
1414
##contig=<ID=chr17,length=81195210>
15-
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
15+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
1616
chr1 1431105 rs199599542 A C,G 593.69 PASS DP=17;GE=Wuzzle GT 0/1

test/data/somatic_hg19_14muts.space_in_sample_name.vcf

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
##fileformat=VCFv4.1
2+
##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3+
##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
4+
##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##contig=<ID=chrM,length=16571>
7+
##contig=<ID=chr1,length=249250621>
8+
##contig=<ID=chr10,length=135534747>
9+
##contig=<ID=chr11,length=135006516>
10+
##contig=<ID=chr12,length=133851895>
11+
##contig=<ID=chr14,length=107349540>
12+
##contig=<ID=chr15,length=102531392>
13+
##contig=<ID=chr16,length=90354753>
14+
##contig=<ID=chr17,length=81195210>
15+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis foo
16+
chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17+
chr1 228295398 . G T . . GE=MRPL55;EG=128308 GT 0/1
18+
chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1
19+
chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1
20+
chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1
21+
chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1
22+
chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1
23+
chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1
24+
chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1
25+
chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1
26+
chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1
27+
chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1
28+
chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1
29+
chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1

test/test_vcf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def test_vcf_reference_name():
7373
def test_pandas_and_pyvcf_implementations_equivalent():
7474
paths = [
7575
{'path': data_path("somatic_hg19_14muts.vcf")},
76+
{'path': data_path("somatic_hg19_14muts.space_in_sample_name.vcf")},
7677
{'path': "/" + data_path("somatic_hg19_14muts.vcf")},
7778
{'path': data_path("somatic_hg19_14muts.vcf.gz")},
7879
{'path': data_path("multiallelic.vcf")},

varcode/vcf.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,8 @@ def load_vcf_fast(
151151
152152
genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
153153
Optionally pass in a PyEnsembl Genome object, name of reference, or
154-
PyEnsembl release version to specify the reference associated with a VCF
155-
(otherwise infer reference from VCF using reference_vcf_key)
154+
PyEnsembl release version to specify the reference associated with a
155+
VCF (otherwise infer reference from VCF using reference_vcf_key)
156156
157157
reference_vcf_key : str, optional
158158
Name of metadata field which contains path to reference FASTA
@@ -469,7 +469,9 @@ def __init__(self, path):
469469
self.path = path
470470
parsed_path = parse_url_or_path(path)
471471
if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
472-
self.vcf_reader = pyvcf.Reader(filename=parsed_path.path)
472+
self.vcf_reader = pyvcf.Reader(
473+
filename=parsed_path.path,
474+
strict_whitespace=True)
473475
elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
474476
self._to_close = response = requests.get(path, stream=True)
475477
response.raise_for_status() # raise error on 404, etc.
@@ -478,7 +480,10 @@ def __init__(self, path):
478480
response.iter_content())
479481
else:
480482
lines = response.iter_lines(decode_unicode=True)
481-
self.vcf_reader = pyvcf.Reader(fsock=lines, compressed=False)
483+
self.vcf_reader = pyvcf.Reader(
484+
fsock=lines,
485+
compressed=False,
486+
strict_whitespace=True)
482487
else:
483488
raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)
484489

0 commit comments

Comments
 (0)