-
Notifications
You must be signed in to change notification settings - Fork 0
/
tpedToVcf.py
executable file
·109 lines (87 loc) · 3.44 KB
/
tpedToVcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
import sys
import os
import string
import re
import bx.seq.twobit
from optparse import OptionParser
from common import grouper
from common import numericalGenotypes
import datetime
def printvcfHeader(tpedfile, tbfile):
today=datetime.datetime.today()
datestr=today.strftime("%m-%d-%y")
#vcf_metainfolines
vcf_metalines=[]
vcf_metalines.append ( "##fileformat=VCFv4.1")
vcf_metalines.append( "##fileDate="+datestr )
vcf_metalines.append("##reference="+tbfile)
vcf_metalines.append("##pedfile="+tpedfile)
vcf_metalines.append("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">")
vcf_metalines.append( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" )
print "\n".join(vcf_metalines)
""" Given a tped/tfam file based on --file basename make a barebones VCF file
A tped/tfam file is generated in PLINK, for more see here: http://pngu.mgh.harvard.edu/~purcell/plink/dataman.shtml#recode
the only field in the format is GT
No qual or filter value is given, only a '.'
"""
def main():
usage = "usage: %prog [options] filebasename"
parser = OptionParser(usage)
parser.add_option("--file", type="string", dest="basename", help="basename of tped/tfam file")
parser.add_option("--twobitfile", type="string", dest="tbf", help="2bit file of reference genome")
(options, args)=parser.parse_args()
try:
sys.stderr.write("opening twobitfile...\n")
twobit=bx.seq.twobit.TwoBitFile( open( options.tbf ) )
except:
sys.stderr.write("unable to open twobit file!\n")
tfamfile=options.basename+".tfam"
tpedfile=options.basename+".tped"
tfamfh=open(tfamfile, 'r')
samplenames=[]
for line in tfamfh:
(fid,iid,pid,mid,sex,pheno)=line.strip().split(' ')
samplenames.append(iid)
samplestring="\t".join(samplenames)
tpedfh=open(tpedfile,'r')
printvcfHeader(options.tbf, tpedfile)
print "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", samplestring])
for line in tpedfh:
fields=line.strip().split(' ')
(chrom, snpid,cM,pos)=fields[0:4]
start=int(pos)-1
end=int(pos)
try:
sequence=twobit[chrom][start:end]
sequence=sequence.upper()
except:
error="unable to fetch sequence from 2bit file!: " + chrom + " " + pos
sys.stderr.write(error + "\n")
exit(1)
refbase=sequence
#print chrom, pos,refbase
genotypes=fields[4::]
if len(genotypes)/2 != len(samplenames):
sys.stderr.write("unequal numbers of genotypes and sample names!\n")
sys.exit(1)
observed_alleles=set(genotypes)
altbases= list( observed_alleles - set(refbase) )
alt='.'
if len(altbases) == 0:
alt='.'
elif len(altbases) > 1:
alt=",".join(altbases )
else:
alt=altbases[0]
metainfo="\t".join([chrom,pos,snpid,refbase,alt,'.','.', 'NS='+str(len(samplenames)),'GT'])
ngenotypes=[]
for genotype in grouper(2, genotypes,'x'):
genostr="".join(list(genotype) )
ngenotypes.append( numericalGenotypes(refbase,alt, genostr) )
#print genotypes
#print ngenotypes
goutput="\t".join(ngenotypes)
print metainfo +"\t" +goutput
if __name__ == "__main__":
main()