forked from Eswar1997/VCF_TO_PLINK
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_to_plink.py
123 lines (113 loc) · 5.54 KB
/
vcf_to_plink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#Import required packages
import sys,getopt
import os
import pandas as pd
import numpy as np
import re
import time
##################################Map file conversion fumction########################################
def vcf_to_map(vcf_file,output_file_name):
map_file=pd.DataFrame(columns=["CHR","BP","ID"])
ext_file=pd.DataFrame(columns=["CHR","BP","ID","REF","ALT"])
map_file=vcf_file.iloc[:,0:3].copy()
#Uncomment the following line if you want the allele information in a .ext file
#ext_file=vcf_file.iloc[:,0:5].copy()
map_file['DIST'] = int('0')
map_file.columns = ['CHR','BP','ID','DIST']
map_file = map_file.reindex(columns=['CHR','ID','DIST','BP'])
map_file = map_file.replace(to_replace ='chr', value = '', regex=True)
map_file.to_csv(output_file_name+'.map',sep="\t",index=False,header=False)
#Uncomment the following line for .ext file
#ext_file.to_csv(output_file_name+'.ext',sep="\t",index=False,header=True)
print("First fewlines of mapfile is:")
print(map_file.head())
print("Map file created successfully")
############################################Ped file Conversion###########################################
def vcf_to_ped(vcf_file,output_file_name):
vcf_file_reqd = vcf_file.iloc[:,9:].copy()
vcf_file_trans = vcf_file_reqd.transpose()
columns=len(vcf_file_trans.columns)
rows=vcf_file_trans.shape[0]
#print(columns)
#print(rows)
ped_file=vcf_file_trans.copy()
cols=ped_file.columns
prev_len=len(cols)
print("Number of columns in pedfile should be 6+2*"+str(prev_len)+"=",(6+(2*prev_len)))
ped_file.index.name = 'FID'
ped_file.reset_index(inplace=True)
ped_file['IID']=ped_file['FID']
ped_file['PID']=ped_file['MID']=ped_file['Sex']=str(0)
ped_file['Pheno']=str(-9)
cols = list(ped_file)
cols.insert(1, cols.pop(cols.index('IID')))
cols.insert(2, cols.pop(cols.index('PID')))
cols.insert(3, cols.pop(cols.index('MID')))
cols.insert(4, cols.pop(cols.index('Sex')))
cols.insert(5, cols.pop(cols.index('Pheno')))
ped_file = ped_file.loc[:, cols]
#print(ped_file.head())
###########################Convering genotypes according to plink formats##############################
ped_file=ped_file.replace({r'^\.\/\.*':'0/1',r'^1/1*':'2/2',r'^0/0*':'0/0',r'^0/1*':'1/0'},regex = True)
ped_file=ped_file.astype(dtype=str, copy=True, errors='raise')
cols=ped_file.columns
ped_file= pd.concat([ped_file[x].str.split('/', expand=True) for x in cols], axis=1, keys=ped_file.columns)
ped_file.columns = ped_file.columns.map(lambda x: '_'.join((str(x[0]), str(x[1]))))
#ped_file = ped_file.replace({r':*':''},regex=True)
#pedfile_updated=ped_file.replace(to_replace =r'\1:*', value = '', regex = True)
#ped_file_updated = ped_file(lambda x:':'.join(str(x[0]).split(':')[:-1]))
ped_file=ped_file.replace({r'^0:*': 0,r'^1:*':1,r'^2:*':2}, regex = True)
#ped_file=ped_file.replace(to_replace =r'^1:*', value = 1, regex = True)
#ped_file=ped_file.replace(to_replace =r'^2:*', value = 2, regex = True)
#print(ped_file.head())
print("Total columns in ped files:" ,ped_file.shape[1])
ped_file.to_csv(output_file_name+'.ped',sep="\t",index=False,header=False)
#######################The convert vcf to ped and map controlling function#############################
def convert_vcf(input_file_name,output_file_name):
#print("VCF file is :" +input_file_name)
#print("Plink formatted file is :" +output_file_name)
pattern = re.compile("^#CHROM")
for i, line in enumerate(open(input_file_name)):
for match in re.finditer(pattern, line):
#Uncomment the following line if you want to know where the vcf header is ending
print ('VCF headers till line %s: %s' %(i+1, match.group()))
start_at=int(i)
vcf_file = pd.read_csv(input_file_name,header = start_at,sep='\s',na_filter = True,skip_blank_lines= True, engine='python')
#print(vcf_file.head())
print("Number of markers in the given vcf file is ",vcf_file.shape[0])
print("Number of samples in the given vcf file is ",vcf_file.shape[1])
pre= time.perf_counter()
vcf_to_map(vcf_file,output_file_name)
post= time.perf_counter()
print("Time to generate mapfile:"+str(post-pre)+"s")
pre= time.perf_counter()
vcf_to_ped(vcf_file,output_file_name)
post= time.perf_counter()
print("Time to generate pedfile:"+str(post-pre)+"s")
##############################Python main function - getting command line input#############################
def main(argv):
print("VCF to PLINK format converter")
print("Written by Alagu Sankareswaran. Suggestions are welcome.")
print("EmailID: [email protected] / [email protected]")
print("Provide only file name. Extension will be taken care automatically")
input_file_name = ''
output_file_name= ''
try:
opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
except getopt.GetoptError:
print ('vcf_to_plink.py -i <vcf_file_name> -o <plink_format_file_name>')
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('vcf_to_plink.py -i <vcf_file_name> -o <plink_format_file_name>')
sys.exit()
elif opt in ("-i", "--ifile"):
input_file_name = arg
elif opt in ("-o","--ofile"):
output_file_name = arg
print('VCF file name is "', input_file_name)
print('Plink file name is "', output_file_name)
input_file_name = input_file_name +".vcf"
convert_vcf(input_file_name,output_file_name)
if __name__ == "__main__":
main(sys.argv[1:])