# get_variable_protein_site_sub.py
import os
import sys
import getopt
import glob
import pandas as pd
from Bio import SeqIO
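
#Summary: read every Clustal alignment (default extension ".alnclw") from the
#input directory with Biopython, load it into a pandas DataFrame with one column
#per sequence and one row per aligned site, drop sites containing gap/junk
#characters, report which sites are variable, and optionally write them to a
#per-alignment CSV.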
#parameters of the script
script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + '/'
in_dir = script_dir
out_dir = script_dir + r"get_variable_output/"
exclude_list = ['-', 'X'] #list of garbage characters or gaps
drop_exclude = True #whether to remove positions containing values in the list
write_output = False
input_filetype = r'.alnclw'
input_files = [
]
#/parameters
os.chdir(script_dir)
print(os.getcwd())
def usage():
    print('usage: get_variable_protein_site_sub.py [-h] [-w] [-i in_dir] [-o out_dir] [-f extension] [input files]')
    sys.exit(0)
#grab a df from a clustal formatted alignment file
def import_data(filename, extension):
    if extension == ".alnclw":
        format = "clustal"
    else:
        raise ValueError('unsupported alignment extension: ' + extension)
    os.chdir(in_dir)
    data_dict = {}
    with open(filename, "r") as handle:  #"rU" mode was removed in Python 3.11; plain "r" is equivalent here
        data_list = list(SeqIO.parse(handle, format))
        for entry in data_list:
            data_dict.update({entry.id: list(entry.seq)})
    #print(data_dict)
    df = pd.DataFrame(data_dict)  #one column per sequence, one row per aligned site
    df.name = filename
    # print(df)
    return df
#return list of rows w/ variable sites
def find_variable(df, exclude_sites):
    #run a lambda to find invariant rows, prepend the resulting bool column to df
    not_variable = df.apply(lambda x: len(x[~x.isnull()].unique()) == 1, axis=1)
    not_variable.name = 'not_variable'
    df = pd.concat([not_variable, df], axis=1)
    var_rows = []
    for row_num in df.index:
        row = df.iloc[row_num]
        #print(row.loc['not_variable'])
        if row.loc['not_variable'] == False:
            var_rows.append(row_num)
    for x in exclude_sites:
        if x in var_rows:
            var_rows.remove(x)
    return var_rows
#return list of rows containing gaps, junk, whatever.
def find_exclude(df):
    #ugly way of doing this. if you don't want to exclude rows, returns empty
    if drop_exclude:
        #run a lambda to flag rows containing any character from exclude_list
        exclude_char = df.apply(lambda x: x.isin(exclude_list).any(), axis=1)
        exclude_char.name = 'exclude'
        df = pd.concat([exclude_char, df], axis=1)
        print('gaps/indels/junk:', df['exclude'].any())
        excluded_rows = []
        for row_num in df.index:
            row = df.iloc[row_num]
            if row.loc['exclude']:
                excluded_rows.append(row_num)
    else:
        excluded_rows = []
    return excluded_rows
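
#Rough sketch of how the two helpers above behave on a toy alignment
#(hypothetical data; one column per sequence and one row per aligned site,
# matching the DataFrame that import_data builds):
#   >>> toy = pd.DataFrame({'seqA': list('MKT-'), 'seqB': list('MRT-'), 'seqC': list('MKTX')})
#   >>> find_exclude(toy)        #row 3 contains '-'/'X' -> [3]
#   >>> find_variable(toy, [3])  #row 1 varies (K vs R); row 3 is excluded -> [1]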
def percent_variable(df, var_sites):
    #print(df)
    #print(var_sites)
    var_percent = (float(len(var_sites))/df.shape[0])*100
    print(df.name, ' %var:', var_percent)
    return var_percent
#helper for joining per-gene alignments; not called by the main loop below
def concat_each_gene(data_frames):
    concat_genes = []
    for df in data_frames:
        series_each_concat = (df.sum(axis=1, skipna=False).astype(str))
        series_each_concat.name = df.name
        concat_genes.append(series_each_concat)
    df_each_gene_concat = pd.DataFrame(index=concat_genes[0].index)
    for series in concat_genes:
        df_each_gene_concat[series.name] = series.values
    print(df_each_gene_concat)
    return df_each_gene_concat
    #assert 0  #leftover debug guard; unreachable after the return above
def build_csv(df, var_sites, write_output):
    if write_output:
        os.chdir(out_dir)
    df_var_sites = df.iloc[var_sites]
    df_var_sites = df_var_sites.transpose()
    print(df.name)
    df_var_sites.name = df.name
    positions = [x+1 for x in list(df_var_sites.columns.values)]
    df_var_sites.columns = positions
    if write_output:
        if input_filetype == ".alnclw":
            output_filename = 'df_'+df.name[:-7]+'_var.csv'
            df_var_sites.to_csv(output_filename)
            print('written to:' + out_dir + output_filename)
    else:
        print(df_var_sites)
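
#With -w, the CSV written above should come out with one row per sequence id
#and one column per variable site, columns labelled by the site's 1-based
#alignment position.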
opts, args = [], []  #fall back to no options if argument parsing fails
try:
    opts, args = getopt.getopt(sys.argv[1:], "hwf:i:o:")
    print(opts)
except getopt.GetoptError:
    print('arguments skipped')
for opt, arg in opts:
    if opt == '-h': #help
        usage()
    elif opt == '-w': #save output
        write_output = True
    elif opt == '-i': #input directory
        in_dir = os.path.join(script_dir, arg)
        print(in_dir)
    elif opt == '-o': #output directory, must be a subdir of current directory
        out_dir = os.path.join(script_dir, arg)
        print(out_dir)
    elif opt == "-f": #file extension
        input_filetype = arg
if write_output and not os.path.exists(out_dir):
    os.makedirs(out_dir)
if len(args) > 0:
    input_files = args
#uncomment this else to require input
#else:
#    usage()
#get all files with the filetype extension in the input directory
input_files_absolute = glob.glob(in_dir + "*" + input_filetype)
for path in input_files_absolute:
    input_files.append(os.path.basename(path))
print(input_files)
for filename in input_files:
    df = import_data(filename, input_filetype)
    exclude_sites = find_exclude(df)
    var_sites = find_variable(df, exclude_sites)
    var_percent = percent_variable(df, var_sites)
    build_csv(df, var_sites, write_output)
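
#Example invocations (the directory names below are hypothetical placeholders):
#   python get_variable_protein_site_sub.py                      #print variable sites for every .alnclw next to the script
#   python get_variable_protein_site_sub.py -w -i aln/ -o out/   #read alignments from ./aln/ and write CSVs to ./out/
#   python get_variable_protein_site_sub.py -h                   #print the usage line and exit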