-
Notifications
You must be signed in to change notification settings - Fork 1
/
protein_cutoff.py
executable file
·143 lines (128 loc) · 4.55 KB
/
protein_cutoff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python3
import sys, os, re, argparse, textwrap
from collections import OrderedDict, defaultdict
from itertools import tee, islice, chain, zip_longest
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
class MyFormatter(argparse.RawTextHelpFormatter):
def __init__(self, prog):
super(MyFormatter, self).__init__(prog, max_help_position=48)
parser = argparse.ArgumentParser(
usage='./%(prog)s [options] -o output_basename -p protein.fasta and/or -gf .gff3 and/or -gb .gbk',
description = ''' A script to remove proteins from FASTA, GFF3, or GenBank
format that are shorter than a provided threshold.''',
epilog = """Written by Stephen A. Wyka (2019)""",
formatter_class = MyFormatter)
parser.add_argument(
'-o',
'--out',
required=True,
help = 'Basename of output files',
metavar=''
)
parser.add_argument(
'-p',
'--protein',
help = 'protein file in FASTA format',
metavar=''
)
parser.add_argument(
'-gf',
'--gff3',
help = 'gff3 file',
metavar=''
)
parser.add_argument(
'-c',
'--cutoff',
type=int,
default=50,
help = 'Cutoff for protein length [default = 50]',
metavar=''
)
args=parser.parse_args()
def get_next(some_iterable):
items, nexts = tee(some_iterable, 2)
nexts = chain(islice(nexts, 1, None), [None])
return zip_longest(items, nexts)
# Create output folder and define paths to folders
if not os.path.isdir(os.path.join(currentdir, args.out+'_reformated')):
os.makedirs(os.path.join(currentdir, args.out+'_reformated'))
result_dir = os.path.abspath(os.path.join(currentdir, args.out+'_reformated'))
# Checking arguments and making paths
if args.protein:
protein_file = os.path.abspath(os.path.join(currentdir, args.protein))
reformat_fasta = os.path.abspath(os.path.join(result_dir, args.out+'_reformat.fasta'))
if args.gff3:
gff3_file = os.path.abspath(os.path.join(currentdir, args.gff3))
reformat_gff = os.path.abspath(os.path.join(result_dir, args.out+'_reformat.gff3'))
# Removing proteins from FASTA file
if args.protein:
print('Removing proteins shorter than {} aa in your FASTA file'.format(args.cutoff))
protein_dict=defaultdict(str)
try:
with open(protein_file, 'r') as fasta_file:
name=''
for line in fasta_file:
if line.startswith('>'):
if ' ' in line:
raise
name=line[1:-1]
continue
protein_dict[name]+=line.strip()
except:
print("Error: The script does not like your FASTA file")
with open(reformat_fasta, 'w') as re_fasta:
for key, values in protein_dict.items():
if len(values) >= args.cutoff:
re_fasta.write('>'+key + '\n')
re_fasta.write(textwrap.fill(''.join(protein_dict[key]),width=80) + '\n')
# Removing proteins from GFF3 file
if args.gff3:
print('Removing proteins shorter than {} aa in your GFF3 file'.format(args.cutoff))
line_list = []
group_list = []
gff_list = []
with open(gff3_file, 'r') as gff_file:
for line in gff_file:
if line.startswith('#'):
continue
elif 'region' in line:
continue
elif 'gap' in line:
continue
elif 'pseudo=true' in line:
continue
elif 'tRNA' in line:
continue
else:
gff_list.append(line.strip().split('\t'))
gff_list = list(filter(None, gff_list))
for line, next_line in get_next(gff_list):
if next_line == None:
line_list.append(line)
continue
elif line[2] == 'gene' and next_line[2] != 'gene':
line_list.append(line)
elif line[2] != 'gene' and next_line[2] == 'gene':
line_list.append(line)
group_list.append(line_list)
line_list = []
else:
line_list.append(line)
group_list.append(line_list)
group_list = list(filter(None, group_list))
cut_list = []
for gene in group_list:
lengths = []
for feature in gene:
if feature[2] == 'CDS':
lengths.append(int(feature[4]) - int(feature[3]) + 1 - int(feature[7]))
if sum(lengths) >= args.cutoff*3:
cut_list.append(gene)
with open(reformat_gff, 'w') as re_gff:
re_gff.write('##gff-version 3' + '\n')
for x in cut_list:
for y in x:
re_gff.write('\t'.join(y) + '\n')