-
Notifications
You must be signed in to change notification settings - Fork 0
/
gff_processing.py
executable file
·94 lines (78 loc) · 3.51 KB
/
gff_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
##
import gffutils
import sys
import os
tx_features_names=['mRNA','transcript','lincRNA','snoRNA','tRNA','snRNA','SRP_RNA','lnc_RNA','miRNA','rRNA','ncRNA','ncRNA_gene','pre_miRNA','RNase_MRP_RNA']
gene_features_names=['gene','tRNA_gene','ncRNA_gene','lincRNA_gene','miRNA_gene']
def get_longest_transcripts_only(db,out_gff_path):
#out_gff = open(sys.argv[2], 'w')
out_gff = open(out_gff_path, 'w')
# iterate over all features
for gene in db.features_of_type(gene_features_names):
if 'ID' in gene.attributes.keys():
#print('it has ID attribute')
gene_id=gene.attributes['ID']
else:
gene_id=gene.attributes['gene_id']
gene.attributes['gene_id']=gene_id
#del feature['ID']
# iterate over childs to add gene_id
#if len(db.children(gene, featuretype='mRNA', order_by='start')) > 1:
# print ('MULTIPLE TRANSCRIPTS')
longest_RNA='None'
max_length=0
#longest_mRNA=0
for RNA in db.children(gene, featuretype=tx_features_names):
rna_length=RNA.end - RNA.start
if rna_length > max_length:
longest_RNA=RNA
max_length=rna_length
# now process the longest mRNA
# it could happen that gene is used to define structures other than the classic gene-mRNA-exons. For example gene-tRNAs
# in this case the previous loop wont catch any longest mRNA (the gene has no mRNA childs)
if longest_RNA != 'None':
out_gff.write(str(gene)+'\n') # print the feature with the modified values
longest_RNA.attributes['gene_id']=gene_id
out_gff.write(str(longest_RNA)+'\n')
for exon in db.children(longest_RNA, featuretype='exon', order_by='start'):
exon.attributes['gene_id']=gene_id
out_gff.write(str(exon)+'\n')
#else:
# print gene_id[0] + '\tNo RNA found'
def get_tx2gene_table(db,filename):
out_file = open(filename, 'w')
for gene in db.features_of_type(gene_features_names):
for child in db.children(gene, order_by='start'):
# if child.featuretype=='mRNA' or child.featuretype=='rRNA' or child.featuretype=='lnc_RNA':
if child.featuretype in tx_features_names:
out_file.write(child.id + '\t' + gene.id + '\n')
def process_gff_function(db_name,gff_file,tx2gene_out,longest_tx_gff_out):
if not os.path.isfile(db_name):
db = gffutils.create_db(gff_file, db_name, force=True, merge_strategy='create_unique',disable_infer_transcripts=True)
db = gffutils.FeatureDB(db_name, keep_order=True)
get_tx2gene_table(db,tx2gene_out)
get_longest_transcripts_only(db,longest_tx_gff_out)
#def main():
# parser = optparse.OptionParser()
# #parser.add_option( '-d', '--out_dir', dest='out_dir', action='store', type="string", default=None )
# parser.add_option( '-o', '--out_file', dest='out_file', action='store', type="string", default=None )
# parser.add_option( '-f', '--gff_file', dest='gff_file', action='store', type="string", default=None )
# (options, args) = parser.parse_args()
#
# #gff_file=options.gff_file
# #samples=options.samples_list
#
# #create tmp file to save tx-gene table
#
# #tmp_dir = tempfile.mkdtemp( prefix='tmp-gene-tx-table' )
#
# #gene_tx_table = os.path.join( options.out_dir, 'tx2gene_table' )
#
# get_tx2gen_table(options.gff_file, options.out_file)
#
#
#if __name__ == "__main__":
# main()
#
#
#