-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsum_isoform_expression_per_gene.py
54 lines (50 loc) · 2.03 KB
/
sum_isoform_expression_per_gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
'''
This script takes the isoforms.results output of RSEM and creates a gene-level results file with each 'gene' level abundance
taken as the sum across member 'transcripts'
usage: python average_isoform_expression_per_gene.py <input.isoforms.results>
'''
from __future__ import division
import os
import sys
import numpy
def get_gene_list(infile):
genes = []
with open(infile, 'r') as iso_input:
next(iso_input)
for line in iso_input:
gene = line.split('\t')[1]
gene = '_'.join(gene.split('_')[:-1])
if gene not in genes:
genes.append(gene)
return genes
def average_expression(gene, infile, output):
lengths = []
e_lengths = []
counts = []
TPMs = []
FPKMs = []
isos = []
with open(infile, 'r') as iso_input:
for line in iso_input:
vals = line.split('\t')
trans_id,gene_id,length,effective_length,expected_count,TPM,FPKM,IsoPct = vals[0:8]
gene_id = '_'.join(gene_id.split('_')[:-1])
if gene == gene_id:
isos.append(trans_id)
length_w = float(length)*float(IsoPct)/100
e_length_w = float(effective_length)*float(IsoPct)/100
lengths.append(length_w)
e_lengths.append(e_length_w)
counts.append(float(expected_count))
TPMs.append(float(TPM))
FPKMs.append(float(FPKM))
gene_out = gene +'\t'+','.join(isos)+'\t'+str(round(numpy.sum(lengths),2))+'\t'+str(round(numpy.sum(e_lengths),2))+'\t'+str(round(numpy.sum(counts),2))+'\t'+str(round(numpy.sum(TPMs),2))+'\t'+str(round(numpy.sum(FPKMs),2))+'\n'
output.write(gene_out)
if __name__ == "__main__":
infile = sys.argv[1]
output = open(infile.split('.')[0]+'.genes.results', 'w')
output.write('gene_id\ttranscript_id(s)\tlength\teffective_length\texpected_count\tTPM\tFPKM\n')
gene_list = get_gene_list(infile)
for gene in gene_list:
average_expression(gene, infile, output)