#!/usr/bin/env python
# parse_gtf.py
# You can refer to the help manual by `python parse_gtf.py -h`
# argparse is a library that allows you to make user-friendly command line interfaces
import argparse
import csv
def _parse_attributes(attribute_field):
    """Return the key/value pairs of a GTF attribute column as a dict.

    The column is a ';'-separated list of ``key "value"`` entries. Entries
    without a value are ignored, surrounding double quotes are removed from
    each value, and a repeated key keeps its last value.

    NOTE: a ';' inside a quoted value is still treated as a separator.
    """
    attributes = {}
    for entry in attribute_field.split(';'):
        # split(None, 1) tolerates any run of whitespace between key and value.
        parts = entry.strip().split(None, 1)
        if len(parts) == 2:
            key, value = parts
            attributes[key] = value.strip('"')
    return attributes


def parse_gtf(input_file, output_file):
    """Write the gene_id/gene_name pair of every gene record in a GTF to a CSV.

    Args:
        input_file: Path of the GTF file to read.
        output_file: Path of the comma-separated file to create. It starts
            with a ``gene_id,gene_name`` header row, followed by one row per
            record whose feature column is ``gene`` and whose attributes
            contain non-empty ``gene_id`` and ``gene_name`` values.

    Comment lines (anything starting with '#'), blank lines and records with
    fewer than nine tab-separated columns are skipped instead of raising
    IndexError.
    """
    with open(input_file, 'r') as gtf_file, open(output_file, 'w', newline='') as output:
        writer = csv.writer(output, delimiter=',')
        writer.writerow(['gene_id', 'gene_name'])

        for line in gtf_file:
            # '#' covers both '##' header lines and single-'#' pragmas/comments.
            if line.startswith('#') or not line.strip():
                continue

            columns = line.strip().split('\t')
            # Column 3 is the feature type and column 9 the attributes, so a
            # usable record needs all nine columns.
            if len(columns) < 9 or columns[2] != 'gene':
                continue

            attributes = _parse_attributes(columns[8])
            gene_id = attributes.get('gene_id')
            gene_name = attributes.get('gene_name')

            # Only emit complete pairs; genes lacking either value are dropped.
            if gene_id and gene_name:
                writer.writerow([gene_id, gene_name])
# Command-line interface: both flags are mandatory, and each value is stored
# on the parsed namespace under its `dest` name ("input" and "output").
parser = argparse.ArgumentParser()

# -i / --input: the GTF file to parse.
parser.add_argument(
    "-i",
    "--input",
    dest="input",
    required=True,
    help='The input file specified will be the GTF file provided by snakemake',
)

# -o / --output: where the resulting CSV is written.
parser.add_argument(
    "-o",
    "--output",
    dest="output",
    required=True,
    help='The output file name and path provided by snakemake',
)

# Read the flags from the command line into a namespace object.
args = parser.parse_args()

# Convert the GTF named by --input into the CSV named by --output.
parse_gtf(input_file=args.input, output_file=args.output)