Skip to content

Commit 4edc939

Browse files
committed
Merge branch 'master' of github.com:EnvGen/toolbox
2 parents 69044a2 + 4ebf7f1 commit 4edc939

File tree

6 files changed

+425
-0
lines changed

6 files changed

+425
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python
2+
"""construct_ena_sequencing_runs_table.py
3+
4+
Based on a folder with uploaded files and a template, construct table ready to be submitted.
5+
This script is not general but is very niched to the NGI/Uppmax scenario.
6+
Ideally the user is expected to copy this script and edit it to suit the users needs.
7+
"""
8+
9+
import argparse
10+
import sys
11+
import os
12+
import glob
13+
from os.path import join as opj
14+
import pandas as pd
15+
from collections import defaultdict
16+
import gzip
17+
18+
# Need to fetch file name and link it to sample id
19+
# Fetch the md5sum already calculated
20+
21+
def main(args):
22+
md5sum_df = pd.read_table(args.md5_summary, sep=',', header=None, names=['file_name', 'md5sum'], index_col=0)
23+
24+
insert_sizes = pd.read_table(args.insert_size, index_col=0)
25+
info_d = {}
26+
27+
for sample_dir in args.sample_dirs:
28+
for R1_run_file in glob.glob(opj(sample_dir, "*", "*_R1*.fastq.gz")):
29+
R1_file_name=os.path.basename(R1_run_file)
30+
sample_name="_".join(R1_file_name.split('_')[0:2])
31+
run_id="_".join(R1_file_name.split('_')[0:4])
32+
run_info = {}
33+
run_info['sample_accession'] = sample_name
34+
run_info['library_name'] = run_id
35+
is_series = insert_sizes.ix[sample_name]['Avg. FS']
36+
try:
37+
run_info['insert_size'] = int(round(is_series))
38+
except TypeError:
39+
run_info['insert_size'] = int(round(insert_sizes[insert_sizes['Lib QC'] == 'PASSED'].ix[sample_name]['Avg. FS']))
40+
41+
run_info['forward_file_name'] = R1_file_name
42+
run_info['forward_file_md5'] = md5sum_df.loc[R1_file_name]['md5sum']
43+
R2_file_name = R1_file_name.replace("R1", "R2")
44+
run_info['reverse_file_name'] = R2_file_name
45+
run_info['reverse_file_md5'] = md5sum_df.loc[R2_file_name]['md5sum']
46+
run_info['library_source'] = 'METAGENOMIC'
47+
run_info['library_selection'] = 'RANDOM'
48+
run_info['library_strategy'] = 'WGS'
49+
run_info['library_construction_protocol'] = 'Rubicon Thruplex'
50+
run_info['instrument_model'] = 'Illumina HiSeq 2500'
51+
run_info['file_type'] = 'fastq'
52+
run_info['library_layout'] = 'PAIRED'
53+
54+
55+
info_d[run_id] = run_info
56+
all_columns_sorted = ['sample_accession', 'library_name', 'library_source', 'insert_size', \
57+
'library_selection', 'library_strategy', 'library_construction_protocol', 'instrument_model', \
58+
'file_type', 'library_layout', 'insert_size', \
59+
'forward_file_name', 'forward_file_md5', 'reverse_file_name', 'reverse_file_md5']
60+
61+
df = pd.DataFrame.from_dict(info_d, orient='index')
62+
df[all_columns_sorted].to_csv(sys.stdout, index=False, sep='\t', header=True)
63+
64+
if __name__ == "__main__":
65+
parser = argparse.ArgumentParser(description=__doc__)
66+
parser.add_argument("md5_summary", help="Table (csv) with md5sum values for all files")
67+
parser.add_argument("insert_size", help="Table with insert size per sample")
68+
parser.add_argument("sample_dirs", nargs='*', help="Directories where read files are located in subdirs")
69+
args = parser.parse_args()
70+
71+
main(args)
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -n option disables auto-logon
2+
FILE_SUFFIX="fastq.gz"
3+
for upload_dir in "$@"
4+
do
5+
ftp -n webin.ebi.ac.uk <<End-Of-Session
6+
user $ENA_USER $ENA_PASS
7+
binary
8+
prompt
9+
lcd "$upload_dir"
10+
!ls *"$FILE_SUFFIX"
11+
mput *"$FILE_SUFFIX"
12+
bye
13+
End-Of-Session
14+
done
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python
2+
usage="""Checks the status for a subset of reads within a bam file.
3+
4+
Three categories:
5+
Matching
6+
Soft clipped
7+
Not mapping
8+
"""
9+
10+
import argparse
11+
import sys
12+
import re
13+
14+
soft_clip_re = re.compile('([0-9]+)S')
15+
matched_re = re.compile('([0-9]+)M')
16+
17+
left_clipped_re = re.compile('^([0-9]+)S')
18+
19+
nr_mismatched_re = re.compile('XM:i:([0-9]+)')
20+
21+
def parse_cigar(cigar):
22+
soft_clipped_matches = soft_clip_re.findall(cigar)
23+
matched_matches = matched_re.findall(cigar)
24+
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
25+
matched_bases = sum([int(x) for x in matched_matches])
26+
27+
left_clipped_matches = left_clipped_re.findall(cigar)
28+
left_clipped_bases = sum([int(x) for x in left_clipped_matches])
29+
30+
return soft_clipped_bases, matched_bases, left_clipped_bases
31+
32+
33+
"""
34+
chimeric or not
35+
soft clipped (> 20S) or not
36+
overlapping edge or not
37+
mapped or not
38+
39+
soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped
40+
chimeric:
41+
Non-chimeric:
42+
43+
"""
44+
45+
def print_stats(name, soft_clipped_set, overlapping_set, mapped_set, complete_set):
46+
"soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped"
47+
48+
print("{},{},{},{},{},{},{},{}".format(name, len(soft_clipped_set), len(complete_set - soft_clipped_set), len(overlapping_set), len(complete_set - overlapping_set), len(overlapping_set & soft_clipped_set), len(mapped_set), len(complete_set - mapped_set)))
49+
50+
51+
def main(args):
52+
chim_reads = set()
53+
for line in open(args.subset_reads, 'r'):
54+
read = line.strip()
55+
chim_reads.add(read)
56+
57+
mapped_reads = set()
58+
soft_clipped_reads = set()
59+
overlapping_edge_reads = set()
60+
all_reads = set()
61+
contig_lens = {}
62+
63+
contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$')
64+
65+
# Read the input sam files
66+
for line in sys.stdin:
67+
line = line.strip()
68+
69+
# Check if in header
70+
if line[0] == '@':
71+
if line[0:3] == '@SQ':
72+
contig, contig_len = contig_len_re.findall(line)[0]
73+
contig_lens[contig] = contig_len
74+
else:
75+
# Not in header
76+
77+
line_split = line.split('\t')
78+
if (int(line_split[1]) / 128) == 1:
79+
read_id = line_split[0] + "_2"
80+
else:
81+
read_id = line_split[0] + "_1"
82+
83+
all_reads.add(read_id)
84+
85+
mapping_bit = (((((int(line_split[1]) % 128) % 64) % 32) % 16) % 8) / 4
86+
if mapping_bit != 1:
87+
mapped_reads.add(read_id)
88+
cigar = line_split[5]
89+
90+
soft_clipped, matching, left_clipped_bases = parse_cigar(cigar)
91+
if soft_clipped > 20:
92+
soft_clipped_reads.add(read_id)
93+
94+
read_length = len(line_split[9])
95+
96+
contig, mapping_start_pos = line_split[2], line_split[7]
97+
98+
if int(mapping_start_pos) - left_clipped_bases < 0:
99+
overlapping_edge_reads.add(read_id)
100+
101+
if int(mapping_start_pos) + read_length > contig_lens[contig]:
102+
overlapping_edge_reads.add(read_id)
103+
104+
# Aim to print>
105+
# "soft clipped not soft clipped overlapping edge not overlapping edge overlapping edge and soft clipped mapped not mapped"
106+
print("name,soft clipped, not soft clipped,overlapping edge,not overlapping edge,overlapping edge and soft clipped,mapped,not mapped")
107+
print_stats("chimeric_reads", soft_clipped_reads & chim_reads, overlapping_edge_reads & chim_reads, mapped_reads & chim_reads, chim_reads)
108+
print_stats("other_reads", soft_clipped_reads - chim_reads, overlapping_edge_reads - chim_reads, mapped_reads - chim_reads, all_reads - chim_reads)
109+
110+
111+
if __name__ == "__main__":
112+
parser = argparse.ArgumentParser(description=usage)
113+
parser.add_argument('subset_reads')
114+
args = parser.parse_args()
115+
116+
main(args)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python
2+
usage="""Filter potential chimeric reads to remove correctly none-clipped.
3+
"""
4+
5+
import argparse
6+
import sys
7+
import re
8+
9+
soft_clip_re = re.compile('([0-9]+)S')
10+
matched_re = re.compile('([0-9]+)M')
11+
12+
nr_mismatched_re = re.compile('XM:i:([0-9]+)')
13+
14+
def parse_cigar(cigar):
15+
soft_clipped_matches = soft_clip_re.findall(cigar)
16+
matched_matches = matched_re.findall(cigar)
17+
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
18+
matched_bases = sum([int(x) for x in matched_matches])
19+
20+
return soft_clipped_bases, matched_bases
21+
22+
def main(args):
23+
all_reads = set()
24+
for line in open(args.potential_chimera, 'r'):
25+
read = line.strip()
26+
all_reads.add(read)
27+
28+
# Read the input sam files
29+
for line in sys.stdin:
30+
line = line.strip()
31+
32+
# Check if in header
33+
if line[0] != '@':
34+
# Not in header
35+
36+
# Ignore reads which are note potential chimeras
37+
# Read is most likely not chimeric
38+
line_split = line.split('\t')
39+
if (int(line_split[1]) / 128) == 1:
40+
read_id = line_split[0] + "_2"
41+
else:
42+
read_id = line_split[0] + "_1"
43+
44+
if read_id in all_reads:
45+
cigar = line_split[5]
46+
# Matching min 95% of bases
47+
soft_clipped, matching = parse_cigar(cigar)
48+
49+
read_length = len(line_split[9])
50+
mismatches = int(nr_mismatched_re.findall(line)[0])
51+
52+
if (matching > read_length*0.95) and (int(mismatches) < 5):
53+
all_reads.remove(read_id)
54+
55+
print("\n".join(list(all_reads)))
56+
57+
if __name__ == "__main__":
58+
parser = argparse.ArgumentParser(description=usage)
59+
parser.add_argument('potential_chimera')
60+
args = parser.parse_args()
61+
62+
main(args)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env python
2+
usage="""Prints reads which are potential chimeras.
3+
4+
These are determined as being soft clipped, not on the edge
5+
of contigs and at leat 50% mapped with max 2 mismatches.
6+
7+
Input SAM with header.
8+
"""
9+
10+
import argparse
11+
import sys
12+
import re
13+
14+
soft_clip_re = re.compile('([0-9]+)S')
15+
matched_re = re.compile('([0-9]+)M')
16+
17+
nr_mismatched_re = re.compile('XM:i:([0-9]+)')
18+
19+
def parse_cigar(cigar):
20+
soft_clipped_matches = soft_clip_re.findall(cigar)
21+
matched_matches = matched_re.findall(cigar)
22+
soft_clipped_bases = sum([int(x) for x in soft_clipped_matches])
23+
matched_bases = sum([int(x) for x in matched_matches])
24+
25+
return soft_clipped_bases, matched_bases
26+
27+
def main():
28+
contig_len_re = re.compile('SN:(.*) *LN:([0-9]*)$')
29+
contig_lens = {}
30+
31+
# Read the input sam file
32+
for line in sys.stdin:
33+
line = line.strip()
34+
35+
# Check if in header
36+
if line[0] == '@':
37+
if line[0:3] == '@SQ':
38+
contig, contig_len = contig_len_re.findall(line)[0]
39+
contig_lens[contig] = contig_len
40+
else:
41+
# Not in header
42+
line_split = line.split('\t')
43+
cigar = line_split[5]
44+
# Soft clipped at least 20 bases
45+
if 'S' in cigar:
46+
soft_clipped, matching = parse_cigar(cigar)
47+
if soft_clipped > 20:
48+
# Matching region should be > 50% of read
49+
read_length = len(line_split[9])
50+
if matching > read_length/2.0:
51+
contig, mapping_start_pos = line_split[2], line_split[7]
52+
53+
# mapping should not be on the edges
54+
if mapping_start_pos > 300 and int(contig_lens[contig]) - int(mapping_start_pos) > 300:
55+
# check nr_mismatches
56+
mismatches = int(nr_mismatched_re.findall(line)[0])
57+
if int(mismatches) < 2:
58+
# check which read
59+
if (int(line_split[1]) / 128) == 1:
60+
read_nr = "2"
61+
else:
62+
read_nr = "1"
63+
64+
print(line_split[0]+"_" + read_nr)
65+
66+
67+
if __name__ == "__main__":
68+
parser = argparse.ArgumentParser(description=usage)
69+
parser.parse_args()
70+
main()

0 commit comments

Comments
 (0)