Skip to content

Commit b62ee87

Browse files
authored
Merge pull request #11 from erinyoung/update-2024-01-16
Update 2024 01 16
2 parents 4568ea3 + 35b070f commit b62ee87

File tree

10 files changed

+203
-159
lines changed

10 files changed

+203
-159
lines changed

aci/aci.py

Lines changed: 83 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python3
2+
# pylint: disable=logging-fstring-interpolation
23

34
'''
45
Author: Erin Young
@@ -13,151 +14,136 @@
1314

1415
# I tried to keep dependencies down...
1516
import argparse
16-
import concurrent.futures
17-
import itertools
1817
import logging
1918
import os
2019
import sys
2120
import tempfile
2221
import pandas as pd
2322

24-
from aci.utils.amplicon_depth import amplicon_depth # pylint: disable=E0401
25-
from aci.utils.column_names import column_names # pylint: disable=E0401
26-
from aci.utils.get_regions import get_regions # pylint: disable=E0401
27-
from aci.utils.genome_depth import genome_depth # pylint: disable=E0401
28-
from aci.utils.plotting_amplicons import plotting_amplicons # pylint: disable=E0401
29-
from aci.utils.plotting_depth import plotting_depth # pylint: disable=E0401
30-
from aci.utils.prep import prep # pylint: disable=E0401
31-
from aci.utils.get_coverage import get_coverage # pylint: disable=E0401
23+
from aci.utils.amplicon_splitting import amplicon_splitting
24+
from aci.utils.genome_depth import genome_depth
25+
from aci.utils.plotting_amplicons import plotting_amplicons
26+
from aci.utils.plotting_depth import plotting_depth
27+
from aci.utils.prep import prep
3228

3329
# about 30 seconds per artic V3 primer on SRR13957125
3430
# $ samtools coverage SRR13957125.sorted.bam
3531
# #rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq
3632
# MN908947.3 1 29903 1141595 29827 99.7458 5350.27 37.3 60
3733
# 15000 - 16500
3834

39-
def main():
    """ Use pysam to get depth for amplicon region and general coverage

    Parses the command line, sorts and indexes each input bam file into a
    temporary directory created inside the output directory, then gets the
    depth for each amplicon (Part 1) and for the genome (Part 2) and hands
    both dataframes to the plotting functions.

    Exits with status 2 when the bedfile does not exist.
    """

    ##### ----- ----- ----- ----- ----- #####
    ##### Part 0. Setup #####
    ##### ----- ----- ----- ----- ----- #####

    version = '1.4.20240116'

    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bam',
                        nargs = '+',
                        required = True,
                        type = str,
                        help = '(required) input bam file(s)')
    parser.add_argument('-d', '--bed',
                        required = True,
                        type = str,
                        help ='(required) amplicon bedfile')
    parser.add_argument('-o', '--out',
                        required = False,
                        type = str,
                        help = 'directory for results',
                        default = 'aci')
    parser.add_argument('-log', '--loglevel',
                        required = False,
                        type = str,
                        help = 'logging level',
                        default = 'INFO')
    parser.add_argument('-t', '--threads',
                        required = False,
                        type = int,
                        help = 'specifies number of threads to use',
                        default=4)
    parser.add_argument('-v', '--version',
                        help='print version and exit',
                        action = 'version',
                        version = version)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt = '%y-%b-%d %H:%M:%S',
                        level=args.loglevel.upper())

    if not os.path.exists(args.bed):
        logging.critical(f"bedfile {args.bed} does not exist. Exiting")
        sys.exit(2)

    # makedirs also creates missing parent directories and does not fail
    # when the directory already exists
    os.makedirs(args.out, exist_ok=True)

    logging.info(f"ACI version :\t\t{version}")
    logging.info(f"Number of threads :\t{args.threads}")
    logging.info(f"Final directory :\t\t{args.out}")
    logging.info(f"Input bed file :\t\t{args.bed}")
    logging.info(f"Input bam file(s) :\t{', '.join(args.bam)}")

    # everything that reads the sorted bam files has to stay inside this
    # block because the temporary directory is removed when it exits
    with tempfile.TemporaryDirectory(dir = args.out) as temp_dir:
        tmp = temp_dir + '/'

        # meta holds one dict per bam file (keyed by the path given on the
        # command line) plus the shared 'tmp' and 'filenames' entries
        meta = {'tmp': tmp}
        filenames = []
        for bam in args.bam:
            file_name = os.path.basename(bam)
            sorted_bam = tmp + file_name
            meta[bam] = {
                'initial_bam': bam,
                'out': args.out,
                'tmp': tmp,
                'file_name': file_name,
                'sorted_bam': sorted_bam,
                'sorted_bai': sorted_bam + '.bai',
            }
            filenames.append(file_name)

            logging.info(f"Sorting and indexing {file_name}")
            prep(bam, sorted_bam, args.threads)
            logging.info('Finished sorting and indexing')
        meta['filenames'] = filenames

        logging.debug('the filenames for all the bam files are')
        logging.debug(filenames)

        ##### ----- ----- ----- ----- ----- #####
        ##### Part 1. Amplicon depths #####
        ##### ----- ----- ----- ----- ----- #####

        df = amplicon_splitting(meta, args)

        plotting_amplicons(df, args.out)

        logging.info(f"Amplicon depth is saved in {args.out}/amplicon_depth.csv")
        logging.info(f"An boxplot of these depths is at {args.out}/amplicon_depth.png")

        # ##### ----- ----- ----- ----- ----- #####
        # ##### Part 2. Genome/bam depths #####
        # ##### ----- ----- ----- ----- ----- #####

        # NOTE : Attempted with concurrent and this was just as fast
        # the per-bam frames are concatenated once instead of growing a
        # dataframe inside the loop
        depth_frames = [genome_depth(meta[bam]) for bam in args.bam]
        df_pysam = pd.concat(depth_frames, ignore_index=True)

        plotting_depth(df_pysam, args.out)

        logging.info(f"Genome depth is saved in {args.out}/genome_depth.csv")
        logging.info(f"An boxplot of these depths is at {args.out}/genome_depth.png")

        # ##### ----- ----- ----- ----- ----- #####
        # ##### Fin #####
        # ##### ----- ----- ----- ----- ----- #####

        logging.info('ACI is complete! (I hope all your primers are behaving as expected!)')
161147

162148
if __name__ == "__main__":
163149
main()

aci/utils/amplicon_depth.py

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
# pylint: disable=logging-fstring-interpolation
23

34
""" Use pysam to get depth for amplicon region """
45

@@ -8,48 +9,51 @@
89
from .subregion import subregion
910
from .within import within
1011
from .without import without
12+
from .get_coverage import get_coverage
1113

1214
def amplicon_depth(meta, region):
    """ Use pysam to get depth for amplicon region

    Args:
        meta: dict for one bam file; the keys read here are 'tmp',
            'file_name' and 'sorted_bam'.
        region: amplicon region that is handed to subregion() to get the
            range and the name of the amplicon.

    Returns:
        [file name, amplicon name, coverage]. The coverage stays at 0.0 when
        the sorted bam file or any of the intermediate bam files is missing.
    """

    # getting subregion information
    subrange, name = subregion(region)

    # every intermediate file for this amplicon shares the same prefix
    prefix = meta['tmp'] + name
    after_reduction_bam = prefix + '.step1.' + meta['file_name']
    removing_outside_matches = prefix + '.step2.' + meta['file_name']
    junk_bam = prefix + '.step3.' + meta['file_name']
    final_bam = prefix + '.step4.' + meta['file_name']
    # NOTE(review): this bedfile is not written in this function; presumably
    # it is created before the tasks are submitted - confirm against caller
    subregion_bed = prefix + '.bed'

    # the filenames are local variables, so they are logged directly
    # (meta does not contain them)
    logging.debug('The filenames are going to be :')
    logging.debug({
        'after_reduction_bam': after_reduction_bam,
        'removing_outside_matches': removing_outside_matches,
        'junk_bam': junk_bam,
        'final_bam': final_bam,
        'subregion_bed': subregion_bed,
    })

    # setting the default value
    cov = 0.0

    # each step only runs when the file from the previous step exists
    # reduce bam file to something smaller
    if os.path.exists(meta['sorted_bam']):
        logging.debug(f"Step 1. reducing bam for speed for {meta['sorted_bam']}")
        within(meta['sorted_bam'], after_reduction_bam, subrange)

        if os.path.exists(after_reduction_bam):
            pysam.index(after_reduction_bam)

            # remove all reads that fall outside of region of interest
            # warning : this is the slow part of the script
            logging.debug(f"Step 2. reducing bam for speed for {meta['sorted_bam']}")
            without(after_reduction_bam,
                    removing_outside_matches,
                    junk_bam,
                    bed = subregion_bed)

            if os.path.exists(removing_outside_matches):
                pysam.index(removing_outside_matches)

                # get only reads that are within subrange
                logging.debug(f"Step 3. final reduction for {meta['sorted_bam']}")
                within(removing_outside_matches, final_bam, subrange)

                if os.path.exists(final_bam):
                    pysam.index(final_bam)
                    cov = get_coverage(final_bam, subrange)

    logging.info(f"Amplicon bam file created for {meta['file_name']} over {subrange}")

    return [meta['file_name'], name, cov]

aci/utils/amplicon_splitting.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python
2+
# pylint: disable=logging-fstring-interpolation
3+
4+
""" Split processes by region in bedfile for concurrent """
5+
6+
import logging
7+
import concurrent.futures
8+
import pandas as pd
9+
10+
from .amplicon_depth import amplicon_depth
11+
from .get_regions import get_regions
12+
from .column_names import column_names
13+
14+
def amplicon_splitting(meta, args):
    """ Get the coverage of every region for every bam file with a thread pool

    One amplicon_depth task is submitted for each (bam, region) pair. After
    all of the tasks are done, the coverages are written into a dataframe
    that has one row for each entry in meta['filenames'].

    Returns the dataframe with the remaining empty cells set to 0.
    """

    # getting region for parallel processing
    amplicons = get_regions(meta, args.bed)

    logging.info('Getting depth for amplicons')
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as pool:
        futures = [
            pool.submit(amplicon_depth, meta[bam], amplicon)
            for bam in args.bam
            for amplicon in amplicons
        ]

        finished, _ = concurrent.futures.wait(
            futures,
            return_when=concurrent.futures.ALL_COMPLETED)
        depths = [future.result() for future in finished]

    # setting up the dataframe : a 'bam' column plus the names from the bedfile
    table = pd.DataFrame(columns=['bam'] + column_names(args.bed))
    table['bam'] = meta['filenames']
    logging.debug('Initial empty dataframe:')
    logging.debug(table)

    # NOTE : Had to be done outside of concurrent
    for file_name, amplicon_name, coverage in depths:
        row = table.index[table['bam'] == file_name]
        table.loc[row, amplicon_name] = coverage

    return table.fillna(0)

aci/utils/genome_depth.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ def genome_depth(meta):
1010
""" Takes a bam file file and gets coverage """
1111

1212
# getting depth results from pysam
13-
df = pd.DataFrame([x.split('\t') for x in pysam.depth(meta['sorted_bam']).split('\n')])
13+
df = pd.DataFrame([x.split('\t') for x in pysam.depth(meta['sorted_bam']).split('\n')]) # pylint: disable=E1101
1414
df.columns = ['ref', 'pos', 'cov']
1515

1616
# ensuring that the type is correct
@@ -19,7 +19,7 @@ def genome_depth(meta):
1919
df['bam'] = meta['file_name']
2020
df = df.dropna()
2121

22-
logging.debug('pysam results for ' + meta['sorted_bam']) # pylint: disable=W1201
22+
logging.debug(f"pysam results for {meta['sorted_bam']}") # pylint: disable=W1203
2323
logging.debug(df)
2424

2525
return df

0 commit comments

Comments
 (0)