|
1 | 1 | #!/usr/bin/env python3
|
| 2 | +# pylint: disable=logging-fstring-interpolation |
2 | 3 |
|
3 | 4 | '''
|
4 | 5 | Author: Erin Young
|
|
13 | 14 |
|
14 | 15 | # I tried to keep dependencies down...
|
15 | 16 | import argparse
|
16 |
| -import concurrent.futures |
17 |
| -import itertools |
18 | 17 | import logging
|
19 | 18 | import os
|
20 | 19 | import sys
|
21 | 20 | import tempfile
|
22 | 21 | import pandas as pd
|
23 | 22 |
|
24 |
| -from aci.utils.amplicon_depth import amplicon_depth # pylint: disable=E0401 |
25 |
| -from aci.utils.column_names import column_names # pylint: disable=E0401 |
26 |
| -from aci.utils.get_regions import get_regions # pylint: disable=E0401 |
27 |
| -from aci.utils.genome_depth import genome_depth # pylint: disable=E0401 |
28 |
| -from aci.utils.plotting_amplicons import plotting_amplicons # pylint: disable=E0401 |
29 |
| -from aci.utils.plotting_depth import plotting_depth # pylint: disable=E0401 |
30 |
| -from aci.utils.prep import prep # pylint: disable=E0401 |
31 |
| -from aci.utils.get_coverage import get_coverage # pylint: disable=E0401 |
| 23 | +from aci.utils.amplicon_splitting import amplicon_splitting |
| 24 | +from aci.utils.genome_depth import genome_depth |
| 25 | +from aci.utils.plotting_amplicons import plotting_amplicons |
| 26 | +from aci.utils.plotting_depth import plotting_depth |
| 27 | +from aci.utils.prep import prep |
32 | 28 |
|
33 | 29 | # about 30 seconds per artic V3 primer on SRR13957125
|
34 | 30 | # $ samtools coverage SRR13957125.sorted.bam
|
35 | 31 | # #rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq
|
36 | 32 | # MN908947.3 1 29903 1141595 29827 99.7458 5350.27 37.3 60
|
37 | 33 | # 15000 - 16500
|
38 | 34 |
|
39 |
| -def main(): # pylint: disable=R0914,R0915 |
| 35 | +def main(): |
40 | 36 | """ Use pysam to get depth for amplicon region and general coverage """
|
41 | 37 |
|
42 | 38 | ##### ----- ----- ----- ----- ----- #####
|
43 | 39 | ##### Part 0. Setup #####
|
44 | 40 | ##### ----- ----- ----- ----- ----- #####
|
45 | 41 |
|
46 |
| - version = '1.0.20231229' |
| 42 | + version = '1.4.20240116' |
47 | 43 |
|
48 | 44 | parser = argparse.ArgumentParser()
|
49 |
| - parser.add_argument('-b', '--bam', nargs = '+', required = True, type = str, help = '(required) input bam file(s)') # pylint: disable=C0301 |
50 |
| - parser.add_argument('-d', '--bed', required = True, type = str, help ='(required) amplicon bedfile') # pylint: disable=C0301 |
51 |
| - parser.add_argument('-o', '--out', required = False, type = str, help = 'directory for results', default = 'aci') # pylint: disable=C0301 |
52 |
| - parser.add_argument('-log', '--loglevel', required = False, type = str, help = 'logging level', default = 'INFO') # pylint: disable=C0301 |
53 |
| - parser.add_argument('-t', '--threads', required = False, type = int, help = 'specifies number of threads to use', default=4) # pylint: disable=C0301 |
54 |
| - parser.add_argument('-v', '--version', help='print version and exit', action = 'version', version = version) # pylint: disable=C0301 |
| 45 | + parser.add_argument('-b', '--bam', |
| 46 | + nargs = '+', |
| 47 | + required = True, |
| 48 | + type = str, |
| 49 | + help = '(required) input bam file(s)') |
| 50 | + parser.add_argument('-d', '--bed', |
| 51 | + required = True, |
| 52 | + type = str, |
| 53 | + help ='(required) amplicon bedfile') |
| 54 | + parser.add_argument('-o', '--out', |
| 55 | + required = False, |
| 56 | + type = str, |
| 57 | + help = 'directory for results', |
| 58 | + default = 'aci') |
| 59 | + parser.add_argument('-log', '--loglevel', |
| 60 | + required = False, |
| 61 | + type = str, |
| 62 | + help = 'logging level', |
| 63 | + default = 'INFO') |
| 64 | + parser.add_argument('-t', '--threads', |
| 65 | + required = False, |
| 66 | + type = int, |
| 67 | + help = 'specifies number of threads to use', |
| 68 | + default=4) |
| 69 | + parser.add_argument('-v', '--version', |
| 70 | + help='print version and exit', |
| 71 | + action = 'version', |
| 72 | + version = version) |
55 | 73 | args = parser.parse_args()
|
56 | 74 |
|
57 | 75 | logging.basicConfig(format='%(asctime)s - %(message)s',
|
58 | 76 | datefmt = '%y-%b-%d %H:%M:%S',
|
59 | 77 | level=args.loglevel.upper())
|
60 | 78 |
|
61 | 79 | if not os.path.exists(args.bed):
|
62 |
| - logging.critical('bedfile ' + args.bed + ' does not exist. Exiting') # pylint: disable=W1201 |
| 80 | + logging.critical(f"bedfile {args.bed} does not exist. Exiting") |
63 | 81 | sys.exit(2)
|
64 | 82 |
|
65 | 83 | if not os.path.exists(args.out):
|
66 | 84 | os.mkdir(args.out)
|
67 | 85 |
|
68 |
| - logging.info('ACI version :\t\t' + str(version)) # pylint: disable=W1201 |
69 |
| - logging.info('Number of threads :\t' + str(args.threads)) # pylint: disable=W1201 |
70 |
| - logging.info('Final directory :\t\t' + str(args.out)) # pylint: disable=W1201 |
71 |
| - logging.info('Input bed file :\t\t' + str(args.bed)) # pylint: disable=W1201 |
72 |
| - logging.info('Input bam file(s) :\t' + ', '.join(args.bam)) # pylint: disable=W1201 |
73 |
| - |
74 |
| - bed = args.bed |
75 |
| - out = args.out |
76 |
| - threads = args.threads |
77 |
| - temp_dir = tempfile.TemporaryDirectory(dir = args.out) # pylint: disable=R1732 |
78 |
| - |
79 |
| - meta = {} |
80 |
| - filenames = [] |
81 |
| - for bam in args.bam: |
82 |
| - meta[bam] = {} |
83 |
| - meta[bam]['initial_bam'] = bam |
84 |
| - meta[bam]['out'] = out |
85 |
| - meta[bam]['tmp'] = temp_dir.name + '/' |
86 |
| - meta[bam]['file_name'] = os.path.basename(bam) |
87 |
| - meta[bam]['sorted_bam'] = meta[bam]['tmp'] + os.path.basename(bam) |
88 |
| - meta[bam]['sorted_bai'] = meta[bam]['sorted_bam'] + '.bai' |
89 |
| - filenames.append(meta[bam]['file_name']) |
90 |
| - |
91 |
| - logging.info('Sorting and indexing ' + meta[bam]['file_name']) # pylint: disable=W1201 |
92 |
| - prep(meta[bam]['initial_bam'], meta[bam]['sorted_bam'], threads) |
93 |
| - logging.info('Finished sorting and indexing') |
94 |
| - |
95 |
| - logging.debug('the filenames for all the bam files are') |
96 |
| - logging.debug(filenames) |
| 86 | + logging.info(f"ACI version :\t\t{str(version)}") |
| 87 | + logging.info(f"Number of threads :\t{str(args.threads)}") |
| 88 | + logging.info(f"Final directory :\t\t{str(args.out)}") |
| 89 | + logging.info(f"Input bed file :\t\t{str(args.bed)}") |
| 90 | + logging.info(f"Input bam file(s) :\t{', '.join(args.bam)}") |
97 | 91 |
|
98 |
| - ##### ----- ----- ----- ----- ----- ##### |
99 |
| - ##### Part 1. Amplicon depths ##### |
100 |
| - ##### ----- ----- ----- ----- ----- ##### |
101 |
| - |
102 |
| - # getting regions for parallel processing |
103 |
| - regions = get_regions(bed) |
104 |
| - |
105 |
| - logging.info('Getting depth for amplicons') |
106 |
| - logging.debug('List for parallel processing:') |
107 |
| - logging.debug(list(itertools.product(args.bam, regions))) |
108 |
| - with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor: |
109 |
| - for bam, subregion in list(itertools.product(args.bam, regions)): |
110 |
| - results = [executor.submit(amplicon_depth, meta[bam], subregion)] |
111 |
| - # keeping the line below for testing |
112 |
| - # results = amplicon_depth(df, meta[bam], subregion) |
113 |
| - |
114 |
| - for f in concurrent.futures.as_completed(results): |
115 |
| - logging.debug(f.result()) |
| 92 | + with tempfile.TemporaryDirectory(dir = args.out) as temp_dir: |
| 93 | + meta = {} |
| 94 | + filenames = [] |
| 95 | + for bam in args.bam: |
| 96 | + meta[bam] = {} |
| 97 | + meta[bam]['initial_bam'] = bam |
| 98 | + meta[bam]['out'] = args.out |
| 99 | + meta['tmp'] = temp_dir + '/' |
| 100 | + meta[bam]['tmp'] = temp_dir + '/' |
| 101 | + meta[bam]['file_name'] = os.path.basename(bam) |
| 102 | + meta[bam]['sorted_bam'] = meta[bam]['tmp'] + os.path.basename(bam) |
| 103 | + meta[bam]['sorted_bai'] = meta[bam]['sorted_bam'] + '.bai' |
| 104 | + filenames.append(meta[bam]['file_name']) |
116 | 105 |
|
117 |
| - # setting up the dataframe |
118 |
| - columns = column_names(bed) |
119 |
| - df = pd.DataFrame(columns= ['bam'] + columns) |
120 |
| - df['bam'] = filenames |
121 |
| - logging.debug('Initial empty dataframe:') |
122 |
| - logging.debug(df) |
| 106 | + logging.info(f"Sorting and indexing {meta[bam]['file_name']}") |
| 107 | + prep(meta[bam]['initial_bam'], meta[bam]['sorted_bam'], args.threads) |
| 108 | + logging.info('Finished sorting and indexing') |
| 109 | + meta['filenames'] = filenames |
123 | 110 |
|
124 |
| - # NOTE : Had to be done outside of concurrent |
125 |
| - for bam in args.bam: |
126 |
| - bamindex = df.index[df['bam'] == meta[bam]['file_name']] |
127 |
| - for subregion in regions: |
128 |
| - cov = get_coverage(meta[bam], subregion) |
129 |
| - df.loc[bamindex, [subregion.split(':')[3]]] = cov |
| 111 | + logging.debug('the filenames for all the bam files are') |
| 112 | + logging.debug(filenames) |
130 | 113 |
|
131 |
| - logging.debug('The final dataframe is:') |
132 |
| - logging.debug(df) |
| 114 | + ##### ----- ----- ----- ----- ----- ##### |
| 115 | + ##### Part 1. Amplicon depths ##### |
| 116 | + ##### ----- ----- ----- ----- ----- ##### |
133 | 117 |
|
134 |
| - plotting_amplicons(df, out) |
| 118 | + df = amplicon_splitting(meta, args) |
135 | 119 |
|
136 |
| - logging.info('Depth for amplicons is saved in ' + args.out + '/amplicon_depth.csv') # pylint: disable=W1201 |
137 |
| - logging.info('An boxplot of these depths is at ' + args.out + '/amplicon_depth.png') # pylint: disable=W1201 |
| 120 | + plotting_amplicons(df, args.out) |
138 | 121 |
|
139 |
| - # ##### ----- ----- ----- ----- ----- ##### |
140 |
| - # ##### Part 2. Genome/bam depths ##### |
141 |
| - # ##### ----- ----- ----- ----- ----- ##### |
| 122 | + logging.info(f"Amplicon depth is saved in {args.out}/amplicon_depth.csv") |
| 123 | + logging.info(f"An boxplot of these depths is at {args.out}/amplicon_depth.png") |
142 | 124 |
|
143 |
| - df_pysam = pd.DataFrame([]) |
| 125 | + # ##### ----- ----- ----- ----- ----- ##### |
| 126 | + # ##### Part 2. Genome/bam depths ##### |
| 127 | + # ##### ----- ----- ----- ----- ----- ##### |
144 | 128 |
|
145 |
| - # TODO : Fix this so that it's concurrent friendly # pylint: disable=W0511 |
| 129 | + df_pysam = pd.DataFrame([]) |
146 | 130 |
|
147 |
| - for bam in args.bam: |
148 |
| - df_pysam_results = genome_depth(meta[bam]) |
149 |
| - df_pysam = pd.concat([df_pysam, df_pysam_results], ignore_index=True) |
| 131 | + # NOTE : Attempted with concurrent and this was just as fast |
150 | 132 |
|
151 |
| - logging.debug('The final pysam dataframe is:') |
152 |
| - logging.debug(df_pysam) |
| 133 | + for bam in args.bam: |
| 134 | + df_pysam_results = genome_depth(meta[bam]) |
| 135 | + df_pysam = pd.concat([df_pysam, df_pysam_results], ignore_index=True) |
153 | 136 |
|
154 |
| - plotting_depth(df_pysam, out) |
| 137 | + plotting_depth(df_pysam, args.out) |
155 | 138 |
|
156 |
| - logging.info('Depth for the genome from the bam file is saved in ' + out + '/genome_depth.csv') # pylint: disable=W1201 |
157 |
| - logging.info('An boxplot of these depths is at ' + out + '/genome_depth.png') # pylint: disable=W1201 |
| 139 | + logging.info(f"Genome depth is saved in {args.out}/genome_depth.csv") |
| 140 | + logging.info(f"An boxplot of these depths is at {args.out}/genome_depth.png") |
158 | 141 |
|
159 |
| - logging.info('ACI is complete! (I hope all your primers are behaving as expected!)') |
| 142 | + # ##### ----- ----- ----- ----- ----- ##### |
| 143 | + # ##### Fin ##### |
| 144 | + # ##### ----- ----- ----- ----- ----- ##### |
160 | 145 |
|
| 146 | + logging.info('ACI is complete! (I hope all your primers are behaving as expected!)') |
161 | 147 |
|
162 | 148 | if __name__ == "__main__":
|
163 | 149 | main()
|
0 commit comments