-
Notifications
You must be signed in to change notification settings - Fork 5
/
dnabarcoder.py
488 lines (441 loc) · 24.1 KB
/
dnabarcoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 1 15:13:18 2021
@author: duong vu [email protected]/[email protected]
"""
import sys, os, subprocess, inspect
path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
#import lib.library as lib
#git_version = lib.git_version()
#base_version = '1.0.0'
#if git_version:
# version = base_version+'-'+git_version
#else:
# version = base_version
default_help = """
Usage: dnabarcoder <command> <arguments>
version: %s
Description: dnabarcoder is a tool for the analysis, visualization, classification, and predictions of similarity cut-offs for dna barcodes
Command: overview Get an overview of the dataset
length Compute length distribution
distribute Compute sequence distribution
variation Compute sequence variation
sim Compute similarity matrix
visualize Visualize the sequences
tree Create a phylogenetic tree of the sequences
cluster Cluster the sequences
predict Predict similarity cut-offs for the sequences
best Compute best similarity cut-offs for the sequences
merge Merge two similarity cut-offs files
remove Remove similar sequences of the same complexes based on a give threshold
search Search for best matches of the sequences against a file of reference sequences
classify Classify the sequences to the group of their best match if the score is greater than the given cutoff
verify Verify the assigned sequences based on the phylogenetic tree branch lengths
krona Visualize classification results using Krona
evaluate Compute accuracy for classification results
Written by Duong Vu [email protected]/[email protected]
""" #% version
wrongcommand=False
if len(sys.argv) > 1:
if sys.argv[1] == 'overview':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script gives an overview about the barcode dataset
Arguments: -i, --input Fasta file of Dna sequences, required
-c, --classification The taxonomic classification file in tab delimited format, required
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'analysis', 'overview.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'length':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script computes the barcode length distribution
Arguments: -i, --input Fasta file of Dna sequences, required
-l, --intervallength The length of intervals, default=100
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'analysis', 'computeLengthDistribution.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'distribute':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script computes the distribution of the sequences
Arguments: -i, --input Fasta file of Dna sequences, required
-c, --classification The taxonomic classification file in tab delimited format, required
-rank, --classificationranks The classification ranks for computing sequence distribution, required
-n, --numberofdisplayedlabels The number of the largest taxa to be displayed in the figure, default=8
-method, --visualizationmethod The visualization method: krona or plot, default=plot
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'analysis', 'computeDistribution.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'variation':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script computes the variation of the sequences
Arguments: -i, --input FASTA file of Dna sequences, required
-c, --classification The taxonomic classification file in tab delimited format, required
-rank, --classificationranks The classification ranks for computing sequence variation, default=""
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should be set to smaller, 50 for instance.
-m, --maxSeqNo The maximum number of randomly selected sequences to be computed in the case the groups are too big, default=0 (no maximum is given).
-plt, --plottype The type of visualization: boxplot or plot, default=boxplot
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'analysis', 'computeVariation.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'sim':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script computes BLAST similarity matrix for sequences
Arguments: -i, --input Fasta file of Dna sequences, required
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should be set to smaller, 50 for instance.
-ms, --minsim The minimum similarity that will be saved.
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'analysis', 'computeSim.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'visualize':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script visualizes the sequences based on BLAST similarity
Arguments: -i, --input Fasta file of Dna sequences, required
-c, --classification The taxonomic classification file in tab delimited format, required
-rank, --classificationrank The classification rank for coloring the sequences
-coord, --coordinates The coordinates file in json format if exists.
-sim, --simfilename The similarity file if exists.
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should be set to smaller, 50 for instance.
-ms, --minsim The minimum similarity that will be saved for the similarity matrix, default=0.5
-dim, --dimension The dimension 2D or 3D for visualization, default=3
-kneigh, --kneigbors The k-neighbors number for visualization, default=150
-size, --size The size of the dot
-method, --visualizationmethod The visualization method: DiVE and plot, default=plot
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'visualization', 'visualize.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'tree':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script visualizes the sequences based on BLAST similarity
Arguments: -i, --input FASTA file of Dna sequences, required
-c, --classification The taxonomic classification file in tab delimited format
-rank, --classificationranks The classification ranks for getting sequence descriptions
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'visualization', 'maketree.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'cluster':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script cluster the sequences with a given threshold (cutoff)
Arguments: -i, --input FASTA file of Dna sequences, required
-t, --cutoff The threshold (cutoff) used for clustering, default=0.97
-sim, --simfilename The similarity file if exists
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST in case simmatrix is not given, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should probably be set to smaller, 50 for instance.
-c, --classification The taxonomic classification file in tab delimited format, optional. This is used to compute the confidence of clustering.
-rank, --classificationrank The classification rank to compute the confidence measure for clustering, optional
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'prediction', 'cluster.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'remove':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script removes sequences of the same complexes based on a given threshold (cutoff)
Arguments: -i, --input FASTA file of Dna sequences, required
-t, --cutoff The threshold (cutoff) used for clustering, default=0.97
-sim, --simfilename The similarity matrix file if exists
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST in case simmatrix is not given, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should probably be set to smaller, 50 for instance.
-c, --classification The taxonomic classification file in tab delimited format, optional. This is used to compute the confidence of clustering
-rank, --classificationrank The classification rank to remove complexes.
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'prediction', 'removeComplexes.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'predict':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script predicts similarity cut-offs for sequence identification
Arguments: -i, --input Fasta file of Dna sequences, required
-sim, --simfilename The similarity matrix file if exists
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST in case simmatrix is not given, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should probably be set to smaller, 50 for instance.
-c, --classification The taxonomic classification file in tab delimited format
-rank, --classificationranks The classification ranks for prediction, separated by ","
-higherrank, --higherclassificationranks The higher classification ranks for the prediction. If hp=="", the whole dataset will be used for prediction
-st, --startingthreshold The starting threshold of the prediction
-et, --endthreshold The ending threshold of the prediction
-s, --step , The step for the prediction
-minGroupNo,--minimumgroupnumber The minimum number of groups for prediction, default=5
-minSeqNo,--minimumsequencenumber The minimum number of sequences for prediction, default=50
-redo,--redo Redo the prediction if redo !="", default=""
-prefix,--prefix Prefix of all output files, default as the base of the input file
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'prediction', 'predict.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'best':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script predicts similarity cut-offs for sequence identification
Arguments: -i, --input the cutoffs file, required
-c, --classification The taxonomic classification file in tab delimited format
-prefix,--prefix Prefix of all output files, default as the base of the input file
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'prediction', 'computeBestCutoffs.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'merge':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script predicts similarity cut-offs for sequence identification
Arguments: -i, --input the cutoffs filenames separated by commas, required
-o, --out The merged cut-offs file, required
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'prediction', 'mergeCutoffs.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'search':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script classifies a fasta file of sequences against a reference file of sequences
Arguments: -i, --input Fasta file of Dna sequences to be classified, required
-r, --reference Fasta file of reference sequences, required
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST in case simmatrix is not given, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should probably be set to smaller, 50 for instance.
-prefix,--prefix Prefix of all output files, default as the base of the input file
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'classification', 'search.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'classify':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script classifies the sequences to their best match based on local cutoffs or a given global cutoff. For a sequence, a BLAST similarity score to the sequences of the predicted group is computed. The sequence is assigned if the BLAST similarity score is greater or equal than the similarity cut-off
Arguments: -i, --input The file of classified sequences, required
-f, --fasta The fasta file of classified sequences, required
-r, --reference The fasta file of reference sequences, required
-ml, --minalignmentlength Minimum sequence alignment length required for BLAST in case simmatrix is not given, default=400. For short barcode sequences like ITS2 (ITS1) sequences, ml should probably be set to smaller, 50 for instance.
-mp, --minproba Only consider the classified sequences with a probability greater or equal minproba
-m, --maxseqno Maximum number of the sequences of the predicted taxon name from the classification file will be selected for the comparison to find the best match. If it is not given, all the sequences will be selected, default=0
-c, --classification The taxonomic classification file in tab delimited format
-cutoffs, --cutoffs The similarity cutoffs file predicted by dnabarcoder predict if exists
-cutoff, --cutoff The similarity cutoff, default=0, only used if the similarity cutoffs file is not given
-confidence, --confidence The confidence of the similarity cutoff if exists
-rank, --rank The rank to classify the sequences, default=""
-prefix,--prefix Prefix of all output files, default as the base of the input file
-minGroupNo,--minimumgroupnumber The minimum number of groups for prediction, default=5
-minSeqNo,--minimumsequencenumber The minimum number of sequences for prediction, default=50
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'classification', 'classify.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'verify':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script assigns classified sequences based on local cutoffs or a given global cutoff. For a sequence, a BLAST similarity score to the sequences of the predicted group is computed. The sequence is assigned if the BLAST similarity score is greater or equal than the similarity cut-off
Arguments: -i, --input The file of classified sequences, required
-f, --fasta The fasta file of classified sequences, required
-r, --reference The fasta file of reference sequences, required
-m, --maxseqno Maximum number of the sequences of the predicted taxon name from the classification file will be selected for the comparison to find the best match. If it is not given, all the sequences will be selected, default=0
-c, --classification The taxonomic classification file in tab delimited format
-rank, --rank The rank to classify the sequences, default=""
-prefix,--prefix Prefix of all output files, default as the base of the input file
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'classification', 'verify.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'krona':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script visualizes the classification/assignment results using Krona
Arguments: -i, --input The file of classified/assigned sequences, required
-c, --classification The fasta file of classified sequences, required
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'classification', 'visualizeClassification.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
elif sys.argv[1] == 'evaluate':
help = """
Usage: dnabarcoder %s <arguments>
version: %s
Description: The script compute classification metrices
Arguments: -i, --input The file of classified/assigned sequences, required
-c, --queryclassification The classification file of query sequences, required
-r, --refclassification The classification file of reference sequences, required
-o, --out The output folder, default= "dnabarcoder"
Written by Duong Vu [email protected]/[email protected]
""" # % (sys.argv[1], version)
arguments = sys.argv[2:]
if len(arguments) > 1:
cmd = os.path.join(path, 'classification', 'evaluate.py')
arguments.insert(0, cmd)
exe = sys.executable
arguments.insert(0, exe)
subprocess.call(arguments)
else:
print(help)
sys.exit(1)
else:
wrongcommand=True
if len(sys.argv) == 1 or wrongcommand==True:
print(default_help)