-
Notifications
You must be signed in to change notification settings - Fork 89
/
wf_common.cwl
1064 lines (1028 loc) · 36.1 KB
/
wf_common.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env cwl-runner
# Top-level header of the PGAP CWL workflow: identity, description and the
# feature requirements the steps of this file rely on.
cwlVersion: v1.2
class: Workflow
label: 'PGAP Pipeline'
doc: 'PGAP pipeline for external usage, powered via containers'
requirements:
  # Declared for the wf_*.cwl files that several steps run.
  - class: SubworkflowFeatureRequirement
  # Several step inputs list more than one source (e.g. the `go` lists).
  - class: MultipleInputFeatureRequirement
  # Directory inputs are loaded with their full recursive listing.
  - class: LoadListingRequirement
    loadListing: deep_listing
  - class: NetworkAccess
    networkAccess: true
inputs:
  #
  # User-specific input
  #
  # NOTE(review): no step visible in this part of the file reads `go`; the
  # steps below take their own `go` from the *_evaluate/success outputs.
  go:
    type: boolean[]
  # entries and seq_submit are handed to the genomic_source step.
  entries: File?
  seq_submit: File?
  taxid: int
  # Wired into ping_start as `instring`.
  gc_assm_name: string
  # locus_tag_prefix and dbname are consumed by the Add_Locus_Tags step.
  locus_tag_prefix: string?
  dbname: string?
  # Forwarded to both pinger steps (ping_start and ping_stop).
  report_usage: boolean
  #
  # User-independent, static input
  #
  # Declared as a string (not an int); passed unchanged to the sub-workflows.
  scatter_gather_nchunks:
    type: string
    default: '1'
  # Split into individual resources by the passdata step.
  supplemental_data:
    type: Directory
  # Optional; split by the blast_hits_cache_data_split_dir step.
  blast_hits_cache_data:
    type: Directory?
  submol_block_json: File
  ignore_all_errors: boolean?
  contact_as_author_possible: boolean?
  # XPath handed as `xpath_fail` to Prepare_Unannotated_Sequences_asndisc_evaluate.
  xpath_fail_initial_asndisc:
    type: string?
    doc: 'The default: setting is for standard call of pgap.cwl (for example, from pgap.py)'
    default: '//*[@severity="FATAL"]'
  # XPath handed as `xpath_fail` to Prepare_Unannotated_Sequences_asnvalidate_evaluate.
  # Selects ERROR/REJECT entries except those whose code contains one of the
  # listed identifiers.
  xpath_fail_initial_asnvalidate:
    type: string?
    doc: 'The default: setting is for standard call of pgap.cwl (for example, from pgap.py)'
    default: >
      //*[
      ( @severity="ERROR" or @severity="REJECT" )
      and not(contains(@code, "GENERIC_MissingPubRequirement"))
      and not(contains(@code, "SEQ_DESCR_ChromosomeLocation"))
      and not(contains(@code, "SEQ_DESCR_MissingLineage"))
      and not(contains(@code, "SEQ_DESCR_NoTaxonID"))
      and not(contains(@code, "SEQ_DESCR_OrganismIsUndefinedSpecies"))
      and not(contains(@code, "SEQ_DESCR_StrainWithEnvironSample"))
      and not(contains(@code, "SEQ_DESCR_BacteriaMissingSourceQualifier"))
      and not(contains(@code, "SEQ_DESCR_UnwantedCompleteFlag"))
      and not(contains(@code, "SEQ_FEAT_BadCharInAuthorLastName"))
      and not(contains(@code, "SEQ_FEAT_ShortIntron"))
      and not(contains(@code, "SEQ_INST_InternalNsInSeqRaw"))
      and not(contains(@code, "SEQ_INST_ProteinsHaveGeneralID"))
      and not(contains(@code, "SEQ_PKG_NucProtProblem"))
      and not(contains(@code, "SEQ_PKG_ComponentMissingTitle"))
      ]
  # XPath handed as `xpath_fail` to Final_Bacterial_Package_asndisc_evaluate.
  xpath_fail_final_asndisc:
    type: string?
    doc: 'The default: setting is for standard call of pgap.cwl (for example, from pgap.py)'
    default: '//*[@severity="FATAL"]'
  # XPath handed as `xpath_fail` to Final_Bacterial_Package_asnvalidate_evaluate.
  # Same exclusion set as the initial variant; only the order of the last two
  # SEQ_PKG_* clauses differs.
  xpath_fail_final_asnvalidate:
    type: string?
    doc: 'The default: setting is for standard call of pgap.cwl (for example, from pgap.py)'
    default: >
      //*[( @severity="ERROR" or @severity="REJECT" )
      and not(contains(@code, "GENERIC_MissingPubRequirement"))
      and not(contains(@code, "SEQ_DESCR_ChromosomeLocation"))
      and not(contains(@code, "SEQ_DESCR_MissingLineage"))
      and not(contains(@code, "SEQ_DESCR_NoTaxonID"))
      and not(contains(@code, "SEQ_DESCR_OrganismIsUndefinedSpecies"))
      and not(contains(@code, "SEQ_DESCR_StrainWithEnvironSample"))
      and not(contains(@code, "SEQ_DESCR_BacteriaMissingSourceQualifier"))
      and not(contains(@code, "SEQ_DESCR_UnwantedCompleteFlag"))
      and not(contains(@code, "SEQ_FEAT_BadCharInAuthorLastName"))
      and not(contains(@code, "SEQ_FEAT_ShortIntron"))
      and not(contains(@code, "SEQ_INST_InternalNsInSeqRaw"))
      and not(contains(@code, "SEQ_INST_ProteinsHaveGeneralID"))
      and not(contains(@code, "SEQ_PKG_ComponentMissingTitle"))
      and not(contains(@code, "SEQ_PKG_NucProtProblem"))
      ]
  # Forwarded to Prepare_Unannotated_Sequences.
  no_internet:
    type: boolean?
  # make_uuid and uuid_in are forwarded to ping_start.
  make_uuid:
    type: boolean?
    default: true
  uuid_in:
    type: File?
steps:
ping_start:
  # Invokes progs/pinger.cwl with state 'start' for workflow 'pgap'.
  # Its uuid_out feeds ping_stop and its outstring feeds genomic_source.
  run: progs/pinger.cwl
  in:
    report_usage: report_usage
    make_uuid: make_uuid
    uuid_in: uuid_in
    state:
      default: 'start'
    workflow:
      default: 'pgap'
    # The assembly name goes in as instring.
    instring: gc_assm_name
  out:
    - stdout
    - outstring
    - uuid_out
passdata:
  # Splits the supplemental_data directory into the individual resources
  # (models, caches, databases, tables) that later steps reference as
  # passdata/<name>.
  run: expr/supplemental_data_split_dir.cwl
  in:
    data: supplemental_data
  out:
    - 5s_model_path
    - 16s_model_path
    - 23s_model_path
    - AntiFamLib
    - all_order_specific_blastdb_file
    - asn2pas_xsl
    - identification_db_dir
    - CDDdata2
    - CDDdata
    - checkm_data_path
    - defline_cleanup_rules
    - filter_for_raw_checkm
    - gc_cache
    - gene_master_ini
    - hmm_path
    - hmms_tab
    - naming_hmms_combined
    - naming_hmms_tab
    - naming_sqlite
    - package_versions
    - rfam_amendments
    - rfam_model_path
    - rfam_stockholm
    - selenoproteins
    - species_genome_size
    - taxon_db
    - thresholds
    - uniColl_cache
    - uniColl_nuc_cache
    - univ_prot_xml
    - val_res_den_xml
    - wp_hashes
#
# massage passdata output here
#
Get_Order_Specific_Strings:
  # Feeds the order-specific BLAST db file from passdata to
  # progs/file2basenames.cwl; `values` is consumed by Get_Proteins.
  label: 'Get List of Order Specific Databases in the form of string[]'
  run: progs/file2basenames.cwl
  in:
    input: passdata/all_order_specific_blastdb_file
  out:
    - values
log_package_versions:
  # Passes the package_versions resource to progs/catlog.cwl.
  # No outputs are collected from this step.
  run: progs/catlog.cwl
  in:
    input:
      # Single source flattened into a list via merge_flattened.
      source: [passdata/package_versions]
      linkMerge: merge_flattened
  out: []
blast_hits_cache_data_split_dir:
  # Splits the optional blast_hits_cache_data directory into the hit cache
  # and the genus list used by the annotation passes.
  run: expr/blast_hits_cache_data_split_dir.cwl
  in:
    data: blast_hits_cache_data
  out: [blast_hits_cache, genus_list]
genus_list_file2ints:
  # Runs progs/file2ints.cwl on the genus_list file; `values` is wired into
  # the genus_list inputs of later steps.
  run: progs/file2ints.cwl
  in:
    input: blast_hits_cache_data_split_dir/genus_list
  out:
    - values
# end of massaging passdata output
genomic_source:  # PLANE
  # Runs the genomic source sub-workflow on the user-supplied entries and
  # seq_submit files. Its asncache, gencoll_asn and seqid_list outputs are
  # referenced by most downstream steps.
  run: genomic_source/wf_genomic_source_asn.cwl
  in:
    entries: entries
    seq_submit: seq_submit
    # taxid: taxid   (left disabled, as in the original)
    # The assembly name is taken from ping_start's outstring rather than
    # directly from the workflow input.
    gc_assm_name: ping_start/outstring
    taxon_db: passdata/taxon_db
  out:
    - gencoll_asn
    - seqid_list
    - stats_report
    - asncache
    - ids_out
    - submit_block_template
    - order
#
# Pseudo plane "default 1"
#
Prepare_Unannotated_Sequences:  # ORIGINAL TASK NAME: Prepare Unannotated Sequences # default 1
  # Runs bacterial_prepare_unannotated.cwl on the genomic_source outputs.
  # Produces master_desc, sequences and plasmids for the later steps.
  label: 'Prepare Unannotated Sequences'
  run: bacterial_prepare_unannotated.cwl
  in:
    asn_cache: genomic_source/asncache
    gc_assembly: genomic_source/gencoll_asn
    ids: genomic_source/seqid_list
    submit_block: genomic_source/submit_block_template
    taxon_db: passdata/taxon_db
    no_internet: no_internet
  out:
    - master_desc
    - sequences
    - plasmids
Prepare_Unannotated_Sequences_pgapx_input_check:
  # Runs progs/pgapx_input_check.cwl on the prepared sequences with fixed
  # size bounds; no outputs are collected.
  run: progs/pgapx_input_check.cwl
  in:
    input: Prepare_Unannotated_Sequences/sequences
    max_size:
      default: 15000000
    min_size:
      default: 300
    species_genome_size: passdata/species_genome_size
    taxon_db: passdata/taxon_db
    ignore_all_errors: ignore_all_errors
  out: []
Prepare_Unannotated_Sequences_text:
  # Runs progs/asn_translator.cwl on the prepared sequences; the result
  # ('sequences.text.asn') is the input of the asndisc_cpp step.
  run: progs/asn_translator.cwl
  in:
    input: Prepare_Unannotated_Sequences/sequences
    output_output:
      default: 'sequences.text.asn'
  out:
    - output
Prepare_Unannotated_Sequences_asndisc_cpp:
  # Runs progs/asndisc_cpp.cwl on the text form of the prepared sequences
  # and writes 'sequences.disc.xml'; output `o` is evaluated by
  # Prepare_Unannotated_Sequences_asndisc_evaluate.
  run: progs/asndisc_cpp.cwl
  in:
    XML:
      default: true
    genbank:
      default: false
    P:
      default: 't'
    a:
      default: 'c'
    asn_cache: genomic_source/asncache
    o_output:
      default: 'sequences.disc.xml'
    i: Prepare_Unannotated_Sequences_text/output
    # Fixed list of codes passed as `d`. It matches the list given to
    # Final_Bacterial_Package_std_validation as exclude_asndisc_codes,
    # minus BAD_LOCUS_TAG_FORMAT.
    d:
      default:
        - AUTODEF_USER_OBJECT
        - FEATURE_LIST
        - BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS
        - PARTIAL_CDS_COMPLETE_SEQUENCE
        - MISSING_AFFIL
        - OVERLAPPING_CDS
        - BAD_BGPIPE_QUALS
        - FLATFILE_FIND
        - COMMENT_PRESENT
        - SHORT_PROT_SEQUENCES
        - OVERLAPPING_GENES
        - EXTRA_GENES
        - N_RUNS
        - TAX_LOOKUP_MISMATCH
        - TAX_LOOKUP_MISSING
  out:
    - o
Prepare_Unannotated_Sequences_asndisc_evaluate:
  # Runs progs/xml_evaluate.cwl on the initial asndisc report with the
  # xpath_fail_initial_asndisc expression. `success` is one of the two
  # `go` sources used by the downstream planes.
  run: progs/xml_evaluate.cwl
  in:
    input: Prepare_Unannotated_Sequences_asndisc_cpp/o
    xpath_fail: xpath_fail_initial_asndisc
    ignore_all_errors: ignore_all_errors
    stdout_redir:
      default: 'initial_asndisc_diag.xml'
  out:
    - success
    - xml_output
Prepare_Unannotated_Sequences_asnvalidate:
  # Runs progs/asnvalidate.cwl on the prepared sequences with a fixed set of
  # option defaults and writes 'sequences.val'; output `o` is evaluated by
  # Prepare_Unannotated_Sequences_asnvalidate_evaluate.
  run: progs/asnvalidate.cwl
  in:
    Q: {default: 0}
    R: {default: 5}
    i: Prepare_Unannotated_Sequences/sequences
    o_output: {default: 'sequences.val'}
    v: {default: 4}
    A: {default: true}
    U: {default: true}
    Z: {default: true}
    y: {default: true}
  out:
    - o
Prepare_Unannotated_Sequences_asnvalidate_evaluate:
  # Runs progs/xml_evaluate.cwl on the initial asnvalidate report with the
  # xpath_fail_initial_asnvalidate expression. `success` is one of the two
  # `go` sources used by the downstream planes.
  run: progs/xml_evaluate.cwl
  in:
    input: Prepare_Unannotated_Sequences_asnvalidate/o
    xpath_fail: xpath_fail_initial_asnvalidate
    ignore_all_errors: ignore_all_errors
    stdout_redir:
      default: 'initial_asnval_diag.xml'
  out:
    - success
    - xml_output
Cache_Entrez_Gene:  # ORIGINAL TASK NAME: Cache Entrez Gene # default 1
  # Runs cache_entrez_gene.cwl on the prepared sequences. `go` lists both
  # initial validation results.
  label: 'Cache Entrez Gene'
  run: cache_entrez_gene.cwl
  in:
    # Two cache sources listed without an explicit linkMerge.
    asn_cache:
      - genomic_source/asncache
      - passdata/uniColl_cache
    egene_ini: passdata/gene_master_ini
    input: Prepare_Unannotated_Sequences/sequences
    go:
      - Prepare_Unannotated_Sequences_asndisc_evaluate/success
      - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
  out:
    - prok_entrez_gene_stuff
Create_Genomic_BLASTdb:  # default 1
  # Runs progs/gp_makeblastdb.cwl on genomic_source/ids_out with dbtype
  # 'nucl'; the blastdb output is used by protein_alignment as blastdb_dir.
  label: 'Create Genomic BLASTdb'  # default 1
  run: progs/gp_makeblastdb.cwl
  in:
    ids: genomic_source/ids_out
    title:
      default: 'BLASTdb created by GPipe'
    asn_cache:
      source:
        - genomic_source/asncache
      linkMerge: merge_flattened
    dbtype:
      default: 'nucl'
  out:
    - blastdb
#
# end of pseudo plane "default 1"
#
Get_Proteins:
  # Runs wf_bacterial_prot_src.cwl with the taxid and the protein resources
  # from passdata. all_prots goes to protein_alignment; selected_blastdb goes
  # to the 1st and 2nd annotation passes.
  label: 'Get Proteins'
  run: wf_bacterial_prot_src.cwl
  in:
    uniColl_asn_cache: passdata/uniColl_cache
    naming_sqlite: passdata/naming_sqlite
    taxid: taxid
    tax_sql_file: passdata/taxon_db
    blastdb_dir: passdata/identification_db_dir
    all_order_specific_blastdb: Get_Order_Specific_Strings/values
  out:
    - universal_clusters
    - all_prots
    - selected_blastdb
bacterial_ncrna:  # PLANE
  # Runs bacterial_ncrna/wf_gcmsearch.cwl with the Rfam resources from
  # passdata; `annots` is consumed by bacterial_annot as ncrna_annots.
  run: bacterial_ncrna/wf_gcmsearch.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    seqids: genomic_source/seqid_list
    model_path: passdata/rfam_model_path
    rfam_amendments: passdata/rfam_amendments
    rfam_stockholm: passdata/rfam_stockholm
    taxon_db: passdata/taxon_db
  out:
    - annots
bacterial_mobile_elem:  # PLANE
  # Runs bacterial_mobile_elem/wf_bacterial_mobile_elem.cwl; `annots` is
  # consumed by bacterial_annot as Execute_CRISPRs_annots.
  run: bacterial_mobile_elem/wf_bacterial_mobile_elem.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    seqids: genomic_source/seqid_list
  out:
    - annots
bacterial_noncoding:  # PLANE
  # Runs bacterial_noncoding/wf_bacterial_noncoding.cwl with the 5S/16S/23S
  # model paths; the three annotation outputs are consumed by bacterial_annot.
  run: bacterial_noncoding/wf_bacterial_noncoding.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    seqids: genomic_source/seqid_list
    model_path_5s: passdata/5s_model_path
    model_path_16s: passdata/16s_model_path
    model_path_23s: passdata/23s_model_path
    rfam_amendments: passdata/rfam_amendments
    rfam_stockholm: passdata/rfam_stockholm
    taxon_db: passdata/taxon_db
  out:
    - annotations_5s
    - annotations_16s
    - annotations_23s
bacterial_trna:  # PLANE
  # Runs bacterial_trna/wf_trnascan.cwl; `annots` is consumed by
  # bacterial_annot as trna_annots.
  run: bacterial_trna/wf_trnascan.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    seqids: genomic_source/seqid_list
    taxid: taxid
    taxon_db: passdata/taxon_db
    scatter_gather_nchunks: scatter_gather_nchunks
  out:
    - annots
bacterial_annot:  # PLANE
  # Runs bacterial_annot/wf_bacterial_annot_pass1.cwl on the prepared
  # sequences, combining the tRNA, ncRNA, CRISPR and rRNA annotations
  # produced by the preceding planes.
  run: bacterial_annot/wf_bacterial_annot_pass1.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    inseq: Prepare_Unannotated_Sequences/sequences
    hmm_path: passdata/hmm_path
    hmms_tab: passdata/hmms_tab
    selenoproteins: passdata/selenoproteins
    scatter_gather_nchunks: scatter_gather_nchunks
    uniColl_cache: passdata/uniColl_cache
    naming_sqlite: passdata/naming_sqlite
    trna_annots: bacterial_trna/annots
    ncrna_annots: bacterial_ncrna/annots
    nogenbank:
      default: true
    Execute_CRISPRs_annots: bacterial_mobile_elem/annots
    Generate_16S_rRNA_Annotation_annotation: bacterial_noncoding/annotations_16s
    Generate_23S_rRNA_Annotation_annotation: bacterial_noncoding/annotations_23s
    Post_process_CMsearch_annotations_annots_5S: bacterial_noncoding/annotations_5s
    # NOTE(review): hard-coded absolute path; the same default appears in
    # bacterial_annot_misc. Confirm it resolves in the runtime environment.
    genemark_path:
      default: '/netmnt/vast01/gp/ThirdParty/GeneMark/'
    thresholds: passdata/thresholds
  out:
    - lds2
    - seqids
    - proteins
    - aligns
    - annotation
    - out_hmm_params
    - outseqs
    - prot_ids
    - models1
spurious_annot_prelim:  # PLANE
  # Runs spurious_annot/wf_spurious_annot_pass1.cwl on the proteins, seqids
  # and lds2 from bacterial_annot with the AntiFam library. Its output feeds
  # bacterial_annot_1st_pass as prot_ids_B2.
  run: spurious_annot/wf_spurious_annot_pass1.cwl
  in:
    Extract_ORF_Proteins_proteins: bacterial_annot/proteins
    Extract_ORF_Proteins_seqids: bacterial_annot/seqids
    Extract_ORF_Proteins_lds2: bacterial_annot/lds2
    AntiFamLib: passdata/AntiFamLib
    sequence_cache: genomic_source/asncache
    scatter_gather_nchunks: scatter_gather_nchunks
  out:
    - AntiFam_tainted_proteins_I___oseqids
bacterial_annot_1st_pass:  # PLANE
  # Runs bacterial_annot/wf_bacterial_annot_pass2.cwl using the pass-1
  # outputs, the spurious-protein ids and the BLAST resources.
  run: bacterial_annot/wf_bacterial_annot_pass2.cwl
  in:
    lds2: bacterial_annot/lds2
    proteins: bacterial_annot/proteins
    prot_ids_A: bacterial_annot/seqids
    prot_ids_B1: bacterial_annot/prot_ids
    prot_ids_B2: spurious_annot_prelim/AntiFam_tainted_proteins_I___oseqids
    identification_db_dir: passdata/identification_db_dir
    blastdb: Get_Proteins/selected_blastdb
    # Note: `annotation` is wired to the pass-1 `outseqs` output.
    annotation: bacterial_annot/outseqs
    sequence_cache: genomic_source/asncache
    unicoll_cache: passdata/uniColl_cache
    raw_seqs: Prepare_Unannotated_Sequences/sequences
    plasmids: Prepare_Unannotated_Sequences/plasmids
    scatter_gather_nchunks: scatter_gather_nchunks
    taxid: taxid
    blast_hits_cache: blast_hits_cache_data_split_dir/blast_hits_cache
    taxon_db: passdata/taxon_db
    genus_list: genus_list_file2ints/values
  out:
    - aligns  # label: "goes to protein_alignment/Seed Search Compartments/compartments"
protein_alignment:  # PLANE
  # Runs protein_alignment/wf_protein_alignment.cwl with the genomic BLAST
  # db, the compartments from bacterial_annot_1st_pass and the protein set
  # from Get_Proteins. `align` is consumed by bacterial_annot_misc.
  run: protein_alignment/wf_protein_alignment.cwl
  in:
    # Both initial validation results are wired in as `go`.
    go:
      source:
        - Prepare_Unannotated_Sequences_asndisc_evaluate/success
        - Prepare_Unannotated_Sequences_asnvalidate_evaluate/success
    asn_cache: genomic_source/asncache
    uniColl_asn_cache: passdata/uniColl_cache
    blastdb_dir: Create_Genomic_BLASTdb/blastdb
    taxid: taxid
    tax_sql_file: passdata/taxon_db
    gc_assembly: genomic_source/gencoll_asn
    compartments: bacterial_annot_1st_pass/aligns
    all_prots: Get_Proteins/all_prots
  out:
    - align
    - align_non_match
bacterial_annot_misc:
  # Runs bacterial_annot/wf_bacterial_annot_pass3.cwl, combining the pass-1
  # HMM alignments and models with the protein alignments. Its outputs feed
  # spurious_annot_final, bacterial_annot_2nd_pass and the validation steps.
  run: bacterial_annot/wf_bacterial_annot_pass3.cwl
  in:
    AntiFamLib: passdata/AntiFamLib
    uniColl_cache: passdata/uniColl_cache
    sequence_cache: genomic_source/asncache
    hmm_aligns: bacterial_annot/aligns
    scatter_gather_nchunks: scatter_gather_nchunks
    prot_aligns: protein_alignment/align  # label: "Filter Protein Alignments/align"
    annotation: bacterial_annot/annotation
    models1: bacterial_annot/models1
    raw_seqs: Prepare_Unannotated_Sequences/sequences
    thresholds: passdata/thresholds
    naming_sqlite: passdata/naming_sqlite
    # Original note: Run GeneMark Training/hmm_params (EXTERNAL, put to input/)
    hmm_params: bacterial_annot/out_hmm_params
    selenoproteins: passdata/selenoproteins
    naming_hmms_combined: passdata/naming_hmms_combined
    # Note: `hmms_tab` is wired to naming_hmms_tab here, whereas
    # bacterial_annot uses passdata/hmms_tab.
    hmms_tab: passdata/naming_hmms_tab
    wp_hashes: passdata/wp_hashes
    taxon_db: passdata/taxon_db
    # NOTE(review): hard-coded absolute path; the same default appears in
    # bacterial_annot. Confirm it resolves in the runtime environment.
    genemark_path:
      default: '/netmnt/vast01/gp/ThirdParty/GeneMark/'
  out:
    - Find_Best_Evidence_Alignments_aligns
    - Run_GeneMark_Post_models
    - Extract_Model_Proteins_seqids
    - Extract_Model_Proteins_lds2
    - Extract_Model_Proteins_proteins
    - Search_Naming_HMMs_hmm_hits
    - Assign_Naming_HMM_to_Proteins_assignments
    - Name_by_WPs_names
    - PGAP_plus_ab_initio_annotation
spurious_annot_final:
  # Runs spurious_annot/wf_spurious_annot_pass2.cwl on the model proteins
  # and annotation from bacterial_annot_misc. The Good_AntiFam_filtered_*
  # outputs are consumed by bacterial_annot_2nd_pass.
  run: spurious_annot/wf_spurious_annot_pass2.cwl
  in:
    Extract_Model_Proteins_proteins: bacterial_annot_misc/Extract_Model_Proteins_proteins
    Extract_Model_Proteins_seqids: bacterial_annot_misc/Extract_Model_Proteins_seqids
    Extract_Model_Proteins_lds2: bacterial_annot_misc/Extract_Model_Proteins_lds2
    AntiFamLib: passdata/AntiFamLib
    sequence_cache: genomic_source/asncache
    scatter_gather_nchunks: scatter_gather_nchunks
    input_models: bacterial_annot_misc/PGAP_plus_ab_initio_annotation
  out: [AntiFam_tainted_proteins___oseqids, Good_AntiFam_filtered_annotations_out, Good_AntiFam_filtered_proteins_output]
bacterial_annot_2nd_pass:
  # Runs bacterial_annot/wf_bacterial_annot_pass4.cwl on the AntiFam-filtered
  # annotation together with the naming assignments from
  # bacterial_annot_misc. out_annotation is consumed by Add_Locus_Tags.
  run: bacterial_annot/wf_bacterial_annot_pass4.cwl
  in:
    lds2: bacterial_annot_misc/Extract_Model_Proteins_lds2
    proteins: bacterial_annot_misc/Extract_Model_Proteins_proteins
    annotation: spurious_annot_final/Good_AntiFam_filtered_annotations_out
    Good_AntiFam_filtered_proteins_gilist: spurious_annot_final/Good_AntiFam_filtered_proteins_output
    sequence_cache: genomic_source/asncache
    uniColl_cache: passdata/uniColl_cache
    identification_db_dir: passdata/identification_db_dir
    naming_sqlite: passdata/naming_sqlite
    hmm_assignments: bacterial_annot_misc/Assign_Naming_HMM_to_Proteins_assignments
    wp_assignments: bacterial_annot_misc/Name_by_WPs_names
    Extract_Model_Proteins_prot_ids: bacterial_annot_misc/Extract_Model_Proteins_seqids
    CDDdata: passdata/CDDdata
    CDDdata2: passdata/CDDdata2
    thresholds: passdata/thresholds
    defline_cleanup_rules: passdata/defline_cleanup_rules
    blastdb: Get_Proteins/selected_blastdb
    scatter_gather_nchunks: scatter_gather_nchunks
    taxid: taxid
    blast_hits_cache: blast_hits_cache_data_split_dir/blast_hits_cache
    taxon_db: passdata/taxon_db
    genus_list: genus_list_file2ints/values
  out:
    - out_annotation
# #
# # Pseudo plane default 2, we do not need that for new submissions in off-NCBI environment
# #
# # Preserve_Annotations: # Pseudo plane default 2
# # run: task_types/tt_preserve_annot.cwl
# # in:
# # asn_cache:
# # source: [genomic_source/asncache]
# # linkMerge: merge_flattened
# # input_annotation: bacterial_annot/annotation
# # rfam_amendments: rfam_amendments
# # no_ncRNA:
# # default: true
# # out: [annotations]
# # preserve_annot_markup: # Pseudo plane default 2
# # # uncharted territory!!!
# # run: preserve_annot_markup.cwl # Preserve Product Accessions
# # in:
# # #seq_cache: genobacterial_prepare_unannotated/asncache
# # #unicoll_cache: uniColl_cache
# # input_annotation: Preserve_Annotations/annotations
# # asn_cache: [genomic_source/asncache, uniColl_cache]
# # egene_ini: gene_master_ini
# # gc_assembly: genomic_source/gencoll_asn
# # input: Preserve_Annotations/annotations
# # prok_entrez_gene_stuff: Cache_Entrez_Gene/prok_entrez_gene_stuff
# # out: [annotations]
# #
# # End of Pseudo plane default 2
# #
# # This step takes input from bacterial_annot 4/Bacterial Annot Filter, see GP-23942
# # Status:
# # tasktype coded, input/output matches
# # application not coded
# ###############################################
# # AMR plane is skipped here; it is planned for a later stage
# ###############################################
bacterial_orthology_conditional:
  # Runs bacterial_orthology/wf_bacterial_orthology_conditional.cwl on the
  # locus-tagged annotation from Add_Locus_Tags. `output` goes to
  # Final_Bacterial_Package_asn_cleanup.
  run: bacterial_orthology/wf_bacterial_orthology_conditional.cwl
  in:
    input: Add_Locus_Tags/output
    taxid: taxid
    naming_sqlite: passdata/naming_sqlite
    taxon_db: passdata/taxon_db
    gc_cache: passdata/gc_cache
    # The nucleotide uniColl cache is listed before the genomic cache.
    asn_cache:
      source:
        - passdata/uniColl_nuc_cache
        - genomic_source/asncache
      linkMerge: merge_flattened
    genus_list: genus_list_file2ints/values
    # Default is a one-element list holding the literal string 'blastdb'.
    blastdb:
      default:
        - blastdb
    scatter_gather_nchunks: scatter_gather_nchunks
    gencoll_asn: genomic_source/gencoll_asn
  out:
    - output
Add_Locus_Tags:
  # Runs progs/add_locus_tags.cwl on the 2nd-pass annotation with the
  # optional user-supplied locus_tag_prefix and dbname; `output` feeds
  # bacterial_orthology_conditional.
  run: progs/add_locus_tags.cwl
  in:
    input: bacterial_annot_2nd_pass/out_annotation
    locus_tag_prefix: locus_tag_prefix
    dbname: dbname
  out:
    - output
#
# Pseudo plane default 3
#
#
# Final_Bacterial_Package task
#
Final_Bacterial_Package_asn_cleanup:
  # Runs progs/asn_cleanup.cwl on the bacterial_orthology_conditional
  # output; `annotation` feeds Final_Bacterial_Package_final_bact_asn.
  run: progs/asn_cleanup.cwl
  in:
    inp_annotation: bacterial_orthology_conditional/output  # production
  out:
    - annotation
Final_Bacterial_Package_final_bact_asn:
  # Runs progs/final_bact_asn.cwl on the cleaned-up annotation together with
  # the assembly, master descriptor and order from earlier steps. `outfull`
  # feeds dumb_down_as_required and bact_univ_prot_stats.
  run: progs/final_bact_asn.cwl
  in:
    annotation:
      source:
        - Final_Bacterial_Package_asn_cleanup/annotation
      linkMerge: merge_flattened
    asn_cache: genomic_source/asncache
    gc_assembly: genomic_source/gencoll_asn  # gc_create_from_sequences
    master_desc: Prepare_Unannotated_Sequences/master_desc
    it: {default: true}
    submission_mode_genbank: {default: true}
    nogenbank: {default: true}
    order: genomic_source/order
  out:
    - outfull
Final_Bacterial_Package_dumb_down_as_required:
  # Runs progs/dumb_down_as_required.cwl on the final_bact_asn output with
  # fixed thresholds; `outent` feeds ent2sqn, the GFF report, std_validation
  # and proc_annot_stats.
  run: progs/dumb_down_as_required.cwl
  in:
    annotation: Final_Bacterial_Package_final_bact_asn/outfull
    asn_cache:
      source:
        - genomic_source/asncache
      linkMerge: merge_flattened
    # Fixed numeric thresholds passed to the tool.
    max_x_ratio: {default: 0.1}
    max_x_run: {default: 3}
    partial_cov_threshold: {default: 65}
    partial_len_threshold: {default: 30}
    drop_partial_in_the_middle: {default: true}
    submission_mode_genbank: {default: true}
    submol_block_json: submol_block_json
    nogenbank: {default: true}
    it: {default: true}
  out:
    - outent
Final_Bacterial_Package_ent2sqn:
  # Runs progs/ent2sqn.cwl on the dumbed-down entries and writes
  # 'annot-wo-checksum.sqn'; `output` feeds add_checksum_sqn, sqn2gbent and
  # std_validation.
  run: progs/ent2sqn.cwl
  in:
    annotation: Final_Bacterial_Package_dumb_down_as_required/outent
    asn_cache:
      source:
        - genomic_source/asncache
      linkMerge: merge_flattened
    gc_assembly: genomic_source/gencoll_asn  # gc_create_from_sequences
    submit_block_template:
      source:
        - genomic_source/submit_block_template
      linkMerge: merge_flattened
    it:
      default: true
    contact_as_author_possible: contact_as_author_possible
    output_impl:
      default: 'annot-wo-checksum.sqn'
  out:
    - output
add_checksum_sqn:
  # Runs progs/annot_checksum.cwl in mode 'add' on the ent2sqn output
  # (input format 'seq-submit'), writing 'annot.sqn'.
  label: 'Add Checksum to SQN'
  run: progs/annot_checksum.cwl
  in:
    input: Final_Bacterial_Package_ent2sqn/output
    output_name: {default: 'annot.sqn'}
    t: {default: true}
    ifmt: {default: 'seq-submit'}
    mode: {default: 'add'}
  out:
    - output
Final_Bacterial_Package_sqn2gbent:
  # Runs progs/sqn2gbent.cwl on the ent2sqn output (the one without the
  # checksum) and writes 'annot-gb-wo-checksum.ent'. `output` feeds checkm,
  # add_checksum_gbent, the report steps and std_validation.
  run: progs/sqn2gbent.cwl
  doc: 'We are not taking here sqn with added annot checksum.'
  in:
    input: Final_Bacterial_Package_ent2sqn/output
    it: {default: true}
    out_name: {default: 'annot-gb-wo-checksum.ent'}
  out:
    - output
checkm:
  # Runs checkm/wf_checkm.cwl on the sqn2gbent output with the CheckM
  # resources from passdata.
  label: 'Run CheckM in PGAP graph'
  doc: 'Identify completeness of genome based on core HMM models in CheckM'
  run: checkm/wf_checkm.cwl
  in:
    models: Final_Bacterial_Package_sqn2gbent/output
    checkm_data_path: passdata/checkm_data_path
    filter_for_raw_checkm: passdata/filter_for_raw_checkm
    taxid: taxid
    taxon_db: passdata/taxon_db
  out:
    - checkm_raw
    - checkm_results
add_checksum_gbent:
  # Runs progs/annot_checksum.cwl in mode 'add' on the sqn2gbent output
  # (input format 'seq-entry'), writing 'annot-gb.ent'.
  label: 'Add Checksum to Genbank class ENT'
  run: progs/annot_checksum.cwl
  in:
    input: Final_Bacterial_Package_sqn2gbent/output
    output_name: {default: 'annot-gb.ent'}
    t: {default: true}
    ifmt: {default: 'seq-entry'}
    mode: {default: 'add'}
  out:
    - output
Generate_Annotation_Reports_gff:
  # Runs progs/gp_annot_format.cwl on the dumbed-down entries with output
  # format 'gff3'; `output` is listed in ping_stop's infile.
  run: progs/gp_annot_format.cwl
  in:
    input: Final_Bacterial_Package_dumb_down_as_required/outent
    ifmt: {default: 'seq-entry'}
    t: {default: true}
    ofmt: {default: 'gff3'}
    exclude_external: {default: true}
  out:
    - output
Generate_Annotation_Reports_gbk:
  # Runs progs/asn2flat.cwl on the sqn2gbent output with fixed option
  # defaults; `output` is listed in ping_stop's infile.
  run: progs/asn2flat.cwl
  in:
    input: Final_Bacterial_Package_sqn2gbent/output
    no_external: {default: true}
    type: {default: 'seq-entry'}
    mode: {default: 'entrez'}
    style: {default: 'master'}
    gbload: {default: false}
  out:
    - output
Generate_Annotation_Reports_nuc_fasta:
  # Runs progs/asn2fasta.cwl on the sqn2gbent output, requesting the
  # nucleotide FASTA under the name 'annot.fna'.
  run: progs/asn2fasta.cwl
  in:
    i: Final_Bacterial_Package_sqn2gbent/output
    type: {default: 'seq-entry'}
    nuc_fasta_name: {default: 'annot.fna'}
  out:
    - nuc_fasta
Generate_Annotation_Reports_prot_fasta:
  # Runs progs/asn2fasta.cwl on the sqn2gbent output, requesting the
  # protein FASTA under the name 'annot.faa'.
  run: progs/asn2fasta.cwl
  in:
    i: Final_Bacterial_Package_sqn2gbent/output
    type: {default: 'seq-entry'}
    prot_fasta_name: {default: 'annot.faa'}
  out:
    - prot_fasta
Generate_Annotation_Reports_cds_nuc_fasta:
  # Runs progs/asn2fasta.cwl on the sqn2gbent output with feats
  # 'fasta_cds_na', writing 'annot_cds_from_genomic.fna'.
  run: progs/asn2fasta.cwl
  in:
    i: Final_Bacterial_Package_sqn2gbent/output
    type: {default: 'seq-entry'}
    feats: {default: 'fasta_cds_na'}
    fasta_name: {default: 'annot_cds_from_genomic.fna'}
  out:
    - fasta
Generate_Annotation_Reports_cds_prot_fasta:
  # Runs progs/asn2fasta.cwl on the sqn2gbent output with feats
  # 'fasta_cds_aa', writing 'annot_translated_cds.faa'.
  run: progs/asn2fasta.cwl
  in:
    i: Final_Bacterial_Package_sqn2gbent/output
    type: {default: 'seq-entry'}
    feats: {default: 'fasta_cds_aa'}
    fasta_name: {default: 'annot_translated_cds.faa'}
  out:
    - fasta
Final_Bacterial_Package_std_validation:
  # Runs progs/std_validation.cwl on the final package artefacts (entries,
  # GenBank ENT and SQN). Its outputs feed the final *_evaluate steps,
  # val_stats and xsltproc_asndisc.
  run: progs/std_validation.cwl
  in:
    annotation: Final_Bacterial_Package_dumb_down_as_required/outent
    # NOTE(review): unlike the other asn_cache wirings that use a `source`
    # list, no linkMerge is given here. Confirm this matches the type
    # expected by progs/std_validation.cwl.
    asn_cache:
      source:
        - genomic_source/asncache
    # Fixed list of asndisc codes handed to the tool as exclusions. It is the
    # list given to the initial asndisc_cpp step as `d`, plus
    # BAD_LOCUS_TAG_FORMAT.
    exclude_asndisc_codes:
      default:
        - AUTODEF_USER_OBJECT
        - FEATURE_LIST
        - BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS
        - PARTIAL_CDS_COMPLETE_SEQUENCE
        - MISSING_AFFIL
        - OVERLAPPING_CDS
        - BAD_BGPIPE_QUALS
        - FLATFILE_FIND
        - COMMENT_PRESENT
        - SHORT_PROT_SEQUENCES
        - OVERLAPPING_GENES
        - EXTRA_GENES
        - N_RUNS
        - BAD_LOCUS_TAG_FORMAT
        - TAX_LOOKUP_MISMATCH
        - TAX_LOOKUP_MISSING
    inent: Final_Bacterial_Package_dumb_down_as_required/outent
    ingb: Final_Bacterial_Package_sqn2gbent/output
    insqn: Final_Bacterial_Package_ent2sqn/output
    master_desc:
      source:
        - Prepare_Unannotated_Sequences/master_desc
      linkMerge: merge_flattened
    submit_block_template:
      source:
        - genomic_source/submit_block_template
      linkMerge: merge_flattened
    it:
      default: true
    submission_mode_genbank:
      default: true
    nogenbank:
      default: true
  out:
    - outdisc
    - outdiscxml
    - outmetamaster
    - outval
Final_Bacterial_Package_asndisc_evaluate:
  # Runs progs/xml_evaluate.cwl on the final asndisc report with the
  # xpath_fail_final_asndisc expression. Only xml_output is collected here
  # (no `success`).
  run: progs/xml_evaluate.cwl
  in:
    # NOTE(review): reads `outdisc`, while Validate_Annotation_xsltproc_asndisc
    # reads `outdiscxml`. Confirm which one xml_evaluate is meant to receive.
    input: Final_Bacterial_Package_std_validation/outdisc
    xpath_fail: xpath_fail_final_asndisc
    ignore_all_errors: ignore_all_errors
    stdout_redir:
      default: 'final_asndisc_diag.xml'
  out:
    - xml_output
Final_Bacterial_Package_asnvalidate_evaluate:
  # Runs progs/xml_evaluate.cwl on the final asnvalidate report (outval)
  # with the xpath_fail_final_asnvalidate expression.
  run: progs/xml_evaluate.cwl
  in:
    input: Final_Bacterial_Package_std_validation/outval
    xpath_fail: xpath_fail_final_asnvalidate
    ignore_all_errors: ignore_all_errors
    stdout_redir:
      default: 'final_asnval_diag.xml'
  out:
    - success
    - xml_output
Final_Bacterial_Package_val_stats:  # TESTED (unit test)
  # Runs progs/val_stats.cwl on the std_validation outval report; `xml` is
  # consumed by Validate_Annotation_xsltproc_asnvalidate.
  run: progs/val_stats.cwl
  in:
    annot_val: Final_Bacterial_Package_std_validation/outval
    c_toolkit:
      default: true
  out:
    - output
    - xml
#
# end of Final_Bacterial_Package task
#
#### we do not need this
# Prepare_Init_Refseq_Molecules:
# run: progs/
#
# Validate_Annotation task
#
Validate_Annotation_bact_univ_prot_stats:
  # Runs progs/bact_univ_prot_stats.cwl on the final_bact_asn output with
  # the naming-HMM hits and annotation from bacterial_annot_misc. The
  # var_* outputs are merged by the collect_annot_* steps.
  run: progs/bact_univ_prot_stats.cwl
  in:
    annot_request_id:
      default: -1  # this is dummy annot_request_id
    hmm_search: bacterial_annot_misc/Search_Naming_HMMs_hmm_hits
    hmm_search_proteins: bacterial_annot_misc/PGAP_plus_ab_initio_annotation
    input: Final_Bacterial_Package_final_bact_asn/outfull
    univ_prot_xml: passdata/univ_prot_xml
    val_res_den_xml: passdata/val_res_den_xml
    it:
      default: true
  out:
    - bact_univ_prot_stats_old_xml
    - var_bact_univ_prot_details_xml
    - var_bact_univ_prot_stats_xml
Validate_Annotation_proc_annot_stats:
  # Runs progs/proc_annot_stats.cwl on the dumbed-down entries with
  # max_unannotated_region fixed at 5000. The var_* outputs are merged by
  # the collect_annot_* steps.
  run: progs/proc_annot_stats.cwl
  in:
    input: Final_Bacterial_Package_dumb_down_as_required/outent
    max_unannotated_region:
      default: 5000
    univ_prot_xml: passdata/univ_prot_xml
    val_res_den_xml: passdata/val_res_den_xml
    it:
      default: true
  out:
    - var_proc_annot_stats_xml
    - var_proc_annot_details_xml
Validate_Annotation_xsltproc_asnvalidate:
  # Runs progs/xsltproc.cwl with the asn2pas_xsl stylesheet on the val_stats
  # XML, writing 'var_proc_annot_stats.val.xml'.
  run: progs/xsltproc.cwl
  in:
    xml: Final_Bacterial_Package_val_stats/xml
    xslt: passdata/asn2pas_xsl
    output_name:
      default: 'var_proc_annot_stats.val.xml'
  out:
    - output
Validate_Annotation_xsltproc_asndisc:
  # Runs progs/xsltproc.cwl with the asn2pas_xsl stylesheet on the
  # std_validation outdiscxml, writing 'var_proc_annot_stats.disc.xml'.
  run: progs/xsltproc.cwl
  in:
    xml: Final_Bacterial_Package_std_validation/outdiscxml
    xslt: passdata/asn2pas_xsl
    output_name:
      default: 'var_proc_annot_stats.disc.xml'
  out:
    - output
Validate_Annotation_collect_annot_stats:  # TESTED (unit test)
  # Runs progs/collect_annot_stats.cwl over the four statistics XML files
  # (flattened into one list), writing 'proc_annot_stats.xml'.
  run: progs/collect_annot_stats.cwl
  in:
    input:
      source:
        - Validate_Annotation_bact_univ_prot_stats/var_bact_univ_prot_stats_xml
        - Validate_Annotation_proc_annot_stats/var_proc_annot_stats_xml
        - Validate_Annotation_xsltproc_asndisc/output
        - Validate_Annotation_xsltproc_asnvalidate/output
      linkMerge: merge_flattened
    output_name:
      default: 'proc_annot_stats.xml'
  out:
    - output
Validate_Annotation_collect_annot_details:
  # Runs progs/collect_annot_stats.cwl over the two details XML files
  # (flattened into one list), writing 'proc_annot_details.xml'.
  run: progs/collect_annot_stats.cwl
  in:
    input:
      source:
        - Validate_Annotation_bact_univ_prot_stats/var_bact_univ_prot_details_xml
        - Validate_Annotation_proc_annot_stats/var_proc_annot_details_xml
      linkMerge: merge_flattened
    output_name:
      default: 'proc_annot_details.xml'
  out:
    - output
ping_stop:
  # Invokes progs/pinger.cwl with state 'stop' for workflow 'pgap', reusing
  # the uuid produced by ping_start.
  run: progs/pinger.cwl
  in:
    report_usage: report_usage
    uuid_in: ping_start/uuid_out
    state:
      default: 'stop'
    workflow:
      default: 'pgap'
    # Note: the sources listed for infile should be the same as all of the
    # outputs of this workflow, so that this step is guaranteed to run last.
    # NOTE(review): the workflow `outputs` section is not visible in this
    # part of the file; confirm the list below still matches it.
    infile:
      - Final_Bacterial_Package_sqn2gbent/output
      - Generate_Annotation_Reports_gff/output
      - Generate_Annotation_Reports_gbk/output
      - Generate_Annotation_Reports_nuc_fasta/nuc_fasta
      - Generate_Annotation_Reports_prot_fasta/prot_fasta
  out:
    - stdout
#
# end of Validate_Annotation task
#
#
# End of Pseudo plane default 3
#
###############################################
# taxonomy plane is skipped here; it is planned for a later stage
###############################################
#
# Pseudo plane default 4
#
# task: Generate Annotation Reports
#
# Generate_Annotation_Reports_pgaap_prepare_review:
# run: progs/pgaap_prepare_review.cwl
# Generate_Annotation_Reports_lds2_indexer:
# run: progs/lds2_indexer.cwl
#
# comparisons only for pre-existing annotation, one of the next phases
#
# # Generate_Annotation_Reports_comparison_format_curr_comparison:
# # run: progs/comparison_format.cwl
# # Generate_Annotation_Reports_comparison_format_prev_comparison:
# # run: progs/comparison_format.cwl
# # Generate_Annotation_Reports_comparison_format_prev_assm_comparison:
# # run: progs/comparison_format.cwl
# # Generate_Annotation_Reports_comparison_format_ref_comparison:
# # run: progs/comparison_format.cwl
# Generate_Annotation_Reports_bact_asn_stats:
# run: progs/bact_asn_stats.cwl
# in:
# input_annotation: Final_Bacterial_Package_dumb_down_as_required/outent
# it:
# default: true
# out: [output, xml_output]
# Generate_Annotation_Reports_val_format:
# run: progs/val_format.cwl
# Generate_Annotation_Reports_gbproject:
# run: progs/gbproject.cwl
# Generate_Annotation_Reports_asn2nucleotide_fasta:
# run: progs/asn2fasta.cwl
# Generate_Annotation_Reports_asn2all_protein_fasta:
# run: progs/asn2fasta.cwl
# Generate_Annotation_Reports_asn2protein_fasta:
# run: progs/asn2fasta.cwl
# Generate_Annotation_Reports_asn2flat: