From 16696036b7dc836fb667c79517b617daf38bab13 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Wed, 23 Mar 2022 13:16:17 +0100 Subject: [PATCH 1/5] bump version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index c7b1423..884e73e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,7 +62,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] cleanup = true conda.createTimeout = '1 h' -VERSION = '0.9.3' +VERSION = '0.10.0' manifest { name = 'TRON-Bioinformatics/covigator-ngs-pipeline' From 623bc0b24cfe16b1643d5e65fff6afe4bd6ed32f Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Wed, 23 Mar 2022 13:17:07 +0100 Subject: [PATCH 2/5] add bed files with ARTIC primers --- reference/SARS-CoV-2.primer.bed | 209 ++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100755 reference/SARS-CoV-2.primer.bed diff --git a/reference/SARS-CoV-2.primer.bed b/reference/SARS-CoV-2.primer.bed new file mode 100755 index 0000000..bb3f96f --- /dev/null +++ b/reference/SARS-CoV-2.primer.bed @@ -0,0 +1,209 @@ +MN908947.3 25 50 SARS-CoV-2_1_LEFT 1 + AACAAACCAACCAACTTTCGATCTC +MN908947.3 324 344 SARS-CoV-2_2_LEFT 2 + TTTACAGGTTCGCGACGTGC +MN908947.3 408 431 SARS-CoV-2_1_RIGHT 1 - CTTCTACTAAGCCACAAGTGCCA +MN908947.3 644 666 SARS-CoV-2_3_LEFT 1 + GTAATAAAGGAGCTGGTGGCCA +MN908947.3 705 727 SARS-CoV-2_2_RIGHT 2 - ATAAGGATCAGTGCCAAGCTCG +MN908947.3 944 966 SARS-CoV-2_4_LEFT 2 + GTGTATACTGCTGCCGTGAACA +MN908947.3 1017 1044 SARS-CoV-2_3_RIGHT 1 - GCCAATTTAATTTCAAAAGGTGTCTGC +MN908947.3 1245 1266 SARS-CoV-2_5_LEFT 1 + TGAAACTTCATGGCAGACGGG +MN908947.3 1337 1362 SARS-CoV-2_4_RIGHT 2 - ACAACAGCATTTTGGGGTAAGTAAC +MN908947.3 1540 1562 SARS-CoV-2_6_LEFT 2 + CGTGCTAGCGCTAACATAGGTT +MN908947.3 1623 1650 SARS-CoV-2_5_RIGHT 1 - TTGATGTTGACTTTCTCTTTTTGGAGT +MN908947.3 1851 1875 SARS-CoV-2_7_LEFT 1 + ACTGAGTCCTCTTTATGCATTTGC +MN908947.3 1925 1948 SARS-CoV-2_6_RIGHT 2 - 
AACACGCACAGAATTTTGAGCAG +MN908947.3 2154 2180 SARS-CoV-2_8_LEFT 2 + GCTTGAAGAGAAGTTTAAGGAAGGTG +MN908947.3 2228 2250 SARS-CoV-2_7_RIGHT 1 - CCACCGACAATTTCACAAGCAC +MN908947.3 2483 2508 SARS-CoV-2_9_LEFT 1 + TCTTCTTAGAGGGAGAAACACTTCC +MN908947.3 2544 2571 SARS-CoV-2_8_RIGHT 2 - GGTTGTTCTAATGGTTGTAAATCACCA +MN908947.3 2780 2813 SARS-CoV-2_10_LEFT_alt1 2 + TGAATATCACTTTTGAACTTGATGAAAGGATTG +MN908947.3 2826 2850 SARS-CoV-2_10_LEFT 2 + TGAGAAGTGCTCTGCCTATACAGT +MN908947.3 2861 2885 SARS-CoV-2_9_RIGHT 1 - CACAGGCGAACTCATTTACTTCTG +MN908947.3 3078 3102 SARS-CoV-2_11_LEFT 1 + AGAAGAGTTTGAGCCATCAACTCA +MN908947.3 3156 3177 SARS-CoV-2_10_RIGHT_alt1 2 - GGTTGAAGAGCAGCAGAAGTG +MN908947.3 3183 3210 SARS-CoV-2_10_RIGHT 2 - TCATCTAACCAATCTTCTTCTTGCTCT +MN908947.3 3390 3412 SARS-CoV-2_12_LEFT 2 + TGCAGACATTGTGGAAGAAGCT +MN908947.3 3470 3492 SARS-CoV-2_11_RIGHT 1 - TTTAAGGCTCCTGCAACACCTC +MN908947.3 3683 3705 SARS-CoV-2_13_LEFT 1 + AGCACGAAGTTCTACTTGCACC +MN908947.3 3769 3794 SARS-CoV-2_12_RIGHT 2 - CAGCTAAGTAGACATTTGTGCGAAC +MN908947.3 3992 4018 SARS-CoV-2_14_LEFT 2 + TGGAAGAAACTAAGTTCCTCACAGAA +MN908947.3 4067 4093 SARS-CoV-2_13_RIGHT 1 - GATGTCAATGTCACTAACAAGAGTGG +MN908947.3 4312 4339 SARS-CoV-2_15_LEFT 1 + AAAAGTGCCTTTTACATTCTACCATCT +MN908947.3 4387 4409 SARS-CoV-2_14_RIGHT 2 - CATGTGCAAGCATTTCTCGCAA +MN908947.3 4620 4648 SARS-CoV-2_16_LEFT 2 + TGTAACACATGGCTTAAATTTGGAAGAA +MN908947.3 4685 4710 SARS-CoV-2_15_RIGHT 1 - GCATCAGGTGAAGAAACAGAAACTG +MN908947.3 4923 4953 SARS-CoV-2_17_LEFT 1 + TGACAATCTTAAGACACTTCTTTCTTTGAG +MN908947.3 4995 5017 SARS-CoV-2_16_RIGHT 2 - CACAACTTGCGTGTGGAGGTTA +MN908947.3 5230 5259 SARS-CoV-2_18_LEFT 2 + TGGAAATACCCACAAGTTAATGGTTTAAC +MN908947.3 5302 5331 SARS-CoV-2_17_RIGHT 1 - TTCAACTCTATTTGTTGGAGTGTTAACAA +MN908947.3 5561 5584 SARS-CoV-2_19_LEFT 1 + AAGCTGTTATGTACATGGGCACA +MN908947.3 5620 5643 SARS-CoV-2_18_RIGHT 2 - GCTTGTTTACCACACGTACAAGG +MN908947.3 5867 5894 SARS-CoV-2_20_LEFT 2 + ACAAAGAAAACAGTTACACAACAACCA +MN908947.3 5932 5957 
SARS-CoV-2_19_RIGHT 1 - TGTCCAACTTAGGGTCAATTTCTGT +MN908947.3 6184 6210 SARS-CoV-2_21_LEFT 1 + CACTACACACCCTCTTTTAAGAAAGG +MN908947.3 6247 6272 SARS-CoV-2_20_RIGHT 2 - ACGTGGCTTTATTAGTTGCATTGTT +MN908947.3 6478 6507 SARS-CoV-2_22_LEFT 2 + GTAGGAGACATTATACTTAAACCAGCAAA +MN908947.3 6553 6582 SARS-CoV-2_21_RIGHT 1 - GTAAGACTAGAATTGTCTACATAAGCAGC +MN908947.3 6747 6776 SARS-CoV-2_23_LEFT 1 + AAACCGTGTTTGTACTAATTATATGCCTT +MN908947.3 6859 6885 SARS-CoV-2_22_RIGHT 2 - CCGACACTCTTAACAGTATTCTTTGC +MN908947.3 7057 7084 SARS-CoV-2_24_LEFT 2 + GGTTACAGAGAAGGCTATTTGAACTCT +MN908947.3 7122 7148 SARS-CoV-2_23_RIGHT 1 - AACCACTAAGACAAACACTACAAGGT +MN908947.3 7127 7156 SARS-CoV-2_23_RIGHT_alt1 1 - AGAATCTAAACCACTAAGACAAACACTAC +MN908947.3 7381 7403 SARS-CoV-2_25_LEFT 1 + CAAATGGCCCCGATTTCAGCTA +MN908947.3 7440 7467 SARS-CoV-2_24_RIGHT 2 - ACAACATGCACATAACTTTTCCATACA +MN908947.3 7672 7695 SARS-CoV-2_26_LEFT 2 + GCGAGAGACTTGTCACTACAGTT +MN908947.3 7747 7770 SARS-CoV-2_25_RIGHT 1 - TGGATGGAACCATTCTTCACTGT +MN908947.3 7997 8019 SARS-CoV-2_27_LEFT 1 + CTGATGTTGGTGATAGTGCGGA +MN908947.3 8063 8092 SARS-CoV-2_26_RIGHT 2 - GAGTTTTTCCATTGGTACGTTAAAAGTTG +MN908947.3 8304 8326 SARS-CoV-2_28_LEFT 2 + TGAAAACATGACACCCCGTGAC +MN908947.3 8367 8392 SARS-CoV-2_27_RIGHT_alt1 1 - AATGTTGTGACTTTTTGCTACCTGC +MN908947.3 8370 8395 SARS-CoV-2_27_RIGHT 1 - AGCAATGTTGTGACTTTTTGCTACC +MN908947.3 8596 8619 SARS-CoV-2_29_LEFT 1 + CTTGTGTTCCTTTTTGTTGCTGC +MN908947.3 8691 8714 SARS-CoV-2_28_RIGHT 2 - TGACACCACCATCAATAGCCTTG +MN908947.3 8919 8944 SARS-CoV-2_30_LEFT 2 + ACCTAGAGTTTTTAGTGCAGTTGGT +MN908947.3 8990 9013 SARS-CoV-2_29_RIGHT 1 - AGCCAAAACACAAGCTGATGTTG +MN908947.3 9168 9192 SARS-CoV-2_31_LEFT 1 + CCTTGAAGGTTCTGTTAGAGTGGT +MN908947.3 9306 9329 SARS-CoV-2_30_RIGHT 2 - CTACACCACAGAAAACTCCTGGT +MN908947.3 9470 9497 SARS-CoV-2_32_LEFT 2 + GAGCTTTTGGTGAATACAGTCATGTAG +MN908947.3 9535 9564 SARS-CoV-2_31_RIGHT 1 - AATGAGTAAACTGGTGTTAAACAGAGTAC +MN908947.3 9782 9805 SARS-CoV-2_33_LEFT 1 + GTACTTTTGAAGAAGCTGCGCTG 
+MN908947.3 9842 9866 SARS-CoV-2_32_RIGHT 2 - GAGGTAATAGCACATCACTACGCA +MN908947.3 10076 10099 SARS-CoV-2_34_LEFT 2 + TCCCATCTGGTAAAGTTGAGGGT +MN908947.3 10150 10176 SARS-CoV-2_33_RIGHT 1 - TGTCTTGGACAGTAAACTACGTCATC +MN908947.3 10393 10419 SARS-CoV-2_35_LEFT 1 + GTGTTAGCTTGTTACAATGGTTCACC +MN908947.3 10465 10491 SARS-CoV-2_34_RIGHT 2 - CCACATGAACCATTAAGGAATGAACC +MN908947.3 10713 10742 SARS-CoV-2_36_LEFT 2 + CAATCGATTTACCACAACTCTTAATGACT +MN908947.3 10785 10810 SARS-CoV-2_35_RIGHT 1 - AGGTCCTAGTATGTCAACATGGTCT +MN908947.3 11000 11023 SARS-CoV-2_37_LEFT 1 + CACACCACTGGTTGTTACTCACA +MN908947.3 11092 11116 SARS-CoV-2_36_RIGHT 2 - ACCCATAGCAAAAGGTAAAAAGGC +MN908947.3 11305 11330 SARS-CoV-2_38_LEFT 2 + GACTGTGTTATGTATGCATCAGCTG +MN908947.3 11388 11414 SARS-CoV-2_37_RIGHT 1 - GTGTCAAGACATTCATAAGTGTCCAC +MN908947.3 11624 11651 SARS-CoV-2_39_LEFT 1 + GCTATTTTTGTACTTGTTACTTTGGCC +MN908947.3 11689 11720 SARS-CoV-2_38_RIGHT 2 - CCTGTGTAGAAACTAAGTAATCATAAACACC +MN908947.3 11937 11963 SARS-CoV-2_40_LEFT 2 + TGTCCAGTTACACAATGACATTCTCT +MN908947.3 12011 12033 SARS-CoV-2_39_RIGHT 1 - CCCTGCATGGAAAGCAAAACAG +MN908947.3 12234 12255 SARS-CoV-2_41_LEFT 1 + ATTTGACCGTGATGCAGCCAT +MN908947.3 12317 12339 SARS-CoV-2_40_RIGHT 2 - ACTTTTGCCCTCTTGTCCTCAG +MN908947.3 12519 12546 SARS-CoV-2_42_LEFT 2 + TGGTACAACATTTACTTATGCATCAGC +MN908947.3 12618 12643 SARS-CoV-2_41_RIGHT 1 - AAGAGGCCATGCTAAATTAGGTGAA +MN908947.3 12831 12856 SARS-CoV-2_43_LEFT 1 + GGATTTGAAATGGGCTAGATTCCCT +MN908947.3 12895 12920 SARS-CoV-2_42_RIGHT 2 - TGTCTGTAACAAACCTACAAGGTGG +MN908947.3 13124 13148 SARS-CoV-2_44_LEFT 2 + GGGGACAACCAATCACTAATTGTG +MN908947.3 13218 13240 SARS-CoV-2_43_RIGHT 1 - CGATGCACCACCAAAGGATTCT +MN908947.3 13463 13485 SARS-CoV-2_45_LEFT 1 + TAAACGGGTTTGCGGTGTAAGT +MN908947.3 13506 13528 SARS-CoV-2_44_RIGHT 2 - CATCAGTACTAGTGCCTGTGCC +MN908947.3 13752 13775 SARS-CoV-2_46_LEFT 2 + AGAATAGACGGTGACATGGTACC +MN908947.3 13833 13859 SARS-CoV-2_45_RIGHT 1 - TCACAATTACCTTCATCAAAATGCCT +MN908947.3 14045 14075 
SARS-CoV-2_47_LEFT 1 + TGGTGTACTGACATTAGATAATCAAGATCT +MN908947.3 14120 14144 SARS-CoV-2_46_RIGHT 2 - TCTACAACAGGAACTCCACTACCT +MN908947.3 14338 14362 SARS-CoV-2_48_LEFT 2 + ACTGTTTGGATGACAGATGCATTC +MN908947.3 14428 14457 SARS-CoV-2_47_RIGHT 1 - TGGAACACCATCAACAAATATTTTTCTCA +MN908947.3 14647 14674 SARS-CoV-2_49_LEFT 1 + ACAATGTTGCTTTTCAAACTGTCAAAC +MN908947.3 14717 14743 SARS-CoV-2_48_RIGHT 2 - CAGAACTTCCTTCCTTAAAGAAACCC +MN908947.3 14953 14983 SARS-CoV-2_50_LEFT 2 + CATTTAATAAATGGGGTAAGGCTAGACTTT +MN908947.3 15023 15050 SARS-CoV-2_49_RIGHT 1 - GGGATGACATTACGTTTTGTATATGCG +MN908947.3 15214 15237 SARS-CoV-2_51_LEFT 1 + GCAAATTCTATGGTGGTTGGCAC +MN908947.3 15336 15358 SARS-CoV-2_50_RIGHT 2 - GAGCAAGAACAAGTGAGGCCAT +MN908947.3 15535 15557 SARS-CoV-2_52_LEFT 2 + CTGTCACGGCCAATGTTAATGC +MN908947.3 15596 15619 SARS-CoV-2_51_RIGHT 1 - GTCTGTGTTGTAAATTGCGGACA +MN908947.3 15855 15881 SARS-CoV-2_53_LEFT 1 + ACTAAAGGACCTCATGAATTTTGCTC +MN908947.3 15917 15941 SARS-CoV-2_52_RIGHT 2 - GGATCTGGGTAAGGAAGGTACACA +MN908947.3 16112 16137 SARS-CoV-2_54_LEFT 2 + ACATGATGAGTTAACAGGACACATG +MN908947.3 16239 16260 SARS-CoV-2_53_RIGHT 1 - GCAAAGAACACAAGCCCCAAC +MN908947.3 16386 16408 SARS-CoV-2_55_LEFT 1 + AATGCTCCAGGTTGTGATGTCA +MN908947.3 16483 16508 SARS-CoV-2_54_RIGHT 2 - CCAAAAACTTGTCCATTAGCACACA +MN908947.3 16692 16714 SARS-CoV-2_56_LEFT 2 + ACTGTACGTGAAGTGCTGTCTG +MN908947.3 16767 16796 SARS-CoV-2_55_RIGHT 1 - ACACGATAACCAGTAAAGACATAATTTCG +MN908947.3 16986 17013 SARS-CoV-2_57_LEFT 1 + GGCTTATACCCAACACTCAATATCTCA +MN908947.3 17082 17105 SARS-CoV-2_56_RIGHT 2 - TGACTCTTACCAGTACCAGGTGG +MN908947.3 17323 17345 SARS-CoV-2_58_LEFT 2 + TGCCTGAGACGACAGCAGATAT +MN908947.3 17381 17405 SARS-CoV-2_57_RIGHT 1 - CTGGCATTGACAACACTCAAATCA +MN908947.3 17615 17642 SARS-CoV-2_59_LEFT 1 + GCTTAAAGCACATAAAGACAAATCAGC +MN908947.3 17688 17711 SARS-CoV-2_58_RIGHT 2 - TGTGGCCTGTTAATTGCAGATGA +MN908947.3 17911 17939 SARS-CoV-2_60_LEFT 2 + ACAGATTTAATGTTGCTATTACCAGAGC +MN908947.3 17997 18022 
SARS-CoV-2_59_RIGHT 1 - TCCTACGTGGAATTTCAAGACTTGT +MN908947.3 18244 18267 SARS-CoV-2_61_LEFT 1 + ACCCTAACATGTTTATCACCCGC +MN908947.3 18307 18328 SARS-CoV-2_60_RIGHT 2 - TAGCATGACACCCCTCGACAT +MN908947.3 18550 18578 SARS-CoV-2_62_LEFT 2 + GTGACACACTTAAAAATCTCTCTGACAG +MN908947.3 18624 18652 SARS-CoV-2_61_RIGHT 1 - GCTCAGGTCCTATTTTCACAAAATACTT +MN908947.3 18869 18891 SARS-CoV-2_63_LEFT 1 + TAGGTGTCTAGCTGTCCACGAG +MN908947.3 18936 18961 SARS-CoV-2_62_RIGHT 2 - CCGCATTAATCTTCAGTTCATCACC +MN908947.3 19183 19208 SARS-CoV-2_64_LEFT 2 + GCCTATTTTGGAATTGCAATGTCGA +MN908947.3 19252 19277 SARS-CoV-2_63_RIGHT 1 - CCAGGCAAGTTAAGGTTAGATAGCA +MN908947.3 19485 19513 SARS-CoV-2_65_LEFT 1 + GTCTGTAGACATCATGCTAATGAGTACA +MN908947.3 19558 19586 SARS-CoV-2_64_RIGHT 2 - GTATCAAATTGTTTGTAAACCCACAAGC +MN908947.3 19810 19836 SARS-CoV-2_66_LEFT 2 + AACCAGTACCAGAGGTGAAAATACTC +MN908947.3 19877 19901 SARS-CoV-2_65_RIGHT 1 - GCTGGAGCATCTCTTTTGTAGTCC +MN908947.3 20090 20117 SARS-CoV-2_67_LEFT 1 + CAAACAAGCTAGTCTTAATGGAGTCAC +MN908947.3 20186 20216 SARS-CoV-2_66_RIGHT 2 - TTTCTACTCTGAGTAAAGTAAGTTTCAGGT +MN908947.3 20377 20405 SARS-CoV-2_68_LEFT 2 + GACTAGCTAAACGTTTTAAGGAATCACC +MN908947.3 20472 20497 SARS-CoV-2_67_RIGHT 1 - AACACACACACTTAGATGAACCTGT +MN908947.3 20677 20699 SARS-CoV-2_69_LEFT 1 + CGGGTGTTGCTATGCCTAATCT +MN908947.3 20766 20792 SARS-CoV-2_68_RIGHT 2 - GCGACATTCATCATTATGCCTTTAGG +MN908947.3 20988 21013 SARS-CoV-2_70_LEFT 2 + TTGATTGGTGATTGTGCAACTGTAC +MN908947.3 21050 21080 SARS-CoV-2_69_RIGHT 1 - TTTGTAACATTTTTAGTCTTAGGGTCGTAC +MN908947.3 21294 21316 SARS-CoV-2_71_LEFT 1 + GGCAAACCACGCGAACAAATAG +MN908947.3 21358 21387 SARS-CoV-2_70_RIGHT 2 - AGAATAGGAAGACAACTGAATTGGATTTG +MN908947.3 21532 21561 SARS-CoV-2_72_LEFT 2 + GTGATGTTCTTGTTAACAACTAAACGAAC +MN908947.3 21675 21700 SARS-CoV-2_71_RIGHT 1 - TGAGGATCTGAAAACTTTGTCAGGG +MN908947.3 21865 21889 SARS-CoV-2_73_LEFT 1 + AGAGGCTGGATTTTTGGTACTACT +MN908947.3 21904 21933 SARS-CoV-2_72_RIGHT 2 - GTAGCGTTATTAACAATAAGTAGGGACTG +MN908947.3 
22091 22113 SARS-CoV-2_74_LEFT 2 + TGGACCTTGAAGGAAAACAGGG +MN908947.3 22247 22274 SARS-CoV-2_73_RIGHT 1 - ACCTAGTGATGTTAATACCTATTGGCA +MN908947.3 22402 22428 SARS-CoV-2_75_LEFT 1 + GAAAATGGAACCATTACAGATGCTGT +MN908947.3 22474 22503 SARS-CoV-2_74_RIGHT 2 - TGATAGATTCCTTTTTCTACAGTGAAGGA +MN908947.3 22648 22677 SARS-CoV-2_76_LEFT 2 + GCTGATTATTCTGTCCTATATAATTCCGC +MN908947.3 22742 22774 SARS-CoV-2_76_LEFT_alt1 2 + ATGTCTATGCAGATTCATTTGTAATTAGAGGT +MN908947.3 22785 22805 SARS-CoV-2_75_RIGHT 1 - TTTGCCCTGGAGCGATTTGT +MN908947.3 22944 22974 SARS-CoV-2_77_LEFT 1 + CAAACCTTTTGAGAGAGATATTTCAACTGA +MN908947.3 23028 23057 SARS-CoV-2_76_RIGHT 2 - GTTGGAAACCATATGATTGTAAAGGAAAG +MN908947.3 23120 23141 SARS-CoV-2_76_RIGHT_alt1 2 - GTCCACAAACAGTTGCTGGTG +MN908947.3 23219 23246 SARS-CoV-2_78_LEFT 2 + CTGAGTCTAACAAAAAGTTTCTGCCTT +MN908947.3 23327 23351 SARS-CoV-2_77_RIGHT 1 - CACTGACACCACCAAAAGAACATG +MN908947.3 23553 23575 SARS-CoV-2_79_LEFT 1 + ACCCATTGGTGCAGGTATATGC +MN908947.3 23611 23635 SARS-CoV-2_78_RIGHT 2 - GGATTGACTAGCTACACTACGTGC +MN908947.3 23853 23876 SARS-CoV-2_80_LEFT 2 + CCGTGCTTTAACTGGAATAGCTG +MN908947.3 23914 23944 SARS-CoV-2_79_RIGHT_alt1 1 - AATTGGTGGTGTTTTGTAAATTTGTTTGAC +MN908947.3 23927 23955 SARS-CoV-2_79_RIGHT 1 - CCAAAATCTTTAATTGGTGGTGTTTTGT +MN908947.3 24171 24194 SARS-CoV-2_81_LEFT 1 + TGCTCAATACACTTCTGCACTGT +MN908947.3 24233 24258 SARS-CoV-2_80_RIGHT 2 - GCAAATGGTATTTGTAATGCAGCAC +MN908947.3 24426 24448 SARS-CoV-2_82_LEFT 2 + TGCACAAGCTTTAAACACGCTT +MN908947.3 24545 24567 SARS-CoV-2_81_RIGHT 1 - TGAAGTCTGCCTGTGATCAACC +MN908947.3 24750 24772 SARS-CoV-2_83_LEFT 1 + GCATGTGACTTATGTCCCTGCA +MN908947.3 24814 24836 SARS-CoV-2_82_RIGHT 2 - CACGAGGAAAGTGTGCTTTTCC +MN908947.3 25051 25076 SARS-CoV-2_84_LEFT 2 + GTTGATTTAGGTGACATCTCTGGCA +MN908947.3 25122 25150 SARS-CoV-2_83_RIGHT 1 - AGATTCATTTAAATTCTTGGCAACCTCA +MN908947.3 25331 25353 SARS-CoV-2_85_LEFT 1 + ATGAAGACGACTCTGAGCCAGT +MN908947.3 25438 25461 SARS-CoV-2_84_RIGHT 2 - AGCATCCTTGATTTCACCTTGCT 
+MN908947.3 25645 25672 SARS-CoV-2_86_LEFT 2 + TGTTGTTTGTAACAGTTTACTCACACC +MN908947.3 25711 25740 SARS-CoV-2_85_RIGHT 1 - CTGCAAGAAGTAGACTAAAGCATAAAGAT +MN908947.3 25951 25979 SARS-CoV-2_87_LEFT 1 + GTGGTTATACTGAAAAATGGGAATCTGG +MN908947.3 26026 26050 SARS-CoV-2_86_RIGHT 2 - TCAATTGAGTTGAGTACAGCTGGT +MN908947.3 26242 26268 SARS-CoV-2_88_LEFT_alt1 2 + TTATGTACTCATTCGTTTCGGAAGAG +MN908947.3 26255 26277 SARS-CoV-2_88_LEFT 2 + CGTTTCGGAAGAGACAGGTACG +MN908947.3 26338 26360 SARS-CoV-2_87_RIGHT 1 - AATCGAAGCGCAGTAAGGATGG +MN908947.3 26564 26587 SARS-CoV-2_89_LEFT 1 + AAGCTCCTTGAACAATGGAACCT +MN908947.3 26592 26621 SARS-CoV-2_89_LEFT_alt1 1 + TAGGTTTCCTATTCCTTACATGGATTTGT +MN908947.3 26635 26661 SARS-CoV-2_88_RIGHT 2 - ACAAAAACCTATTCCTGTTGGCATAG +MN908947.3 26873 26895 SARS-CoV-2_90_LEFT 2 + ATTCTTCTCAACGTGCCACTCC +MN908947.3 26956 26979 SARS-CoV-2_89_RIGHT 1 - CAGCAATACGAAGATGTCCACGA +MN908947.3 26966 26991 SARS-CoV-2_89_RIGHT_alt1 1 - CTAGATGGTGTCCAGCAATACGAAG +MN908947.3 27152 27177 SARS-CoV-2_91_LEFT 1 + TCCAGTAGCAGTGACAATATTGCTT +MN908947.3 27218 27251 SARS-CoV-2_90_RIGHT_alt1 2 - ATTAGTAATATCTCTGCTATAGTAACCTGAAAG +MN908947.3 27256 27283 SARS-CoV-2_90_RIGHT 2 - TCCAAATGGAAACTTTAAAAGTCCTCA +MN908947.3 27447 27473 SARS-CoV-2_92_LEFT 2 + CACTACCAAGAGTGTGTTAGAGGTAC +MN908947.3 27534 27560 SARS-CoV-2_91_RIGHT 1 - AGTGCAAATTTGTTATCAGCTAGAGG +MN908947.3 27700 27726 SARS-CoV-2_93_LEFT 1 + TTGTTGCGGCAATAGTGTTTATAACA +MN908947.3 27826 27855 SARS-CoV-2_92_RIGHT 2 - GTTCAAGTGAGAACCAAAAGATAATAAGC +MN908947.3 27996 28021 SARS-CoV-2_94_LEFT 2 + ACCCGTGTCCTATTCACTTCTATTC +MN908947.3 28082 28104 SARS-CoV-2_93_RIGHT 1 - TGGGTGATTTAGAACCAGCCTC +MN908947.3 28190 28214 SARS-CoV-2_95_LEFT 1 + GTGCGTTGTTCGTTCTATGAAGAC +MN908947.3 28394 28416 SARS-CoV-2_94_RIGHT 2 - TTATTGGGTAAACCTTGGGGCC +MN908947.3 28512 28536 SARS-CoV-2_96_LEFT 2 + AGATGACCAAATTGGCTACTACCG +MN908947.3 28572 28598 SARS-CoV-2_95_RIGHT 1 - ACCATCTTGGACTGAGATCTTTCATT +MN908947.3 28827 28849 SARS-CoV-2_97_LEFT 1 + 
TTCCTCATCACGTAGTCGCAAC +MN908947.3 28893 28914 SARS-CoV-2_96_RIGHT 2 - CCATTGCCAGCCATTCTAGCA +MN908947.3 29136 29161 SARS-CoV-2_98_LEFT 2 + CCAGGAACTAATCAGACAAGGAACT +MN908947.3 29206 29227 SARS-CoV-2_97_RIGHT 1 - CGACATTCCGAAGAACGCTGA +MN908947.3 29452 29475 SARS-CoV-2_99_LEFT 1 + CTTCTTCCTGCTGCAGATTTGGA +MN908947.3 29512 29534 SARS-CoV-2_98_RIGHT 2 - TTTAGGCCTGAGTTGAGTCAGC +MN908947.3 29827 29854 SARS-CoV-2_99_RIGHT 1 - GCTATTAAAATCACATGGGGATAGCAC From c7733b866f69b6fd3ab5c27a97155aedb9d03936 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Wed, 23 Mar 2022 13:50:44 +0100 Subject: [PATCH 3/5] add primer trimming step --- main.nf | 22 +++++++++++++------- modules/03_bam_preprocessing.nf | 36 ++++++++++++++++++++++++++++++++- nextflow.config | 1 + 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 9dce619..2a8f78a 100755 --- a/main.nf +++ b/main.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl = 2 include { READ_TRIMMING_PAIRED_END; READ_TRIMMING_SINGLE_END } from './modules/01_fastp' include { ALIGNMENT_PAIRED_END; ALIGNMENT_SINGLE_END } from './modules/02_bwa' -include { BAM_PREPROCESSING; COVERAGE_ANALYSIS } from './modules/03_bam_preprocessing' +include { BAM_PREPROCESSING; COVERAGE_ANALYSIS; PRIMER_TRIMMING_IVAR } from './modules/03_bam_preprocessing' include { VARIANT_CALLING_BCFTOOLS; VARIANT_CALLING_LOFREQ ; VARIANT_CALLING_GATK ; VARIANT_CALLING_IVAR ; VARIANT_CALLING_ASSEMBLY; IVAR2VCF } from './modules/04_variant_calling' include { VARIANT_NORMALIZATION ; PHASING } from './modules/05_variant_normalization' @@ -33,6 +33,7 @@ params.gff = false params.snpeff_data = false params.snpeff_config = false params.snpeff_organism = false +params.primers = false params.output = "." 
params.min_mapping_quality = 20 @@ -78,6 +79,7 @@ if (params.reference == false) { snpeff_config = params.sarscov2_snpeff_config snpeff_organism = params.sarscov2_snpeff_organism skip_sarscov2_annotations = params.skip_sarscov2_annotations + primers = file(params.sarscov2_primers) } else { log.info "Using custom reference genome: ${params.reference}" @@ -86,6 +88,7 @@ else { snpeff_data = params.snpeff_data snpeff_config = params.snpeff_config snpeff_organism = params.snpeff_organism + primers = params.primers ? file(params.primers) : false skip_sarscov2_annotations = true } @@ -188,27 +191,32 @@ workflow { bam_files = ALIGNMENT_SINGLE_END.out } BAM_PREPROCESSING(bam_files, reference) - COVERAGE_ANALYSIS(BAM_PREPROCESSING.out.preprocessed_bam) + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bam + if (primers) { + PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) + preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + } + COVERAGE_ANALYSIS(preprocessed_bams) // variant calling vcfs_to_normalize = null if (!params.skip_bcftools) { - VARIANT_CALLING_BCFTOOLS(BAM_PREPROCESSING.out.preprocessed_bam, reference) + VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out) } if (!params.skip_lofreq) { - VARIANT_CALLING_LOFREQ(BAM_PREPROCESSING.out.preprocessed_bam, reference) + VARIANT_CALLING_LOFREQ(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out) } if (!params.skip_gatk) { - VARIANT_CALLING_GATK(BAM_PREPROCESSING.out.preprocessed_bam, reference) + VARIANT_CALLING_GATK(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? 
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out) } if (!params.skip_ivar && gff) { - VARIANT_CALLING_IVAR(BAM_PREPROCESSING.out.preprocessed_bam, reference, gff) + VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff) IVAR2VCF(VARIANT_CALLING_IVAR.out, reference) vcfs_to_normalize = vcfs_to_normalize == null? IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out) @@ -238,7 +246,7 @@ workflow { if (input_fastqs) { // we can only add technical annotations when we have the reads - VAFATOR(normalized_vcfs.combine(BAM_PREPROCESSING.out.preprocessed_bam, by: 0)) + VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0)) VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf) normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated } diff --git a/modules/03_bam_preprocessing.nf b/modules/03_bam_preprocessing.nf index 2fdb649..d7155bd 100644 --- a/modules/03_bam_preprocessing.nf +++ b/modules/03_bam_preprocessing.nf @@ -60,6 +60,40 @@ process BAM_PREPROCESSING { """ } +process PRIMER_TRIMMING_IVAR { + cpus params.cpus + memory params.memory + if (params.keep_intermediate) { + publishDir "${params.output}", mode: "copy" + } + publishDir "${params.output}", mode: "copy", pattern: "${name}.deduplication_metrics.txt" + tag "${name}" + + conda (params.enable_conda ? 
"bioconda::gatk4=4.2.0.0 bioconda::ivar=1.3.1" : null) + + input: + tuple val(name), file(bam), file(bai) + file(primers) + + output: + tuple val(name), file("${bam.baseName}.trimmed.sorted.bam"), file("${bam.baseName}.trimmed.sorted.bai"), emit: trimmed_bam + + """ + ivar trim \ + -i ${bam} \ + -b ${primers} \ + -p ${bam.baseName}.trimmed + + gatk SortSam \ + --java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \ + --INPUT ${bam.baseName}.trimmed.bam \ + --OUTPUT ${bam.baseName}.trimmed.sorted.bam \ + --SORT_ORDER coordinate + + gatk BuildBamIndex --INPUT ${bam.baseName}.trimmed.sorted.bam + """ +} + process COVERAGE_ANALYSIS { cpus params.cpus memory params.memory @@ -81,4 +115,4 @@ process COVERAGE_ANALYSIS { samtools coverage ${bam} > ${name}.coverage.tsv samtools depth -s -d 0 -H ${bam} > ${name}.depth.tsv """ -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index 884e73e..97bfdf5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,6 +10,7 @@ params.sarscov2_gff = "$baseDir/reference/Sars_cov_2.ASM985889v3.101.gff3" params.sarscov2_snpeff_data = "$baseDir/reference/snpeff/" params.sarscov2_snpeff_config = "$baseDir/reference/snpeff/snpEff.config" params.sarscov2_snpeff_organism = "Sars_cov_2.ASM985889v3.101" +params.sarscov2_primers = "$baseDir/reference/SARS-CoV-2.primer.bed" // problematic sites params.problematic_sites = "$baseDir/reference/problematic_sites_sarsCov2.vcf.gz" From 2086ece389b1d2592ff2fd152d4534f0b0b59bf5 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Wed, 30 Mar 2022 19:49:36 +0200 Subject: [PATCH 4/5] make the primer trimming an optional step when a BED is provided --- README.md | 25 +++++++++++++++---- main.nf | 10 +++++--- nextflow.config | 14 ++++++----- .../SARS-CoV-2.primer.bed | 0 tests/test_11.sh | 25 +++++++++++++++++++ 5 files changed, 59 insertions(+), 15 deletions(-) rename {reference => test_data}/SARS-CoV-2.primer.bed (100%) create mode 100644 tests/test_11.sh diff --git 
a/README.md b/README.md index 05f5199..6e84e80 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ When FASTQ files are provided the pipeline includes the following steps: - **Trimming**. `fastp` is used to trim reads with default values. This step also includes QC filtering. - **Alignment**. `BWA mem` is used for the alignment of single or paired end samples. - **BAM preprocessing**. BAM files are prepared and duplicate reads are marked using GATK and Picard tools. +- **Primer trimming**. When a BED with primers is provided, these are trimmed from the reads using iVar. This is applicable to the results from all variant callers. - **Coverage analysis**. `samtools coverage` and `samtools depth` are used to compute the horizontal and vertical coverage respectively. - **Variant calling**. Four different variant callers are employed: BCFtools, LoFreq, iVar and GATK. @@ -110,6 +111,17 @@ or GATK's ReadBackedPhasing. Unfortunately these tools do not support the scenar undefined number of subclones. For this reason, phasing is implemented with custom Python code at `bin/phasing.py`. +## Primer trimming + +With some library preparation protocols such as ARTIC it is recommended to trim the primers from the reads. +We have observed that if primers are not trimmed, spurious mutations are called, especially SNVs with lower frequencies and long deletions. +Also, the variant allele frequencies of clonal mutations are underestimated. + +The BED files containing the primers for each ARTIC version can be found at https://github.com/artic-network/artic-ncov2019/tree/master/primer_schemes/nCoV-2019. + +If the appropriate BED file is provided to the CoVigator pipeline with `--primers`, the primers will be trimmed with iVar. +This affects the output of every variant caller, not only iVar. + ## Reference data The default SARS-CoV-2 reference files correspond to Sars_cov_2.ASM985889v3 and were downloaded from Ensembl servers. 
@@ -230,10 +242,16 @@ Input: * --library: required only when using --input_fastqs * --input_fastas_list: alternative to --name and --fasta for batch processing +Optional input only required to use a custom reference: + * --reference: the reference genome FASTA file, *.fai, *.dict and bwa indexes are required. + * --gff: the GFFv3 gene annotations file (required to run iVar and to phase mutations from all variant callers) + * --snpeff_data: path to the SnpEff data folder, it will be useful to use the pipeline on other virus than SARS-CoV-2 + * --snpeff_config: path to the SnpEff config file, it will be useful to use the pipeline on other virus than SARS-CoV-2 + * --snpeff_organism: organism to annotate with SnpEff, it will be useful to use the pipeline on other virus than SARS-CoV-2 + Optional input: * --fastq2: the second input FASTQ file - * --reference: the reference genome FASTA file, *.fai, *.dict and bwa indexes are required. - * --gff: the GFFv3 gene annotations file (only required to run iVar) + * --primers: a BED file containing the primers used during library preparation. If provided primers are trimmed from the reads. Only applicable to FASTQs. 
* --min_base_quality: minimum base call quality to take a base into account for variant calling (default: 20) * --min_mapping_quality: minimum mapping quality to take a read into account for variant calling (default: 20) * --vafator_min_base_quality: minimum base call quality to take a base into account for VAF annotation (default: 0) @@ -251,9 +269,6 @@ Optional input: * --open_gap_score: global alignment open gap score, only applicable for assemblies (default: -3) * --extend_gap_score: global alignment extend gap score, only applicable for assemblies (default: -0.1) * --skip_sarscov2_annotations: skip some of the SARS-CoV-2 specific annotations (default: false) - * --snpeff_data: path to the SnpEff data folder, it will be useful to use the pipeline on other virus than SARS-CoV-2 - * --snpeff_config: path to the SnpEff config file, it will be useful to use the pipeline on other virus than SARS-CoV-2 - * --snpeff_organism: organism to annotate with SnpEff, it will be useful to use the pipeline on other virus than SARS-CoV-2 * --keep_intermediate: keep intermediate files (ie: BAM files and intermediate VCF files) Output: diff --git a/main.nf b/main.nf index 2a8f78a..50771eb 100755 --- a/main.nf +++ b/main.nf @@ -79,7 +79,6 @@ if (params.reference == false) { snpeff_config = params.sarscov2_snpeff_config snpeff_organism = params.sarscov2_snpeff_organism skip_sarscov2_annotations = params.skip_sarscov2_annotations - primers = file(params.sarscov2_primers) } else { log.info "Using custom reference genome: ${params.reference}" @@ -88,10 +87,11 @@ else { snpeff_data = params.snpeff_data snpeff_config = params.snpeff_config snpeff_organism = params.snpeff_organism - primers = params.primers ? file(params.primers) : false skip_sarscov2_annotations = true } +primers = params.primers ? file(params.primers) : false + skip_snpeff = false if (! snpeff_data || ! snpeff_config || ! 
snpeff_organism) { log.info "Skipping SnpEff annotation as either --snpeff_data, --snpeff_config or --snpeff_organism was not provided" @@ -252,8 +252,10 @@ workflow { } // NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly - PHASING(normalized_vcfs, reference, gff) - normalized_vcfs = PHASING.out + if (gff) { + PHASING(normalized_vcfs, reference, gff) + normalized_vcfs = PHASING.out + } if (! skip_snpeff) { // only when configured we run SnpEff diff --git a/nextflow.config b/nextflow.config index 97bfdf5..f50a502 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,6 @@ params.sarscov2_gff = "$baseDir/reference/Sars_cov_2.ASM985889v3.101.gff3" params.sarscov2_snpeff_data = "$baseDir/reference/snpeff/" params.sarscov2_snpeff_config = "$baseDir/reference/snpeff/snpEff.config" params.sarscov2_snpeff_organism = "Sars_cov_2.ASM985889v3.101" -params.sarscov2_primers = "$baseDir/reference/SARS-CoV-2.primer.bed" // problematic sites params.problematic_sites = "$baseDir/reference/problematic_sites_sarsCov2.vcf.gz" @@ -90,10 +89,16 @@ Input: * --library: required only when using --input_fastqs * --input_fastas_list: alternative to --name and --fasta for batch processing +Optional input only required to use a custom reference: + * --reference: the reference genome FASTA file, *.fai, *.dict and bwa indexes are required. 
+ * --gff: the GFFv3 gene annotations file (required to run iVar and to phase mutations from all variant callers) + * --snpeff_data: path to the SnpEff data folder, it will be useful to use the pipeline on other virus than SARS-CoV-2 + * --snpeff_config: path to the SnpEff config file, it will be useful to use the pipeline on other virus than SARS-CoV-2 + * --snpeff_organism: organism to annotate with SnpEff, it will be useful to use the pipeline on other virus than SARS-CoV-2 + Optional input: * --fastq2: the second input FASTQ file - * --reference: the reference genome FASTA file, *.fai, *.dict and bwa indexes are required. - * --gff: the GFFv3 gene annotations file (only required to run iVar) + * --primers: a BED file containing the primers used during library preparation. If provided primers are trimmed from the reads. Only applicable to FASTQs. * --min_base_quality: minimum base call quality to take a base into account for variant calling (default: 20) * --min_mapping_quality: minimum mapping quality to take a read into account for variant calling (default: 20) * --vafator_min_base_quality: minimum base call quality to take a base into account for VAF annotation (default: 0) @@ -111,9 +116,6 @@ Optional input: * --open_gap_score: global alignment open gap score, only applicable for assemblies (default: -3) * --extend_gap_score: global alignment extend gap score, only applicable for assemblies (default: -0.1) * --skip_sarscov2_annotations: skip some of the SARS-CoV-2 specific annotations (default: false) - * --snpeff_data: path to the SnpEff data folder, it will be useful to use the pipeline on other virus than SARS-CoV-2 - * --snpeff_config: path to the SnpEff config file, it will be useful to use the pipeline on other virus than SARS-CoV-2 - * --snpeff_organism: organism to annotate with SnpEff, it will be useful to use the pipeline on other virus than SARS-CoV-2 * --keep_intermediate: keep intermediate files (ie: BAM files and intermediate VCF files) Output: 
diff --git a/reference/SARS-CoV-2.primer.bed b/test_data/SARS-CoV-2.primer.bed similarity index 100% rename from reference/SARS-CoV-2.primer.bed rename to test_data/SARS-CoV-2.primer.bed diff --git a/tests/test_11.sh b/tests/test_11.sh new file mode 100644 index 0000000..83ddb87 --- /dev/null +++ b/tests/test_11.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +################################################################################## +# primer trimming +################################################################################## +echo "Running CoVigator pipeline test 11" +source bin/assert.sh +output=output/test11 +echo -e "test_data\t"`pwd`"/test_data/test_data_1.fastq.gz\n" > test_data/test_input.txt +nextflow main.nf -profile test,conda --input_fastqs_list test_data/test_input.txt \ +--library single --output $output \ +--primers test_data/SARS-CoV-2.primer.bed + +test -s $output/test_data.lofreq.vcf.gz || { echo "Missing VCF output file!"; exit 1; } +test -s $output/test_data.fastp_stats.json || { echo "Missing fastp stats JSON output file!"; exit 1; } +test -s $output/test_data.fastp_stats.html || { echo "Missing fastp stats HTML output file!"; exit 1; } +test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } +test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } +test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } +test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } + +assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 3 "Wrong number of variants" +assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | grep PASS | wc -l` 0 "Wrong number of variants" + +assert_eq `cat $output/test_data.lofreq.pangolin.csv | wc -l` 2 "Wrong number of pangolin results" From 7e0d9820870286e114ab79f673572f8fed280019 Mon Sep 17 00:00:00 2001 From: Pablo Riesgo Ferreiro Date: Fri, 1 Apr 2022 09:48:36 +0200 
Subject: [PATCH 5/5] include tests 10 and 11 into automated tests --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index e4ff1c9..ed257d2 100644 --- a/Makefile +++ b/Makefile @@ -18,3 +18,5 @@ test: bash tests/test_07.sh bash tests/test_08.sh bash tests/test_09.sh + bash tests/test_10.sh + bash tests/test_11.sh