diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100755 index 0000000..cece8fa --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,18 @@ +version: 2 + +jobs: + build: + machine: true + steps: + - checkout + - run: cd ~ ; wget -qO- get.nextflow.io | bash ; chmod 755 nextflow ; sudo ln -s ~/nextflow /usr/local/bin/ ; sudo apt-get install graphviz + - run: cd ~ && git clone https://github.com/iarcbioinfo/data_test.git + - run: echo " docker.runOptions = '-u $(id -u):$(id -g)' " > ~/.nextflow/config + - run: cd ~/project/ ; docker build -t iarcbioinfo/bqsr-nf . + - run: cd ; nextflow run ~/project/BQSR.nf --help + - run: cd ; nextflow run ~/project/BQSR.nf -with-docker iarcbioinfo/bqsr-nf --input_folder ~/data_test/BAM/ --output_folder BAM_bqsr --cpu 2 --mem 4 --snp_vcf ~/data_test/REF/dbsnp_138.17_7572000-7591000.vcf.gz --indel_vcf ~/data_test/REF/1000G_phase1.indels.17_7572000-7591000.sites.vcf.gz --ref ~/data_test/REF/17_7572000-7591000.fasta -with-dag dag_bqsr.png + - run: cd ; nextflow run ~/project/BQSR.nf -with-docker iarcbioinfo/bqsr-nf --input_folder ~/data_test/BAM/ --output_folder BAM_bqsr --cpu 2 --mem 4 --snp_vcf ~/data_test/REF/dbsnp_138.17_7572000-7591000.vcf.gz --indel_vcf ~/data_test/REF/1000G_phase1.indels.17_7572000-7591000.sites.vcf.gz --ref ~/data_test/REF/17_7572000-7591000.fasta -with-dag dag_STAR_bqsr.html + - run: cd ; cp ~/dag* ~/project/. + - deploy: + branch: [master, dev] + command: chmod +x deploy.sh && ./deploy.sh diff --git a/BQSR.nf b/BQSR.nf index e5c4533..bcc47b5 100755 --- a/BQSR.nf +++ b/BQSR.nf @@ -1,25 +1,20 @@ #! /usr/bin/env nextflow -// usage : ./BQSR.nf --input_folder input/ --cpu 8 --mem 32 --fasta_ref hg19.fasta -/* -vim: syntax=groovy --*- mode: groovy;-*- */ - // requirement: -// - samtools +// - gatk4 // - samblaster // - sambamba //default values params.help = null params.input_folder = '.' -params.fasta_ref = 'hg19.fasta' -params.cpu = 8 +params.ref = 'hg19.fasta' +params.cpu = 2 params.mem = 32 -params.mem_sambamba = 1 -params.out_folder = "." -params.intervals = "" -params.GATK_bundle = "bundle" -params.GATK_folder = "." +params.output_folder = "." +params.snp_vcf = "dbsnp.vcf" +params.indel_vcf = "Mills_1000G_indels.vcf" +params.multiqc_config = 'NO_FILE' + if (params.help) { log.info '' @@ -28,34 +23,50 @@ if (params.help) { log.info '-------------------------------------------------------------' log.info '' log.info 'Usage: ' - log.info 'nextflow run BQSR.nf --input_folder input/ --fasta_ref hg19.fasta [--cpu 8] [--mem 32] [--out_folder output/]' + log.info 'nextflow run BQSR.nf --input_folder input/ --ref hg19.fasta [--cpu 8] [--mem 32] [--output_folder output/]' log.info '' log.info 'Mandatory arguments:' - log.info ' --input_folder FOLDER Folder containing BAM or fastq files to be aligned.' - log.info ' --fasta_ref FILE Reference fasta file (with index).' + log.info ' --input_folder FOLDER Folder containing BAM or fastq files to be aligned.' + log.info ' --ref FILE Reference fasta file (with index).' log.info 'Optional arguments:' - log.info ' --cpu INTEGER Number of cpu used by bwa mem and sambamba (default: 8).' - log.info ' --mem INTEGER Size of memory used by sambamba (in GB) (default: 32).' - log.info ' --out_folder STRING Output folder (default: results_alignment).' + log.info ' --cpu INTEGER Number of cpu used by bwa mem and sambamba (default: 8).' + log.info ' --mem INTEGER Size of memory used by sambamba (in GB) (default: 32).' + log.info ' --snp_vcf STRING path to SNP VCF from GATK bundle (default : dbsnp.vcf)' + log.info ' --indel_vcf STRING path to indel VCF from GATK bundle (default : Mills_1000G_indels.vcf)' + log.info ' --output_folder STRING Output folder (default: results_alignment).' + log.info ' --multiqc_config STRING config yaml file for multiqc (default : none)' log.info '' - exit 1 + log.info '' + exit 0 } //read files -fasta_ref = file(params.fasta_ref) -fasta_ref_fai = file( params.fasta_ref+'.fai' ) -fasta_ref_sa = file( params.fasta_ref+'.sa' ) -fasta_ref_bwt = file( params.fasta_ref+'.bwt' ) -fasta_ref_ann = file( params.fasta_ref+'.ann' ) -fasta_ref_amb = file( params.fasta_ref+'.amb' ) -fasta_ref_pac = file( params.fasta_ref+'.pac' ) -fasta_ref_alt = file( params.fasta_ref+'.alt' ) +ref = file(params.ref) +ref_fai = file( params.ref+'.fai' ) +ref_dict= file( params.ref.replaceFirst(/fasta/, "").replaceFirst(/fa/, "") +'dict') +//ref_sa = file( params.ref+'.sa' ) +//ref_bwt = file( params.ref+'.bwt' ) +//ref_ann = file( params.ref+'.ann' ) +//ref_amb = file( params.ref+'.amb' ) +//ref_pac = file( params.ref+'.pac' ) +//ref_alt = file( params.ref+'.alt' ) + +//get know site VCFs from GATK bundle +known_snps = file( params.snp_vcf ) +known_snps_index = file( params.snp_vcf+'.tbi' ) +known_indels = file( params.indel_vcf ) +known_indels_index = file( params.indel_vcf+'.tbi' ) + +//multiqc config file +ch_config_for_multiqc = file(params.multiqc_config) if (file(params.input_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0){ - println "BAM files found, proceed with realignment"; mode ='bam' - bam_files = Channel.fromPath( params.input_folder+'/*.bam' - bai_files = Channel.fromPath( params.input_folder+'/*.bai' - ) + println "BAM files found, proceed with realignment"; + //bam_files = Channel.fromPath( params.input_folder+'/*.bam'); + //bai_files = Channel.fromPath( params.input_folder+'/*.bai') + bam_bai_files = Channel.fromFilePairs("${params.input_folder}/*{.bam,.bai}") + .map { row -> tuple(row[1][0], row[1][1]) } + }else{ println "ERROR: input folder contains no fastq nor BAM files"; System.exit(0) } @@ -65,32 +76,78 @@ process base_quality_score_recalibration { cpus params.cpu memory params.mem+'G' tag { file_tag } - + + publishDir "$params.output_folder/BAM/", mode: 'copy', pattern: "*bam*" + publishDir "$params.output_folder/QC/BAM/BQSR/", mode: 'copy', + saveAs: {filename -> + if (filename.indexOf("table") > 0) "$filename" + else if (filename.indexOf("plots") > 0) "$filename" + else null + } + input: - file("${file_tag}.bam") from bam_files - file("${file_tag}.bam.bai") from bai_files + set file(bam), file(bai) from bam_bai_files + file known_snps + file known_snps_index + file known_indels + file known_indels_index + file ref + file ref_fai + file ref_dict + output: - file("${file_tag_new}_recal.table") into recal_table_files - file("${file_tag_new}_post_recal.table") into recal_table_post_files - file("${file_tag_new}_recalibration_plots.pdf") into recal_plots_files - set val(file_tag_new), file("${file_tag_new}.bam") into recal_bam_files - file("${file_tag_new}.bam.bai") into recal_bai_files - publishDir params.out_folder, mode: 'move' + file("*_recal.table") into recal_table_files + file("*plots.pdf") into recal_plots_files + set val(file_tag_new), file("${file_tag_new}.bam"), file("${file_tag_new}.bam.bai") into final_bam_bai_files shell: - file_tag=infile.baseName - file_tag_new=file_tag+'_BQSR' + file_tag=bam.baseName + file_tag_new=file_tag+'_BQSRecalibrated' ''' - indelsvcf=(`ls !{params.GATK_bundle}/*indels*.vcf* | grep -v ".tbi" | grep -v ".idx"`) - dbsnpvcfs=(`ls !{params.GATK_bundle}/*dbsnp*.vcf* | grep -v ".tbi" | grep -v ".idx"`) - dbsnpvcf=${dbsnpvcfs[@]:(-1)} - knownSitescom='' - for ll in $indelsvcf; do knownSitescom=$knownSitescom' -knownSites '$ll; done - knownSitescom=$knownSitescom' -knownSites '$dbsnpvcf - java -jar !{params.GATK_folder}/GenomeAnalysisTK.jar -T BaseRecalibrator -nct !{params.cpu} -R !{params.fasta_ref} -I !{file_tag}.bam $knownSitescom -L !{params.intervals} -o !{file_tag_new}_recal.table - java -jar !{params.GATK_folder}/GenomeAnalysisTK.jar -T BaseRecalibrator -nct !{params.cpu} -R !{params.fasta_ref} -I !{file_tag}.bam $knownSitescom -BQSR !{file_tag_new}_recal.table -L !{params.intervals} -o !{file_tag_new}_post_recal.table - java -jar !{params.GATK_folder}/GenomeAnalysisTK.jar -T AnalyzeCovariates -R !{params.fasta_ref} -before !{file_tag_new}_recal.table -after !{file_tag_new}_post_recal.table -plots !{file_tag_new}_recalibration_plots.pdf - java -jar !{params.GATK_folder}/GenomeAnalysisTK.jar -T PrintReads -nct !{params.cpu} -R !{params.fasta_ref} -I !{file_tag}.bam -BQSR !{file_tag_new}_recal.table -L !{params.intervals} -o !{file_tag_new}.bam + gatk BaseRecalibrator --java-options "-Xmx!{params.mem}G" -R !{ref} -I !{file_tag}.bam --known-sites !{known_snps} --known-sites !{known_indels} -O !{file_tag}_recal.table + gatk ApplyBQSR --java-options "-Xmx!{params.mem}G" -R !{ref} -I !{file_tag}.bam --bqsr-recal-file !{file_tag}_recal.table -O !{file_tag_new}.bam + gatk BaseRecalibrator --java-options "-Xmx!{params.mem}G" -R !{ref} -I !{file_tag_new}.bam --known-sites !{known_snps} --known-sites !{known_indels} -O !{file_tag_new}_recal.table + gatk AnalyzeCovariates --java-options "-Xmx!{params.mem}G" -before !{file_tag}_recal.table -after !{file_tag_new}_recal.table -plots !{file_tag_new}_recalibration_plots.pdf mv !{file_tag_new}.bai !{file_tag_new}.bam.bai ''' -} \ No newline at end of file +} + +process multiqc_final { + cpus 2 + memory '1G' + + publishDir "${params.output_folder}/QC/BAM/", mode: 'copy' + + input: + file BQSR_results from recal_table_files.collect() + file BQSR_results_plots from recal_plots_files.collect() + file multiqc_config from ch_config_for_multiqc + + output: + file("*report.html") into final_output + file("multiqc_BQSR_report_data/") into final_output_data + + shell: + if( multiqc_config.name=='NO_FILE' ){ + opt = "" + }else{ + opt = "--config ${multiqc_config}" + } + ''' + multiqc . -n multiqc_BQSR_report.html -m gatk !{opt} --comment "GATK base quality score recalibration QC report" + ''' +} + +// Display completion message +workflow.onComplete { + log.info "N E X T F L O W ~ version ${workflow.nextflow.version} ${workflow.nextflow.build}" + //log.info "iarcbioinfo/BQSR-nf ~ " + this.grabRevision() + (workflow.commitId ? " [${workflow.commitId}]" : "") + log.info "Completed at: " + workflow.complete + log.info "Duration : " + workflow.duration + log.info "Success : " + workflow.success + log.info "Exit status : " + workflow.exitStatus + log.info "Error report: " + (workflow.errorReport ?: '-') +} + + + diff --git a/Dockerfile b/Dockerfile new file mode 100755 index 0000000..87ad168 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +################## BASE IMAGE ##################### +FROM nfcore/base + + +################## METADATA ####################### + +LABEL base_image="nfcore/base" +LABEL version="1.0" +LABEL software="bqsr-nf" +LABEL software.version="2.0" +LABEL about.summary="Container image containing all requirements for bqsr-nf" +LABEL about.home="http://github.com/IARCbioinfo/BQSR-nf" +LABEL about.documentation="http://github.com/IARCbioinfo/BQSR-nf/README.md" +LABEL about.license_file="http://github.com/IARCbioinfo/BQSR-nf/LICENSE.txt" +LABEL about.license="GNU-3.0" + +################## MAINTAINER ###################### +MAINTAINER **nalcala** <**alcalan@fellows.iarc.fr**> + +################## INSTALLATION ###################### +COPY environment.yml / +RUN conda env create -n bqsr-nf -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/bqsr-nf/bin:$PATH diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index e97004e..5aed689 --- a/README.md +++ b/README.md @@ -1,2 +1,79 @@ # BQSR-nf Nextflow script for base quality score recalibration of bam files using GATK + +## Nextflow pipeline for base quality score recalibration with GATK processing +[![CircleCI](https://circleci.com/gh/IARCbioinfo/BQSR-nf/tree/master.svg?style=svg)](https://circleci.com/gh/IARCbioinfo/BQSR-nf/tree/master) +[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/iarcbioinfo/bqsr-nf/) + +## Decription + +Nextflow pipeline for base quality score recalibration and quality control + +## Dependencies + +1. Nextflow: for common installation procedures see the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository. + +2. [*multiQC*](http://multiqc.info/docs/) +3. [*GATK4*](https://software.broadinstitute.org/gatk/guide/quickstart) must be in the PATH variable +4. [GATK bundle](https://software.broadinstitute.org/gatk/download/bundle) VCF files with lists of indels and SNVs (recommended: 1000 genomes indels, dbsnp VCF) + +You can provide a config file to customize the multiqc report (see https://multiqc.info/docs/#configuring-multiqc). + +## Input + | Type | Description | + |-----------|---------------| + |--input_folder | a folder with bam files | + + +## Parameters + +* #### Mandatory +| Name | Example value | Description | +|-----------|--------------:|-------------| +|--ref | ref.fa | reference genome fasta file for GATK | + +* #### Optional + +| Name | Default value | Description | +|-----------|--------------|-------------| +|--cpu | 2 | number of CPUs | +|--mem | 32 | memory for mapping| +|--output_folder | . | output folder for aligned BAMs| +|--snp_vcf | dbsnp.vcf | VCF file with known variants for GATK BQSR | +|--indel_vcf | Mills_100G_indels.vcf | VCF file with known indels for GATK BQSR | +|--multiqc_config | null | config yaml file for multiqc | + +* #### Flags + +| Name | Description | +|-----------|-------------| +|--help | print usage and optional parameters | + +## Usage +To run the pipeline on a series of bam files in folder *bam*, a reference genome with indexes at *ref.fa*, and known snps and indels from the gatk bundle, one can type: +```bash +nextflow run iarcbioinfo/BQSR-nf --input_folder bam --ref ref.fa --snp_vcf GATK_bundle/dbsnp_146.hg38.vcf.gz --indel_vcf GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz +``` + +## Output + | Type | Description | + |-----------|---------------| + | BAM/file.bam | BAM files of alignments or realignments | + | BAM/file.bam.bai | BAI files of alignments or realignments | + | QC/multiqc_BQSR_report.html | multiqc report | + | QC/multiqc_BQSR_report_data | folder with data used to compute multiqc report | + | QC/BAM/BQSR/file_recal.table | table of scores before recalibration | + | QC/BAM/BQSR/file_post_recal.table | table of scores after recalibration | + | QC/BAM/BQSR/file_recalibration_plots.pdf | before/after recalibration plots | + +The output_folder directory contains two subfolders: BAM and QC + +## Directed Acyclic Graph + +[![DAG BQSR](dag_BQSR.png)](http://htmlpreview.github.io/?https://github.com/IARCbioinfo/BQSR-nf/blob/dev/dag_BQSR.html) + +## Contributions + + | Name | Email | Description | + |-----------|---------------|-----------------| + | Nicolas Alcala* | AlcalaN@fellows.iarc.fr | Developer to contact for support | diff --git a/dag_STAR_bqsr.html b/dag_STAR_bqsr.html new file mode 100644 index 0000000..6931c4d --- /dev/null +++ b/dag_STAR_bqsr.html @@ -0,0 +1,158 @@ + + + + + + Nextflow Cytoscape.js with Dagre + + + + + + + + + + + +

Nextflow Cytoscape.js with Dagre

+
+ + + diff --git a/dag_bqsr.png b/dag_bqsr.png new file mode 100644 index 0000000..1bb3942 Binary files /dev/null and b/dag_bqsr.png differ diff --git a/deploy.sh b/deploy.sh new file mode 100755 index 0000000..3643218 --- /dev/null +++ b/deploy.sh @@ -0,0 +1,14 @@ +#!/bin/bash +cd ~/project/ +commitID=`git log -n 1 --pretty="%h" -- environment.yml` +sed -i '/^# environment.yml/d' Singularity && echo -e "# environment.yml commit ID: $commitID\n" >> Singularity +git config --global user.email "alcalan@fellows.iarc.fr" +git config --global user.name "Circle CI_$CIRCLE_PROJECT_REPONAME_$CIRCLE_BRANCH" +git add . +git status +git commit -m "Generated DAG [skip ci]" +git push origin $CIRCLE_BRANCH + +curl -H "Content-Type: application/json" --data "{\"source_type\": \"Branch\", \"source_name\": \"$CIRCLE_BRANCH\"}" -X POST https://cloud.docker.com/api/build/v1/source/d3356607-1420-43f3-8955-8f9212639ba4/trigger/5f8ed292-0607-4279-88e7-e8d0c65df8e0/call/ + + diff --git a/environment.yml b/environment.yml new file mode 100755 index 0000000..7e8350a --- /dev/null +++ b/environment.yml @@ -0,0 +1,17 @@ +name: bqsr +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - fontconfig > 2 + - font-ttf-dejavu-sans-mono > 2 + - r-base + - r-essentials + - multiqc=1.7 + - gatk4=4.0.5.1 + - r-ggplot2=3.1.1 + - r-gplots=3.0.1 + - r-gsalib=2.1.0 + - r-essentials + - r-reshape=0.8.8 diff --git a/nextflow.config b/nextflow.config old mode 100644 new mode 100755 index 29c01e5..c9817b2 --- a/nextflow.config +++ b/nextflow.config @@ -3,3 +3,48 @@ manifest { description = 'Perform Base Quality Score Recalibration of BAM files' mainScript = 'BQSR.nf' } + +profiles { + conda { + process { + conda = "$baseDir/environment.yml" + } + conda.createTimeout = "200 min" + } + docker { + docker.enabled = true + process.container = 'iarcbioinfo/bqsr-nf' + } + singularity { + singularity.enabled = true + process.container = 'iarcbioinfo/bqsr-nf' + } +} + +process { + process.container = 'iarcbioinfo/bqsr-nf:dev' + shell = ['/bin/bash','-o','pipefail'] +} + +params.output_folder="." + + +timeline { + enabled = true + file = "${params.output_folder}/nf-pipeline_info/bqsr-nf_timeline.html" +} + +report { + enabled = true + file = "${params.output_folder}/nf-pipeline_info/bqsr-nf_report.html" +} + +trace { + enabled = true + file = "${params.output_folder}/nf-pipeline_info/bqsr-nf_trace.txt" +} + +dag { + enabled = true + file = "${params.output_folder}/nf-pipeline_info/bqsr-nf_dag.html" +}