Add testing pipeline (#35)
* Add testing pipeline

* Add env files

* prepare mob-suite db prior to use

* Use full path to mob-suite db

* Use different sample for test data

* update

* Fix provenance

* Update README

* fix provenance

* fix provenance

* fix provenance in README
dfornika authored Jun 20, 2024
1 parent 74b8048 commit 2898065
Showing 20 changed files with 385 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/data/reads_to_simulate.csv
@@ -0,0 +1 @@
GCF024700185.1,.github/data/assemblies/GCF024700185.1.fa
7 changes: 7 additions & 0 deletions .github/environments/art.yml
@@ -0,0 +1,7 @@
name: art
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
name: check-outputs
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - python=3
  - jsonschema=4.20.0
  - pyyaml=6.0.1
114 changes: 114 additions & 0 deletions .github/scripts/check_outputs.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import os
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.
    """
    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception as e:
                # Report the failure and let main() record it and set the exit code
                print(f"Error validating {provenance_file}: {e}")
                return False

    return True

def check_expected_files_exist(output_dir, sample_ids):
    """
    Check that the expected files exist in the output directory.
    :param output_dir: Path to the output directory
    :param sample_ids: List of sample IDs
    :return: True if all expected files exist, False otherwise
    :rtype: bool
    """
    for sample_id in sample_ids:
        expected_files = [
            f"{sample_id}/{sample_id}_fastp.csv",
            f"{sample_id}/{sample_id}_fastp.json",
            f"{sample_id}/{sample_id}_quast.csv",
            f"{sample_id}/{sample_id}_abricate_ncbi.tsv",
            f"{sample_id}/{sample_id}_abricate_plasmidfinder.tsv",
            f"{sample_id}/{sample_id}_resistance_gene_report.tsv",
        ]

        for expected_file in expected_files:
            expected_file_path = os.path.join(output_dir, expected_file)
            if not os.path.exists(expected_file_path):
                print(f"Expected file {expected_file_path} not found")
                return False

    return True


def main(args):

    output_dir = os.path.dirname(args.output)
    os.makedirs(output_dir, exist_ok=True)

    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

    sample_ids = [os.path.basename(provenance_file).split("_")[0] for provenance_file in provenance_files]

    # TODO: Add more tests
    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "all_expected_files_exist",
            "test_passed": check_expected_files_exist(args.pipeline_outdir, sample_ids),
        },
    ]

    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
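For reference, given the two tests defined above, the results file written by this script (artifacts/check_outputs_results.csv when invoked via check_outputs.sh below) would look roughly like the following on a fully passing run; the rows shown here are illustrative, not captured from a real run:

test_name,test_result
provenance_format_valid,PASS
all_expected_files_exist,PASS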
12 changes: 12 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -e -o pipefail

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
11 changes: 11 additions & 0 deletions .github/scripts/create_samplesheet.sh
@@ -0,0 +1,11 @@
#!/bin/bash

echo 'ID,R1,R2,ASSEMBLY' > .github/data/samplesheet.csv

for i in $(ls ${PWD}/.github/data/fastq/*_R1.fastq.gz); do
    ID=$(basename $i _R1.fastq.gz)
    R1=$i
    R2=${PWD}/.github/data/fastq/${ID}_R2.fastq.gz
    ASSEMBLY=${PWD}/.github/data/assemblies/${ID}.fa
    echo "$ID,$R1,$R2,$ASSEMBLY" >> .github/data/samplesheet.csv
done
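As a sketch of the result: for the single simulated sample in this test setup, the generated .github/data/samplesheet.csv would contain a header row plus one line per sample, along the lines of the following (with /path/to/repo standing in for the actual ${PWD}; the paths are illustrative):

ID,R1,R2,ASSEMBLY
GCF024700185.1,/path/to/repo/.github/data/fastq/GCF024700185.1_R1.fastq.gz,/path/to/repo/.github/data/fastq/GCF024700185.1_R2.fastq.gz,/path/to/repo/.github/data/assemblies/GCF024700185.1.fa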
16 changes: 16 additions & 0 deletions .github/scripts/download_assemblies.sh
@@ -0,0 +1,16 @@
#!/bin/bash

mkdir -p .github/data/assemblies

rm -f .github/data/assemblies/GCF_024700185.1.zip
rm -f .github/data/assemblies/GCF024700185.1.fa
rm -f .github/data/assemblies/README.md

curl -o .github/data/assemblies/GCF_024700185.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_024700185.1/download?include_annotation_type=GENOME_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED"

unzip .github/data/assemblies/GCF_024700185.1.zip -d .github/data/assemblies

mv .github/data/assemblies/ncbi_dataset/data/GCF_024700185.1/GCF_024700185.1_ASM2470018v1_genomic.fna .github/data/assemblies/GCF024700185.1.fa

rm -r .github/data/assemblies/ncbi_dataset
rm -f .github/data/assemblies/README.md
27 changes: 27 additions & 0 deletions .github/scripts/download_mob-suite_db.sh
@@ -0,0 +1,27 @@
#!/bin/bash

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

mkdir -p .github/data

rm -rf .github/data/mob-suite-db

pushd .github/data

wget -O data.tar.gz https://zenodo.org/records/10304948/files/data.tar.gz?download=1

tar -xzf data.tar.gz

rm data.tar.gz

mv data mob-suite-db

conda activate plasmid-screen-35d122a137231eda3b8a0039d42f24f6

mash sketch -i mob-suite-db/ncbi_plasmid_full_seqs.fas

makeblastdb -in mob-suite-db/ncbi_plasmid_full_seqs.fas -dbtype nucl

popd
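The mash sketch and makeblastdb steps above pre-build the index files that mob-suite would otherwise try to create at runtime (the "prepare mob-suite db prior to use" step in the commit message). Assuming both tools use their default output naming, the prepared database directory would contain at least the following alongside the FASTA; these filenames are the tools' defaults, not verified from this run:

mob-suite-db/ncbi_plasmid_full_seqs.fas
mob-suite-db/ncbi_plasmid_full_seqs.fas.msh
mob-suite-db/ncbi_plasmid_full_seqs.fas.nhr
mob-suite-db/ncbi_plasmid_full_seqs.fas.nin
mob-suite-db/ncbi_plasmid_full_seqs.fas.nsq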
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
@@ -0,0 +1,22 @@
#!/bin/bash
set -eo pipefail

artifacts_dir="artifacts"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

echo ". /opt/minconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -eo pipefail

artifacts_dir="artifacts"

echo Install Nextflow .. >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
#!/bin/bash

artifacts_dir="artifacts"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
23 changes: 23 additions & 0 deletions .github/scripts/run_pipeline.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -eo pipefail

if [ -n "${GITHUB_ACTIONS}" ]; then
echo "Running in GitHub Actions Environment"
echo "Adjusting nextflow.config"
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
else
echo "Not running in GitHub Actions Environment"
fi

nextflow run main.nf \
-profile conda \
--cache ${HOME}/.conda/envs \
--samplesheet_input .github/data/samplesheet.csv \
--pre_assembled \
--mob_db ${PWD}/.github/data/mob-suite-db \
--collect_outputs \
--collected_outputs_prefix test \
--outdir .github/data/test_output \
-with-report .github/data/test_output/nextflow_report.html \
-with-trace .github/data/test_output/nextflow_trace.tsv
18 changes: 18 additions & 0 deletions .github/scripts/run_tests_locally.sh
@@ -0,0 +1,18 @@
#!/bin/bash

rm -rf .github/data/assemblies/*
rm -rf .github/data/fastq/*
rm -rf .github/data/mob-suite-db
rm -rf .github/data/samplesheet.csv
rm -rf .github/data/test_output

.github/scripts/download_assemblies.sh

.github/scripts/simulate_reads.sh

.github/scripts/download_mob-suite_db.sh

.github/scripts/create_samplesheet.sh

.github/scripts/run_pipeline.sh

35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
#!/bin/bash


source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
    art_illumina \
        --paired \
        --in ${assembly} \
        --fcov 12 \
        --len 150 \
        --mflen 400 \
        --sdev 100 \
        --rndSeed 42 \
        --qShift 0 \
        --qShift2 0 \
        --out .github/data/fastq/${sample_id}_R

    rm -f .github/data/fastq/${sample_id}_R1.aln
    rm -f .github/data/fastq/${sample_id}_R2.aln

    mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
    mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq

    gzip -f .github/data/fastq/${sample_id}_R1.fastq
    gzip -f .github/data/fastq/${sample_id}_R2.fastq

done < .github/data/reads_to_simulate.csv

52 changes: 52 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,52 @@
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
  workflow_dispatch:
name: Tests
jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        nextflow_version: ["21.04.3", "23.10.1"]
    name: Run tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Create Artifacts Directory
        run: mkdir artifacts
      - name: Install Miniconda
        run: bash .github/scripts/install_conda.sh
      - name: Install Nextflow
        env:
          NXF_VER: ${{ matrix.nextflow_version }}
        run: bash .github/scripts/install_nextflow.sh
      - name: Create ART Read-Simulation Environment
        run: bash .github/scripts/create_art_environment.sh
      - name: Download Assemblies
        run: bash .github/scripts/download_assemblies.sh
      - name: Simulate Reads
        run: bash .github/scripts/simulate_reads.sh
      - name: Download mob-suite db
        run: bash .github/scripts/download_mob-suite_db.sh
      - name: Create SampleSheet
        run: bash .github/scripts/create_samplesheet.sh
      - name: Run Pipeline
        run: bash .github/scripts/run_pipeline.sh
      - name: Create Output Checking Environment
        run: bash .github/scripts/create_output_checking_environment.sh
      - name: Check Outputs
        run: bash .github/scripts/check_outputs.sh
      - name: Prepare Artifacts
        if: always()
        run: bash .github/scripts/prepare_artifacts.sh
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: artifacts-BCCDC-PHL-routine-assembly-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
          path: artifacts