Add testing pipeline (#35)
* Add testing pipeline

* Add env files

* prepare mob-suite db prior to use

* Use full path to mob-suite db

* Use different sample for test data

* update

* Fix provenance

* Update README

* fix provenance

* fix provenance

* fix provenance in README
dfornika authored Jun 20, 2024
1 parent 74b8048 commit 2898065
Showing 20 changed files with 385 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/data/reads_to_simulate.csv
@@ -0,0 +1 @@
GCF024700185.1,.github/data/assemblies/GCF024700185.1.fa
7 changes: 7 additions & 0 deletions .github/environments/art.yml
@@ -0,0 +1,7 @@
name: art
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
name: check-outputs
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - python=3
  - jsonschema=4.20.0
  - pyyaml=6.0.1
114 changes: 114 additions & 0 deletions .github/scripts/check_outputs.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import os
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.
    """
    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception as e:
                # Report the failure and let main() record it and set the exit code
                print(f"Error validating {provenance_file}: {e}")
                return False

    return True

def check_expected_files_exist(output_dir, sample_ids):
    """
    Check that the expected files exist in the output directory.
    :param output_dir: Path to the output directory
    :param sample_ids: List of sample IDs
    :return: True if all expected files exist, False otherwise
    :rtype: bool
    """
    for sample_id in sample_ids:
        expected_files = [
            f"{sample_id}/{sample_id}_fastp.csv",
            f"{sample_id}/{sample_id}_fastp.json",
            f"{sample_id}/{sample_id}_quast.csv",
            f"{sample_id}/{sample_id}_abricate_ncbi.tsv",
            f"{sample_id}/{sample_id}_abricate_plasmidfinder.tsv",
            f"{sample_id}/{sample_id}_resistance_gene_report.tsv",
        ]

        for expected_file in expected_files:
            expected_file_path = os.path.join(output_dir, expected_file)
            if not os.path.exists(expected_file_path):
                print(f"Expected file {expected_file_path} not found")
                return False

    return True


def main(args):

    output_dir = os.path.dirname(args.output)
    os.makedirs(output_dir, exist_ok=True)

    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

    sample_ids = [os.path.basename(provenance_file).split("_")[0] for provenance_file in provenance_files]

    # TODO: Add more tests
    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "all_expected_files_exist",
            "test_passed": check_expected_files_exist(args.pipeline_outdir, sample_ids),
        },
    ]

    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
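For reference, given the two tests defined above, the results file written by this script (artifacts/check_outputs_results.csv when invoked via check_outputs.sh below) would look roughly like the following on a fully passing run; the rows shown here are illustrative, not captured from a real run:

test_name,test_result
provenance_format_valid,PASS
all_expected_files_exist,PASS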
12 changes: 12 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -e -o pipefail

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
11 changes: 11 additions & 0 deletions .github/scripts/create_samplesheet.sh
@@ -0,0 +1,11 @@
#!/bin/bash

echo 'ID,R1,R2,ASSEMBLY' > .github/data/samplesheet.csv

for i in $(ls ${PWD}/.github/data/fastq/*_R1.fastq.gz); do
    ID=$(basename $i _R1.fastq.gz)
    R1=$i
    R2=${PWD}/.github/data/fastq/${ID}_R2.fastq.gz
    ASSEMBLY=${PWD}/.github/data/assemblies/${ID}.fa
    echo "$ID,$R1,$R2,$ASSEMBLY" >> .github/data/samplesheet.csv
done
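As a sketch of the result: for the single simulated sample in this test setup, the generated .github/data/samplesheet.csv would contain a header row plus one line per sample, along the lines of the following (with /path/to/repo standing in for the actual ${PWD}; the paths are illustrative):

ID,R1,R2,ASSEMBLY
GCF024700185.1,/path/to/repo/.github/data/fastq/GCF024700185.1_R1.fastq.gz,/path/to/repo/.github/data/fastq/GCF024700185.1_R2.fastq.gz,/path/to/repo/.github/data/assemblies/GCF024700185.1.fa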
16 changes: 16 additions & 0 deletions .github/scripts/download_assemblies.sh
@@ -0,0 +1,16 @@
#!/bin/bash

mkdir -p .github/data/assemblies

rm -f .github/data/assemblies/GCF_024700185.1.zip
rm -f .github/data/assemblies/GCF024700185.1.fa
rm -f .github/data/assemblies/README.md

curl -o .github/data/assemblies/GCF_024700185.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_024700185.1/download?include_annotation_type=GENOME_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED"

unzip .github/data/assemblies/GCF_024700185.1.zip -d .github/data/assemblies

mv .github/data/assemblies/ncbi_dataset/data/GCF_024700185.1/GCF_024700185.1_ASM2470018v1_genomic.fna .github/data/assemblies/GCF024700185.1.fa

rm -r .github/data/assemblies/ncbi_dataset
rm -f .github/data/assemblies/README.md
27 changes: 27 additions & 0 deletions .github/scripts/download_mob-suite_db.sh
@@ -0,0 +1,27 @@
#!/bin/bash

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

mkdir -p .github/data

rm -rf .github/data/mob-suite-db

pushd .github/data

wget -O data.tar.gz https://zenodo.org/records/10304948/files/data.tar.gz?download=1

tar -xzf data.tar.gz

rm data.tar.gz

mv data mob-suite-db

conda activate plasmid-screen-35d122a137231eda3b8a0039d42f24f6

mash sketch -i mob-suite-db/ncbi_plasmid_full_seqs.fas

makeblastdb -in mob-suite-db/ncbi_plasmid_full_seqs.fas -dbtype nucl

popd
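The mash sketch and makeblastdb steps above pre-build the index files that mob-suite would otherwise try to create at runtime (the "prepare mob-suite db prior to use" step in the commit message). Assuming both tools use their default output naming, the prepared database directory would contain at least the following alongside the FASTA; these filenames are the tools' defaults, not verified from this run:

mob-suite-db/ncbi_plasmid_full_seqs.fas
mob-suite-db/ncbi_plasmid_full_seqs.fas.msh
mob-suite-db/ncbi_plasmid_full_seqs.fas.nhr
mob-suite-db/ncbi_plasmid_full_seqs.fas.nin
mob-suite-db/ncbi_plasmid_full_seqs.fas.nsq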
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
@@ -0,0 +1,22 @@
#!/bin/bash
set -eo pipefail

artifacts_dir="artifacts"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

echo ". /opt/minconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -eo pipefail

artifacts_dir="artifacts"

echo Install Nextflow .. >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
#!/bin/bash

artifacts_dir="artifacts"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
23 changes: 23 additions & 0 deletions .github/scripts/run_pipeline.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -eo pipefail

if [ -n "${GITHUB_ACTIONS}" ]; then
echo "Running in GitHub Actions Environment"
echo "Adjusting nextflow.config"
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
else
echo "Not running in GitHub Actions Environment"
fi

nextflow run main.nf \
-profile conda \
--cache ${HOME}/.conda/envs \
--samplesheet_input .github/data/samplesheet.csv \
--pre_assembled \
--mob_db ${PWD}/.github/data/mob-suite-db \
--collect_outputs \
--collected_outputs_prefix test \
--outdir .github/data/test_output \
-with-report .github/data/test_output/nextflow_report.html \
-with-trace .github/data/test_output/nextflow_trace.tsv
18 changes: 18 additions & 0 deletions .github/scripts/run_tests_locally.sh
@@ -0,0 +1,18 @@
#!/bin/bash

rm -rf .github/data/assemblies/*
rm -rf .github/data/fastq/*
rm -rf .github/data/mob-suite-db
rm -rf .github/data/samplesheet.csv
rm -rf .github/data/test_output

.github/scripts/download_assemblies.sh

.github/scripts/simulate_reads.sh

.github/scripts/download_mob-suite_db.sh

.github/scripts/create_samplesheet.sh

.github/scripts/run_pipeline.sh

35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
#!/bin/bash


source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
    art_illumina \
        --paired \
        --in ${assembly} \
        --fcov 12 \
        --len 150 \
        --mflen 400 \
        --sdev 100 \
        --rndSeed 42 \
        --qShift 0 \
        --qShift2 0 \
        --out .github/data/fastq/${sample_id}_R

    rm -f .github/data/fastq/${sample_id}_R1.aln
    rm -f .github/data/fastq/${sample_id}_R2.aln

    mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
    mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq

    gzip -f .github/data/fastq/${sample_id}_R1.fastq
    gzip -f .github/data/fastq/${sample_id}_R2.fastq

done < .github/data/reads_to_simulate.csv

52 changes: 52 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,52 @@
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
  workflow_dispatch:
name: Tests
jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        nextflow_version: ["21.04.3", "23.10.1"]
    name: Run tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Create Artifacts Directory
        run: mkdir artifacts
      - name: Install Miniconda
        run: bash .github/scripts/install_conda.sh
      - name: Install Nextflow
        env:
          NXF_VER: ${{ matrix.nextflow_version }}
        run: bash .github/scripts/install_nextflow.sh
      - name: Create ART Read-Simulation Environment
        run: bash .github/scripts/create_art_environment.sh
      - name: Download Assemblies
        run: bash .github/scripts/download_assemblies.sh
      - name: Simulate Reads
        run: bash .github/scripts/simulate_reads.sh
      - name: Download mob-suite db
        run: bash .github/scripts/download_mob-suite_db.sh
      - name: Create SampleSheet
        run: bash .github/scripts/create_samplesheet.sh
      - name: Run Pipeline
        run: bash .github/scripts/run_pipeline.sh
      - name: Create Output Checking Environment
        run: bash .github/scripts/create_output_checking_environment.sh
      - name: Check Outputs
        run: bash .github/scripts/check_outputs.sh
      - name: Prepare Artifacts
        if: always()
        run: bash .github/scripts/prepare_artifacts.sh
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: artifacts-BCCDC-PHL-routine-assembly-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
          path: artifacts