Add testing pipeline
dfornika committed Jun 19, 2024
1 parent 74b8048 commit a7c1fca
Showing 13 changed files with 301 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/data/reads_to_simulate.csv
@@ -0,0 +1 @@
CP003200.1,.github/data/assemblies/CP003200.1.fa
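
Each row of reads_to_simulate.csv maps a sample ID to the assembly FASTA that reads are simulated from (used by simulate_reads.sh below). A hypothetical second sample could be appended with a line such as the following, where the sample ID and path are placeholders:

echo 'SAMPLE2,.github/data/assemblies/SAMPLE2.fa' >> .github/data/reads_to_simulate.csv
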
114 changes: 114 additions & 0 deletions .github/scripts/check_outputs.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import os
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
    """
    Check that the provenance files are valid according to the schema.
    """
    for provenance_file in provenance_files:
        with open(provenance_file) as f:
            try:
                provenance = yaml.load(f, Loader=yaml.BaseLoader)
                validate(provenance, schema)
            except Exception as e:
                print(f"Error validating {provenance_file}: {e}")
                return False

    return True


def check_expected_files_exist(output_dir, sample_ids):
    """
    Check that the expected files exist in the output directory.
    :param output_dir: Path to the output directory
    :param sample_ids: List of sample IDs
    :return: True if all expected files exist, False otherwise
    :rtype: bool
    """
    for sample_id in sample_ids:
        expected_files = [
            f"{sample_id}/{sample_id}_fastp.csv",
            f"{sample_id}/{sample_id}_fastp.json",
            f"{sample_id}/{sample_id}_quast.csv",
            f"{sample_id}/{sample_id}_abricate_ncbi.tsv",
            f"{sample_id}/{sample_id}_abricate_plasmidfinder.tsv",
            f"{sample_id}/{sample_id}_resistance_gene_report.tsv",
        ]

        for expected_file in expected_files:
            expected_file_path = os.path.join(output_dir, expected_file)
            if not os.path.exists(expected_file_path):
                print(f"Expected file {expected_file_path} not found")
                return False

    return True


def main(args):
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Fetch the provenance schema used to validate the per-sample provenance files
    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
    provenance_schema_path = ".github/data/pipeline-provenance.json"
    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

    provenance_schema = None
    with open(provenance_schema_path) as f:
        provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

    # Provenance filenames are expected to begin with the sample ID
    sample_ids = [os.path.basename(provenance_file).split("_")[0] for provenance_file in provenance_files]

    # TODO: Add more tests
    tests = [
        {
            "test_name": "provenance_format_valid",
            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
        },
        {
            "test_name": "all_expected_files_exist",
            "test_passed": check_expected_files_exist(args.pipeline_outdir, sample_ids),
        },
    ]

    output_fields = [
        "test_name",
        "test_result"
    ]

    output_path = args.output
    with open(output_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
        writer.writeheader()
        for test in tests:
            if test["test_passed"]:
                test["test_result"] = "PASS"
            else:
                test["test_result"] = "FAIL"
            writer.writerow(test)

    # Exit non-zero if any test failed, so the CI job is marked as failed
    for test in tests:
        if not test['test_passed']:
            exit(1)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Check outputs')
    parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
    args = parser.parse_args()
    main(args)
12 changes: 12 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -e -o pipefail

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


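# Run the output checks against the test output directory and write a PASS/FAIL summary CSV into the artifacts directory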
.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
5 changes: 5 additions & 0 deletions .github/scripts/download_assemblies.sh
@@ -0,0 +1,5 @@
#!/bin/bash

mkdir -p .github/data/assemblies

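# Fetch the CP003200.1 assembly in FASTA format from NCBI via the E-utilities efetch endpoint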
curl -o .github/data/assemblies/CP003200.1.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=CP003200.1&db=nucleotide&rettype=fasta"
15 changes: 15 additions & 0 deletions .github/scripts/download_mob-suite_db.sh
@@ -0,0 +1,15 @@
#!/bin/bash

mkdir -p .github/data

pushd .github/data

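# Download a pre-built mob-suite database archive from Zenodo and unpack it as mob-suite-db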
wget -O data.tar.gz https://zenodo.org/records/10304948/files/data.tar.gz?download=1

tar -xzf data.tar.gz

rm data.tar.gz

mv data mob-suite-db

popd
22 changes: 22 additions & 0 deletions .github/scripts/install_conda.sh
@@ -0,0 +1,22 @@
#!/bin/bash
set -eo pipefail

artifacts_dir="artifacts"

echo "Install Miniconda .." >> ${artifacts_dir}/test.log

export PATH=/opt/miniconda3/bin:$PATH

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

/bin/bash ~/miniconda.sh -b -p /opt/miniconda3

rm ~/miniconda.sh

echo ". /opt/minconda3/etc/profile.d/conda.sh" >> ~/.bashrc

conda update -n base -c defaults conda

conda install -y -c conda-forge mamba

conda init bash
11 changes: 11 additions & 0 deletions .github/scripts/install_nextflow.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -eo pipefail

artifacts_dir="artifacts"

echo "Install Nextflow .." >> ${artifacts_dir}/test.log

wget -qO- https://get.nextflow.io | bash

sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
#!/bin/bash

artifacts_dir="artifacts"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
17 changes: 17 additions & 0 deletions .github/scripts/run_pipeline.sh
@@ -0,0 +1,17 @@
#!/bin/bash

set -eo pipefail

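# Scale down the CPU request in the default nextflow.config so the test run fits on a CI runner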
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config

nextflow run main.nf \
    -profile conda \
    --cache ${HOME}/.conda/envs \
    --fastq_input .github/data/fastq \
    --mob_db .github/data/mob-suite-db \
    --prokka \
    --collect_outputs \
    --collected_outputs_prefix test \
    --outdir .github/data/test_output \
    -with-report .github/data/test_output/nextflow_report.html \
    -with-trace .github/data/test_output/nextflow_trace.tsv
35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
#!/bin/bash


source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

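# Simulate paired-end Illumina reads with ART for each sample_id,assembly pair listed in reads_to_simulate.csv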
while IFS=',' read -r sample_id assembly; do
    art_illumina \
        --paired \
        --in ${assembly} \
        --fcov 12 \
        --len 150 \
        --mflen 400 \
        --sdev 100 \
        --rndSeed 42 \
        --qShift 0 \
        --qShift2 0 \
        --out .github/data/fastq/${sample_id}_R

    rm -f .github/data/fastq/${sample_id}_R1.aln
    rm -f .github/data/fastq/${sample_id}_R2.aln

    mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
    mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq

    gzip -f .github/data/fastq/${sample_id}_R1.fastq
    gzip -f .github/data/fastq/${sample_id}_R2.fastq

done < .github/data/reads_to_simulate.csv

50 changes: 50 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,50 @@
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
  workflow_dispatch:
name: Tests
jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        nextflow_version: ["21.04.3", "23.10.1"]
    name: Run tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - name: Create Artifacts Directory
        run: mkdir artifacts
      - name: Install Miniconda
        run: bash .github/scripts/install_conda.sh
      - name: Install Nextflow
        env:
          NXF_VER: ${{ matrix.nextflow_version }}
        run: bash .github/scripts/install_nextflow.sh
      - name: Create ART Read-Simulation Environment
        run: bash .github/scripts/create_art_environment.sh
      - name: Download Assemblies
        run: bash .github/scripts/download_assemblies.sh
      - name: Simulate Reads
        run: bash .github/scripts/simulate_reads.sh
      - name: Download mob-suite db
        run: bash .github/scripts/download_mob-suite_db.sh
      - name: Run Pipeline
        run: bash .github/scripts/run_pipeline.sh
      - name: Create Output Checking Environment
        run: bash .github/scripts/create_output_checking_environment.sh
      - name: Check Outputs
        run: bash .github/scripts/check_outputs.sh
      - name: Prepare Artifacts
        if: always()
        run: bash .github/scripts/prepare_artifacts.sh
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: artifacts-BCCDC-PHL-routine-assembly-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
          path: artifacts
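
For debugging outside of GitHub Actions, the same sequence can be reproduced locally; a minimal sketch, assuming conda and Nextflow are already installed and the repository root is the working directory:

mkdir -p artifacts
bash .github/scripts/create_art_environment.sh
bash .github/scripts/download_assemblies.sh
bash .github/scripts/simulate_reads.sh
bash .github/scripts/download_mob-suite_db.sh
bash .github/scripts/run_pipeline.sh
bash .github/scripts/create_output_checking_environment.sh
bash .github/scripts/check_outputs.sh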
