Skip to content

Commit

Permalink
Merge pull request #837 from broadinstitute/staging
Browse files Browse the repository at this point in the history
Staging -> Master
  • Loading branch information
nikellepetrillo committed Nov 10, 2022
2 parents 46084d5 + e30bd74 commit 434cbd9
Show file tree
Hide file tree
Showing 202 changed files with 2,181 additions and 1,811 deletions.
20 changes: 17 additions & 3 deletions .pullapprove.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ groups:
'tasks/broad/IlluminaGenotypingArrayTasks.wdl' in files or
'tasks/broad/InternalArraysTasks.wdl' in files or
'tasks/broad/InternalTasks.wdl' in files or
'tasks/broad/Qc.wdl' in files or
'tasks/broad/Utilities.wdl' in files or
'verification/VerifyArrays.wdl' in files or
'verification/VerifyIlluminaGenotypingArray.wdl' in files or
Expand Down Expand Up @@ -138,6 +139,8 @@ groups:
'tasks/broad/BamProcessing.wdl' in files or
'tasks/broad/BamToCram.wdl' in files or
'tasks/broad/CopyFilesFromCloudToCloud.wdl' in files or
'tasks/broad/DragenTasks.wdl' in files or
'tasks/broad/DragmapAlignment.wdl' in files or
'tasks/broad/GermlineVariantDiscovery.wdl' in files or
'tasks/broad/Qc.wdl' in files or
'tasks/broad/SplitLargeReadGroup.wdl' in files or
Expand All @@ -148,6 +151,7 @@ groups:
'verification/VerifyReprocessing.wdl' in files or
'verification/VerifyTasks.wdl' in files or
'pipelines/broad/dna_seq/germline/single_sample/exome' in files or
'pipelines/broad/dna_seq/germline/single_sample/ugwgs' in files or
'pipelines/broad/dna_seq/germline/single_sample/wgs' in files or
'pipelines/broad/reprocessing/cram_to_unmapped_bams' in files or
'pipelines/broad/reprocessing/exome' in files or
Expand All @@ -160,7 +164,7 @@ groups:
request: 2
reviewers:
users:
- ldgauthier # Laura Gauthier
- samuelklee # Samuel Lee
- kachulis # Chris Kachulis

scientific_owners_joint_genotyping:
Expand All @@ -173,13 +177,19 @@ groups:
'pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartOne.wdl' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome/JointGenotypingByChromosomePartTwo.wdl' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl' in files or
'tasks/broad/GermlineVariantDiscovery.wdl' in files or
'tasks/broad/JointGenotypingTasks.wdl' in files or
'tasks/broad/Qc.wdl' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.changelog.md' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.options.json' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotyping.wdl' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/JointGenotypingOnReblockedValidate.md' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/by_chromosome' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/exome' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/reblocking' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/test_data_overview.md' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/test_inputs' in files or
'pipelines/broad/dna_seq/germline/joint_genotyping/wgs' in files
reviews:
Expand All @@ -189,7 +199,7 @@ groups:
request_order: given
reviewers:
users:
- ldgauthier # Laura Gauthier
- samuelklee # Samuel Lee

scientific_owners_somatic_single_sample:
conditions:
Expand All @@ -203,6 +213,7 @@ groups:
'tasks/broad/Alignment.wdl' in files or
'tasks/broad/BamProcessing.wdl' in files or
'tasks/broad/BamToCram.wdl' in files or
'tasks/broad/DragmapAlignment.wdl' in files or
'tasks/broad/Qc.wdl' in files or
'tasks/broad/SplitLargeReadGroup.wdl' in files or
'tasks/broad/UnmappedBamToAlignedBam.wdl' in files or
Expand Down Expand Up @@ -235,7 +246,8 @@ groups:
'pipelines/broad/arrays/imputation/Imputation.changelog.md' in files or
'pipelines/broad/arrays/imputation/Imputation.options.json' in files or
'pipelines/broad/arrays/imputation/Imputation.wdl' in files or
'pipelines/broad/arrays/imputation/example_inputs.json' in files
'pipelines/broad/arrays/imputation/example_inputs.json' in files or
'pipelines/broad/arrays/imputation/test_inputs' in files
reviews:
required: 1
Expand All @@ -262,6 +274,8 @@ groups:
'tasks/broad/Alignment.wdl' in files or
'tasks/broad/BamProcessing.wdl' in files or
'tasks/broad/BamToCram.wdl' in files or
'tasks/broad/DragenTasks.wdl' in files or
'tasks/broad/DragmapAlignment.wdl' in files or
'tasks/broad/GermlineVariantDiscovery.wdl' in files or
'tasks/broad/Qc.wdl' in files or
'tasks/broad/SplitLargeReadGroup.wdl' in files or
Expand Down
22 changes: 11 additions & 11 deletions beta-pipelines/skylab/ATAC/ATAC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ task BWAPairedEndAlignment {
String read_group_sample_name
Int cpu
String output_base_name
String docker_image = "quay.io/humancellatlas/snaptools:0.0.1"
String docker_image = "us.gcr.io/broad-gotc-prod/bwa:1.0.0-0.7.17-1660770463"
}

parameter_meta {
Expand All @@ -241,7 +241,7 @@ task BWAPairedEndAlignment {
read_group_sample_name: "the read group sample to be added upon alignment"
cpu: "the number of cpu cores to use during alignment"
output_base_name: "basename to be used for the output of the task"
docker_image: "the docker image using BWA to be used (default: quay.io/humancellatlas/snaptools:0.0.1)"
docker_image: "the docker image using BWA to be used (default: us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730)"
}

# runtime requirements based upon input file size
Expand Down Expand Up @@ -580,7 +580,7 @@ task SnapPre {
String genome_name
Int max_fragment_length
File genome_size_file
String docker_image = "quay.io/humancellatlas/snaptools:0.0.1"
String docker_image = "us.gcr.io/broad-gotc-prod/snaptools-bwa:1.0.0-1.4.8-0.7.17-1660844602"
}

parameter_meta {
Expand All @@ -589,7 +589,7 @@ task SnapPre {
genome_name: "the name of the genome being analyzed"
max_fragment_length: "the maximum fragment length for filtering out reads by snap-pre (snaptools task)"
genome_size_file: "size for the chromoomes for the genome; ex: mm10.chrom.size"
docker_image: "the docker image using snaptools to be used (default: quay.io/humancellatlas/snaptools:0.0.1)"
docker_image: "the docker image using snaptools to be used (default: us.gcr.io/broad-gotc-prod/snaptools-bwa:1.0.0-1.4.8-0.7.17-1660844602)"
}

String snap_file_output_name = output_base_name + ".snap"
Expand Down Expand Up @@ -635,14 +635,14 @@ task SnapCellByBin {
File snap_input
String bin_size_list
String snap_output_name = "output.snap"
String docker_image = "quay.io/humancellatlas/snaptools:0.0.1"
String docker_image = "us.gcr.io/broad-gotc-prod/snaptools-bwa:1.0.0-1.4.8-0.7.17-1660844602"
}

parameter_meta {
snap_input: "the bam to passed into snaptools tools"
bin_size_list: "space separated list of bins to generate"
snap_output_name: "output.snap"
docker_image: "the docker image to be used (default: quay.io/humancellatlas/snaptools:0.0.1)"
docker_image: "the docker image to be used (default: us.gcr.io/broad-gotc-prod/snaptools-bwa:1.0.0-1.4.8-0.7.17-1660844602)"
}

Int num_threads = 1
Expand Down Expand Up @@ -673,21 +673,21 @@ task MakeCompliantBAM {
input {
File bam_input
String output_base_name
String docker_image = "quay.io/humancellatlas/snaptools:0.0.1"
String docker_image = "us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730"
}

parameter_meta {
bam_input: "the bam with barcodes in the read ids that need to be converted to barcodes in bam tags"
output_base_name: "base name to be used for the output of the task"
docker_image: "the docker image using the python script to convert the bam barcodes/read ids (default: quay.io/humancellatlas/snaptools:0.0.1)"
docker_image: "the docker image using the python script to convert the bam barcodes/read ids (default: us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730)"
}

Int disk_size = ceil(2.5 * (if size(bam_input, "GiB") < 1 then 1 else size(bam_input, "GiB")))

String compliant_bam_output_name = output_base_name + ".compliant.bam"

command {
makeCompliantBAM.py \
/usr/gitc/makeCompliantBAM.py \
--input-bam ~{bam_input} \
--output-bam ~{compliant_bam_output_name}
}
Expand All @@ -707,15 +707,15 @@ task MakeCompliantBAM {
task BreakoutSnap {
input {
File snap_input
String docker_image = "quay.io/humancellatlas/snap-breakout:0.0.1"
String docker_image = "us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730"
String bin_size_list
}
Int num_threads = 1
Float input_size = size(snap_input, "GiB")
command {
set -euo pipefail
mkdir output
breakoutSnap.py --input ~{snap_input} \
/usr/gitc/breakoutSnap.py --input ~{snap_input} \
--output-prefix output/
}
output {
Expand Down
50 changes: 34 additions & 16 deletions dockers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@ This style guide provides formatting guidelines and best practices for writing D
* [Goals](#goals)
* [Small images](#small)
* [Alpine base](#alpine)
* [Specifying image platform](#platform)
* [Minimal RUN steps](#minimal-run)
* [Publicly accessible](#publicly)
* [Image scanning](#scanning)
* [Semantic tagging](#semantic)
* [Proper process reaping](#process)
* [Build Scripts and README](#build)
* [Formatting](#formatting)
* [Troubleshooting](#trouble)
* [Troubleshooting and running standalone](#trouble)
## <a name="overview"></a> Overview

WARP maintains a collection of docker images which are used as execution environments for various cloud-optimized data processing pipelines. Many of these image require specific sets of tools and dependencies to run and can be thought of as _custom_ images rather than traditional application images.
Expand All @@ -35,7 +36,7 @@ The easiest way to have a small image is to use an [Alpine](https://alpinelinux.

Along with being a small base, Alpine also has built in deletion of package index and provides [tini](https://github.com/krallin/tini) natively through APK.

There are some instances where a Debian base image is unavoidable, specifically in the case where dependencies don't exists in APK. It is suggested that you only go to a Debian base as a last resort.
There are some instances where a Debian base image is unavoidable, specifically in the case where dependencies don't exist in APK. It is suggested that you only go to a Debian base as a last resort.


##### :eyes: Example
Expand All @@ -62,12 +63,23 @@ RUN set -eux; \
bash \
```

#### <a name="platform"></a> Specifying image platform

Docker images built on ARM-based machines such as the new M-series Macs may run into execution issues with our automated PR test suite.
One way to avoid these issues is to use a `linux/amd64` base image by including the `--platform="linux/amd64` flag after the `FROM` keyword.

##### :eyes: Example
```dockerfile
# Use the amd64 version of alpine
FROM --platform="linux/amd64" alpine
```

#### <a name="minimal-run"></a> Minimal RUN steps

Having minimal `RUN`steps (ideally one) is another highly effective way to reduce the size of your image. Each instruction in a Dockerfile creates a [layer](https://docs.docker.com/storage/storagedriver/) and these layers are what add up to build the final image.
When you use multple `RUN` steps it creates additional unnecessary layers and bloats your image.
When you use multiple `RUN` steps it creates additional unnecessary layers and bloats your image.

An alternative to having a single `RUN` step is to use [multi-stage builds](https://docs.docker.com/develop/develop-images/multistage-build/) which are effective when the application your are containerizing is just a statically linked binary.
An alternative to having a single `RUN` step is to use [multi-stage builds](https://docs.docker.com/develop/develop-images/multistage-build/) which are effective when the application you are containerizing is just a statically linked binary.
Just to note, many of the images maintained in WARP require a handful of system-level dependencies and custom packages so multi-stages builds are typically not used.

##### :eyes: Example
Expand All @@ -84,7 +96,7 @@ RUN set -eux; \
apk add --no-cache \
curl \
bash \
; \
; \
wget https://www.somezipfile.com/zip; \
unzip zip
```
Expand All @@ -96,22 +108,22 @@ The pipelines that we maintain in WARP are designed for public use, ideally we w
* Anybody can pull our images
* Anybody can build our images

For anybody to be able to pull our images they must be hosted on a public container registry, we host all of our images in publics repos on GCR (our 'official' location) and Quay (for discoverability).
For anybody to be able to pull our images they must be hosted on a public container registry, we host all of our images in public repos on GCR (our 'official' location) and Quay (for discoverability).

* GCR - `us.gcr.io/broad-gotc-prod`
* Quay - `quay.io/broadinstitute/broad-gotc-prod`

For anybody to be able to build our images all of the functionality should be encapsulated in the Dockerfile. Any custom software packages, dependencies etc. have to be downloaded from public links within the Dockerfile, this obviously means that we should not be copying files from within the Broad network infrastucture into our images.
For anybody to be able to build our images, all functionality should be encapsulated in the Dockerfile. Any custom software packages, dependencies etc. have to be downloaded from public links within the Dockerfile, this obviously means that we should not be copying files from within the Broad network infrastructure into our images.

### <a name="scanning"></a> Image scanning


All of the images that we build are scanned for critical vulnerabilities on every pull request. For this we use a github-action that leverages [trivy](https://github.com/aquasecurity/trivy) for scanning. If you build a new image please add it to the action [here](../.github/workflows/trivy.yml).
All images that we build are scanned for critical vulnerabilities on every pull request. For this we use a github-action that leverages [trivy](https://github.com/aquasecurity/trivy) for scanning. If you build a new image please add it to the action [here](../.github/workflows/trivy.yml).

### <a name="semantic"></a> Semantic tagging


We recommend against using rolling tags like `master` or `latest` when building images. Rolling tags make it hard to track down versions of images since the underlying image hash and content could be different across the same tags. Instead we ask that you use a semantic tag that follows the convention below:
We recommend against using rolling tags like `master` or `latest` when building images. Rolling tags make it hard to track down versions of images since the underlying image hash and content could be different across the same tags. Instead, we ask that you use a semantic tag that follows the convention below:

##### `us.gcr.io/broad-gotc-prod/samtools:<image-version>-<samtools-version>-<unix-timestamp>`

Expand All @@ -120,7 +132,7 @@ This example is for an image we use containing `samtools`. The 'image-version' i
### <a name="process"></a> Proper process reaping


Classic init systems like systemd are used to reap orphaned, zombie processes. Typically these orphaned processes are reattached to the process at PID 1 which will reap them when they die. In a container this responsibility falls to process at PID 1 which is by default `/bin/sh`...this obviously will not handle process reaping. Because of this you run the risk of expending excess memory or resources within your container. A simple solution to this is to use `tini` in all of our images, a lengthy explanation of what this package does can be found [here](https://github.com/krallin/tini/issues/8).
Classic init systems like systemd are used to reap orphaned, zombie processes. Typically, these orphaned processes are reattached to the process at PID 1 which will reap them when they die. In a container this responsibility falls to process at PID 1 which is by default `/bin/sh`...this obviously will not handle process reaping. Because of this you run the risk of expending excess memory or resources within your container. A simple solution to this is to use `tini` in all of our images, a lengthy explanation of what this package does can be found [here](https://github.com/krallin/tini/issues/8).

Luckily `tini` is available natively through APK so all you have to do is install it and set it as the default entrypoint!

Expand All @@ -129,7 +141,7 @@ Luckily `tini` is available natively through APK so all you have to do is instal

FROM alpine:3.9

RUN set -eux;
RUN set -eux; \
apk add --no-cache \
tini

Expand All @@ -146,15 +158,15 @@ See the examples for samtools([docker_build](./broad/samtools/docker_build.sh),

## Formatting

Formatting our Dockerfiles consistenty helps improve readability and eases maintenance headaches down the road. The following are a couple of tenants that we follow when writing our Dockerfiles:
Formatting our Dockerfiles consistently helps improve readability and eases maintenance headaches down the road. The following are a couple of tenants that we follow when writing our Dockerfiles:

* ARGS, ENV, LABEL in that order
* Always add versions of tools in the LABEL
* Single RUN steps
* Alphabetize package install
* Clean up package index cache
* Use ; instead of && for line continuation
* Logically seperate steps within RUN
* Logically separate steps within RUN
* Four spaces per tab indent
* Short comments to describe each step
* tini is always default entrypoint
Expand All @@ -180,13 +192,13 @@ WORKDIR /usr/gitc
# Install dependencies
RUN set -eux; \
apt-get update; \
apt-get install -y \
apt-get install -y \
autoconf \
cmake \
g++ \
gcc \
git \
libbz2-dev \
libbz2-dev \
libcurl4-openssl-dev \
libhts-dev \
libssl-dev \
Expand Down Expand Up @@ -222,6 +234,12 @@ RUN set -eux; \
ENTRYPOINT [ "/sbin/tini", "--" ]
```

## <a link="trouble"></a> Troubleshooting
## <a link="trouble"></a> Troubleshooting and running standalone

The WARP dockers are designed to be run from their respective WDL pipelines. However, if you need to run a Docker independent of a WDL for testing or troubleshooting, you'll likely need to explicity instruct it to run a `bash` shell in the `run` command. An example of this is shown in the terminal command below:

```bash
docker run -it --rm <docker url> bash
```

If you have any questions or would like some more guidance on writing Dockerfiles please file a [GitHub issue in WARP](https://github.com/broadinstitute/warp/issues/new).
4 changes: 2 additions & 2 deletions dockers/broad/imputation/bcftools_vcftools/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.8-alpine
FROM --platform=linux/amd64 python:3.8-alpine

ARG BCFTOOLS_VERSION=1.10.2 \
VCFTOOLS_VERSION=0.1.16
Expand Down Expand Up @@ -70,7 +70,7 @@ RUN set -eux; \
./configure; \
make; \
make install; \
\
\
cd ../..; \
rm -r samtools-1.10; \
rm samtools-1.10.tar.bz2
Expand Down
Loading

0 comments on commit 434cbd9

Please sign in to comment.