|
1 | 1 | #!/bin/bash
|
| 2 | +# usage: ingest-gisaid [--fetch] |
| 3 | +# ingest-gisaid --help |
| 4 | +# |
| 5 | +# Ingest SARS-CoV-2 metadata and sequences from GISAID. |
| 6 | +# |
| 7 | +# If the --fetch flag is given, new records are fetched from GISAID. Otherwise, |
| 8 | +# ingest from the existing GISAID NDJSON file on S3. |
| 9 | +# |
set -euo pipefail
# ^ Strict mode: abort on command failure, on use of an unset variable, and
#   when any stage of a pipeline fails. Keep `set` as the first non-comment
#   line of the file: print-help prints every leading "#" line and stops at
#   the first line that is not a comment.

# S3 locations. Both can be overridden from the environment; the `:=`
# expansions below only assign when the variable is unset or empty.
#   S3_SRC - bucket/prefix that existing files are compared against.
#   S3_DST - bucket/prefix that files are written to (defaults to S3_SRC).
: "${S3_SRC:=s3://nextstrain-ncov-private}"
: "${S3_DST:=$S3_SRC}"
|
6 | 14 |
|
7 |
| -# Determine where to save data files based on if we're running as a result of a |
8 |
| -# push to master or to another branch (or locally, outside of the GitHub |
9 |
| -# workflow). Files are always compared to the default/primary paths in the |
10 |
| -# source S3 bucket. |
11 |
| -# |
12 |
| -silent= |
13 |
| -branch= |
# main [--fetch]
# main -h|--help
#
# Entry point of the ingest. Parses the command line, chooses the S3
# destination prefix from GITHUB_REF, obtains data/gisaid.ndjson, runs the
# transform/check scripts on it and uploads the resulting files to "$S3_DST".
#
# Globals:
#   S3_SRC      (read)    location existing files are compared against
#   S3_DST      (written) gets "/branch/<name>" or "/tmp" appended when not
#                         running for refs/heads/master
#   GITHUB_REF  (read)    optional; unset/empty is treated as a local run
# Arguments:
#   --fetch     run ./bin/fetch-from-gisaid instead of downloading the
#               existing gisaid.ndjson.gz from "$S3_DST"
#   -h, --help  print the help text and exit
# Exits:
#   2 on an unrecognized argument; 0 without ingesting when GITHUB_REF is set
#   but is not a refs/heads/* ref. Any failing command aborts the script if
#   the caller has enabled `set -e`.
main() {
    local fetch=0
    local arg

    # Look at every argument before doing any work, so that the order of
    # --fetch and --help does not matter and a mistyped flag is reported
    # instead of silently selecting the non-fetch code path.
    for arg; do
        case "$arg" in
            -h|--help)
                print-help
                exit
                ;;
            --fetch)
                fetch=1
                ;;
            *)
                printf '%s: unrecognized argument: %s\n' "${0##*/}" "$arg" >&2
                printf 'Try --help for usage.\n' >&2
                exit 2
                ;;
        esac
    done

    # Determine where to save data files based on if we're running as a result of a
    # push to master or to another branch (or locally, outside of the GitHub
    # workflow). Files are always compared to the default/primary paths in the
    # source S3 bucket.
    #
    local silent=
    local branch=

    case "${GITHUB_REF:-}" in
        refs/heads/master)
            # Do nothing different; defaults above are good.
            branch=master
            ;;
        refs/heads/*)
            # Save data files under a per-branch prefix
            silent=yes
            branch="${GITHUB_REF##refs/heads/}"
            S3_DST="$S3_DST/branch/$branch"
            ;;
        "")
            # Save data files under a tmp prefix
            silent=yes
            S3_DST="$S3_DST/tmp"
            ;;
        *)
            echo "Skipping ingest for ref $GITHUB_REF"
            exit 0
            ;;
    esac

    echo "S3_SRC is $S3_SRC"
    echo "S3_DST is $S3_DST"

    # All ./bin and data/ paths below are relative to the parent of the
    # directory that contains this script.
    cd "$(dirname "$0")/.."

    # Trace every command from here on.
    set -x

    if [[ "$fetch" == 1 ]]; then
        ./bin/fetch-from-gisaid > data/gisaid.ndjson
        # Change notifications are only sent for the master branch.
        if [[ "$branch" == master ]]; then
            ./bin/notify-on-record-change data/gisaid.ndjson "$S3_SRC/gisaid.ndjson.gz" "GISAID"
        fi
        ./bin/upload-to-s3 --quiet data/gisaid.ndjson "$S3_DST/gisaid.ndjson.gz"
    else
        # Reuse the NDJSON already stored under the destination prefix.
        aws s3 cp --no-progress "$S3_DST/gisaid.ndjson.gz" - | gunzip -cfq > data/gisaid.ndjson
    fi

    ./bin/transform-gisaid data/gisaid.ndjson \
        --output-metadata data/gisaid/metadata.tsv \
        --output-fasta data/gisaid/sequences.fasta

    ./bin/flag-metadata data/gisaid/metadata.tsv > data/gisaid/flagged_metadata.txt
    ./bin/check-locations data/gisaid/metadata.tsv \
        data/gisaid/location_hierarchy.tsv \
        gisaid_epi_isl

    # NOTE(review): data/gisaid/additional_info.tsv is not written by any
    # command visible in this function - presumably transform-gisaid produces
    # it; confirm.
    if [[ "$branch" == master ]]; then
        ./bin/notify-on-metadata-change data/gisaid/metadata.tsv "$S3_SRC/metadata.tsv.gz" gisaid_epi_isl
        ./bin/notify-on-additional-info-change data/gisaid/additional_info.tsv "$S3_SRC/additional_info.tsv.gz"
        ./bin/notify-on-flagged-metadata-change data/gisaid/flagged_metadata.txt "$S3_SRC/flagged_metadata.txt.gz"
        ./bin/notify-on-location-hierarchy-addition data/gisaid/location_hierarchy.tsv source-data/location_hierarchy.tsv
    fi

    # ${silent:+--quiet} is intentionally unquoted: it must expand to nothing
    # at all (not an empty argument) when $silent is empty.
    ./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/metadata.tsv "$S3_DST/metadata.tsv.gz"
    ./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/additional_info.tsv "$S3_DST/additional_info.tsv.gz"
    ./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/flagged_metadata.txt "$S3_DST/flagged_metadata.txt.gz"
    ./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/sequences.fasta "$S3_DST/sequences.fasta.gz"
}
58 | 100 |
|
59 |
| -if [[ "$branch" == master ]]; then |
60 |
| - ./bin/notify-on-metadata-change data/gisaid/metadata.tsv "$S3_SRC/metadata.tsv.gz" gisaid_epi_isl |
61 |
| - ./bin/notify-on-additional-info-change data/gisaid/additional_info.tsv "$S3_SRC/additional_info.tsv.gz" |
62 |
| - ./bin/notify-on-flagged-metadata-change data/gisaid/flagged_metadata.txt "$S3_SRC/flagged_metadata.txt.gz" |
63 |
| - ./bin/notify-on-location-hierarchy-addition data/gisaid/location_hierarchy.tsv source-data/location_hierarchy.tsv |
64 |
| -fi |
# print-help [FILE]
#
# Print the comment block at the top of FILE (default: this script, $0) as
# help text on stdout.
#
# Lines are read in order. A line starting with "#!" is skipped. A line
# starting with "#" is printed with that "#" and at most one following space
# removed. The first line that does not start with "#" ends the output.
#
# Arguments: $1 - optional path of the file to read; defaults to "$0"
# Outputs:   the help text, one line per comment line
# Returns:   non-zero if FILE cannot be opened
print-help() {
    local script="${1:-$0}"
    local line

    # `|| [[ -n "$line" ]]` keeps a final comment line that lacks a trailing
    # newline, which a bare `read` would report as failure and drop.
    while read -r line || [[ -n "$line" ]]; do
        if [[ $line =~ ^#! ]]; then
            continue
        elif [[ $line =~ ^# ]]; then
            line="${line#\#}"
            line="${line# }"
            # printf, not echo: a help line such as "-n" or "-e" would be
            # taken as an option by echo and never printed.
            printf '%s\n' "$line"
        else
            break
        fi
    done < "$script"
}
65 | 116 |
|
66 |
| -./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/metadata.tsv "$S3_DST/metadata.tsv.gz" |
67 |
| -./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/additional_info.tsv "$S3_DST/additional_info.tsv.gz" |
68 |
| -./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/flagged_metadata.txt "$S3_DST/flagged_metadata.txt.gz" |
69 |
| -./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/sequences.fasta "$S3_DST/sequences.fasta.gz" |
# Run the ingest, forwarding the script's command-line arguments unchanged.
main "$@"
0 commit comments