diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml
index 84cde2f..7d76be0 100644
--- a/.github/workflows/static_database.yml
+++ b/.github/workflows/static_database.yml
@@ -22,7 +22,11 @@ jobs:
       - name: Install required utilities
         run: |
           sudo apt-get update
-          sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs
+          sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget
+      # Fetch the NCBI taxonomy dump so it can be attached to the release as ncbi-taxdmp.zip.
+      - name: Download Taxdmp file
+        shell: bash
+        run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
       - name: Generate tsv.gz files
         shell: bash
         run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output"
@@ -60,8 +64,8 @@ jobs:
           release_name: Static database ${{ steps.date.outputs.date }}
           draft: false
           prerelease: false
-      - name: Upload Release Asset
-        id: upload-release-asset
+      - name: Upload Static Database Release Asset
+        id: upload-database-release-asset
         uses: actions/upload-release-asset@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -70,3 +74,14 @@ jobs:
           asset_path: ./output.zip
           asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
           asset_content_type: application/zip
+      # Keep asset_name in sync with the ncbi-taxdmp.zip pattern in scripts/build_database.sh.
+      - name: Upload NCBI Taxdmp Release Asset
+        id: upload-taxdmp-release-asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: ./taxdmp.zip
+          asset_name: ncbi-taxdmp.zip
+          asset_content_type: application/zip
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index de2936f..a44d13a 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -263,7 +263,7 @@ CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command sho
 CMD_GZIP="gzip -" # Which pipe compression command should I use?
 ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
 
-TAXON_URL="https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
+TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
 EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
 EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
 GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
@@ -315,11 +315,41 @@ have() {
 
 ### All the different database construction steps.
 
+# Download the NCBI taxonomy dump to "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip".
+# The copy attached to the latest unipept-database GitHub release is preferred;
+# $TAXON_FALLBACK_URL is used when the release lists no such asset or when
+# downloading that asset fails. TAXON_URL is left global and holds the URL of
+# the last attempt. Returns 0 on success, else the fallback curl's exit status.
+download_taxdmp() {
+    local latest_release_url="https://api.github.com/repos/unipept/unipept-database/releases/latest"
+    # The dot is escaped so that only the literal name "ncbi-taxdmp.zip" matches.
+    local asset_path_re='unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp\.zip'
+    local target="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"
+    local self_hosted_path
+
+    # Ask the GitHub API for the latest release and pull our asset's path out of
+    # the response. "|| true" keeps the fallback reachable under errexit/pipefail.
+    self_hosted_path=$(curl --silent --fail "$latest_release_url" | grep -E -o "$asset_path_re" | head -n 1) || true
+
+    if [ -n "$self_hosted_path" ]
+    then
+        TAXON_URL="https://github.com/$self_hosted_path"
+        # --fail: an HTTP error page must not be saved as if it were the archive.
+        if curl --fail --location --create-dirs --silent --output "$target" "$TAXON_URL"
+        then
+            return 0
+        fi
+    fi
+
+    TAXON_URL="$TAXON_FALLBACK_URL"
+    curl --fail --location --create-dirs --silent --output "$target" "$TAXON_URL"
+}
+
 create_taxon_tables() {
     log "Started creating the taxon tables."
     reportProgress -1 "Creating taxon tables." 1
 
-    curl --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL"
+    download_taxdmp
     unzip "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "names.dmp" "nodes.dmp" -d "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
     rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"
 