From da911a84ed3ab6fd1744b217415ec00657a2c307 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:08:18 +0100 Subject: [PATCH 1/9] Download taxdmp --- .github/workflows/static_database.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 84cde2f..fce9d08 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -23,6 +23,9 @@ jobs: run: | sudo apt-get update sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs + - name: Download Taxdmp file + shell: bash + run: curl https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip - name: Generate tsv.gz files shell: bash run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output" @@ -60,7 +63,7 @@ jobs: release_name: Static database ${{ steps.date.outputs.date }} draft: false prerelease: false - - name: Upload Release Asset + - name: Upload Static Database Release Asset id: upload-release-asset uses: actions/upload-release-asset@v1 env: @@ -70,3 +73,13 @@ jobs: asset_path: ./output.zip asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip asset_content_type: application/zip + - name: Upload UniProt Taxdmp Release Asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ./taxdmp.zip + asset_name: uniprot-taxdmp.zip + asset_content_type: application/zip From a007a50bced1071ca45ebedbcb01cda88e131a2f Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:09:39 +0100 Subject: [PATCH 2/9] Make id unique --- .github/workflows/static_database.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index fce9d08..009623b 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -64,7 +64,7 @@ jobs: draft: false prerelease: false - name: Upload Static Database Release Asset - id: upload-release-asset + id: upload-database-release-asset uses: actions/upload-release-asset@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -74,7 +74,7 @@ jobs: asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip asset_content_type: application/zip - name: Upload UniProt Taxdmp Release Asset - id: upload-release-asset + id: upload-taxdmp-release-asset uses: actions/upload-release-asset@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From d2cf8128d8f100af00d55a12f8d2890e8ad9924d Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:12:17 +0100 Subject: [PATCH 3/9] Use wget --- .github/workflows/static_database.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 009623b..ddf84d2 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -22,10 +22,10 @@ jobs: - name: Install required utilities run: | sudo apt-get update - sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs + sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget - name: Download Taxdmp file shell: bash - run: curl https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip + run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip - name: Generate tsv.gz files shell: bash run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output" From f2c4d38d8bcb21d51232db3a737896f1dcbc70a6 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:17:14 +0100 Subject: [PATCH 4/9] Correctly name action --- .github/workflows/static_database.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index ddf84d2..5ca1d87 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -73,7 +73,7 @@ jobs: asset_path: ./output.zip asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip asset_content_type: application/zip - - name: Upload UniProt Taxdmp Release Asset + - name: Upload ncbi Taxdmp Release Asset id: upload-taxdmp-release-asset uses: actions/upload-release-asset@v1 env: @@ -81,5 +81,5 @@ jobs: with: upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./taxdmp.zip - asset_name: uniprot-taxdmp.zip + asset_name: ncbi-taxdmp.zip asset_content_type: application/zip From bc6b30555d0c01955c1a083dec3008c860e607ed Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:20:46 +0100 Subject: [PATCH 5/9] Create separate job --- .github/workflows/static_database.yml | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 5ca1d87..44ae095 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 0 1 * *' + - cron: '0 0 1 * *' jobs: generate_static_database: @@ -22,10 +22,7 @@ jobs: - name: Install required utilities run: | sudo apt-get update - sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget - - name: Download Taxdmp file - shell: bash - run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip + sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs - name: Generate tsv.gz files shell: bash run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output" @@ -73,13 +70,26 @@ jobs: asset_path: ./output.zip asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip asset_content_type: application/zip - - name: Upload ncbi Taxdmp Release Asset + + download_ncbi_taxdmp: + runs-on: ubuntu-latest + needs: [ generate_static_database ] + steps: + - uses: actions/checkout@v2 + - name: Install Wget + run: | + sudo apt-get update + sudo apt-get -y install wget + - name: Download Taxdmp file + shell: bash + run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip + - name: Upload NCBI Taxdmp Release Asset id: upload-taxdmp-release-asset uses: actions/upload-release-asset@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - upload_url: ${{ steps.create_release.outputs.upload_url }} + upload_url: ${{ jobs.generate_static_database.steps.create_release.outputs.upload_url }} asset_path: ./taxdmp.zip asset_name: ncbi-taxdmp.zip - asset_content_type: application/zip + asset_content_type: application/zip \ No newline at end of file From 006087f1abc2bef9e8c4e66642a4a0b3fb670453 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:21:42 +0100 Subject: [PATCH 6/9] Name correctly --- .github/workflows/static_database.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 44ae095..e00c21d 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -89,7 +89,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - upload_url: ${{ jobs.generate_static_database.steps.create_release.outputs.upload_url }} + upload_url: ${{ needs.generate_static_database.steps.create_release.outputs.upload_url }} asset_path: ./taxdmp.zip asset_name: ncbi-taxdmp.zip asset_content_type: application/zip \ No newline at end of file From 950ce73b608c2117773b881005f8b254928a3718 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:21:54 +0100 Subject: [PATCH 7/9] Ending newline --- .github/workflows/static_database.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index e00c21d..d9da5db 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -92,4 +92,4 @@ jobs: upload_url: ${{ needs.generate_static_database.steps.create_release.outputs.upload_url }} asset_path: ./taxdmp.zip asset_name: ncbi-taxdmp.zip - asset_content_type: application/zip \ No newline at end of file + asset_content_type: application/zip From 3c3a5a5b6ba10c66cc62e80556b1e922f5d3eae5 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Mon, 26 Feb 2024 16:30:22 +0100 Subject: [PATCH 8/9] Undo separation --- .github/workflows/static_database.yml | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index d9da5db..634e884 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 0 1 * *' + - cron: '0 0 1 * *' jobs: generate_static_database: @@ -22,7 +22,10 @@ jobs: - name: Install required utilities run: | sudo apt-get update - sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs + sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget + - name: Download Taxdmp file + shell: bash + run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip - name: Generate tsv.gz files shell: bash run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output" @@ -70,26 +73,13 @@ jobs: asset_path: ./output.zip asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip asset_content_type: application/zip - - download_ncbi_taxdmp: - runs-on: ubuntu-latest - needs: [ generate_static_database ] - steps: - - uses: actions/checkout@v2 - - name: Install Wget - run: | - sudo apt-get update - sudo apt-get -y install wget - - name: Download Taxdmp file - shell: bash - run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip - name: Upload NCBI Taxdmp Release Asset id: upload-taxdmp-release-asset uses: actions/upload-release-asset@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - upload_url: ${{ needs.generate_static_database.steps.create_release.outputs.upload_url }} + upload_url: ${{ steps.create_release.outputs.upload_url }} asset_path: ./taxdmp.zip asset_name: ncbi-taxdmp.zip asset_content_type: application/zip From 898182f479bdc46482111118d1eb043f2d1c42c9 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Fri, 8 Mar 2024 12:12:01 +0100 Subject: [PATCH 9/9] Update Taxdmp URL, use self-hosted if available --- .github/workflows/static_database.yml | 2 +- scripts/build_database.sh | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/static_database.yml b/.github/workflows/static_database.yml index 634e884..7d76be0 100644 --- a/.github/workflows/static_database.yml +++ b/.github/workflows/static_database.yml @@ -25,7 +25,7 @@ jobs: sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget - name: Download Taxdmp file shell: bash - run: wget https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip + run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip - name: Generate tsv.gz files shell: bash run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output" diff --git a/scripts/build_database.sh b/scripts/build_database.sh index de2936f..a44d13a 100755 --- a/scripts/build_database.sh +++ b/scripts/build_database.sh @@ -263,7 +263,7 @@ CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command sho CMD_GZIP="gzip -" # Which pipe compression command should I use? ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez? -TAXON_URL="https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip" +TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt" EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat" GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo" @@ -315,11 +315,27 @@ have() { ### All the different database construction steps. +download_taxdmp() { + # Check if our self-hosted version is available or not using the GitHub API + LATEST_RELEASE_URL="https://api.github.com/repos/unipept/unipept-database/releases/latest" + TAXDMP_RELEASE_ASSET_RE="unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp.zip" + SELF_HOSTED_URL=$(curl -s "$LATEST_RELEASE_URL" | egrep -o "$TAXDMP_RELEASE_ASSET_RE") + + if [ "$SELF_HOSTED_URL" ] + then + TAXON_URL="https://github.com/$SELF_HOSTED_URL" + else + TAXON_URL="$TAXON_FALLBACK_URL" + fi + + curl -L --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL" +} + create_taxon_tables() { log "Started creating the taxon tables." reportProgress -1 "Creating taxon tables." 1 - curl --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL" + download_taxdmp unzip "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "names.dmp" "nodes.dmp" -d "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"