Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download NCBI Taxdmp file in CI #41

Merged
merged 9 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions .github/workflows/static_database.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ jobs:
- name: Install required utilities
run: |
sudo apt-get update
sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs
sudo apt-get -y install git maven curl unzip gawk sqlite3 libsqlite3-dev pv nodejs wget
- name: Download Taxdmp file
shell: bash
run: wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip
- name: Generate tsv.gz files
shell: bash
run: ./scripts/build_database.sh static-database "swissprot,trembl" "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz,https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz" "output"
Expand Down Expand Up @@ -60,8 +63,8 @@ jobs:
release_name: Static database ${{ steps.date.outputs.date }}
draft: false
prerelease: false
- name: Upload Release Asset
id: upload-release-asset
- name: Upload Static Database Release Asset
id: upload-database-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand All @@ -70,3 +73,13 @@ jobs:
asset_path: ./output.zip
asset_name: unipept-static-db-${{ steps.date.outputs.date }}.zip
asset_content_type: application/zip
- name: Upload NCBI Taxdmp Release Asset
id: upload-taxdmp-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./taxdmp.zip
asset_name: ncbi-taxdmp.zip
asset_content_type: application/zip
20 changes: 18 additions & 2 deletions scripts/build_database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command sho
CMD_GZIP="gzip -" # Which pipe compression command should I use?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?

TAXON_URL="https://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
EC_CLASS_URL="https://ftp.expasy.org/databases/enzyme/enzclass.txt"
EC_NUMBER_URL="https://ftp.expasy.org/databases/enzyme/enzyme.dat"
GO_TERM_URL="http://geneontology.org/ontology/go-basic.obo"
Expand Down Expand Up @@ -315,11 +315,27 @@ have() {

### All the different database construction steps.

#######################################
# Download the NCBI taxonomy dump (taxdmp.zip) into the temporary work dir.
# The copy attached to the latest unipept-database GitHub release is preferred;
# when no such asset can be located, the NCBI server is used instead.
# Globals:
#   TAXON_FALLBACK_URL     (read)    URL used when no release asset is found
#   TEMP_DIR               (read)    base directory for temporary files
#   UNIPEPT_TEMP_CONSTANT  (read)    sub-directory inside TEMP_DIR
#   TAXON_URL              (written) URL the archive was fetched from
# Arguments:
#   None
# Returns:
#   0 on success, curl's non-zero exit status when the download fails.
#######################################
download_taxdmp() {
    local latest_release_url="https://api.github.com/repos/unipept/unipept-database/releases/latest"
    # The dot before "zip" is escaped so that only the exact asset name matches.
    local asset_re='unipept/unipept-database/releases/download/[^/]+/ncbi-taxdmp\.zip'
    local self_hosted_path

    # Check if our self-hosted version is available or not using the GitHub API.
    # Only the first match is kept so the URL never contains embedded newlines.
    # "|| true": an unreachable API or a response without the asset must lead to
    # the fallback below, not to an abort when the caller runs with "set -e".
    self_hosted_path=$(curl -s "$latest_release_url" | grep -E -o "$asset_re" | head -n 1) || true

    if [ -n "$self_hosted_path" ]
    then
        TAXON_URL="https://github.com/$self_hosted_path"
    else
        TAXON_URL="$TAXON_FALLBACK_URL"
    fi

    # -L follows the redirect GitHub issues for release assets.
    # --fail makes curl exit non-zero on an HTTP error instead of silently
    # storing the error page as taxdmp.zip.
    curl -L --fail --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL"
}

create_taxon_tables() {
log "Started creating the taxon tables."
reportProgress -1 "Creating taxon tables." 1

curl --create-dirs --silent --output "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "$TAXON_URL"
download_taxdmp
unzip "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip" "names.dmp" "nodes.dmp" -d "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT"
rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/taxdmp.zip"

Expand Down
Loading