diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..11aac1c --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,70 @@ +on: + pull_request: + +permissions: + pull-requests: write + checks: write + +env: + CACHE_KEY: CI + +jobs: + dependencies: + name: Install dependencies + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + id: cache + with: + shared-key: ${{ env.CACHE_KEY }} + - run: cd scripts/helper_scripts/unipept-database-rs && cargo fetch + if: ${{ !steps.cache.outputs.cache-hit }} + + build: + name: Build binaries + runs-on: ubuntu-latest + needs: [dependencies] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + with: + shared-key: ${{ env.CACHE_KEY }} + - run: cd scripts/helper_scripts/unipept-database-rs && cargo build --release + + format: + name: Check formatting + runs-on: ubuntu-latest + needs: [dependencies] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - uses: Swatinem/rust-cache@v2 + with: + shared-key: ${{ env.CACHE_KEY }} + - run: cd scripts/helper_scripts/unipept-database-rs && cargo fmt --all --check + + lint: + name: Linting + runs-on: ubuntu-latest + needs: [dependencies] + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - uses: Swatinem/rust-cache@v2 + with: + shared-key: ${{ env.CACHE_KEY }} + - uses: giraffate/clippy-action@v1 + with: + clippy_flags: -- -D warnings + reporter: 'github-pr-check' + fail_on_error: true + github_token: ${{ secrets.GITHUB_TOKEN }} + workdir: scripts/helper_scripts/unipept-database-rs + diff --git a/.gitignore b/.gitignore index 2193525..01de547 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,5 @@ data/ out scripts/helper_scripts/parser/output scripts/helper_scripts/parser/src/META-INF +.idea/ + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5b9a9c3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2023 Universiteit Gent + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/scripts/helper_scripts/unipept-database-rs/.gitignore b/scripts/helper_scripts/unipept-database-rs/.gitignore new file mode 100644 index 0000000..1946e26 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/.gitignore @@ -0,0 +1,14 @@ +### Rust template +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +# RustRover, CLion and IntelliJ +.idea/ diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.lock b/scripts/helper_scripts/unipept-database-rs/Cargo.lock new file mode 100644 index 0000000..af2c026 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.lock @@ -0,0 +1,518 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317b9a89c1868f5ea6ff1d9539a69f45dffc21ce321ac1fd1160dfa48c8e2140" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets", +] + +[[package]] +name = "clap" +version = "4.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d04704f56c2cde07f43e8e2c154b43f216dc5c92fc98ada720177362f953b956" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e231faeaca65ebd1ea3c737966bf858971cd38c3849107aa3ea7de90a804e45" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "core-foundation-sys" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "js-sys" +version = "0.3.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.148" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "num-traits" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "proc-macro2" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "2.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unipept_database" +version = "0.1.0" +dependencies = [ + "anyhow", + "bit-vec", + "chrono", + "clap", + "smartstring", + "uniprot", +] + +[[package]] +name = "uniprot" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f630b7076a88cb10cbc0bd3f118dc9995eae2668e704c751644bff70fbed8a" +dependencies = [ + "chrono", + "crossbeam-channel", + "lazy_static", + "memchr", + "num_cpus", + "quick-xml", + "smartstring", +] + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" diff --git a/scripts/helper_scripts/unipept-database-rs/Cargo.toml b/scripts/helper_scripts/unipept-database-rs/Cargo.toml new file mode 100644 index 0000000..07f6cfd --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "unipept_database" +version = "0.1.0" +edition = "2021" +authors = ["unipept@ugent.be", "Stijn De Clercq", "Bart Mesuere", "Pieter Verschaffelt", "Tibo Vande Moortele"] +categories = ["command-line-utilities", "parser-implementations", "science"] +keywords = ["uniprot", "unipept", "metaproteomics"] +readme = "README.md" +repository = "https://github.com/unipept/unipept-database" +homepage = "https://unipept.ugent.be/" +license = "MIT" + +[dependencies] +anyhow = "1.0.75" +bit-vec = "0.6.3" +chrono = "0.4.31" +clap = { version = "4.4.6", features = ["derive"] } +smartstring = { version = "1.0" } +uniprot = "0.7.0" diff --git a/scripts/helper_scripts/unipept-database-rs/README.md b/scripts/helper_scripts/unipept-database-rs/README.md new file mode 100644 index 0000000..52cb34d --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/README.md @@ -0,0 +1,23 @@ +# unipept-database-rs + +This is a Rust package that implements custom tools for the Unipept database construction pipeline. + +The main tools are located in [`/src/bin`](./src/bin). They can all be built in one go using: + +```shell +cargo build --release +``` + +or, individually: + +```shell +cargo build --release --bin +``` + +## Tools + +| Name | Description | +|-----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| [`xml-parser`](./src/bin/xml-parser.rs) | Parser for the UniProtKB XML files from [Uniprot](https://www.uniprot.org/help/downloads). | +| [`functional-analysis`](./src/bin/functional-analysis.rs) | Counts and combines functional annotations of all lines that start with the same sequence ID, and summarises this in a JSON-object. | +| [`taxons-uniprots-tables`](./src/bin/taxons-uniprots-tables.rs) | Parse the Uniprot TSV-file into TSV tables. | diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/functional-analysis.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/functional-analysis.rs new file mode 100644 index 0000000..afa3817 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/functional-analysis.rs @@ -0,0 +1,140 @@ +use anyhow::{Context, Result}; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufRead, BufWriter, Write}; +use std::path::PathBuf; + +use clap::Parser; + +use unipept_database::utils::files::{open_read, open_write}; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let reader = open_read(&args.input_file)?; + let mut writer = open_write(&args.output_file)?; + + let mut current_pept: String = String::new(); + + let mut num_prot: u32 = 0; + let mut num_annotated_go: u32 = 0; + let mut num_annotated_ec: u32 = 0; + let mut num_annotated_ip: u32 = 0; + let mut done: u64 = 0; + + let mut m: HashMap = HashMap::new(); + + for line in reader.lines() { + let line = line.context("Error reading input file")?; + let row: Vec<&str> = line.split('\t').collect(); + if row[0] != current_pept { + if !current_pept.is_empty() && !m.is_empty() { + write_entry( + &mut writer, + current_pept, + num_prot, + num_annotated_go, + num_annotated_ec, + num_annotated_ip, + &m, + )?; + } + + m.clear(); + num_prot = 0; + num_annotated_go = 0; + num_annotated_ec = 0; + num_annotated_ip = 0; + current_pept = row[0].to_string(); + } + + num_prot += 1; + + if row.len() > 1 { + let terms = row[1].split(';').map(String::from); + let mut has_ec = false; + let mut has_go = false; + let mut has_ip = false; + + for term in terms { + if term.is_empty() { + continue; + } + + if term.starts_with('G') { + has_go = true; + } else if term.starts_with('E') { + has_ec = true; + } else { + has_ip = true; + } + + *m.entry(term).or_insert(0) += 1; + } + + if has_go { + num_annotated_go += 1 + }; + if has_ec { + num_annotated_ec += 1 + }; + if has_ip { + num_annotated_ip += 1 + }; + } + + done += 1; + + if done % 1000000 == 0 { + println!("FA {} rows", done); + } + } + + if !m.is_empty() { + write_entry( + &mut writer, + current_pept, + num_prot, + num_annotated_go, + num_annotated_ec, + num_annotated_ip, + &m, + )?; + } + + Ok(()) +} + +fn write_entry( + writer: &mut BufWriter, + current_peptide: String, + num_prot: u32, + num_go: u32, + num_ec: u32, + num_ip: u32, + m: &HashMap, +) -> Result<()> { + let data = m + .iter() + .map(|(key, value)| format!(r#""{key}":{value}"#)) + .collect::>() + .join(","); + + let format_string = format!( + "{current_peptide}\t{{\"num\":{{\"all\":{num_prot},\"EC\":{num_ec},\"GO\":{num_go},\"IPR\":{num_ip},\"data\":{{{data}}}}}}}\n" + ); + + writer + .write_all(format_string.as_bytes()) + .context("Error writing to output file")?; + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(short, long)] + input_file: PathBuf, + #[clap(short, long)] + output_file: PathBuf, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-uniprots-tables.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-uniprots-tables.rs new file mode 100644 index 0000000..335af5d --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxons-uniprots-tables.rs @@ -0,0 +1,68 @@ +use anyhow::{Context, Result}; +use clap::Parser; +use std::path::PathBuf; +use unipept_database::taxons_uniprots_tables::tab_parser::TabParser; +use unipept_database::taxons_uniprots_tables::table_writer::TableWriter; + +fn main() -> Result<()> { + let args = Cli::parse(); + let mut writer = TableWriter::new( + &args.taxons, + &args.peptides, + &args.uniprot_entries, + &args.go, + &args.ec, + &args.interpro, + ) + .context("Unable to instantiate TableWriter")?; + + let parser = TabParser::new(args.peptide_min, args.peptide_max, args.verbose) + .context("Unable to instantiate TabParser")?; + + for entry in parser { + writer + .store(entry.context("Error getting entry from TabParser")?) + .context("Error storing entry")?; + } + + Ok(()) +} + +#[derive(Parser, Debug)] +pub struct Cli { + /// Minimum peptide length + #[clap(long)] + peptide_min: u32, + + /// Maximum peptide length + #[clap(long)] + peptide_max: u32, + + /// Taxons TSV input file + #[clap(long)] + taxons: PathBuf, + + /// Peptides TSV output file + #[clap(long)] + peptides: PathBuf, + + /// Uniprot entries TSV output file + #[clap(long)] + uniprot_entries: PathBuf, + + /// EC references TSV output file + #[clap(long)] + ec: PathBuf, + + /// GO references TSV output file + #[clap(long)] + go: PathBuf, + + /// InterPro references TSV output file + #[clap(long)] + interpro: PathBuf, + + /// Enable verbose mode + #[clap(short, long, default_value_t = false)] + verbose: bool, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs new file mode 100644 index 0000000..985b9ed --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs @@ -0,0 +1,193 @@ +use std::io::{BufReader, Stdin}; +use std::num::NonZeroUsize; + +use anyhow::{Context, Result}; +use clap::Parser; +use smartstring::{LazyCompact, SmartString}; +use uniprot::uniprot::{SequentialParser, ThreadedParser}; + +use unipept_database::utils::files::open_sin; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let reader = open_sin(); + + write_header(); + + // Create a different parser based on the amount of threads requested + match args.threads { + 1 => { + for r in SequentialParser::new(reader) { + let entry = r.context("Error reading UniProt entry from SequentialParser")?; + write_entry(&entry, &args.uniprot_type, args.verbose); + } + } + n => { + let parser: ThreadedParser> = if n == 0 { + ThreadedParser::new(reader) + } else { + ThreadedParser::with_threads( + reader, + NonZeroUsize::new(n as usize) + .context("Error parsing number of threads as usize")?, + ) + }; + + for r in parser { + let entry = r.context("Error reading UniProt entry from ThreadedParser")?; + write_entry(&entry, &args.uniprot_type, args.verbose); + } + } + } + + Ok(()) +} + +type SmartStr = SmartString; + +#[derive(clap::ValueEnum, Clone, Debug)] +enum UniprotType { + Swissprot, + Trembl, +} + +impl UniprotType { + pub fn to_str(&self) -> &str { + match self { + UniprotType::Swissprot => "swissprot", + UniprotType::Trembl => "trembl", + } + } +} + +// Parse a Uniprot XML file and convert it into a TSV-file +#[derive(Parser, Debug)] +struct Cli { + #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)] + uniprot_type: UniprotType, + #[clap(long, default_value_t = 0)] + threads: u32, + #[clap(short, long, default_value_t = false)] + verbose: bool, +} + +/// Write the header line to stdout +fn write_header() { + let fields: [&str; 9] = [ + "Entry", + "Sequence", + "Protein names", + "Version (entry)", + "EC number", + "Gene ontology IDs", + "Cross-reference (InterPro)", + "Status", + "Organism ID", + ]; + + let result_string = fields.join("\t"); + println!("{}", result_string); +} + +/// Resolve the name of a single entry +fn parse_name(entry: &uniprot::uniprot::Entry) -> SmartStr { + let mut submitted_name: SmartStr = SmartStr::new(); + + // Check the last "recommended" name from a protein's components, + // otherwise store the last "submitted" name of these components for later + for component in entry.protein.components.iter().rev() { + if let Some(n) = &component.recommended { + return n.full.clone(); + } + + if submitted_name.is_empty() { + if let Some(n) = component.submitted.last() { + submitted_name = n.full.clone(); + } + } + } + + // Do the same thing for the domains + for domain in entry.protein.domains.iter().rev() { + if let Some(n) = &domain.recommended { + return n.full.clone(); + } + + if submitted_name.is_empty() { + if let Some(n) = domain.submitted.last() { + submitted_name = n.full.clone(); + } + } + } + + // First check the protein's own recommended name, + // otherwise return the submitted name from above if there was one, + // otherwise the last submitted name from the protein itself + if let Some(n) = &entry.protein.name.recommended { + n.full.clone() + } else if !submitted_name.is_empty() { + submitted_name + } else if let Some(n) = entry.protein.name.submitted.last() { + n.full.clone() + } else { + eprintln!("Could not find a name for entry {}", entry.accessions[0]); + SmartStr::new() + } +} + +/// Write a single UniProt entry to stdout +fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: bool) { + let accession_number: SmartStr = entry.accessions[0].clone(); + let sequence: SmartStr = entry.sequence.value.clone(); + + let name: SmartStr = parse_name(entry); + + let version: SmartStr = SmartStr::from(entry.version.to_string()); + + let mut ec_references: Vec<&str> = Vec::new(); + let mut go_references: Vec<&str> = Vec::new(); + let mut ip_references: Vec<&str> = Vec::new(); + let mut taxon_id: SmartStr = SmartStr::new(); + + // Find the taxon id in the organism + for reference in &entry.organism.db_references { + if reference.ty == "NCBI Taxonomy" { + taxon_id = reference.id.clone(); + } + } + + // Find the EC, GO and InterPro references in the entry itself + for reference in &entry.db_references { + let vector: Option<&mut Vec<&str>> = match reference.ty.as_str() { + "EC" => Some(&mut ec_references), + "GO" => Some(&mut go_references), + "InterPro" => Some(&mut ip_references), + _ => None, + }; + + if let Some(v) = vector { + v.push(&reference.id); + } + } + + let fields: [SmartStr; 9] = [ + accession_number, + sequence, + name, + version, + SmartStr::from(ec_references.join(";")), + SmartStr::from(go_references.join(";")), + SmartStr::from(ip_references.join(";")), + SmartStr::from(db_type.to_str()), + taxon_id, + ]; + + let line = fields.join("\t"); + + if verbose { + eprintln!("INFO VERBOSE: Writing tabular line: {}", line); + } + + println!("{}", line); +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs new file mode 100644 index 0000000..262d619 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs @@ -0,0 +1,2 @@ +pub mod taxons_uniprots_tables; +pub mod utils; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/mod.rs new file mode 100644 index 0000000..f3d84d0 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/mod.rs @@ -0,0 +1,5 @@ +pub mod models; +pub mod tab_parser; +pub mod table_writer; +pub mod taxon_list; +pub mod utils; diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs new file mode 100644 index 0000000..a0c7021 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/models.rs @@ -0,0 +1,181 @@ +use std::str::FromStr; + +use anyhow::{Context, Error, Result}; + +#[derive(Debug)] +pub struct Entry { + pub min_length: u32, + pub max_length: u32, + + // The "version" and "accession_number" fields are actually integers, but they are never used as such, + // so there is no use converting/parsing them + pub accession_number: String, + pub version: String, + pub taxon_id: i32, + + pub type_: String, + pub name: String, + pub sequence: String, + pub ec_references: Vec, + pub go_references: Vec, + pub ip_references: Vec, +} + +impl Entry { + #[allow(clippy::too_many_arguments)] + pub fn new( + min_length: u32, + max_length: u32, + type_: String, + accession_number: String, + sequence: String, + name: String, + version: String, + taxon_id: String, + ec_references: Vec, + go_references: Vec, + ip_references: Vec, + ) -> Result { + let parsed_id = taxon_id + .parse() + .with_context(|| format!("Failed to parse {} to i32", taxon_id))?; + + Ok(Entry { + min_length, + max_length, + + accession_number, + version, + taxon_id: parsed_id, + type_, + name, + sequence, + + ec_references, + go_references, + ip_references, + }) + } +} + +pub fn calculate_entry_digest( + sequence: &String, + min_length: usize, + max_length: usize, +) -> Vec<&[u8]> { + let mut result = Vec::new(); + + let mut start: usize = 0; + let length = sequence.len(); + let content = sequence.as_bytes(); + + for (i, c) in content.iter().enumerate() { + if (*c == b'K' || *c == b'R') && (i + 1 < length && content[i + 1] != b'P') { + if i + 1 - start >= min_length && i + 1 - start <= max_length { + result.push(&content[start..i + 1]); + } + + start = i + 1; + } + } + + // Add last one + if length - start >= min_length && length - start <= max_length { + result.push(&content[start..length]); + } + + result +} + +#[derive(Debug)] +pub enum Rank { + NoRank, + SuperKingdom, + Kingdom, + SubKingdom, + SuperPhylum, + Phylum, + SubPhylum, + SuperClass, + Class, + SubClass, + SuperOrder, + Order, + SubOrder, + InfraOrder, + SuperFamily, + Family, + SubFamily, + Tribe, + SubTribe, + Genus, + SubGenus, + SpeciesGroup, + SpeciesSubgroup, + Species, + SubSpecies, + Strain, + Varietas, + Forma, +} + +impl FromStr for Rank { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s.to_uppercase().replace(' ', "_").as_str() { + "CLASS" => Ok(Self::Class), + "FAMILY" => Ok(Self::Family), + "FORMA" => Ok(Self::Forma), + "GENUS" => Ok(Self::Genus), + "INFRAORDER" => Ok(Self::InfraOrder), + "KINGDOM" => Ok(Self::Kingdom), + "NO_RANK" => Ok(Self::NoRank), + "ORDER" => Ok(Self::Order), + "PHYLUM" => Ok(Self::Phylum), + "SPECIES" => Ok(Self::Species), + "SPECIES_GROUP" => Ok(Self::SpeciesGroup), + "SPECIES_SUBGROUP" => Ok(Self::SpeciesSubgroup), + "STRAIN" => Ok(Self::Strain), + "SUBCLASS" => Ok(Self::SubClass), + "SUBFAMILY" => Ok(Self::SubFamily), + "SUBGENUS" => Ok(Self::SubGenus), + "SUBKINGDOM" => Ok(Self::SubKingdom), + "SUBORDER" => Ok(Self::SubOrder), + "SUBPHYLUM" => Ok(Self::SubPhylum), + "SUBSPECIES" => Ok(Self::SubSpecies), + "SUBTRIBE" => Ok(Self::SubTribe), + "SUPERCLASS" => Ok(Self::SuperClass), + "SUPERFAMILY" => Ok(Self::SuperFamily), + "SUPERKINGDOM" => Ok(Self::SuperKingdom), + "SUPERORDER" => Ok(Self::SuperOrder), + "SUPERPHYLUM" => Ok(Self::SuperPhylum), + "TRIBE" => Ok(Self::Tribe), + "VARIETAS" => Ok(Self::Varietas), + _ => Err(Error::msg(format!( + "Value {} does not match any known ranks", + s + ))), + } + } +} + +#[allow(dead_code)] // The fields in this struct aren't used YET, but will be later on +#[derive(Debug)] +pub struct Taxon { + name: String, + rank: Rank, + parent: usize, + valid: bool, +} + +impl Taxon { + pub fn new(name: String, rank: Rank, parent: usize, valid: bool) -> Self { + Taxon { + name, + rank, + parent, + valid, + } + } +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/tab_parser.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/tab_parser.rs new file mode 100644 index 0000000..e4b2016 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/tab_parser.rs @@ -0,0 +1,96 @@ +use anyhow::{Context, Error, Result}; +use std::collections::HashMap; +use std::io::{BufRead, BufReader, Lines, Stdin}; + +use crate::taxons_uniprots_tables::models::Entry; +use crate::utils::files::open_sin; + +pub struct TabParser { + lines: Lines>, + header_map: HashMap, + min_length: u32, + max_length: u32, + verbose: bool, +} + +impl TabParser { + pub fn new(peptide_min: u32, peptide_max: u32, verbose: bool) -> Result { + // First read the header line + let reader = open_sin(); + let mut map = HashMap::new(); + + let mut lines = reader.lines(); + + let line = match lines.next() { + None => return Err(Error::msg("Missing header line")), + Some(s) => s.context("Unable to read header line")?, + }; + + for (i, l) in line.split('\t').enumerate() { + map.insert(l.trim().to_string(), i); + } + + Ok(TabParser { + lines, + header_map: map, + min_length: peptide_min, + max_length: peptide_max, + verbose, + }) + } +} + +impl Iterator for TabParser { + type Item = Result; + + fn next(&mut self) -> Option { + let line = self + .lines + .next()? + .context("Unable to read line from TSV file"); + + let line = match line { + Ok(s) => s, + Err(e) => { + return Some(Err(e)); + } + }; + + let fields: Vec<&str> = line.trim().split('\t').collect(); + + let ec_references: Vec = fields[self.header_map["EC number"]] + .split(';') + .map(|x| x.trim().to_string()) + .collect(); + let go_references: Vec = fields[self.header_map["Gene ontology IDs"]] + .split(';') + .map(|x| x.trim().to_string()) + .collect(); + let ip_references: Vec = fields[self.header_map["Cross-reference (InterPro)"]] + .split(';') + .map(|x| x.trim().to_string()) + .collect(); + + let entry = Entry::new( + self.min_length, + self.max_length, + fields[self.header_map["Status"]].trim().to_string(), + fields[self.header_map["Entry"]].trim().to_string(), + fields[self.header_map["Sequence"]].trim().to_string(), + fields[self.header_map["Protein names"]].trim().to_string(), + fields[self.header_map["Version (entry)"]] + .trim() + .to_string(), + fields[self.header_map["Organism ID"]].trim().to_string(), + ec_references, + go_references, + ip_references, + ); + + if self.verbose { + eprintln!("INFO VERBOSE: TSV line parsed: {}", line); + } + + Some(entry) + } +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs new file mode 100644 index 0000000..6b1076e --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs @@ -0,0 +1,216 @@ +use std::collections::HashSet; +use std::fs::File; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use bit_vec::BitVec; + +use crate::taxons_uniprots_tables::models::{calculate_entry_digest, Entry}; +use crate::taxons_uniprots_tables::taxon_list::parse_taxon_file_basic; +use crate::taxons_uniprots_tables::utils::now_str; +use crate::utils::files::open_write; + +/// Note: this is single-threaded +/// we attempted a parallel version that wrote to all files at the same time, +/// but this didn't achieve any speed increase, so we decided not to go forward with it +pub struct TableWriter { + taxons: BitVec, + wrong_ids: HashSet, + peptides: BufWriter, + uniprot_entries: BufWriter, + go_cross_references: BufWriter, + ec_cross_references: BufWriter, + ip_cross_references: BufWriter, + + peptide_count: i64, + uniprot_count: i64, + go_count: i64, + ec_count: i64, + ip_count: i64, +} + +impl TableWriter { + pub fn new( + taxons: &PathBuf, + peptides: &PathBuf, + uniprot_entries: &PathBuf, + go_references: &PathBuf, + ec_references: &PathBuf, + interpro_references: &PathBuf, + ) -> Result { + Ok(TableWriter { + taxons: parse_taxon_file_basic(taxons).context("Unable to parse taxonomy file")?, + wrong_ids: HashSet::new(), + peptides: open_write(peptides).context("Unable to open output file")?, + uniprot_entries: open_write(uniprot_entries).context("Unable to open output file")?, + go_cross_references: open_write(go_references).context("Unable to open output file")?, + ec_cross_references: open_write(ec_references).context("Unable to open output file")?, + ip_cross_references: open_write(interpro_references) + .context("Unable to open output file")?, + + peptide_count: 0, + uniprot_count: 0, + go_count: 0, + ec_count: 0, + ip_count: 0, + }) + } + + // Store a complete entry in the database + pub fn store(&mut self, entry: Entry) -> Result<()> { + let id = self + .write_uniprot_entry(&entry) + .context("Failed to write Uniprot entry")?; + + // Failed to add entry + if id == -1 { + return Ok(()); + } + + for r in &entry.go_references { + self.write_go_ref(r, id).context("Error writing GO ref")?; + } + + for r in &entry.ec_references { + self.write_ec_ref(r, id).context("Error writing EC ref")?; + } + + for r in &entry.ip_references { + self.write_ip_ref(r, id) + .context("Error writing Interpro ref")?; + } + + let go_ids = entry.go_references.into_iter(); + let ec_ids = entry + .ec_references + .iter() + .filter(|x| !x.is_empty()) + .map(|x| format!("EC:{}", x)); + let ip_ids = entry + .ip_references + .iter() + .filter(|x| !x.is_empty()) + .map(|x| format!("IPR:{}", x)); + + let summary = go_ids + .chain(ec_ids) + .chain(ip_ids) + .collect::>() + .join(";"); + + for sequence in calculate_entry_digest( + &entry.sequence, + entry.min_length as usize, + entry.max_length as usize, + ) { + self.write_peptide( + sequence + .iter() + .map(|&x| if x == b'I' { b'L' } else { x }) + .collect(), + id, + sequence, + &summary, + ) + .context("Failed to write peptide")?; + } + + Ok(()) + } + + fn write_peptide( + &mut self, + sequence: Vec, + id: i64, + original_sequence: &[u8], + annotations: &String, + ) -> Result<()> { + self.peptide_count += 1; + + writeln!( + &mut self.peptides, + "{}\t{:?}\t{:?}\t{}\t{}", + self.peptide_count, sequence, original_sequence, id, annotations + ) + .context("Error writing to TSV")?; + + Ok(()) + } + + // Store the entry info and return the generated id + fn write_uniprot_entry(&mut self, entry: &Entry) -> Result { + if 0 <= entry.taxon_id + && entry.taxon_id < self.taxons.len() as i32 + && self.taxons[entry.taxon_id as usize] + // This indexing is safe due to the line above + { + self.uniprot_count += 1; + + let accession_number = &entry.accession_number; + let version = entry.version.clone(); + let taxon_id = entry.taxon_id; + let type_ = entry.type_.clone(); + let name = entry.name.clone(); + let sequence = entry.sequence.clone(); + + writeln!( + &mut self.uniprot_entries, + "{}\t{}\t{}\t{}\t{}\t{}\t{}", + self.uniprot_count, accession_number, version, taxon_id, type_, name, sequence + ) + .context("Error writing to TSV")?; + + return Ok(self.uniprot_count); + } else if !self.wrong_ids.contains(&entry.taxon_id) { + self.wrong_ids.insert(entry.taxon_id); + eprintln!( + "[{}]\t{} added to the list of {} invalid taxonIds", + now_str(), + entry.taxon_id, + self.wrong_ids.len() + ); + } + + Ok(-1) + } + + fn write_go_ref(&mut self, ref_id: &String, uniprot_entry_id: i64) -> Result<()> { + self.go_count += 1; + + writeln!( + &mut self.go_cross_references, + "{}\t{}\t{}", + self.go_count, uniprot_entry_id, ref_id + ) + .context("Error writing to TSV")?; + + Ok(()) + } + + fn write_ec_ref(&mut self, ref_id: &String, uniprot_entry_id: i64) -> Result<()> { + self.ec_count += 1; + + writeln!( + &mut self.ec_cross_references, + "{}\t{}\t{}", + self.ec_count, uniprot_entry_id, ref_id + ) + .context("Error writing to TSV")?; + + Ok(()) + } + + fn write_ip_ref(&mut self, ref_id: &String, uniprot_entry_id: i64) -> Result<()> { + self.ip_count += 1; + + writeln!( + &mut self.ip_cross_references, + "{}\t{}\t{}", + self.ip_count, uniprot_entry_id, ref_id, + ) + .context("Error writing to TSV")?; + + Ok(()) + } +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs new file mode 100644 index 0000000..fb87681 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/taxon_list.rs @@ -0,0 +1,86 @@ +use crate::taxons_uniprots_tables::models::{Rank, Taxon}; +use anyhow::{Context, Result}; +use bit_vec::BitVec; +use std::io::BufRead; +use std::path::PathBuf; +use std::str::FromStr; + +use crate::utils::files::open_read; + +pub struct TaxonList { + entries: Vec>, +} + +impl TaxonList { + pub fn from_file(pb: &PathBuf) -> Result { + let mut entries = Vec::new(); + let reader = open_read(pb).context("Unable to open input file")?; + + for line in reader.lines() { + let line = line + .with_context(|| format!("Error reading line from input file {}", pb.display()))?; + let spl: Vec<&str> = line.split('\t').collect(); + let id: usize = spl[0] + .parse() + .with_context(|| format!("Unable to parse {} as usize", spl[0]))?; + let parent: usize = spl[3] + .parse() + .with_context(|| format!("Unable to parse {} as usize", spl[3]))?; + let valid = spl[4].trim() == "true"; + + let taxon = Taxon::new( + spl[1].to_string(), + Rank::from_str(spl[2]) + .with_context(|| format!("Unable to parse {} into Rank", spl[2]))?, + parent, + valid, + ); + + while entries.len() <= id { + entries.push(None); + } + + entries[id] = Some(taxon); + } + + Ok(TaxonList { entries }) + } + + pub fn get(&self, i: usize) -> &Option { + &self.entries[i] + } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + pub fn len(&self) -> usize { + self.entries.len() + } +} + +/// Parse a taxons TSV-file into a vector that can be accessed by id +/// The actual content of these Taxons is never used, so we don't try to parse a struct +pub fn parse_taxon_file_basic(pb: &PathBuf) -> Result { + let mut entries = BitVec::new(); + let reader = open_read(pb).context("Unable to open taxon input file")?; + + for line in reader.lines() { + let line = line.context("Error reading line from taxon file")?; + let spl = line + .split_once('\t') + .context("Unable to split taxon file on tabs")?; + let id: usize = spl + .0 + .parse() + .with_context(|| format!("Unable to parse {} as usize", spl.0))?; + + if entries.len() <= id { + entries.grow(id - entries.len() + 1, false) + } + + entries.set(id, true); + } + + Ok(entries) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/utils.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/utils.rs new file mode 100644 index 0000000..420801d --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/utils.rs @@ -0,0 +1,16 @@ +use std::time::{SystemTime, UNIX_EPOCH}; + +use chrono::{DateTime, Utc}; + +pub fn now() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Error getting system time") + .as_millis() +} + +pub fn now_str() -> String { + let n = now(); + let dt: DateTime = SystemTime::now().into(); + format!("{} ({})", n, dt.format("%+")) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/utils/files.rs b/scripts/helper_scripts/unipept-database-rs/src/utils/files.rs new file mode 100644 index 0000000..1eb18cb --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/utils/files.rs @@ -0,0 +1,27 @@ +use anyhow::{Context, Result}; +use std::fs::{File, OpenOptions}; +use std::io::{stdin, BufReader, BufWriter, Stdin}; +use std::path::PathBuf; + +/// Create a BufReader that reads from StdIn +pub fn open_sin() -> BufReader { + BufReader::new(stdin()) +} + +/// Create a BufReader that reads from a file denoted by its PathBuf +pub fn open_read(pb: &PathBuf) -> Result> { + let file = OpenOptions::new() + .read(true) + .open(pb) + .with_context(|| format!("Failed to open file \"{}\" for reading", pb.display()))?; + Ok(BufReader::new(file)) +} + +/// Create a BufWriter that writes to a file denoted by its PathBuf +pub fn open_write(pb: &PathBuf) -> Result> { + let file = OpenOptions::new() + .write(true) + .open(pb) + .with_context(|| format!("Failed to open file \"{}\" for writing", pb.display()))?; + Ok(BufWriter::new(file)) +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/utils/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/utils/mod.rs new file mode 100644 index 0000000..d3ab969 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/utils/mod.rs @@ -0,0 +1 @@ +pub mod files; diff --git a/workflows/static_database/version.txt b/workflows/static_database/version.txt index 7dca4b1..a788ffb 100644 --- a/workflows/static_database/version.txt +++ b/workflows/static_database/version.txt @@ -1 +1 @@ -2023-10-01 +2023-11-01