From 5849c928e8f7aeec85eee4be5818dc42b3885154 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 18 Dec 2024 17:19:18 -0800 Subject: [PATCH] rewrite sccache-dist to use AMQP --- .github/actions/artifact_failure/action.yml | 21 +- .github/workflows/ci.yml | 13 +- .github/workflows/integration-tests.yml | 30 +- Cargo.lock | 860 ++++++++- Cargo.toml | 34 +- scripts/freebsd-ci-test.sh | 97 +- src/bin/sccache-dist/build.rs | 1205 ++++-------- src/bin/sccache-dist/build_freebsd.rs | 441 +++-- src/bin/sccache-dist/cmdline/mod.rs | 14 +- src/bin/sccache-dist/cmdline/parse.rs | 249 +-- src/bin/sccache-dist/main.rs | 1271 +++++-------- src/bin/sccache-dist/token_check.rs | 4 +- src/cache/cache.rs | 90 +- src/cache/disk.rs | 63 +- src/cache/readonly.rs | 21 +- src/commands.rs | 7 +- src/compiler/compiler.rs | 322 ++-- src/config.rs | 370 +++- src/dist/cache.rs | 162 +- src/dist/client_auth.rs | 2 +- src/dist/http.rs | 1906 ++++++------------- src/dist/mod.rs | 396 ++-- src/dist/server.rs | 563 ++++++ src/lib.rs | 2 +- src/lru_disk_cache/mod.rs | 24 - src/server.rs | 24 +- src/test/mock_storage.rs | 31 +- src/util.rs | 49 +- tests/dist.rs | 321 ++-- tests/harness/mod.rs | 401 ++-- tests/sccache_args.rs | 10 +- 31 files changed, 4576 insertions(+), 4427 deletions(-) create mode 100644 src/dist/server.rs diff --git a/.github/actions/artifact_failure/action.yml b/.github/actions/artifact_failure/action.yml index 071fe0917a..6c487d16a4 100644 --- a/.github/actions/artifact_failure/action.yml +++ b/.github/actions/artifact_failure/action.yml @@ -4,25 +4,42 @@ inputs: name: description: "" required: true + tar: + default: tar + shell: + default: bash --noprofile --norc -eo pipefail runs: using: "composite" steps: - name: pack failure artifacts - shell: bash + shell: ${{ inputs.shell }} {0} run: | + set -x; echo "Current running processes" ps uax echo "Processes that access current dir" lsof +D `pwd` || true killall sccache || true killall sccache-dist || true + # possible temp dirs for either linux or windows cp "${TMP:-${TEMP:-${TMPDIR:-/tmp}}}"/sccache_*.txt . 2>/dev/null || true - tar --exclude='target' \ + pwd + ls -l . + echo "${TMP:-${TEMP:-${TMPDIR:-/tmp}}}"/ + ls -l "${TMP:-${TEMP:-${TMPDIR:-/tmp}}}"/ + if test -n "${GITHUB_WORKSPACE-}"; then + cd "$GITHUB_WORKSPACE" + pwd + fi + ls -l . + ${{ inputs.tar }} \ + --exclude='target' \ --exclude='docs' \ --exclude='bins' \ --exclude='.git' \ -zcf target/failure-${{ inputs.name }}.tar.gz . 
+ - uses: actions/upload-artifact@v3 with: name: ${{ inputs.name }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 640289e4da..2286f5e681 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -342,13 +342,8 @@ jobs: test_freebsd: name: test freebsd-14.1 rust stable - runs-on: ${{ matrix.job.os }} + runs-on: ubuntu-22.04 timeout-minutes: 70 - strategy: - fail-fast: false - matrix: - job: - - { os: ubuntu-22.04 } steps: - uses: actions/checkout@v4 - name: Prepare, build and test @@ -357,8 +352,8 @@ jobs: mem: 8192 usesh: true sync: rsync - copyback: false - prepare: pkg install -y ca_root_nss curl gmake gtar pot sudo + copyback: true + prepare: pkg install -y ca_root_nss curl gmake gtar pot sudo rabbitmq run: | ##################################################################################### ### Prepare, build, and test @@ -400,6 +395,8 @@ jobs: if: failure() uses: ./.github/actions/artifact_failure with: + tar: gtar + shell: freebsd name: test-freebsd-14.1-stable release: diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index d57794f12b..f5b008ce82 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -8,7 +8,7 @@ env: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Clone repository uses: actions/checkout@v4 @@ -28,7 +28,7 @@ jobs: path: ./target/debug/sccache redis-deprecated: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build services: @@ -76,7 +76,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" redis: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build services: @@ -124,7 +124,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" s3_minio: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build # Setup minio server @@ -184,7 +184,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" azblob_azurite: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build # Setup azurite server @@ -240,7 +240,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" gha: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -289,7 +289,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" memcached-deprecated: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build # Setup memcached server @@ -344,7 +344,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" memcached: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build # Setup memcached server @@ -399,7 +399,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" webdav: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -524,7 +524,7 @@ jobs: run: cat "$SCCACHE_ERROR_LOG" clang: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -647,7 +647,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" gcc: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -692,7 +692,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" autotools: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -748,7 +748,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" cmake: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ -806,7 +806,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" rust-test-coverage: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: @@ 
-847,7 +847,7 @@ jobs: ${SCCACHE_PATH} --show-stats | grep -e "Cache hits\s*[1-9]" zstd-compression-level: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: build env: diff --git a/Cargo.lock b/Cargo.lock index 3ec71483e0..7c83bb5c5a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,54 @@ dependencies = [ "memchr", ] +[[package]] +name = "amq-protocol" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a41c091e49edfcc098b4f90d4d7706a8cf9158034e84ebfee7ff346092f67c" +dependencies = [ + "amq-protocol-tcp", + "amq-protocol-types", + "amq-protocol-uri", + "cookie-factory", + "nom", + "serde", +] + +[[package]] +name = "amq-protocol-tcp" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed7a4a662472f88823ed2fc81babb0b00562f2c54284e3e7bffc02b6df649bf" +dependencies = [ + "amq-protocol-uri", + "tcp-stream", + "tracing", +] + +[[package]] +name = "amq-protocol-types" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd6484fdc918c1b6e2ae8eda2914d19a5873e1975f93ad8d33d6a24d1d98df05" +dependencies = [ + "cookie-factory", + "nom", + "serde", + "serde_json", +] + +[[package]] +name = "amq-protocol-uri" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7f2da69e0e1182765bf33407cd8a843f20791b5af2b57a2645818c4776c56c" +dependencies = [ + "amq-protocol-types", + "percent-encoding", + "url", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -139,6 +187,45 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" +[[package]] +name = "asn1-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc 0.2.5", + "nom", + "num-traits", + "rusticata-macros", + "thiserror", + "time", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "assert_cmd" version = "2.0.13" @@ -198,8 +285,8 @@ checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec" dependencies = [ "async-task", "concurrent-queue", - "fastrand", - "futures-lite", + "fastrand 2.2.0", + "futures-lite 2.5.0", "slab", ] @@ -211,32 +298,72 @@ checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c" dependencies = [ "async-channel 2.3.1", "async-executor", - "async-io", - "async-lock", + "async-io 2.3.3", + "async-lock 3.4.0", "blocking", - "futures-lite", + "futures-lite 2.5.0", "once_cell", ] +[[package]] +name = "async-global-executor-trait" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80f19936c1a84fb48ceb8899b642d2a72572587d1021cc561bfb24de9f33ee89" +dependencies = [ + "async-global-executor", + "async-trait", + "executor-trait", +] + +[[package]] +name = 
"async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock 2.8.0", + "autocfg", + "cfg-if 1.0.0", + "concurrent-queue", + "futures-lite 1.13.0", + "log", + "parking", + "polling 2.8.0", + "rustix 0.37.27", + "slab", + "socket2 0.4.10", + "waker-fn", +] + [[package]] name = "async-io" version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6baa8f0178795da0e71bc42c9e5d13261aac7ee549853162e66a241ba17964" dependencies = [ - "async-lock", + "async-lock 3.4.0", "cfg-if 1.0.0", "concurrent-queue", "futures-io", - "futures-lite", + "futures-lite 2.5.0", "parking", - "polling", - "rustix", + "polling 3.7.2", + "rustix 0.38.34", "slab", "tracing", "windows-sys 0.52.0", ] +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener 2.5.3", +] + [[package]] name = "async-lock" version = "3.4.0" @@ -255,31 +382,43 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63255f1dc2381611000436537bbedfe83183faa303a5a0edaf191edef06526bb" dependencies = [ "async-channel 2.3.1", - "async-io", - "async-lock", + "async-io 2.3.3", + "async-lock 3.4.0", "async-signal", "async-task", "blocking", "cfg-if 1.0.0", "event-listener 5.3.1", - "futures-lite", - "rustix", + "futures-lite 2.5.0", + "rustix 0.38.34", "tracing", ] +[[package]] +name = "async-reactor-trait" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6012d170ad00de56c9ee354aef2e358359deb1ec504254e0e5a3774771de0e" +dependencies = [ + "async-io 1.13.0", + "async-trait", + "futures-core", + "reactor-trait", +] + [[package]] name = "async-signal" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfb3634b73397aa844481f814fad23bbf07fdb0eabec10f2eb95e58944b1ec32" dependencies = [ - "async-io", - "async-lock", + "async-io 2.3.3", + "async-lock 3.4.0", "atomic-waker", "cfg-if 1.0.0", "futures-core", "futures-io", - "rustix", + "rustix 0.38.34", "signal-hook-registry", "slab", "windows-sys 0.52.0", @@ -293,14 +432,14 @@ checksum = "c634475f29802fde2b8f0b505b1bd00dfe4df7d4a000f0b36f7671197d5c3615" dependencies = [ "async-channel 1.9.0", "async-global-executor", - "async-io", - "async-lock", + "async-io 2.3.3", + "async-lock 3.4.0", "async-process", "crossbeam-utils", "futures-channel", "futures-core", "futures-io", - "futures-lite", + "futures-lite 2.5.0", "gloo-timers", "kv-log-macro", "log", @@ -364,6 +503,7 @@ dependencies = [ "async-trait", "axum-core", "axum-macros", + "base64 0.22.1", "bytes", "futures-util", "http 1.1.0", @@ -382,8 +522,10 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", + "sha1", "sync_wrapper 1.0.1", "tokio", + "tokio-tungstenite", "tower 0.5.1", "tower-layer", "tower-service", @@ -420,7 +562,7 @@ dependencies = [ "axum", "axum-core", "bytes", - "fastrand", + "fastrand 2.2.0", "futures-util", "headers", "http 1.1.0", @@ -452,7 +594,7 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4fa97bb310c33c811334143cf64c5bb2b7b3c06e453db6b095d7061eff8f113" dependencies = [ - "fastrand", + "fastrand 2.2.0", "gloo-timers", "tokio", ] @@ -570,7 +712,7 @@ dependencies = [ "async-channel 2.3.1", 
"async-task", "futures-io", - "futures-lite", + "futures-lite 2.5.0", "piper", ] @@ -632,6 +774,49 @@ dependencies = [ "libc", ] +[[package]] +name = "celery" +version = "0.5.5" +source = "git+https://github.com/trxcllnt/rusty-celery.git?branch=sccache#66e5b3ee68d0d84d32f247760996f85dcdfba677" +dependencies = [ + "async-trait", + "base64 0.21.7", + "celery-codegen", + "chrono", + "colored", + "futures", + "futures-lite 1.13.0", + "globset", + "hostname", + "lapin", + "log", + "once_cell", + "rand", + "redis 0.22.3", + "rmp-serde", + "rmpv", + "serde", + "serde-pickle", + "serde_json", + "serde_yaml", + "thiserror", + "tokio", + "tokio-executor-trait", + "tokio-reactor-trait", + "tokio-stream", + "uuid", +] + +[[package]] +name = "celery-codegen" +version = "0.5.5" +source = "git+https://github.com/trxcllnt/rusty-celery.git?branch=sccache#66e5b3ee68d0d84d32f247760996f85dcdfba677" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -722,12 +907,34 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +[[package]] +name = "cms" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b77c319abfd5219629c45c34c89ba945ed3c5e49fcde9d16b6c3885f118a730" +dependencies = [ + "const-oid", + "der", + "spki", + "x509-cert", +] + [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "colored" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf2150cce219b664a8a70df7a1f933836724b503f8a413af9365b4dcc4d90b8" +dependencies = [ + "lazy_static", + "windows-sys 0.48.0", +] + [[package]] name = "combine" version = "4.6.6" @@ -783,6 +990,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +[[package]] +name = "cookie-factory" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2" + [[package]] name = "core-foundation" version = "0.9.4" @@ -877,6 +1090,12 @@ dependencies = [ "libc", ] +[[package]] +name = "data-encoding" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" + [[package]] name = "der" version = "0.7.8" @@ -884,10 +1103,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "der_derive", + "flagset", "pem-rfc7468", "zeroize", ] +[[package]] +name = "der-parser" +version = "9.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" +dependencies = [ + "asn1-rs", + "displaydoc 0.2.5", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", +] + +[[package]] +name = "der_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = 
"deranged" version = "0.3.11" @@ -908,6 +1154,15 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "des" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdd80ce8ce993de27e9f063a444a4d53ce8e8db4c1f00cc03af5ad5a9867a1e" +dependencies = [ + "cipher", +] + [[package]] name = "difflib" version = "0.4.0" @@ -958,6 +1213,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "dlv-list" version = "0.5.2" @@ -1053,6 +1319,24 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "executor-trait" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c39dff9342e4e0e16ce96be751eb21a94e94a87bb2f6e63ad1961c2ce109bf" +dependencies = [ + "async-trait", +] + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.2.0" @@ -1109,6 +1393,17 @@ dependencies = [ "spin 0.9.8", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1196,13 +1491,28 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-lite" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cef40d21ae2c515b51041df9ed313ed21e572df340ea58a922a0aefe7e8891a1" dependencies = [ - "fastrand", + "fastrand 2.2.0", "futures-core", "futures-io", "parking", @@ -1279,6 +1589,19 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +[[package]] +name = "globset" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.5", + "regex-syntax 0.8.2", +] + [[package]] name = "gloo-timers" version = "0.3.0" @@ -1301,7 +1624,7 @@ dependencies = [ "bytes", "core_affinity", "flate2", - "flume", + "flume 0.10.14", "num_cpus", "thiserror", ] @@ -1518,7 +1841,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2", + "socket2 0.5.5", "tokio", "tower-service", "tracing", @@ -1606,7 +1929,7 @@ dependencies = [ "http-body 1.0.0", "hyper 1.1.0", "pin-project-lite", - "socket2", + "socket2 0.5.5", "tokio", "tower 0.4.13", "tower-service", @@ -1666,6 +1989,26 @@ dependencies = [ "generic-array", ] +[[package]] 
+name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.9", + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -1683,6 +2026,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "iter-read" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c397ca3ea05ad509c4ec451fea28b4771236a376ca1c69fd5143aae0cf8f93c4" + [[package]] name = "itertools" version = "0.12.0" @@ -1740,6 +2089,28 @@ dependencies = [ "log", ] +[[package]] +name = "lapin" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "209b09a06f4bd4952a0fd0594f90d53cf4496b062f59acc838a2823e1bb7d95c" +dependencies = [ + "amq-protocol", + "async-global-executor-trait", + "async-reactor-trait", + "async-trait", + "executor-trait", + "flume 0.11.1", + "futures-core", + "futures-io", + "parking_lot", + "pinky-swear", + "reactor-trait", + "serde", + "tracing", + "waker-fn", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -1789,6 +2160,12 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + [[package]] name = "linux-raw-sys" version = "0.4.12" @@ -1876,6 +2253,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -1984,6 +2367,16 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -2099,6 +2492,15 @@ dependencies = [ "ruzstd", ] +[[package]] +name = "oid-registry" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" +dependencies = [ + "asn1-rs", +] + [[package]] name = "once_cell" version = "1.19.0" @@ -2128,7 +2530,7 @@ dependencies = [ "once_cell", "percent-encoding", "quick-xml 0.36.1", - "redis", + "redis 0.27.5", "reqsign", "reqwest 0.12.5", "serde", @@ -2208,12 +2610,34 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "p12-keystore" +version = "0.1.3" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df7b60d0b2dcace322e6e8c4499c4c8bdf331c1bae046a54be5e4191c3610286" +dependencies = [ + "cbc", + "cms", + "der", + "des", + "hex", + "hmac", + "pkcs12", + "pkcs5", + "rand", + "rc2", + "sha1", + "sha2", + "thiserror", + "x509-parser", +] + [[package]] name = "parking" version = "2.2.1" @@ -2243,6 +2667,12 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "pbkdf2" version = "0.12.2" @@ -2310,6 +2740,18 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pinky-swear" +version = "6.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cfae3ead413ca051a681152bd266438d3bfa301c9bdf836939a14c721bb2a21" +dependencies = [ + "doc-comment", + "flume 0.11.1", + "parking_lot", + "tracing", +] + [[package]] name = "piper" version = "0.2.4" @@ -2317,7 +2759,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" dependencies = [ "atomic-waker", - "fastrand", + "fastrand 2.2.0", "futures-io", ] @@ -2332,6 +2774,21 @@ dependencies = [ "spki", ] +[[package]] +name = "pkcs12" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "695b3df3d3cc1015f12d70235e35b6b79befc5fa7a9b95b951eab1dd07c9efc2" +dependencies = [ + "cms", + "const-oid", + "der", + "digest", + "spki", + "x509-cert", + "zeroize", +] + [[package]] name = "pkcs5" version = "0.7.1" @@ -2365,6 +2822,22 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags 1.3.2", + "cfg-if 1.0.0", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys 0.48.0", +] + [[package]] name = "polling" version = "3.7.2" @@ -2375,7 +2848,7 @@ dependencies = [ "concurrent-queue", "hermit-abi 0.4.0", "pin-project-lite", - "rustix", + "rustix 0.38.34", "tracing", "windows-sys 0.52.0", ] @@ -2499,7 +2972,7 @@ checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" dependencies = [ "libc", "once_cell", - "socket2", + "socket2 0.5.5", "tracing", "windows-sys 0.52.0", ] @@ -2543,6 +3016,48 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rc2" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62c64daa8e9438b84aaae55010a93f396f8e60e3911590fcba770d04643fc1dd" +dependencies = [ + "cipher", +] + +[[package]] +name = "reactor-trait" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "438a4293e4d097556730f4711998189416232f009c137389e0f961d2bc0ddc58" +dependencies = [ + "async-trait", + "futures-core", + "futures-io", +] + +[[package]] +name = "redis" +version = "0.22.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa8455fa3621f6b41c514946de66ea0531f57ca017b2e6c7cc368035ea5b46df" +dependencies = [ + 
"arc-swap", + "async-trait", + "bytes", + "combine", + "futures", + "futures-util", + "itoa", + "percent-encoding", + "pin-project-lite", + "ryu", + "sha1_smol", + "tokio", + "tokio-util", + "url", +] + [[package]] name = "redis" version = "0.27.5" @@ -2568,7 +3083,7 @@ dependencies = [ "rustls-pki-types", "ryu", "sha1_smol", - "socket2", + "socket2 0.5.5", "tokio", "tokio-retry2", "tokio-rustls", @@ -2794,6 +3309,40 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rmp" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" +dependencies = [ + "byteorder", + "num-traits", + "paste", +] + +[[package]] +name = "rmp-serde" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db" +dependencies = [ + "byteorder", + "rmp", + "serde", +] + +[[package]] +name = "rmpv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58450723cd9ee93273ce44a20b6ec4efe17f8ed2e3631474387bfdecf18bb2a9" +dependencies = [ + "num-traits", + "rmp", + "serde", + "serde_bytes", +] + [[package]] name = "rouille" version = "3.6.2" @@ -2875,6 +3424,29 @@ dependencies = [ "semver", ] +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", +] + +[[package]] +name = "rustix" +version = "0.37.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2" +dependencies = [ + "bitflags 1.3.2", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.8", + "windows-sys 0.48.0", +] + [[package]] name = "rustix" version = "0.38.34" @@ -2884,7 +3456,7 @@ dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.12", "windows-sys 0.52.0", ] @@ -2902,6 +3474,19 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-connector" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a980454b497c439c274f2feae2523ed8138bbd3d323684e1435fec62f800481" +dependencies = [ + "log", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "rustls-webpki", +] + [[package]] name = "rustls-native-certs" version = "0.7.0" @@ -3016,6 +3601,7 @@ dependencies = [ "byteorder", "bytes", "cc", + "celery", "chrono", "clap", "daemonize", @@ -3072,6 +3658,8 @@ dependencies = [ "tokio", "tokio-openssl", "tokio-serde", + "tokio-stream", + "tokio-tungstenite", "tokio-util", "toml", "tower 0.4.13", @@ -3158,6 +3746,28 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-pickle" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c762ad136a26407c6a80825813600ceeab5e613660d93d79a41f0ec877171e71" +dependencies = [ + "byteorder", + "iter-read", + "num-bigint", + "num-traits", + "serde", +] + +[[package]] +name = "serde_bytes" +version = "0.11.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" version = "1.0.201" @@ -3223,6 +3833,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "serial_test" version = "3.1.0" @@ -3337,6 +3960,16 @@ version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" +[[package]] +name = "socket2" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "socket2" version = "0.5.5" @@ -3442,6 +4075,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "syslog" version = "6.1.0" @@ -3487,6 +4131,18 @@ dependencies = [ "xattr 1.3.1", ] +[[package]] +name = "tcp-stream" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "495b0abdce3dc1f8fd27240651c9e68890c14e9d9c61527b1ce44d8a5a7bd3d5" +dependencies = [ + "cfg-if 1.0.0", + "p12-keystore", + "rustls-connector", + "rustls-pemfile", +] + [[package]] name = "temp-env" version = "0.3.6" @@ -3503,8 +4159,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if 1.0.0", - "fastrand", - "rustix", + "fastrand 2.2.0", + "rustix 0.38.34", "windows-sys 0.52.0", ] @@ -3523,7 +4179,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ - "rustix", + "rustix 0.38.34", "windows-sys 0.48.0", ] @@ -3575,7 +4231,7 @@ dependencies = [ "async-trait", "base64 0.13.1", "chrono", - "displaydoc", + "displaydoc 0.1.7", "futures", "log", "reqwest 0.11.23", @@ -3726,11 +4382,22 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.5.5", "tokio-macros", "windows-sys 0.52.0", ] +[[package]] +name = "tokio-executor-trait" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a1593beae7759f592e1100c5997fe9e9ebf4b5968062f1fbcd807989cd1b79" +dependencies = [ + "async-trait", + "executor-trait", + "tokio", +] + [[package]] name = "tokio-macros" version = "2.4.0" @@ -3763,6 +4430,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-reactor-trait" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9481a72f36bd9cbb8d6dd349227c4783e234e4332cfe806225bc929c4b92486" +dependencies = [ + "async-trait", + "futures-core", + "futures-io", + "reactor-trait", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-retry2" version = "0.5.6" @@ -3797,6 +4478,29 @@ dependencies = [ "pin-project", ] +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" 
+dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", +] + [[package]] name = "tokio-util" version = "0.7.10" @@ -3981,6 +4685,24 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tungstenite" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "twoway" version = "0.1.8" @@ -4036,6 +4758,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.9.0" @@ -4059,6 +4787,12 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "110352d4e9076c67839003c7788d8604e24dcded13e0b375af3efaa8cf468517" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8parse" version = "0.2.1" @@ -4140,6 +4874,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -4271,7 +5011,7 @@ checksum = "b4ee928febd44d98f2f459a4a79bd4d928591333a494a10a868418ac1b39cf1f" dependencies = [ "either", "home", - "rustix", + "rustix 0.38.34", "winsafe", ] @@ -4482,6 +5222,34 @@ version = "0.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" +[[package]] +name = "x509-cert" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" +dependencies = [ + "const-oid", + "der", + "spki", +] + +[[package]] +name = "x509-parser" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" +dependencies = [ + "asn1-rs", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror", + "time", +] + [[package]] name = "xattr" version = "0.2.3" @@ -4498,8 +5266,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" dependencies = [ "libc", - "linux-raw-sys", - "rustix", + "linux-raw-sys 0.4.12", + "rustix 0.38.34", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index c2d3d6cd45..4a5951dc8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,9 +61,11 @@ libc = "0.2.153" linked-hash-map = "0.5" log = "0.4" memchr = "2" +memmap2 = "0.9.4" mime = "0.3" 
num_cpus = "1.16" number_prefix = "0.4" +object = "0.32" once_cell = "1.19" opendal = { version = "0.50.1", optional = true, default-features = false } openssl = { version = "0.10.64", optional = true } @@ -83,6 +85,7 @@ semver = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" sha2 = { version = "0.10.8", optional = true } +shlex = "1.3.0" strip-ansi-escapes = "0.2" tar = "0.4.40" tempfile = "3" @@ -107,11 +110,22 @@ zip = { version = "0.6", default-features = false } zstd = "0.13" # dist-server only -async-compression = { version = "0.4", features = ["tokio", "zlib"] } +async-compression = { version = "0.4", features = [ + "gzip", + "tokio", + "zlib", +], optional = true } async-tar = { version = "0.5", optional = true } -axum = { version = "0.7", features = ["macros"] } -axum-extra = { version = "0.9.6", features = ["typed-header"] } -memmap2 = "0.9.4" +axum = { version = "0.7", features = [ + "http2", + "macros", + "ws", +], optional = true } +axum-extra = { version = "0.9.6", features = ["typed-header"], optional = true } +# celery = { version = "0.5.5", optional = true, features = ["extra_content_types"] } +celery = { git = "https://github.com/trxcllnt/rusty-celery.git", branch = "sccache", optional = true, features = [ + "extra_content_types", +] } nix = { version = "0.28.0", optional = true, features = [ "mount", "user", @@ -119,13 +133,13 @@ nix = { version = "0.28.0", optional = true, features = [ "signal", "process", ] } -object = "0.32" rouille = { version = "3.6", optional = true, default-features = false, features = [ "ssl", ] } -shlex = "1.3.0" syslog = { version = "6", optional = true } tokio-openssl = { version = "0.6.5", optional = true } +tokio-stream = { version = "0.1.17", optional = true } +tokio-tungstenite = { version = "0.24", optional = true } tower-http = { version = "0.6", features = [ "request-id", "sensitive-headers", @@ -202,10 +216,16 @@ dist-client = [ "reqwest", "url", "sha2", + "tokio-stream", + "tokio-tungstenite", ] # Enables the sccache-dist binary dist-server = [ + "async-compression", "async-tar", + "axum", + "axum-extra", + "celery", "jwt", "flate2", "hyper", @@ -218,6 +238,8 @@ dist-server = [ "syslog", "tower-http", "tokio-openssl", + "tokio-stream", + "tokio-tungstenite", "tracing", "tracing-subscriber", "version-compare", diff --git a/scripts/freebsd-ci-test.sh b/scripts/freebsd-ci-test.sh index a670c95df3..18bc162d3c 100755 --- a/scripts/freebsd-ci-test.sh +++ b/scripts/freebsd-ci-test.sh @@ -17,20 +17,20 @@ # etc.) after each run. This can be prevented by # setting FREEBSD_CI_NOCLEAN in the environment: # -# FREEBSD_CI_NOCLEAN=1 scripts/freebsd-ci-test.sh +# FREEBSD_CI_NOCLEAN=1 scripts/freebsd-ci-test.sh # # When running in a loop, time and bandwidth can be # saved by placing FreeBSD distribution files in # $HOME/.potcache # -# mkdir $HOME/.potcache -# fetch -o $HOME/.potcache/14.1-RELEASE_base.txz \ -# https://ftp.freebsd.org/pub/FreeBSD/releases/amd64/14.1-RELEASE/base.txz +# mkdir $HOME/.potcache +# fetch -o $HOME/.potcache/14.1-RELEASE_base.txz \ +# https://ftp.freebsd.org/pub/FreeBSD/releases/amd64/14.1-RELEASE/base.txz # # This script can be run from a github action. 
When run locally, make # sure to install the required packages: # -# pkg install -y ca-root-nss curl gmake gtar pot sudo +# pkg install -y ca-root-nss curl gmake gtar pot sudo rabbitmq # # shellcheck disable=SC3040 @@ -55,6 +55,8 @@ output_env_info() { echo "## user" whoami + echo "## cwd" + pwd echo "## environment" env | sort echo "## network" @@ -89,37 +91,50 @@ build_and_test_project() if [ $FAULT -ne 0 ]; then return 1; fi } -prepare_and_run_sccache_dist() +prepare_sccache_dist() { echo "#### preparing sccache-dist" - SECRET_KEY="$(sccache-dist auth generate-jwt-hs256-key)" - CLIENT_AUTH_KEY="$(sccache-dist auth generate-jwt-hs256-key)" + # SECRET_KEY="$(sccache-dist auth generate-jwt-hs256-key)" + # CLIENT_AUTH_KEY="$(sccache-dist auth generate-jwt-hs256-key)" + CLIENT_AUTH_KEY="client_token" # create scheduler.conf cat >"$TEST_TMPDIR"/scheduler.conf <<-EOF public_addr = "127.0.0.1:10600" [client_auth] type = "token" token = "$CLIENT_AUTH_KEY" - [server_auth] - type = "jwt_hs256" - secret_key = "$SECRET_KEY" EOF - SERVER_TOKEN="$(sccache-dist auth generate-jwt-hs256-server-token \ - --config="$TEST_TMPDIR"/scheduler.conf \ - --server="127.0.0.1:10501")" + # cat >"$TEST_TMPDIR"/scheduler.conf <<-EOF + # public_addr = "127.0.0.1:10600" + # [client_auth] + # type = "token" + # token = "$CLIENT_AUTH_KEY" + # [server_auth] + # type = "jwt_hs256" + # secret_key = "$SECRET_KEY" + # EOF + # SERVER_TOKEN="$(sccache-dist auth generate-jwt-hs256-server-token \ + # --config="$TEST_TMPDIR"/scheduler.conf \ + # --server="127.0.0.1:10501")" # Create server.conf cat >"$TEST_TMPDIR"/server.conf <<-EOF cache_dir = "$TEST_TMPDIR/toolchains" - public_addr = "127.0.0.1:10501" - scheduler_url = "http://127.0.0.1:10600" [builder] type = "pot" pot_fs_root = "$TEST_TMPDIR/pot" - [scheduler_auth] - type = "jwt_token" - token = "$SERVER_TOKEN" EOF + # cat >"$TEST_TMPDIR"/server.conf <<-EOF + # cache_dir = "$TEST_TMPDIR/toolchains" + # public_addr = "127.0.0.1:10501" + # scheduler_url = "http://127.0.0.1:10600" + # [builder] + # type = "pot" + # pot_fs_root = "$TEST_TMPDIR/pot" + # [scheduler_auth] + # type = "jwt_token" + # token = "$SERVER_TOKEN" + # EOF # create sccache client config TC="$(rustup toolchain list | grep default | awk '{ print $1 }')" @@ -151,8 +166,11 @@ prepare_and_run_sccache_dist() gtar cf - --sort=name --mtime='2022-06-28 17:35Z' "$HOME/.rustup" | \ gzip -n >"$TEST_TMPDIR/rust-toolchain.tgz" - echo "Starting scheduler" - sccache-dist scheduler --config "$TEST_TMPDIR"/scheduler.conf + echo "Starting rabbitmq" + sudo service rabbitmq onestart + sleep 5 + echo "Checking rabbitmq status" + sudo service rabbitmq onestatus } prepare_zpool() @@ -183,12 +201,23 @@ prepare_pot() sudo pot snapshot -p sccache-template } +start_scheduler() +{ + echo "#### starting scheduler" + AMQP_ADDR="amqp://127.0.0.1:5672//" \ + SCCACHE_SERVER_LOG=/tmp/sccache_scheduler_log.txt \ + SCCACHE_LOG="celery=debug,sccache=trace,tower_http=debug,axum::rejection=trace" \ + sccache-dist scheduler --config "$TEST_TMPDIR"/scheduler.conf +} + start_build_server() { echo "#### starting build-server (as root)" - SCCACHE_DIST_LOG=debug RUST_LOG=info sudo \ - "$HOME"/.cargo/bin/sccache-dist server \ - --config "$TEST_TMPDIR"/server.conf & + sudo \ + AMQP_ADDR="amqp://127.0.0.1:5672//" \ + SCCACHE_LOG="celery=debug,sccache=trace" \ + SCCACHE_SERVER_LOG=/tmp/sccache_server_log.txt \ + "$HOME"/.cargo/bin/sccache-dist server --config "$TEST_TMPDIR"/server.conf } wait_for_build_server() @@ -218,9 +247,10 @@ start_sccache_server() { 
echo "#### starting sccache-server" killall sccache 2>/dev/null || true - SCCACHE_ERROR_LOG="$TEST_TMPDIR"/sccache_log.txt SCCACHE_LOG=info \ - RUST_LOG=info sccache --start-server - sleep 10 + SCCACHE_LOG="sccache=trace" \ + SCCACHE_ERROR_LOG=/tmp/sccache_client_log.txt \ + sccache --start-server + sleep 5 } test_sccache_dist_01() @@ -236,7 +266,9 @@ test_sccache_dist_01() FAILED_DIST="$(echo "$STATS" | \ grep "Failed distributed compilations" | awk '{ print $4 }')" SUCCEEDED_DIST="$(echo "$STATS" | \ - (grep -F "127.0.0.1:10501" || echo 0 0) | awk '{ print $2 }')" + grep "Successful distributed compiles" | awk '{ print $4 }')" + # SUCCEEDED_DIST="$(echo "$STATS" | \ + # (grep -F "127.0.0.1:10501" || echo 0 0) | awk '{ print $2 }')" if [ "$CACHE_HITS" -ne 0 ]; then 2>&1 echo "Unexpected cache hits" @@ -272,7 +304,9 @@ test_sccache_dist_02() FAILED_DIST="$(echo "$STATS" | \ grep "Failed distributed compilations" | awk '{ print $4 }')" SUCCEEDED_DIST="$(echo "$STATS" | \ - (grep -F "127.0.0.1:10501" || echo 0 0) | awk '{ print $2 }')" + grep "Successful distributed compiles" | awk '{ print $4 }')" + # SUCCEEDED_DIST="$(echo "$STATS" | \ + # (grep -F "127.0.0.1:10501" || echo 0 0) | awk '{ print $2 }')" if [ "$CACHE_HITS" -eq 0 ]; then 2>&1 echo "No cache hits when there should be some" @@ -333,11 +367,12 @@ main() init output_env_info build_and_test_project - prepare_and_run_sccache_dist + prepare_sccache_dist prepare_zpool prepare_pot + start_scheduler start_build_server - wait_for_build_server + # wait_for_build_server create_build_test_project start_sccache_server test_sccache_dist_01 diff --git a/src/bin/sccache-dist/build.rs b/src/bin/sccache-dist/build.rs index 1320ec47ed..173980c334 100644 --- a/src/bin/sccache-dist/build.rs +++ b/src/bin/sccache-dist/build.rs @@ -13,33 +13,23 @@ // limitations under the License. 
use anyhow::{anyhow, bail, Context, Error, Result}; +use async_compression::tokio::bufread::ZlibDecoder as ZlibDecoderAsync; use async_trait::async_trait; -use flate2::read::GzDecoder; +use bytes::Buf; +use flate2::read::ZlibDecoder as ZlibDecoderSync; use fs_err as fs; -use futures::lock::Mutex; +use itertools::Itertools; use libmount::Overlay; -use sccache::dist::{ - BuildResult, BuilderIncoming, CompileCommand, JobId, OutputData, ProcessOutput, TcCache, - Toolchain, -}; -use sccache::lru_disk_cache::Error as LruError; -use std::borrow::Borrow; -use std::collections::{hash_map, HashMap}; +use sccache::dist::{BuildResult, BuilderIncoming, CompileCommand, OutputData, ProcessOutput}; use std::io; use std::path::{self, Path, PathBuf}; -use std::process::{Output, Stdio}; -use tokio::process::ChildStdin; -use tokio_util::compat::TokioAsyncReadCompatExt; +use std::process::Output; +use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use version_compare::Version; #[async_trait] trait AsyncCommandExt { async fn check_stdout_trim(&mut self) -> Result; - async fn check_piped(&mut self, pipe: F) -> Result<()> - where - F: FnOnce(ChildStdin) -> Fut + std::marker::Send, - Fut: std::future::Future> + std::marker::Send; - async fn check_run(&mut self) -> Result<()>; } #[async_trait] @@ -51,34 +41,6 @@ impl AsyncCommandExt for tokio::process::Command { String::from_utf8(output.stdout).context("Output from listing containers not UTF8")?; Ok(stdout.trim().to_owned()) } - // Should really take a FnOnce/FnBox - async fn check_piped(&mut self, pipe: F) -> Result<()> - where - F: FnOnce(ChildStdin) -> Fut + std::marker::Send, - Fut: std::future::Future> + std::marker::Send, - { - let mut process = self - .stdin(Stdio::piped()) - .spawn() - .context("Failed to start command")?; - pipe( - process - .stdin - .take() - .expect("Requested piped stdin but not present"), - ) - .await - .context("Failed to pipe input to process")?; - let output = process - .wait_with_output() - .await - .context("Failed to wait for process to return")?; - check_output(&output) - } - async fn check_run(&mut self) -> Result<()> { - let output = self.output().await.context("Failed to start command")?; - check_output(&output) - } } fn check_output(output: &Output) -> Result<()> { @@ -108,16 +70,9 @@ struct OverlaySpec { toolchain_dir: PathBuf, } -#[derive(Debug, Clone)] -struct DeflatedToolchain { - path: PathBuf, - build_count: u64, -} - pub struct OverlayBuilder { bubblewrap: PathBuf, dir: PathBuf, - toolchain_dir_map: Mutex>, } impl OverlayBuilder { @@ -130,11 +85,13 @@ impl OverlayBuilder { } let mut cmd = tokio::process::Command::new(&bubblewrap); + let out = cmd .arg("--version") .check_stdout_trim() .await .context("Failed to execute bwrap for version check")?; + if let Some(s) = out.split_whitespace().nth(1) { match (Version::from("0.3.0"), Version::from(s)) { (Some(min), Some(seen)) => { @@ -160,11 +117,7 @@ impl OverlayBuilder { } // TODO: pidfile - let ret = Self { - bubblewrap, - dir, - toolchain_dir_map: Mutex::new(HashMap::new()), - }; + let ret = Self { bubblewrap, dir }; ret.cleanup().await?; fs::create_dir_all(&ret.dir).context("Failed to create base directory for builder")?; fs::create_dir_all(ret.dir.join("builds")) @@ -181,353 +134,215 @@ impl OverlayBuilder { Ok(()) } - async fn cleanup_old_toolchains( - &self, - job_id: JobId, - tccache: &TcCache, - tc_dirs: &mut HashMap, - ) { - if tc_dirs.len() >= tccache.len() { - let dir_map = tc_dirs.clone(); - for (tc, entry) in dir_map.iter() { - // 
Only clean up old uncompressed toolchains that aren't currently in use - if !tccache.contains_toolchain(tc) && entry.build_count == 0 { - tracing::warn!( - "[cleanup_old_toolchains({})]: Removing old un-compressed toolchain: {:?}", - job_id, - tc.archive_id - ); - if tc_dirs.remove(tc).is_none() { - tracing::warn!( - "[cleanup_old_toolchains({})]: Toochain {} not in toolchain_dir_map", - job_id, - tc.archive_id - ); - } - fs::remove_dir_all(self.dir.join("toolchains").join(&tc.archive_id)) - .context("Failed to remove old toolchain") - .unwrap_or_else(|err| { - tracing::warn!("[cleanup_old_toolchains({})]: {:?}", job_id, err) - }); - } - } - } - } - async fn prepare_overlay_dirs( &self, - job_id: JobId, - tc: &Toolchain, - tccache: &Mutex, + job_id: &str, + toolchain_dir: &Path, ) -> Result { - let DeflatedToolchain { - path: toolchain_dir, - build_count: _, - } = { - let mut toolchain_dir_map = self.toolchain_dir_map.lock().await; - // Create the toolchain dir (if necessary) while we have an exclusive lock - let toolchain_dir = self.dir.join("toolchains").join(&tc.archive_id); - if toolchain_dir_map.contains_key(tc) && toolchain_dir.exists() { - // TODO: use if let when sccache can use NLL - let entry = toolchain_dir_map - .get_mut(tc) - .expect("Key missing after checking"); - entry.build_count += 1; - entry.clone() - } else { - tracing::trace!( - "[prepare_overlay_dirs({})]: Creating toolchain directory for archive {}: {:?}", - job_id, - tc.archive_id, - toolchain_dir - ); - - let mut tccache = tccache.lock().await; - - self.cleanup_old_toolchains(job_id, &tccache, &mut toolchain_dir_map) - .await; - - let toolchain_rdr = match tccache.get(tc) { - Ok(rdr) => rdr, - Err(LruError::FileNotInCache) => { - bail!( - "[prepare_overlay_dirs({})]: Expected toolchain {}, but not available", - job_id, - tc.archive_id - ) - } - Err(e) => { - return Err(Error::from(e).context("Failed to get toolchain from cache")) - } - }; - - fs::create_dir_all(&toolchain_dir) - .context("Failed to create toolchain dir") - .unwrap_or_else(|err| { - tracing::warn!("[prepare_overlay_dirs({})]: {:?}", job_id, err) - }); - - tar::Archive::new(GzDecoder::new(toolchain_rdr)) - .unpack(&toolchain_dir) - .map_err(|err| { - tracing::warn!( - "[prepare_overlay_dirs({})]: Failed to unpack toolchain {}: {:?}", - job_id, - tc.archive_id, - err - ); - fs::remove_dir_all(&toolchain_dir) - .context("Failed to remove unpacked toolchain") - .unwrap_or_else(|err| { - tracing::warn!("[prepare_overlay_dirs({})]: {:?}", job_id, err) - }); - tccache - .remove(tc) - .context("Failed to remove corrupt toolchain") - .unwrap_or_else(|err| { - tracing::warn!("[prepare_overlay_dirs({})]: {:?}", job_id, err) - }); - Error::from(err) - })?; - - let entry = DeflatedToolchain { - path: toolchain_dir, - build_count: 1, - }; - - toolchain_dir_map.insert(tc.clone(), entry.clone()); - - entry - } - }; - - let build_dir = self - .dir - .join("builds") - .join(format!("{}-{}", tc.archive_id, job_id)); + let build_dir = self.dir.join("builds").join(job_id); tracing::trace!( - "[prepare_overlay_dirs({})]: Creating build directory for {}-{}: {:?}", - job_id, - tc.archive_id, - job_id, - build_dir + "[prepare_overlay_dirs({job_id})]: Creating build directory: {build_dir:?}" ); fs::create_dir_all(&build_dir) .context("Failed to create build dir") - .unwrap_or_else(|err| tracing::warn!("[prepare_overlay_dirs({})]: {:?}", job_id, err)); + .unwrap_or_else(|err| tracing::warn!("[prepare_overlay_dirs({job_id})]: {err:?}")); Ok(OverlaySpec { build_dir, - 
toolchain_dir, + toolchain_dir: toolchain_dir.to_owned(), }) } async fn perform_build( - job_id: JobId, + job_id: &str, bubblewrap: PathBuf, - compile_command: CompileCommand, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, + CompileCommand { + executable, + arguments, + env_vars, + cwd, + }: CompileCommand, + inputs: Vec, output_paths: Vec, overlay: OverlaySpec, - job_queue: &tokio::sync::Semaphore, ) -> Result { - tracing::trace!( - "[perform_build({})]: Compile environment: {:?}", - job_id, - compile_command.env_vars - ); - tracing::trace!( - "[perform_build({})]: Compile command: {:?} {:?}", - job_id, - compile_command.executable, - compile_command.arguments - ); + tracing::trace!("[perform_build({job_id})]: Compile environment: {env_vars:?}"); + tracing::trace!("[perform_build({job_id})]: Compile command: {executable:?} {arguments:?}"); + + let job_id = job_id.to_owned(); + + tokio::runtime::Handle::current() + .spawn_blocking(move || { + // Now mounted filesystems will be automatically unmounted when this thread dies + // (and tmpfs filesystems will be completely destroyed) + nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNS) + .context("Failed to enter a new Linux namespace")?; + // Make sure that all future mount changes are private to this namespace + // TODO: shouldn't need to add these annotations + let source: Option<&str> = None; + let fstype: Option<&str> = None; + let data: Option<&str> = None; + // Turn / into a 'slave', so it receives mounts from real root, but doesn't propagate back + nix::mount::mount( + source, + "/", + fstype, + nix::mount::MsFlags::MS_REC | nix::mount::MsFlags::MS_PRIVATE, + data, + ) + .context("Failed to turn / into a slave")?; + + let work_dir = overlay.build_dir.join("work"); + let upper_dir = overlay.build_dir.join("upper"); + let target_dir = overlay.build_dir.join("target"); + fs::create_dir_all(&work_dir).context("Failed to create overlay work directory")?; + fs::create_dir_all(&upper_dir) + .context("Failed to create overlay upper directory")?; + fs::create_dir_all(&target_dir) + .context("Failed to create overlay target directory")?; + + let () = Overlay::writable( + std::iter::once(overlay.toolchain_dir.as_path()), + upper_dir, + work_dir, + &target_dir, + // This error is unfortunately not Send+Sync + ) + .mount() + .map_err(|e| anyhow!("Failed to mount overlay FS: {}", e.to_string()))?; + + tracing::trace!("[perform_build({job_id})]: copying in inputs"); + // Note that we don't unpack directly into the upperdir since there overlayfs has some + // special marker files that we don't want to create by accident (or malicious intent) + tar::Archive::new(ZlibDecoderSync::new(inputs.reader())) + .unpack(&target_dir) + .context("Failed to unpack inputs to overlay")?; + + let cwd = Path::new(&cwd); + + tracing::trace!("[perform_build({job_id})]: creating output directories"); + fs::create_dir_all(join_suffix(&target_dir, cwd)) + .context("Failed to create cwd")?; + for path in output_paths.iter() { + // If it doesn't have a parent, nothing needs creating + let output_parent = if let Some(p) = Path::new(path).parent() { + p + } else { + continue; + }; + fs::create_dir_all(join_suffix(&target_dir, cwd.join(output_parent))) + .context("Failed to create an output directory")?; + } - // Read inputs here because we can't use asyncio in the thread below. 
-        let work_dir = overlay.build_dir.join("work");
-        let upper_dir = overlay.build_dir.join("upper");
-        let target_dir = overlay.build_dir.join("target");
-        let inputs_dir = overlay.build_dir.join("inputs");
-        fs::create_dir_all(&work_dir).context("Failed to create overlay work directory")?;
-        fs::create_dir_all(&upper_dir).context("Failed to create overlay upper directory")?;
-        fs::create_dir_all(&target_dir).context("Failed to create overlay target directory")?;
-        fs::create_dir_all(&inputs_dir).context("Failed to create overlay inputs directory")?;
-
-        tracing::trace!("[perform_build({})]: copying in inputs", job_id);
-        // Note that we don't unpack directly into the upperdir since there overlayfs has some
-        // special marker files that we don't want to create by accident (or malicious intent)
-        async_tar::Archive::new(inputs_rdr.compat())
-            .unpack(&inputs_dir)
-            .await
-            .context("Failed to unpack inputs to overlay")?;
-
-        // Guard compiling until we get a token from the job queue
-        let _token = job_queue.acquire().await?;
-
-        std::thread::scope(|scope| {
-            scope
-                .spawn(|| {
-                    // Now mounted filesystems will be automatically unmounted when this thread dies
-                    // (and tmpfs filesystems will be completely destroyed)
-                    nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNS)
-                        .context("Failed to enter a new Linux namespace")?;
-                    // Make sure that all future mount changes are private to this namespace
-                    // TODO: shouldn't need to add these annotations
-                    let source: Option<&str> = None;
-                    let fstype: Option<&str> = None;
-                    let data: Option<&str> = None;
-                    // Turn / into a 'slave', so it receives mounts from real root, but doesn't propagate back
-                    nix::mount::mount(
-                        source,
-                        "/",
-                        fstype,
-                        nix::mount::MsFlags::MS_REC | nix::mount::MsFlags::MS_PRIVATE,
-                        data,
-                    )
-                    .context("Failed to turn / into a slave")?;
-
-                    let () = Overlay::writable(
-                        [inputs_dir.as_path(), overlay.toolchain_dir.as_path()]
-                            .iter()
-                            .cloned(),
-                        upper_dir,
-                        work_dir,
-                        &target_dir,
-                        // This error is unfortunately not Send+Sync
-                    )
-                    .mount()
-                    .map_err(|e| anyhow!("Failed to mount overlay FS: {}", e.to_string()))?;
-
-                    let CompileCommand {
-                        executable,
-                        arguments,
-                        env_vars,
-                        cwd,
-                    } = compile_command;
-                    let cwd = Path::new(&cwd);
-
-                    tracing::trace!("[perform_build({})]: creating output directories", job_id);
-                    fs::create_dir_all(join_suffix(&target_dir, cwd))
-                        .context("Failed to create cwd")?;
-                    for path in output_paths.iter() {
-                        // If it doesn't have a parent, nothing needs creating
-                        let output_parent = if let Some(p) = Path::new(path).parent() {
-                            p
-                        } else {
-                            continue;
-                        };
-                        fs::create_dir_all(join_suffix(&target_dir, cwd.join(output_parent)))
-                            .context("Failed to create an output directory")?;
+                tracing::trace!("[perform_build({job_id})]: performing compile");
+                // Bubblewrap notes:
+                // - We're running as uid 0 (to do the mounts above), and so bubblewrap is run as uid 0
+                // - There's special handling in bubblewrap to compare uid and euid - of interest to us,
+                //   if uid == euid == 0, bubblewrap preserves capabilities (not good!) so we explicitly
+                //   drop all capabilities
+                // - Entering a new user namespace means any set of capabilities do not apply to any
+                //   other user namespace, i.e. you lose privileges. This is not strictly necessary because
+                //   we're dropping caps anyway so it's irrelevant which namespace we're in, but it doesn't
+                //   hurt.
+ // - --unshare-all is not ideal as it happily continues if it fails to unshare either + // the user or cgroups namespace, so we list everything explicitly + // - The order of bind vs proc + dev is important - the new root must be put in place + // first, otherwise proc and dev get hidden + let mut cmd = std::process::Command::new(bubblewrap); + cmd.arg("--die-with-parent") + .args(["--cap-drop", "ALL"]) + .args([ + "--unshare-user", + "--unshare-cgroup", + "--unshare-ipc", + "--unshare-pid", + "--unshare-net", + "--unshare-uts", + ]) + .arg("--bind") + .arg(&target_dir) + .arg("/") + .args(["--proc", "/proc"]) + .args(["--dev", "/dev"]) + .arg("--chdir") + .arg(cwd); + + for (k, v) in env_vars { + if k.contains('=') { + tracing::warn!("[perform_build({job_id})]: Skipping environment variable: {k:?}"); + continue; } + cmd.arg("--setenv").arg(k).arg(v); + } + cmd.arg("--"); + cmd.arg(executable); + cmd.args(arguments); - tracing::trace!("[perform_build({})]: performing compile", job_id); - // Bubblewrap notes: - // - We're running as uid 0 (to do the mounts above), and so bubblewrap is run as uid 0 - // - There's special handling in bubblewrap to compare uid and euid - of interest to us, - // if uid == euid == 0, bubblewrap preserves capabilities (not good!) so we explicitly - // drop all capabilities - // - By entering a new user namespace means any set of capabilities do not apply to any - // other user namespace, i.e. you lose privileges. This is not strictly necessary because - // we're dropping caps anyway so it's irrelevant which namespace we're in, but it doesn't - // hurt. - // - --unshare-all is not ideal as it happily continues if it fails to unshare either - // the user or cgroups namespace, so we list everything explicitly - // - The order of bind vs proc + dev is important - the new root must be put in place - // first, otherwise proc and dev get hidden - let mut cmd = std::process::Command::new(bubblewrap); - cmd.arg("--die-with-parent") - .args(["--cap-drop", "ALL"]) - .args([ - "--unshare-user", - "--unshare-cgroup", - "--unshare-ipc", - "--unshare-pid", - "--unshare-net", - "--unshare-uts", - ]) - .arg("--bind") - .arg(&target_dir) - .arg("/") - .args(["--proc", "/proc"]) - .args(["--dev", "/dev"]) - .arg("--chdir") - .arg(cwd); - - for (k, v) in env_vars { - if k.contains('=') { - tracing::warn!( - "[perform_build({})]: Skipping environment variable: {:?}", - job_id, - k - ); - continue; - } - cmd.arg("--setenv").arg(k).arg(v); - } - cmd.arg("--"); - cmd.arg(executable); - cmd.args(arguments); - - tracing::trace!("[perform_build({})]: bubblewrap command: {:?}", job_id, cmd); - - let compile_output = cmd - .output() - .context("Failed to retrieve output from compile")?; - tracing::trace!( - "[perform_build({})]: compile_output: {:?}", - job_id, - compile_output + tracing::trace!("[perform_build({job_id})]: bubblewrap command: {:?}", cmd); + + let compile_output = cmd + .output() + .context("Failed to retrieve output from compile")?; + + if !compile_output.status.success() { + tracing::warn!( + "[perform_build({job_id})]: compile output:\n===========\nstdout:\n{}\n==========\n\n=========\nstderr:\n{}\n===============\n\n", + String::from_utf8_lossy(&compile_output.stdout), + String::from_utf8_lossy(&compile_output.stderr) ); + } else { + tracing::trace!("[perform_build({job_id})]: compile output: {compile_output:?}"); + } - let mut outputs = vec![]; - tracing::trace!("[perform_build({})]: retrieving {:?}", job_id, output_paths); - for path in output_paths { - let abspath = 
join_suffix(&target_dir, cwd.join(&path)); // Resolve in case it's relative since we copy it from the root level - match fs::File::open(abspath) { - Ok(file) => { - let output = OutputData::try_from_reader(file) - .context("Failed to read output file")?; - outputs.push((path, output)) - } - Err(e) => { - if e.kind() == io::ErrorKind::NotFound { - tracing::debug!( - "[perform_build({})]: Missing output path {:?}", - job_id, - path - ) - } else { - return Err( - Error::from(e).context("Failed to open output file") - ); - } + tracing::trace!("[perform_build({job_id})]: retrieving {output_paths:?}"); + + let mut outputs = vec![]; + + for path in output_paths { + + let abspath = Path::new(&path); + let abspath = if abspath.is_absolute() { + abspath.to_path_buf() + } else { + cwd.join(abspath) + }; + + // Resolve in case it's relative since we copy it from the root level + let abspath = join_suffix(&target_dir, abspath); + + match fs::File::open(&abspath) { + Ok(file) => { + let output = OutputData::try_from_reader(file) + .context("Failed to read output file")?; + outputs.push((path, output)) + } + Err(e) => { + if e.kind() == io::ErrorKind::NotFound { + tracing::debug!("[perform_build({job_id})]: Missing output path host={abspath:?}, overlay={path:?}") + } else { + return Err(Error::from(e).context("Failed to open output file")); } } } - let compile_output = ProcessOutput::try_from(compile_output) - .context("Failed to convert compilation exit status")?; - Ok(BuildResult { - output: compile_output, - outputs, - }) - // Bizarrely there's no way to actually get any information from a thread::Result::Err + } + + let compile_output = ProcessOutput::try_from(compile_output) + .context("Failed to convert compilation exit status")?; + + Ok(BuildResult { + output: compile_output, + outputs, }) - .join() - .unwrap_or_else(|_e| Err(anyhow!("Build thread exited unsuccessfully"))) - }) + }) + .await + .context("Build thread exited unsuccessfully")? 
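
To make the overlay layout above concrete: the `Overlay::writable(..).mount()` call amounts to an overlayfs mount with the read-only toolchain as the lower layer and the scratch upper/work pair absorbing all writes. A minimal sketch using the same `nix` API the function already relies on (treating the overlay crate's exact behavior as an assumption):

    use nix::mount::{mount, MsFlags};

    // lowerdir: read-only toolchain; upperdir/workdir: scratch space that
    // captures writes; target_dir: the merged view the compile runs in.
    let data = format!(
        "lowerdir={},upperdir={},workdir={}",
        overlay.toolchain_dir.display(),
        upper_dir.display(),
        work_dir.display(),
    );
    mount(
        Some("overlay"),     // source string is arbitrary for overlayfs
        &target_dir,
        Some("overlay"),     // filesystem type
        MsFlags::empty(),
        Some(data.as_str()), // layer configuration
    )?;
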
} // Failing during cleanup is pretty unexpected, but we can still return the successful compile // TODO: if too many of these fail, we should mark this builder as faulty - async fn finish_overlay( - &self, - job_id: JobId, - tc: &Toolchain, - tccache: &Mutex, - overlay: &OverlaySpec, - ) { + async fn finish_overlay(&self, job_id: &str, overlay: &OverlaySpec) { let OverlaySpec { build_dir, toolchain_dir: _, @@ -535,27 +350,9 @@ impl OverlayBuilder { if let Err(e) = fs::remove_dir_all(build_dir) { tracing::warn!( - "[finish_overlay({})]: Failed to remove build directory {}: {}", - job_id, - build_dir.display(), - e + "[finish_overlay({job_id})]: Failed to remove build directory {build_dir:?}: {e}" ); } - - // TODO: collect toolchain directories - - // Decrement the build count so its toolchain can be cleaned up later - let mut toolchain_dir_map = self.toolchain_dir_map.lock().await; - if let Some(entry) = toolchain_dir_map.get_mut(tc) { - entry.build_count = std::cmp::max(0, entry.build_count - 1); - } - - self.cleanup_old_toolchains( - job_id, - tccache.lock().await.borrow(), - &mut toolchain_dir_map, - ) - .await; } } @@ -563,65 +360,44 @@ impl OverlayBuilder { impl BuilderIncoming for OverlayBuilder { async fn run_build( &self, - job_id: JobId, - tc: Toolchain, + job_id: &str, + toolchain_dir: &Path, command: CompileCommand, outputs: Vec, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - tccache: &Mutex, - job_queue: &tokio::sync::Semaphore, + inputs: Vec, ) -> Result { - tracing::debug!("[run_build({})]: Preparing overlay", job_id); + tracing::debug!("[run_build({job_id})]: Preparing overlay"); + let overlay = self - .prepare_overlay_dirs(job_id, &tc, tccache) + .prepare_overlay_dirs(job_id, toolchain_dir) .await .context("failed to prepare overlay dirs")?; - tracing::debug!("[run_build({})]: Performing build in {:?}", job_id, overlay); + + tracing::debug!("[run_build({job_id})]: Performing build in {overlay:?}"); + let res = Self::perform_build( job_id, self.bubblewrap.clone(), command, - inputs_rdr, + inputs, outputs, overlay.clone(), - job_queue, ) .await; - tracing::debug!("[run_build({})]: Finishing with overlay", job_id); - self.finish_overlay(job_id, &tc, tccache, &overlay).await; - tracing::debug!("[run_build({})]: Returning result", job_id); + + tracing::debug!("[run_build({job_id})]: Finishing with overlay"); + + self.finish_overlay(job_id, &overlay).await; + + tracing::debug!("[run_build({job_id})]: Returning result"); + res.context("Failed to perform build") } } -const BASE_DOCKER_IMAGE: &str = "busybox:stable-musl"; -// Make sure sh doesn't exec the final command, since we need it to do -// init duties (reaping zombies). Also, because we kill -9 -1, that kills -// the sleep (it's not a builtin) so it needs to be a loop. 
-const DOCKER_SHELL_INIT: &str = "while true; do busybox sleep 365d && busybox true; done"; - -// Check the diff and clean up the FS -async fn docker_diff(cid: &str) -> Result { - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["diff", cid]) - .check_stdout_trim() - .await - .context("Failed to Docker diff container") -} +const BUSYBOX_DOCKER_IMAGE: &str = "busybox:stable-musl"; -// Force remove the container -async fn docker_rm(cid: &str) -> Result<()> { - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["rm", "-f", cid]) - .check_run() - .await - .context("Failed to force delete container") -} - -pub struct DockerBuilder { - image_map: Mutex>, - container_lists: Mutex>>, -} +pub struct DockerBuilder {} impl DockerBuilder { // TODO: this should accept a unique string, e.g. inode of the tccache directory @@ -629,425 +405,196 @@ impl DockerBuilder { // instances - pidfile in /tmp pub async fn new() -> Result { tracing::info!("Creating docker builder"); - - let ret = Self { - image_map: Mutex::new(HashMap::new()), - container_lists: Mutex::new(HashMap::new()), - }; - ret.cleanup().await?; - Ok(ret) + Ok(Self {}) } - // TODO: this should really reclaim, and should check in the image map and container lists, so - // that when things are removed from there it becomes a form of GC - async fn cleanup(&self) -> Result<()> { - tracing::info!("Performing initial Docker cleanup"); - - let mut cmd = tokio::process::Command::new("docker"); - let containers = cmd - .args(["ps", "-a", "--format", "{{.ID}} {{.Image}}"]) - .check_stdout_trim() - .await - .context("Unable to list all Docker containers")?; - if !containers.is_empty() { - let mut containers_to_rm = vec![]; - for line in containers.split(|c| c == '\n') { - let mut iter = line.splitn(2, ' '); - let container_id = iter - .next() - .context("Malformed container listing - no container ID")?; - let image_name = iter - .next() - .context("Malformed container listing - no image name")?; - if iter.next().is_some() { - bail!("Malformed container listing - third field on row") - } - if image_name.starts_with("sccache-builder-") { - containers_to_rm.push(container_id) - } - } - if !containers_to_rm.is_empty() { - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["rm", "-f"]) - .args(containers_to_rm) - .check_run() - .await - .context("Failed to start command to remove old containers")?; - } - } - - let mut cmd = tokio::process::Command::new("docker"); - let images = cmd - .args(["images", "--format", "{{.ID}} {{.Repository}}"]) - .check_stdout_trim() - .await - .context("Failed to list all docker images")?; - if !images.is_empty() { - let mut images_to_rm = vec![]; - for line in images.split(|c| c == '\n') { - let mut iter = line.splitn(2, ' '); - let image_id = iter - .next() - .context("Malformed image listing - no image ID")?; - let image_name = iter - .next() - .context("Malformed image listing - no image name")?; - if iter.next().is_some() { - bail!("Malformed image listing - third field on row") - } - if image_name.starts_with("sccache-builder-") { - images_to_rm.push(image_id) - } - } - if !images_to_rm.is_empty() { - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["rmi"]) - .args(images_to_rm) - .check_run() - .await - .context("Failed to remove image")? 
- } - } - - tracing::info!("Completed initial Docker cleanup"); - Ok(()) - } + async fn perform_build( + job_id: &str, + toolchain_dir: &Path, + CompileCommand { + executable, + arguments, + env_vars, + cwd, + }: CompileCommand, + output_paths: Vec, + inputs: Vec, + ) -> Result { + tracing::trace!("[perform_build({job_id})]: Compile environment: {env_vars:?}"); + tracing::trace!("[perform_build({job_id})]: Compile command: {executable:?} {arguments:?}"); + tracing::trace!("[perform_build({job_id})]: Output paths: {output_paths:?}"); - // If we have a spare running container, claim it and remove it from the available list, - // otherwise try and create a new container (possibly creating the Docker image along - // the way) - async fn get_container( - &self, - job_id: JobId, - tc: &Toolchain, - tccache: &Mutex, - ) -> Result { - let container = { - let mut map = self.container_lists.lock().await; - map.entry(tc.clone()).or_default().pop() - }; - match container { - Some(cid) => Ok(cid), - None => { - // TODO: can improve parallelism (of creating multiple images at a time) by using another - // (more fine-grained) mutex around the entry value and checking if its empty a second time - let image = { - let mut map = self.image_map.lock().await; - match map.entry(tc.clone()) { - hash_map::Entry::Occupied(e) => e.get().clone(), - hash_map::Entry::Vacant(e) => { - tracing::info!("[get_container({})]: Creating Docker image for {:?} (may block requests)", job_id, tc); - let image = Self::make_image(job_id, tc, tccache).await?; - e.insert(image.clone()); - image - } - } - }; - Self::start_container(&image).await - } + if output_paths.is_empty() { + bail!("Output paths is empty"); } - } - async fn clean_container(&self, job_id: JobId, cid: &str) -> Result<()> { - // Clean up any running processes - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["exec", cid, "busybox", "kill", "-9", "-1"]) - .check_run() - .await - .context("Failed to run kill on all processes in container")?; - - let diff = docker_diff(cid).await?; - if !diff.is_empty() { - let mut lastpath = None; - for line in diff.split(|c| c == '\n') { - let mut iter = line.splitn(2, ' '); - let changetype = iter - .next() - .context("Malformed container diff - no change type")?; - let changepath = iter - .next() - .context("Malformed container diff - no change path")?; - if iter.next().is_some() { - bail!("Malformed container diff - third field on row") - } - // TODO: If files are created in this dir, it gets marked as modified. 
- // A similar thing applies to /root or /build etc - if changepath == "/tmp" { - continue; - } - if changetype != "A" { - bail!( - "Path {} had a non-A changetype of {}", - changepath, - changetype - ); - } - // Docker diff paths are in alphabetical order and we do `rm -rf`, so we might be able to skip - // calling Docker more than necessary (since it's slow) - if let Some(lastpath) = lastpath { - if Path::new(changepath).starts_with(lastpath) { - continue; - } - } - lastpath = Some(changepath); - let mut cmd = tokio::process::Command::new("docker"); - if let Err(e) = cmd - .args(["exec", cid, "busybox", "rm", "-rf", changepath]) - .check_run() - .await - { - // We do a final check anyway, so just continue - tracing::warn!( - "[clean_container({})]: Failed to remove added path in a container: {}", - job_id, - e - ) - } - } + // Should automatically get deleted when host_temp goes out of scope + let host_temp = tempfile::Builder::new().prefix("sccache_dist").tempdir()?; + let host_root = host_temp.path(); - let newdiff = docker_diff(cid).await?; - // See note about changepath == "/tmp" above - if !newdiff.is_empty() && newdiff != "C /tmp" { - bail!( - "Attempted to delete files, but container still has a diff: {:?}", - newdiff - ); - } - } + let cwd = Path::new(&cwd); + let cwd_host = join_suffix(host_root, cwd); - Ok(()) - } + tracing::trace!("[perform_build({job_id})]: copying in inputs"); - // Failing during cleanup is pretty unexpected, but we can still return the successful compile - // TODO: if too many of these fail, we should mark this builder as faulty - async fn finish_container(&self, job_id: JobId, tc: &Toolchain, cid: String) { - // TODO: collect images - - if let Err(e) = self.clean_container(job_id, &cid).await { - tracing::info!( - "[finish_container({})]: Failed to clean container {}: {}", - job_id, - cid, - e - ); - if let Err(e) = docker_rm(&cid).await { - tracing::warn!( - "[finish_container({})]: Failed to remove container {} after failed clean: {}", - job_id, - cid, - e - ); - } - return; + // Copy inputs to host_root + { + let reader = inputs.reader(); + let reader = futures::io::AllowStdIo::new(reader); + let reader = ZlibDecoderAsync::new(reader.compat()); + async_tar::Archive::new(reader.compat()) + .unpack(&host_root) + .await + .context("Failed to unpack inputs to tempdir")?; } - // Good as new, add it back to the container list - if let Some(entry) = self.container_lists.lock().await.get_mut(tc) { - tracing::debug!( - "[finish_container({})]: Reclaimed container {}", - job_id, - cid - ); - entry.push(cid) - } else { - tracing::warn!( - "[finish_container({})]: Was ready to reclaim container {} but toolchain went missing", - job_id, cid - ); - if let Err(e) = docker_rm(&cid).await { - tracing::warn!( - "[finish_container({})]: Failed to remove container {}: {}", - job_id, - cid, - e - ); - } + fn list_files(root: &Path) -> Vec { + walkdir::WalkDir::new(root) + .follow_links(false) + .same_file_system(true) + .into_iter() + .flatten() + // Only mount files and symlinks, not dirs + .filter_map(|entr| { + entr.metadata() + .ok() + .and_then(|meta| (!meta.is_dir()).then_some(entr)) + }) + .map(|file| file.path().to_path_buf()) + .collect::>() } - } - async fn make_image(job_id: JobId, tc: &Toolchain, tccache: &Mutex) -> Result { - let mut cmd = tokio::process::Command::new("docker"); - let cid = cmd - .args(["create", BASE_DOCKER_IMAGE, "busybox", "true"]) - .check_stdout_trim() - .await - .context("Failed to create docker container")?; - - let mut tccache = 
tccache.lock().await; - let mut toolchain_rdr = match tccache.get_async(tc).await { - Ok(rdr) => rdr, - Err(LruError::FileNotInCache) => bail!( - "Expected to find toolchain {}, but not available", - tc.archive_id - ), - Err(e) => { - return Err(e).with_context(|| format!("Failed to use toolchain {}", tc.archive_id)) - } + let host_toolchain_paths = list_files(toolchain_dir); + + // Collect host CWD, input, and output dir paths + let host_bindmount_paths = { + // Always create the CWD even if it's not in the inputs archive + std::iter::once(cwd_host.as_path()) + .chain( + // Input paths + list_files(host_root) + .iter() + .filter_map(|path| path.strip_prefix(host_root).ok()), + ) + .chain( + // Output paths + output_paths.iter().map(Path::new), + ) + // If it doesn't have a parent, nothing needs creating + .filter_map(|path| path.parent().map(|p| join_suffix(host_root, p))) + .unique() + .collect::>() }; - tracing::trace!("[make_image({})]: Copying in toolchain", job_id); - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["cp", "-", &format!("{}:/", cid)]) - .check_piped(|mut stdin| async move { - tokio::io::copy(&mut toolchain_rdr, &mut stdin).await?; - Ok(()) - }) - .await - .context("Failed to copy toolchain tar into container")?; - - let imagename = format!("sccache-builder-{}", &tc.archive_id); - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["commit", &cid, &imagename]) - .check_run() - .await - .context("Failed to commit container after build")?; - - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["rm", "-f", &cid]) - .check_run() - .await - .context("Failed to remove temporary build container")?; - - Ok(imagename) - } + tracing::trace!("[perform_build({job_id})]: creating output directories"); - async fn start_container(image: &str) -> Result { - let mut cmd = tokio::process::Command::new("docker"); - cmd.args(["run", "-d", image, "busybox", "sh", "-c", DOCKER_SHELL_INIT]) - .check_stdout_trim() - .await - .context("Failed to run container") - } - - async fn perform_build( - job_id: JobId, - compile_command: CompileCommand, - mut inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - output_paths: Vec, - cid: &str, - job_queue: &tokio::sync::Semaphore, - ) -> Result { - tracing::trace!( - "[perform_build({})]: Compile environment: {:?}", - job_id, - compile_command.env_vars - ); - tracing::trace!( - "[perform_build({})]: Compile command: {:?} {:?}", - job_id, - compile_command.executable, - compile_command.arguments - ); - - tracing::trace!( - "[perform_build({})]: Output paths: {:?}", - job_id, - output_paths - ); + for path in host_bindmount_paths.iter() { + tracing::trace!("[perform_build({job_id})]: creating dir: {path:?}"); + tokio::fs::create_dir_all(path) + .await + .context(format!("Failed to create output directory {path:?}"))?; + } - if output_paths.is_empty() { - bail!("output_paths is empty"); + fn volume_mount>( + prefix: &Path, + access: &str, + ) -> impl FnMut(P) -> Vec { + let prefix = prefix.to_owned(); + let access = access.to_owned(); + move |h_path| { + let h_path = h_path.as_ref(); + if let Ok(c_path) = h_path.strip_prefix(&prefix) { + let c_path = Path::new("/").join(c_path); + let h_path = h_path.display(); + let c_path = c_path.display(); + vec!["--volume".into(), format!("{h_path}:{c_path}:{access}")] + } else { + vec![] + } + } } - tracing::trace!("[perform_build({})]: copying in inputs", job_id); + tracing::trace!("[perform_build({job_id})]: performing compile"); + // TODO: likely 
shouldn't perform the compile as root in the container
         let mut cmd = tokio::process::Command::new("docker");
+        // Start a new container and remove it on exit
+        cmd.args(["run", "--rm"])
+            .args(["--name", &format!("sccache-builder-{job_id}")])
+            // Run in `cwd`
+            .args(["--workdir", &format!("{}", cwd.display())])
+            // Mount input and output dirs as read-write
+            .args(
+                host_bindmount_paths
+                    .iter()
+                    .flat_map(volume_mount(host_root, "rw")),
+            )
+            // Mount toolchain files as read-only
+            .args(
+                host_toolchain_paths
+                    .iter()
+                    .flat_map(volume_mount(toolchain_dir, "ro")),
+            )
+            // Define envvars
+            .args(env_vars.iter().flat_map(|(k, v)| {
+                if k.contains('=') {
+                    tracing::warn!(
+                        "[perform_build({job_id})]: Skipping environment variable: {k:?}"
+                    );
+                    vec![]
+                } else {
+                    // Pass the value verbatim; quoting it here would embed literal quotes
+                    vec!["--env".into(), format!("{k}={v}")]
+                }
+            }))
+            // Name of the image to run (currently busybox:stable-musl)
+            // TODO: Make this configurable?
+            .arg(BUSYBOX_DOCKER_IMAGE)
+            // Finally, the executable and arguments
+            .arg(executable)
+            .args(arguments);
 
-        tracing::trace!("[perform_build({})]: creating output directories", job_id);
-        let mut cmd = tokio::process::Command::new("docker");
-        cmd.args(["exec", cid, "busybox", "mkdir", "-p"]).arg(cwd);
-        for path in output_paths.iter() {
-            // If it doesn't have a parent, nothing needs creating
-            let output_parent = if let Some(p) = Path::new(path).parent() {
-                p
-            } else {
-                continue;
-            };
-            cmd.arg(cwd.join(output_parent));
-        }
-        cmd.check_run()
-            .await
-            .context("Failed to create directories required for compile in container")?;
+        tracing::trace!("[perform_build({job_id})]: {:?}", cmd.as_std());
 
-        // Guard compiling until we get a token from the job queue
-        let _token = job_queue.acquire().await?;
+        let compile_output = cmd.output().await.context("Failed to compile")?;
 
-        tracing::trace!("[perform_build({})]: performing compile", job_id);
-        // TODO: likely shouldn't perform the compile as root in the container
-        let mut cmd = tokio::process::Command::new("docker");
-        cmd.arg("exec");
-        for (k, v) in env_vars {
-            if k.contains('=') {
-                tracing::warn!(
-                    "[perform_build({})]: Skipping environment variable: {:?}",
-                    job_id,
-                    k
-                );
-                continue;
-            }
-            let mut env = k;
-            env.push('=');
-            env.push_str(&v);
-            cmd.arg("-e").arg(env);
+        if !compile_output.status.success() {
+            tracing::warn!(
+                "[perform_build({job_id})]: compile output:\n===========\nstdout:\n{}\n==========\n\n=========\nstderr:\n{}\n===============\n\n",
+                String::from_utf8_lossy(&compile_output.stdout),
+                String::from_utf8_lossy(&compile_output.stderr)
+            );
+        } else {
+            tracing::trace!("[perform_build({job_id})]: compile output: {compile_output:?}");
         }
-        let shell_cmd = "cd \"$1\" && shift && exec \"$@\"";
-        cmd.args([cid, "busybox", "sh", "-c", shell_cmd]);
-        cmd.arg(&executable);
-        cmd.arg(cwd);
-        cmd.arg(executable);
-        cmd.args(arguments);
-        let compile_output = cmd
-            .output()
-            .await
-            .context("Failed to start executing compile")?;
-        tracing::trace!(
-            "[perform_build({})]: compile_output: {:?}",
-            job_id,
-            compile_output
-        );
 
         let mut outputs = vec![];
-        tracing::trace!("[perform_build({})]: retrieving {:?}", job_id, output_paths);
+
+
tracing::trace!("[perform_build({job_id})]: retrieving {output_paths:?}"); + for path in output_paths { - let abspath = cwd.join(&path); // Resolve in case it's relative since we copy it from the root level - // TODO: this isn't great, but cp gives it out as a tar - let mut cmd = tokio::process::Command::new("docker"); - let output = cmd - .args(["exec", cid, "busybox", "cat"]) - .arg(abspath) - .output() - .await - .context("Failed to start command to retrieve output file")?; - if output.status.success() { - let output = OutputData::try_from_reader(&*output.stdout) - .expect("Failed to read compress output stdout"); - outputs.push((path, output)) - } else { - tracing::debug!( - "[perform_build({})]: Missing output path {:?}", - job_id, - path - ) + let abspath = join_suffix(host_root, &path); // Resolve in case it's relative since we copy it from the root level + match fs::File::open(&abspath) { + Ok(file) => { + let output = + OutputData::try_from_reader(file).context("Failed to read output file")?; + outputs.push((path, output)) + } + Err(e) => { + if e.kind() == io::ErrorKind::NotFound { + tracing::debug!( + "[perform_build({job_id})]: Missing output path host={abspath:?}, container={path:?}" + ) + } else { + return Err(Error::from(e).context("Failed to open output file")); + } + } } } let compile_output = ProcessOutput::try_from(compile_output) .context("Failed to convert compilation exit status")?; + Ok(BuildResult { output: compile_output, outputs, @@ -1060,28 +607,18 @@ impl BuilderIncoming for DockerBuilder { // From Server async fn run_build( &self, - job_id: JobId, - tc: Toolchain, + job_id: &str, + toolchain_dir: &Path, command: CompileCommand, outputs: Vec, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - tccache: &Mutex, - job_queue: &tokio::sync::Semaphore, + inputs: Vec, ) -> Result { - tracing::debug!("[run_build({})]: Finding container", job_id); - let cid = self - .get_container(job_id, &tc, tccache) - .await - .context("Failed to get a container for build")?; - tracing::debug!( - "[run_build({})]: Performing build with container {}", - job_id, - cid - ); - let res = Self::perform_build(job_id, command, inputs_rdr, outputs, &cid, job_queue).await; - tracing::debug!("[run_build({})]: Finishing with container {}", job_id, cid); - self.finish_container(job_id, &tc, cid).await; + tracing::debug!("[run_build({})]: Performing build in container", job_id); + + let res = Self::perform_build(job_id, toolchain_dir, command, outputs, inputs).await; + tracing::debug!("[run_build({})]: Returning result", job_id); + res.context("Failed to perform build") } } diff --git a/src/bin/sccache-dist/build_freebsd.rs b/src/bin/sccache-dist/build_freebsd.rs index 450e60bb7c..fab1914813 100644 --- a/src/bin/sccache-dist/build_freebsd.rs +++ b/src/bin/sccache-dist/build_freebsd.rs @@ -12,33 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use anyhow::{bail, Context, Error, Result}; +use anyhow::{bail, Context, Result}; +use async_compression::tokio::bufread::ZlibDecoder as ZlibDecoderAsync; use async_trait::async_trait; use bytes::Buf; -use flate2::read::GzDecoder; use futures::lock::Mutex; -use sccache::dist::{ - BuildResult, BuilderIncoming, CompileCommand, JobId, OutputData, ProcessOutput, TcCache, - Toolchain, -}; -use sccache::lru_disk_cache::Error as LruError; +use sccache::dist::{BuildResult, BuilderIncoming, CompileCommand, OutputData, ProcessOutput}; use std::collections::{hash_map, HashMap}; -use std::hint; use std::path::{Path, PathBuf}; -use std::process::{Output, Stdio}; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::process::Output; use std::sync::Arc; -use tokio::io::AsyncReadExt; -use tokio::process::ChildStdin; +use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use uuid::Uuid; #[async_trait] trait AsyncCommandExt { async fn check_stdout_trim(&mut self) -> Result; - async fn check_piped(&mut self, pipe: F) -> Result<()> - where - F: FnOnce(ChildStdin) -> Fut + std::marker::Send, - Fut: std::future::Future> + std::marker::Send; + // async fn check_piped(&mut self, pipe: F) -> Result<()> + // where + // F: FnOnce(ChildStdin) -> Fut + std::marker::Send, + // Fut: std::future::Future> + std::marker::Send; async fn check_run(&mut self) -> Result<()>; } @@ -51,30 +44,30 @@ impl AsyncCommandExt for tokio::process::Command { String::from_utf8(output.stdout).context("Output from listing containers not UTF8")?; Ok(stdout.trim().to_owned()) } - // Should really take a FnOnce/FnBox - async fn check_piped(&mut self, pipe: F) -> Result<()> - where - F: FnOnce(ChildStdin) -> Fut + std::marker::Send, - Fut: std::future::Future> + std::marker::Send, - { - let mut process = self - .stdin(Stdio::piped()) - .spawn() - .context("Failed to start command")?; - pipe( - process - .stdin - .take() - .expect("Requested piped stdin but not present"), - ) - .await - .context("Failed to pipe input to process")?; - let output = process - .wait_with_output() - .await - .context("Failed to wait for process to return")?; - check_output(&output) - } + // // Should really take a FnOnce/FnBox + // async fn check_piped(&mut self, pipe: F) -> Result<()> + // where + // F: FnOnce(ChildStdin) -> Fut + std::marker::Send, + // Fut: std::future::Future> + std::marker::Send, + // { + // let mut process = self + // .stdin(Stdio::piped()) + // .spawn() + // .context("Failed to start command")?; + // pipe( + // process + // .stdin + // .take() + // .expect("Requested piped stdin but not present"), + // ) + // .await + // .context("Failed to pipe input to process")?; + // let output = process + // .wait_with_output() + // .await + // .context("Failed to wait for process to return")?; + // check_output(&output) + // } async fn check_run(&mut self) -> Result<()> { let output = self.output().await.context("Failed to start command")?; check_output(&output) @@ -96,7 +89,7 @@ fn check_output(output: &Output) -> Result<()> { // Force remove the container async fn pot_rm(cid: &str, pot_cmd: &Path) -> Result<()> { let mut cmd = tokio::process::Command::new(pot_cmd); - cmd.args(&["destroy", "-F", "-p", cid]) + cmd.args(["destroy", "-F", "-p", cid]) .check_run() .await .context("Failed to force delete container") @@ -108,10 +101,12 @@ pub struct PotBuilder { clone_from: String, pot_cmd: PathBuf, pot_clone_args: Vec, - image_map: Arc>>, - container_lists: Arc>>>, - cleanup_thread_count: Arc, - max_cleanup_thread_count: usize, + 
image_map: Arc>>, + container_lists: Arc>>>, + // cleanup_thread_count: Arc, + // max_cleanup_thread_count: usize, + // toolchain_cache: &Mutex, + toolchain_base_dir: PathBuf, } impl PotBuilder { @@ -119,6 +114,8 @@ impl PotBuilder { // having locked a pidfile, or at minimum should loudly detect other running // instances - pidfile in /tmp pub async fn new( + // toolchain_cache: &Mutex, + toolchain_base_dir: &Path, pot_fs_root: PathBuf, clone_from: String, pot_cmd: PathBuf, @@ -133,8 +130,9 @@ impl PotBuilder { pot_clone_args, image_map: Arc::new(Mutex::new(HashMap::new())), container_lists: Arc::new(Mutex::new(HashMap::new())), - cleanup_thread_count: Arc::new(AtomicUsize::new(0)), - max_cleanup_thread_count: std::thread::available_parallelism().unwrap().get() * 3, + // cleanup_thread_count: Arc::new(AtomicUsize::new(0)), + // max_cleanup_thread_count: std::thread::available_parallelism().unwrap().get() * 3, + toolchain_base_dir: toolchain_base_dir.to_owned(), }; ret.cleanup().await?; Ok(ret) @@ -145,7 +143,7 @@ impl PotBuilder { tracing::info!("Performing initial pot cleanup"); let mut cmd = tokio::process::Command::new(&self.pot_cmd); let mut to_remove = cmd - .args(&["ls", "-q"]) + .args(["ls", "-q"]) .check_stdout_trim() .await .context("Failed to force delete container")? @@ -169,13 +167,15 @@ impl PotBuilder { // the way) async fn get_container( &self, - job_id: JobId, - tc: &Toolchain, - tccache: &Mutex, + job_id: &str, + toolchain_base_dir: &Path, + toolchain_dir: &Path, ) -> Result { let container = { let mut map = self.container_lists.lock().await; - map.entry(tc.clone()).or_insert_with(Vec::new).pop() + map.entry(toolchain_dir.to_path_buf()) + .or_insert_with(Vec::new) + .pop() }; match container { Some(cid) => Ok(cid), @@ -184,15 +184,19 @@ impl PotBuilder { // (more fine-grained) mutex around the entry value and checking if its empty a second time let image = { let mut map = self.image_map.lock().await; - match map.entry(tc.clone()) { + match map.entry(toolchain_dir.to_path_buf()) { hash_map::Entry::Occupied(e) => e.get().clone(), hash_map::Entry::Vacant(e) => { - tracing::info!("[get_container({})]: Creating pot image for {:?} (may block requests)", job_id, tc); + tracing::info!( + "[get_container({})]: Creating pot image for {:?} (may block requests)", + job_id, + toolchain_dir.components().last().unwrap() + ); let image = Self::make_image( job_id, - tc, - tccache, - &self.pot_fs_root, + toolchain_base_dir, + toolchain_dir, + // &self.pot_fs_root, &self.clone_from, &self.pot_cmd, &self.pot_clone_args, @@ -210,19 +214,19 @@ impl PotBuilder { async fn clean_container(cid: &str) -> Result<()> { let mut cmd = tokio::process::Command::new("pot"); - cmd.args(&["stop", "-p", cid]) + cmd.args(["stop", "-p", cid]) .check_run() .await .context("Failed to stop container")?; let mut cmd = tokio::process::Command::new("pot"); - cmd.args(&["revert", "-p", cid]) + cmd.args(["revert", "-p", cid]) .check_run() .await .context("Failed to revert container")?; let mut cmd = tokio::process::Command::new("pot"); - cmd.args(&["start", "-p", cid]) + cmd.args(["start", "-p", cid]) .check_run() .await .context("Failed to (re)start container")?; @@ -232,20 +236,20 @@ impl PotBuilder { // Failing during cleanup is pretty unexpected, but we can still return the successful compile // TODO: if too many of these fail, we should mark this builder as faulty async fn finish_container( - job_id: JobId, - container_lists: Arc>>>, - tc: Toolchain, - cid: String, - pot_cmd: &PathBuf, + job_id: &str, + 
container_lists: Arc>>>, + toolchain_dir: &Path, + cid: &str, + pot_cmd: &Path, ) { - if let Err(e) = Self::clean_container(&cid).await { + if let Err(e) = Self::clean_container(cid).await { tracing::info!( "[finish_container({})]: Failed to clean container {}: {}", job_id, cid, e ); - if let Err(e) = pot_rm(&cid, pot_cmd).await { + if let Err(e) = pot_rm(cid, pot_cmd).await { tracing::warn!( "[finish_container({})]: Failed to remove container {} after failed clean: {}", job_id, @@ -257,19 +261,23 @@ impl PotBuilder { } // Good as new, add it back to the container list - if let Some(entry) = container_lists.lock().await.get_mut(&tc) { + if let Some(entry) = container_lists + .lock() + .await + .get_mut(&toolchain_dir.to_path_buf()) + { tracing::debug!( "[finish_container({})]: Reclaimed container {}", job_id, cid ); - entry.push(cid) + entry.push(cid.to_owned()) } else { tracing::warn!( "[finish_container({})]: Was ready to reclaim container {} but toolchain went missing", job_id, cid ); - if let Err(e) = pot_rm(&cid, pot_cmd).await { + if let Err(e) = pot_rm(cid, pot_cmd).await { tracing::warn!( "[finish_container({})]: Failed to remove container {}: {}", job_id, @@ -281,15 +289,19 @@ impl PotBuilder { } async fn make_image( - job_id: JobId, - tc: &Toolchain, - tccache: &Mutex, - pot_fs_root: &Path, + job_id: &str, + toolchain_base_dir: &Path, + toolchain_dir: &Path, + // pot_fs_root: &Path, clone_from: &str, pot_cmd: &PathBuf, pot_clone_args: &[String], ) -> Result { - let imagename = format!("sccache-image-{}", &tc.archive_id); + let toolchain_name = toolchain_dir.components().last().unwrap(); + let imagename = format!( + "sccache-image-{}", + toolchain_name.as_os_str().to_string_lossy() + ); tracing::trace!( "[make_image({})]: Creating toolchain image: {}", job_id, @@ -303,32 +315,45 @@ impl PotBuilder { .await .context("Failed to create pot container")?; - let mut tccache = tccache.lock().await; - let toolchain_rdr = match tccache.get(tc) { - Ok(rdr) => rdr, - Err(LruError::FileNotInCache) => { - bail!("expected toolchain {}, but not available", tc.archive_id) - } - Err(e) => return Err(Error::from(e).context("failed to get toolchain from cache")), - }; + // let mut tccache = tccache.lock().await; + // let toolchain_rdr = match tccache.get(tc) { + // Ok(rdr) => rdr, + // Err(LruError::FileNotInCache) => { + // bail!("expected toolchain {}, but not available", tc.archive_id) + // } + // Err(e) => return Err(Error::from(e).context("failed to get toolchain from cache")), + // }; + + // tracing::trace!("[make_image({})]: Copying in toolchain", job_id); + + // tar::Archive::new(GzDecoder::new(toolchain_rdr)) + // .unpack(pot_fs_root.join("jails").join(&imagename).join("m")) + // .or_else(|e| { + // tracing::warn!( + // "[make_image({})]: Failed to unpack toolchain: {:?}", + // job_id, + // e + // ); + // tccache + // .remove(tc) + // .context("Failed to remove corrupt toolchain")?; + // Err(Error::from(e)) + // })?; + + tracing::trace!("[make_image({})]: Mounting in toolchain", job_id); + + let toolchain_container_dir = toolchain_dir.strip_prefix(toolchain_base_dir)?; - tracing::trace!("[make_image({})]: Copying in toolchain", job_id); - tar::Archive::new(GzDecoder::new(toolchain_rdr)) - .unpack(pot_fs_root.join("jails").join(&imagename).join("m")) - .or_else(|e| { - tracing::warn!( - "[make_image({})]: Failed to unpack toolchain: {:?}", - job_id, - e - ); - tccache - .remove(tc) - .context("Failed to remove corrupt toolchain")?; - Err(Error::from(e)) - })?; + let mut cmd = 
tokio::process::Command::new(pot_cmd); + cmd.args(["mount-in", "-p", &imagename]) + .args(["-m", &format!("{}", toolchain_container_dir.display())]) + .args(["-d", &format!("{}", toolchain_dir.display())]) + .check_run() + .await + .context("Failed to mount in toolchain")?; let mut cmd = tokio::process::Command::new(pot_cmd); - cmd.args(&["snapshot", "-p", &imagename]) + cmd.args(["snapshot", "-p", &imagename]) .check_run() .await .context("Failed to snapshot container after build")?; @@ -351,13 +376,13 @@ impl PotBuilder { .context("Failed to create pot container")?; let mut cmd = tokio::process::Command::new(pot_cmd); - cmd.args(&["snapshot", "-p", &cid]) + cmd.args(["snapshot", "-p", &cid]) .check_run() .await .context("Failed to snapshotpot container")?; let mut cmd = tokio::process::Command::new(pot_cmd); - cmd.args(&["start", "-p", &cid]) + cmd.args(["start", "-p", &cid]) .check_run() .await .context("Failed to start container")?; @@ -366,75 +391,81 @@ impl PotBuilder { } async fn perform_build( - job_id: JobId, - compile_command: CompileCommand, - mut inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, + job_id: &str, + CompileCommand { + executable, + arguments, + env_vars, + cwd, + }: CompileCommand, output_paths: Vec, + inputs: Vec, cid: &str, pot_fs_root: &Path, - job_queue: &tokio::sync::Semaphore, ) -> Result { - tracing::trace!( - "[perform_build({})]: Compile environment: {:?}", - job_id, - compile_command.env_vars - ); - tracing::trace!( - "[perform_build({})]: Compile command: {:?} {:?}", - job_id, - compile_command.executable, - compile_command.arguments - ); + tracing::trace!("[perform_build({job_id})]: Compile environment: {env_vars:?}"); + tracing::trace!("[perform_build({job_id})]: Compile command: {executable:?} {arguments:?}"); - tracing::trace!("[perform_build({})]: copying in inputs", job_id); - // not elegant - // Read into memory because we can't use asyncio in the thread below. 
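
Across all three builders, the new `inputs` argument is a zlib-compressed tar stream; here it is decoded with `ZlibDecoderAsync` and unpacked by `async_tar` just below. A sketch of producing such a payload on the sending side (the helper name and file list are illustrative, not part of the patch):

    use flate2::{write::ZlibEncoder, Compression};

    // Pack (path, contents) pairs into the zlib-compressed tar format
    // that the builders unpack into their build roots.
    fn pack_inputs(files: &[(&str, &[u8])]) -> std::io::Result<Vec<u8>> {
        let encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        let mut builder = tar::Builder::new(encoder);
        for (path, contents) in files {
            let mut header = tar::Header::new_gnu();
            header.set_size(contents.len() as u64);
            header.set_mode(0o644);
            builder.append_data(&mut header, path, *contents)?;
        }
        builder.into_inner()?.finish()
    }
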
- let mut inputs_buf = vec![]; - inputs_rdr.read_to_end(&mut inputs_buf).await?; - tar::Archive::new(inputs_buf.reader()) - .unpack(pot_fs_root.join("jails").join(cid).join("m")) - .context("Failed to unpack inputs to pot")?; + if output_paths.is_empty() { + bail!("output_paths is empty"); + } + + tracing::trace!("[perform_build({job_id})]: copying in inputs"); + + let jail_root = pot_fs_root.join("jails").join(cid).join("m"); + + // Copy inputs to jail_root + { + let reader = inputs.reader(); + let reader = futures::io::AllowStdIo::new(reader); + let reader = ZlibDecoderAsync::new(reader.compat()); + async_tar::Archive::new(reader.compat()) + .unpack(&jail_root) + .await + .context("Failed to unpack inputs to tempdir")?; + } - let CompileCommand { - executable, - arguments, - env_vars, - cwd, - } = compile_command; let cwd = Path::new(&cwd); - tracing::trace!("[perform_build({})]: creating output directories", job_id); - assert!(!output_paths.is_empty()); + tracing::trace!("[perform_build({job_id})]: creating output directories"); + + // Resolve output paths relative to cwd since we copy them from the root level + let output_paths_absolute = output_paths + .iter() + .map(|path| { + let path = Path::new(path); + if path.is_absolute() { + path.to_path_buf() + } else { + cwd.join(path) + } + }) + .collect::>(); + let mut cmd = tokio::process::Command::new("jexec"); - cmd.args(&[cid, "mkdir", "-p"]).arg(cwd); - for path in output_paths.iter() { + + cmd.args([cid, "mkdir", "-p"]).arg(cwd); + + for path in output_paths_absolute.iter() { // If it doesn't have a parent, nothing needs creating - let output_parent = if let Some(p) = Path::new(path).parent() { - p - } else { - continue; - }; - cmd.arg(cwd.join(output_parent)); + if let Some(path) = path.parent() { + cmd.arg(path); + } } + cmd.check_run() .await .context("Failed to create directories required for compile in container")?; - // Guard compiling until we get a token from the job queue - let _token = job_queue.acquire().await?; + tracing::trace!("[perform_build({job_id})]: performing compile"); - tracing::trace!("[perform_build({})]: performing compile", job_id); // TODO: likely shouldn't perform the compile as root in the container let mut cmd = tokio::process::Command::new("jexec"); cmd.arg(cid); cmd.arg("env"); for (k, v) in env_vars { if k.contains('=') { - tracing::warn!( - "[perform_build({})]: Skipping environment variable: {:?}", - job_id, - k - ); + tracing::warn!("[perform_build({job_id})]: Skipping environment variable: {k:?}"); continue; } let mut env = k; @@ -443,7 +474,7 @@ impl PotBuilder { cmd.arg(env); } let shell_cmd = "cd \"$1\" && shift && exec \"$@\""; - cmd.args(&["sh", "-c", shell_cmd]); + cmd.args(["sh", "-c", shell_cmd]); cmd.arg(&executable); cmd.arg(cwd); cmd.arg(executable); @@ -452,19 +483,26 @@ impl PotBuilder { .output() .await .context("Failed to start executing compile")?; - tracing::trace!( - "[perform_build({})]: compile_output: {:?}", - job_id, - compile_output - ); + + if !compile_output.status.success() { + tracing::warn!( + "[perform_build({job_id})]: compile output:\n===========\nstdout:\n{}\n==========\n\n=========\nstderr:\n{}\n===============\n\n", + String::from_utf8_lossy(&compile_output.stdout), + String::from_utf8_lossy(&compile_output.stderr) + ); + } else { + tracing::trace!("[perform_build({job_id})]: compile output: {compile_output:?}"); + } let mut outputs = vec![]; - tracing::trace!("[perform_build({})]: retrieving {:?}", job_id, output_paths); - for path in output_paths { - let abspath 
= cwd.join(&path); // Resolve in case it's relative since we copy it from the root level - // TODO: this isn't great, but cp gives it out as a tar + + tracing::trace!("[perform_build({job_id})]: retrieving {output_paths:?}"); + + for (path, abspath) in output_paths.iter().zip(output_paths_absolute.iter()) { + // TODO: this isn't great, but cp gives it out as a tar + let output = tokio::process::Command::new("jexec") - .args(&[cid, "cat"]) + .args([cid, "cat"]) .arg(abspath) .output() .await @@ -472,18 +510,17 @@ impl PotBuilder { if output.status.success() { let output = OutputData::try_from_reader(&*output.stdout) .expect("Failed to read compress output stdout"); - outputs.push((path, output)) + outputs.push((path.clone(), output)) } else { tracing::debug!( - "[perform_build({})]: Missing output path {:?}", - job_id, - path + "[perform_build({job_id})]: Missing output path {path:?} ({abspath:?})" ) } } let compile_output = ProcessOutput::try_from(compile_output) .context("Failed to convert compilation exit status")?; + Ok(BuildResult { output: compile_output, outputs, @@ -496,57 +533,69 @@ impl BuilderIncoming for PotBuilder { // From Server async fn run_build( &self, - job_id: JobId, - tc: Toolchain, + job_id: &str, + toolchain_dir: &Path, command: CompileCommand, outputs: Vec, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - tccache: &Mutex, - job_queue: &tokio::sync::Semaphore, + inputs: Vec, ) -> Result { tracing::debug!("[run_build({})]: Finding container", job_id); + let cid = self - .get_container(job_id, &tc, tccache) + .get_container(job_id, &self.toolchain_base_dir, toolchain_dir) .await .context("Failed to get a container for build")?; + tracing::debug!( "[run_build({})]: Performing build with container {}", job_id, cid ); - let res = Self::perform_build( + + let res = + Self::perform_build(job_id, command, outputs, inputs, &cid, &self.pot_fs_root).await; + + // Unwrap the result + let res = res.context("Failed to perform build")?; + tracing::debug!("[run_build({})]: Finishing with container {}", job_id, cid); + + Self::finish_container( job_id, - command, - inputs_rdr, - outputs, + self.container_lists.clone(), + toolchain_dir, &cid, - &self.pot_fs_root, - job_queue, + &self.pot_cmd, ) .await; - // Unwrap the result - let res = res.context("Failed to perform build")?; - tracing::debug!("[run_build({})]: Finishing with container {}", job_id, cid); - let cloned = self.clone(); - let tc = tc; - while cloned.cleanup_thread_count.fetch_add(1, Ordering::SeqCst) - > self.max_cleanup_thread_count - { - cloned.cleanup_thread_count.fetch_sub(1, Ordering::SeqCst); - hint::spin_loop(); - } - let runtime = tokio::runtime::Handle::current(); - // - // Don't await the spawn future so cleanup happens in the background. - // - // TODO: This seems like many background cleanup threads could occupy - // many of the threads in tokio's threadpool. Maybe this should - // be awaited? How expensive is `Self::finish_container()`? 
- runtime.spawn(async move { - Self::finish_container(job_id, cloned.container_lists, tc, cid, &cloned.pot_cmd).await; - cloned.cleanup_thread_count.fetch_sub(1, Ordering::SeqCst); - }); + + // let cloned = self.clone(); + + // while cloned.cleanup_thread_count.fetch_add(1, Ordering::SeqCst) + // > self.max_cleanup_thread_count + // { + // cloned.cleanup_thread_count.fetch_sub(1, Ordering::SeqCst); + // hint::spin_loop(); + // } + + // let runtime = tokio::runtime::Handle::current(); + // // + // // Don't await the spawn future so cleanup happens in the background. + // // + // // TODO: This seems like many background cleanup threads could occupy + // // many of the threads in tokio's threadpool. Maybe this should + // // be awaited? How expensive is `Self::finish_container()`? + // runtime.spawn({ + // let cid = cid.clone(); + // let job_id = job_id.to_owned(); + // let toolchain_dir = toolchain_dir.to_path_buf(); + // async move { + // Self::finish_container(&job_id, cloned.container_lists, &toolchain_dir, &cid, &cloned.pot_cmd).await; + // cloned.cleanup_thread_count.fetch_sub(1, Ordering::SeqCst); + // } + // }); + tracing::debug!("[run_build({})]: Returning result", job_id); + Ok(res) } } diff --git a/src/bin/sccache-dist/cmdline/mod.rs b/src/bin/sccache-dist/cmdline/mod.rs index fafe3902c1..108a7713c3 100644 --- a/src/bin/sccache-dist/cmdline/mod.rs +++ b/src/bin/sccache-dist/cmdline/mod.rs @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use sccache::{config, dist::ServerId}; +use sccache::config; mod parse; @@ -21,18 +21,6 @@ pub use parse::try_parse_from; #[derive(Debug)] pub enum Command { - Auth(AuthSubcommand), Scheduler(config::scheduler::Config), Server(config::server::Config), } - -#[derive(Debug, PartialEq, Eq)] -pub enum AuthSubcommand { - Base64 { - num_bytes: usize, - }, - JwtHS256ServerToken { - secret_key: String, - server_id: ServerId, - }, -} diff --git a/src/bin/sccache-dist/cmdline/parse.rs b/src/bin/sccache-dist/cmdline/parse.rs index 54a990fb5f..f0ae0fcab0 100644 --- a/src/bin/sccache-dist/cmdline/parse.rs +++ b/src/bin/sccache-dist/cmdline/parse.rs @@ -13,45 +13,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
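
With the `auth` subcommand removed, parsing below reduces to loading one config per subcommand through the new `Config::load`, which returns a `Result` instead of the `Option` that `from_path` produced. The before/after shape, sketched (mirroring the hunks later in this file):

    // Before: Option-returning loader, manual bail on None.
    // if let Some(config) = config::scheduler::from_path(config_path)? {
    //     Command::Scheduler(config)
    // } else {
    //     bail!("Could not load config")
    // }

    // After: Result-returning loader composes with `?` and context.
    Command::Scheduler(
        config::scheduler::Config::load(Some(config_path.clone()))
            .with_context(|| "Could not load config")?,
    )
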
-use std::{env, ffi::OsString, fmt, net::SocketAddr, path::PathBuf, str::FromStr}; +use std::{env, ffi::OsString, path::PathBuf, str::FromStr}; -use anyhow::{anyhow, bail}; -use clap::{Arg, ArgGroup, Command as ClapCommand, ValueEnum}; -use sccache::{config, dist::ServerId}; +use anyhow::{bail, Context}; +use clap::{Arg, Command as ClapCommand, ValueEnum}; +use sccache::config; use syslog::Facility; -use crate::cmdline::{AuthSubcommand, Command}; - -#[derive(Debug, Clone)] -struct TokenLength(usize); - -impl TokenLength { - fn as_bytes(&self) -> usize { - self.0 / 8 - } - - fn from_bits(bits: &str) -> anyhow::Result { - let bits: usize = bits.parse()?; - - if bits & 0x7 != 0 { - Err(anyhow!("Number of bits must be divisible by 8")) - } else if bits < 64 { - Err(anyhow!( - "Number of bits must be greater than or equal to 64" - )) - } else if bits > 4_096 { - Err(anyhow!("Number of bits must be less than or equal to 4096")) - } else { - Ok(Self(bits)) - } - } -} - -impl fmt::Display for TokenLength { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0) - } -} +use crate::cmdline::Command; #[derive(Clone, Copy, ValueEnum)] enum LogLevel { @@ -110,41 +79,6 @@ fn get_clap_command() -> ClapCommand { ClapCommand::new(env!("CARGO_PKG_NAME")) .version(env!("CARGO_PKG_VERSION")) .subcommand_required(true) - .subcommand( - ClapCommand::new("auth") - .subcommand_required(true) - .subcommand(ClapCommand::new("generate-jwt-hs256-key")) - .subcommand( - ClapCommand::new("generate-jwt-hs256-server-token") - .args(&[ - flag_infer_long("server") - .help("Generate a key for the specified server") - .value_name("SERVER_ADDR") - .value_parser(clap::value_parser!(SocketAddr)) - .required(true), - flag_infer_long("secret-key") - .help("Use specified key to create the token") - .value_name("KEY"), - config_with_help_message( - "Use the key from the scheduler config file at PATH", - ), - ]) - .group( - ArgGroup::new("key_source_mutual_exclusion") - .args(["config", "secret-key"]) - .required(true), - ), - ) - .subcommand( - ClapCommand::new("generate-shared-token").arg( - flag_infer_long("bits") - .help("Use the specified number of bits of randomness") - .value_name("BITS") - .default_value("256") - .value_parser(TokenLength::from_bits), - ), - ), - ) .subcommand(ClapCommand::new("scheduler").args(&[ config_with_help_message("Use the scheduler config file at PATH").required(true), syslog.clone(), @@ -167,53 +101,6 @@ pub fn try_parse_from( let matches = get_clap_command().try_get_matches_from(args)?; Ok(match matches.subcommand() { - Some(("auth", matches)) => Command::Auth(match matches.subcommand() { - // Size based on https://briansmith.org/rustdoc/ring/hmac/fn.recommended_key_len.html - Some(("generate-jwt-hs256-key", _)) => AuthSubcommand::Base64 { num_bytes: 256 / 8 }, - Some(("generate-jwt-hs256-server-token", matches)) => { - let server_addr = matches - .get_one("server") - .expect("`server` is required and it can be parsed to a `SocketAddr`"); - let server_id = ServerId::new(*server_addr); - - let secret_key = if matches.contains_id("config") { - let config_path = matches - .get_one::("config") - .expect("`config` is required and it can be parsed to a `PathBuf`"); - if let Some(config) = config::scheduler::from_path(config_path)? { - match config.server_auth { - config::scheduler::ServerAuth::JwtHS256 { secret_key } => secret_key, - config::scheduler::ServerAuth::Insecure - | config::scheduler::ServerAuth::Token { .. 
} => { - bail!("Scheduler not configured with JWT HS256") - } - } - } else { - bail!("Could not load config") - } - } else { - matches - .get_one::("secret-key") - .expect("`secret-key` is required") - .to_string() - }; - - AuthSubcommand::JwtHS256ServerToken { - secret_key, - server_id, - } - } - Some(("generate-shared-token", matches)) => { - let token_bits = matches - .get_one::("bits") - .expect("clap provides default"); - - AuthSubcommand::Base64 { - num_bytes: token_bits.as_bytes(), - } - } - _ => unreachable!("Subcommand is enforced by clap"), - }), Some(("scheduler", matches)) => { if matches.contains_id("syslog") { let log_level = matches @@ -225,11 +112,11 @@ pub fn try_parse_from( let config_path = matches .get_one::("config") .expect("`config` is required"); - if let Some(config) = config::scheduler::from_path(config_path)? { - Command::Scheduler(config) - } else { - bail!("Could not load config") - } + + Command::Scheduler( + config::scheduler::Config::load(Some(config_path.clone())) + .with_context(|| "Could not load config")?, + ) } Some(("server", matches)) => { if matches.contains_id("syslog") { @@ -242,118 +129,12 @@ pub fn try_parse_from( let config_path = matches .get_one::("config") .expect("`config` is required"); - if let Some(config) = config::server::from_path(config_path)? { - Command::Server(config) - } else { - bail!("Could not load config") - } + + Command::Server( + config::server::Config::load(Some(config_path.clone())) + .with_context(|| "Could not load config")?, + ) } _ => unreachable!("Subcommand is enforced by clap"), }) } - -#[cfg(test)] -mod tests { - use super::*; - - const EXE: &str = "sccache-dist"; - - fn auth_generate_shared_tokens_bits(bit_val: &'static str) -> Vec<&'static str> { - vec![EXE, "auth", "generate-shared-token", "--bits", bit_val] - } - - fn auth_generate_jwt_hs256_server_token(subcommand_args: &[&'static str]) -> Vec<&'static str> { - let mut args = vec![EXE, "auth", "generate-jwt-hs256-server-token"]; - args.extend_from_slice(subcommand_args); - args - } - - #[test] - fn debug_assert() { - get_clap_command().debug_assert() - } - - #[test] - fn missing_required_subcommands_fails() { - let args_sets = &[vec![EXE], vec![EXE, "auth"]]; - - for args in args_sets { - assert!(try_parse_from(args).is_err()); - } - } - - #[test] - fn invalid_token_bits_fails() { - let invalid_vals = vec!["not_a_num", "58", "8000", "70"]; - - for invalid_val in invalid_vals { - let args = auth_generate_shared_tokens_bits(invalid_val); - assert!(try_parse_from(args).is_err()); - } - } - - #[test] - fn auth_generate_server_token_needs_key_source() { - let server_args = &["--server", "127.0.0.1:4321"]; - - let no_key = auth_generate_jwt_hs256_server_token(server_args); - assert!(try_parse_from(no_key).is_err()); - - let mut too_many_keys = auth_generate_jwt_hs256_server_token(server_args); - too_many_keys.extend_from_slice(&["--secret-key", "secret", "--config", "some/path.toml"]); - assert!(try_parse_from(too_many_keys).is_err()); - } - - // This is all just to work around `PartialEq` not being on some of the values used in variants - // for `Command` yet - fn assert_args_parse_to_auth(args: Vec<&'static str>, ideal_auth: AuthSubcommand) { - match try_parse_from(&args) { - Ok(Command::Auth(auth)) => assert_eq!(auth, ideal_auth), - _ => panic!("Bad parsing for: {:#?}", args), - } - } - - #[test] - fn auth_generate_jwt_hs256_key_good() { - let args = vec![EXE, "auth", "generate-jwt-hs256-key"]; - - assert_args_parse_to_auth(args, AuthSubcommand::Base64 { 
num_bytes: 256 / 8 });
-    }
-
-    #[test]
-    fn auth_generate_jwt_hs256_server_token_good() {
-        let base = auth_generate_jwt_hs256_server_token(&["--server", "127.0.0.1:4321"]);
-        let server_socket: SocketAddr = "127.0.0.1:4321".parse().unwrap();
-        let server_id = ServerId::new(server_socket);
-
-        let mut secret_key = base.clone();
-        secret_key.extend_from_slice(&["--secret-key", "very secret"]);
-        assert_args_parse_to_auth(
-            secret_key,
-            AuthSubcommand::JwtHS256ServerToken {
-                server_id,
-                secret_key: "very secret".to_owned(),
-            },
-        );
-    }
-
-    #[test]
-    fn auth_generate_shared_token_good() {
-        let raw_to_expected_bit_vals = &[
-            ("64", 64 / 8),
-            ("128", 128 / 8),
-            ("136", 136 / 8),
-            ("4000", 4_000 / 8),
-        ];
-
-        for (raw, expected) in raw_to_expected_bit_vals {
-            let args = auth_generate_shared_tokens_bits(raw);
-            assert_args_parse_to_auth(
-                args,
-                AuthSubcommand::Base64 {
-                    num_bytes: *expected,
-                },
-            );
-        }
-    }
-}
diff --git a/src/bin/sccache-dist/main.rs b/src/bin/sccache-dist/main.rs
index fc96288013..2661d7ab91 100644
--- a/src/bin/sccache-dist/main.rs
+++ b/src/bin/sccache-dist/main.rs
@@ -1,42 +1,42 @@
-use anyhow::{bail, Context, Error, Result};
 use async_trait::async_trait;
-use base64::Engine;
+
+use celery::prelude::*;
+use celery::protocol::MessageContentType;
 use futures::lock::Mutex;
 use futures::FutureExt;
-use itertools::Itertools;
-use rand::{rngs::OsRng, RngCore};
 use sccache::config::{
-    scheduler as scheduler_config, server as server_config, INSECURE_DIST_CLIENT_TOKEN,
+    scheduler as scheduler_config, server as server_config, MessageBroker,
+    INSECURE_DIST_CLIENT_TOKEN,
 };
-use sccache::dist::http::{get_dist_request_timeout, HEARTBEAT_ERROR_INTERVAL, HEARTBEAT_INTERVAL};
+
 use sccache::dist::{
-    self, AllocJobResult, AssignJobResult, BuilderIncoming, CompileCommand, HeartbeatServerResult,
-    JobAlloc, JobAuthorizer, JobComplete, JobId, RunJobResult, SchedulerIncoming,
-    SchedulerOutgoing, SchedulerStatusResult, ServerId, ServerIncoming, ServerNonce,
-    ServerOutgoing, ServerStatusResult, SubmitToolchainResult, TcCache, Toolchain,
-    UpdateJobStateResult,
+    self, BuildResult, CompileCommand, NewJobRequest, NewJobResponse, RunJobRequest,
+    RunJobResponse, SchedulerService, SchedulerStatusResult, ServerService, SubmitToolchainResult,
+    Toolchain,
 };
 use sccache::util::daemonize;
-use sccache::util::BASE64_URL_SAFE_ENGINE;
-use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::env;
-use std::path::Path;
-use std::sync::atomic::AtomicUsize;
+use std::pin::Pin;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+
+use tokio::sync::OnceCell;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 #[cfg_attr(target_os = "freebsd", path = "build_freebsd.rs")]
 mod build;
 mod cmdline;
+use cmdline::Command;
 mod token_check;
-use cmdline::{AuthSubcommand, Command};
+use crate::dist::ServerToolchains;
+use sccache::errors::*;
-pub const INSECURE_DIST_SERVER_TOKEN: &str = "dangerously_insecure_server";
+static SERVER: OnceCell<Box<dyn ServerService>> = OnceCell::const_new();
+static SCHEDULER: OnceCell<Arc<dyn SchedulerService>> = OnceCell::const_new();
 // Only supported on x86_64/aarch64 Linux machines and on FreeBSD
 #[cfg(any(
@@ -73,7 +73,7 @@ fn main() {
     };
     std::process::exit(match run(command) {
-        Ok(s) => s,
+        Ok(_) => 0,
         Err(e) => {
             eprintln!("sccache-dist: error: {}", e);
@@ -85,193 +85,177 @@
     });
 }
-fn create_server_token(server_id: ServerId,
auth_token: &str) -> String { - format!("{} {}", server_id.addr(), auth_token) -} -fn check_server_token(server_token: &str, auth_token: &str) -> Option { - let mut split = server_token.splitn(2, |c| c == ' '); - let server_addr = split.next().and_then(|addr| addr.parse().ok())?; - match split.next() { - Some(t) if t == auth_token => Some(ServerId::new(server_addr)), - Some(_) | None => None, - } -} - -#[derive(Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct ServerJwt { - exp: u64, - server_id: ServerId, -} -fn create_jwt_server_token( - server_id: ServerId, - header: &jwt::Header, - key: &[u8], -) -> Result { - let key = jwt::EncodingKey::from_secret(key); - jwt::encode(header, &ServerJwt { exp: 0, server_id }, &key).map_err(Into::into) -} -fn dangerous_insecure_extract_jwt_server_token(server_token: &str) -> Result { - let validation = { - let mut validation = jwt::Validation::default(); - validation.validate_exp = false; - validation.validate_nbf = false; - validation.insecure_disable_signature_validation(); - validation - }; - let dummy_key = jwt::DecodingKey::from_secret(b"secret"); - jwt::decode::(server_token, &dummy_key, &validation) - .map(|res| res.claims.server_id) - .map_err(Into::into) -} -fn check_jwt_server_token( - server_token: &str, - key: &[u8], - validation: &jwt::Validation, -) -> Option { - let key = jwt::DecodingKey::from_secret(key); - jwt::decode::(server_token, &key, validation) - .map(|res| res.claims.server_id) - .ok() -} +fn run(command: Command) -> Result<()> { + let num_cpus = std::thread::available_parallelism()?.get(); -fn run(command: Command) -> Result { match command { - Command::Auth(AuthSubcommand::Base64 { num_bytes }) => { - let mut bytes = vec![0; num_bytes]; - OsRng.fill_bytes(&mut bytes); - // As long as it can be copied, it doesn't matter if this is base64 or hex etc - println!("{}", BASE64_URL_SAFE_ENGINE.encode(bytes)); - Ok(0) - } - Command::Auth(AuthSubcommand::JwtHS256ServerToken { - secret_key, - server_id, - }) => { - let header = jwt::Header::new(jwt::Algorithm::HS256); - let secret_key = BASE64_URL_SAFE_ENGINE.decode(secret_key)?; - let token = create_jwt_server_token(server_id, &header, &secret_key) - .context("Failed to create server token")?; - println!("{}", token); - Ok(0) - } - Command::Scheduler(scheduler_config::Config { - public_addr, + enable_web_socket_server, client_auth, - server_auth, - remember_server_error_timeout, + job_time_limit, + max_body_size, + message_broker, + public_addr, + toolchains_fallback, + toolchains, }) => { - let check_client_auth: Box = match client_auth { - scheduler_config::ClientAuth::Insecure => Box::new(token_check::EqCheck::new( - INSECURE_DIST_CLIENT_TOKEN.to_owned(), - )), - scheduler_config::ClientAuth::Token { token } => { - Box::new(token_check::EqCheck::new(token)) - } - scheduler_config::ClientAuth::JwtValidate { - audience, - issuer, - jwks_url, - } => Box::new( - token_check::ValidJWTCheck::new(audience, issuer, &jwks_url) - .context("Failed to create a checker for valid JWTs")?, - ), - scheduler_config::ClientAuth::Mozilla { required_groups } => { - Box::new(token_check::MozillaCheck::new(required_groups)) - } - scheduler_config::ClientAuth::ProxyToken { url, cache_secs } => { - Box::new(token_check::ProxyTokenCheck::new(url, cache_secs)) - } - }; - - let check_server_auth: dist::http::ServerAuthCheck = match server_auth { - scheduler_config::ServerAuth::Insecure => { - tracing::warn!( - "Scheduler starting with DANGEROUSLY_INSECURE server authentication" - ); - let 
token = INSECURE_DIST_SERVER_TOKEN;
-                    Box::new(move |server_token| check_server_token(server_token, token))
-                }
-                scheduler_config::ServerAuth::Token { token } => {
-                    Box::new(move |server_token| check_server_token(server_token, &token))
-                }
-                scheduler_config::ServerAuth::JwtHS256 { secret_key } => {
-                    let secret_key = BASE64_URL_SAFE_ENGINE
-                        .decode(secret_key)
-                        .context("Secret key base64 invalid")?;
-                    if secret_key.len() != 256 / 8 {
-                        bail!("Size of secret key incorrect")
-                    }
-                    let validation = {
-                        let mut validation = jwt::Validation::new(jwt::Algorithm::HS256);
-                        validation.leeway = 0;
-                        validation.validate_exp = false;
-                        validation.validate_nbf = false;
-                        validation
-                    };
-                    Box::new(move |server_token| {
-                        check_jwt_server_token(server_token, &secret_key, &validation)
-                    })
-                }
-            };
-
-            daemonize()?;
+            let broker_uri =
+                match message_broker.expect("Missing required message broker configuration") {
+                    MessageBroker::AMQP(uri) => uri,
+                    MessageBroker::Redis(uri) => uri,
+                };

             let runtime = tokio::runtime::Builder::new_multi_thread()
                 .enable_all()
                 .build()?;

-            let scheduler = Scheduler::new(remember_server_error_timeout);
-
-            let http_scheduler = dist::http::Scheduler::new(
-                public_addr,
-                scheduler,
-                check_client_auth,
-                check_server_auth,
-            );
+            let toolchain_storage = sccache::cache::cache::storage_from_config(
+                &toolchains,
+                &toolchains_fallback,
+                runtime.handle(),
+            )
+            .context("Failed to initialize toolchain storage")?;

             runtime.block_on(async move {
-                match http_scheduler.start().await {
-                    Ok(_) => {}
-                    Err(err) => panic!("Err: {err}"),
-                }
-            });
+                let scheduler_id = format!(
+                    "sccache-dist-scheduler-{}",
+                    uuid::Uuid::new_v4().to_u128_le()
+                );
+
+                let task_queue = Arc::new(
+                    celery::CeleryBuilder::new("sccache-dist", &broker_uri)
+                        .default_queue(&scheduler_id)
+                        .task_content_type(MessageContentType::MsgPack)
+                        .task_route("scheduler_build_failed", &scheduler_id)
+                        .task_route("scheduler_build_success", &scheduler_id)
+                        .task_route("server_run_build", "sccache-dist-server")
+                        .prefetch_count(100 * num_cpus as u16)
+                        .heartbeat(Some(10))
+                        .acks_late(true)
+                        .acks_on_failure_or_timeout(false)
+                        .nacks_enabled(true)
+                        .build()
+                        .await
+                        .unwrap(),
+                );
+
+                task_queue
+                    .register_task::<scheduler_build_failed>()
+                    .await
+                    .unwrap();
+
+                task_queue
+                    .register_task::<scheduler_build_success>()
+                    .await
+                    .unwrap();
+
+                let scheduler = Arc::new(Scheduler::new(
+                    job_time_limit,
+                    scheduler_id.clone(),
+                    task_queue.clone(),
+                    toolchain_storage,
+                ));
+
+                SCHEDULER
+                    .set(scheduler.clone())
+                    .map_err(|e| anyhow!(e.to_string()))?;
+
+                let server = dist::server::Scheduler::new(
+                    scheduler,
+                    match client_auth {
+                        scheduler_config::ClientAuth::Insecure => Box::new(
+                            token_check::EqCheck::new(INSECURE_DIST_CLIENT_TOKEN.to_owned()),
+                        ),
+                        scheduler_config::ClientAuth::Token { token } => {
+                            Box::new(token_check::EqCheck::new(token))
+                        }
+                        scheduler_config::ClientAuth::JwtValidate {
+                            audience,
+                            issuer,
+                            jwks_url,
+                        } => Box::new(
+                            token_check::ValidJWTCheck::new(audience, issuer, &jwks_url)
+                                .context("Failed to create a checker for valid JWTs")?,
+                        ),
+                        scheduler_config::ClientAuth::Mozilla { required_groups } => {
+                            Box::new(token_check::MozillaCheck::new(required_groups))
+                        }
+                        scheduler_config::ClientAuth::ProxyToken { url, cache_secs } => {
+                            Box::new(token_check::ProxyTokenCheck::new(url, cache_secs))
+                        }
+                    },
+                );
+
+                task_queue.display_pretty().await;
+
+                daemonize()?;
+
+                let cancel = tokio::signal::ctrl_c();
+                let celery = task_queue.consume();
+                let server = server.serve(public_addr, enable_web_socket_server,
max_body_size);
+
+                futures::select! {
+                    res = cancel.fuse() => res?,
+                    res = celery.fuse() => res?,
+                    res = server.fuse() => res?,
+                };

-            unreachable!();
+                task_queue.close().await?;
+
+                Ok::<(), anyhow::Error>(())
+            })
         }
         Command::Server(server_config::Config {
+            message_broker,
             builder,
             cache_dir,
-            public_addr,
-            bind_addr,
-            scheduler_url,
-            scheduler_auth,
-            toolchain_cache_size,
             max_per_core_load,
             num_cpus_to_ignore,
+            toolchain_cache_size,
+            toolchains,
+            toolchains_fallback,
         }) => {
-            let bind_addr = bind_addr.unwrap_or(public_addr);
-            let num_cpus =
-                (std::thread::available_parallelism().unwrap().get() - num_cpus_to_ignore).max(1);
+            let num_cpus = (num_cpus - num_cpus_to_ignore).max(1) as f64;
+            let num_cpus = (num_cpus * max_per_core_load).floor().max(1f64) as u16;

-            tracing::debug!("Server num_cpus={num_cpus}");
+            let broker_uri =
+                match message_broker.expect("Missing required message broker configuration") {
+                    MessageBroker::AMQP(uri) => uri,
+                    MessageBroker::Redis(uri) => uri,
+                };

             let runtime = tokio::runtime::Builder::new_multi_thread()
                 .enable_all()
                 .build()?;

+            let toolchain_base_dir = cache_dir.join("tc");
+
+            let toolchain_storage = sccache::cache::cache::storage_from_config(
+                &toolchains,
+                &toolchains_fallback,
+                runtime.handle(),
+            )
+            .context("Failed to initialize toolchain storage")?;
+
+            let toolchains_disk_cache = Arc::new(Mutex::new(ServerToolchains::new(
+                &toolchain_base_dir,
+                toolchain_cache_size,
+                toolchain_storage,
+            )));

             runtime.block_on(async move {
                 let builder: Box<dyn BuilderIncoming> = match builder {
                     #[cfg(not(target_os = "freebsd"))]
-                    server_config::BuilderType::Docker => Box::new(
+                    sccache::config::server::BuilderType::Docker => Box::new(
                         build::DockerBuilder::new()
                             .await
                             .context("Docker builder failed to start")?,
                     ),
                     #[cfg(not(target_os = "freebsd"))]
-                    server_config::BuilderType::Overlay {
+                    sccache::config::server::BuilderType::Overlay {
                         bwrap_path,
                         build_dir,
                     } => Box::new(
@@ -280,15 +264,22 @@
                             .context("Overlay builder failed to start")?,
                     ),
                     #[cfg(target_os = "freebsd")]
-                    server_config::BuilderType::Pot {
+                    sccache::config::server::BuilderType::Pot {
                         pot_fs_root,
                         clone_from,
                         pot_cmd,
                         pot_clone_args,
                     } => Box::new(
-                        build::PotBuilder::new(pot_fs_root, clone_from, pot_cmd, pot_clone_args)
-                            .await
-                            .context("Pot builder failed to start")?,
+                        build::PotBuilder::new(
+                            &toolchain_base_dir,
+                            pot_fs_root,
+                            clone_from,
+                            pot_cmd,
+                            pot_clone_args,
+                        )
+                        .await
+                        .context("Pot builder failed to start")?,
                     ),
                     _ => bail!(
                         "Builder type `{}` not supported on this platform",
@@ -299,53 +290,43 @@
                     ),
                 };

-                let server_id = ServerId::new(public_addr);
-                let scheduler_auth = match scheduler_auth {
-                    server_config::SchedulerAuth::Insecure => {
-                        tracing::warn!(
-                            "Server starting with DANGEROUSLY_INSECURE scheduler authentication"
-                        );
-                        create_server_token(server_id, INSECURE_DIST_SERVER_TOKEN)
-                    }
-                    server_config::SchedulerAuth::Token { token } => {
-                        create_server_token(server_id, &token)
-                    }
-                    server_config::SchedulerAuth::JwtToken { token } => {
-                        let token_server_id: ServerId =
-                            dangerous_insecure_extract_jwt_server_token(&token)
-                                .context("Could not decode scheduler auth jwt")?;
-                        if token_server_id != server_id {
-                            bail!(
-                                "JWT server id ({:?}) did not match configured server id ({:?})",
-                                token_server_id,
-                                server_id
-                            )
-                        }
-                        token
-                    }
-                };
+                let task_queue = Arc::new(
celery::CeleryBuilder::new("sccache-dist", &broker_uri)
+                        .default_queue("sccache-dist-server")
+                        .task_content_type(MessageContentType::MsgPack)
+                        .prefetch_count(num_cpus)
+                        .heartbeat(Some(10))
+                        .acks_late(true)
+                        .acks_on_failure_or_timeout(false)
+                        .nacks_enabled(true)
+                        .build()
+                        .await?,
+                );

-                let server = Server::new(builder, &cache_dir, toolchain_cache_size, num_cpus)
-                    .context("Failed to create sccache server instance")?;
-
-                let http_server = dist::http::Server::new(
-                    public_addr,
-                    bind_addr,
-                    scheduler_url.to_url(),
-                    scheduler_auth,
-                    max_per_core_load,
-                    num_cpus,
-                    server,
-                )
-                .context("Failed to create sccache HTTP server instance")?;
+                task_queue.register_task::<server_run_build>().await?;

-                match http_server.start().await {
-                    Ok(_) => Ok(()),
-                    Err(err) => panic!("Err: {err}"),
-                }
-            })?;
+                SERVER
+                    .set(Box::new(Server::new(
+                        builder,
+                        task_queue.clone(),
+                        toolchains_disk_cache,
+                    )))
+                    .map_err(|err| anyhow!("{err}"))?;
+
+                tracing::debug!(
+                    "sccache: Server initialized to run {num_cpus} parallel build jobs"
+                );
+
+                task_queue.display_pretty().await;
+
+                daemonize()?;
+
+                task_queue.consume().await?;

-            unreachable!();
+                task_queue.close().await?;
+
+                Ok(())
+            })
         }
     }
 }
@@ -375,636 +356,328 @@ fn init_logging() {
     }
 }

 pub struct Scheduler {
-    remember_server_error_timeout: Duration,
-    servers: Mutex<HashMap<ServerId, ServerDetails>>,
-}
-
-struct ServerDetails {
-    last_seen: Instant,
-    last_error: Option<Instant>,
-    num_cpus: usize,
-    max_per_core_load: f64,
-    server_nonce: ServerNonce,
-    job_authorizer: Box<dyn JobAuthorizer>,
-    num_assigned_jobs: usize,
-    num_active_jobs: usize,
+    id: String,
+    job_time_limit: u32,
+    job_result: Arc<Mutex<HashMap<String, tokio::sync::oneshot::Sender<RunJobResponse>>>>,
+    task_queue: Arc<celery::Celery>,
+    toolchains: Arc<dyn sccache::cache::Storage>,
 }

 impl Scheduler {
-    pub fn new(remember_server_error_timeout: u64) -> Self {
-        Scheduler {
-            remember_server_error_timeout: Duration::from_secs(remember_server_error_timeout),
-            servers: Mutex::new(HashMap::new()),
-        }
-    }
-
-    fn prune_servers(&self, servers: &mut HashMap<ServerId, ServerDetails>) {
-        let now = Instant::now();
-
-        let mut dead_servers = Vec::new();
-
-        for (&server_id, server) in servers.iter_mut() {
-            if now.duration_since(server.last_seen) > dist::http::HEARTBEAT_TIMEOUT {
-                dead_servers.push(server_id);
-            }
-        }
-
-        for server_id in dead_servers {
-            tracing::warn!(
-                "[prune_servers({})]: Server appears to be dead, pruning it in the scheduler",
-                server_id.addr()
-            );
-            servers.remove(&server_id);
+    pub fn new(
+        job_time_limit: u32,
+        scheduler_id: String,
+        task_queue: Arc<celery::Celery>,
+        toolchains: Arc<dyn sccache::cache::Storage>,
+    ) -> Self {
+        Self {
+            id: scheduler_id,
+            job_time_limit,
+            job_result: Arc::new(Mutex::new(HashMap::new())),
+            task_queue,
+            toolchains,
         }
     }
 }

-impl Default for Scheduler {
-    fn default() -> Self {
-        Self::new(scheduler_config::default_remember_server_error_timeout())
+#[async_trait]
+impl SchedulerService for Scheduler {
+    async fn get_status(&self) -> Result<SchedulerStatusResult> {
+        // TODO
+        Ok(SchedulerStatusResult {
+            num_cpus: 0,             // servers.values().map(|v| v.num_cpus).sum(),
+            num_servers: 0,          // servers.len(),
+            assigned: 0,             // assigned_jobs,
+            active: 0,               // active_jobs,
+            servers: HashMap::new(), // servers_map,
+        })
     }
-}

-fn error_chain_to_string(err: &Error) -> String {
-    let mut err_msg = err.to_string();
-    let mut maybe_cause = err.source();
-    while let Some(cause) = maybe_cause {
-        err_msg.push_str(", caused by: ");
-        err_msg.push_str(&cause.to_string());
-        maybe_cause = cause.source();
+    async fn has_toolchain(&self, toolchain: Toolchain) -> bool {
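+        // Added commentary (not part of the original patch): `self.toolchains` is the
+        // toolchain storage built via `storage_from_config` above, so this is a single
+        // `has(key)` probe against the backing store (S3, disk, etc.). `new_job` below
+        // forwards the result as `NewJobResponse::has_toolchain`, which is what lets
+        // clients skip `put_toolchain` for archives the cluster already has.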
self.toolchains.has(&toolchain.archive_id).await } - err_msg -} -#[async_trait] -impl SchedulerIncoming for Scheduler { - async fn handle_alloc_job( + async fn put_toolchain( &self, - requester: &dyn SchedulerOutgoing, - tc: Toolchain, - ) -> Result { - let Scheduler { - remember_server_error_timeout, - .. - } = *self; - - // Attempt to allocate a job to the best server. The best server is the server - // with the fewest assigned jobs and least-recently-reported error. Servers - // whose load exceeds `num_cpus` are not considered candidates for assignment. - // - // If we fail to assign a job to a server, attempt to assign the job to the next - // best candidate until either the job has been assigned successfully, or the - // candidate list has been exhausted. - // - // Special care is taken to not lock `self.servers` while network requests are - // in-flight, as that will block other request-handling threads and deadlock - // the scheduler. - // - // Do not assert!() anywhere, as that permanently corrupts the scheduler. - // All error conditions must fail gracefully. - - let try_assign_job = |server_id: ServerId, tc: Toolchain| async move { - // LOCKS - let mut servers = self.servers.lock().await; - let server = match servers.get_mut(&server_id) { - Some(server) => server, - _ => bail!("Failed to assign job to unknown server"), - }; - - let assign_auth = server - .job_authorizer - .generate_token(JobId(server.server_nonce.as_u64())) - .map_err(Error::from) - .context("Could not create assign_auth token")?; - - drop(servers); - - let AssignJobResult { - job_id, - need_toolchain, - num_assigned_jobs, - num_active_jobs, - } = match requester.do_assign_job(server_id, tc, assign_auth).await { - Ok(res) => res, - Err(err) => { - // LOCKS - let mut servers = self.servers.lock().await; - // Couldn't assign the job, so store the last_error - if let Some(server) = servers.get_mut(&server_id) { - server.last_error = Some(Instant::now()); - } - // Prune servers - self.prune_servers(&mut servers); - return Err(err); - } - }; - - // LOCKS - let mut servers = self.servers.lock().await; - let server = match servers.get_mut(&server_id) { - Some(server) => server, - _ => bail!("Failed to assign job to unknown server"), - }; - - // Assigned the job, so update server stats - server.last_seen = Instant::now(); - server.num_assigned_jobs = num_assigned_jobs; - server.num_active_jobs = num_active_jobs; - - let job_auth = server - .job_authorizer - .generate_token(job_id) - .map_err(Error::from) - .context("Could not create job auth token")?; - - if let Some(last_error) = server.last_error { - tracing::debug!( - "[alloc_job({}, {})]: Assigned job to server whose most recent error was {:?} ago", - server_id.addr(), - job_id, - Instant::now() - last_error - ); - } else { - tracing::debug!( - "[alloc_job({}, {})]: Job created and assigned to server", - server_id.addr(), - job_id, - ); - } - - // Prune servers only after updating this server's last_seen time. - self.prune_servers(&mut servers); - - Ok(AllocJobResult::Success { - job_alloc: JobAlloc { - auth: job_auth, - job_id, - server_id, - }, - need_toolchain, + toolchain: Toolchain, + toolchain_reader: Pin<&mut (dyn futures::AsyncRead + Send)>, + ) -> Result { + // Upload toolchain to toolchains storage (S3, GCS, etc.) 
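+        //
+        // Added commentary (not part of the original patch): the reader is forwarded
+        // as-is, so the archive is streamed through to the backend rather than being
+        // buffered in scheduler memory; for opendal-backed stores, the `put_stream`
+        // implementation added in src/cache/cache.rs copies it straight into an
+        // `Operator::writer` for this key.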
+ self.toolchains + .put_stream(&toolchain.archive_id, toolchain_reader) + .await + .context("Failed to put toolchain") + .map(|_| SubmitToolchainResult::Success) + .map_err(|err| { + tracing::error!("[put_toolchain({})]: {err:?}", toolchain.archive_id); + err }) - }; - - let get_best_server_by_least_load_and_oldest_error = - |servers: &mut HashMap, tried_servers: &HashSet| { - let now = Instant::now(); - - // Compute instantaneous load and update shared server state - servers - .iter_mut() - .filter_map(|(server_id, server)| { - // Forget errors that are too old to care about anymore - if let Some(last_error) = server.last_error { - if now.duration_since(last_error) >= remember_server_error_timeout { - server.last_error = None; - } - } - - // Each server defines its own `max_per_core_load` multiple - let num_vcpus = (server.num_cpus as f64 * server.max_per_core_load) - .floor() - .max(1.0); - - // Assume all pending and assigned jobs will eventually be run: - let num_jobs = server.num_assigned_jobs + server.num_active_jobs; - - let load = num_jobs as f64 / num_vcpus; - - // Exclude servers at max load and servers we've already tried - if load >= 1.0 || tried_servers.contains(server_id) { - None - } else { - Some((server_id, server, load)) - } - }) - // Sort servers by least load and oldest error - .sorted_by(|(_, server_a, load_a), (_, server_b, load_b)| { - match (server_a.last_error, server_b.last_error) { - // If neither server has a recent error, prefer the one with lowest load - (None, None) => load_a.total_cmp(load_b), - // Prefer servers with no recent errors over servers with recent errors - (None, Some(_)) => std::cmp::Ordering::Less, - (Some(_), None) => std::cmp::Ordering::Greater, - // If both servers have an error, prefer the one with the oldest error - (Some(err_a), Some(err_b)) => err_b.cmp(&err_a), - } - }) - .find_or_first(|_| true) - .map(|(server_id, _, _)| *server_id) - }; - - let mut tried_servers = HashSet::::new(); - #[allow(unused_assignments)] - let mut num_servers = 0; - let mut result = None; - - // Loop through candidate servers. - // Exit the loop once we've allocated the job. - // Try the next candidate if we encounter an error. - loop { - // Get the latest best server candidate after sorting all servers by least load - // and oldest error, sans the servers we've already tried. - // - // This computes each server's load again local to this loop. - // - // Since alloc_job in other threads can recover from errors and assign jobs to the - // next-best candidate, the load could drift if we only compute it once outside this - // loop. Computing load again ensures we allocate accurately based on the current - // statistics. - // LOCKS - let server_id = { - // LOCKS - let mut servers = self.servers.lock().await; - num_servers = servers.len(); - get_best_server_by_least_load_and_oldest_error(&mut servers, &tried_servers) - }; - - // Take the top candidate. If we can't allocate the job to it, - // remove it from the candidates list and try the next server. - if let Some(server_id) = server_id { - // Attempt to assign the job to this server. If assign_job fails, - // store the error and attempt to assign to the next server. - // If all servers error, return the last error to the client. 
-            match try_assign_job(server_id, tc.clone()).await {
-                Ok(res) => {
-                    // If assign_job succeeded, return the result
-                    result = Some(Ok(res));
-                    break;
-                }
-                Err(err) => {
-                    // If alloc_job failed, try the next best server
-                    tracing::warn!(
-                        "[alloc_job({})]: Error assigning job to server: {}",
-                        server_id.addr(),
-                        error_chain_to_string(&err)
-                    );
-                    tried_servers.insert(server_id);
-                    result = Some(Err(err));
-                    // Try the next server
-                    continue;
-                }
-            }
-        }
-        // No available servers
-        break;
-    }
+    }

-        if let Some(result) = result {
-            result
-        } else {
-            // Fallback to the default failure case
-            Ok(AllocJobResult::Fail {
-                msg: format!("Insufficient capacity across {num_servers} available servers",),
-            })
-        }
+    async fn new_job(&self, request: NewJobRequest) -> Result<NewJobResponse> {
+        Ok(NewJobResponse {
+            has_toolchain: self.has_toolchain(request.toolchain).await,
+            job_id: uuid::Uuid::new_v4().to_string(),
+            timeout: self.job_time_limit,
+        })
     }

-    async fn handle_heartbeat_server(
+    async fn run_job(
         &self,
-        server_id: ServerId,
-        server_nonce: ServerNonce,
-        num_cpus: usize,
-        max_per_core_load: f64,
-        job_authorizer: Box<dyn JobAuthorizer>,
-        num_assigned_jobs: usize,
-        num_active_jobs: usize,
-    ) -> Result<HeartbeatServerResult> {
-        if num_cpus == 0 {
-            bail!("Invalid number of CPUs (0) specified in heartbeat")
-        }
-
-        // LOCKS
-        let mut servers = self.servers.lock().await;
-
-        match servers.get_mut(&server_id) {
-            Some(ref mut server) if server.server_nonce == server_nonce => {
-                server.last_seen = Instant::now();
-                server.num_cpus = num_cpus;
-                server.job_authorizer = job_authorizer;
-                server.max_per_core_load = max_per_core_load;
-                server.num_assigned_jobs = num_assigned_jobs;
-                server.num_active_jobs = num_active_jobs;
-
-                // Prune servers only after updating this server's last_seen time.
-                // This ensures the server which sent this heartbeat isn't pruned.
-                self.prune_servers(&mut servers);
+        RunJobRequest {
+            job_id,
+            toolchain,
+            command,
+            outputs,
+            inputs,
+        }: RunJobRequest,
+    ) -> Result<RunJobResponse> {
+        let (tx, rx) = tokio::sync::oneshot::channel::<RunJobResponse>();
+        self.job_result.lock().await.insert(job_id.clone(), tx);
+
+        let res = self
+            .task_queue
+            .send_task(
+                server_run_build::new(
+                    job_id.clone(),
+                    self.id.clone(),
+                    toolchain,
+                    command,
+                    outputs,
+                    inputs,
+                )
+                .with_time_limit(self.job_time_limit),
+            )
+            .await
+            .map_err(anyhow::Error::new);

-                return Ok(HeartbeatServerResult { is_new: false });
-            }
-            _ => (),
+        if let Err(err) = res {
+            self.job_result.lock().await.remove(&job_id);
+            Err(err)
+        } else {
+            rx.await.map_err(anyhow::Error::new)
         }
-
-        self.prune_servers(&mut servers);
-
-        servers.insert(
-            server_id,
-            ServerDetails {
-                last_seen: Instant::now(),
-                last_error: None,
-                num_cpus,
-                max_per_core_load,
-                server_nonce,
-                job_authorizer,
-                num_assigned_jobs,
-                num_active_jobs,
-            },
-        );
-
-        tracing::info!("Registered new server {:?}", server_id);
-
-        Ok(HeartbeatServerResult { is_new: true })
     }

-    async fn handle_update_job_state(
-        &self,
-        server_id: ServerId,
-        num_assigned_jobs: usize,
-        num_active_jobs: usize,
-    ) -> Result<UpdateJobStateResult> {
-        let mut servers = self.servers.lock().await;
-        let server = match servers.get_mut(&server_id) {
-            Some(server) => server,
-            _ => bail!("Failed to reserve job on unknown server"),
-        };
-
-        server.last_seen = Instant::now();
-        server.num_assigned_jobs = num_assigned_jobs;
-        server.num_active_jobs = num_active_jobs;
-
-        // Prune servers only after updating this server's last_seen time.
-        self.prune_servers(&mut servers);
-
-        Ok(UpdateJobStateResult::Success)
+    async fn job_failure(&self, job_id: &str, reason: &str) -> Result<()> {
+        if let Some(sndr) = self.job_result.lock().await.remove(job_id) {
+            sndr.send(RunJobResponse::JobFailed {
+                reason: reason.to_owned(),
+            })
+            .map_err(|_| anyhow!("Failed to send job result"))
+        } else {
+            Err(anyhow!(
+                "[job_failed({job_id})]: Failed to send response for unknown job"
+            ))
+        }
     }

-    async fn handle_status(&self) -> Result<SchedulerStatusResult> {
-        let Scheduler {
-            remember_server_error_timeout,
-            ..
-        } = *self;
-
-        // LOCKS
-        let mut servers = self.servers.lock().await;
-
-        // Prune servers before reporting the scheduler status
-        self.prune_servers(&mut servers);
-
-        let mut assigned_jobs = 0;
-        let mut active_jobs = 0;
-
-        let mut servers_map = HashMap::<std::net::SocketAddr, ServerStatusResult>::new();
-        for (server_id, server) in servers.iter() {
-            assigned_jobs += server.num_assigned_jobs;
-            active_jobs += server.num_active_jobs;
-            servers_map.insert(
-                server_id.addr(),
-                ServerStatusResult {
-                    assigned: server.num_assigned_jobs,
-                    active: server.num_active_jobs,
-                    num_cpus: server.num_cpus,
-                    max_per_core_load: server.max_per_core_load,
-                    last_seen: server.last_seen.elapsed().as_secs(),
-                    last_error: server
-                        .last_error
-                        .map(|e| (remember_server_error_timeout - e.elapsed()).as_secs()),
-                },
-            );
+    async fn job_success(&self, job_id: &str, result: BuildResult) -> Result<()> {
+        if let Some(sndr) = self.job_result.lock().await.remove(job_id) {
+            sndr.send(RunJobResponse::JobComplete { result })
+                .map_err(|_| anyhow!("Failed to send job result"))
+        } else {
+            Err(anyhow!(
+                "[job_complete({job_id})]: Failed to send response for unknown job"
+            ))
         }
-
-        Ok(SchedulerStatusResult {
-            num_cpus: servers.values().map(|v| v.num_cpus).sum(),
-            num_servers: servers.len(),
-            assigned: assigned_jobs,
-            active: active_jobs,
-            servers: servers_map,
-        })
     }
 }

-struct JobInfo {
-    ctime: Instant,
-    toolchain: Toolchain,
-}

 pub struct Server {
-    builder: Box<dyn BuilderIncoming>,
-    cache: Mutex<TcCache>,
-    num_cpus: usize,
-    job_count: AtomicUsize,
-    job_queue: Arc<tokio::sync::Semaphore>,
-    jobs_assigned: Arc<Mutex<HashMap<JobId, JobInfo>>>,
+    builder: Box<dyn BuilderIncoming>,
+    jobs: Arc<Mutex<HashMap<String, (String, String)>>>,
+    task_queue: Arc<celery::Celery>,
+    toolchains: Arc<Mutex<ServerToolchains>>,
 }

 impl Server {
     pub fn new(
-        builder: Box<dyn BuilderIncoming>,
-        cache_dir: &Path,
-        toolchain_cache_size: u64,
-        num_cpus: usize,
-    ) -> Result<Self> {
-        let cache = TcCache::new(&cache_dir.join("tc"), toolchain_cache_size)
-            .context("Failed to create toolchain cache")?;
-        Ok(Server {
+        builder: Box<dyn BuilderIncoming>,
+        task_queue: Arc<celery::Celery>,
+        toolchains: Arc<Mutex<ServerToolchains>>,
+    ) -> Self {
+        Self {
             builder,
-            num_cpus,
-            cache: Mutex::new(cache),
-            job_count: AtomicUsize::new(0),
-            job_queue: Arc::new(tokio::sync::Semaphore::new(num_cpus)),
-            jobs_assigned: Arc::new(Mutex::new(HashMap::new())),
-        })
+            jobs: Default::default(),
+            task_queue,
+            toolchains,
+        }
     }
 }

 #[async_trait]
-impl ServerIncoming for Server {
-    fn start_heartbeat(&self, requester: Arc<dyn ServerOutgoing>) {
-        let num_cpus = self.num_cpus;
-        let job_queue = self.job_queue.clone();
-        let jobs_assigned = self.jobs_assigned.clone();
-        // Wait up to SCCACHE_DIST_REQUEST_TIMEOUT for a client to start a job.
-        // Remove jobs the client hasn't started within this interval.
- let unstarted_job_timeout = get_dist_request_timeout(); - - // TODO: detect if this panics - tokio::spawn(async move { - loop { - let stale_jobs = { - let jobs_assigned = jobs_assigned.lock().await; - let now = std::time::Instant::now(); - let mut stale_jobs = vec![]; - for (&job_id, job_info) in jobs_assigned.iter() { - if now.duration_since(job_info.ctime) >= unstarted_job_timeout { - stale_jobs.push(job_id); - } - } - stale_jobs - }; +impl ServerService for Server { + async fn run_job( + &self, + task_id: &str, + job_id: &str, + scheduler_id: &str, + toolchain: Toolchain, + command: CompileCommand, + outputs: Vec, + inputs: Vec, + ) -> Result { + // Associate the task with the scheduler and job so we can report success or failure + self.jobs.lock().await.insert( + task_id.to_owned(), + (scheduler_id.to_owned(), job_id.to_owned()), + ); - let num_assigned_jobs = { - let mut jobs_assigned = jobs_assigned.lock().await; - for job_id in stale_jobs { - jobs_assigned.remove(&job_id); - } - jobs_assigned.len() - }; + let tc_dir = self.toolchains.lock().await.acquire(&toolchain).await?; - let num_active_jobs = num_cpus - job_queue.available_permits(); + let result = self + .builder + .run_build(job_id, &tc_dir, command, outputs, inputs) + .await; - let due_time = match requester - .do_heartbeat(num_assigned_jobs, num_active_jobs) - .await - { - Ok(HeartbeatServerResult { is_new }) => { - tracing::trace!("Heartbeat success is_new={}", is_new); - HEARTBEAT_INTERVAL - } - Err(e) => { - tracing::error!("Failed to send heartbeat to server: {}", e); - HEARTBEAT_ERROR_INTERVAL - } - }; + self.toolchains.lock().await.release(&toolchain).await?; - tokio::time::sleep(due_time).await; - } - }); + result } - async fn handle_assign_job(&self, tc: Toolchain) -> Result { - let need_toolchain = !self.cache.lock().await.contains_toolchain(&tc); - let job_id = JobId( - self.job_count - .fetch_add(1, std::sync::atomic::Ordering::SeqCst) as u64, - ); - let num_assigned_jobs = { - let mut jobs_assigned = self.jobs_assigned.lock().await; - jobs_assigned.insert( - job_id, - JobInfo { - ctime: Instant::now(), - toolchain: tc, - }, - ); - jobs_assigned.len() - }; - let num_active_jobs = self.num_cpus - self.job_queue.available_permits(); - - Ok(AssignJobResult { - job_id, - need_toolchain, - num_assigned_jobs, - num_active_jobs, - }) + async fn job_failure(&self, task_id: &str, reason: &str) -> Result<()> { + if let Some((scheduler_id, job_id)) = self.jobs.lock().await.remove(task_id) { + return self + .task_queue + .send_task( + scheduler_build_failed::new(job_id, reason.to_owned()) + .with_queue(&scheduler_id), + ) + .await + .map_err(anyhow::Error::new) + .map(|_| ()); + } + Err(anyhow!( + "[job_failed({task_id})]: Failed to respond to scheduler, unknown job_id" + )) } - async fn handle_submit_toolchain( - &self, - job_id: JobId, - mut tc_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - ) -> Result { - // TODO: need to lock the toolchain until the container has started - // TODO: can start prepping container - - let tc = match self - .jobs_assigned - .lock() - .await - .get(&job_id) - .map(|j| j.toolchain.clone()) - { - Some(tc) => tc, - None => { - // Remove the job on error - self.jobs_assigned.lock().await.remove(&job_id); - return Ok(SubmitToolchainResult::JobNotFound); - } - }; - - let mut cache = self.cache.lock().await; - - let res = if cache.contains_toolchain(&tc) { - // Drop the lock - drop(cache); - - // Ignore the toolchain request body - // TODO: Investigate if this causes early hangup 
warnings in
-            // the load balancer. If so, use the implementation below.
-            Ok(())
-
-            // // Consume the entire toolchain request body
-            // tokio::io::copy(&mut tc_rdr, &mut tokio::io::empty())
-            //     .await
-            //     .map(|_| ())
-            //     .or_else(|err| {
-            //         tracing::warn!("[handle_submit_toolchain({})]: {:?}", job_id, err);
-            //         // Ignore errors reading the request body
-            //         Ok(())
-            //     })
-        } else {
-            cache
-                .insert_with(&tc, |mut file| async move {
-                    tokio::io::copy(&mut tc_rdr, &mut file).await
-                })
+    async fn job_success(&self, task_id: &str, result: &BuildResult) -> Result<()> {
+        if let Some((scheduler_id, job_id)) = self.jobs.lock().await.remove(task_id) {
+            return self
+                .task_queue
+                .send_task(
+                    scheduler_build_success::new(job_id, result.to_owned())
+                        .with_queue(&scheduler_id),
+                )
                 .await
-        };
-
-        match res {
-            Ok(_) => Ok(SubmitToolchainResult::Success),
-            Err(err) => {
-                tracing::warn!("[handle_submit_toolchain({})]: {:?}", job_id, err);
-                // Remove the job on error
-                self.jobs_assigned.lock().await.remove(&job_id);
-                Ok(SubmitToolchainResult::CannotCache)
-            }
+                .map_err(anyhow::Error::new)
+                .map(|_| ());
         }
+        Err(anyhow!(
+            "[job_complete({task_id})]: Failed to respond to scheduler, unknown job_id"
+        ))
     }
+}

-    async fn handle_run_job(
-        &self,
-        job_id: JobId,
-        command: CompileCommand,
-        outputs: Vec<String>,
-        inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>,
-    ) -> Result<RunJobResult> {
-        // Remove the job from assigned map
-        let tc = {
-            let mut jobs_assigned = self.jobs_assigned.lock().await;
-            match jobs_assigned.remove(&job_id).map(|j| j.toolchain.clone()) {
-                Some(tc) => tc,
-                None => return Ok(RunJobResult::JobNotFound),
-            }
-        };
-
-        // Do the build
-        let res = std::panic::AssertUnwindSafe(self.builder.run_build(
-            job_id,
-            tc,
-            command,
-            outputs,
-            inputs_rdr,
-            &self.cache,
-            self.job_queue.as_ref(),
-        ))
-        .catch_unwind()
+// Runs on the server
+#[celery::task(
+    bind = true,
+    on_failure = on_server_run_build_failure,
+    on_success = on_server_run_build_success,
+)]
+pub async fn server_run_build(
+    task: &Self,
+    job_id: String,
+    scheduler_id: String,
+    toolchain: Toolchain,
+    command: CompileCommand,
+    outputs: Vec<String>,
+    inputs: Vec<u8>,
+) -> TaskResult<BuildResult> {
+    let task_id = task.request.id.clone();
+
+    tracing::debug!(
+        "server_run_build: job_id={}, task_id={}, scheduler_id={}, toolchain={}, command={:?}, outputs={:?}",
+        job_id,
+        task_id,
+        scheduler_id,
+        toolchain.archive_id,
+        command,
+        outputs
+    );
+
+    if let Some(server) = SERVER.get() {
+        let job_id1 = job_id.clone();
+        tokio::spawn(async move {
+            server
+                .run_job(
+                    &task_id,
+                    &job_id1,
+                    &scheduler_id,
+                    toolchain,
+                    command,
+                    outputs,
+                    inputs,
+                )
+                .await
+                .map_err(|e| {
+                    tracing::error!("[server_run_build({job_id1})]: run_job failed with: {e:?}");
+                    TaskError::UnexpectedError(e.to_string())
+                })
+        })
         .await
         .map_err(|e| {
-            let msg = e
-                .downcast_ref::<&str>()
-                .map(|s| &**s)
-                .or_else(|| e.downcast_ref::<String>().map(|s| &**s))
-                .unwrap_or("An unknown panic was caught.");
-            anyhow::anyhow!("{msg}")
-        })
-        .and_then(std::convert::identity);
-
-        match res {
-            Err(e) => Err(e.context("run_job build failed")),
-            Ok(res) => Ok(RunJobResult::Complete(JobComplete {
-                output: res.output,
-                outputs: res.outputs,
-            })),
-        }
+            tracing::error!("[server_run_build({job_id})]: run_job failed with: {e:?}");
+            TaskError::UnexpectedError(e.to_string())
+        })?
+ } else { + Err(TaskError::UnexpectedError( + "sccache-dist server is not initialized".into(), + )) + } +} + +async fn on_server_run_build_failure(task: &server_run_build, err: &TaskError) { + let task_id = task.request().id.clone(); + if let Err(err) = SERVER + .get() + .unwrap() + .job_failure( + &task_id, + &match err { + TaskError::TimeoutError => { + format!("[server_run_build({task_id})]: Timed out") + } + _ => { + format!("[server_run_build({task_id})]: Failed with `{err}`") + } + }, + ) + .await + { + tracing::error!("[on_server_run_build_failure({task_id})]: {err}"); + } +} + +async fn on_server_run_build_success(task: &server_run_build, result: &BuildResult) { + let task_id = task.request().id.clone(); + if let Err(err) = SERVER.get().unwrap().job_success(&task_id, result).await { + tracing::error!("[on_server_run_build_success({task_id})]: {err}"); } } + +// Runs on the scheduler +#[celery::task] +async fn scheduler_build_failed(job_id: String, reason: String) -> TaskResult<()> { + SCHEDULER + .get() + .unwrap() + .job_failure(&job_id, &reason) + .await + .map_err(|e| TaskError::UnexpectedError(e.to_string())) +} + +// Runs on the scheduler +#[celery::task] +async fn scheduler_build_success(job_id: String, result: BuildResult) -> TaskResult<()> { + SCHEDULER + .get() + .unwrap() + .job_success(&job_id, result) + .await + .map_err(|e| TaskError::UnexpectedError(e.to_string())) +} diff --git a/src/bin/sccache-dist/token_check.rs b/src/bin/sccache-dist/token_check.rs index bec12a1b68..b1d645a369 100644 --- a/src/bin/sccache-dist/token_check.rs +++ b/src/bin/sccache-dist/token_check.rs @@ -100,7 +100,7 @@ impl MozillaCheck { pub fn new(required_groups: Vec) -> Self { Self { auth_cache: Mutex::new(HashMap::new()), - client: new_reqwest_blocking_client(None), + client: new_reqwest_blocking_client(), required_groups, } } @@ -269,7 +269,7 @@ impl ProxyTokenCheck { let maybe_auth_cache: Option, Duration)>> = cache_secs.map(|secs| Mutex::new((HashMap::new(), Duration::from_secs(secs)))); Self { - client: new_reqwest_blocking_client(None), + client: new_reqwest_blocking_client(), maybe_auth_cache, url, } diff --git a/src/cache/cache.rs b/src/cache/cache.rs index 218d2b87d3..fa7120118c 100644 --- a/src/cache/cache.rs +++ b/src/cache/cache.rs @@ -30,7 +30,6 @@ use crate::cache::s3::S3Cache; #[cfg(feature = "webdav")] use crate::cache::webdav::WebdavCache; use crate::compiler::PreprocessorCacheEntry; -use crate::config::Config; #[cfg(any( feature = "azure", feature = "gcs", @@ -41,16 +40,28 @@ use crate::config::Config; feature = "webdav", feature = "oss" ))] -use crate::config::{self, CacheType}; +use crate::config; +use crate::config::{CacheType, DiskCacheConfig}; use async_trait::async_trait; use fs_err as fs; use serde::{Deserialize, Serialize}; use std::fmt; use std::io::{self, Cursor, Read, Seek, Write}; use std::path::{Path, PathBuf}; +use std::pin::Pin; use std::sync::Arc; use std::time::Duration; use tempfile::NamedTempFile; +#[cfg(any( + feature = "azure", + feature = "gcs", + feature = "gha", + feature = "memcached", + feature = "redis", + feature = "s3", + feature = "webdav", +))] +use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt}; use zip::write::FileOptions; use zip::{CompressionMethod, ZipArchive, ZipWriter}; @@ -185,12 +196,19 @@ impl CacheRead { self.get_bytes("stderr") } - fn get_bytes(&mut self, name: &str) -> Vec { + pub fn get_bytes(&mut self, name: &str) -> Vec { let mut bytes = Vec::new(); drop(self.get_object(name, &mut bytes)); bytes } + 
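+    // Added commentary (not part of the original patch): `get_bytes` above is made
+    // `pub` and `into_inner` is introduced so dist code can pull entries back out of
+    // a `CacheRead`, or recover the underlying reader, without first extracting the
+    // archive to disk.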
pub fn into_inner(self) -> Box<dyn ReadSeek> {
+        self.zip.into_inner()
+    }
+
     pub async fn extract_objects<T>(
         mut self,
         objects: T,
@@ -344,12 +362,26 @@
     /// return a `Cache::Hit`.
     async fn get(&self, key: &str) -> Result<Cache>;

+    async fn get_stream(&self, key: &str) -> Result<Box<dyn futures::AsyncRead + Send + Unpin>>;
+
+    /// Check if the cache has an entry for `key`.
+    ///
+    /// If the entry is successfully found in the cache, return true.
+    /// If an error occurs, or the entry is not found in the cache, return false.
+    async fn has(&self, key: &str) -> bool;
+
     /// Put `entry` in the cache under `key`.
     ///
     /// Returns a `Future` that will provide the result or error when the put is
     /// finished.
     async fn put(&self, key: &str, entry: CacheWrite) -> Result<Duration>;

+    async fn put_stream(
+        &self,
+        key: &str,
+        stream: Pin<&mut (dyn futures::AsyncRead + Send)>,
+    ) -> Result<()>;
+
     /// Check the cache capability.
     ///
     /// - `Ok(CacheMode::ReadOnly)` means cache can only be used to `get`
@@ -367,7 +399,7 @@
     }

     /// Get the storage location.
-    fn location(&self) -> String;
+    async fn location(&self) -> String;

     /// Get the current storage usage, if applicable.
     async fn current_size(&self) -> Result<Option<u64>>;
@@ -478,6 +510,19 @@
         }
     }

+    async fn get_stream(&self, key: &str) -> Result<Box<dyn futures::AsyncRead + Send + Unpin>> {
+        Ok(Box::new(
+            self.reader(&normalize_key(key))
+                .await?
+                .into_futures_async_read(..)
+                .await?,
+        ) as Box<dyn futures::AsyncRead + Send + Unpin>)
+    }
+
+    async fn has(&self, key: &str) -> bool {
+        self.stat(&normalize_key(key)).await.is_ok()
+    }
+
     async fn put(&self, key: &str, entry: CacheWrite) -> Result<Duration> {
         let start = std::time::Instant::now();
@@ -486,6 +531,20 @@
         Ok(start.elapsed())
     }

+    async fn put_stream(
+        &self,
+        key: &str,
+        source: Pin<&mut (dyn futures::AsyncRead + Send)>,
+    ) -> Result<()> {
+        let sink = self.writer(&normalize_key(key)).await?;
+        tokio::io::copy(
+            &mut source.compat(),
+            &mut sink.into_futures_async_write().compat_write(),
+        )
+        .await?;
+        Ok(())
+    }
+
     async fn check(&self) -> Result<CacheMode> {
         use opendal::ErrorKind;
@@ -532,7 +591,7 @@
         Ok(mode)
     }

-    fn location(&self) -> String {
+    async fn location(&self) -> String {
         let meta = self.info();
         format!(
             "{}, name: {}, prefix: {}",
@@ -559,10 +618,11 @@ pub(in crate::cache) fn normalize_key(key: &str) -> String {
 /// Get a suitable `Storage` implementation from configuration.
 #[allow(clippy::cognitive_complexity)] // TODO simplify!
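// Added usage sketch (not part of the original patch; names taken from the
// main.rs changes above): the dist scheduler and server now construct their
// toolchain storage by passing the `toolchains` config sections directly,
// rather than a whole client `Config`:
//
//     let storage = sccache::cache::cache::storage_from_config(
//         &toolchains,
//         &toolchains_fallback,
//         runtime.handle(),
//     )
//     .context("Failed to initialize toolchain storage")?;
//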
pub fn storage_from_config( - config: &Config, + cache: &Option, + fallback_cache: &DiskCacheConfig, pool: &tokio::runtime::Handle, ) -> Result> { - if let Some(cache_type) = &config.cache { + if let Some(cache_type) = &cache { match cache_type { #[cfg(feature = "azure")] CacheType::Azure(config::AzureCacheConfig { @@ -732,9 +792,9 @@ pub fn storage_from_config( } } - let (dir, size) = (&config.fallback_cache.dir, config.fallback_cache.size); - let preprocessor_cache_mode_config = config.fallback_cache.preprocessor_cache_mode; - let rw_mode = config.fallback_cache.rw_mode.into(); + let (dir, size) = (&fallback_cache.dir, fallback_cache.size); + let preprocessor_cache_mode_config = fallback_cache.preprocessor_cache_mode; + let rw_mode = fallback_cache.rw_mode.into(); debug!("Init disk cache with dir {:?}, size {}", dir, size); Ok(Arc::new(DiskCache::new( dir, @@ -748,7 +808,7 @@ pub fn storage_from_config( #[cfg(test)] mod test { use super::*; - use crate::config::CacheModeConfig; + use crate::config::{CacheModeConfig, Config}; #[test] fn test_normalize_key() { @@ -786,7 +846,9 @@ mod test { config.fallback_cache.rw_mode = CacheModeConfig::ReadWrite; { - let cache = storage_from_config(&config, runtime.handle()).unwrap(); + let cache = + storage_from_config(&config.cache, &config.fallback_cache, runtime.handle()) + .unwrap(); runtime.block_on(async move { cache.put("test1", CacheWrite::default()).await.unwrap(); @@ -801,7 +863,9 @@ mod test { config.fallback_cache.rw_mode = CacheModeConfig::ReadOnly; { - let cache = storage_from_config(&config, runtime.handle()).unwrap(); + let cache = + storage_from_config(&config.cache, &config.fallback_cache, runtime.handle()) + .unwrap(); runtime.block_on(async move { assert_eq!( diff --git a/src/cache/disk.rs b/src/cache/disk.rs index 36d3be8efc..5a48a1e1c8 100644 --- a/src/cache/disk.rs +++ b/src/cache/disk.rs @@ -17,11 +17,14 @@ use crate::compiler::PreprocessorCacheEntry; use crate::lru_disk_cache::LruDiskCache; use crate::lru_disk_cache::{Error as LruError, ReadSeek}; use async_trait::async_trait; +use futures::AsyncReadExt; use std::ffi::{OsStr, OsString}; use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; +use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use tokio_util::compat::TokioAsyncReadCompatExt; use crate::errors::*; @@ -74,6 +77,7 @@ pub struct DiskCache { preprocessor_cache_mode_config: PreprocessorCacheModeConfig, preprocessor_cache: Arc>, rw_mode: CacheMode, + root: PathBuf, } impl DiskCache { @@ -99,6 +103,7 @@ impl DiskCache { max_size, })), rw_mode, + root: PathBuf::from(root.as_ref()), } } } @@ -136,6 +141,30 @@ impl Storage for DiskCache { .await? } + async fn get_stream(&self, key: &str) -> Result> { + // HACK: Ignore the LRU and assume the file exists + trace!("DiskCache::get_stream({})", key); + let path = self.root.join(make_key_path(key)); + let file = tokio::fs::File::open(path).await?; + Ok(Box::new(file.compat()) as Box) + } + + async fn has(&self, key: &str) -> bool { + let path = make_key_path(key); + let lru = self.lru.clone(); + + self.pool + .spawn_blocking(move || { + lru.lock() + .unwrap() + .get_or_init() + .and_then(|lru| lru.get(&path).map_err(|e| e.into())) + .is_ok() + }) + .await + .unwrap_or(false) + } + async fn put(&self, key: &str, entry: CacheWrite) -> Result { // We should probably do this on a background thread if we're going to buffer // everything in memory... @@ -164,11 +193,43 @@ impl Storage for DiskCache { .await? 
} + async fn put_stream( + &self, + key: &str, + mut source: Pin<&mut (dyn futures::AsyncRead + Send)>, + ) -> Result<()> { + if self.rw_mode == CacheMode::ReadOnly { + return Err(anyhow!("Cannot write to a read-only cache")); + } + + let lru = self.lru.clone(); + let key = make_key_path(key); + + let mut v = vec![]; + source.read_to_end(&mut v).await?; + + self.pool + .spawn_blocking(move || { + let mut f = lru + .lock() + .unwrap() + .get_or_init()? + .prepare_add(key, v.len() as u64)?; + + f.as_file_mut().write_all(&v)?; + + lru.lock().unwrap().get().unwrap().commit(f)?; + + Ok(()) + }) + .await? + } + async fn check(&self) -> Result { Ok(self.rw_mode) } - fn location(&self) -> String { + async fn location(&self) -> String { format!("Local disk: {:?}", self.lru.lock().unwrap().path()) } diff --git a/src/cache/readonly.rs b/src/cache/readonly.rs index 90431c4fb8..f07ae153bb 100644 --- a/src/cache/readonly.rs +++ b/src/cache/readonly.rs @@ -10,6 +10,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -29,6 +30,14 @@ impl Storage for ReadOnlyStorage { self.0.get(key).await } + async fn get_stream(&self, key: &str) -> Result> { + self.0.get_stream(key).await + } + + async fn has(&self, key: &str) -> bool { + self.0.has(key).await + } + /// Put `entry` in the cache under `key`. /// /// Returns a `Future` that will provide the result or error when the put is @@ -37,6 +46,14 @@ impl Storage for ReadOnlyStorage { Err(anyhow!("Cannot write to read-only storage")) } + async fn put_stream( + &self, + _key: &str, + _stream: Pin<&mut (dyn futures::AsyncRead + Send)>, + ) -> Result<()> { + Err(anyhow!("Cannot write to read-only storage")) + } + /// Check the cache capability. /// /// The ReadOnlyStorage cache is always read-only. @@ -45,8 +62,8 @@ impl Storage for ReadOnlyStorage { } /// Get the storage location. - fn location(&self) -> String { - self.0.location() + async fn location(&self) -> String { + self.0.location().await } /// Get the current storage usage, if applicable. diff --git a/src/commands.rs b/src/commands.rs index cd4523567a..5fcc965ad3 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -633,7 +633,12 @@ pub fn run_command(cmd: Command) -> Result { // anyways, so we can just return (mostly) empty stats directly. Err(_) => { let runtime = Runtime::new()?; - let storage = storage_from_config(config, runtime.handle()).ok(); + let storage = storage_from_config( + &config.cache, + &config.fallback_cache, + runtime.handle(), + ) + .ok(); runtime.block_on(ServerInfo::new(ServerStats::default(), storage.as_deref()))? 
} }; diff --git a/src/compiler/compiler.rs b/src/compiler/compiler.rs index 44019a8df1..02dbbf4c6b 100644 --- a/src/compiler/compiler.rs +++ b/src/compiler/compiler.rs @@ -843,22 +843,24 @@ where { use std::io; + use crate::dist::NewJobResponse; + let mut dist_compile_cmd = dist_compile_cmd.context("Could not create distributed compile command")?; - trace!("[{}]: Creating distributed compile request", out_pretty); + trace!("[{out_pretty}]: Creating distributed compile request"); let dist_output_paths = compilation .outputs() .map(|output| path_transformer.as_dist_abs(&cwd.join(output.path))) .collect::>() .context("Failed to adapt an output path for distributed compile")?; + let (inputs_packager, toolchain_packager, outputs_rewriter) = compilation.into_dist_packagers(path_transformer)?; trace!( - "[{}]: Identifying dist toolchain for {:?}", - out_pretty, + "[{out_pretty}]: Identifying dist toolchain for {:?}", compile_cmd.get_executable() ); let (dist_toolchain, maybe_dist_compile_executable) = dist_client @@ -868,125 +870,80 @@ where toolchain_packager, ) .await?; + let mut tc_archive = None; + if let Some((dist_compile_executable, archive_path)) = maybe_dist_compile_executable { dist_compile_cmd.executable = dist_compile_executable; tc_archive = Some(archive_path); } - trace!("[{}]: Requesting allocation", out_pretty); - let jares = dist_client.do_alloc_job(dist_toolchain.clone()).await?; - let job_alloc = match jares { - dist::AllocJobResult::Success { - job_alloc, - need_toolchain: true, - } => { - debug!( - "[{}, {}, {}]: Successfully allocated job", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr() - ); - debug!( - "[{}, {}, {}]: Sending toolchain {}", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr(), - dist_toolchain.archive_id - ); - - let archive_id = dist_toolchain.archive_id.clone(); + let NewJobResponse { + has_toolchain, + job_id, + timeout, + } = dist_client.new_job(dist_toolchain.clone()).await?; - match dist_client - .do_submit_toolchain(job_alloc.clone(), dist_toolchain) - .await - .map_err(|e| e.context("Could not submit toolchain"))? - { - dist::SubmitToolchainResult::Success => { - trace!( - "[{}, {}, {}]: Successfully sent toolchain {}", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr(), - archive_id, - ); - Ok(job_alloc) - } - dist::SubmitToolchainResult::JobNotFound => { - bail!( - "[{}, {}, {}]: Failed to submit toolchain, job not found on server", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr() - ) - } - dist::SubmitToolchainResult::CannotCache => bail!( - "[{}, {}, {}]: Toolchain for job could not be cached by server", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr() - ), + if !has_toolchain { + trace!( + "[{}]: Submitting toolchain `{}`", + out_pretty, + dist_toolchain.archive_id, + ); + match dist_client + .do_submit_toolchain(dist_toolchain.clone()) + .await + .map_err(|e| e.context("Could not submit toolchain"))? 
+ { + dist::SubmitToolchainResult::Success => { + trace!( + "[{out_pretty}]: Successfully sent toolchain `{}`", + dist_toolchain.archive_id, + ); + } + dist::SubmitToolchainResult::Error { message } => { + trace!( + "[{out_pretty}]: Failed sending toolchain `{}`: {message}", + dist_toolchain.archive_id, + ); + return Err(anyhow!(message)); } } - dist::AllocJobResult::Success { - job_alloc, - need_toolchain: false, - } => { - debug!( - "[{}, {}, {}]: Successfully allocated job on server", - out_pretty, - job_alloc.job_id, - job_alloc.server_id.addr() - ); - Ok(job_alloc) - } - dist::AllocJobResult::Fail { msg } => Err(anyhow!("Failed to allocate job").context(msg)), - }?; - let job_id = job_alloc.job_id; - let server_id = job_alloc.server_id; - debug!( - "[{}, {}, {}]: Running job on server", - out_pretty, - job_id, - server_id.addr() - ); - let ((job_id, server_id), (jres, path_transformer)) = dist_client - .do_run_job( - job_alloc, + } + + debug!("[{out_pretty}, {job_id}]: Running job"); + + let job_result = dist_client + .run_job( + &job_id, + Duration::from_secs(timeout as u64), + dist_toolchain, dist_compile_cmd, dist_output_paths, inputs_packager, ) - .await - .map(move |res| ((job_id, server_id), res)) - .with_context(|| { - format!( - "Could not run distributed compilation job on {:?}", - server_id.addr() - ) - })?; - - let jc = match jres { - dist::RunJobResult::Complete(jc) => jc, - dist::RunJobResult::JobNotFound => { - bail!( - "[{}, {}, {}]: Failed to run job, job not found on server", - out_pretty, - job_id, - server_id.addr() - ) + .await; + + let (jc, path_transformer) = match job_result { + Ok((dist::RunJobResponse::JobComplete { result }, path_transformer)) => { + (result, path_transformer) + } + Ok((dist::RunJobResponse::JobFailed { reason }, _)) => { + bail!("[{out_pretty}, {job_id}]: Could not run distributed compilation job: {reason}") + } + Err(err) => { + bail!("[{out_pretty}, {job_id}]: Could not run distributed compilation job: {err}") } }; + debug!( - "[{}, {}, {}]: Fetched {:?}", - out_pretty, - job_id, - server_id.addr(), + "[{out_pretty}, {job_id}]: Fetched {:?}", jc.outputs .iter() .map(|(p, bs)| (p, bs.lens().to_string())) .collect::>() ); + let mut output_paths: Vec = vec![]; macro_rules! try_or_cleanup { ($v:expr) => {{ @@ -998,14 +955,7 @@ where for local_path in output_paths.iter() { if let Err(e) = fs::remove_file(local_path) { if e.kind() != io::ErrorKind::NotFound { - warn!( - "[{}, {}, {}]: {} while attempting to clear up {}", - out_pretty, - job_id, - server_id.addr(), - e, - local_path.display() - ) + warn!("[{out_pretty}, {job_id}]: {e} while attempting to remove `{}`", local_path.display()) } } } @@ -1018,7 +968,7 @@ where for (path, output_data) in jc.outputs { let len = output_data.lens().actual; let local_path = try_or_cleanup!(path_transformer.to_local(&path).with_context( - || format!("[{}]: unable to transform output path {}", out_pretty, path) + || format!("[{out_pretty}, {job_id}]: unable to transform output path {path}") )); output_paths.push(local_path); // Do this first so cleanup works correctly @@ -1038,7 +988,8 @@ where try_or_cleanup!(outputs_rewriter .handle_outputs(&path_transformer, &output_paths, &extra_inputs) .with_context(|| "Failed to rewrite outputs from compile")); - Ok((DistType::Ok(server_id), jc.output.into())) + + Ok((DistType::Ok, jc.output.into())) } impl Clone for Box> { @@ -1159,7 +1110,7 @@ pub enum DistType { /// Distribution was not enabled. NoDist, /// Distributed compile success. 
- Ok(dist::ServerId), + Ok, /// Distributed compile failed. Error, } @@ -2562,7 +2513,7 @@ LLVM version: 6.0", // Ensure that the object file was created. assert!(fs::metadata(&obj).map(|m| m.len() > 0).unwrap()); match cached { - CompileResult::CacheMiss(MissType::Normal, DistType::Ok(_), _, f) => { + CompileResult::CacheMiss(MissType::Normal, DistType::Ok, _, f) => { // wait on cache write future so we don't race with it! f.wait().unwrap(); } @@ -3119,15 +3070,15 @@ LLVM version: 6.0", #[cfg(test)] #[cfg(feature = "dist-client")] mod test_dist { - use crate::dist::pkg; use crate::dist::{ - self, AllocJobResult, CompileCommand, JobAlloc, JobComplete, JobId, OutputData, - PathTransformer, ProcessOutput, RunJobResult, SchedulerStatusResult, ServerId, - SubmitToolchainResult, Toolchain, + self, CompileCommand, NewJobResponse, OutputData, PathTransformer, ProcessOutput, + RunJobResponse, SchedulerStatusResult, SubmitToolchainResult, Toolchain, }; + use crate::dist::{pkg, BuildResult}; use async_trait::async_trait; use std::path::{Path, PathBuf}; use std::sync::{atomic::AtomicBool, Arc}; + use std::time::Duration; use crate::errors::*; @@ -3140,26 +3091,24 @@ mod test_dist { } #[async_trait] impl dist::Client for ErrorPutToolchainClient { - async fn do_alloc_job(&self, _: Toolchain) -> Result { + async fn new_job(&self, _: Toolchain) -> Result { unreachable!() } async fn do_get_status(&self) -> Result { unreachable!() } - async fn do_submit_toolchain( - &self, - _: JobAlloc, - _: Toolchain, - ) -> Result { + async fn do_submit_toolchain(&self, _: Toolchain) -> Result { unreachable!() } - async fn do_run_job( + async fn run_job( &self, - _: JobAlloc, + _: &str, + _: Duration, + _: Toolchain, _: CompileCommand, _: Vec, _: Box, - ) -> Result<(RunJobResult, PathTransformer)> { + ) -> Result<(RunJobResponse, PathTransformer)> { unreachable!() } async fn put_toolchain( @@ -3193,27 +3142,25 @@ mod test_dist { } #[async_trait] impl dist::Client for ErrorAllocJobClient { - async fn do_alloc_job(&self, tc: Toolchain) -> Result { + async fn new_job(&self, tc: Toolchain) -> Result { assert_eq!(self.tc, tc); Err(anyhow!("MOCK: alloc job failure")) } async fn do_get_status(&self) -> Result { unreachable!() } - async fn do_submit_toolchain( - &self, - _: JobAlloc, - _: Toolchain, - ) -> Result { + async fn do_submit_toolchain(&self, _: Toolchain) -> Result { unreachable!() } - async fn do_run_job( + async fn run_job( &self, - _: JobAlloc, + _: &str, + _: Duration, + _: Toolchain, _: CompileCommand, _: Vec, _: Box, - ) -> Result<(RunJobResult, PathTransformer)> { + ) -> Result<(RunJobResponse, PathTransformer)> { unreachable!() } async fn put_toolchain( @@ -3250,40 +3197,34 @@ mod test_dist { #[async_trait] impl dist::Client for ErrorSubmitToolchainClient { - async fn do_alloc_job(&self, tc: Toolchain) -> Result { + async fn new_job(&self, tc: Toolchain) -> Result { assert!(!self .has_started .swap(true, std::sync::atomic::Ordering::AcqRel)); assert_eq!(self.tc, tc); - Ok(AllocJobResult::Success { - job_alloc: JobAlloc { - auth: "abcd".to_owned(), - job_id: JobId(0), - server_id: ServerId::new(([0, 0, 0, 0], 1).into()), - }, - need_toolchain: true, + Ok(NewJobResponse { + has_toolchain: false, + job_id: "job_id".into(), + timeout: 10, }) } async fn do_get_status(&self) -> Result { unreachable!("fn do_get_status is not used for this test. 
qed") } - async fn do_submit_toolchain( - &self, - job_alloc: JobAlloc, - tc: Toolchain, - ) -> Result { - assert_eq!(job_alloc.job_id, JobId(0)); + async fn do_submit_toolchain(&self, tc: Toolchain) -> Result { assert_eq!(self.tc, tc); Err(anyhow!("MOCK: submit toolchain failure")) } - async fn do_run_job( + async fn run_job( &self, - _: JobAlloc, + _: &str, + _: Duration, + _: Toolchain, _: CompileCommand, _: Vec, _: Box, - ) -> Result<(RunJobResult, PathTransformer)> { - unreachable!("fn do_run_job is not used for this test. qed") + ) -> Result<(RunJobResponse, PathTransformer)> { + unreachable!("fn run_job is not used for this test. qed") } async fn put_toolchain( &self, @@ -3319,40 +3260,36 @@ mod test_dist { #[async_trait] impl dist::Client for ErrorRunJobClient { - async fn do_alloc_job(&self, tc: Toolchain) -> Result { + async fn new_job(&self, tc: Toolchain) -> Result { assert!(!self .has_started .swap(true, std::sync::atomic::Ordering::AcqRel)); assert_eq!(self.tc, tc); - Ok(AllocJobResult::Success { - job_alloc: JobAlloc { - auth: "abcd".to_owned(), - job_id: JobId(0), - server_id: ServerId::new(([0, 0, 0, 0], 1).into()), - }, - need_toolchain: true, + Ok(NewJobResponse { + has_toolchain: false, + job_id: "job_id".into(), + timeout: 10, }) } async fn do_get_status(&self) -> Result { unreachable!() } - async fn do_submit_toolchain( - &self, - job_alloc: JobAlloc, - tc: Toolchain, - ) -> Result { - assert_eq!(job_alloc.job_id, JobId(0)); + async fn do_submit_toolchain(&self, tc: Toolchain) -> Result { assert_eq!(self.tc, tc); Ok(SubmitToolchainResult::Success) } - async fn do_run_job( + async fn run_job( &self, - job_alloc: JobAlloc, + job_id: &str, + timeout: Duration, + tc: Toolchain, command: CompileCommand, _: Vec, _: Box, - ) -> Result<(RunJobResult, PathTransformer)> { - assert_eq!(job_alloc.job_id, JobId(0)); + ) -> Result<(RunJobResponse, PathTransformer)> { + assert_eq!(job_id, "job_id"); + assert_eq!(timeout, Duration::from_secs(10)); + assert_eq!(self.tc, tc); assert_eq!(command.executable, "/overridden/compiler"); Err(anyhow!("MOCK: run job failure")) } @@ -3399,42 +3336,37 @@ mod test_dist { #[async_trait] impl dist::Client for OneshotClient { - async fn do_alloc_job(&self, tc: Toolchain) -> Result { + async fn new_job(&self, tc: Toolchain) -> Result { assert!(!self .has_started .swap(true, std::sync::atomic::Ordering::AcqRel)); assert_eq!(self.tc, tc); - - Ok(AllocJobResult::Success { - job_alloc: JobAlloc { - auth: "abcd".to_owned(), - job_id: JobId(0), - server_id: ServerId::new(([0, 0, 0, 0], 1).into()), - }, - need_toolchain: true, + Ok(NewJobResponse { + has_toolchain: false, + job_id: "job_id".into(), + timeout: 10, }) } async fn do_get_status(&self) -> Result { unreachable!("fn do_get_status is not used for this test. 
qed") } - async fn do_submit_toolchain( - &self, - job_alloc: JobAlloc, - tc: Toolchain, - ) -> Result { - assert_eq!(job_alloc.job_id, JobId(0)); + async fn do_submit_toolchain(&self, tc: Toolchain) -> Result { assert_eq!(self.tc, tc); Ok(SubmitToolchainResult::Success) } - async fn do_run_job( + async fn run_job( &self, - job_alloc: JobAlloc, + job_id: &str, + timeout: Duration, + tc: Toolchain, command: CompileCommand, outputs: Vec, inputs_packager: Box, - ) -> Result<(RunJobResult, PathTransformer)> { - assert_eq!(job_alloc.job_id, JobId(0)); + ) -> Result<(RunJobResponse, PathTransformer)> { + assert_eq!(job_id, "job_id"); + assert_eq!(timeout, Duration::from_secs(10)); + assert_eq!(self.tc, tc); assert_eq!(command.executable, "/overridden/compiler"); let mut inputs = vec![]; @@ -3447,10 +3379,12 @@ mod test_dist { (name, data) }) .collect(); - let result = RunJobResult::Complete(JobComplete { - output: self.output.clone(), - outputs, - }); + let result = RunJobResponse::JobComplete { + result: BuildResult { + output: self.output.clone(), + outputs, + }, + }; Ok((result, path_transformer)) } async fn put_toolchain( diff --git a/src/config.rs b/src/config.rs index 6f511121bb..d3c3d73d98 100644 --- a/src/config.rs +++ b/src/config.rs @@ -145,7 +145,7 @@ impl HTTPUrl { } } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct AzureCacheConfig { pub connection_string: String, @@ -153,7 +153,7 @@ pub struct AzureCacheConfig { pub key_prefix: String, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] #[serde(default)] pub struct DiskCacheConfig { @@ -193,7 +193,7 @@ impl From for CacheMode { } } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct GCSCacheConfig { pub bucket: String, @@ -204,7 +204,7 @@ pub struct GCSCacheConfig { pub credential_url: Option, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct GHACacheConfig { pub enabled: bool, @@ -226,7 +226,7 @@ fn default_memcached_cache_expiration() -> u32 { DEFAULT_MEMCACHED_CACHE_EXPIRATION } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(deny_unknown_fields)] pub struct MemcachedCacheConfig { #[serde(alias = "endpoint")] @@ -256,7 +256,7 @@ pub struct MemcachedCacheConfig { /// Please change this value freely if we have a better choice. const DEFAULT_REDIS_CACHE_TTL: u64 = 0; pub const DEFAULT_REDIS_DB: u32 = 0; -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(deny_unknown_fields)] pub struct RedisCacheConfig { /// The single-node redis endpoint. 
@@ -293,7 +293,7 @@ pub struct RedisCacheConfig { pub key_prefix: String, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct WebdavCacheConfig { pub endpoint: String, @@ -304,7 +304,7 @@ pub struct WebdavCacheConfig { pub token: Option, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct S3CacheConfig { pub bucket: String, @@ -317,7 +317,7 @@ pub struct S3CacheConfig { pub server_side_encryption: Option, } -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct OSSCacheConfig { pub bucket: String, @@ -327,7 +327,7 @@ pub struct OSSCacheConfig { pub no_credentials: bool, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum CacheType { Azure(AzureCacheConfig), GCS(GCSCacheConfig), @@ -429,6 +429,45 @@ impl CacheConfigs { } } +impl From for CacheConfigs { + fn from(cache_type: CacheType) -> Self { + match cache_type { + CacheType::Azure(opts) => Self { + azure: Some(opts), + ..Default::default() + }, + CacheType::GCS(opts) => Self { + gcs: Some(opts), + ..Default::default() + }, + CacheType::GHA(opts) => Self { + gha: Some(opts), + ..Default::default() + }, + CacheType::Memcached(opts) => Self { + memcached: Some(opts), + ..Default::default() + }, + CacheType::Redis(opts) => Self { + redis: Some(opts), + ..Default::default() + }, + CacheType::S3(opts) => Self { + s3: Some(opts), + ..Default::default() + }, + CacheType::Webdav(opts) => Self { + webdav: Some(opts), + ..Default::default() + }, + CacheType::OSS(opts) => Self { + oss: Some(opts), + ..Default::default() + }, + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] #[serde(tag = "type")] @@ -1070,23 +1109,64 @@ impl CachedConfig { } } +#[cfg(feature = "dist-server")] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum MessageBroker { + AMQP(String), + Redis(String), +} + +#[cfg(feature = "dist-server")] +#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct MessageBrokerConfigs { + pub amqp: Option, + pub redis: Option, +} + #[cfg(feature = "dist-server")] pub mod scheduler { - use std::path::Path; + use std::path::PathBuf; use std::{net::SocketAddr, str::FromStr}; use crate::errors::*; use serde::{Deserialize, Serialize}; - pub fn default_remember_server_error_timeout() -> u64 { - std::env::var("SCCACHE_DIST_REMEMBER_SERVER_ERROR_TIMEOUT") + use super::{ + config_from_env, try_read_config_file, CacheConfigs, CacheModeConfig, CacheType, + DiskCacheConfig, MessageBroker, MessageBrokerConfigs, + }; + + pub fn default_max_body_size() -> usize { + std::env::var("SCCACHE_DIST_MAX_BODY_SIZE") .ok() .and_then(|s| s.parse().ok()) - .unwrap_or(300) + // 1GiB should be enough for toolchains and compile inputs, right? 
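+            // (When set, SCCACHE_DIST_MAX_BODY_SIZE is parsed as a byte
+            // count; otherwise the 1 GiB default below applies.)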
+ .unwrap_or(1024 * 1024 * 1024) } - #[derive(Debug, Serialize, Deserialize)] + pub fn default_job_time_limit() -> u32 { + std::env::var("SCCACHE_DIST_JOB_TIME_LIMIT_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(600) + } + + pub fn default_enable_web_socket_server() -> bool { + std::env::var("SCCACHE_DIST_ENABLE_WEB_SOCKET_SERVER") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(true) + } + + // pub fn default_remember_server_error_timeout() -> u64 { + // std::env::var("SCCACHE_DIST_REMEMBER_SERVER_ERROR_TIMEOUT") + // .ok() + // .and_then(|s| s.parse().ok()) + // .unwrap_or(300) + // } + + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "type")] #[serde(deny_unknown_fields)] pub enum ClientAuth { @@ -1124,35 +1204,140 @@ pub mod scheduler { #[derive(Debug, Serialize, Deserialize)] #[serde(default)] #[serde(deny_unknown_fields)] - pub struct Config { - pub public_addr: SocketAddr, + pub struct FileConfig { pub client_auth: ClientAuth, - pub server_auth: ServerAuth, - pub remember_server_error_timeout: u64, + pub enable_web_socket_server: Option, + pub job_time_limit: Option, + pub max_body_size: Option, + pub message_broker: Option, + pub public_addr: SocketAddr, + pub toolchains: CacheConfigs, } - impl Default for Config { + impl Default for FileConfig { fn default() -> Self { Self { - public_addr: SocketAddr::from_str("0.0.0.0:10500").unwrap(), client_auth: ClientAuth::Insecure, - server_auth: ServerAuth::Insecure, - remember_server_error_timeout: default_remember_server_error_timeout(), + enable_web_socket_server: Some(default_enable_web_socket_server()), + job_time_limit: Some(default_job_time_limit()), + max_body_size: Some(default_max_body_size()), + message_broker: Some(MessageBrokerConfigs { + amqp: std::env::var("AMQP_ADDR").ok().map(MessageBroker::AMQP), + redis: std::env::var("REDIS_ADDR").ok().map(MessageBroker::Redis), + }), + public_addr: SocketAddr::from_str("0.0.0.0:10500").unwrap(), + toolchains: CacheConfigs { + disk: Some(DiskCacheConfig { + dir: PathBuf::from_str("/tmp/sccache/toolchains").unwrap(), + preprocessor_cache_mode: Default::default(), + rw_mode: CacheModeConfig::ReadWrite, + size: u64::MAX, + }), + ..Default::default() + }, } } } - pub fn from_path(conf_path: &Path) -> Result> { - super::try_read_config_file(conf_path).context("Failed to load scheduler config file") + #[derive(Debug)] + pub struct Config { + pub client_auth: ClientAuth, + pub enable_web_socket_server: bool, + pub job_time_limit: u32, + pub max_body_size: usize, + pub message_broker: Option, + pub public_addr: SocketAddr, + pub toolchains_fallback: DiskCacheConfig, + pub toolchains: Option, + } + + impl Config { + pub fn load(conf_path: Option) -> Result { + let mut conf_caches: CacheConfigs = Default::default(); + + let FileConfig { + client_auth, + enable_web_socket_server, + job_time_limit, + max_body_size, + message_broker, + public_addr, + toolchains, + } = conf_path + .map(|path| { + let conf = try_read_config_file::(&path) + .context("Failed to load scheduler config file"); + match conf { + Ok(conf) => conf.unwrap_or_default(), + Err(err) => { + warn!("{err}"); + Default::default() + } + } + }) + .unwrap_or_default(); + + conf_caches.merge(toolchains); + conf_caches.merge(config_from_env()?.cache); + + let (toolchains, toolchains_fallback) = conf_caches.into_fallback(); + + Ok(Self { + client_auth, + enable_web_socket_server: enable_web_socket_server.unwrap_or(true), + job_time_limit: job_time_limit.unwrap_or_else(default_job_time_limit), + 
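+                // (When both brokers are configured, the
+                // `mb.amqp.or(mb.redis)` fold below resolves in favor of AMQP.)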
max_body_size: max_body_size.unwrap_or_else(default_max_body_size), + message_broker: message_broker.and_then(|mb| mb.amqp.or(mb.redis)), + public_addr, + toolchains_fallback, + toolchains, + }) + } + + pub fn into_file(self) -> FileConfig { + self.into() + } + } + + impl From for FileConfig { + fn from(scheduler_config: Config) -> Self { + Self { + client_auth: scheduler_config.client_auth.clone(), + enable_web_socket_server: Some(scheduler_config.enable_web_socket_server), + job_time_limit: Some(scheduler_config.job_time_limit), + max_body_size: Some(scheduler_config.max_body_size), + message_broker: match scheduler_config.message_broker { + Some(MessageBroker::AMQP(conf)) => Some(MessageBrokerConfigs { + amqp: Some(MessageBroker::AMQP(conf)), + ..Default::default() + }), + Some(MessageBroker::Redis(conf)) => Some(MessageBrokerConfigs { + redis: Some(MessageBroker::Redis(conf)), + ..Default::default() + }), + None => None, + }, + public_addr: scheduler_config.public_addr, + toolchains: scheduler_config + .toolchains + .map(|x| x.clone().into()) + .unwrap_or(CacheConfigs { + disk: Some(scheduler_config.toolchains_fallback), + ..Default::default() + }), + } + } } } #[cfg(feature = "dist-server")] pub mod server { - use super::HTTPUrl; + use super::{ + config_from_env, try_read_config_file, CacheConfigs, CacheModeConfig, CacheType, + DiskCacheConfig, MessageBroker, MessageBrokerConfigs, + }; use serde::{Deserialize, Serialize}; - use std::net::SocketAddr; - use std::path::{Path, PathBuf}; + use std::path::PathBuf; use std::str::FromStr; use crate::errors::*; @@ -1166,8 +1351,8 @@ pub mod server { std::env::var("SCCACHE_DIST_MAX_PER_CORE_LOAD") .ok() .and_then(|s| s.parse().ok()) - // Default to 8 to match the server's thread pool multiple - .unwrap_or(8f64) + // Default to 2 + .unwrap_or(2f64) } fn default_num_cpus_to_ignore() -> usize { @@ -1201,7 +1386,7 @@ pub mod server { .collect() } - #[derive(Debug, Serialize, Deserialize)] + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "type")] #[serde(deny_unknown_fields)] pub enum BuilderType { @@ -1240,38 +1425,129 @@ pub mod server { #[derive(Debug, Serialize, Deserialize)] #[serde(default)] #[serde(deny_unknown_fields)] - pub struct Config { + pub struct FileConfig { + pub message_broker: Option, pub builder: BuilderType, pub cache_dir: PathBuf, - pub public_addr: SocketAddr, - pub bind_addr: Option, - pub scheduler_url: HTTPUrl, - pub scheduler_auth: SchedulerAuth, - pub toolchain_cache_size: u64, pub max_per_core_load: f64, pub num_cpus_to_ignore: usize, + pub toolchain_cache_size: i64, + pub toolchains: CacheConfigs, } - impl Default for Config { + impl Default for FileConfig { fn default() -> Self { Self { + message_broker: Some(MessageBrokerConfigs { + amqp: std::env::var("AMQP_ADDR").ok().map(MessageBroker::AMQP), + redis: std::env::var("REDIS_ADDR").ok().map(MessageBroker::Redis), + }), builder: BuilderType::Docker, cache_dir: Default::default(), - public_addr: SocketAddr::from_str("0.0.0.0:10600").unwrap(), - bind_addr: None, - scheduler_url: HTTPUrl::from_url( - reqwest::Url::from_str("http://0.0.0.0:10500").unwrap(), - ), - scheduler_auth: SchedulerAuth::Insecure, - toolchain_cache_size: default_toolchain_cache_size(), max_per_core_load: default_max_per_core_load(), num_cpus_to_ignore: default_num_cpus_to_ignore(), + toolchain_cache_size: default_toolchain_cache_size() as i64, + toolchains: CacheConfigs { + disk: Some(DiskCacheConfig { + dir: PathBuf::from_str("/tmp/sccache/toolchains").unwrap(), + 
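+                        // (Mirrors the scheduler default above: an effectively
+                        // unbounded on-disk toolchain store, see `size: u64::MAX`.)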
preprocessor_cache_mode: Default::default(), + rw_mode: CacheModeConfig::ReadWrite, + size: u64::MAX, + }), + ..Default::default() + }, } } } - pub fn from_path(conf_path: &Path) -> Result> { - super::try_read_config_file(conf_path).context("Failed to load server config file") + #[derive(Debug)] + pub struct Config { + pub message_broker: Option, + pub builder: BuilderType, + pub cache_dir: PathBuf, + pub max_per_core_load: f64, + pub num_cpus_to_ignore: usize, + pub toolchain_cache_size: i64, + pub toolchains: Option, + pub toolchains_fallback: DiskCacheConfig, + } + + impl Config { + pub fn load(conf_path: Option) -> Result { + let mut conf_caches: CacheConfigs = Default::default(); + + let FileConfig { + message_broker, + builder, + cache_dir, + max_per_core_load, + num_cpus_to_ignore, + toolchain_cache_size, + toolchains, + } = conf_path + .map(|path| { + let conf = try_read_config_file::(&path) + .context("Failed to load server config file"); + match conf { + Ok(conf) => conf.unwrap_or_default(), + Err(err) => { + warn!("{err:?}"); + Default::default() + } + } + }) + .unwrap_or_default(); + + conf_caches.merge(toolchains); + conf_caches.merge(config_from_env()?.cache); + + let (toolchains, toolchains_fallback) = conf_caches.into_fallback(); + + Ok(Self { + message_broker: message_broker.and_then(|mb| mb.amqp.or(mb.redis)), + builder, + cache_dir, + max_per_core_load, + num_cpus_to_ignore, + toolchain_cache_size, + toolchains, + toolchains_fallback, + }) + } + + pub fn into_file(self) -> FileConfig { + self.into() + } + } + + impl From for FileConfig { + fn from(server_config: Config) -> Self { + Self { + builder: server_config.builder.clone(), + cache_dir: server_config.cache_dir.clone(), + max_per_core_load: server_config.max_per_core_load, + message_broker: match server_config.message_broker { + Some(MessageBroker::AMQP(conf)) => Some(MessageBrokerConfigs { + amqp: Some(MessageBroker::AMQP(conf)), + ..Default::default() + }), + Some(MessageBroker::Redis(conf)) => Some(MessageBrokerConfigs { + redis: Some(MessageBroker::Redis(conf)), + ..Default::default() + }), + None => Default::default(), + }, + num_cpus_to_ignore: server_config.num_cpus_to_ignore, + toolchain_cache_size: server_config.toolchain_cache_size, + toolchains: server_config + .toolchains + .map(|x| x.clone().into()) + .unwrap_or(CacheConfigs { + disk: Some(server_config.toolchains_fallback), + ..Default::default() + }), + } + } } } diff --git a/src/dist/cache.rs b/src/dist/cache.rs index 73d36af5cf..6578f7e868 100644 --- a/src/dist/cache.rs +++ b/src/dist/cache.rs @@ -8,6 +8,9 @@ use std::path::{Path, PathBuf}; #[cfg(feature = "dist-client")] pub use self::client::ClientToolchains; +#[cfg(feature = "dist-server")] +pub use self::server::ServerToolchains; + use crate::util::Digest; use std::io::Read; @@ -499,15 +502,6 @@ impl TcCache { self.inner.get(make_lru_key_path(&tc.archive_id)) } - pub async fn get_async( - &mut self, - tc: &Toolchain, - ) -> LruResult> { - self.inner - .get_async(make_lru_key_path(&tc.archive_id)) - .await - } - pub fn len(&self) -> usize { self.inner.len() } @@ -541,3 +535,153 @@ fn file_key(rdr: R) -> Result { fn make_lru_key_path(key: &str) -> PathBuf { Path::new(&key[0..1]).join(&key[1..2]).join(key) } + +#[cfg(feature = "dist-server")] +mod server { + use async_compression::tokio::bufread::GzipDecoder; + + use tokio::io::BufReader; + use tokio::sync::{Mutex, Notify}; + use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; + + use std::collections::HashMap; + use 
std::path::{Path, PathBuf}; + use std::sync::Arc; + + use crate::cache::cache; + use crate::dist::Toolchain; + use crate::errors::*; + + use super::make_lru_key_path; + + pub struct ServerToolchains { + cached_toolchains_dir: PathBuf, + cached_toolchains_capacity: i64, + cached_toolchains_size: i64, + cached_toolchains: Mutex>, + remote_toolchains: Arc, + toolchain_evicted: Notify, + } + + // pub fn dir_size_on_disk(path: impl Into) -> io::Result { + // fn dir_size(mut dir: fs::ReadDir) -> io::Result { + // dir.try_fold(0, |acc, file| { + // let file = file?; + // let size = match file.metadata()? { + // data if data.is_dir() => dir_size(fs::read_dir(file.path())?)?, + // data => data.len(), + // }; + // Ok(acc + size) + // }) + // } + + // dir_size(fs::read_dir(path.into())?) + // } + + impl ServerToolchains { + pub fn new( + cached_toolchains_dir: &Path, + cached_toolchains_capacity: i64, + remote_toolchains: Arc, + ) -> Self { + trace!( + "Using ServerToolchains({:?}, {})", + cached_toolchains_dir, + cached_toolchains_capacity + ); + ServerToolchains { + cached_toolchains_capacity, + cached_toolchains_dir: cached_toolchains_dir.to_owned(), + cached_toolchains_size: 0, + cached_toolchains: Mutex::new(HashMap::new()), + remote_toolchains, + toolchain_evicted: Notify::new(), + } + } + + pub async fn acquire(&mut self, toolchain: &Toolchain) -> Result { + let remote = &self.remote_toolchains; + let toolchain_id = &toolchain.archive_id; + let path = self + .cached_toolchains_dir + .join(make_lru_key_path(toolchain_id)); + + let toolchain_is_loaded = { + let mut cached_toolchains = self.cached_toolchains.lock().await; + if cached_toolchains.contains_key(toolchain) { + cached_toolchains + .entry(toolchain.clone()) + .and_modify(|(_, c)| *c += 1); + true + } else { + false + } + }; + + if !toolchain_is_loaded { + // TODO: Get the _uncompressed_ size of the toolchain archive + let inflated_size = 0; + let toolchain_reader = remote.get_stream(toolchain_id).await?.compat(); + loop { + if self.cached_toolchains_size + inflated_size + <= self.cached_toolchains_capacity + { + trace!("ServerToolchains: Unpacking toolchain {toolchain_id} to {path:?}"); + async_tar::Archive::new( + GzipDecoder::new(BufReader::new(toolchain_reader)).compat(), + ) + .unpack(&path) + .await + .context("Failed to unpack toolchain")?; + self.cached_toolchains_size += inflated_size; + trace!("ServerToolchains: Toolchain {toolchain_id} unpacked, new cache size is {}", self.cached_toolchains_size); + break; + } + self.toolchain_evicted.notified().await; + } + self.cached_toolchains + .lock() + .await + .entry(toolchain.clone()) + .and_modify(|(_, c)| *c += 1) + .or_insert((inflated_size, 1)); + } + Ok(path.clone()) + } + + pub async fn release(&mut self, toolchain: &Toolchain) -> Result<()> { + trace!( + "ServerToolchains: Releasing toolchain {}", + toolchain.archive_id + ); + + let mut toolchains = self.cached_toolchains.lock().await; + + toolchains + .entry(toolchain.clone()) + .and_modify(|(_, c)| *c = (*c - 1).max(0)); + + let toolchains_clone = toolchains.clone(); + + for (toolchain, (inflated_size, _)) in + toolchains_clone.iter().filter(|(_, (_, c))| *c <= 0) + { + let path = self + .cached_toolchains_dir + .join(make_lru_key_path(&toolchain.archive_id)); + toolchains.remove(toolchain); + if path.exists() { + trace!("ServerToolchains: Removing toolchain dir {:?}", path); + tokio::fs::remove_dir_all(&path) + .await + .expect("Failed to clean up toolchain directory"); + self.cached_toolchains_size = + 
(self.cached_toolchains_size - inflated_size).max(0);
+                    self.toolchain_evicted.notify_one();
+                }
+            }
+
+            Ok(())
+        }
+    }
+}
diff --git a/src/dist/client_auth.rs b/src/dist/client_auth.rs
index 19b8cc2187..5555411c00 100644
--- a/src/dist/client_auth.rs
+++ b/src/dist/client_auth.rs
@@ -246,7 +246,7 @@ mod code_grant_pkce {
             grant_type: GRANT_TYPE_PARAM_VALUE,
             redirect_uri,
         };
-        let client = new_reqwest_blocking_client(None);
+        let client = new_reqwest_blocking_client();
         let res = client.post(token_url).json(&token_request).send()?;
         if !res.status().is_success() {
             bail!(
diff --git a/src/dist/http.rs b/src/dist/http.rs
index dc98b4b2ec..37234a97b5 100644
--- a/src/dist/http.rs
+++ b/src/dist/http.rs
@@ -15,12 +15,10 @@
 #[cfg(feature = "dist-client")]
 pub use self::client::Client;
 #[cfg(feature = "dist-server")]
-pub use self::server::{
-    ClientAuthCheck, ClientVisibleMsg, JWTJobAuthorizer, ServerAuthCheck, HEARTBEAT_ERROR_INTERVAL,
-    HEARTBEAT_INTERVAL, HEARTBEAT_TIMEOUT,
+pub use self::{
+    common::{bincode_deserialize, bincode_req_fut, bincode_serialize, for_all_concurrent},
+    server::{ClientAuthCheck, ClientVisibleMsg},
 };
-#[cfg(feature = "dist-server")]
-pub use self::server::{Scheduler, Server};

 use std::env;
 use std::time::Duration;
@@ -55,15 +53,85 @@ pub fn get_dist_request_timeout() -> Duration {
 mod common {
     use reqwest::header;
-    use serde::{Deserialize, Serialize};
-    #[cfg(feature = "dist-server")]
-    use std::collections::HashMap;
-    use std::fmt;

-    use crate::dist;
+    use futures::{FutureExt, StreamExt, TryFutureExt};

     use crate::errors::*;

+    pub fn for_all_concurrent<S, F, Fut>(
+        pool: &tokio::runtime::Handle,
+        recv: S,
+        token: tokio_util::sync::CancellationToken,
+        mut f: F,
+    ) -> tokio::task::JoinHandle<()>
+    where
+        S: futures::stream::StreamExt + Send + 'static,
+        F: FnMut(S::Item) -> Fut + Send + 'static,
+        Fut: std::future::Future<Output = std::ops::ControlFlow<String, String>> + Send + 'static,
+    {
+        let pool1 = pool.clone();
+        let token1 = token.clone();
+        let token2 = token.clone();
+        pool.spawn(
+            recv.flat_map_unordered(None, move |item| {
+                if !token1.is_cancelled() {
+                    futures::stream::once(
+                        pool1
+                            .spawn(f(item))
+                            .unwrap_or_else(|_| std::ops::ControlFlow::Continue(String::new())),
+                    )
+                    .boxed()
+                } else {
+                    futures::stream::once(futures::future::ready(std::ops::ControlFlow::Break(
+                        String::new(),
+                    )))
+                    .boxed()
+                }
+            })
+            .take_while(move |control_flow| {
+                futures::future::ready(match control_flow {
+                    std::ops::ControlFlow::Break(msg) => {
+                        if !msg.is_empty() {
+                            debug!("{msg}");
+                        }
+                        token2.cancel();
+                        !token2.is_cancelled()
+                    }
+                    std::ops::ControlFlow::Continue(msg) => {
+                        if !msg.is_empty() {
+                            debug!("{msg}");
+                        }
+                        !token2.is_cancelled()
+                    }
+                })
+            })
+            .for_each_concurrent(None, |_| async move {})
+            .boxed(),
+        )
+    }
+
+    pub async fn bincode_deserialize<T>(bytes: Vec<u8>) -> Result<T>
+    where
+        T: for<'de> serde::Deserialize<'de> + Send + 'static,
+    {
+        tokio::runtime::Handle::current()
+            .spawn_blocking(move || bincode::deserialize(&bytes))
+            .await
+            .map_err(anyhow::Error::new)?
+            .map_err(anyhow::Error::new)
+    }
+
+    pub async fn bincode_serialize<T>(value: T) -> Result<Vec<u8>>
+    where
+        T: serde::Serialize + Send + 'static,
+    {
+        tokio::runtime::Handle::current()
+            .spawn_blocking(move || bincode::serialize(&value))
+            .await
+            .map_err(anyhow::Error::new)?
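+            // (The `map_err` above converts the `spawn_blocking` JoinError;
+            // the one below converts the `bincode::serialize` error.)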
+            .map_err(anyhow::Error::new)
+    }
+
     // Note that content-length is necessary due to https://github.com/tiny-http/tiny-http/issues/147
     pub trait ReqwestRequestBuilderExt: Sized {
         fn bincode<T: serde::Serialize + ?Sized>(self, bincode: &T) -> Result<Self>;
@@ -140,317 +208,40 @@ mod common {
             Ok(bincode::deserialize(&bytes)?)
         }
     }
-
-    #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
-    #[serde(deny_unknown_fields)]
-    pub struct JobJwt {
-        pub exp: u64,
-        pub job_id: dist::JobId,
-    }
-
-    #[derive(Clone, Debug, Serialize, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    pub enum AllocJobHttpResponse {
-        Success {
-            job_alloc: dist::JobAlloc,
-            need_toolchain: bool,
-            cert_digest: Vec<u8>,
-        },
-        Fail {
-            msg: String,
-        },
-    }
-    impl AllocJobHttpResponse {
-        #[cfg(feature = "dist-server")]
-        pub fn from_alloc_job_result(
-            res: dist::AllocJobResult,
-            certs: &HashMap<dist::ServerId, (Vec<u8>, Vec<u8>)>,
-        ) -> Self {
-            match res {
-                dist::AllocJobResult::Success {
-                    job_alloc,
-                    need_toolchain,
-                } => {
-                    if let Some((digest, _)) = certs.get(&job_alloc.server_id) {
-                        AllocJobHttpResponse::Success {
-                            job_alloc,
-                            need_toolchain,
-                            cert_digest: digest.to_owned(),
-                        }
-                    } else {
-                        AllocJobHttpResponse::Fail {
-                            msg: format!(
-                                "missing certificates for server {}",
-                                job_alloc.server_id.addr()
-                            ),
-                        }
-                    }
-                }
-                dist::AllocJobResult::Fail { msg } => AllocJobHttpResponse::Fail { msg },
-            }
-        }
-    }
-
-    #[derive(Clone, Debug, Serialize, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    pub struct ServerCertificateHttpResponse {
-        pub cert_digest: Vec<u8>,
-        pub cert_pem: Vec<u8>,
-    }
-
-    #[derive(Clone, Serialize, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    pub struct HeartbeatServerHttpRequest {
-        pub jwt_key: Vec<u8>,
-        pub num_cpus: usize,
-        pub max_per_core_load: f64,
-        pub server_nonce: dist::ServerNonce,
-        pub cert_digest: Vec<u8>,
-        pub cert_pem: Vec<u8>,
-        pub num_assigned_jobs: usize,
-        pub num_active_jobs: usize,
-    }
-    // cert_pem is quite long so elide it (you can retrieve it by hitting the server url anyway)
-    impl fmt::Debug for HeartbeatServerHttpRequest {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-            let HeartbeatServerHttpRequest {
-                jwt_key,
-                num_cpus,
-                max_per_core_load,
-                server_nonce,
-                cert_digest,
-                cert_pem,
-                num_assigned_jobs: assigned,
-                num_active_jobs: active,
-            } = self;
-            write!(f,
-                "HeartbeatServerHttpRequest {{ jwt_key: {:?}, num_cpus: {:?}, max_per_core_load: {:?}, server_nonce: {:?}, cert_digest: {:?}, cert_pem: [...{} bytes...], jobs: {{ assigned: {:?}, active: {:?} }} }}",
-                jwt_key, num_cpus, max_per_core_load, server_nonce, cert_digest, cert_pem.len(), assigned, active)
-        }
-    }
-    #[derive(Clone, Debug, Serialize, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    pub struct UpdateJobStateHttpRequest {
-        pub num_assigned_jobs: usize,
-        pub num_active_jobs: usize,
-    }
-    #[derive(Clone, Debug, Serialize, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    pub struct RunJobHttpRequest {
-        pub command: dist::CompileCommand,
-        pub outputs: Vec<String>,
-    }
 }

 pub mod urls {
-    use crate::dist::{JobId, ServerId};
-
-    pub fn scheduler_alloc_job(scheduler_url: &reqwest::Url) -> reqwest::Url {
+    pub fn scheduler_status(scheduler_url: &reqwest::Url) -> reqwest::Url {
         scheduler_url
-            .join("/api/v1/scheduler/alloc_job")
-            .expect("failed to create alloc job url")
+            .join("/api/v2/status")
+            .expect("failed to create status url")
     }
-    pub fn scheduler_server_certificate(
-        scheduler_url: &reqwest::Url,
-        server_id: ServerId,
-    ) -> reqwest::Url {
-        scheduler_url
-            .join(&format!(
-                "/api/v1/scheduler/server_certificate/{}",
-                server_id.addr()
-            ))
-            .expect("failed to create server 
certificate url") - } - pub fn scheduler_heartbeat_server(scheduler_url: &reqwest::Url) -> reqwest::Url { + pub fn scheduler_new_job(scheduler_url: &reqwest::Url) -> reqwest::Url { scheduler_url - .join("/api/v1/scheduler/heartbeat_server") - .expect("failed to create heartbeat url") + .join("/api/v2/jobs/new") + .expect("failed to create new job url") } - pub fn scheduler_job_state(scheduler_url: &reqwest::Url) -> reqwest::Url { + pub fn scheduler_run_job(scheduler_url: &reqwest::Url, job_id: &str) -> reqwest::Url { scheduler_url - .join("/api/v1/scheduler/job_state") - .expect("failed to create job state url") + .join(&format!("/api/v2/job/{job_id}/run")) + .expect("failed to create run job url") } - pub fn scheduler_status(scheduler_url: &reqwest::Url) -> reqwest::Url { + pub fn scheduler_submit_toolchain( + scheduler_url: &reqwest::Url, + archive_id: &str, + ) -> reqwest::Url { scheduler_url - .join("/api/v1/scheduler/status") - .expect("failed to create alloc job url") - } - - pub fn server_reserve_job(server_id: ServerId) -> reqwest::Url { - let url = format!("https://{}/api/v1/distserver/reserve_job", server_id.addr()); - reqwest::Url::parse(&url).expect("failed to create reserve job url") - } - pub fn server_assign_job(server_id: ServerId) -> reqwest::Url { - let url = format!("https://{}/api/v1/distserver/assign_job", server_id.addr(),); - reqwest::Url::parse(&url).expect("failed to create assign job url") - } - pub fn server_submit_toolchain(server_id: ServerId, job_id: JobId) -> reqwest::Url { - let url = format!( - "https://{}/api/v1/distserver/submit_toolchain/{}", - server_id.addr(), - job_id - ); - reqwest::Url::parse(&url).expect("failed to create submit toolchain url") - } - pub fn server_run_job(server_id: ServerId, job_id: JobId) -> reqwest::Url { - let url = format!( - "https://{}/api/v1/distserver/run_job/{}", - server_id.addr(), - job_id - ); - reqwest::Url::parse(&url).expect("failed to create run job url") + .join(&format!("/api/v2/toolchain/{archive_id}")) + .expect("failed to create submit toolchain url") } } #[cfg(feature = "dist-server")] mod server { - use async_trait::async_trait; - use axum::{ - body::Bytes, - extract::{ - ConnectInfo, DefaultBodyLimit, Extension, FromRequest, FromRequestParts, Path, Request, - }, - http::{request::Parts, HeaderMap, Method, StatusCode, Uri}, - response::{IntoResponse, Response}, - routing, RequestPartsExt, Router, - }; - - use axum_extra::{ - headers::{authorization::Bearer, Authorization}, - TypedHeader, - }; - use bytes::Buf; - - use async_compression::tokio::bufread::ZlibDecoder as ZlibReadDecoder; - use futures::{lock::Mutex, TryStreamExt}; - - use hyper_util::rt::{TokioExecutor, TokioIo}; - - use once_cell::sync::Lazy; - use openssl::ssl::{Ssl, SslAcceptor, SslMethod}; - - use rand::{rngs::OsRng, RngCore}; - - use serde_json::json; - - use std::{ - borrow::Borrow, collections::HashMap, io, net::SocketAddr, result::Result as StdResult, - sync::Arc, time::Duration, - }; - - use tokio::{io::AsyncReadExt, net::TcpListener}; - use tokio_openssl::SslStream; - use tokio_util::io::StreamReader; - use tower::{Service, ServiceBuilder, ServiceExt}; - use tower_http::{ - request_id::{MakeRequestUuid, PropagateRequestIdLayer, SetRequestIdLayer}, - sensitive_headers::{SetSensitiveRequestHeadersLayer, SetSensitiveResponseHeadersLayer}, - trace::{DefaultMakeSpan, DefaultOnResponse, TraceLayer}, - }; - - use super::common::{ - bincode_req_fut, AllocJobHttpResponse, HeartbeatServerHttpRequest, JobJwt, - ReqwestRequestBuilderExt, 
RunJobHttpRequest, ServerCertificateHttpResponse, - UpdateJobStateHttpRequest, - }; - use crate::dist::{ - self, http::urls, AssignJobResult, HeartbeatServerResult, JobAuthorizer, JobId, - SchedulerIncoming, SchedulerOutgoing, ServerId, ServerIncoming, ServerNonce, - ServerOutgoing, Toolchain, UpdateJobStateResult, - }; - use crate::util::new_reqwest_client; - - use crate::errors::*; - - pub const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - pub const HEARTBEAT_ERROR_INTERVAL: Duration = Duration::from_secs(3); - pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(60); - - fn create_https_cert_and_privkey(addr: SocketAddr) -> Result<(Vec, Vec, Vec)> { - let rsa_key = openssl::rsa::Rsa::::generate(2048) - .context("failed to generate rsa privkey")?; - let privkey_pem = rsa_key - .private_key_to_pem() - .context("failed to create pem from rsa privkey")?; - let privkey: openssl::pkey::PKey = - openssl::pkey::PKey::from_rsa(rsa_key) - .context("failed to create openssl pkey from rsa privkey")?; - let mut builder = - openssl::x509::X509::builder().context("failed to create x509 builder")?; - - // Populate the certificate with the necessary parts, mostly from mkcert in openssl - builder - .set_version(2) - .context("failed to set x509 version")?; - let serial_number = openssl::bn::BigNum::from_u32(0) - .and_then(|bn| bn.to_asn1_integer()) - .context("failed to create openssl asn1 0")?; - builder - .set_serial_number(serial_number.as_ref()) - .context("failed to set x509 serial number")?; - let not_before = openssl::asn1::Asn1Time::days_from_now(0) - .context("failed to create openssl not before asn1")?; - builder - .set_not_before(not_before.as_ref()) - .context("failed to set not before on x509")?; - let not_after = openssl::asn1::Asn1Time::days_from_now(365) - .context("failed to create openssl not after asn1")?; - builder - .set_not_after(not_after.as_ref()) - .context("failed to set not after on x509")?; - builder - .set_pubkey(privkey.as_ref()) - .context("failed to set pubkey for x509")?; - - let mut name = openssl::x509::X509Name::builder()?; - name.append_entry_by_nid(openssl::nid::Nid::COMMONNAME, &addr.to_string())?; - let name = name.build(); - - builder - .set_subject_name(&name) - .context("failed to set subject name")?; - builder - .set_issuer_name(&name) - .context("failed to set issuer name")?; - - // Add the SubjectAlternativeName - let extension = openssl::x509::extension::SubjectAlternativeName::new() - .ip(&addr.ip().to_string()) - .build(&builder.x509v3_context(None, None)) - .context("failed to build SAN extension for x509")?; - builder - .append_extension(extension) - .context("failed to append SAN extension for x509")?; - - // Add ExtendedKeyUsage - let ext_key_usage = openssl::x509::extension::ExtendedKeyUsage::new() - .server_auth() - .build() - .context("failed to build EKU extension for x509")?; - builder - .append_extension(ext_key_usage) - .context("fails to append EKU extension for x509")?; - - // Finish the certificate - builder - .sign(&privkey, openssl::hash::MessageDigest::sha1()) - .context("failed to sign x509 with sha1")?; - let cert: openssl::x509::X509 = builder.build(); - let cert_pem = cert.to_pem().context("failed to create pem from x509")?; - let cert_digest = cert - .digest(openssl::hash::MessageDigest::sha256()) - .context("failed to create digest of x509 certificate")? 
- .as_ref() - .to_owned(); - - Ok((cert_digest, cert_pem, privkey_pem)) - } - // Messages that are non-sensitive and can be sent to the client #[derive(Debug)] - pub struct ClientVisibleMsg(String); + pub struct ClientVisibleMsg(pub String); impl ClientVisibleMsg { pub fn from_nonsensitive(s: String) -> Self { ClientVisibleMsg(s) @@ -458,988 +249,338 @@ mod server { } pub trait ClientAuthCheck: Send + Sync { - fn check(&self, token: &str) -> StdResult<(), ClientVisibleMsg>; + fn check(&self, token: &str) -> std::result::Result<(), ClientVisibleMsg>; } - pub type ServerAuthCheck = Box Option + Send + Sync>; - - const JWT_KEY_LENGTH: usize = 256 / 8; - static JWT_HEADER: Lazy = Lazy::new(|| jwt::Header::new(jwt::Algorithm::HS256)); - static JWT_VALIDATION: Lazy = Lazy::new(|| { - let mut validation = jwt::Validation::new(jwt::Algorithm::HS256); - validation.leeway = 0; - validation.validate_exp = false; - validation.validate_nbf = false; - validation - }); - - fn with_request_tracing(app: Router) -> Router { - // Mark these headers as sensitive so they don't show in logs - let headers_to_redact: Arc<[_]> = Arc::new([ - http::header::AUTHORIZATION, - http::header::PROXY_AUTHORIZATION, - http::header::COOKIE, - http::header::SET_COOKIE, - ]); - app.layer( - ServiceBuilder::new() - .layer(SetSensitiveRequestHeadersLayer::from_shared(Arc::clone( - &headers_to_redact, - ))) - .layer(SetRequestIdLayer::x_request_id(MakeRequestUuid)) - .layer( - TraceLayer::new_for_http() - .make_span_with(DefaultMakeSpan::new().include_headers(true)) - .on_response(DefaultOnResponse::new().include_headers(true)), - ) - .layer(PropagateRequestIdLayer::x_request_id()) - .layer(SetSensitiveResponseHeadersLayer::from_shared( - headers_to_redact, - )), - ) - } - - fn get_header_value<'a>(headers: &'a HeaderMap, name: &'a str) -> Option<&'a str> { - if let Some(header) = headers.get(name) { - if let Ok(header) = header.to_str() { - return Some(header); - } - } - None - } - - /// Return `content` as either a bincode or json encoded `Response` - /// depending on the Accept header in `request`. - pub fn accepts_response(headers: &HeaderMap, content: &T) -> (StatusCode, Vec) - where - T: serde::Serialize, - { - if let Some(header) = headers.get("Accept") { - // This is the only function we use from rouille. - // Maybe we can find a replacement? 
- match rouille::input::priority_header_preferred( - header.to_str().unwrap_or("*/*"), - ["application/octet-stream", "application/json"] - .iter() - .cloned(), - ) { - // application/octet-stream - Some(0) => match bincode::serialize(content) { - Ok(body) => (StatusCode::OK, body), - Err(err) => ( - StatusCode::INTERNAL_SERVER_ERROR, - format!("Failed to serialize response body: {err}").into_bytes(), - ), - }, - // application/json - Some(1) => (StatusCode::OK, json!(content).as_str().unwrap().into()), - _ => ( - StatusCode::BAD_REQUEST, - "Request must accept application/json or application/octet-stream".into(), - ), - } - } else { - ( - StatusCode::BAD_REQUEST, - "Request must accept application/json or application/octet-stream".into(), - ) - } - } - - fn anyhow_response( - method: Method, - uri: Uri, - // the last argument must be the error itself - err: anyhow::Error, - ) -> (StatusCode, Vec) { - let msg = format!("sccache: `{method} {uri}` failed with {err}"); - tracing::error!("{}", msg); - (StatusCode::INTERNAL_SERVER_ERROR, msg.into_bytes()) - } - - fn unwrap_infallible(result: StdResult) -> T { - match result { - Ok(value) => value, - Err(err) => match err {}, - } - } - - struct Bincode(T); - - #[async_trait] - impl FromRequest for Bincode - where - Bytes: FromRequest, - S: Send + Sync, - T: serde::de::DeserializeOwned, - { - type Rejection = Response; - - async fn from_request(req: Request, state: &S) -> StdResult { - let data = match get_header_value(req.headers(), "Content-Type") { - Some("application/octet-stream") => Bytes::from_request(req, state) - .await - .map_err(IntoResponse::into_response)? - .to_vec(), - _ => return Err((StatusCode::BAD_REQUEST, "Wrong content type").into_response()), - }; - - let data = bincode::deserialize_from::<_, T>(data.reader()) - .map_err(|err| (StatusCode::BAD_REQUEST, err.to_string()).into_response())?; +} - Ok(Self(data)) - } - } +#[cfg(feature = "dist-client")] +mod client { + use super::super::cache; + use crate::config; + use crate::dist::http::common::{bincode_deserialize, bincode_serialize}; + use crate::dist::pkg::{InputsPackager, ToolchainPackager}; + use crate::dist::{ + self, ClientIncoming, ClientOutgoing, CompileCommand, NewJobRequest, NewJobResponse, + PathTransformer, RunJobRequest, RunJobResponse, SchedulerStatusResult, + SubmitToolchainResult, Toolchain, + }; + use crate::util::new_reqwest_client; - // Generation and verification of job auth - pub struct JWTJobAuthorizer { - server_key: Vec, - } + // use byteorder::{BigEndian, WriteBytesExt}; - impl JWTJobAuthorizer { - pub fn new(server_key: Vec) -> Box { - Box::new(Self { server_key }) - } - } + // use futures::lock::Mutex; + use futures::{lock::Mutex, StreamExt}; + use tokio_tungstenite::tungstenite::client::IntoClientRequest; - impl dist::JobAuthorizer for JWTJobAuthorizer { - fn generate_token(&self, job_id: JobId) -> Result { - let claims = JobJwt { exp: 0, job_id }; - let key = jwt::EncodingKey::from_secret(&self.server_key); - jwt::encode(&JWT_HEADER, &claims, &key) - .map_err(|e| anyhow!("Failed to create JWT for job: {}", e)) - } - fn verify_token(&self, job_id: JobId, token: &str) -> Result<()> { - let valid_claims = JobJwt { exp: 0, job_id }; - let key = jwt::DecodingKey::from_secret(&self.server_key); - jwt::decode(token, &key, &JWT_VALIDATION) - .map_err(|e| anyhow!("JWT decode failed: {}", e)) - .and_then(|res| { - fn identical_t(_: &T, _: &T) {} - identical_t(&res.claims, &valid_claims); - if res.claims == valid_claims { - Ok(()) - } else { - 
Err(anyhow!("mismatched claims")) - } - }) - } - } + use std::collections::HashMap; + use std::sync::Arc; + use std::time::Duration; - #[test] - fn test_job_token_verification() { - let ja = JWTJobAuthorizer::new(vec![1, 2, 2]); + use tokio::sync::mpsc::UnboundedSender; + use tokio_stream::wrappers::UnboundedReceiverStream; + use tokio_tungstenite::tungstenite::protocol::CloseFrame; + use tokio_tungstenite::tungstenite::protocol::Message; - let job_id = JobId(55); - let token = ja.generate_token(job_id).unwrap(); + use async_trait::async_trait; + use flate2::write::ZlibEncoder as ZlibWriteEncoder; + use flate2::Compression; + use reqwest::Body; + use std::io::Write; + use std::path::{Path, PathBuf}; - let job_id2 = JobId(56); - let token2 = ja.generate_token(job_id2).unwrap(); + use super::common::{bincode_req_fut, for_all_concurrent, ReqwestRequestBuilderExt}; + use super::urls; + use crate::errors::*; - let ja2 = JWTJobAuthorizer::new(vec![1, 2, 3]); + type WebSocketCallback = tokio::sync::oneshot::Sender; - // Check tokens are deterministic - assert_eq!(token, ja.generate_token(job_id).unwrap()); - // Check token verification works - assert!(ja.verify_token(job_id, &token).is_ok()); - assert!(ja.verify_token(job_id, &token2).is_err()); - assert!(ja.verify_token(job_id2, &token).is_err()); - assert!(ja.verify_token(job_id2, &token2).is_ok()); - // Check token verification with a different key fails - assert!(ja2.verify_token(job_id, &token).is_err()); - assert!(ja2.verify_token(job_id2, &token2).is_err()); - } + pub type WebSocketRequest = ( + String, // request id + Option, // request payload + Option>, // response callback + ); - pub struct Scheduler { - public_addr: SocketAddr, - handler: S, - // Is this client permitted to use the scheduler? - check_client_auth: Box, - // Do we believe the server is who they appear to be? 
- check_server_auth: ServerAuthCheck, + pub struct WebSocketClient { + connection: tokio_util::sync::CancellationToken, + outgoing: Arc>>>, + requests: Arc>>>, } - impl Scheduler { - pub fn new( - public_addr: SocketAddr, - handler: S, - check_client_auth: Box, - check_server_auth: ServerAuthCheck, - ) -> Self { - Self { - public_addr, - handler, - check_client_auth, - check_server_auth, - } - } + impl WebSocketClient + where + Outgoing: serde::Serialize + std::fmt::Display + Send + Sync + 'static, + Incoming: + for<'a> serde::Deserialize<'a> + Clone + std::fmt::Display + Send + Sync + 'static, + { + pub async fn new( + runtime: &tokio::runtime::Handle, + connect: Connect, + shutdown: Shutdown, + ) -> Result + where + Error: std::fmt::Debug + std::fmt::Display + Send + 'static, + Sender: futures::SinkExt + Send + Unpin + 'static, + Receiver: futures::StreamExt> + + Send + + Unpin + + 'static, + Shutdown: FnOnce() -> Incoming + Send + 'static, + Connect: FnOnce() -> ConnectFut + Send + 'static, + ConnectFut: std::future::Future> + Send, + { + let (sndr, recv) = match connect().await.context("WebSocket connect failed") { + Ok(res) => res, + Err(err) => { + warn!("{err}"); + return Err(anyhow!(err)); + } + }; - pub async fn start(self) -> Result { - pub struct SchedulerRequester { - client: Mutex, - } + let connection = tokio_util::sync::CancellationToken::new(); + let requests_map = Arc::new(Mutex::new(HashMap::new())); + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let requests = UnboundedReceiverStream::new(rx); - #[async_trait] - impl SchedulerOutgoing for SchedulerRequester { - async fn do_assign_job( - &self, - server_id: ServerId, - tc: Toolchain, - auth: String, - ) -> Result { - let url = urls::server_assign_job(server_id); - let req = self.client.lock().await.post(url); - bincode_req_fut(req.bearer_auth(auth).bincode(&tc)?) 
- .await - .context("POST to server assign_job failed") - } - } + let pool = runtime.clone(); + let token = connection.clone(); + let reqs_map = requests_map.clone(); - struct SchedulerState { - client_auth: Box, - server_auth: ServerAuthCheck, - server_certs: Mutex, Vec)>>, - requester: SchedulerRequester, - } + runtime.spawn(async move { + // Wrap shared sender in a Mutex so the concurrent + // `flat_map_unordered` task writes are serialized + let sndr = Arc::new(Mutex::new(sndr)); - // Verify authenticated sccache clients - struct AuthenticatedClient; - - #[async_trait] - impl FromRequestParts for AuthenticatedClient - where - S: Send + Sync, - { - type Rejection = StatusCode; - - async fn from_request_parts( - parts: &mut Parts, - _state: &S, - ) -> StdResult { - let TypedHeader(Authorization(bearer)) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::UNAUTHORIZED)?; + // Clones to move into the futures that need mutable refs + let sndr_reqs = reqs_map.clone(); + let recv_reqs = reqs_map.clone(); - let Extension(this) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; - - this.client_auth - .check(bearer.token()) - .map(|_| AuthenticatedClient) - .map_err(|err| { - tracing::warn!( - "[AuthenticatedClient()]: invalid client auth: {}", - err.0 - ); - StatusCode::UNAUTHORIZED - }) - } - } + // Create child tokens for the subtasks so we can imperatively + // know when the parent token has been canceled + let sndr_token = token.child_token(); + let recv_token = token.child_token(); - // Verify authenticated sccache servers - struct AuthenticatedServerId(ServerId); - - #[async_trait] - impl FromRequestParts for AuthenticatedServerId - where - S: Send + Sync, - { - type Rejection = StatusCode; - - async fn from_request_parts( - parts: &mut Parts, - _state: &S, - ) -> StdResult { - let Extension(this) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + let mut send_task = for_all_concurrent(&pool, requests, sndr_token, move |(req_id, req, cb): WebSocketRequest| { - let ConnectInfo(remote_addr) = parts - .extract::>() - .await - .map_err(|_| StatusCode::BAD_REQUEST)?; + // Local clones for this individual task + let sndr = sndr.clone(); + let reqs = sndr_reqs.clone(); - let TypedHeader(Authorization(bearer)) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::UNAUTHORIZED)?; + async move { + // Support sending a message without listening for a response + if let Some(cb) = cb { + // Insert request callback into the requests map + reqs.lock().await.insert(req_id.clone(), cb); + } - let server_id = (this.server_auth)(bearer.token()).ok_or_else(|| { - tracing::warn!( - "[AuthenticatedServerId({remote_addr})]: invalid server auth token" - ); - StatusCode::UNAUTHORIZED - })?; - - let origin_ip = - if let Some(header) = get_header_value(&parts.headers, "X-Real-IP") { - tracing::trace!("X-Real-IP: {:?}", header); - match header.parse() { - Ok(ip) => ip, + // Support listening for a message without sending a request + if let Some(req) = req { + debug!("WebSocket sending request: id={req_id} req={req}"); + // Serialize the request + let buf = match bincode_serialize((req_id, req)).await { + Ok(buf) => buf, Err(err) => { - tracing::warn!( - "X-Real-IP value {:?} could not be parsed: {:?}", - header, - err - ); - return Err(StatusCode::UNAUTHORIZED); - } + return std::ops::ControlFlow::Continue(format!("WebSocket failed to serialize request: {err}")); + }, + }; + // Send the message. Abort on error. 
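+                        // (Returning `ControlFlow::Break` here makes the
+                        // `take_while` inside `for_all_concurrent` cancel the
+                        // shared token, which tears down both pump tasks.)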
+ if sndr.lock().await.send(Message::Binary(buf)).await.is_err() { + return std::ops::ControlFlow::Break("WebSocket send failure".into()); } - } else { - remote_addr.ip() - }; + } - if server_id.addr().ip() != origin_ip { - tracing::trace!("server ip: {:?}", server_id.addr().ip()); - tracing::trace!("request ip: {:?}", remote_addr.ip()); - Err(StatusCode::UNAUTHORIZED) - } else { - Ok(AuthenticatedServerId(server_id)) + std::ops::ControlFlow::Continue(String::new()) } - } - } + }); - let Self { - check_client_auth, - check_server_auth, - handler, - .. - } = self; - - // Doesn't seem like we can easily add the handler to SchedulerState, - // so create a clone and move it into each route handling closure. - let handler = Arc::new(handler); - - let app = Router::new() - .route( - "/api/v1/scheduler/alloc_job", - routing::post({ - let handler = Arc::clone(&handler); - move |method: Method, - uri: Uri, - headers: HeaderMap, - Extension(state): Extension>, - Bincode(toolchain): Bincode| async move { - let res = - match handler.handle_alloc_job(&state.requester, toolchain).await { - Ok(res) => res, - Err(err) => return anyhow_response(method, uri, err), - }; + let mut recv_task = for_all_concurrent(&pool, recv, recv_token, move |msg| { - accepts_response( - &headers, - &AllocJobHttpResponse::from_alloc_job_result( - res, - state.server_certs.lock().await.borrow(), - ), - ) - } - }), - ) - .route( - "/api/v1/scheduler/server_certificate/:server_id", - routing::get({ - move |method: Method, - uri: Uri, - headers: HeaderMap, - Extension(state): Extension>, - Path(server_id): Path| async move { - match state - .server_certs - .lock() - .await - .get(&server_id) - .map(|v| v.to_owned()) - .context("server cert not available") - .map(|(cert_digest, cert_pem)| ServerCertificateHttpResponse { - cert_digest, - cert_pem, - }) { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err), + // Local clones for this individual task + let reqs = recv_reqs.clone(); + + async move { + let (req_id, res) = match msg { + Err(err) => { + return std::ops::ControlFlow::Break(format!("WebSocket recv failure: {err:?}")) } - } - }), - ) - .route( - "/api/v1/scheduler/heartbeat_server", - routing::post({ - let handler = Arc::clone(&handler); - move | - method: Method, - uri: Uri, - headers: HeaderMap, - Extension(state): Extension>, - AuthenticatedServerId(server_id): AuthenticatedServerId, - Bincode(heartbeat) : Bincode - | async move { - { - // Lock client and server_certs until the certs have been updated - let mut client = state.requester.client.lock().await; - let mut server_certs = state.server_certs.lock().await; - let current_matching_cert = server_certs.get(&server_id).filter(|(saved_cert_digest, _)| { - saved_cert_digest == &heartbeat.cert_digest - }); - // If no current cert, or cert digest doesn't match new cert, update certs and recreate client - if current_matching_cert.is_none() { - // Remove the old entry first - server_certs.remove(&server_id); - if reqwest::Certificate::from_pem(&heartbeat.cert_pem).is_ok() { - // Insert so it's added to the client in the following loop - server_certs.insert(server_id, (heartbeat.cert_digest, heartbeat.cert_pem)); - } else { - warn!( - "[handle_heartbeat_server({})]: failed to interpret pem as certificate", - server_id.addr() - ); + Ok(msg) => { + match msg { + Message::Close(None) => { + return std::ops::ControlFlow::Break("WebSocket close without CloseFrame".into()); } - match new_reqwest_client(None, Some(&server_certs)) { - // Use 
the updated certificates - Ok(new_client) => *client = new_client, - Err(err) => return anyhow_response(method, uri, err.into()), - }; + Message::Close(Some(CloseFrame { code, reason })) => { + return std::ops::ControlFlow::Break(format!("WebSocket disconnected code={}, reason=`{}`", code, reason)); + } + Message::Text(str) => { + return std::ops::ControlFlow::Continue(format!("WebSocket received unexpected text response: {str}")); + } + Message::Binary(buf) => { + match bincode_deserialize::<(String, Incoming)>(buf).await { + Ok(res) => res, + Err(err) => { + return std::ops::ControlFlow::Continue(format!("WebSocket failed to deserialize response: {err}")); + } + } + } + _ => return std::ops::ControlFlow::Continue(String::new()), } - }; - - match handler.handle_heartbeat_server( - server_id, - heartbeat.server_nonce, - heartbeat.num_cpus, - heartbeat.max_per_core_load, - JWTJobAuthorizer::new(heartbeat.jwt_key), - heartbeat.num_assigned_jobs, - heartbeat.num_active_jobs, - ).await { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err) - } - } - }), - ) - .route( - "/api/v1/scheduler/job_state", - routing::post({ - let handler = Arc::clone(&handler); - move | - method: Method, - uri: Uri, - headers: HeaderMap, - AuthenticatedServerId(server_id): AuthenticatedServerId, - Bincode(update) : Bincode - | async move { - match handler.handle_update_job_state( - server_id, - update.num_assigned_jobs, - update.num_active_jobs, - ).await { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err) } - } - }), - ) - .route( - "/api/v1/scheduler/status", - routing::get({ - let handler = Arc::clone(&handler); - move |method: Method, - uri: Uri, - headers: HeaderMap, - _: AuthenticatedClient| async move { - match handler.handle_status().await { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err), + }; + + if let Some(cb) = reqs.lock().await.remove(&req_id) { + debug!("WebSocket received response: id={req_id}, res={res}"); + if !cb.is_closed() && cb.send(res).is_err() { + return std::ops::ControlFlow::Break(format!("WebSocket failed to notify client of response with id={req_id}")); } } - }), - ) - .fallback(|| async move { (StatusCode::NOT_FOUND, "404") }) - .layer(Extension(Arc::new(SchedulerState { - client_auth: check_client_auth, - server_auth: check_server_auth, - server_certs: Default::default(), - requester: SchedulerRequester { - client: Mutex::new(new_reqwest_client(None, None).expect("http client must build with success")), - }, - }))); - - let app = with_request_tracing(app); - - let mut make_service = app.into_make_service_with_connect_info::(); - - let listener = TcpListener::bind(self.public_addr).await.unwrap(); - - tracing::info!("Scheduler listening for clients on {}", self.public_addr); - - loop { - let (tcp_stream, remote_addr) = listener.accept().await.unwrap(); - let tower_service = unwrap_infallible(make_service.call(remote_addr).await); - - tokio::spawn(async move { - // Hyper has its own `AsyncRead` and `AsyncWrite` traits and doesn't use tokio. - // `TokioIo` converts between them. - let tok_stream = TokioIo::new(tcp_stream); - let hyper_service = hyper::service::service_fn( - move |request: Request| { - // Clone `tower_service` because hyper's `Service` uses `&self` whereas - // tower's `Service` requires `&mut self`. 
- tower_service.clone().oneshot(request) - }, - ); - - if let Err(err) = - hyper_util::server::conn::auto::Builder::new(TokioExecutor::new()) - .serve_connection(tok_stream, hyper_service) - .await - { - tracing::debug!("sccache: failed to serve connection: {err:#}"); + std::ops::ControlFlow::Continue(String::new()) } }); - } - } - } - pub struct Server { - public_addr: SocketAddr, - bind_addr: SocketAddr, - scheduler_url: reqwest::Url, - scheduler_auth: String, - // HTTPS pieces all the builders will use for connection encryption - cert_digest: Vec, - cert_pem: Vec, - privkey_pem: Vec, - // Key used to sign any requests relating to jobs - jwt_key: Vec, - // Randomly generated nonce to allow the scheduler to detect server restarts - server_nonce: ServerNonce, - max_per_core_load: f64, - num_cpus: usize, - handler: S, - } + // Wait for either cancel/send/recv to finish + tokio::select! { + _ = token.cancelled() => { + recv_task.abort(); + send_task.abort(); + } + _ = (&mut send_task) => { + token.cancel(); + recv_task.abort(); + }, + _ = (&mut recv_task) => { + token.cancel(); + send_task.abort(); + } + } - impl Server { - #[allow(clippy::too_many_arguments)] - pub fn new( - public_addr: SocketAddr, - bind_addr: SocketAddr, - scheduler_url: reqwest::Url, - scheduler_auth: String, - max_per_core_load: f64, - num_cpus: usize, - handler: S, - ) -> Result { - let (cert_digest, cert_pem, privkey_pem) = - create_https_cert_and_privkey(public_addr) - .context("failed to create HTTPS certificate for server")?; - let mut jwt_key = vec![0; JWT_KEY_LENGTH]; - OsRng.fill_bytes(&mut jwt_key); - let server_nonce = ServerNonce::new(); + // Notify all outstanding request handlers that the WebSocket client has shutdown. + let res = shutdown(); + for (req_id, cb) in reqs_map.lock().await.drain() { + if !cb.is_closed() && cb.send(res.clone()).is_err() { + warn!("WebSocket failed to notify client of shutdown (req_id={req_id})") + } + } + }); Ok(Self { - public_addr, - bind_addr, - scheduler_url, - scheduler_auth, - cert_digest, - cert_pem, - privkey_pem, - jwt_key, - server_nonce, - max_per_core_load: max_per_core_load.max(1f64), - num_cpus: num_cpus.max(1), - handler, + connection, + outgoing: Arc::new(Mutex::new(tx)), + requests: requests_map.clone(), }) } - pub async fn start(self) -> Result { - #[derive(Clone)] - pub struct ServerRequester { - client: reqwest::Client, - heartbeat_url: reqwest::Url, - heartbeat_req: HeartbeatServerHttpRequest, - scheduler_url: reqwest::Url, - scheduler_auth: String, - } - - #[async_trait] - impl ServerOutgoing for ServerRequester { - async fn do_heartbeat( - &self, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> Result { - let mut heartbeat_req = self.heartbeat_req.clone(); - heartbeat_req.num_assigned_jobs = num_assigned_jobs; - heartbeat_req.num_active_jobs = num_active_jobs; - bincode_req_fut( - self.client - .post(self.heartbeat_url.clone()) - .bearer_auth(self.scheduler_auth.clone()) - .bincode(&heartbeat_req) - .expect("failed to serialize heartbeat"), - ) - .await - } - async fn do_update_job_state( - &self, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> Result { - let url = urls::scheduler_job_state(&self.scheduler_url); - let req = UpdateJobStateHttpRequest { - num_assigned_jobs, - num_active_jobs, - }; - bincode_req_fut( - self.client - .post(url) - .bearer_auth(self.scheduler_auth.clone()) - .bincode(&req)?, - ) - .await - .context("POST to scheduler job_state failed") - } - } - - struct ServerState { - auth: Box, - server_nonce: 
ServerNonce, - } - - // Verification of job auth in a request - struct AuthenticatedJob(JobId); - - #[async_trait] - impl FromRequestParts for AuthenticatedJob - where - S: Send + Sync, - { - type Rejection = StatusCode; - - async fn from_request_parts( - parts: &mut Parts, - _state: &S, - ) -> StdResult { - let Extension(this) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; - - let Path(job_id) = parts - .extract::>() - .await - .map_err(|_| StatusCode::BAD_REQUEST)?; - - let TypedHeader(Authorization(bearer)) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::UNAUTHORIZED)?; - - this.auth - .verify_token(job_id, bearer.token()) - .map(|_| AuthenticatedJob(job_id)) - .map_err(|_| StatusCode::UNAUTHORIZED) - } - } - - // Verify assign_job is from a scheduler with which this server has registered - struct AuthenticatedScheduler; - - #[async_trait] - impl FromRequestParts for AuthenticatedScheduler - where - S: Send + Sync, - { - type Rejection = StatusCode; - - async fn from_request_parts( - parts: &mut Parts, - _state: &S, - ) -> StdResult { - let Extension(this) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; - - let TypedHeader(Authorization(bearer)) = parts - .extract::>>() - .await - .map_err(|_| StatusCode::UNAUTHORIZED)?; + pub fn is_closed(&self) -> bool { + self.connection.is_cancelled() + } - this.auth - .verify_token(JobId(this.server_nonce.as_u64()), bearer.token()) - .map(|_| AuthenticatedScheduler) - .map_err(|_| StatusCode::UNAUTHORIZED) + // pub async fn close(&mut self) { + // self.connection.cancel(); + // } + + // pub async fn send(&mut self, req_id: String, req: Outgoing) -> Result<()> { + // self.outgoing + // .lock() + // .await + // .send((req_id, Some(req), None)) + // .map_err(|err| { + // let (req_id, _, _) = err.0; + // anyhow!("WebSocketClient error sending request for id={req_id}") + // })?; + // Ok(()) + // } + + pub async fn send_recv( + &self, + req_id: String, + req: Option, + timeout: Option, + ) -> Result { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.outgoing + .lock() + .await + .send((req_id.clone(), req, Some(tx))) + .map_err(|err| { + let (req_id, _, _) = err.0; + anyhow!("WebSocketClient error sending request for id={req_id}") + })?; + + if let Some(duration) = timeout { + match tokio::time::timeout(duration, rx).await { + Ok(res) => res.map_err(anyhow::Error::new), + Err(_) => { + // Remove the callback from the requests map + self.requests.lock().await.remove(&req_id); + Err(anyhow!("WebSocket request timeout (req_id={req_id})")) + } } + } else { + rx.await.map_err(anyhow::Error::new) } + } - let Self { - public_addr, - bind_addr, - scheduler_url, - scheduler_auth, - cert_digest, - cert_pem, - privkey_pem, - jwt_key, - server_nonce, - max_per_core_load, - num_cpus, - handler, - } = self; - - let requester = Arc::new(ServerRequester { - client: new_reqwest_client(Some(public_addr), None) - .expect("http client must build with success"), - scheduler_url: scheduler_url.clone(), - scheduler_auth: scheduler_auth.clone(), - heartbeat_url: urls::scheduler_heartbeat_server(&scheduler_url), - heartbeat_req: HeartbeatServerHttpRequest { - num_cpus, - max_per_core_load, - jwt_key: jwt_key.clone(), - server_nonce: server_nonce.clone(), - cert_digest: cert_digest.clone(), - cert_pem: cert_pem.clone(), - num_assigned_jobs: 0, - num_active_jobs: 0, - }, - }); - - // Doesn't seem like we can easily add the handler to ServerState, - // so create a clone and move it 
into each route handling closure. - let handler = Arc::new(handler); - - handler.start_heartbeat(requester.clone()); - - let app = Router::new() - .route( - "/api/v1/distserver/assign_job", - routing::post({ - let handler = Arc::clone(&handler); - move |method: Method, - uri: Uri, - headers: HeaderMap, - _: AuthenticatedScheduler, - Bincode(toolchain): Bincode| async move { - match handler.handle_assign_job(toolchain).await { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err), - } - } - }), - ) - .route( - "/api/v1/distserver/submit_toolchain/:job_id", - routing::post({ - let handler = Arc::clone(&handler); - move |method: Method, - uri: Uri, - headers: HeaderMap, - AuthenticatedJob(job_id): AuthenticatedJob, - request: Request| async move { - // Convert the request body stream into an `AsyncRead` - let toolchain_reader = StreamReader::new( - request - .into_body() - .into_data_stream() - .map_err(|err| io::Error::new(io::ErrorKind::Other, err)), - ); - - futures::pin_mut!(toolchain_reader); - - match handler - .handle_submit_toolchain(job_id, toolchain_reader) - .await - { - Ok(res) => accepts_response(&headers, &res), - Err(err) => anyhow_response(method, uri, err), - } - } - }), - ) - .route( - "/api/v1/distserver/run_job/:job_id", - routing::post({ - let handler = Arc::clone(&handler); - move |method: Method, - uri: Uri, - headers: HeaderMap, - AuthenticatedJob(job_id): AuthenticatedJob, - request: Request| async move { - // Convert the request body stream into an `AsyncRead` - let body_reader = StreamReader::new( - request - .into_body() - .into_data_stream() - .map_err(|err| io::Error::new(io::ErrorKind::Other, err)), - ); - - futures::pin_mut!(body_reader); - - // Read the RunJob message, then take the rest of the body as the job inputs - let (run_job, inputs_reader) = { - let run_job_length = - body_reader.read_u32().await.map_err(|err| { - ( - StatusCode::BAD_REQUEST, - format!("Invalid bincode length: {err}"), - ) - .into_response() - })? as usize; - - let mut run_job_reader = body_reader.take(run_job_length as u64); - let mut run_job_buf = vec![]; - run_job_reader.read_to_end(&mut run_job_buf).await.map_err( - |err| { - (StatusCode::BAD_REQUEST, err.to_string()).into_response() - }, - )?; - - let run_job = bincode::deserialize_from::<_, RunJobHttpRequest>( - run_job_buf.reader(), - ) - .map_err(|err| { - (StatusCode::BAD_REQUEST, err.to_string()).into_response() - })?; - - (run_job, ZlibReadDecoder::new(run_job_reader.into_inner())) - }; - - futures::pin_mut!(inputs_reader); - - tracing::trace!("[run_job({})]: {:?}", job_id, run_job); - let RunJobHttpRequest { command, outputs } = run_job; - - match handler - .handle_run_job(job_id, command, outputs, inputs_reader) - .await - { - Ok(res) => Ok(accepts_response(&headers, &res).into_response()), - Err(err) => Err(anyhow_response(method, uri, err).into_response()), - } - } - }), - ) - .fallback(|| async move { (StatusCode::NOT_FOUND, "404") }) - // 1GiB should be enough for toolchains and compile inputs, right? 
- .layer(DefaultBodyLimit::max(1024 * 1024 * 1024)) - .layer(Extension(Arc::new(ServerState { - auth: JWTJobAuthorizer::new(jwt_key.clone()), - server_nonce: server_nonce.clone(), - }))); - - let app = with_request_tracing(app); - - let tls_acceptor = { - let cert = openssl::x509::X509::from_pem(&cert_pem).unwrap(); - let key = openssl::pkey::PKey::private_key_from_pem(&privkey_pem).unwrap(); - let mut tls_builder = - SslAcceptor::mozilla_intermediate_v5(SslMethod::tls()).unwrap(); - tls_builder.set_certificate(&cert).unwrap(); - tls_builder.set_private_key(&key).unwrap(); - tls_builder.check_private_key().unwrap(); - tls_builder.build() - }; - - let listener = TcpListener::bind(bind_addr).await.unwrap(); - - tracing::info!( - "Server listening for clients on {}, public_addr is: {}", - bind_addr, - public_addr - ); - - loop { - let tls_acceptor = tls_acceptor.clone(); - let (tcp_stream, remote_addr) = listener.accept().await.unwrap(); - let tower_service = app.clone(); - - tokio::spawn(async move { - // Wait for tls handshake to happen - let ssl = Ssl::new(tls_acceptor.context()).unwrap(); - let mut tls_stream = SslStream::new(ssl, tcp_stream).unwrap(); - if let Err(err) = SslStream::accept(std::pin::Pin::new(&mut tls_stream)).await { - tracing::debug!( - "error during tls handshake connection from {}: {}", - remote_addr, - err - ); - return; - } - - // Hyper has its own `AsyncRead` and `AsyncWrite` traits and doesn't use tokio. - // `TokioIo` converts between them. - let tok_stream = TokioIo::new(tls_stream); - - let hyper_service = hyper::service::service_fn( - move |request: Request| { - // Clone `tower_service` because hyper's `Service` uses `&self` whereas - // tower's `Service` requires `&mut self`. - tower_service.clone().oneshot(request) - }, - ); - - if let Err(err) = - hyper_util::server::conn::auto::Builder::new(TokioExecutor::new()) - .serve_connection(tok_stream, hyper_service) - .await - { - tracing::debug!("sccache: failed to serve connection: {err:#}"); - } - }); + // pub async fn listen(&self, req_id: String) -> Result { + // self.send_recv(req_id, None, None).await + // } + + // pub async fn listen_with_timeout( + // &self, + // req_id: String, + // timeout: Duration, + // ) -> Result { + // self.send_recv(req_id, None, Some(timeout)).await + // } + + // pub async fn request(&self, req: Outgoing) -> Result { + // self.send_recv(uuid::Uuid::new_v4().to_string(), Some(req), None) + // .await + // } + + // pub async fn request_with_timeout( + // &self, + // req: Outgoing, + // timeout: Duration, + // ) -> Result { + // self.send_recv(uuid::Uuid::new_v4().to_string(), Some(req), Some(timeout)) + // .await + // } + } + + fn make_scheduler_ws_uri(scheduler_url: &reqwest::Url) -> Result { + let mut uri = scheduler_url.clone(); + uri.set_path("api/v2/client/ws"); + + match uri.scheme() { + "http" => uri.set_scheme("ws").map(|_| uri), + "https" => uri.set_scheme("wss").map(|_| uri), + scheme => { + error!("Unknown scheduler URL scheme `{scheme}`"); + return Err(anyhow!("Unknown scheduler URL scheme `{scheme}`")); } } + .map_err(|_| anyhow!("Failed to set scheduler WebSocket URI scheme")) + .and_then(|uri| { + http::Uri::try_from(uri.as_str().as_bytes()).map_err(|err| { + error!("Failed to create scheduler WebSocket URI: {err}"); + anyhow::Error::new(err) + }) + }) } -} - -#[cfg(feature = "dist-client")] -mod client { - use super::super::cache; - use crate::config; - use crate::dist::pkg::{InputsPackager, ToolchainPackager}; - use crate::dist::{ - self, AllocJobResult, 
CompileCommand, JobAlloc, PathTransformer, RunJobResult, - SchedulerStatusResult, SubmitToolchainResult, Toolchain, - }; - use crate::util::new_reqwest_client; - - use async_trait::async_trait; - use byteorder::{BigEndian, WriteBytesExt}; - use flate2::write::ZlibEncoder as ZlibWriteEncoder; - use flate2::Compression; - use futures::lock::Mutex; - use futures::TryFutureExt; - use reqwest::Body; - use std::collections::HashMap; - use std::io::Write; - use std::path::{Path, PathBuf}; - use std::sync::Arc; - - use super::common::{ - bincode_req_fut, AllocJobHttpResponse, ReqwestRequestBuilderExt, RunJobHttpRequest, - ServerCertificateHttpResponse, - }; - use super::urls; - use crate::errors::*; pub struct Client { auth_token: String, scheduler_url: reqwest::Url, - // cert_digest -> cert_pem - server_certs: Arc, Vec)>>>, client: Arc>, + ws_client: tokio::sync::OnceCell>, pool: tokio::runtime::Handle, tc_cache: Arc, rewrite_includes_only: bool, + pending_toolchain_submissions: + Arc>>>, } impl Client { - pub fn new( + pub async fn new( pool: &tokio::runtime::Handle, scheduler_url: reqwest::Url, cache_dir: &Path, @@ -1448,139 +589,128 @@ mod client { auth_token: String, rewrite_includes_only: bool, ) -> Result { - let client = - new_reqwest_client(None, None).expect("http client must build with success"); + let client = new_reqwest_client(); let client_toolchains = cache::ClientToolchains::new(cache_dir, cache_size, toolchain_configs) .context("failed to initialise client toolchains")?; + Ok(Self { - auth_token, - scheduler_url, - server_certs: Default::default(), + auth_token: auth_token.clone(), + scheduler_url: scheduler_url.clone(), client: Arc::new(Mutex::new(client)), + pending_toolchain_submissions: Default::default(), pool: pool.clone(), tc_cache: Arc::new(client_toolchains), rewrite_includes_only, - }) - } - } - - #[async_trait] - impl dist::Client for Client { - async fn do_alloc_job(&self, tc: Toolchain) -> Result { - let scheduler_url = self.scheduler_url.clone(); - let url = urls::scheduler_alloc_job(&scheduler_url); - let mut req = self.client.lock().await.post(url); - req = req.bearer_auth(self.auth_token.clone()).bincode(&tc)?; - - let client = self.client.clone(); - let server_certs = self.server_certs.clone(); - - match bincode_req_fut(req).await? 
{ - AllocJobHttpResponse::Success { - job_alloc, - need_toolchain, - cert_digest, - } => { - let server_id = job_alloc.server_id; - let alloc_job_res = Ok(AllocJobResult::Success { - job_alloc, - need_toolchain, - }); - - // Lock client and server_certs until the certs have been updated - let mut client = client.lock().await; - let mut server_certs = server_certs.lock().await; - if let Some((saved_cert_digest, _)) = server_certs.get(&server_id) { - if saved_cert_digest == &cert_digest { - return alloc_job_res; - } - } - info!( - "Need to request new certificate for server {}", - server_id.addr() - ); - let url = urls::scheduler_server_certificate(&scheduler_url, server_id); - let req = client.get(url); - let res: ServerCertificateHttpResponse = bincode_req_fut(req) - .await - .context("GET to scheduler server_certificate failed")?; - - // Remove the old entry first - server_certs.remove(&server_id); + ws_client: tokio::sync::OnceCell::new_with( + { + let mut connect_req = + make_scheduler_ws_uri(&scheduler_url)?.into_client_request()?; - if reqwest::Certificate::from_pem(&res.cert_pem).is_ok() { - // Insert so it's added to the client in the following loop - server_certs.insert(server_id, (res.cert_digest, res.cert_pem)); - } else { - warn!( - "[do_alloc_job({})]: failed to interpret pem as certificate", - server_id.addr() + connect_req.headers_mut().insert( + http::header::AUTHORIZATION, + http::header::HeaderValue::from_str(&format!("Bearer {}", auth_token))?, ); - } - // Use the updated certificates - *client = new_reqwest_client(None, Some(&server_certs)) - .context("Failed to update certificates")?; + WebSocketClient::new( + pool, + || async move { + info!("Attempting to connect to dist server: {}", &scheduler_url); + let config = + tokio_tungstenite::tungstenite::protocol::WebSocketConfig { + max_message_size: None, + max_frame_size: None, + ..Default::default() + }; + let (sock, response) = + match tokio_tungstenite::connect_async_with_config( + connect_req, + Some(config), + true, + ) + .await + { + Ok(res) => res, + Err(err) => { + error!("Failed to connect to dist server: {err}"); + return Err(anyhow!(err)); + } + }; + info!("WebSocket handshake complete, response was {response:?}"); + Ok(sock.split()) + }, + || { + info!("WebSocketClient shutdown"); + ClientIncoming::Error { + message: "WebSocketClient closed".into(), + } + }, + ) + .await + } + .ok(), + ), + }) + } - alloc_job_res + fn ws_client(&self) -> Option<&WebSocketClient> { + if let Some(ws_client) = self.ws_client.get() { + if !ws_client.is_closed() { + return Some(ws_client); } - AllocJobHttpResponse::Fail { msg } => Ok(AllocJobResult::Fail { msg }), } + None } + } - async fn do_get_status(&self) -> Result { - let scheduler_url = self.scheduler_url.clone(); - let url = urls::scheduler_status(&scheduler_url); - let mut req = self.client.lock().await.get(url); - req = req.bearer_auth(self.auth_token.clone()); - bincode_req_fut(req).await - } - - async fn do_submit_toolchain( - &self, - job_alloc: JobAlloc, - tc: Toolchain, - ) -> Result { - match self.tc_cache.get_toolchain(&tc) { - Ok(Some(toolchain_file)) => { - let url = urls::server_submit_toolchain(job_alloc.server_id, job_alloc.job_id); - let req = self.client.lock().await.post(url); - let toolchain_file = tokio::fs::File::from_std(toolchain_file.into()); - let toolchain_file_stream = tokio_util::io::ReaderStream::new(toolchain_file); - let body = Body::wrap_stream(toolchain_file_stream); - let req = req.bearer_auth(job_alloc.auth).body(body); - 
bincode_req_fut(req).await + #[async_trait] + impl dist::Client for Client { + async fn new_job(&self, toolchain: Toolchain) -> Result { + if let Some(ws_client) = self.ws_client() { + match ws_client + .send_recv( + uuid::Uuid::new_v4().to_string(), + Some(ClientOutgoing::NewJob(NewJobRequest { toolchain })), + None, + ) + .await + { + Err(err) => Err(err), + Ok(ClientIncoming::NewJob(res)) => Ok(res), + Ok(ClientIncoming::Error { message }) => Err(anyhow!(message)), + Ok(res) => Err(anyhow!("Unexpected new_job response: {res:?}")), } - Ok(None) => Err(anyhow!("couldn't find toolchain locally")), - Err(e) => Err(e), + } else { + bincode_req_fut( + self.client + .lock() + .await + .post(urls::scheduler_new_job(&self.scheduler_url)) + .bearer_auth(self.auth_token.clone()) + .bincode(&toolchain)?, + ) + .await } } - async fn do_run_job( + async fn run_job( &self, - job_alloc: JobAlloc, + job_id: &str, + timeout: Duration, + toolchain: Toolchain, command: CompileCommand, outputs: Vec, inputs_packager: Box, - ) -> Result<(RunJobResult, PathTransformer)> { - let url = urls::server_run_job(job_alloc.server_id, job_alloc.job_id); - - let (body, path_transformer) = self + ) -> Result<(RunJobResponse, PathTransformer)> { + let job_id = job_id.to_owned(); + let (req, path_transformer) = self .pool .spawn_blocking(move || -> Result<_> { - let bincode = bincode::serialize(&RunJobHttpRequest { command, outputs }) - .context("failed to serialize run job request")?; - let bincode_length = bincode.len(); - - let mut body = vec![]; - body.write_u32::(bincode_length as u32) - .expect("Infallible write of bincode length to vec failed"); - body.write_all(&bincode) - .expect("Infallible write of bincode body to vec failed"); + let mut inputs = vec![]; let path_transformer; { - let mut compressor = ZlibWriteEncoder::new(&mut body, Compression::fast()); + let mut compressor = + ZlibWriteEncoder::new(&mut inputs, Compression::fast()); path_transformer = inputs_packager .write_inputs(&mut compressor) .context("Could not write inputs for compilation")?; @@ -1592,15 +722,124 @@ mod client { ); compressor.finish().context("failed to finish compressor")?; } - - Ok((body, path_transformer)) + Ok(( + RunJobRequest { + job_id, + command, + inputs, + outputs, + toolchain, + }, + path_transformer, + )) }) .await??; - let mut req = self.client.lock().await.post(url); - req = req.bearer_auth(job_alloc.auth.clone()).bytes(body); - bincode_req_fut(req) - .map_ok(|res| (res, path_transformer)) + + if let Some(ws_client) = self.ws_client() { + match ws_client + .send_recv( + uuid::Uuid::new_v4().to_string(), + Some(ClientOutgoing::RunJob(req)), + Some(timeout), + ) + .await + { + Err(err) => Err(err), + Ok(ClientIncoming::RunJob(res)) => Ok(res), + Ok(ClientIncoming::Error { message }) => Err(anyhow!(message)), + Ok(res) => Err(anyhow!("Unexpected run_job response: {res:?}")), + } + } else { + bincode_req_fut( + self.client + .lock() + .await + .post(urls::scheduler_run_job(&self.scheduler_url, &req.job_id)) + .bearer_auth(self.auth_token.clone()) + .timeout(timeout) + .bincode(&req)?, + ) .await + } + .map(|res| (res, path_transformer)) + } + + async fn do_get_status(&self) -> Result { + bincode_req_fut( + self.client + .lock() + .await + .get(urls::scheduler_status(&self.scheduler_url)) + .bearer_auth(self.auth_token.clone()), + ) + .await + } + + async fn do_submit_toolchain(&self, tc: Toolchain) -> Result { + let mut rx = match self.tc_cache.get_toolchain(&tc) { + Ok(Some(toolchain_file)) => { + let pending_tc_subs = 
self.pending_toolchain_submissions.clone(); + let mut pending_subs = pending_tc_subs.lock().await; + if !pending_subs.contains_key(&tc) { + let (tx, rx) = tokio::sync::broadcast::channel(1); + pending_subs.insert(tc.clone(), tx); + self.pool.clone().spawn({ + let tc = tc.clone(); + let tc_id = tc.archive_id.clone(); + let auth_token = self.auth_token.clone(); + let http_client = self.client.clone(); + let pending_tc_subs = pending_tc_subs.clone(); + let scheduler_url = self.scheduler_url.clone(); + + async move { + let res = match bincode_req_fut::( + http_client + .lock() + .await + .put(urls::scheduler_submit_toolchain( + &scheduler_url, + &tc_id, + )) + .bearer_auth(auth_token.clone()) + .body(Body::wrap_stream( + tokio_util::io::ReaderStream::new( + tokio::fs::File::from_std(toolchain_file.into()), + ), + )), + ) + .await + { + Ok(res) => { + debug!("[do_submit_toolchain({})]: {:?}", tc_id, res); + res + } + Err(err) => { + warn!("[do_submit_toolchain({})]: {:?}", tc_id, err); + SubmitToolchainResult::Error { + message: format!("{err}"), + } + } + }; + + pending_tc_subs + .lock() + .await + .remove(&tc) + .unwrap() + .send(res) + .unwrap(); + } + }); + rx + } else { + pending_subs.get(&tc).unwrap().subscribe() + } + } + Ok(None) => return Err(anyhow!("couldn't find toolchain locally")), + Err(e) => return Err(e), + }; + + rx.recv().await.map_err(anyhow::Error::new) } async fn put_toolchain( @@ -1623,6 +862,7 @@ mod client { fn rewrite_includes_only(&self) -> bool { self.rewrite_includes_only } + fn get_custom_toolchain(&self, exe: &Path) -> Option { match self.tc_cache.get_custom_toolchain(exe) { Some(Ok((_, _, path))) => Some(path), diff --git a/src/dist/mod.rs b/src/dist/mod.rs index da4199c49c..f55730644d 100644 --- a/src/dist/mod.rs +++ b/src/dist/mod.rs @@ -12,19 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
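The deduplicated upload in `do_submit_toolchain` above keeps at most one in-flight toolchain PUT per `Toolchain` and fans the result out to every concurrent caller through a `tokio::sync::broadcast` channel, removing the map entry before sending so later callers start a fresh upload. A minimal sketch of that pattern, with illustrative generics `K`/`R` standing in for `Toolchain`/`SubmitToolchainResult`:

    use std::{collections::HashMap, future::Future, hash::Hash, sync::Arc};
    use tokio::sync::{broadcast, Mutex};

    /// At most one in-flight operation per key; late callers share its result.
    struct Dedup<K, R> {
        pending: Arc<Mutex<HashMap<K, broadcast::Sender<R>>>>,
    }

    impl<K, R> Dedup<K, R>
    where
        K: Eq + Hash + Clone + Send + 'static,
        R: Clone + Send + 'static,
    {
        async fn run<F, Fut>(&self, key: K, op: F) -> R
        where
            F: FnOnce() -> Fut,
            Fut: Future<Output = R> + Send + 'static,
        {
            let mut rx = {
                let mut pending = self.pending.lock().await;
                if let Some(tx) = pending.get(&key) {
                    // An operation for this key is already running: wait on it.
                    tx.subscribe()
                } else {
                    let (tx, rx) = broadcast::channel(1);
                    pending.insert(key.clone(), tx);
                    let pending = Arc::clone(&self.pending);
                    let fut = op();
                    tokio::spawn(async move {
                        let res = fut.await;
                        // Remove before sending, so a caller arriving after
                        // completion starts a new operation instead of
                        // subscribing to a finished channel.
                        if let Some(tx) = pending.lock().await.remove(&key) {
                            let _ = tx.send(res);
                        }
                    });
                    rx
                }
            };
            rx.recv().await.expect("in-flight operation dropped its result")
        }
    }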
-use crate::compiler; use async_trait::async_trait; -#[cfg(feature = "dist-server")] -use futures::lock::Mutex; -use rand::{rngs::OsRng, RngCore}; use serde::{Deserialize, Serialize}; use std::ffi::OsString; use std::fmt; use std::io::{self, Read}; use std::net::SocketAddr; use std::path::{Path, PathBuf}; +#[cfg(feature = "dist-server")] +use std::pin::Pin; use std::process; -use std::str::FromStr; +use std::time::Duration; use crate::errors::*; @@ -34,12 +32,17 @@ mod cache; pub mod client_auth; #[cfg(any(feature = "dist-client", feature = "dist-server"))] pub mod http; +#[cfg(feature = "dist-server")] +pub mod server; #[cfg(test)] mod test; #[cfg(any(feature = "dist-client", feature = "dist-server"))] pub use crate::dist::cache::TcCache; +#[cfg(feature = "dist-server")] +pub use crate::dist::cache::ServerToolchains; + // TODO: paths (particularly outputs, which are accessed by an unsandboxed program) // should be some pre-sanitised AbsPath type @@ -317,95 +320,10 @@ pub fn strings_to_osstrings(strings: &[String]) -> Vec { .collect::>() } -// TODO: TryFrom -pub fn try_compile_command_to_dist( - command: compiler::SingleCompileCommand, -) -> Option { - let compiler::SingleCompileCommand { - executable, - arguments, - env_vars, - cwd, - } = command; - Some(CompileCommand { - executable: executable.into_os_string().into_string().ok()?, - arguments: arguments - .into_iter() - .map(|arg| arg.into_string().ok()) - .collect::>()?, - env_vars: env_vars - .into_iter() - .map(|(k, v)| Some((k.into_string().ok()?, v.into_string().ok()?))) - .collect::>()?, - cwd: cwd.into_os_string().into_string().ok()?, - }) -} - -// TODO: Clone by assuming immutable/no GC for now -// TODO: make fields non-public? -// TODO: make archive_id validate that it's just a bunch of hex chars -#[derive(Debug, Hash, Eq, PartialEq, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct Toolchain { - pub archive_id: String, -} - -#[derive(Hash, Eq, PartialEq, Clone, Copy, Debug, Ord, PartialOrd, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct JobId(pub u64); -impl fmt::Display for JobId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} -impl FromStr for JobId { - type Err = ::Err; - fn from_str(s: &str) -> ::std::result::Result { - u64::from_str(s).map(JobId) - } -} -#[derive(Hash, Eq, PartialEq, Clone, Copy, Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct ServerId(SocketAddr); -impl ServerId { - pub fn new(addr: SocketAddr) -> Self { - ServerId(addr) - } - pub fn addr(&self) -> SocketAddr { - self.0 - } -} -impl FromStr for ServerId { - type Err = ::Err; - fn from_str(s: &str) -> ::std::result::Result { - SocketAddr::from_str(s).map(ServerId) - } -} -#[derive(Eq, PartialEq, Clone, Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct ServerNonce(u64); -impl ServerNonce { - pub fn new() -> Self { - ServerNonce(OsRng.next_u64()) - } - pub fn as_u64(&self) -> u64 { - self.0 - } -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct CompileCommand { - pub executable: String, - pub arguments: Vec, - pub env_vars: Vec<(String, String)>, - pub cwd: String, -} - // process::Output is not serialize so we have a custom Output type. However, // we cannot encode all information in here, such as Unix signals, as the other // end may not understand them (e.g. 
if it's Windows) -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct ProcessOutput { code: i32, @@ -465,7 +383,7 @@ impl From for process::Output { } } -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct OutputData(Vec, u64); impl OutputData { @@ -490,82 +408,142 @@ impl OutputData { ZlibReadDecoder::new(io::Cursor::new(self.0)) } } + pub struct OutputDataLens { pub actual: u64, pub compressed: u64, } + impl fmt::Display for OutputDataLens { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Size: {}->{}", self.actual, self.compressed) } } -// TODO: standardise on compressed or not for inputs and toolchain - // TODO: make fields not public -// AllocJob +// BuildResult #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -pub struct JobAlloc { - pub auth: String, - pub job_id: JobId, - pub server_id: ServerId, +pub struct BuildResult { + pub output: ProcessOutput, + pub outputs: Vec<(String, OutputData)>, } -#[derive(Clone, Serialize, Deserialize)] + +// CompileCommand + +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub enum AllocJobResult { - Success { - job_alloc: JobAlloc, - need_toolchain: bool, - }, - Fail { - msg: String, - }, +pub struct CompileCommand { + pub executable: String, + pub arguments: Vec, + pub env_vars: Vec<(String, String)>, + pub cwd: String, } -// AssignJob +// NewJob -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NewJobRequest { + pub toolchain: Toolchain, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub struct AssignJobResult { - pub job_id: JobId, - pub need_toolchain: bool, - pub num_assigned_jobs: usize, - pub num_active_jobs: usize, +pub struct NewJobResponse { + pub has_toolchain: bool, + pub job_id: String, + pub timeout: u32, } -// JobState +// RunJob -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub enum UpdateJobStateResult { - Success, - Fail { msg: String }, +pub struct RunJobRequest { + pub job_id: String, + pub command: CompileCommand, + pub inputs: Vec, + pub outputs: Vec, + pub toolchain: Toolchain, } -// HeartbeatServer +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub enum RunJobResponse { + JobFailed { reason: String }, + JobComplete { result: BuildResult }, +} + +// ClientOutgoing -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub struct HeartbeatServerResult { - pub is_new: bool, +pub enum ClientOutgoing { + NewJob(NewJobRequest), + RunJob(RunJobRequest), } -// RunJob +impl fmt::Display for ClientOutgoing { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::NewJob(NewJobRequest { toolchain }) => { + write!(f, "Request::NewJob(`{}`)", toolchain.archive_id) + } + Self::RunJob(RunJobRequest { job_id, .. 
}) => { + write!(f, "Request::RunJob(job_id={})", job_id) + } + } + } +} + +// ClientIncoming -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub enum RunJobResult { - JobNotFound, - Complete(JobComplete), +pub enum ClientIncoming { + Error { message: String }, + NewJob(NewJobResponse), + RunJob(RunJobResponse), } -#[derive(Clone, Serialize, Deserialize)] + +impl fmt::Display for ClientIncoming { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Error { message } => write!(f, "Response::Error(`{message}`)"), + Self::NewJob(NewJobResponse { + job_id, + has_toolchain, + .. + }) => { + write!( + f, + "Response::NewJob(job_id={job_id}, has_toolchain={has_toolchain})" + ) + } + Self::RunJob(RunJobResponse::JobFailed { reason }) => { + write!(f, "Response::JobFailed(`{reason}`)") + } + Self::RunJob(RunJobResponse::JobComplete { result }) => { + write!(f, "Response::JobComplete({:?})", result.output) + } + } + } +} + +pub type ServerIncoming = ClientOutgoing; +pub type ServerOutgoing = ClientIncoming; + +// Toolchain + +// TODO: Clone by assuming immutable/no GC for now +// TODO: make fields non-public? +// TODO: make archive_id validate that it's just a bunch of hex chars +#[derive(Debug, Hash, Eq, PartialEq, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub struct JobComplete { - pub output: ProcessOutput, - pub outputs: Vec<(String, OutputData)>, +pub struct Toolchain { + pub archive_id: String, } // Status @@ -593,23 +571,15 @@ pub struct ServerStatusResult { // SubmitToolchain -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub enum SubmitToolchainResult { Success, - JobNotFound, - CannotCache, + Error { message: String }, } /////////////////// -// BuildResult - -pub struct BuildResult { - pub output: ProcessOutput, - pub outputs: Vec<(String, OutputData)>, -} - /////////////////// // TODO: it's unfortunate all these are public, but in order to describe the trait @@ -621,111 +591,63 @@ pub struct BuildResult { type ExtResult = ::std::result::Result; #[cfg(feature = "dist-server")] -#[async_trait] -pub trait SchedulerOutgoing: Send + Sync { - // To Server - async fn do_assign_job( - &self, - server_id: ServerId, - tc: Toolchain, - auth: String, - ) -> Result; -} - -#[cfg(feature = "dist-server")] -#[async_trait] -pub trait ServerOutgoing: Send + Sync { - // To Scheduler - async fn do_heartbeat( - &self, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> Result; - // To Scheduler - async fn do_update_job_state( - &self, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> Result; -} +pub type WebSocketSend = + futures::stream::SplitSink; -// Trait to handle the creation and verification of job authorization tokens #[cfg(feature = "dist-server")] -pub trait JobAuthorizer: Send + Sync { - fn generate_token(&self, job_id: JobId) -> Result; - fn verify_token(&self, job_id: JobId, token: &str) -> Result<()>; -} +pub type WebSocketRecv = futures::stream::SplitStream; #[cfg(feature = "dist-server")] #[async_trait] -pub trait SchedulerIncoming: Send + Sync { - // From Client - async fn handle_alloc_job( - &self, - requester: &dyn SchedulerOutgoing, - tc: Toolchain, - ) -> ExtResult; - // // From Client - // From Server - #[allow(clippy::too_many_arguments)] - async fn handle_heartbeat_server( - &self, - server_id: ServerId, - server_nonce: ServerNonce, - num_cpus: usize, - 
max_per_core_load: f64, - job_authorizer: Box, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> ExtResult; - // From Server - async fn handle_update_job_state( +pub trait SchedulerService: Send + Sync { + async fn get_status(&self) -> Result; + + async fn has_toolchain(&self, toolchain: Toolchain) -> bool; + + async fn put_toolchain( &self, - server_id: ServerId, - num_assigned_jobs: usize, - num_active_jobs: usize, - ) -> ExtResult; - // From anyone - async fn handle_status(&self) -> ExtResult; + toolchain: Toolchain, + toolchain_reader: Pin<&mut (dyn futures::AsyncRead + Send)>, + ) -> Result; + + async fn new_job(&self, request: NewJobRequest) -> Result; + async fn run_job(&self, request: RunJobRequest) -> Result; + + async fn job_failure(&self, job_id: &str, reason: &str) -> Result<()>; + async fn job_success(&self, job_id: &str, result: BuildResult) -> Result<()>; } #[cfg(feature = "dist-server")] #[async_trait] -pub trait ServerIncoming: Send + Sync { - // To scheduler - fn start_heartbeat(&self, requester: std::sync::Arc); - // From Scheduler - async fn handle_assign_job(&self, tc: Toolchain) -> ExtResult; - // From Client - async fn handle_submit_toolchain( - &self, - job_id: JobId, - tc_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - ) -> ExtResult; - // From Client - async fn handle_run_job( +pub trait ServerService: Send + Sync { + #[allow(clippy::too_many_arguments)] + async fn run_job( &self, - job_id: JobId, + task_id: &str, + job_id: &str, + scheduler_id: &str, + toolchain: Toolchain, command: CompileCommand, outputs: Vec, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - ) -> ExtResult; + inputs: Vec, + ) -> Result; + + async fn job_failure(&self, task_id: &str, reason: &str) -> Result<()>; + + async fn job_success(&self, task_id: &str, result: &BuildResult) -> Result<()>; } #[cfg(feature = "dist-server")] #[async_trait] pub trait BuilderIncoming: Send + Sync { // From Server - #[allow(clippy::too_many_arguments)] async fn run_build( &self, - job_id: JobId, - toolchain: Toolchain, + job_id: &str, + toolchain_dir: &Path, command: CompileCommand, outputs: Vec, - inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>, - cache: &Mutex, - job_queue: &tokio::sync::Semaphore, + inputs: Vec, ) -> ExtResult; } @@ -733,23 +655,21 @@ pub trait BuilderIncoming: Send + Sync { #[async_trait] pub trait Client: Send + Sync { // To Scheduler - async fn do_alloc_job(&self, tc: Toolchain) -> Result; + async fn new_job(&self, toolchain: Toolchain) -> Result; // To Scheduler - async fn do_get_status(&self) -> Result; - // To Server - async fn do_submit_toolchain( - &self, - job_alloc: JobAlloc, - tc: Toolchain, - ) -> Result; - // To Server - async fn do_run_job( + async fn run_job( &self, - job_alloc: JobAlloc, + job_id: &str, + timeout: Duration, + toolchain: Toolchain, command: CompileCommand, outputs: Vec, inputs_packager: Box, - ) -> Result<(RunJobResult, PathTransformer)>; + ) -> Result<(RunJobResponse, PathTransformer)>; + // To Scheduler + async fn do_get_status(&self) -> Result; + // To Scheduler + async fn do_submit_toolchain(&self, tc: Toolchain) -> Result; async fn put_toolchain( &self, compiler_path: PathBuf, diff --git a/src/dist/server.rs b/src/dist/server.rs new file mode 100644 index 0000000000..d088b595b2 --- /dev/null +++ b/src/dist/server.rs @@ -0,0 +1,563 @@ +#[cfg(feature = "dist-server")] +pub use self::internal::Scheduler; + +#[cfg(feature = "dist-server")] +mod internal { + + use async_trait::async_trait; + + use 
axum::{ + body::Bytes, + extract::{ + ws::{CloseFrame, Message, WebSocket, WebSocketUpgrade}, + ConnectInfo, DefaultBodyLimit, Extension, FromRequest, FromRequestParts, Path, Request, + }, + http::{request::Parts, HeaderMap, Method, StatusCode, Uri}, + response::{IntoResponse, Response}, + routing, RequestPartsExt, Router, + }; + + use axum_extra::{ + headers::{authorization::Bearer, Authorization}, + TypedHeader, + }; + + use futures::{lock::Mutex, pin_mut, SinkExt, StreamExt, TryStreamExt}; + + use hyper_util::rt::{TokioExecutor, TokioIo}; + + use serde_json::json; + use tokio_tungstenite::tungstenite::error::ProtocolError; + + use std::{io, net::SocketAddr, sync::Arc}; + + use tokio::net::TcpListener; + use tokio_util::{compat::TokioAsyncReadCompatExt, io::StreamReader}; + use tower::{Service, ServiceBuilder, ServiceExt}; + use tower_http::{ + request_id::{MakeRequestUuid, PropagateRequestIdLayer, SetRequestIdLayer}, + sensitive_headers::{SetSensitiveRequestHeadersLayer, SetSensitiveResponseHeadersLayer}, + trace::{DefaultMakeSpan, DefaultOnResponse, TraceLayer}, + }; + + use crate::dist::{ + http::{bincode_deserialize, bincode_serialize, for_all_concurrent, ClientAuthCheck}, + ClientIncoming, ClientOutgoing, NewJobRequest, RunJobRequest, SchedulerService, + ServerIncoming, Toolchain, + }; + + use crate::errors::*; + + fn get_header_value<'a>(headers: &'a HeaderMap, name: &'a str) -> Option<&'a str> { + if let Some(header) = headers.get(name) { + if let Ok(header) = header.to_str() { + return Some(header); + } + } + None + } + + /// Return `content` as either a bincode or json encoded `Response` depending on the Accept header. + fn result_to_response( + headers: HeaderMap, + ) -> impl FnOnce(T) -> std::result::Result + where + T: serde::Serialize, + { + move |content: T| { + if let Some(header) = headers.get("Accept") { + // This is the only function we use from rouille. + // Maybe we can find a replacement? 
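    // A possible rouille-free replacement for the call below (an editor's
    // sketch, not used by this patch): find the first supported media type
    // named in the Accept header, treating a bare `*/*` as "any". Unlike
    // rouille's helper this ignores q-weights, which is usually acceptable
    // when negotiating between two fixed types.
    fn preferred(accept: &str, supported: &[&str]) -> Option<usize> {
        // Reduce each media range to its type/subtype, dropping parameters.
        let ranges: Vec<&str> = accept
            .split(',')
            .map(|part| part.split(';').next().unwrap_or("").trim())
            .collect();
        if ranges.iter().any(|r| *r == "*/*") {
            return Some(0);
        }
        supported.iter().position(|s| ranges.contains(s))
    }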
+                match rouille::input::priority_header_preferred(
+                    header.to_str().unwrap_or("*/*"),
+                    ["application/octet-stream", "application/json"]
+                        .iter()
+                        .cloned(),
+                ) {
+                    // application/octet-stream
+                    Some(0) => match bincode::serialize(&content) {
+                        Ok(body) => Ok((StatusCode::OK, body).into_response()),
+                        Err(err) => Err((
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            format!("Failed to serialize response body: {err}").into_bytes(),
+                        )
+                            .into_response()),
+                    },
+                    // application/json
+                    // `Value::to_string()` renders any JSON value as JSON text,
+                    // whereas `as_str()` is only `Some` for JSON strings.
+                    Some(1) => Ok((
+                        StatusCode::OK,
+                        json!(content).to_string().into_bytes(),
+                    )
+                        .into_response()),
+                    _ => Err((
+                        StatusCode::BAD_REQUEST,
+                        "Request must accept application/json or application/octet-stream"
+                            .to_string()
+                            .into_bytes(),
+                    )
+                        .into_response()),
+                }
+            } else {
+                Err((
+                    StatusCode::BAD_REQUEST,
+                    "Request must accept application/json or application/octet-stream"
+                        .to_string()
+                        .into_bytes(),
+                )
+                    .into_response())
+            }
+        }
+    }
+
+    fn anyhow_to_response(
+        method: Method,
+        uri: Uri,
+    ) -> impl FnOnce(anyhow::Error) -> std::result::Result<Response, Response> {
+        move |err: anyhow::Error| {
+            let msg = format!("sccache: `{method} {uri}` failed with {err}");
+            tracing::error!("{}", msg);
+            Err((StatusCode::INTERNAL_SERVER_ERROR, msg.into_bytes()).into_response())
+        }
+    }
+
+    fn unwrap_infallible<T>(result: std::result::Result<T, std::convert::Infallible>) -> T {
+        match result {
+            Ok(value) => value,
+            Err(err) => match err {},
+        }
+    }
+
+    fn with_request_tracing(app: Router) -> Router {
+        // Mark these headers as sensitive so they don't show in logs
+        let headers_to_redact: Arc<[_]> = Arc::new([
+            http::header::AUTHORIZATION,
+            http::header::PROXY_AUTHORIZATION,
+            http::header::COOKIE,
+            http::header::SET_COOKIE,
+        ]);
+        app.layer(
+            ServiceBuilder::new()
+                .layer(SetSensitiveRequestHeadersLayer::from_shared(Arc::clone(
+                    &headers_to_redact,
+                )))
+                .layer(SetRequestIdLayer::x_request_id(MakeRequestUuid))
+                .layer(
+                    TraceLayer::new_for_http()
+                        .make_span_with(DefaultMakeSpan::new().include_headers(true))
+                        .on_response(DefaultOnResponse::new().include_headers(true)),
+                )
+                .layer(PropagateRequestIdLayer::x_request_id())
+                .layer(SetSensitiveResponseHeadersLayer::from_shared(
+                    headers_to_redact,
+                )),
+        )
+    }
+
+    // Verify authenticated sccache clients
+    struct AuthenticatedClient(SocketAddr);
+
+    #[async_trait]
+    impl<S> FromRequestParts<S> for AuthenticatedClient
+    where
+        S: Send + Sync,
+    {
+        type Rejection = StatusCode;
+
+        async fn from_request_parts(
+            parts: &mut Parts,
+            _state: &S,
+        ) -> std::result::Result<Self, Self::Rejection> {
+            let TypedHeader(Authorization(bearer)) = parts
+                .extract::<TypedHeader<Authorization<Bearer>>>()
+                .await
+                .map_err(|_| StatusCode::UNAUTHORIZED)?;
+
+            let ConnectInfo(remote_addr) = parts
+                .extract::<ConnectInfo<SocketAddr>>()
+                .await
+                .map_err(|_| StatusCode::BAD_REQUEST)?;
+
+            let Extension(this) = parts
+                .extract::<Extension<Arc<SchedulerState>>>()
+                .await
+                .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+
+            this.client_auth
+                .check(bearer.token())
+                .map(|_| AuthenticatedClient(remote_addr))
+                .map_err(|err| {
+                    tracing::warn!(
+                        "[AuthenticatedClient({remote_addr})]: invalid client auth: {}",
+                        err.0
+                    );
+                    StatusCode::UNAUTHORIZED
+                })
+        }
+    }
+
+    struct Bincode<T>(T);
+
+    #[async_trait]
+    impl<S, T> FromRequest<S> for Bincode<T>
+    where
+        Bytes: FromRequest<S>,
+        S: Send + Sync,
+        T: serde::de::DeserializeOwned + Send + 'static,
+    {
+        type Rejection = Response;
+
+        async fn from_request(
+            req: Request,
+            state: &S,
+        ) -> std::result::Result<Self, Self::Rejection> {
+            let data = match get_header_value(req.headers(), "Content-Type") {
+                Some("application/octet-stream") => Bytes::from_request(req, state)
+                    .await
.map_err(IntoResponse::into_response)? + .to_vec(), + _ => return Err((StatusCode::BAD_REQUEST, "Wrong content type").into_response()), + }; + + let data = bincode_deserialize::(data) + .await + .map_err(|err| (StatusCode::BAD_REQUEST, err.to_string()).into_response())?; + + Ok(Self(data)) + } + } + + struct SchedulerState { + service: Arc, + // Test whether clients are permitted to use the scheduler + client_auth: Box, + } + + pub struct Scheduler { + state: Arc, + } + + impl Scheduler { + pub fn new( + service: Arc, + client_auth: Box, + ) -> Self { + Self { + state: Arc::new(SchedulerState { + service, + client_auth, + }), + } + } + + fn make_router() -> axum::Router { + Router::new() + .route( + "/api/v2/status", + routing::get( + |// Authenticate the client bearer token first + _: AuthenticatedClient, + headers: HeaderMap, + method: Method, + uri: Uri, + Extension(state): Extension>| async move { + state.service.get_status().await.map_or_else( + anyhow_to_response(method, uri), + result_to_response(headers), + ) + }, + ), + ) + .route( + "/api/v2/toolchain/:archive_id", + routing::head( + |// Authenticate the client bearer token first + _: AuthenticatedClient, + Extension(state): Extension>, + Path(archive_id): Path| async move { + if state.service.has_toolchain(Toolchain { archive_id }).await { + (StatusCode::OK).into_response() + } else { + (StatusCode::NOT_FOUND).into_response() + } + }, + ), + ) + .route( + "/api/v2/toolchain/:archive_id", + routing::put( + |// Authenticate the client bearer token first + _: AuthenticatedClient, + headers: HeaderMap, + method: Method, + uri: Uri, + Extension(state): Extension>, + Path(archive_id): Path, + request: Request| async move { + // Convert the request body stream into an `AsyncRead` + let toolchain_reader = StreamReader::new( + request + .into_body() + .into_data_stream() + .map_err(|err| io::Error::new(io::ErrorKind::Other, err)), + ) + .compat(); + + pin_mut!(toolchain_reader); + + state + .service + .put_toolchain(Toolchain { archive_id }, toolchain_reader) + .await + .map_or_else( + anyhow_to_response(method, uri), + result_to_response(headers), + ) + }, + ), + ) + } + + fn with_http_routes(app: axum::Router) -> axum::Router { + app.route( + "/api/v2/jobs/new", + routing::post( + |// Authenticate the client bearer token first + _: AuthenticatedClient, + headers: HeaderMap, + method: Method, + uri: Uri, + Extension(state): Extension>, + Bincode(req): Bincode| async move { + state.service.new_job(req).await.map_or_else( + anyhow_to_response(method, uri), + result_to_response(headers), + ) + }, + ), + ) + .route( + "/api/v2/job/:job_id/run", + routing::post( + |// Authenticate the client bearer token first + _: AuthenticatedClient, + headers: HeaderMap, + method: Method, + uri: Uri, + Extension(state): Extension>, + Path(job_id): Path, + Bincode(req): Bincode| async move { + if job_id != req.job_id { + Ok((StatusCode::BAD_REQUEST).into_response()) + } else { + state.service.run_job(req).await.map_or_else( + anyhow_to_response(method, uri), + result_to_response(headers), + ) + } + }, + ), + ) + } + + fn with_websocket_routes(app: axum::Router) -> axum::Router { + async fn handle_socket( + state: Arc, + client_addr: SocketAddr, + socket: WebSocket, + ) { + tracing::debug!( + "[handle_socket({client_addr})]: client websocket upgrade successful" + ); + + let (sndr, recv) = socket.split(); + + // Wrap shared sender in a Mutex so the concurrent + // `flat_map_unordered` task writes are serialized + let sndr = Arc::new(Mutex::new(sndr)); + + 
let service = state.service.clone(); + let pool = tokio::runtime::Handle::current(); + let token = tokio_util::sync::CancellationToken::new(); + + let mut recv_task = + for_all_concurrent(&pool, recv, token.child_token(), move |msg| { + // Local clones for this task + let sndr = sndr.clone(); + let service = service.clone(); + + async move { + let (req_id, req) = match msg { + Err(err) => { + let err = err.into_inner(); + return match err.downcast_ref::() { + // TODO: Downcasting the client disconnect error should run + // this case, but it's always running the `None` case below + Some(ProtocolError::ResetWithoutClosingHandshake) => { + std::ops::ControlFlow::Break(String::new()) + } + Some(err) => std::ops::ControlFlow::Break(format!( + "WebSocket recv error: {err:?}" + )), + None => std::ops::ControlFlow::Break(format!( + "WebSocket recv error: {err:?}" + )), + }; + } + Ok(msg) => match msg { + Message::Close(None) => { + return std::ops::ControlFlow::Break( + "WebSocket close without CloseFrame".into(), + ); + } + Message::Close(Some(CloseFrame { code, reason })) => { + return std::ops::ControlFlow::Break(format!( + "WebSocket disconnected code={}, reason=`{}`", + code, reason + )); + } + Message::Text(str) => { + return std::ops::ControlFlow::Continue(format!( + "WebSocket received unexpected text response: {str}" + )); + } + Message::Binary(buf) => { + match bincode_deserialize::<(String, ServerIncoming)>(buf) + .await + { + Ok(res) => res, + Err(err) => { + return std::ops::ControlFlow::Continue(format!( + "WebSocket failed to deserialize response: {err}" + )); + } + } + } + _ => return std::ops::ControlFlow::Continue(String::new()), + }, + }; + + tracing::trace!("WebSocket received request: id={req_id} req={req}"); + + let res = match req { + ClientOutgoing::NewJob(req) => match service.new_job(req).await { + Ok(res) => ClientIncoming::NewJob(res), + Err(err) => ClientIncoming::Error { + message: err.to_string(), + }, + }, + ClientOutgoing::RunJob(req) => match service.run_job(req).await { + Ok(res) => ClientIncoming::RunJob(res), + Err(err) => ClientIncoming::Error { + message: err.to_string(), + }, + }, + }; + + tracing::trace!("WebSocket sending response: id={req_id} res={res}"); + + // Serialize the request + let buf = match bincode_serialize((req_id.clone(), res)).await { + Ok(buf) => buf, + Err(err) => { + return std::ops::ControlFlow::Continue(format!( + "WebSocket failed to serialize request: {err}" + )); + } + }; + + if sndr.lock().await.send(Message::Binary(buf)).await.is_err() { + return std::ops::ControlFlow::Break(format!( + "WebSocket failed to notify client of response with id={req_id}" + )); + } + + std::ops::ControlFlow::Continue(String::new()) + } + }); + + // Wait for either cancel/send/recv to finish + tokio::select! 
{ + _ = token.cancelled() => { + recv_task.abort(); + } + _ = (&mut recv_task) => { + token.cancel(); + } + } + + // TODO: Figure out how to cancel in-progress tasks for this client + + tracing::info!("sccache: {client_addr} shutdown"); + } + + app.route( + "/api/v2/client/ws", + routing::get( + |// Authenticate the client bearer token first + AuthenticatedClient(client): AuthenticatedClient, + ws: WebSocketUpgrade, + Extension(state): Extension>| async move { + tracing::debug!( + "/api/v2/client/ws incoming websocket connection from {client}" + ); + ws.on_upgrade(move |socket| handle_socket(state, client, socket)) + }, + ), + ) + } + + pub async fn serve( + self, + addr: SocketAddr, + enable_web_socket_server: bool, + max_body_size: usize, + ) -> Result<()> { + let state = self.state.clone(); + + let mut app = Self::with_http_routes(Self::make_router()); + + if enable_web_socket_server { + app = Self::with_websocket_routes(app); + } + + app = with_request_tracing( + app.fallback(|| async move { (StatusCode::NOT_FOUND, "404") }) + .layer(DefaultBodyLimit::max(max_body_size)) + .layer(Extension(Arc::clone(&state))), + ); + + let mut make_service = app.into_make_service_with_connect_info::(); + + let listener = TcpListener::bind(addr).await.unwrap(); + + tracing::info!("Scheduler listening for clients on {}", addr); + + loop { + let (tcp_stream, remote_addr) = listener.accept().await.unwrap(); + let tower_service = unwrap_infallible(make_service.call(remote_addr).await); + + tokio::spawn(async move { + // Hyper has its own `AsyncRead` and `AsyncWrite` traits and doesn't use tokio. + // `TokioIo` converts between them. + let tok_stream = TokioIo::new(tcp_stream); + + let hyper_service = hyper::service::service_fn( + move |request: Request| { + // Clone `tower_service` because hyper's `Service` uses `&self` whereas + // tower's `Service` requires `&mut self`. + tower_service.clone().oneshot(request) + }, + ); + + if let Err(err) = + hyper_util::server::conn::auto::Builder::new(TokioExecutor::new()) + .serve_connection_with_upgrades(tok_stream, hyper_service) + .await + { + tracing::debug!("sccache: failed to serve connection: {err:#}"); + } + }); + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs index d675afb440..876dd262ea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,7 @@ mod test; #[macro_use] pub mod errors; -mod cache; +pub mod cache; mod client; mod cmdline; mod commands; diff --git a/src/lru_disk_cache/mod.rs b/src/lru_disk_cache/mod.rs index 04c12169cc..ac67aa1525 100644 --- a/src/lru_disk_cache/mod.rs +++ b/src/lru_disk_cache/mod.rs @@ -391,19 +391,6 @@ impl LruDiskCache { }) } - /// Get an opened `File` for `key`, if one exists and can be opened. Updates the LRU state - /// of the file if present. Avoid using this method if at all possible, prefer `.get`. - /// Entries created by `LruDiskCache::prepare_add` but not yet committed return - /// `Err(Error::FileNotInCache)`. - pub async fn get_file_async>(&mut self, key: K) -> Result { - let rel_path = key.as_ref(); - let path = self.rel_to_abs_path(rel_path); - let _ = self.lru.get(rel_path).ok_or(Error::FileNotInCache)?; - let t = FileTime::now(); - set_file_times(&path, t, t)?; - tokio::fs::File::open(path).await.map_err(Into::into) - } - /// Get an opened readable and seekable handle to the file at `key`, if one exists and can /// be opened. Updates the LRU state of the file if present. 
/// Entries created by `LruDiskCache::prepare_add` but not yet committed return @@ -412,17 +399,6 @@ impl LruDiskCache { self.get_file(key).map(|f| Box::new(f) as Box) } - /// Get an opened readable and seekable handle to the file at `key`, if one exists and can - /// be opened. Updates the LRU state of the file if present. - /// Entries created by `LruDiskCache::prepare_add` but not yet committed return - /// `Err(Error::FileNotInCache)`. - pub async fn get_async>( - &mut self, - key: K, - ) -> Result> { - Ok(Box::new(self.get_file_async(key).await?)) - } - /// Remove the given key from the cache. pub fn remove>(&mut self, key: K) -> Result<()> { match self.lru.remove(key.as_ref()) { diff --git a/src/server.rs b/src/server.rs index 5718259bdd..34d4b314c2 100644 --- a/src/server.rs +++ b/src/server.rs @@ -369,7 +369,8 @@ impl DistClientContainer { &config.toolchains, auth_token, config.rewrite_includes_only, - ); + ) + .await; let dist_client = try_or_retry_later!(dist_client.context("failure during dist client creation")); use crate::dist::Client; @@ -441,7 +442,7 @@ pub fn start_server(config: &Config, addr: &crate::net::SocketAddr) -> Result<() let notify = env::var_os("SCCACHE_STARTUP_NOTIFY"); - let raw_storage = match storage_from_config(config, &pool) { + let raw_storage = match storage_from_config(&config.cache, &config.fallback_cache, &pool) { Ok(storage) => storage, Err(err) => { error!("storage init failed for: {err:?}"); @@ -1423,10 +1424,11 @@ where match dist_type { DistType::NoDist => {} - DistType::Ok(id) => { - let server = id.addr().to_string(); - let server_count = stats.dist_compiles.entry(server).or_insert(0); - *server_count += 1; + DistType::Ok => { + // let server = id.addr().to_string(); + // let server_count = stats.dist_compiles.entry(server).or_insert(0); + // *server_count += 1; + stats.dist_compiles_count += 1; } DistType::Error => stats.dist_errors += 1, } @@ -1606,6 +1608,8 @@ pub struct ServerStats { /// The count of compilations that were successfully distributed indexed /// by the server that ran those compilations. 
pub dist_compiles: HashMap, + /// The count of compilations that were successfully distributed + pub dist_compiles_count: u64, /// The count of compilations that were distributed but failed and had to be re-run locally pub dist_errors: u64, } @@ -1655,6 +1659,7 @@ impl Default for ServerStats { compile_fails: u64::default(), not_cached: HashMap::new(), dist_compiles: HashMap::new(), + dist_compiles_count: u64::default(), dist_errors: u64::default(), } } @@ -1786,6 +1791,11 @@ impl ServerStats { self.cache_hits.all(), "Average cache read hit" ); + set_stat!( + stats_vec, + self.dist_compiles_count, + "Successful distributed compiles" + ); set_stat!( stats_vec, self.dist_errors, @@ -1907,7 +1917,7 @@ impl ServerInfo { let cache_size; let max_cache_size; if let Some(storage) = storage { - cache_location = storage.location(); + cache_location = storage.location().await; use_preprocessor_cache_mode = storage .preprocessor_cache_mode_config() .use_preprocessor_cache_mode; diff --git a/src/test/mock_storage.rs b/src/test/mock_storage.rs index 00a6aa7c89..9a82bf22a5 100644 --- a/src/test/mock_storage.rs +++ b/src/test/mock_storage.rs @@ -16,6 +16,7 @@ use crate::cache::{Cache, CacheWrite, PreprocessorCacheModeConfig, Storage}; use crate::errors::*; use async_trait::async_trait; use futures::channel::mpsc; +use std::pin::Pin; use std::sync::Arc; use std::time::Duration; use tokio::sync::Mutex; @@ -57,6 +58,24 @@ impl Storage for MockStorage { next.expect("MockStorage get called but no get results available") } + async fn get_stream(&self, key: &str) -> Result> { + if let Some(delay) = self.delay { + sleep(delay).await; + } + let next = self.rx.lock().await.try_next().unwrap(); + let next = next.expect("MockStorage get called but no get results available")?; + match next { + Cache::Hit(file) => { + let reader = file.into_inner(); + let reader = futures::io::AllowStdIo::new(reader); + Ok(Box::new(reader) as Box) + } + _ => Err(anyhow!("No cache entry for key `{key}`")), + } + } + async fn has(&self, _key: &str) -> bool { + false + } async fn put(&self, _key: &str, _entry: CacheWrite) -> Result { Ok(if let Some(delay) = self.delay { sleep(delay).await; @@ -65,7 +84,17 @@ impl Storage for MockStorage { Duration::from_secs(0) }) } - fn location(&self) -> String { + async fn put_stream( + &self, + _key: &str, + _source: Pin<&mut (dyn futures::AsyncRead + Send)>, + ) -> Result<()> { + if let Some(delay) = self.delay { + sleep(delay).await; + } + Ok(()) + } + async fn location(&self) -> String { "Mock Storage".to_string() } async fn current_size(&self) -> Result> { diff --git a/src/util.rs b/src/util.rs index 5e287d5559..af84cd877c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -25,7 +25,6 @@ use std::cell::Cell; use std::ffi::{OsStr, OsString}; use std::hash::Hasher; use std::io::prelude::*; -use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{self, Stdio}; use std::str; @@ -931,42 +930,14 @@ pub fn daemonize() -> Result<()> { } #[cfg(any(feature = "dist-server", feature = "dist-client"))] -pub fn new_reqwest_client( - real_addr: Option, - certs: Option<&std::collections::HashMap, Vec)>>, -) -> std::result::Result { - let mut builder = reqwest::Client::builder(); - - if let Some(addr) = real_addr { - let mut headers = reqwest::header::HeaderMap::new(); - headers.insert( - "X-Real-IP", - reqwest::header::HeaderValue::from_str(&format!("{}", addr.ip())).unwrap(), - ); - builder = builder.default_headers(headers); - } - - // Add all the existing certificates - if let Some(certs) = 
diff --git a/src/util.rs b/src/util.rs
index 5e287d5559..af84cd877c 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -25,7 +25,6 @@ use std::cell::Cell;
 use std::ffi::{OsStr, OsString};
 use std::hash::Hasher;
 use std::io::prelude::*;
-use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{self, Stdio};
 use std::str;
@@ -931,42 +930,14 @@ pub fn daemonize() -> Result<()> {
 }
 
 #[cfg(any(feature = "dist-server", feature = "dist-client"))]
-pub fn new_reqwest_client(
-    real_addr: Option<SocketAddr>,
-    certs: Option<&std::collections::HashMap<ServerId, (Vec<u8>, Vec<u8>)>>,
-) -> std::result::Result<reqwest::Client, reqwest::Error> {
-    let mut builder = reqwest::Client::builder();
-
-    if let Some(addr) = real_addr {
-        let mut headers = reqwest::header::HeaderMap::new();
-        headers.insert(
-            "X-Real-IP",
-            reqwest::header::HeaderValue::from_str(&format!("{}", addr.ip())).unwrap(),
-        );
-        builder = builder.default_headers(headers);
-    }
-
-    // Add all the existing certificates
-    if let Some(certs) = certs {
-        for (server_id, (_, pem)) in certs.iter() {
-            if let Ok(cert) = reqwest::Certificate::from_pem(pem) {
-                builder = builder.add_root_certificate(cert);
-            } else {
-                warn!(
-                    "[new_reqwest_client({})]: skipping previously valid cert",
-                    server_id.addr()
-                );
-            }
-        }
-    }
-
-    builder
+pub fn new_reqwest_client() -> reqwest::Client {
+    reqwest::Client::builder()
         // Disable connection pool
         .pool_max_idle_per_host(0)
         .timeout(get_dist_request_timeout())
         .connect_timeout(get_dist_connect_timeout())
         .build()
-        // .expect("http client must build with success")
+        .expect("http client must build with success")
 }
 
 /// Disable connection pool to avoid broken connection between runtime
@@ -980,18 +951,8 @@ pub fn new_reqwest_client(
 ///
 /// More details could be found at https://github.com/mozilla/sccache/pull/1563
 #[cfg(any(feature = "dist-server", feature = "dist-client"))]
-pub fn new_reqwest_blocking_client(real_addr: Option<SocketAddr>) -> reqwest::blocking::Client {
-    let mut builder = reqwest::blocking::Client::builder();
-    if let Some(addr) = real_addr {
-        let mut headers = reqwest::header::HeaderMap::new();
-        headers.insert(
-            "X-Real-IP",
-            reqwest::header::HeaderValue::from_str(&format!("{}", addr.ip())).unwrap(),
-        );
-        builder = builder.default_headers(headers);
-    }
-
-    builder
+pub fn new_reqwest_blocking_client() -> reqwest::blocking::Client {
+    reqwest::blocking::Client::builder()
         // Disable connection pool
         .pool_max_idle_per_host(0)
         .timeout(get_dist_request_timeout())
diff --git a/tests/dist.rs b/tests/dist.rs
index faf45370af..ef74a75123 100644
--- a/tests/dist.rs
+++ b/tests/dist.rs
@@ -6,28 +6,26 @@ extern crate log;
 extern crate sccache;
 extern crate serde_json;
 
-use async_trait::async_trait;
+// use async_trait::async_trait;
 use crate::harness::{
     get_stats, sccache_command, start_local_daemon, stop_local_daemon, write_json_cfg,
     write_source,
 };
 use assert_cmd::prelude::*;
 use sccache::config::HTTPUrl;
-use sccache::dist::{
-    AssignJobResult, CompileCommand, JobId, RunJobResult, ServerIncoming, ServerOutgoing,
-    SubmitToolchainResult, Toolchain,
-};
 use std::ffi::OsStr;
 use std::path::Path;
 
-use sccache::errors::*;
+use test_case::test_case;
+
+// use sccache::errors::*;
 
 mod harness;
 
 fn basic_compile(tmpdir: &Path, sccache_cfg_path: &Path, sccache_cached_cfg_path: &Path) {
     let envs: Vec<(_, &OsStr)> = vec![
         ("RUST_BACKTRACE", "1".as_ref()),
-        ("SCCACHE_LOG", "trace".as_ref()),
+        ("SCCACHE_LOG", "sccache=trace".as_ref()),
         ("SCCACHE_CONF", sccache_cfg_path.as_ref()),
         ("SCCACHE_CACHED_CONF", sccache_cached_cfg_path.as_ref()),
     ];
@@ -60,42 +58,10 @@ pub fn dist_test_sccache_client_cfg(
     sccache_cfg
 }
 
-#[test]
-#[cfg_attr(not(feature = "dist-tests"), ignore)]
-fn test_dist_basic() {
-    let tmpdir = tempfile::Builder::new()
-        .prefix("sccache_dist_test")
-        .tempdir()
-        .unwrap();
-    let tmpdir = tmpdir.path();
-    let sccache_dist = harness::sccache_dist_path();
-
-    let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
-    system.add_scheduler();
-    system.add_server();
-
-    let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
-    let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
-    write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
-    let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");
-
-    stop_local_daemon();
-    start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
-    basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
-
-    get_stats(|info| {
-        assert_eq!(1, info.stats.dist_compiles.values().sum::<usize>());
-        assert_eq!(0, info.stats.dist_errors);
-        assert_eq!(1, info.stats.compile_requests);
-        assert_eq!(1, info.stats.requests_executed);
-        assert_eq!(0, info.stats.cache_hits.all());
-        assert_eq!(1, info.stats.cache_misses.all());
-    });
-}
-
-#[test]
+#[test_case("rabbitmq" ; "with RabbitMQ")]
+#[test_case("redis" ; "with Redis")]
 #[cfg_attr(not(feature = "dist-tests"), ignore)]
-fn test_dist_restartedserver() {
+fn test_dist_basic(message_broker: &str) {
     let tmpdir = tempfile::Builder::new()
         .prefix("sccache_dist_test")
         .tempdir()
         .unwrap();
@@ -104,8 +70,9 @@ fn test_dist_restartedserver() {
     let tmpdir = tmpdir.path();
     let sccache_dist = harness::sccache_dist_path();
 
     let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
-    system.add_scheduler();
-    let server_handle = system.add_server();
+    let message_broker = system.add_message_broker(message_broker);
+    system.add_scheduler(message_broker.clone());
+    system.add_server(message_broker.clone());
 
     let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
     let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
@@ -116,44 +83,10 @@ fn test_dist_restartedserver() {
     start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
     basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
 
-    system.restart_server(&server_handle);
-    basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
-
     get_stats(|info| {
-        assert_eq!(2, info.stats.dist_compiles.values().sum::<usize>());
+        // assert_eq!(1, info.stats.dist_compiles.values().sum::<usize>());
+        assert_eq!(1, info.stats.dist_compiles_count);
         assert_eq!(0, info.stats.dist_errors);
-        assert_eq!(2, info.stats.compile_requests);
-        assert_eq!(2, info.stats.requests_executed);
-        assert_eq!(0, info.stats.cache_hits.all());
-        assert_eq!(2, info.stats.cache_misses.all());
-    });
-}
-
-#[test]
-#[cfg_attr(not(feature = "dist-tests"), ignore)]
-fn test_dist_nobuilder() {
-    let tmpdir = tempfile::Builder::new()
-        .prefix("sccache_dist_test")
-        .tempdir()
-        .unwrap();
-    let tmpdir = tmpdir.path();
-    let sccache_dist = harness::sccache_dist_path();
-
-    let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
-    system.add_scheduler();
-
-    let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
-    let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
-    write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
-    let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");
-
-    stop_local_daemon();
-    start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
-    basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
-
-    get_stats(|info| {
-        assert_eq!(0, info.stats.dist_compiles.values().sum::<usize>());
-        assert_eq!(1, info.stats.dist_errors);
         assert_eq!(1, info.stats.compile_requests);
         assert_eq!(1, info.stats.requests_executed);
         assert_eq!(0, info.stats.cache_hits.all());
@@ -161,80 +94,154 @@ fn test_dist_nobuilder() {
     });
 }
 
-struct FailingServer;
-
-#[async_trait]
-impl ServerIncoming for FailingServer {
-    fn start_heartbeat(&self, requester: std::sync::Arc<dyn ServerOutgoing>) {
-        tokio::spawn(async move {
-            trace!("Performing heartbeat");
-            match requester.do_heartbeat(0, 0).await {
-                Ok(sccache::dist::HeartbeatServerResult { is_new }) => {
-                    trace!("Heartbeat success is_new={}", is_new);
-                }
-                Err(e) => {
-                    error!("Failed to send heartbeat to server: {}", e);
-                }
-            }
-        });
-    }
-
-    async fn handle_assign_job(&self, _tc: Toolchain) -> Result<AssignJobResult> {
-        let need_toolchain = false;
-        Ok(AssignJobResult {
-            job_id: JobId(0),
-            need_toolchain,
-            num_assigned_jobs: 1,
-            num_active_jobs: 0,
-        })
-    }
-
-    async fn handle_submit_toolchain(
-        &self,
-        _job_id: JobId,
-        _tc_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>,
-    ) -> Result<SubmitToolchainResult> {
-        panic!("should not have submitted toolchain")
-    }
-    async fn handle_run_job(
-        &self,
-        _job_id: JobId,
-        _command: CompileCommand,
-        _outputs: Vec<String>,
-        _inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>,
-    ) -> Result<RunJobResult> {
-        bail!("internal build failure")
-    }
-}
-
-#[test]
-#[cfg_attr(not(feature = "dist-tests"), ignore)]
-fn test_dist_failingserver() {
-    let tmpdir = tempfile::Builder::new()
-        .prefix("sccache_dist_test")
-        .tempdir()
-        .unwrap();
-    let tmpdir = tmpdir.path();
-    let sccache_dist = harness::sccache_dist_path();
-
-    let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
-    system.add_scheduler();
-    system.add_custom_server(FailingServer);
-
-    let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
-    let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
-    write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
-    let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");
-
-    stop_local_daemon();
-    start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
-    basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
-
-    get_stats(|info| {
-        assert_eq!(0, info.stats.dist_compiles.values().sum::<usize>());
-        assert_eq!(1, info.stats.dist_errors);
-        assert_eq!(1, info.stats.compile_requests);
-        assert_eq!(1, info.stats.requests_executed);
-        assert_eq!(0, info.stats.cache_hits.all());
-        assert_eq!(1, info.stats.cache_misses.all());
-    });
-}
+// #[test]
+// #[cfg_attr(not(feature = "dist-tests"), ignore)]
+// fn test_dist_restartedserver() {
+//     let tmpdir = tempfile::Builder::new()
+//         .prefix("sccache_dist_test")
+//         .tempdir()
+//         .unwrap();
+//     let tmpdir = tmpdir.path();
+//     let sccache_dist = harness::sccache_dist_path();
+
+//     let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
+//     system.add_scheduler();
+//     let server_handle = system.add_server();
+
+//     let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
+//     let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
+//     write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
+//     let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");
+
+//     stop_local_daemon();
+//     start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
+//     basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
+
+//     system.restart_server(&server_handle);
+//     basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);
+
+//     get_stats(|info| {
+//         assert_eq!(2, info.stats.dist_compiles.values().sum::<usize>());
+//         assert_eq!(2, info.stats.dist_compiles_count);
+//         assert_eq!(0, info.stats.dist_errors);
+//         assert_eq!(2, info.stats.compile_requests);
+//         assert_eq!(2, info.stats.requests_executed);
+//         assert_eq!(0, info.stats.cache_hits.all());
+//         assert_eq!(2, info.stats.cache_misses.all());
+//     });
+// }
+
+// #[test_case("rabbitmq" ; "with RabbitMQ")]
+// #[test_case("redis" ; "with Redis")]
+// #[cfg_attr(not(feature = "dist-tests"), ignore)]
+// fn test_dist_nobuilder(message_broker: &str) {
+//     let tmpdir = tempfile::Builder::new()
+//         .prefix("sccache_dist_test")
+//         .tempdir()
+//         .unwrap();
+//     let tmpdir = tmpdir.path();
+//     let sccache_dist = harness::sccache_dist_path();

+//     let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
+//     let message_broker = system.add_message_broker(message_broker);
+//     system.add_scheduler(message_broker.clone());
+//     // system.add_server(message_broker.clone());

+//     let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
+//     let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
+//     write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
+//     let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");

+//     stop_local_daemon();
+//     start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
+//     basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);

+//     get_stats(|info| {
+//         // assert_eq!(0, info.stats.dist_compiles.values().sum::<usize>());
+//         assert_eq!(0, info.stats.dist_compiles_count);
+//         assert_eq!(1, info.stats.dist_errors);
+//         assert_eq!(1, info.stats.compile_requests);
+//         assert_eq!(1, info.stats.requests_executed);
+//         assert_eq!(0, info.stats.cache_hits.all());
+//         assert_eq!(1, info.stats.cache_misses.all());
+//     });
+// }

+// struct FailingServer;

+// #[async_trait]
+// impl ServerIncoming for FailingServer {
+//     fn start_heartbeat(&self, requester: std::sync::Arc<dyn ServerOutgoing>) {
+//         tokio::spawn(async move {
+//             trace!("Performing heartbeat");
+//             match requester.do_heartbeat(0, 0).await {
+//                 Ok(sccache::dist::HeartbeatServerResult { is_new }) => {
+//                     trace!("Heartbeat success is_new={}", is_new);
+//                 }
+//                 Err(e) => {
+//                     error!("Failed to send heartbeat to server: {}", e);
+//                 }
+//             }
+//         });
+//     }

+//     async fn handle_assign_job(&self, _tc: Toolchain) -> Result<AssignJobResult> {
+//         let need_toolchain = false;
+//         Ok(AssignJobResult {
+//             job_id: JobId(0),
+//             need_toolchain,
+//             num_assigned_jobs: 1,
+//             num_active_jobs: 0,
+//         })
+//     }
+//     async fn handle_submit_toolchain(
+//         &self,
+//         _job_id: JobId,
+//         _tc_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>,
+//     ) -> Result<SubmitToolchainResult> {
+//         panic!("should not have submitted toolchain")
+//     }
+//     async fn handle_run_job(
+//         &self,
+//         _job_id: JobId,
+//         _command: CompileCommand,
+//         _outputs: Vec<String>,
+//         _inputs_rdr: std::pin::Pin<&mut (dyn tokio::io::AsyncRead + Send)>,
+//     ) -> Result<RunJobResult> {
+//         bail!("internal build failure")
+//     }
+// }

+// #[test]
+// #[cfg_attr(not(feature = "dist-tests"), ignore)]
+// fn test_dist_failingserver() {
+//     let tmpdir = tempfile::Builder::new()
+//         .prefix("sccache_dist_test")
+//         .tempdir()
+//         .unwrap();
+//     let tmpdir = tmpdir.path();
+//     let sccache_dist = harness::sccache_dist_path();

+//     let mut system = harness::DistSystem::new(&sccache_dist, tmpdir);
+//     system.add_scheduler();
+//     system.add_custom_server(FailingServer);

+//     let sccache_cfg = dist_test_sccache_client_cfg(tmpdir, system.scheduler_url());
+//     let sccache_cfg_path = tmpdir.join("sccache-cfg.json");
+//     write_json_cfg(tmpdir, "sccache-cfg.json", &sccache_cfg);
+//     let sccache_cached_cfg_path = tmpdir.join("sccache-cached-cfg");

+//     stop_local_daemon();
+//     start_local_daemon(&sccache_cfg_path, &sccache_cached_cfg_path);
+//     basic_compile(tmpdir, &sccache_cfg_path, &sccache_cached_cfg_path);

+//     get_stats(|info| {
+//         assert_eq!(0, info.stats.dist_compiles.values().sum::<usize>());
+//         assert_eq!(0, info.stats.dist_compiles_count);
+//         assert_eq!(1, info.stats.dist_errors);
+//         assert_eq!(1, info.stats.compile_requests);
+//         assert_eq!(1, info.stats.requests_executed);
+//         assert_eq!(0, info.stats.cache_hits.all());
+//         assert_eq!(1, info.stats.cache_misses.all());
+//     });
+// }
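A note on the `test_case` crate now driving the live `test_dist_basic` above: each `#[test_case(...)]` attribute generates a separate `#[test]` wrapper that invokes the function with that argument, so the test runs once per message broker. Approximately (the generated function names are illustrative, not exact):

    #[test]
    fn test_dist_basic_with_rabbit_mq() {
        test_dist_basic("rabbitmq")
    }

    #[test]
    fn test_dist_basic_with_redis() {
        test_dist_basic("redis")
    }
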
"dist-server"))] use sccache::config::HTTPUrl; -use sccache::dist::{self, SchedulerStatusResult, ServerId}; +use sccache::dist::{self, SchedulerStatusResult}; use sccache::server::ServerInfo; use std::env; use std::io::Write; -use std::net::{self, IpAddr, SocketAddr}; +use std::net::{self, SocketAddr}; use std::path::{Path, PathBuf}; use std::process::{Command, Output, Stdio}; -use std::str::{self, FromStr}; +use std::str; use std::thread; use std::time::{Duration, Instant}; @@ -19,8 +19,14 @@ use nix::{ signal::Signal, wait::{WaitPidFlag, WaitStatus}, }, - unistd::{ForkResult, Pid}, + unistd::{ + // ForkResult, + Pid, + }, }; +#[cfg(feature = "dist-server")] +use sccache::config::MessageBroker; + use predicates::prelude::*; use serde::Serialize; use uuid::Uuid; @@ -31,12 +37,12 @@ const DIST_DOCKERFILE: &str = include_str!("Dockerfile.sccache-dist"); const DIST_IMAGE_BWRAP_PATH: &str = "/usr/bin/bwrap"; const MAX_STARTUP_WAIT: Duration = Duration::from_secs(5); -const DIST_SERVER_TOKEN: &str = "THIS IS THE TEST TOKEN"; +// const DIST_SERVER_TOKEN: &str = "THIS IS THE TEST TOKEN"; const CONFIGS_CONTAINER_PATH: &str = "/sccache-bits"; const BUILD_DIR_CONTAINER_PATH: &str = "/sccache-bits/build-dir"; const SCHEDULER_PORT: u16 = 10500; -const SERVER_PORT: u16 = 12345; // arbitrary +// const SERVER_PORT: u16 = 12345; // arbitrary const TC_CACHE_SIZE: u64 = 1024 * 1024 * 1024; // 1 gig @@ -46,7 +52,7 @@ pub fn start_local_daemon(cfg_path: &Path, cached_cfg_path: &Path) { if !sccache_command() .arg("--start-server") // Uncomment following lines to debug locally. - .env("SCCACHE_LOG", "trace") + .env("SCCACHE_LOG", "sccache=trace") .env( "SCCACHE_ERROR_LOG", env::temp_dir().join("sccache_local_daemon.txt"), @@ -169,61 +175,64 @@ pub fn sccache_client_cfg( } #[cfg(feature = "dist-server")] -fn sccache_scheduler_cfg() -> sccache::config::scheduler::Config { - sccache::config::scheduler::Config { - public_addr: SocketAddr::from(([0, 0, 0, 0], SCHEDULER_PORT)), - client_auth: sccache::config::scheduler::ClientAuth::Insecure, - server_auth: sccache::config::scheduler::ServerAuth::Token { - token: DIST_SERVER_TOKEN.to_owned(), - }, - ..Default::default() - } +fn sccache_scheduler_cfg( + tmpdir: &Path, + message_broker: MessageBroker, +) -> sccache::config::scheduler::Config { + let toolchains_path = "server-toolchains"; + fs::create_dir(tmpdir.join(toolchains_path)).unwrap(); + + let mut config = sccache::config::scheduler::Config::load(None).unwrap(); + config.message_broker = Some(message_broker); + config.public_addr = SocketAddr::from(([0, 0, 0, 0], SCHEDULER_PORT)); + config.client_auth = sccache::config::scheduler::ClientAuth::Insecure; + config.toolchains_fallback.dir = Path::new(CONFIGS_CONTAINER_PATH).join(toolchains_path); + config + // sccache::config::scheduler::Config { + // public_addr: SocketAddr::from(([0, 0, 0, 0], SCHEDULER_PORT)), + // client_auth: sccache::config::scheduler::ClientAuth::Insecure, + // server_auth: sccache::config::scheduler::ServerAuth::Token { + // token: DIST_SERVER_TOKEN.to_owned(), + // }, + // ..Default::default() + // } } #[cfg(feature = "dist-server")] fn sccache_server_cfg( tmpdir: &Path, - scheduler_url: HTTPUrl, - server_ip: IpAddr, + message_broker: MessageBroker, + // server_ip: IpAddr, ) -> sccache::config::server::Config { let relpath = "server-cache"; + let toolchains_path = "server-toolchains"; fs::create_dir(tmpdir.join(relpath)).unwrap(); + fs::create_dir(tmpdir.join(toolchains_path)).unwrap_or_default(); - sccache::config::server::Config { - 
-    sccache::config::server::Config {
-        builder: sccache::config::server::BuilderType::Overlay {
-            build_dir: BUILD_DIR_CONTAINER_PATH.into(),
-            bwrap_path: DIST_IMAGE_BWRAP_PATH.into(),
-        },
-        cache_dir: Path::new(CONFIGS_CONTAINER_PATH).join(relpath),
-        public_addr: SocketAddr::new(server_ip, SERVER_PORT),
-        bind_addr: None,
-        scheduler_url,
-        scheduler_auth: sccache::config::server::SchedulerAuth::Token {
-            token: DIST_SERVER_TOKEN.to_owned(),
-        },
-        toolchain_cache_size: TC_CACHE_SIZE,
-        ..Default::default()
-    }
-}
-
-// TODO: this is copied from the sccache-dist binary - it's not clear where would be a better place to put the
-// code so that it can be included here
-#[cfg(feature = "dist-server")]
-fn create_server_token(server_id: ServerId, auth_token: &str) -> String {
-    format!("{} {}", server_id.addr(), auth_token)
+    let mut config = sccache::config::server::Config::load(None).unwrap();
+    config.message_broker = Some(message_broker);
+    config.builder = sccache::config::server::BuilderType::Overlay {
+        build_dir: BUILD_DIR_CONTAINER_PATH.into(),
+        bwrap_path: DIST_IMAGE_BWRAP_PATH.into(),
+    };
+    config.cache_dir = Path::new(CONFIGS_CONTAINER_PATH).join(relpath);
+    config.toolchains_fallback.dir = Path::new(CONFIGS_CONTAINER_PATH).join(toolchains_path);
+    config.toolchain_cache_size = TC_CACHE_SIZE as i64;
+    config
 }
 
-#[cfg(feature = "dist-server")]
-pub enum ServerHandle {
-    Container { cid: String, url: HTTPUrl },
-    Process { pid: Pid, url: HTTPUrl },
-}
+// #[cfg(feature = "dist-server")]
+// pub enum ServerHandle {
+//     Container { cid: String, url: HTTPUrl },
+//     // Process { pid: Pid, url: HTTPUrl },
+// }
 
 #[cfg(feature = "dist-server")]
 pub struct DistSystem {
     sccache_dist: PathBuf,
     tmpdir: PathBuf,
 
+    message_broker_name: Option<String>,
     scheduler_name: Option<String>,
     server_names: Vec<String>,
     server_pids: Vec<Pid>,
@@ -256,21 +265,62 @@ impl DistSystem {
             sccache_dist: sccache_dist.to_owned(),
             tmpdir,
 
+            message_broker_name: None,
             scheduler_name: None,
             server_names: vec![],
             server_pids: vec![],
         }
     }
 
-    pub fn add_scheduler(&mut self) {
+    pub fn add_message_broker(&mut self, message_broker: &str) -> MessageBroker {
+        match message_broker {
+            "rabbitmq" => self.add_rabbit_mq(),
+            "redis" => self.add_redis(),
+            _ => unreachable!("unknown message broker: {message_broker}"),
+        }
+    }
+
+    pub fn add_rabbit_mq(&mut self) -> MessageBroker {
+        self.run_message_broker("rabbitmq:4", "5672:5672");
+        MessageBroker::AMQP("amqp://127.0.0.1:5672//".into())
+    }
+
+    pub fn add_redis(&mut self) -> MessageBroker {
+        self.run_message_broker("redis:7", "6379:6379");
+        MessageBroker::Redis("redis://127.0.0.1:6379/".into())
+    }
+
+    fn run_message_broker(&mut self, image_tag: &str, ports: &str) {
+        let message_broker_name = make_container_name("message_broker");
+        let output = Command::new("docker")
+            .args([
+                "run",
+                "--name",
+                &message_broker_name,
+                "-p",
+                ports,
+                "-d",
+                image_tag,
+            ])
+            .output()
+            .unwrap();
+
+        check_output(&output);
+
+        thread::sleep(Duration::from_secs(5));
+
+        self.message_broker_name = Some(message_broker_name);
+    }
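The fixed `thread::sleep(Duration::from_secs(5))` in `run_message_broker` is racy on slow machines and wasteful on fast ones. A possible tightening, sketched under the assumption that a TCP accept is an adequate readiness signal; `wait_for_port` is a hypothetical helper, not part of the patch:

    use std::net::TcpStream;
    use std::thread;
    use std::time::{Duration, Instant};

    // Poll the broker's published host port until it accepts a connection.
    fn wait_for_port(addr: &str, timeout: Duration) -> Result<(), String> {
        let deadline = Instant::now() + timeout;
        while Instant::now() < deadline {
            if TcpStream::connect(addr).is_ok() {
                return Ok(());
            }
            thread::sleep(Duration::from_millis(100));
        }
        Err(format!("{addr} did not become reachable within {timeout:?}"))
    }

Note that accepting a TCP connection is only a rough signal — RabbitMQ can accept connections slightly before the AMQP listener is fully up — so a retry on first use would still be prudent.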
+
+    pub fn add_scheduler(&mut self, message_broker: MessageBroker) {
         let scheduler_cfg_relpath = "scheduler-cfg.json";
         let scheduler_cfg_path = self.tmpdir.join(scheduler_cfg_relpath);
         let scheduler_cfg_container_path =
             Path::new(CONFIGS_CONTAINER_PATH).join(scheduler_cfg_relpath);
-        let scheduler_cfg = sccache_scheduler_cfg();
+        let scheduler_cfg = sccache_scheduler_cfg(&self.tmpdir, message_broker);
         fs::File::create(scheduler_cfg_path)
             .unwrap()
-            .write_all(&serde_json::to_vec(&scheduler_cfg).unwrap())
+            .write_all(&serde_json::to_vec(&scheduler_cfg.into_file()).unwrap())
             .unwrap();
 
         // Create the scheduler
@@ -283,7 +333,7 @@ impl DistSystem {
                 "-e",
                 "SCCACHE_NO_DAEMON=1",
                 "-e",
-                "SCCACHE_LOG=trace",
+                "SCCACHE_LOG=sccache=trace,tower_http=debug,axum::rejection=trace",
                 "-e",
                 "RUST_BACKTRACE=1",
                 "--network",
@@ -339,7 +389,7 @@ impl DistSystem {
         );
     }
 
-    pub fn add_server(&mut self) -> ServerHandle {
+    pub fn add_server(&mut self, message_broker: MessageBroker) {
         let server_cfg_relpath = format!("server-cfg-{}.json", self.server_names.len());
         let server_cfg_path = self.tmpdir.join(&server_cfg_relpath);
         let server_cfg_container_path = Path::new(CONFIGS_CONTAINER_PATH).join(server_cfg_relpath);
@@ -353,7 +403,9 @@ impl DistSystem {
                 "--name",
                 &server_name,
                 "-e",
-                "SCCACHE_LOG=trace",
+                "SCCACHE_NO_DAEMON=1",
+                "-e",
+                "SCCACHE_LOG=sccache=trace,tower_http=debug,axum::rejection=trace",
                 "-e",
                 "RUST_BACKTRACE=1",
                 "--network",
@@ -381,130 +433,133 @@ impl DistSystem {
             ])
             .output()
             .unwrap();
+        self.server_names.push(server_name.clone());
 
         check_output(&output);
 
-        let server_ip = IpAddr::from_str("127.0.0.1").unwrap();
-        let server_cfg = sccache_server_cfg(&self.tmpdir, self.scheduler_url(), server_ip);
+        // let server_ip = IpAddr::from_str("127.0.0.1").unwrap();
+        let server_cfg = sccache_server_cfg(&self.tmpdir, message_broker);
 
         fs::File::create(&server_cfg_path)
             .unwrap()
-            .write_all(&serde_json::to_vec(&server_cfg).unwrap())
+            .write_all(&serde_json::to_vec(&server_cfg.into_file()).unwrap())
             .unwrap();
         fs::File::create(format!("{}.ready", server_cfg_path.to_str().unwrap())).unwrap();
 
-        let url = HTTPUrl::from_url(
-            reqwest::Url::parse(&format!("https://{}:{}", server_ip, SERVER_PORT)).unwrap(),
-        );
-        let handle = ServerHandle::Container {
-            cid: server_name,
-            url,
-        };
-        self.wait_server_ready(&handle);
-        handle
-    }
-
-    pub fn add_custom_server<S: dist::ServerIncoming + 'static>(
-        &mut self,
-        handler: S,
-    ) -> ServerHandle {
-        let server_addr = {
-            let ip = IpAddr::from_str("127.0.0.1").unwrap();
-            let listener = net::TcpListener::bind(SocketAddr::from((ip, 0))).unwrap();
-            listener.local_addr().unwrap()
-        };
-
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .worker_threads(1)
-            .build()
-            .unwrap();
-
-        let token = create_server_token(ServerId::new(server_addr), DIST_SERVER_TOKEN);
-        let server = dist::http::Server::new(
-            server_addr,
-            server_addr,
-            self.scheduler_url().to_url(),
-            token,
-            1f64,
-            1,
-            handler,
-        )
-        .unwrap();
-
-        let pid = match unsafe { nix::unistd::fork() }.unwrap() {
-            ForkResult::Parent { child } => {
-                self.server_pids.push(child);
-                child
-            }
-            ForkResult::Child => {
-                env::set_var("SCCACHE_LOG", "sccache=trace");
-                env_logger::try_init().unwrap();
-
-                runtime.block_on(async move {
-                    match server.start().await {
-                        Ok(_) => {}
-                        Err(err) => panic!("Err: {err}"),
-                    }
-                });
-
-                unreachable!();
-            }
-        };
-
-        let url =
-            HTTPUrl::from_url(reqwest::Url::parse(&format!("https://{}", server_addr)).unwrap());
-        let handle = ServerHandle::Process { pid, url };
-        self.wait_server_ready(&handle);
-        handle
+        thread::sleep(Duration::from_secs(5));
+
+        // let url = HTTPUrl::from_url(
+        //     reqwest::Url::parse(&format!("https://{}:{}", server_ip, SERVER_PORT)).unwrap(),
+        // );
+        // let handle = ServerHandle::Container {
+        //     cid: server_name,
+        //     url,
+        // };
+        // self.wait_server_ready(&handle);
+        // handle
     }
 
-    pub fn restart_server(&mut self, handle: &ServerHandle) {
-        match handle {
-            ServerHandle::Container { cid, url: _ } => {
-                let output = Command::new("docker")
.args(["restart", cid]) - .output() - .unwrap(); - check_output(&output); - } - ServerHandle::Process { pid: _, url: _ } => { - // TODO: pretty easy, just no need yet - panic!("restart not yet implemented for pids") - } - } - self.wait_server_ready(handle) - } - - pub fn wait_server_ready(&mut self, handle: &ServerHandle) { - let url = match handle { - ServerHandle::Container { cid: _, url } | ServerHandle::Process { pid: _, url } => { - url.clone() - } - }; - wait_for_http(url, Duration::from_millis(100), MAX_STARTUP_WAIT); - wait_for( - || { - let status = self.scheduler_status(); - if matches!( - status, - SchedulerStatusResult { - num_servers: 1, - num_cpus: _, - active: 0, - assigned: 0, - servers: _ - } - ) { - Ok(()) - } else { - Err(format!("{:?}", status)) - } - }, - Duration::from_millis(100), - MAX_STARTUP_WAIT, - ); - } + // pub fn add_custom_server( + // &mut self, + // handler: S, + // ) -> ServerHandle { + // let server_addr = { + // let ip = IpAddr::from_str("127.0.0.1").unwrap(); + // let listener = net::TcpListener::bind(SocketAddr::from((ip, 0))).unwrap(); + // listener.local_addr().unwrap() + // }; + + // let runtime = tokio::runtime::Builder::new_current_thread() + // .enable_all() + // .worker_threads(1) + // .build() + // .unwrap(); + + // let token = create_server_token(ServerId::new(server_addr), DIST_SERVER_TOKEN); + // let server = dist::http::Server::new( + // server_addr, + // server_addr, + // self.scheduler_url().to_url(), + // token, + // 1f64, + // 1, + // handler, + // ) + // .unwrap(); + + // let pid = match unsafe { nix::unistd::fork() }.unwrap() { + // ForkResult::Parent { child } => { + // self.server_pids.push(child); + // child + // } + // ForkResult::Child => { + // env::set_var("SCCACHE_LOG", "sccache=trace"); + // env_logger::try_init().unwrap(); + + // runtime.block_on(async move { + // match server.start().await { + // Ok(_) => {} + // Err(err) => panic!("Err: {err}"), + // } + // }); + + // unreachable!(); + // } + // }; + + // let url = + // HTTPUrl::from_url(reqwest::Url::parse(&format!("https://{}", server_addr)).unwrap()); + // let handle = ServerHandle::Process { pid, url }; + // self.wait_server_ready(&handle); + // handle + // } + + // pub fn restart_server(&mut self, handle: &ServerHandle) { + // match handle { + // ServerHandle::Container { cid, url: _ } => { + // let output = Command::new("docker") + // .args(["restart", cid]) + // .output() + // .unwrap(); + // check_output(&output); + // } // ServerHandle::Process { pid: _, url: _ } => { + // // // TODO: pretty easy, just no need yet + // // panic!("restart not yet implemented for pids") + // // } + // } + // self.wait_server_ready(handle) + // } + + // pub fn wait_server_ready(&mut self, handle: &ServerHandle) { + // let url = match handle { + // ServerHandle::Container { cid: _, url } => url.clone(), // + // // ServerHandle::Process { pid: _, url } => { + // // url.clone() + // // } + // }; + // wait_for_http(url, Duration::from_millis(100), MAX_STARTUP_WAIT); + // wait_for( + // || { + // let status = self.scheduler_status(); + // if matches!( + // status, + // SchedulerStatusResult { + // num_servers: 1, + // num_cpus: _, + // active: 0, + // assigned: 0, + // servers: _ + // } + // ) { + // Ok(()) + // } else { + // Err(format!("{:?}", status)) + // } + // }, + // Duration::from_millis(100), + // MAX_STARTUP_WAIT, + // ); + // } pub fn scheduler_url(&self) -> HTTPUrl { let url = format!("http://127.0.0.1:{}", SCHEDULER_PORT); @@ -561,6 +616,7 @@ impl Drop for DistSystem { 
                 .output()
                 .map(|o| outputs.push((scheduler_name, o))));
         }
+
         for server_name in self.server_names.iter() {
             droperr!(Command::new("docker")
                 .args(["logs", server_name])
@@ -575,6 +631,7 @@ impl Drop for DistSystem {
                 .output()
                 .map(|o| outputs.push((server_name, o))));
         }
+
         for &pid in self.server_pids.iter() {
             droperr!(nix::sys::signal::kill(pid, Signal::SIGINT));
             thread::sleep(Duration::from_millis(100));
@@ -601,6 +658,22 @@ impl Drop for DistSystem {
             }
         }
 
+        // Kill the message broker last
+        if let Some(message_broker_name) = self.message_broker_name.as_ref() {
+            droperr!(Command::new("docker")
+                .args(["logs", message_broker_name])
+                .output()
+                .map(|o| logs.push((message_broker_name, o))));
+            droperr!(Command::new("docker")
+                .args(["kill", message_broker_name])
+                .output()
+                .map(|o| outputs.push((message_broker_name, o))));
+            droperr!(Command::new("docker")
+                .args(["rm", "-f", message_broker_name])
+                .output()
+                .map(|o| outputs.push((message_broker_name, o))));
+        }
+
         for (
             container,
             Output {
diff --git a/tests/sccache_args.rs b/tests/sccache_args.rs
index 82f91b35a5..a9725eb431 100644
--- a/tests/sccache_args.rs
+++ b/tests/sccache_args.rs
@@ -1,3 +1,5 @@
+#![cfg(any(feature = "gcs", feature = "s3"))]
+
 //! Tests for sccache args.
 //!
 //! Any copyright is dedicated to the Public Domain.
@@ -23,7 +25,7 @@ fn test_gcp_arg_check() -> Result<()> {
     let mut cmd = Command::new(SCCACHE_BIN.as_os_str());
 
     cmd.arg("--start-server")
-        .env("SCCACHE_LOG", "debug")
+        .env("SCCACHE_LOG", "sccache=debug")
         .env("SCCACHE_GCS_KEY_PATH", "foo.json");
 
     cmd.assert().failure().stderr(predicate::str::contains(
@@ -34,7 +36,7 @@ fn test_gcp_arg_check() -> Result<()> {
     let mut cmd = Command::new(SCCACHE_BIN.as_os_str());
 
     cmd.arg("--start-server")
-        .env("SCCACHE_LOG", "debug")
+        .env("SCCACHE_LOG", "sccache=debug")
         .env("SCCACHE_GCS_OAUTH_URL", "http://127.0.0.1");
 
     cmd.assert().failure().stderr(predicate::str::contains(
@@ -44,7 +46,7 @@ fn test_gcp_arg_check() -> Result<()> {
     stop_sccache()?;
     let mut cmd = Command::new(SCCACHE_BIN.as_os_str());
     cmd.arg("--start-server")
-        .env("SCCACHE_LOG", "debug")
+        .env("SCCACHE_LOG", "sccache=debug")
         .env("SCCACHE_GCS_BUCKET", "b")
         .env("SCCACHE_GCS_CREDENTIALS_URL", "not_valid_url//127.0.0.1")
         .env("SCCACHE_GCS_KEY_PATH", "foo.json");
@@ -65,7 +67,7 @@ fn test_s3_invalid_args() -> Result<()> {
     let mut cmd = Command::new(SCCACHE_BIN.as_os_str());
 
     cmd.arg("--start-server")
-        .env("SCCACHE_LOG", "debug")
+        .env("SCCACHE_LOG", "sccache=debug")
         .env("SCCACHE_BUCKET", "test")
         .env("SCCACHE_REGION", "us-east-1")
         .env("AWS_ACCESS_KEY_ID", "invalid_ak")