From 4a3fdc3941a2b097bb5c97889f7dd0bc657e342d Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:18:01 +0200 Subject: [PATCH 01/45] run dev branch --- .github/workflows/benchmark.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1a1e82..500eb58 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,7 +18,6 @@ jobs: run-benchmark: name: Run Benchmark runs-on: ubuntu-latest - ## runs-on: self-hosted steps: - name: Check out repository uses: actions/checkout@v4 @@ -49,7 +48,7 @@ jobs: shell: bash -l {0} run: | mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope + pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev - name: Load benchmark cache id: cache-benchmark @@ -67,7 +66,6 @@ jobs: upload-artifact: name: Benchmark Artifact runs-on: ubuntu-latest - ## runs-on: self-hosted needs: run-benchmark if: always() steps: From e89adda93e7fdc64b52b2e77dc702969a50f735c Mon Sep 17 00:00:00 2001 From: btraven <128150520+btraven00@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:20:31 +0100 Subject: [PATCH 02/45] docs: use the public repo URI --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a75c594..89d7c05 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ A clustering example for omnibenchmark # How to run 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) -2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` -3. Move to the cloned repository `cd clustering_example` +2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example` +3. Move into the cloned folder: `cd clustering_example` 4. 
Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). # Clustbench attribution From 52ebb556eae88f36d2e857aadfe8189c4aca3eaf Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:02:37 +0100 Subject: [PATCH 03/45] chore: add convenience target to build singularity env - make script executable - use /bin/sh instead of /bin/bash - add top-level Makefile to prepare env --- Makefile | 2 ++ envs/build_singularity.sh | 14 +++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) create mode 100644 Makefile mode change 100644 => 100755 envs/build_singularity.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1e56cb2 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +prepare_apptainer_env: + cd envs && ./build_singularity.sh diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh old mode 100644 new mode 100755 index 86e053f..c0c3d93 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,9 +1,5 @@ -#!/bin/bash - -sudo singularity build sklearn.sif sklearn_singularity.def - -sudo singularity build clustbench.sif clustbench_singularity.def - -sudo singularity build r.sif r_singularity.def - -sudo singularity build fcps.sif fcps_singularity.def +#!/bin/sh +singularity build sklearn.sif sklearn_singularity.def +singularity build clustbench.sif clustbench_singularity.def +singularity build r.sif r_singularity.def +singularity build fcps.sif fcps_singularity.def From 83c6f0b0c78851d93be5956fd27a8180c61b2ba7 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:59:05 +0100 Subject: [PATCH 04/45] feat: parametrize num threads on the makefile --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index 1e56cb2..3c58e2f 
100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,10 @@ +MAX_THREADS ?= 30 +OB_CMD="ob run benchmark -k --local" prepare_apptainer_env: cd envs && ./build_singularity.sh +run_with_apptainer_backend: + ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS} + mv out out_apptainer +run_with_conda_backend: + ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS} + mv out out_conda From dc2d629004fcdb40f75bc24194287b961eb40283 Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 14:02:21 +0100 Subject: [PATCH 05/45] chore: ignore common temporary outputs and image build artifacts --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d38534 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# image build artifacts +envs/*.sif + +# snakemake +snakemake.log +.snakemake/ + +# vim swaps +*.swp +*.swo From f91603aecf8f82975087c89615d3473d4b79c12f Mon Sep 17 00:00:00 2001 From: ben Date: Tue, 18 Mar 2025 13:59:05 +0100 Subject: [PATCH 06/45] feat: parametrize num threads on the makefile --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3c58e2f..6883fa0 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ MAX_THREADS ?= 30 -OB_CMD="ob run benchmark -k --local" +# by default, we want to run all snakemake rules even if there are failures +OB_CMD=ob run benchmark -k --local prepare_apptainer_env: cd envs && ./build_singularity.sh run_with_apptainer_backend: From bea2a75173f9c19edb2adb1c22bc1ab90d62774d Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 5 May 2025 10:07:36 +0200 Subject: [PATCH 07/45] fix: use --cores, --task-timeout --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6883fa0..73b33b5 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ -MAX_THREADS ?= 30 +MAX_CORES ?= 10 # by default, we want to run all snakemake rules even 
if there are failures -OB_CMD=ob run benchmark -k --local +OB_CMD=ob run benchmark -k --local --task-timeout "4h" prepare_apptainer_env: cd envs && ./build_singularity.sh run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS} + ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES} mv out out_apptainer run_with_conda_backend: - ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS} + ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES} mv out out_conda From 67e8cf8bc7e0deab9f6bfdc5aceaffe39841040e Mon Sep 17 00:00:00 2001 From: ben Date: Wed, 7 May 2025 21:41:46 +0200 Subject: [PATCH 08/45] update .eb files to easybuild 5.0 --- Makefile | 9 ++++-- envs/clustbench.eb | 81 ++++++---------------------------------------- envs/fcps.eb | 18 ++++------- 3 files changed, 23 insertions(+), 85 deletions(-) diff --git a/Makefile b/Makefile index 73b33b5..e107f62 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,14 @@ MAX_CORES ?= 10 # by default, we want to run all snakemake rules even if there are failures -OB_CMD=ob run benchmark -k --local --task-timeout "4h" +OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} prepare_apptainer_env: cd envs && ./build_singularity.sh +prepare_envmodules_env: + cd envs && eb clustbench.eb --robot + cd envs && eb fcps.eb --robot run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES} + ${OB_CMD} -b Clustering_singularity.yml mv out out_apptainer run_with_conda_backend: - ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES} + ${OB_CMD} -b Clustering_conda.yml mv out out_conda diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 22597fb..f3ee681 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -1,108 +1,47 @@ -## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4 - easyblock = 'PythonBundle' name 
= 'clustbench' -version = '1' +version = '0.1.0' -homepage = 'https://python.org/' +homepage = 'https://omnibenchmark.org' description = "Bundle of Python packages for ob clustering_example" toolchain = {'name': 'foss', 'version': '2023b'} - dependencies = [ ('Python', '3.11.5'), - ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ('SciPy-bundle', '2023.11'), - ('meson-python', '0.15.0'), ('matplotlib', '3.8.2'), - ('scikit-learn', '1.4.0') - + ('scikit-learn', '1.4.0'), +# ('meson-python', '0.15.0'), +# ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] -sanity_pip_check = True -use_pip = True - -exts_default_options = { - 'sanity_pip_check': True, - 'use_pip' : True -} - -## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz -## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip -## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz -## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz -## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -source_urls = [PYPI_SOURCE, - 'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/', - 'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/', - 'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/', - 'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/', - 
'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/', - 'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa', - 'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/', - 'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/', - 'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/', - 'https://github.com/pybind/pybind11/archive/', - 'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/', - 'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz', - 'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz', - 'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz', - 'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz', - 'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz', - 
'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz', - 'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz'] - - -## caution download genieclust here, not pypi, they differ and pypi's it's not installable! -## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/ -## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz -## todo automate this within the easyconfig! - exts_list = [ ('natsort', '8.4.0', { 'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'], }), - ('cython', '3.0.11', { - 'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'], - }), ('hypothesis', '6.124.7', { 'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'], }), - ('numpy', '1.26.4', { - 'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'], - }), ('fastcluster', '1.2.6', { 'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'], }), - ('genieclust', '1.1.6', { - 'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'], - }), ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], }), ('python3-wget', '0.0.2-beta1', { 'modulename': 'wget', + 'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'], 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], }), - ('clustering_benchmarks', '1.1.5', { - 'modulename': 'clustbench', - 'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'], + 
('genieclust', '1.1.6', { + 'download_dep_fail': False, + 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', + 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], }), ] -sanity_check_paths = { - 'files': [], - 'dirs': ['lib/python3.11/site-packages/clustbench/'] -} - moduleclass = 'bio' diff --git a/envs/fcps.eb b/envs/fcps.eb index ee3db52..54c8c7d 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -1,6 +1,3 @@ -## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699 -## Caution boost easyconfig needs update https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2) - easyblock = 'Bundle' name = 'fcps' @@ -23,10 +20,7 @@ dependencies = [ exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/', - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s', - 'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/', - 'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages @@ -192,13 +186,15 @@ exts_list = [ ('cluster', '2.1.8', { 'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'], }), - ('graph', '1.84.1', { - 'checksums': 
['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'], + ('graph', '1.86.0', { + 'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'], }), ('mclust', '6.1.1', { 'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'], }), - ('cclust', '0.6-26'), + ('cclust', '0.6-26', { + 'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'], + }), ('flowClust', '3.40.0', { 'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'", 'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'], @@ -240,4 +236,4 @@ sanity_check_paths = { 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'], } -moduleclass = 'bio' \ No newline at end of file +moduleclass = 'bio' From 931389f796ef8ceb7e4951c80c708e1b2c2129b1 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:38:02 +0200 Subject: [PATCH 09/45] remove remote storage --- Clustering.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Clustering.yaml b/Clustering.yaml index 0007ea5..689be2c 100644 --- a/Clustering.yaml +++ b/Clustering.yaml @@ -2,10 +2,10 @@ id: clustering_example description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
version: 1.2 benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +# storage: https://play.min.io +# storage_api: S3 +# storage_bucket_name: clustering_example software_backend: conda software_environments: clustbench: From 60ac47b3c55bec65b5ad839d524a7b8cd87b1b4c Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:41:44 +0200 Subject: [PATCH 10/45] do not run artifact if not in main repo --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 500eb58..2a55846 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -67,7 +67,7 @@ jobs: name: Benchmark Artifact runs-on: ubuntu-latest needs: run-benchmark - if: always() + if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark' steps: - name: Check out repository uses: actions/checkout@v4 From 1b972bfef0d7a74199d0289d8b7b8749720bce27 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 12:45:12 +0200 Subject: [PATCH 11/45] Update Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e107f62..875a375 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ MAX_CORES ?= 10 -# by default, we want to run all snakemake rules even if there are failures +# by default, we want to run all snakemake rules even if there are failures (-k) OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} prepare_apptainer_env: cd envs && ./build_singularity.sh @@ -12,3 +12,6 @@ run_with_apptainer_backend: run_with_conda_backend: ${OB_CMD} -b Clustering_conda.yml mv out out_conda +run_with_envmodules_backend: + ${OB_CMD} -b Clustering_envmodules.yml + mv out out_lmod From 49646db648dee014b3a43f655ef64147cbda6ed0 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 
13:22:56 +0200 Subject: [PATCH 12/45] streamline envmodules yaml --- Clustering_envmodules.yml | 281 ++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 132 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 3c2b8bd..1ab4808 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -2,32 +2,21 @@ id: clustering_example_envmodules description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: 1.4 benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleenvmodules +benchmark_yaml_spec: 0.5 + software_backend: envmodules + software_environments: clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho + conda: na + apptainer: na rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: rmarkdown # TODO + conda: na + apptainer: na fcps: description: "CRAN's FCPS" conda: envs/fcps.yml @@ -56,42 +45,132 @@ stages: repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 
1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "chainlink", + ] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "engytime", + ] # 2 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "twodiamonds", + ] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "fuzzyx", + ] # 2, 4, 5 6 - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "parabolic", + ] # 2, 4 2 - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", 
"--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_noisy", + ] # 2 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_outliers", + ] # 2, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_noisy", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_outliers", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t4_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t5_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "hdbscan", + ] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "aggregation", + ] # 7 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "compound", + ] # 4, 5, 6 5 - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "pathbased", + ] # 3, 4 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 
- - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "unbalance", + ] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: [ + "--dataset_generator", + "uci", + "--dataset_name", + "ionosphere", + ] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 @@ -100,8 +179,18 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "isolation", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "labirynth", + ] # 6 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 @@ -109,9 +198,24 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: [ + 
"--dataset_generator", + "wut", + "--dataset_name", + "trajectories", + ] # 4 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "trapped_lovers", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "twosplashes", + ] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 @@ -126,7 +230,7 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" ## clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster @@ -148,7 +252,6 @@ stages: software_environment: "clustbench" repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -229,89 +332,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: 
["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" From fc53991d1eb32c7749c3f1a2bccc0ed9e33601af Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 8 May 2025 15:14:35 +0200 Subject: [PATCH 13/45] update clustbench --- Clustering_envmodules.yml | 38 ++++++++++++++++++++------------------ envs/clustbench.eb | 5 +++++ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 1ab4808..f37fd6c 100644 --- 
a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -9,23 +9,24 @@ software_backend: envmodules software_environments: clustbench: description: "clustbench on py3.12.6" - envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho - conda: na + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml apptainer: na rmarkdown: description: "R with some plotting dependencies" envmodule: rmarkdown # TODO - conda: na + conda: envs/clustbench.yml apptainer: na - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + metric_collectors: - id: plotting name: "Single-backend metric collector." - software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -34,14 +35,15 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data + ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -235,7 +237,7 @@ stages: modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster # url: /home/imallona/src/clustbench_fastcluster/ @@ -249,7 +251,7 @@ stages: - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn commit: 5877378 @@ -260,7 +262,7 @@ stages: - 
values: ["--method", "gm"] - id: agglomerative name: "agglomerative" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_agglomerative commit: 5454368 @@ -270,7 +272,7 @@ stages: - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -280,7 +282,7 @@ stages: - values: ["--method", "ica"] - id: fcps name: "fcps" - software_environment: "fcps" + software_environment: fcps repository: url: https://github.com/imallona/clustbench_fcps commit: 272fa5f @@ -309,10 +311,10 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: 8184cd4 parameters: - values: ["--metric", "normalized_clustering_accuracy"] - values: ["--metric", "adjusted_fm_score"] diff --git a/envs/clustbench.eb b/envs/clustbench.eb index f3ee681..0e86911 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -13,6 +13,7 @@ dependencies = [ ('SciPy-bundle', '2023.11'), ('matplotlib', '3.8.2'), ('scikit-learn', '1.4.0'), +# FIXME: I think this is not needed -- ben # ('meson-python', '0.15.0'), # ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] @@ -40,6 +41,10 @@ exts_list = [ 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], }), + ('clustering_benchmarks', '1.1.6', { + 'modulename': 'clustbench', + 'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'], + }), ] moduleclass = 'bio' From 
54b72790b1e2d2d9aa66d30c3d956b5d8be387a3 Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 10 May 2025 18:46:41 +0200 Subject: [PATCH 14/45] add rmarkdown-python bundles, without checksums --- envs/rmarkdown-python.eb | 28 ++++++++++++ envs/rmarkdown.eb | 94 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 envs/rmarkdown-python.eb create mode 100644 envs/rmarkdown.eb diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb new file mode 100644 index 0000000..a9edb00 --- /dev/null +++ b/envs/rmarkdown-python.eb @@ -0,0 +1,28 @@ +easyblock = 'Bundle' + +# This is a dummy bundle that installs: +# 1. rmarkdown: an R bundle that we also package +# 2. Python-3.12.3-GCCcore-13.3.0 +# This is a dependency for the clustering_benchmark metric collector. + +name = 'rmarkdown-python' +version = '0.1.0' + +local_rver = '4.4.2' +local_pyver = '3.12.3' +versionsuffix = f'-r-{local_rver}-py-{local_pyver}' + +homepage = 'https://omnibenchmark.org' +description = 'Rmarkdown bundle with specific Python dependency' + +toolchain = {'name': 'system', 'version': '1.0'} + +dependencies = [ + ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'), + ('Python', local_pyver, '-GCCcore-13.3.0'), +] + +sanity_check_paths = { + 'files': [], + 'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2'] +} diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb new file mode 100644 index 0000000..a88a2a9 --- /dev/null +++ b/envs/rmarkdown.eb @@ -0,0 +1,94 @@ +easyblock = 'Bundle' + +# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/ +# and build only what's left out. 
+ +name = 'rmarkdown' +version = '0.1.0' +versionsuffix = '-r-%(rver)s' + +homepage = 'https://omnibenchmark.org' +description = 'rmarkdown bundle for clustbench reports' + +toolchain = {'name': 'gfbf', 'version': '2024a'} + +dependencies = [ + ('R', '4.4.2'), +] + +exts_default_options = { + 'source_urls': [ + 'https://cloud.r-project.org/src/contrib/', + 'https://cran.r-project.org/src/contrib/', # current version of packages + 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive + 'https://www.bioconductor.org/packages/release/bioc/src/contrib/', # bioconductor + ], + 'sources': ['%(name)s_%(version)s.tar.gz'], +} + +exts_defaultclass = 'RPackage' + + +exts_list = [ + ('rlang', '1.1.6'), + ('glue', '1.8.0'), + ('cli', '3.6.4'), + ('lifecycle', '1.0.4'), + ('vctrs', '0.6.5'), + ('utf8', '1.2.4'), + ('lattice', '0.22-5'), + ('pkgconfig', '2.0.3'), + ('pillar', '1.10.2'), + ('magrittr', '2.0.3'), + ('fansi', '1.0.6'), + ('viridisLite', '0.4.2'), + ('RColorBrewer', '1.1-3'), + ('R6', '2.6.1'), + ('labeling', '0.4.3'), + ('farver', '2.1.2'), + ('Matrix', '1.7-3'), + ('nlme', '3.1-168'), + ('withr', '3.0.2'), + ('tibble', '3.2.1'), + ('colorspace', '2.1-1'), + ('munsell', '0.5.1'), + ('scales', '1.3.0'), + ('mgcv', '1.9-1'), + ('MASS', '7.3-65'), + ('isoband', '0.2.7'), + ('gtable', '0.3.6'), + ('ggplot2', '3.5.2'), + ('findpython', '1.0.9', {}), + ('argparse', '2.2.5', {}), + ('rmarkdown', '2.29', {}), + ('generics', '0.1.3', {}), + ('tidyselect', '1.2.1', {}), + ('dplyr', '1.1.4', {}), + ('tidyr', '1.3.1', {}), + ('shape', '1.4.6.1', {}), + ('GlobalOptions', '0.1.2', {}), + ('circlize', '0.4.16', {}), + ('rjson', '0.2.23', {}), + ('GetoptLong', '1.0.5', {}), + ('cluster', '2.1.8.1', {}), + ('clue', '0.3-66', {}), + ('png', '0.1-8', {}), + ('BiocGenerics', '0.54.0', {}), + ('S4Vectors', '0.46.0', {}), + ('IRanges', '2.42.0'), + ('matrixStats', '1.5.0', {}), + ('iterators', '1.0.14', {}), + ('codetools', '0.2-20', {}), + ('foreach', '1.5.2', 
{}), + ('doParallel', '1.0.17', {}), + ('ComplexHeatmap', '2.24.0', {}), +] + +modextrapaths = {'R_LIBS_SITE': ''} + +sanity_check_paths = { + 'files': [], + 'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'], +} + +moduleclass = 'bio' From 1b57e44585c688d6f5e8f5be4b38b039e73cab57 Mon Sep 17 00:00:00 2001 From: ben Date: Sat, 10 May 2025 18:49:27 +0200 Subject: [PATCH 15/45] inject checksums to rmarkdown easyconfig --- envs/rmarkdown.eb | 209 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 156 insertions(+), 53 deletions(-) diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb index a88a2a9..067eadd 100644 --- a/envs/rmarkdown.eb +++ b/envs/rmarkdown.eb @@ -28,60 +28,163 @@ exts_default_options = { exts_defaultclass = 'RPackage' - exts_list = [ - ('rlang', '1.1.6'), - ('glue', '1.8.0'), - ('cli', '3.6.4'), - ('lifecycle', '1.0.4'), - ('vctrs', '0.6.5'), - ('utf8', '1.2.4'), - ('lattice', '0.22-5'), - ('pkgconfig', '2.0.3'), - ('pillar', '1.10.2'), - ('magrittr', '2.0.3'), - ('fansi', '1.0.6'), - ('viridisLite', '0.4.2'), - ('RColorBrewer', '1.1-3'), - ('R6', '2.6.1'), - ('labeling', '0.4.3'), - ('farver', '2.1.2'), - ('Matrix', '1.7-3'), - ('nlme', '3.1-168'), - ('withr', '3.0.2'), - ('tibble', '3.2.1'), - ('colorspace', '2.1-1'), - ('munsell', '0.5.1'), - ('scales', '1.3.0'), - ('mgcv', '1.9-1'), - ('MASS', '7.3-65'), - ('isoband', '0.2.7'), - ('gtable', '0.3.6'), - ('ggplot2', '3.5.2'), - ('findpython', '1.0.9', {}), - ('argparse', '2.2.5', {}), - ('rmarkdown', '2.29', {}), - ('generics', '0.1.3', {}), - ('tidyselect', '1.2.1', {}), - ('dplyr', '1.1.4', {}), - ('tidyr', '1.3.1', {}), - ('shape', '1.4.6.1', {}), - ('GlobalOptions', '0.1.2', {}), - ('circlize', '0.4.16', {}), - ('rjson', '0.2.23', {}), - ('GetoptLong', '1.0.5', {}), - ('cluster', '2.1.8.1', {}), - ('clue', '0.3-66', {}), - ('png', '0.1-8', {}), - ('BiocGenerics', '0.54.0', {}), - ('S4Vectors', '0.46.0', {}), - ('IRanges', '2.42.0'), - ('matrixStats', '1.5.0', {}), - 
('iterators', '1.0.14', {}), - ('codetools', '0.2-20', {}), - ('foreach', '1.5.2', {}), - ('doParallel', '1.0.17', {}), - ('ComplexHeatmap', '2.24.0', {}), + ('rlang', '1.1.6', { + 'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'], + }), + ('glue', '1.8.0', { + 'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'], + }), + ('cli', '3.6.4', { + 'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'], + }), + ('lifecycle', '1.0.4', { + 'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'], + }), + ('vctrs', '0.6.5', { + 'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'], + }), + ('utf8', '1.2.4', { + 'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'], + }), + ('lattice', '0.22-5', { + 'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'], + }), + ('pkgconfig', '2.0.3', { + 'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'], + }), + ('pillar', '1.10.2', { + 'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'], + }), + ('magrittr', '2.0.3', { + 'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'], + }), + ('fansi', '1.0.6', { + 'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'], + }), + ('viridisLite', '0.4.2', { + 'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'], + }), + ('RColorBrewer', '1.1-3', { + 'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'], + }), + ('R6', '2.6.1', { + 'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'], + }), + ('labeling', '0.4.3', { + 'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'], + }), + ('farver', '2.1.2', { + 'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'], 
+ }), + ('Matrix', '1.7-3', { + 'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'], + }), + ('nlme', '3.1-168', { + 'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'], + }), + ('withr', '3.0.2', { + 'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'], + }), + ('tibble', '3.2.1', { + 'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'], + }), + ('colorspace', '2.1-1', { + 'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'], + }), + ('munsell', '0.5.1', { + 'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'], + }), + ('scales', '1.3.0', { + 'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'], + }), + ('mgcv', '1.9-1', { + 'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'], + }), + ('MASS', '7.3-65', { + 'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'], + }), + ('isoband', '0.2.7', { + 'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'], + }), + ('gtable', '0.3.6', { + 'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'], + }), + ('ggplot2', '3.5.2', { + 'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'], + }), + ('findpython', '1.0.9', { + 'checksums': ['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'], + }), + ('argparse', '2.2.5', { + 'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'], + }), + (name, '2.29', { + 'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'], + }), + ('generics', '0.1.3', { + 'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'], + }), + ('tidyselect', '1.2.1', { + 'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'], + }), + ('dplyr', '1.1.4', { + 
'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'], + }), + ('tidyr', '1.3.1', { + 'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'], + }), + ('shape', '1.4.6.1', { + 'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'], + }), + ('GlobalOptions', '0.1.2', { + 'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'], + }), + ('circlize', '0.4.16', { + 'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'], + }), + ('rjson', '0.2.23', { + 'checksums': ['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'], + }), + ('GetoptLong', '1.0.5', { + 'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'], + }), + ('cluster', '2.1.8.1', { + 'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'], + }), + ('clue', '0.3-66', { + 'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'], + }), + ('png', '0.1-8', { + 'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'], + }), + ('BiocGenerics', '0.54.0', { + 'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'], + }), + ('S4Vectors', '0.46.0', { + 'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'], + }), + ('IRanges', '2.42.0', { + 'checksums': ['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'], + }), + ('matrixStats', '1.5.0', { + 'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'], + }), + ('iterators', '1.0.14', { + 'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'], + }), + ('codetools', '0.2-20', { + 'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'], + }), + ('foreach', '1.5.2', { + 'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'], + }), + ('doParallel', '1.0.17', { + 
'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'], + }), + ('ComplexHeatmap', '2.24.0', { + 'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'], + }), ] modextrapaths = {'R_LIBS_SITE': ''} From dfd5b936195655c136bf513640c5a5196a7785ea Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:18:59 +0200 Subject: [PATCH 16/45] update sklearn singularity definition --- envs/sklearn_singularity.def | 57 ++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def index 939a3bb..56bcf37 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity.def @@ -1,33 +1,54 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 +Bootstrap: docker +From: ubuntu:noble-20250404 %labels - - AUTHOR izaskun.mallona@gmail.com + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me %post + PYTHON_VERSION=3.12.6 + PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. 
-f1,2) - # Install python3.12 + # Update and enable deb-src + apt-get update + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - # virtualenv + + # Get build dependencies for Python + apt-get build-dep -y python3 + + # Extra dependencies + apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev + + # Calculate half the number of available cores + HALF_NPROC=$(( $(nproc) / 2 )) + # Ensure at least one core is used + CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 )) + + # Download and build Python with optimizations + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz + tar -xf Python-${PYTHON_VERSION}.tgz + cd Python-${PYTHON_VERSION}*/ + # Enable all possible optimizations + ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" + make -j ${CORES_TO_USE} + make altinstall + + # Create virtualenv using the locally built Python cd /opt - python3.12 -m venv "default" + /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" . default/bin/activate - - pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43" + + # Install required packages + pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ + "isodate" "pydantic-core" \ + "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT %environment . /opt/default/bin/activate + From 0056b7fce71ab1e5efc9456502c2114ea4d597d7 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:25:21 +0200 Subject: [PATCH 17/45] factorize sklearn singularity pip block --- envs/sklearn-pip.apptainer.include | 11 +++++++++++ envs/sklearn_singularity.def | 19 +++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) create mode 100644 envs/sklearn-pip.apptainer.include diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include new file mode 100644 index 0000000..b8f48eb --- /dev/null +++ b/envs/sklearn-pip.apptainer.include @@ -0,0 +1,11 @@ + pip3 install \ + "clustering-benchmarks==1.1.6" \ + "fastcluster==1.2.6" \ + "numpy==1.26.4" \ + "scipy==1.14.1" \ + "isodate" \ + "pydantic-core" \ + "genieclust==1.1.6" \ + "pandas==2.2.3" \ + "gitpython==3.1.43" \ + wget" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def index 56bcf37..cb9a2f6 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity.def @@ -8,25 +8,25 @@ From: ubuntu:noble-20250404 %post PYTHON_VERSION=3.12.6 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) - + # Update and enable deb-src apt-get update echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - + # Get build dependencies for Python apt-get build-dep -y python3 # Extra dependencies apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev - + # Calculate half the number of available cores HALF_NPROC=$(( $(nproc) / 2 )) # Ensure at least one core is used CORES_TO_USE=$(( HALF_NPROC > 0 ? 
HALF_NPROC : 1 )) - + # Download and build Python with optimizations wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz @@ -35,16 +35,15 @@ From: ubuntu:noble-20250404 ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" make -j ${CORES_TO_USE} make altinstall - + # Create virtualenv using the locally built Python cd /opt /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" . default/bin/activate - - # Install required packages - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" + + # Install required packages with pip + + % include sklearn-pip.apptainer.include echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From cef3a6b6f0c0c0cb564941dee77eb52e9fd207db Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:27:48 +0200 Subject: [PATCH 18/45] extract variable in build script --- envs/build_singularity.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c0c3d93..c5cbf6f 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,5 +1,7 @@ #!/bin/sh -singularity build sklearn.sif sklearn_singularity.def -singularity build clustbench.sif clustbench_singularity.def -singularity build r.sif r_singularity.def -singularity build fcps.sif fcps_singularity.def +CMD=singularity +BUILD=build --fakeroot +$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def +$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def +$CMD $BUILD r.sif r_singularity.def +$CMD $BUILD fcps.sif fcps_singularity.def From 2ee17ca636501521efb2c650a0b76750052692ee Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:37:19 +0200 Subject: [PATCH 19/45] revert include, should use m4 --- 
envs/build_singularity.sh | 2 +- envs/sklearn-pip.apptainer.include | 11 ----------- ...ularity.def => sklearn_singularity_optimized.def} | 12 +++++++++++- 3 files changed, 12 insertions(+), 13 deletions(-) delete mode 100644 envs/sklearn-pip.apptainer.include rename envs/{sklearn_singularity.def => sklearn_singularity_optimized.def} (85%) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c5cbf6f..61fbd13 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,7 +1,7 @@ #!/bin/sh CMD=singularity BUILD=build --fakeroot -$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def +$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def $CMD $BUILD clustbench-optimized.sif clustbench_singularity.def $CMD $BUILD r.sif r_singularity.def $CMD $BUILD fcps.sif fcps_singularity.def diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include deleted file mode 100644 index b8f48eb..0000000 --- a/envs/sklearn-pip.apptainer.include +++ /dev/null @@ -1,11 +0,0 @@ - pip3 install \ - "clustering-benchmarks==1.1.6" \ - "fastcluster==1.2.6" \ - "numpy==1.26.4" \ - "scipy==1.14.1" \ - "isodate" \ - "pydantic-core" \ - "genieclust==1.1.6" \ - "pandas==2.2.3" \ - "gitpython==3.1.43" \ - wget" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity_optimized.def similarity index 85% rename from envs/sklearn_singularity.def rename to envs/sklearn_singularity_optimized.def index cb9a2f6..6d6e165 100644 --- a/envs/sklearn_singularity.def +++ b/envs/sklearn_singularity_optimized.def @@ -43,7 +43,17 @@ From: ubuntu:noble-20250404 # Install required packages with pip - % include sklearn-pip.apptainer.include + pip3 install \ + "clustering-benchmarks==1.1.6" \ + "fastcluster==1.2.6" \ + "numpy==1.26.4" \ + "scipy==1.14.1" \ + "isodate" \ + "pydantic-core" \ + "genieclust==1.1.6" \ + "pandas==2.2.3" \ + "gitpython==3.1.43" \ + wget" echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From c4cbe5c2f22ed52a4873d5a14781682a19e87a4f Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 12:44:30 +0200 Subject: [PATCH 20/45] update python version --- envs/sklearn_singularity_optimized.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/sklearn_singularity_optimized.def b/envs/sklearn_singularity_optimized.def index 6d6e165..17a131d 100644 --- a/envs/sklearn_singularity_optimized.def +++ b/envs/sklearn_singularity_optimized.def @@ -6,7 +6,7 @@ From: ubuntu:noble-20250404 Author ben.uzh@proton.me %post - PYTHON_VERSION=3.12.6 + PYTHON_VERSION=3.12.9 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) # Update and enable deb-src From 21bdd666d47d029d5463f814ab685389ec850f71 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:25:31 +0200 Subject: [PATCH 21/45] do a little bit of cleanup with the multiple envs --- ....yml => Clustering_apptainer_optimized.yml | 129 +++--------------- Clustering_conda.yml | 125 +++-------------- Clustering_envmodules.yml | 25 ++-- envs/build_singularity.sh | 7 +- ...def => clustbench_apptainer_optimized.def} | 37 +++-- envs/clustbench_apptainer_vanillapy.def | 55 ++++++++ envs/clustbench_singularity.def | 35 ----- ...ity.def => fcps_singularity_optimized.def} | 11 +- envs/sklearn.yml | 11 -- 9 files changed, 145 insertions(+), 290 deletions(-) rename Clustering_singularity.yml => Clustering_apptainer_optimized.yml (74%) rename envs/{sklearn_singularity_optimized.def => clustbench_apptainer_optimized.def} (71%) create mode 100644 envs/clustbench_apptainer_vanillapy.def delete mode 100644 envs/clustbench_singularity.def rename envs/{fcps_singularity.def => fcps_singularity_optimized.def} (79%) delete mode 100644 envs/sklearn.yml diff --git a/Clustering_singularity.yml b/Clustering_apptainer_optimized.yml similarity index 74% rename from Clustering_singularity.yml rename to Clustering_apptainer_optimized.yml index 
c80b498..96e357e 100644 --- a/Clustering_singularity.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,38 +1,32 @@ id: clustering_example_apptainer + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleapptainer +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: apptainer + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + conda: envs/clustbench.yml # not used + envmodule: na + apptainer: envs/clustbench-optimized.sif + rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml + envmodule: na apptainer: envs/fcps.sif - envmodule: fcps + metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -45,10 +39,11 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data + ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" @@ -229,89 +224,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # 
repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7ac1629..61352e1 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,38 +1,32 @@ id: clustering_example_conda + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleconda +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: conda + software_environments: + clustbench: description: "clustbench on py3.12.6" conda: envs/clustbench.yml envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + apptainer: na + rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: fcps # not used + apptainer: na + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif envmodule: fcps + apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." 
@@ -45,6 +39,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: ## clustbench data ########################################################## @@ -52,7 +47,7 @@ stages: modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -145,7 +140,7 @@ stages: - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn #url: /home/imallona/src/clustbench_sklearn @@ -229,89 +224,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: 
https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index f37fd6c..52fb13e 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,28 +1,33 @@ id: clustering_example_envmodules + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -benchmark_yaml_spec: 0.5 +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 software_backend: envmodules software_environments: + clustbench: description: "clustbench on py3.12.6" + conda: envs/clustbench.yml # not used envmodule: clustbench/0.1.0-foss-2023b - conda: envs/clustbench.yml apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmakrkdown.yml # not used + envmodule: rmarkdown + apptainer: na + fcps: description: "CRAN's FCPS" + conda: envs/fcps.yml # not used envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 - conda: envs/fcps.yml - apptainer: na - rmarkdown: - description: "R with some plotting dependencies" - envmodule: rmarkdown # TODO - conda: envs/clustbench.yml apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 61fbd13..784e443 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,7 +1,6 @@ #!/bin/sh CMD=singularity BUILD=build --fakeroot -$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def -$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def -$CMD $BUILD r.sif r_singularity.def -$CMD $BUILD fcps.sif fcps_singularity.def +$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD $BUILD fcps.sif fcps_singularity_optimized.def diff --git a/envs/sklearn_singularity_optimized.def b/envs/clustbench_apptainer_optimized.def similarity index 71% rename from envs/sklearn_singularity_optimized.def rename to envs/clustbench_apptainer_optimized.def index 17a131d..d4a316d 100644 --- a/envs/sklearn_singularity_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -15,7 +15,6 @@ From: ubuntu:noble-20250404 echo "deb-src http://archive.ubuntu.com/ubuntu/ 
noble-updates main restricted universe multiverse" >> /etc/apt/sources.list apt-get update - # Get build dependencies for Python apt-get build-dep -y python3 @@ -43,17 +42,39 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip3 install \ + pip install -U pip + + pip install \ "clustering-benchmarks==1.1.6" \ - "fastcluster==1.2.6" \ - "numpy==1.26.4" \ - "scipy==1.14.1" \ - "isodate" \ - "pydantic-core" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.2.6" \ "gitpython==3.1.43" \ - wget" + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def new file mode 100644 index 0000000..1f2b4e3 --- /dev/null +++ b/envs/clustbench_apptainer_vanillapy.def @@ -0,0 +1,55 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + # Create virtualenv using the default Python + mkdir -p /opt && cd /opt + python3.12 -m venv "default" + . 
default/bin/activate + + # Install required packages with pip + + pip install -U pip + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.2.6" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate + diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def deleted file mode 100644 index 8c2ae85..0000000 --- a/envs/clustbench_singularity.def +++ /dev/null @@ -1,35 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" - - echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity_optimized.def similarity index 79% rename from envs/fcps_singularity.def rename to envs/fcps_singularity_optimized.def index a4a615e..6362b9e 100644 --- a/envs/fcps_singularity.def +++ b/envs/fcps_singularity_optimized.def @@ -4,6 +4,7 @@ From: rocker/tidyverse:4.3.3 %labels AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me %post @@ -13,11 +14,11 @@ From: rocker/tidyverse:4.3.3 libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ libgsl-dev - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz + wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz + tar -xf Python-3.12.9.tgz cd Python-3.12.*/ ./configure --enable-optimizations - make -j 4 + make -j 8 make altinstall # virtualenv @@ -25,13 +26,15 @@ From: rocker/tidyverse:4.3.3 python3.12 -m venv "default" . default/bin/activate + # TODO: pin dependencies pip install gitpython==3.1.43 isodate pydantic-core ## no versioning here + ## TODO(ben): get same versions as in easyconfig Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT %environment - + . 
/opt/default/bin/activate diff --git a/envs/sklearn.yml b/envs/sklearn.yml deleted file mode 100644 index 258b7ea..0000000 --- a/envs/sklearn.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: sklearn -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::scikit-learn - - conda-forge::pip - - pip: - - "pandas" - - "argparse" From e8e0f7eb2313696f7494e65c20d44def8301de63 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:29:28 +0200 Subject: [PATCH 22/45] escape --- envs/build_singularity.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 784e443..2dae40a 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,6 +1,6 @@ #!/bin/sh CMD=singularity -BUILD=build --fakeroot -$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def -$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def -$CMD $BUILD fcps.sif fcps_singularity_optimized.def +BUILD='build --fakeroot' +$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def From a8336fba907ae43ce16678de97a22538addb06e6 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:42:08 +0200 Subject: [PATCH 23/45] install updated python --- envs/clustbench_apptainer_vanillapy.def | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 1f2b4e3..5d388bf 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -6,9 +6,19 @@ From: ubuntu:noble-20250404 Author ben.uzh@proton.me %post + export DEBIAN_FRONTEND=noninteractive + apt-get update && \ + apt-get install -y \ + python3 \ + python3-venv \ + python3-pip \ + ca-certificates \ + && apt-get clean 
&& \ + rm -rf /var/lib/apt/lists/* + # Create virtualenv using the default Python mkdir -p /opt && cd /opt - python3.12 -m venv "default" + /usr/bin/python3 -m venv "default" . default/bin/activate # Install required packages with pip @@ -46,6 +56,8 @@ From: ubuntu:noble-20250404 # TODO: can we use something more maintained? pip install --pre "python3-wget==0.0.2-beta1" + # Do some cleanup to keep the image slim + rm -rf ~/.cache echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From 518c2f6c894b097855a32f7e810783b13e9ec386 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 13:46:47 +0200 Subject: [PATCH 24/45] sync the two build recipes --- envs/clustbench_apptainer_optimized.def | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index d4a316d..1e934a8 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -10,6 +10,7 @@ From: ubuntu:noble-20250404 PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) # Update and enable deb-src + export DEBIAN_FRONTEND=noninteractive apt-get update echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list @@ -19,19 +20,33 @@ From: ubuntu:noble-20250404 apt-get build-dep -y python3 # Extra dependencies - apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev + apt-get install -y git \ + python-is-python3 \ + wget \ + zlib1g-dev \ + libbz2-dev \ + libssl-dev \ + libffi-dev \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* # Calculate half the number of available cores HALF_NPROC=$(( $(nproc) / 2 )) # Ensure at least one core is used CORES_TO_USE=$(( HALF_NPROC > 0 ? 
HALF_NPROC : 1 )) - # Download and build Python with optimizations + # Download and build Python from source, with optimizations + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ + # Enable all possible optimizations - ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib" + ./configure \ + --enable-optimizations \ + --with-lto \ + --enable-shared \ + LDFLAGS="-Wl,-rpath /usr/local/lib" make -j ${CORES_TO_USE} make altinstall @@ -75,6 +90,8 @@ From: ubuntu:noble-20250404 # TODO: can we use something more maintained? pip install --pre "python3-wget==0.0.2-beta1" + # Do some cleanup to keep the image slim + rm -rf ~/.cache echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From 2f4131f08f967bf0b934a25907e21bb9d54c001c Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:09:01 +0200 Subject: [PATCH 25/45] delete source folder --- envs/clustbench_apptainer_optimized.def | 2 ++ 1 file changed, 2 insertions(+) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index 1e934a8..eda9ea6 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -37,6 +37,7 @@ From: ubuntu:noble-20250404 # Download and build Python from source, with optimizations + mkdir ~/src && cd src wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ @@ -92,6 +93,7 @@ From: ubuntu:noble-20250404 # Do some cleanup to keep the image slim rm -rf ~/.cache + rm -rf ~/src echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From c72eb273f395e9c5805ee0806cc247a77b783443 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:15:59 +0200 Subject: [PATCH 26/45] add microbenchmark for numpy operations --- microbenchmark/microbench.py | 67 ++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 microbenchmark/microbench.py diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py new file mode 100644 index 0000000..3730f9e --- /dev/null +++ b/microbenchmark/microbench.py @@ -0,0 +1,67 @@ +""" +This script exercises a few common linear algebra operations in numpy. +It's intended mostly to gauge whether it makes sense to descend into +compiler optimizations for the Python binary that we ship within the SIF images, +but it can be easily repurposed for other specific microbenchmarks (i.e., numba or GPU perf gains). + +Be aware that here we're profiling simple operations; it would make sense to carefully +profile the libraries of interest to see where the computational bottlenecks really are. 
+ +Usage: + +singularity exec clustbench-vanilla.sif python3 microbench.py +singularity exec clustbench-optimized.sif python3 microbench.py +""" +import numpy as np +import time +import json +from statistics import mean, stdev + +def run_operation(operation, func, repetitions): + timings = [] + for _ in range(repetitions): + start = time.perf_counter() + func() + elapsed = time.perf_counter() - start + timings.append(elapsed) + return { + 'operation': operation, + 'mean': mean(timings), + 'stdev': stdev(timings), + 'runs': repetitions + } + +def benchmark(repetitions=50): + np.random.seed(42) + size = 1000 + + # Create random matrices + A = np.random.rand(size, size) + B = np.random.rand(size, size) + C = A @ A.T # Ensure positive definite for Cholesky + + # Define operations + operations = [ + ('mat_mul', lambda: np.dot(A, B)), + ('svd', lambda: np.linalg.svd(A)), + ('chol_decomp', lambda: np.linalg.cholesky(C)) + ] + + results = [] + for operation, func in operations: + try: + result = run_operation(operation, func, repetitions) + except np.linalg.LinAlgError: + result = { + 'operation': operation, + 'error': 'Operation failed due to numerical instability' + } + results.append(result) + + # Output results as JSON + print(json.dumps(results, indent=2)) + +if __name__ == "__main__": + import sys + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10 + benchmark(repetitions) From 937e45599633e3a58af28beece19f038a4fd9513 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:16:41 +0200 Subject: [PATCH 27/45] fix path --- envs/clustbench_apptainer_optimized.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index eda9ea6..19726c2 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -37,7 +37,7 @@ From: ubuntu:noble-20250404 # Download and build Python from source, with optimizations - mkdir ~/src && cd src 
+ mkdir ~/src && cd ~/src wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz tar -xf Python-${PYTHON_VERSION}.tgz cd Python-${PYTHON_VERSION}*/ From b0bd85adfed66583b676a3f378f0577701efa5a5 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:24:30 +0200 Subject: [PATCH 28/45] default reps --- microbenchmark/microbench.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py index 3730f9e..314e66b 100644 --- a/microbenchmark/microbench.py +++ b/microbenchmark/microbench.py @@ -17,6 +17,8 @@ import json from statistics import mean, stdev +DEFAULT_REPETITIONS = 10 + def run_operation(operation, func, repetitions): timings = [] for _ in range(repetitions): @@ -31,7 +33,7 @@ def run_operation(operation, func, repetitions): 'runs': repetitions } -def benchmark(repetitions=50): +def benchmark(repetitions=DEFAULT_REPETITIONS): np.random.seed(42) size = 1000 @@ -63,5 +65,5 @@ def benchmark(repetitions=50): if __name__ == "__main__": import sys - repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10 + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS benchmark(repetitions) From 83f9b07dfcbf25dbfce67d17d92eedbe469e2bd9 Mon Sep 17 00:00:00 2001 From: ben Date: Sun, 11 May 2025 14:40:30 +0200 Subject: [PATCH 29/45] refs --- microbenchmark/microbench.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py index 314e66b..6abc6ee 100644 --- a/microbenchmark/microbench.py +++ b/microbenchmark/microbench.py @@ -11,6 +11,8 @@ singularity exec clustbench-vanilla.sif python3 microbench.py singularity exec clustbench-optimized.sif python3 microbench.py + +References: https://pythonspeed.com/articles/faster-python/ """ import numpy as np import time From 744c978643ad5623de1c4b176ab887e7a6127739 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:11:30 +0200 Subject: 
[PATCH 30/45] duplicate the apptainer clustering yaml --- Clustering_apptainer_optimized.yml | 39 +++-- Clustering_apptainer_vanilla.yml | 223 +++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 21 deletions(-) create mode 100644 Clustering_apptainer_vanilla.yml diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml index 96e357e..a073683 100644 --- a/Clustering_apptainer_optimized.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,6 +1,6 @@ -id: clustering_example_apptainer - +id: clustering_example_apptainer_optimized description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. + version: 1.5 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" benchmark_yaml_spec: 0.4 @@ -10,27 +10,28 @@ software_backend: apptainer software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml # not used envmodule: na apptainer: envs/clustbench-optimized.sif + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: envs/fcps.sif + rmarkdown: description: "R with some plotting dependencies" conda: envs/rmarkdown.yml # not used envmodule: na apptainer: envs/rmarkdown.sif - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - envmodule: na - apptainer: envs/fcps.sif metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -43,11 +44,10 @@ metric_collectors: stages: - id: data - ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -120,16 +120,13 @@ stages: - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -138,12 +135,12 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] + - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -161,8 +158,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -201,7 +198,7 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: 
https://github.com/imallona/clustbench_metrics commit: 9132d45 diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml new file mode 100644 index 0000000..46b8ea4 --- /dev/null +++ b/Clustering_apptainer_vanilla.yml @@ -0,0 +1,223 @@ +id: clustering_example_apptainer_vanilla + +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + conda: envs/clustbench.yml # not used + envmodule: na + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: 
["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + 
- values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: 
["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: ["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + 
name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" From ec18dcf21ce6d23a1c20f88a5431d9a2c040abae Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:15:36 +0200 Subject: [PATCH 31/45] update the oras yaml. not working, just to keep in sync --- Clustering_oras.yml | 128 ++++++++------------------------------------ 1 file changed, 22 insertions(+), 106 deletions(-) diff --git a/Clustering_oras.yml b/Clustering_oras.yml index 6640461..c6f0d7e 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,36 +1,37 @@ -id: clustering_example +id: clustering_example_oras description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. 
-version: 1.2 +version: 1.5 + benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +benchmark_yaml_spec: 0.4 + +#storage: https://play.min.io +#storage_api: S3 +#storage_bucket_name: clustering_example + software_backend: apptainer + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml + conda: envs/clustbench.yml # not used envmodule: clustbench apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest - envmodule: fcps # not true, but + fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml + conda: envs/fcps.yml # not used + envmodule: na apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest - envmodule: fcps -stages: - ## clustbench data ########################################################## + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest + +stages: - id: data modules: @@ -214,88 +215,3 @@ stages: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins 
- # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - 
methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" From cf52a2c7b3595e25488b1f0a007e3d30045fb74b Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:21:02 +0200 Subject: [PATCH 32/45] update the rmarkdown environment --- Clustering_envmodules.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 52fb13e..a2112d4 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -18,7 +18,7 @@ software_environments: rmarkdown: description: "R with some plotting dependencies" conda: envs/rmakrkdown.yml # not used - envmodule: rmarkdown + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 apptainer: na fcps: @@ -44,7 +44,6 @@ metric_collectors: stages: - id: data - ## clustbench data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" From 934ce8baa625f2877a79958f6091fbd4eae4b96f Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 15:34:49 +0200 Subject: [PATCH 33/45] update makefile --- Clustering_conda.yml | 12 +-- Clustering_conda_smoketest.yml | 129 +++++++++++++++++++++++++++ Clustering_envmodules_smoketest.yml | 131 ++++++++++++++++++++++++++++ Makefile | 23 ++++- envs/rmarkdown.yml | 4 +- 5 files changed, 289 insertions(+), 10 deletions(-) create mode 100644 Clustering_conda_smoketest.yml create mode 100644 Clustering_envmodules_smoketest.yml diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 61352e1..7822761 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -15,18 +15,18 @@ software_environments: envmodule: clustbench apptainer: na - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - envmodule: fcps # not used - apptainer: na - fcps: description: "CRAN's FCPS" conda: envs/fcps.yml envmodule: fcps apptainer: na + rmarkdown: + 
description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: fcps # not used + apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml new file mode 100644 index 0000000..15215d7 --- /dev/null +++ b/Clustering_conda_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: conda + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/rmarkdown.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml new file mode 100644 index 0000000..3fa8e81 --- /dev/null +++ b/Clustering_envmodules_smoketest.yml @@ -0,0 +1,131 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: envmodules + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/clustbench.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + ## clustbench methods (fastcluster) ################################################################### + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", 
"--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Makefile b/Makefile index 875a375..e8e942d 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,26 @@ prepare_apptainer_env: prepare_envmodules_env: cd envs && eb clustbench.eb --robot cd envs && eb fcps.eb --robot -run_with_apptainer_backend: - ${OB_CMD} -b Clustering_singularity.yml - mv out out_apptainer + cd envs && eb rmarkdown.eb --robot + +# short versions, to debug runs & environments +run_with_apptainer_backend_short: + ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml + mv out out_apptainer_short +run_with_conda_backend_short: + ${OB_CMD} -b Clustering_conda_smoketest.yml + mv out out_conda +run_with_envmodules_backend_short: + ${OB_CMD} -b Clustering_envmodules_smoketest.yml + mv out out_lmod_short + +# full versions (expect hours) +run_with_apptainer_backend_vanilla: + ${OB_CMD} -b Clustering_apptainer_vanilla.yml + mv out out_apptainer_vanilla +run_with_apptainer_backend_optimized: + ${OB_CMD} -b Clustering_apptainer_optimized.yml + mv out out_apptainer_vanilla run_with_conda_backend: ${OB_CMD} -b Clustering_conda.yml mv out out_conda diff 
--git a/envs/rmarkdown.yml b/envs/rmarkdown.yml index e57969e..ed5c65e 100644 --- a/envs/rmarkdown.yml +++ b/envs/rmarkdown.yml @@ -7,6 +7,8 @@ dependencies: - conda-forge::python=3.12.6 - conda-forge::r-argparse - conda-forge::r-rmarkdown + - conda-forge::r-cairo + - conda-forge::r-svglite - conda-forge::r-ggplot2 - - conda-forge::r-tidyr + - conda-forge::r-tidyr - bioconda::bioconductor-complexheatmap From 3890cb48664570aa7a9878dacb299146d69ced5d Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:15:38 +0200 Subject: [PATCH 34/45] add apptainer definition for rmarkdown --- envs/build_singularity.sh | 4 +++- envs/rmarkdown.def | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 envs/rmarkdown.def diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index 2dae40a..c34208b 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -2,5 +2,7 @@ CMD=singularity BUILD='build --fakeroot' $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def -$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +# enable this if you want to compare with the custom python compilation +# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def $CMD ${BUILD} fcps.sif fcps_singularity_optimized.def +$CMD ${BUILD} rmarkdown.sif rmarkdown.def diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def new file mode 100644 index 0000000..ce7ca1e --- /dev/null +++ b/envs/rmarkdown.def @@ -0,0 +1,38 @@ +Bootstrap: docker +From: rocker/tidyverse:4.4 + +%labels + + AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me + +%post + + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3-virtualenv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # virtualenv + cd /opt + python3.12 -m venv "default" + . 
default/bin/activate + + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Install R packages + Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))' + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate From c80adc10844d9251572f00352795be17c01a61a3 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:29:08 +0200 Subject: [PATCH 35/45] remove unneeded dependencies --- envs/rmarkdown.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def index ce7ca1e..8dc75b6 100644 --- a/envs/rmarkdown.def +++ b/envs/rmarkdown.def @@ -14,7 +14,7 @@ From: rocker/tidyverse:4.4 apt-get install -y git \ python-is-python3 \ python3.12 \ - python3-virtualenv \ + python3.12-venv \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -29,7 +29,7 @@ From: rocker/tidyverse:4.4 "pydantic-core==2.34.1" # Install R packages - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))' + Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))' echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT From b19a489cec78d49c57b1c2b9e6cf1c3b0604c1ca Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:31:01 +0200 Subject: [PATCH 36/45] update makefile --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index e8e942d..f342949 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,9 @@ MAX_CORES ?= 10 +TIMEOUT ?= 4h + # by default, we want to run all snakemake rules even if there are failures (-k) -OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES} +OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} + prepare_apptainer_env: cd envs && ./build_singularity.sh prepare_envmodules_env: @@ -14,7 +17,7 @@ run_with_apptainer_backend_short: mv out out_apptainer_short run_with_conda_backend_short: ${OB_CMD} -b Clustering_conda_smoketest.yml - mv out out_conda + mv out out_conda_short run_with_envmodules_backend_short: ${OB_CMD} -b Clustering_envmodules_smoketest.yml mv out out_lmod_short From ebd69b79937e55a68e968d829910c5c6f3d80b70 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:38:13 +0200 Subject: [PATCH 37/45] cleanup r/fcps deps --- Clustering_conda.yml | 2 +- envs/build_singularity.sh | 2 +- ...cps_singularity_optimized.def => fcps.def} | 29 ++++++++------- envs/fcps.eb | 3 +- envs/r.yml | 12 ------ envs/r_singularity.def | 37 ------------------- 6 files changed, 19 insertions(+), 66 deletions(-) rename envs/{fcps_singularity_optimized.def => fcps.def} (59%) delete mode 100644 envs/r.yml delete mode 100644 envs/r_singularity.def diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7822761..9e74ee5 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -30,7 +30,7 @@ software_environments: metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index c34208b..f8596a7 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -4,5 +4,5 @@ BUILD='build --fakeroot' $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def -$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def +$CMD ${BUILD} fcps.sif fcps.def $CMD ${BUILD} rmarkdown.sif rmarkdown.def diff --git a/envs/fcps_singularity_optimized.def b/envs/fcps.def similarity index 59% rename from envs/fcps_singularity_optimized.def rename to envs/fcps.def index 6362b9e..f4eefcb 100644 --- a/envs/fcps_singularity_optimized.def +++ b/envs/fcps.def @@ -1,5 +1,5 @@ Bootstrap: docker -From: rocker/tidyverse:4.3.3 +From: rocker/tidyverse:4.4 %labels @@ -8,29 +8,32 @@ From: rocker/tidyverse:4.3.3 %post - # Install python3.12 + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3.12-venv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* apt-get update apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ libgsl-dev - wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz - tar -xf Python-3.12.9.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 8 - make altinstall - # virtualenv cd /opt python3.12 -m venv "default" . 
default/bin/activate - # TODO: pin dependencies - pip install gitpython==3.1.43 isodate pydantic-core + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" - ## no versioning here - ## TODO(ben): get same versions as in easyconfig + # Install R packages + ## FIXME no versioning here Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT diff --git a/envs/fcps.eb b/envs/fcps.eb index 54c8c7d..4d86bdd 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -15,12 +15,11 @@ dependencies = [ ('R', '4.3.2'), ('Boost', '1.82.0'), ('GSL', '2.7'), -# ('arrow-R', '14.0.1', versionsuffix), # required by RcisTarget ] exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/release/bioc/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages diff --git a/envs/r.yml b/envs/r.yml deleted file mode 100644 index 456e139..0000000 --- a/envs/r.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: r_for_metrics -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-mclust - - conda-forge::r-caret - - conda-forge::r-dplyr - - conda-forge::r-readr - - conda-forge::r-argparse - diff --git a/envs/r_singularity.def b/envs/r_singularity.def deleted file mode 100644 index f1f9ec9..0000000 --- a/envs/r_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - 
libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - # Install R packages - - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate From 1afaa2f2830f11563973a1ef9720753b0a47ceec Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:44:36 +0200 Subject: [PATCH 38/45] cleanup image --- envs/build_singularity.sh | 2 +- envs/fcps.def | 4 ---- envs/rmarkdown.def | 2 ++ 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh index f8596a7..83203c8 100755 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -5,4 +5,4 @@ $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def # enable this if you want to compare with the custom python compilation # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def $CMD ${BUILD} fcps.sif fcps.def -$CMD ${BUILD} rmarkdown.sif rmarkdown.def +$CMD ${BUILD} rmarkdown.sif rmarkdown.def # this one is very similar to fcps, remove diff --git a/envs/fcps.def b/envs/fcps.def index f4eefcb..922d7f8 100644 --- a/envs/fcps.def +++ b/envs/fcps.def @@ -17,10 +17,6 @@ From: rocker/tidyverse:4.4 python3.12-venv \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ - libgsl-dev # virtualenv cd /opt diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def index 
8dc75b6..aa20cc1 100644 --- a/envs/rmarkdown.def +++ b/envs/rmarkdown.def @@ -1,6 +1,8 @@ Bootstrap: docker From: rocker/tidyverse:4.4 +# TODO: we could merge this one with fcps.def, no need to duplicate the image. + %labels AUTHOR izaskun.mallona@gmail.com From 9e2168a754e7a93e11867f68f4548f1415301c79 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 16:54:57 +0200 Subject: [PATCH 39/45] update readme --- envs/README.md | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/envs/README.md b/envs/README.md index 69aa5c1..3cab925 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,10 +1,9 @@ We distribute `Clustering.yml` runs with different backends. -- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip) -- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images. -- `Clustering_envmodules.yml`. Easybuilt with default optimization. - +- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) +- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files. +- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry. +- `Clustering_envmodules.yml`. Easybuild backend with default optimization. ## Conda @@ -12,8 +11,7 @@ We distribute `Clustering.yml` runs with different backends. - `clustbench.yml` - `fcps.yml` -- `r.yml` -- `sklearn.yml` +- `rmarkdown.yml` ### How to build @@ -23,24 +21,25 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin ### Files -- `clustbench_singularity.def` -- `fcps_singularity.def` -- `r_singularity.def` -- `sklearn_singularity.def` +The apptainer images are based in ubuntu-noble docker images. + +The "optimized" one does a custom python 3.12 compilation; the vanillapy stocks the default py3.12 interpreter from the official ubuntu docker image. 
+ +- `clustbench_apptainer_optimized.def` +- `clustbench_apptainer_vanillapy.def` +- `fcps.def` +- `rmarkdown.def` ### How to build -- `build_singularity.sh` +- `make prepare_apptainer_env` from the root folder. ## Aptainer semi-reproducible and remote -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. +TODO: push to the registry (how?) -## Apptainer (reproducible) with easybuild - -Doing... +No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. -Lorem ipsum. ## envmodules - reproducible builds with easybuild @@ -48,11 +47,11 @@ Lorem ipsum. - `clustbench.eb` - `fcps.eb` +- `rmarkdown.eb` +- `rmarkdown-python.eb` ### How to build -1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2 -2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this -3. `python3-wget` from pypi doesn't look very well maintaned -4. `eb fcps.eb --robot` -5. `eb clustbench.eb --robot` +- `make prepare_envmodules_env` from the root folder. 
+- `python3-wget` from pypi doesn't look very well maintaned + From 6199c0a11bbc88a944d07e4b79bf329fc9c55990 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:00:25 +0200 Subject: [PATCH 40/45] fixes --- envs/clustbench.eb | 5 ----- envs/fcps.eb | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 0e86911..daae6dd 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -13,9 +13,6 @@ dependencies = [ ('SciPy-bundle', '2023.11'), ('matplotlib', '3.8.2'), ('scikit-learn', '1.4.0'), -# FIXME: I think this is not needed -- ben -# ('meson-python', '0.15.0'), -# ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ] exts_list = [ @@ -48,5 +45,3 @@ exts_list = [ ] moduleclass = 'bio' - - diff --git a/envs/fcps.eb b/envs/fcps.eb index 4d86bdd..692bf0b 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -13,7 +13,7 @@ builddependencies = [('pkgconf', '1.9.5')] dependencies = [ ('R', '4.3.2'), - ('Boost', '1.82.0'), + ('Boost', '1.82.0'), ('GSL', '2.7'), ] From b017cb02a71b83766f831b9bf5b4d483eb8dbe9f Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:18:33 +0200 Subject: [PATCH 41/45] apptainer smoketest --- Clustering_apptainer_vanilla.yml | 4 +- Clustering_apptainer_vanilla_smoketest.yml | 129 +++++++++++++++++++++ 2 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 Clustering_apptainer_vanilla_smoketest.yml diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml index 46b8ea4..6bc5edd 100644 --- a/Clustering_apptainer_vanilla.yml +++ b/Clustering_apptainer_vanilla.yml @@ -10,9 +10,9 @@ software_backend: apptainer software_environments: clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml # not used + description: "clustbench on py3.12.3, default python" envmodule: na + conda: envs/clustbench.yml # not used apptainer: envs/clustbench-vanilla.sif fcps: diff --git a/Clustering_apptainer_vanilla_smoketest.yml 
b/Clustering_apptainer_vanilla_smoketest.yml new file mode 100644 index 0000000..99aff2e --- /dev/null +++ b/Clustering_apptainer_vanilla_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_envmodules # NOTE(review): id says "envmodules" but software_backend below is apptainer - confirm this is intended +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: na + conda: envs/clustbench.yml # not used + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + envmodule: na + conda: envs/fcps.yml # not used + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: na + conda: envs/rmarkdown.yml # not used + apptainer: envs/rmarkdown.sif + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url:
https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: "5877378" # quoted: an all-digit short SHA would otherwise load as an int + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: "5454368" # quoted: all-digit short SHA + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: "6090043" # quoted: all-digit short SHA + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" From 98777a52be5fc9500e715a42dd1f4e146bc467b6 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:21:29 +0200 Subject: [PATCH 42/45] add git in the image --- envs/clustbench_apptainer_vanillapy.def | 1 + 1 file
changed, 1 insertion(+) diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 5d388bf..63f764a 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -13,6 +13,7 @@ From: ubuntu:noble-20250404 python3-venv \ python3-pip \ ca-certificates \ + git \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* From f4ae29d1600097a42fc906557a085dea97ed8cf0 Mon Sep 17 00:00:00 2001 From: ben Date: Mon, 12 May 2025 17:33:56 +0200 Subject: [PATCH 43/45] try to debug fastcluster problem --- envs/clustbench_apptainer_optimized.def | 4 ++-- envs/clustbench_apptainer_vanillapy.def | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def index 19726c2..8fc7e08 100644 --- a/envs/clustbench_apptainer_optimized.def +++ b/envs/clustbench_apptainer_optimized.def @@ -58,7 +58,7 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip install -U pip + pip install -U pip wheel pip install \ "clustering-benchmarks==1.1.6" \ @@ -83,7 +83,7 @@ From: ubuntu:noble-20250404 "six==1.17.0" \ "threadpoolctl==3.6.0" \ "tzdata==2025.2" \ - "fastcluster==1.2.6" \ + "fastcluster==1.3.0" \ "gitpython==3.1.43" \ "isodate==0.7.2" \ "pydantic-core==2.34.1" diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def index 63f764a..ff9dd91 100644 --- a/envs/clustbench_apptainer_vanillapy.def +++ b/envs/clustbench_apptainer_vanillapy.def @@ -24,7 +24,7 @@ From: ubuntu:noble-20250404 # Install required packages with pip - pip install -U pip + pip install -U pip wheel pip install \ "clustering-benchmarks==1.1.6" \ @@ -49,7 +49,7 @@ From: ubuntu:noble-20250404 "six==1.17.0" \ "threadpoolctl==3.6.0" \ "tzdata==2025.2" \ - "fastcluster==1.2.6" \ + "fastcluster==1.3.0" \ "gitpython==3.1.43" \ "isodate==0.7.2" \ "pydantic-core==2.34.1" From 72cdc598acfd10c2fd73bee49f7b66fdd6a62591 Mon 
Sep 17 00:00:00 2001 From: ben Date: Wed, 14 May 2025 13:26:23 +0200 Subject: [PATCH 44/45] fail if the exit code fails --- .github/workflows/benchmark.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2a55846..e22b368 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -59,7 +59,7 @@ jobs: - name: Run benchmark shell: bash -l {0} - continue-on-error: true + continue-on-error: false run: | echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error @@ -98,7 +98,7 @@ jobs: - name: Deploy to GitHub Pages uses: actions/deploy-pages@v4 - + - name: Create Job Summary if: always() run: | @@ -106,4 +106,3 @@ jobs: echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY echo "### All Outputs" >> $GITHUB_STEP_SUMMARY echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY - From 01243de1b555e2e5d4e7b31228d66d8a335edcb3 Mon Sep 17 00:00:00 2001 From: ben Date: Wed, 14 May 2025 13:29:16 +0200 Subject: [PATCH 45/45] use conda short for test --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e22b368..b6cb977 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -61,7 +61,7 @@ jobs: shell: bash -l {0} continue-on-error: false run: | - echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error + echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error upload-artifact: name: Benchmark Artifact