From 4a3fdc3941a2b097bb5c97889f7dd0bc657e342d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:18:01 +0200
Subject: [PATCH 01/45] ci: install omnibenchmark from the dev branch

---
 .github/workflows/benchmark.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index c1a1e82..500eb58 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -18,7 +18,6 @@ jobs:
   run-benchmark:
     name: Run Benchmark
     runs-on: ubuntu-latest
-    ## runs-on: self-hosted
     steps:
       - name: Check out repository
         uses: actions/checkout@v4
@@ -49,7 +48,7 @@ jobs:
         shell: bash -l {0}
         run: |
           mamba install -y pip
-          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope
+          pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev
 
       - name: Load benchmark cache
         id: cache-benchmark
@@ -67,7 +66,6 @@ jobs:
   upload-artifact:
     name: Benchmark Artifact
     runs-on: ubuntu-latest
-    ## runs-on: self-hosted
     needs: run-benchmark
     if: always()
     steps:

From e89adda93e7fdc64b52b2e77dc702969a50f735c Mon Sep 17 00:00:00 2001
From: btraven <128150520+btraven00@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:20:31 +0100
Subject: [PATCH 02/45] docs: use the public repo URI

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a75c594..89d7c05 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,8 @@ A clustering example for omnibenchmark
 # How to run
 
 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/)
-2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git`
-3. Move to the cloned repository `cd clustering_example`
+2. Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example`
+3. Move into the cloned folder: `cd clustering_example`
 4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML  --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md).
 
 # Clustbench attribution

From 52ebb556eae88f36d2e857aadfe8189c4aca3eaf Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:02:37 +0100
Subject: [PATCH 03/45] chore: add convenience target to build singularity env

- make script executable
- use /bin/sh instead of /bin/bash
- add top-level Makefile to prepare env
---
 Makefile                  |  2 ++
 envs/build_singularity.sh | 14 +++++---------
 2 files changed, 7 insertions(+), 9 deletions(-)
 create mode 100644 Makefile
 mode change 100644 => 100755 envs/build_singularity.sh

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1e56cb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+prepare_apptainer_env:
+	cd envs && ./build_singularity.sh
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
old mode 100644
new mode 100755
index 86e053f..c0c3d93
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,9 +1,5 @@
-#!/bin/bash
-
-sudo singularity build sklearn.sif sklearn_singularity.def
-
-sudo singularity build clustbench.sif clustbench_singularity.def
-
-sudo singularity build r.sif r_singularity.def
-
-sudo singularity build fcps.sif fcps_singularity.def
+#!/bin/sh
+singularity build sklearn.sif sklearn_singularity.def
+singularity build clustbench.sif clustbench_singularity.def
+singularity build r.sif r_singularity.def
+singularity build fcps.sif fcps_singularity.def

From 83c6f0b0c78851d93be5956fd27a8180c61b2ba7 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:59:05 +0100
Subject: [PATCH 04/45] feat: parametrize num threads on the makefile

---
 Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Makefile b/Makefile
index 1e56cb2..3c58e2f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,10 @@
+MAX_THREADS ?= 30
+OB_CMD="ob run benchmark -k --local"
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
+run_with_apptainer_backend:
+	 ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS}
+	 mv out out_apptainer
+run_with_conda_backend:
+	 ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS}
+	 mv out out_conda

From dc2d629004fcdb40f75bc24194287b961eb40283 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 14:02:21 +0100
Subject: [PATCH 05/45] chore: ignore common temporary outputs and image build
 artifacts

---
 .gitignore | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4d38534
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# image build artifacts
+envs/*.sif
+
+# snakemake
+snakemake.log
+.snakemake/
+
+# vim swaps
+*.swp
+*.swo

From f91603aecf8f82975087c89615d3473d4b79c12f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Tue, 18 Mar 2025 13:59:05 +0100
Subject: [PATCH 06/45] fix: unquote OB_CMD in the Makefile

---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 3c58e2f..6883fa0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 MAX_THREADS ?= 30
-OB_CMD="ob run benchmark -k --local"
+# by default, we want to run all snakemake rules even if there are failures
+OB_CMD=ob run benchmark -k --local
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 run_with_apptainer_backend:

From bea2a75173f9c19edb2adb1c22bc1ab90d62774d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@pm.me>
Date: Mon, 5 May 2025 10:07:36 +0200
Subject: [PATCH 07/45] fix: use --cores, --task-timeout

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 6883fa0..73b33b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,11 @@
-MAX_THREADS ?= 30
+MAX_CORES ?= 10
 # by default, we want to run all snakemake rules even if there are failures
-OB_CMD=ob run benchmark -k --local
+OB_CMD=ob run benchmark -k --local --task-timeout "4h"
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml --threads ${MAX_THREADS}
+	 ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES}
 	 mv out out_apptainer
 run_with_conda_backend:
-	 ${OB_CMD} -b Clustering_conda.yml --threads ${MAX_THREADS}
+	 ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES}
 	 mv out out_conda

From 67e8cf8bc7e0deab9f6bfdc5aceaffe39841040e Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 7 May 2025 21:41:46 +0200
Subject: [PATCH 08/45] update .eb files to easybuild 5.0

---
 Makefile           |  9 ++++--
 envs/clustbench.eb | 81 ++++++----------------------------------------
 envs/fcps.eb       | 18 ++++-------
 3 files changed, 23 insertions(+), 85 deletions(-)

diff --git a/Makefile b/Makefile
index 73b33b5..e107f62 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
 MAX_CORES ?= 10
 # by default, we want to run all snakemake rules even if there are failures
-OB_CMD=ob run benchmark -k --local --task-timeout "4h"
+OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
+prepare_envmodules_env:
+	cd envs && eb clustbench.eb --robot
+	cd envs && eb fcps.eb --robot
 run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml --cores ${MAX_CORES}
+	 ${OB_CMD} -b Clustering_singularity.yml
 	 mv out out_apptainer
 run_with_conda_backend:
-	 ${OB_CMD} -b Clustering_conda.yml --cores ${MAX_CORES}
+	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index 22597fb..f3ee681 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -1,108 +1,47 @@
-## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4
-
 easyblock = 'PythonBundle'
 
 name = 'clustbench'
-version = '1'
+version = '0.1.0'
 
-homepage = 'https://python.org/'
+homepage = 'https://omnibenchmark.org'
 description = "Bundle of Python packages for ob clustering_example"
 
 toolchain = {'name': 'foss', 'version': '2023b'}
 
-
 dependencies = [
     ('Python', '3.11.5'),
-    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
     ('SciPy-bundle', '2023.11'),
-    ('meson-python', '0.15.0'),
     ('matplotlib', '3.8.2'),
-    ('scikit-learn', '1.4.0')
-
+    ('scikit-learn', '1.4.0'),
+#    ('meson-python', '0.15.0'),
+#    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
 
-sanity_pip_check = True 
-use_pip = True
-
-exts_default_options = {
-    'sanity_pip_check': True,
-    'use_pip' : True
-}
-
-## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz
-## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
-## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz
-## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz
-## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
-
-source_urls = [PYPI_SOURCE,
-               'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/',
-               'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/',
-               'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/',
-               'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/',
-               'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/',
-               'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/',
-               'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/',
-               'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa',
-               'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/',
-               'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/',
-               'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/',
-               'https://github.com/pybind/pybind11/archive/',
-               'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/',
-               'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz',
-               'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz',
-               'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz',
-               'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip',
-               'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz',
-               'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz',
-               'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz',
-               'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz',
-               'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz',
-               'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz']
-
-
-## caution download genieclust here, not pypi, they differ and pypi's it's not installable!
-## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/
-## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz
-## todo automate this within the easyconfig!
-
 exts_list = [
     ('natsort', '8.4.0', {
         'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'],
     }),
-    ('cython', '3.0.11', {
-        'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'],
-    }),
     ('hypothesis', '6.124.7', {
         'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'],
     }),
-    ('numpy', '1.26.4', {
-        'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'],
-    }),
     ('fastcluster', '1.2.6', {
         'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'],
     }),
-    ('genieclust', '1.1.6', {
-        'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'],
-    }),
     ('hurry.filesize', '0.9', {
         'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'],
     }),
     ('python3-wget', '0.0.2-beta1', {
         'modulename': 'wget',
+        'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'],
         'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'],
     }),
-    ('clustering_benchmarks', '1.1.5', {
-        'modulename': 'clustbench',
-        'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'],
+    ('genieclust', '1.1.6', {
+        'download_dep_fail': False,
+        'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',
+        'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'],
     }),
 ]
 
-sanity_check_paths = {
-    'files': [],
-    'dirs': ['lib/python3.11/site-packages/clustbench/']
-}
-
 moduleclass = 'bio'
 
 
diff --git a/envs/fcps.eb b/envs/fcps.eb
index ee3db52..54c8c7d 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -1,6 +1,3 @@
-## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699
-## Caution boost easyconfig needs update  https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2)
-
 easyblock = 'Bundle'
 
 name = 'fcps'
@@ -23,10 +20,7 @@ dependencies = [
 
 exts_default_options = {
     'source_urls': [
-        'https://bioconductor.org/packages/3.18/bioc/src/contrib/',
-        'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s',
-        'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/',
-        'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/',
+	'https://bioconductor.org/packages/release/bioc/src/contrib/',
         'https://cran.r-project.org/src/contrib/Archive/%(name)s',  # package archive
         'https://cran.r-project.org/src/contrib/',  # current version of packages
         'https://cran.freestatistics.org/src/contrib',  # mirror alternative for current packages
@@ -192,13 +186,15 @@ exts_list = [
     ('cluster', '2.1.8', {
         'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'],
     }),
-    ('graph', '1.84.1', {
-        'checksums': ['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'],
+    ('graph', '1.86.0', {
+        'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'],
     }),
     ('mclust', '6.1.1', {
         'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'],
     }),
-    ('cclust', '0.6-26'),
+    ('cclust', '0.6-26', {
+        'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'],
+    }),
     ('flowClust', '3.40.0', {
         'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'",
         'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'],
@@ -240,4 +236,4 @@ sanity_check_paths = {
     'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'],
 }
 
-moduleclass = 'bio'
\ No newline at end of file
+moduleclass = 'bio'

From 931389f796ef8ceb7e4951c80c708e1b2c2129b1 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:38:02 +0200
Subject: [PATCH 09/45] remove remote storage

---
 Clustering.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Clustering.yaml b/Clustering.yaml
index 0007ea5..689be2c 100644
--- a/Clustering.yaml
+++ b/Clustering.yaml
@@ -2,10 +2,10 @@ id: clustering_example
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
 version: 1.2
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: https://play.min.io
 benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clustering_example
+# storage: https://play.min.io
+# storage_api: S3
+# storage_bucket_name: clustering_example
 software_backend: conda
 software_environments:
   clustbench:

From 60ac47b3c55bec65b5ad839d524a7b8cd87b1b4c Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:41:44 +0200
Subject: [PATCH 10/45] do not run artifact if not in main repo

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 500eb58..2a55846 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -67,7 +67,7 @@ jobs:
     name: Benchmark Artifact
     runs-on: ubuntu-latest
     needs: run-benchmark
-    if: always()
+    if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark'
     steps:
       - name: Check out repository
         uses: actions/checkout@v4

From 1b972bfef0d7a74199d0289d8b7b8749720bce27 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 12:45:12 +0200
Subject: [PATCH 11/45] feat: add run_with_envmodules_backend make target

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index e107f62..875a375 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 MAX_CORES ?= 10
-# by default, we want to run all snakemake rules even if there are failures
+# by default, we want to run all snakemake rules even if there are failures (-k)
 OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
@@ -12,3 +12,6 @@ run_with_apptainer_backend:
 run_with_conda_backend:
 	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
+run_with_envmodules_backend:
+	 ${OB_CMD} -b Clustering_envmodules.yml
+	 mv out out_lmod

From 49646db648dee014b3a43f655ef64147cbda6ed0 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 13:22:56 +0200
Subject: [PATCH 12/45] streamline envmodules yaml

---
 Clustering_envmodules.yml | 281 ++++++++++++++++++++------------------
 1 file changed, 149 insertions(+), 132 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 3c2b8bd..1ab4808 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -2,32 +2,21 @@ id: clustering_example_envmodules
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
 version: 1.4
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleenvmodules
+benchmark_yaml_spec: 0.5
+
 software_backend: envmodules
+
 software_environments:
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho
+    conda: na
+    apptainer: na
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    envmodule: rmarkdown # TODO
+    conda: na
+    apptainer: na
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
@@ -56,42 +45,132 @@ stages:
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
-        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
           - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "chainlink",
+              ] #	2	1
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "engytime",
+              ] #	2	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
           - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
-          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "fcps",
+                "--dataset_name",
+                "twodiamonds",
+              ] #	2	1
           - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
           - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "fuzzyx",
+              ] #	2, 4, 5	6
           - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "parabolic",
+              ] #	2, 4	2
           - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
-          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
-          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
-          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
-          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
-          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "ring_noisy",
+              ] #	2	1
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "ring_outliers",
+              ] #	2, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag_noisy",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "graves",
+                "--dataset_name",
+                "zigzag_outliers",
+              ] #	3, 5	2
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "chameleon_t4_8k",
+              ] #	6	1
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "chameleon_t5_8k",
+              ] #	6	1
+          - values: [
+                "--dataset_generator",
+                "other",
+                "--dataset_name",
+                "hdbscan",
+              ] #	6	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
           - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "aggregation",
+              ] #	7	1
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "compound",
+              ] #	4, 5, 6	5
           - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "pathbased",
+              ] #	3, 4	2
           - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
           - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
-          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
+          - values: [
+                "--dataset_generator",
+                "sipu",
+                "--dataset_name",
+                "unbalance",
+              ] #	8	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
-          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "uci",
+                "--dataset_name",
+                "ionosphere",
+              ] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
           - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
@@ -100,8 +179,18 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "isolation",
+              ] #	3	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "labirynth",
+              ] #	6	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
@@ -109,9 +198,24 @@ stages:
           - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
           - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
-          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "trajectories",
+              ] #	4	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "trapped_lovers",
+              ] #	3	1
+          - values: [
+                "--dataset_generator",
+                "wut",
+                "--dataset_name",
+                "twosplashes",
+              ] #	2	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
           - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
@@ -126,7 +230,7 @@ stages:
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
 
   ## clustbench methods (fastcluster) ###################################################################
-  
+
   - id: clustering
     modules:
       - id: fastcluster
@@ -148,7 +252,6 @@ stages:
         software_environment: "clustbench"
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -229,89 +332,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"

From fc53991d1eb32c7749c3f1a2bccc0ed9e33601af Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Thu, 8 May 2025 15:14:35 +0200
Subject: [PATCH 13/45] update clustbench

---
 Clustering_envmodules.yml | 38 ++++++++++++++++++++------------------
 envs/clustbench.eb        |  5 +++++
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 1ab4808..f37fd6c 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -9,23 +9,24 @@ software_backend: envmodules
 software_environments:
   clustbench:
     description: "clustbench on py3.12.6"
-    envmodule: clustbench/0.1.0-foss-2023b # py3.11 tho
-    conda: na
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
     apptainer: na
   rmarkdown:
     description: "R with some plotting dependencies"
     envmodule: rmarkdown # TODO
-    conda: na
+    conda: envs/clustbench.yml
     apptainer: na
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
-    envmodule: fcps
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
@@ -34,14 +35,15 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
-  ## clustbench data ##########################################################
 
   - id: data
+    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -235,7 +237,7 @@ stages:
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
           # url: /home/imallona/src/clustbench_fastcluster/
@@ -249,7 +251,7 @@ stages:
           - values: ["--linkage", "centroid"]
       - id: sklearn
         name: "sklearn"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
           commit: 5877378
@@ -260,7 +262,7 @@ stages:
           - values: ["--method", "gm"]
       - id: agglomerative
         name: "agglomerative"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_agglomerative
           commit: 5454368
@@ -270,7 +272,7 @@ stages:
           - values: ["--linkage", "ward"]
       - id: genieclust
         name: "genieclust"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -280,7 +282,7 @@ stages:
           - values: ["--method", "ica"]
       - id: fcps
         name: "fcps"
-        software_environment: "fcps"
+        software_environment: fcps
         repository:
           url: https://github.com/imallona/clustbench_fcps
           commit: 272fa5f
@@ -309,10 +311,10 @@ stages:
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
-          commit: 9132d45
+          commit: 8184cd4
         parameters:
           - values: ["--metric", "normalized_clustering_accuracy"]
           - values: ["--metric", "adjusted_fm_score"]
diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index f3ee681..0e86911 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -13,6 +13,7 @@ dependencies = [
     ('SciPy-bundle', '2023.11'),
     ('matplotlib', '3.8.2'),
     ('scikit-learn', '1.4.0'),
+# FIXME: I think this is not needed -- ben
 #    ('meson-python', '0.15.0'),
 #    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
@@ -40,6 +41,10 @@ exts_list = [
         'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',
         'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'],
     }),
+    ('clustering_benchmarks', '1.1.6', {
+        'modulename': 'clustbench',
+        'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'],
+    }),
 ]
 
 moduleclass = 'bio'

From 54b72790b1e2d2d9aa66d30c3d956b5d8be387a3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sat, 10 May 2025 18:46:41 +0200
Subject: [PATCH 14/45] add rmarkdown-python bundles, without checksums

---
 envs/rmarkdown-python.eb | 28 ++++++++++++
 envs/rmarkdown.eb        | 94 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
 create mode 100644 envs/rmarkdown-python.eb
 create mode 100644 envs/rmarkdown.eb

diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb
new file mode 100644
index 0000000..a9edb00
--- /dev/null
+++ b/envs/rmarkdown-python.eb
@@ -0,0 +1,28 @@
+easyblock = 'Bundle'
+
+# This is a dummy bundle that installs:
+# 1. rmarkdown: an R bundle that we also package
+# 2. Python-3.12.3-GCCcore-13.3.0
+# This is a dependency for the clustering_benchmark metric collector.
+
+name = 'rmarkdown-python'
+version = '0.1.0'
+
+local_rver = '4.4.2'
+local_pyver = '3.12.3'
+versionsuffix = f'-r-{local_rver}-py-{local_pyver}'
+
+homepage = 'https://omnibenchmark.org'
+description = 'Rmarkdown bundle with specific Python dependency'
+
+toolchain = {'name': 'system', 'version': '1.0'}
+
+dependencies = [
+    ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'),
+    ('Python', local_pyver, '-GCCcore-13.3.0'),
+]
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2']
+}
diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb
new file mode 100644
index 0000000..a88a2a9
--- /dev/null
+++ b/envs/rmarkdown.eb
@@ -0,0 +1,94 @@
+easyblock = 'Bundle'
+
+# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/
+# and build only what's left out.
+
+name = 'rmarkdown'
+version = '0.1.0'
+versionsuffix = '-r-%(rver)s'
+
+homepage = 'https://omnibenchmark.org'
+description = 'rmarkdown bundle for clustbench reports'
+
+toolchain = {'name': 'gfbf', 'version': '2024a'}
+
+dependencies = [
+    ('R', '4.4.2'),
+]
+
+exts_default_options = {
+    'source_urls': [
+        'https://cloud.r-project.org/src/contrib/',
+        'https://cran.r-project.org/src/contrib/',  				# current version of packages
+        'https://cran.r-project.org/src/contrib/Archive/%(name)s',  		# package archive
+        'https://www.bioconductor.org/packages/release/bioc/src/contrib/',      # bioconductor
+    ],
+    'sources': ['%(name)s_%(version)s.tar.gz'],
+}
+
+exts_defaultclass = 'RPackage'
+
+
+exts_list = [
+    ('rlang', '1.1.6'),
+    ('glue', '1.8.0'),
+    ('cli', '3.6.4'),
+    ('lifecycle', '1.0.4'),
+    ('vctrs', '0.6.5'),
+    ('utf8', '1.2.4'),
+    ('lattice', '0.22-5'),
+    ('pkgconfig', '2.0.3'),
+    ('pillar', '1.10.2'),
+    ('magrittr', '2.0.3'),
+    ('fansi', '1.0.6'),
+    ('viridisLite', '0.4.2'),
+    ('RColorBrewer', '1.1-3'),
+    ('R6', '2.6.1'),
+    ('labeling', '0.4.3'),
+    ('farver', '2.1.2'),
+    ('Matrix', '1.7-3'),
+    ('nlme', '3.1-168'),
+    ('withr', '3.0.2'),
+    ('tibble', '3.2.1'),
+    ('colorspace', '2.1-1'),
+    ('munsell', '0.5.1'),
+    ('scales', '1.3.0'),
+    ('mgcv', '1.9-1'),
+    ('MASS', '7.3-65'),
+    ('isoband', '0.2.7'),
+    ('gtable', '0.3.6'),
+    ('ggplot2', '3.5.2'),
+    ('findpython', '1.0.9', {}),
+    ('argparse', '2.2.5', {}),
+    ('rmarkdown', '2.29', {}),
+    ('generics', '0.1.3', {}),
+    ('tidyselect', '1.2.1', {}),
+    ('dplyr', '1.1.4', {}),
+    ('tidyr', '1.3.1', {}),
+    ('shape', '1.4.6.1', {}),
+    ('GlobalOptions', '0.1.2', {}),
+    ('circlize', '0.4.16', {}),
+    ('rjson', '0.2.23', {}),
+    ('GetoptLong', '1.0.5', {}),
+    ('cluster', '2.1.8.1', {}),
+    ('clue', '0.3-66', {}),
+    ('png', '0.1-8', {}),
+    ('BiocGenerics', '0.54.0', {}),
+    ('S4Vectors', '0.46.0', {}),
+    ('IRanges', '2.42.0'),
+    ('matrixStats', '1.5.0', {}),
+    ('iterators', '1.0.14', {}),
+    ('codetools', '0.2-20', {}),
+    ('foreach', '1.5.2', {}),
+    ('doParallel', '1.0.17', {}),
+    ('ComplexHeatmap', '2.24.0', {}),
+]
+
+modextrapaths = {'R_LIBS_SITE': ''}
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'],
+}
+
+moduleclass = 'bio'

From 1b57e44585c688d6f5e8f5be4b38b039e73cab57 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sat, 10 May 2025 18:49:27 +0200
Subject: [PATCH 15/45] inject checksums to rmarkdown easyconfig

---
 envs/rmarkdown.eb | 209 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 156 insertions(+), 53 deletions(-)

diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb
index a88a2a9..067eadd 100644
--- a/envs/rmarkdown.eb
+++ b/envs/rmarkdown.eb
@@ -28,60 +28,163 @@ exts_default_options = {
 
 exts_defaultclass = 'RPackage'
 
-
 exts_list = [
-    ('rlang', '1.1.6'),
-    ('glue', '1.8.0'),
-    ('cli', '3.6.4'),
-    ('lifecycle', '1.0.4'),
-    ('vctrs', '0.6.5'),
-    ('utf8', '1.2.4'),
-    ('lattice', '0.22-5'),
-    ('pkgconfig', '2.0.3'),
-    ('pillar', '1.10.2'),
-    ('magrittr', '2.0.3'),
-    ('fansi', '1.0.6'),
-    ('viridisLite', '0.4.2'),
-    ('RColorBrewer', '1.1-3'),
-    ('R6', '2.6.1'),
-    ('labeling', '0.4.3'),
-    ('farver', '2.1.2'),
-    ('Matrix', '1.7-3'),
-    ('nlme', '3.1-168'),
-    ('withr', '3.0.2'),
-    ('tibble', '3.2.1'),
-    ('colorspace', '2.1-1'),
-    ('munsell', '0.5.1'),
-    ('scales', '1.3.0'),
-    ('mgcv', '1.9-1'),
-    ('MASS', '7.3-65'),
-    ('isoband', '0.2.7'),
-    ('gtable', '0.3.6'),
-    ('ggplot2', '3.5.2'),
-    ('findpython', '1.0.9', {}),
-    ('argparse', '2.2.5', {}),
-    ('rmarkdown', '2.29', {}),
-    ('generics', '0.1.3', {}),
-    ('tidyselect', '1.2.1', {}),
-    ('dplyr', '1.1.4', {}),
-    ('tidyr', '1.3.1', {}),
-    ('shape', '1.4.6.1', {}),
-    ('GlobalOptions', '0.1.2', {}),
-    ('circlize', '0.4.16', {}),
-    ('rjson', '0.2.23', {}),
-    ('GetoptLong', '1.0.5', {}),
-    ('cluster', '2.1.8.1', {}),
-    ('clue', '0.3-66', {}),
-    ('png', '0.1-8', {}),
-    ('BiocGenerics', '0.54.0', {}),
-    ('S4Vectors', '0.46.0', {}),
-    ('IRanges', '2.42.0'),
-    ('matrixStats', '1.5.0', {}),
-    ('iterators', '1.0.14', {}),
-    ('codetools', '0.2-20', {}),
-    ('foreach', '1.5.2', {}),
-    ('doParallel', '1.0.17', {}),
-    ('ComplexHeatmap', '2.24.0', {}),
+    ('rlang', '1.1.6', {
+        'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'],
+    }),
+    ('glue', '1.8.0', {
+        'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'],
+    }),
+    ('cli', '3.6.4', {
+        'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'],
+    }),
+    ('lifecycle', '1.0.4', {
+        'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'],
+    }),
+    ('vctrs', '0.6.5', {
+        'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'],
+    }),
+    ('utf8', '1.2.4', {
+        'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'],
+    }),
+    ('lattice', '0.22-5', {
+        'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'],
+    }),
+    ('pkgconfig', '2.0.3', {
+        'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'],
+    }),
+    ('pillar', '1.10.2', {
+        'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'],
+    }),
+    ('magrittr', '2.0.3', {
+        'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'],
+    }),
+    ('fansi', '1.0.6', {
+        'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'],
+    }),
+    ('viridisLite', '0.4.2', {
+        'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'],
+    }),
+    ('RColorBrewer', '1.1-3', {
+        'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'],
+    }),
+    ('R6', '2.6.1', {
+        'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'],
+    }),
+    ('labeling', '0.4.3', {
+        'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'],
+    }),
+    ('farver', '2.1.2', {
+        'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'],
+    }),
+    ('Matrix', '1.7-3', {
+        'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'],
+    }),
+    ('nlme', '3.1-168', {
+        'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'],
+    }),
+    ('withr', '3.0.2', {
+        'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'],
+    }),
+    ('tibble', '3.2.1', {
+        'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'],
+    }),
+    ('colorspace', '2.1-1', {
+        'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'],
+    }),
+    ('munsell', '0.5.1', {
+        'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'],
+    }),
+    ('scales', '1.3.0', {
+        'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'],
+    }),
+    ('mgcv', '1.9-1', {
+        'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'],
+    }),
+    ('MASS', '7.3-65', {
+        'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'],
+    }),
+    ('isoband', '0.2.7', {
+        'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'],
+    }),
+    ('gtable', '0.3.6', {
+        'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'],
+    }),
+    ('ggplot2', '3.5.2', {
+        'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'],
+    }),
+    ('findpython', '1.0.9', {
+        'checksums': ['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'],
+    }),
+    ('argparse', '2.2.5', {
+        'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'],
+    }),
+    (name, '2.29', {
+        'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'],
+    }),
+    ('generics', '0.1.3', {
+        'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'],
+    }),
+    ('tidyselect', '1.2.1', {
+        'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'],
+    }),
+    ('dplyr', '1.1.4', {
+        'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'],
+    }),
+    ('tidyr', '1.3.1', {
+        'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'],
+    }),
+    ('shape', '1.4.6.1', {
+        'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'],
+    }),
+    ('GlobalOptions', '0.1.2', {
+        'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'],
+    }),
+    ('circlize', '0.4.16', {
+        'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'],
+    }),
+    ('rjson', '0.2.23', {
+        'checksums': ['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'],
+    }),
+    ('GetoptLong', '1.0.5', {
+        'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'],
+    }),
+    ('cluster', '2.1.8.1', {
+        'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'],
+    }),
+    ('clue', '0.3-66', {
+        'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'],
+    }),
+    ('png', '0.1-8', {
+        'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'],
+    }),
+    ('BiocGenerics', '0.54.0', {
+        'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'],
+    }),
+    ('S4Vectors', '0.46.0', {
+        'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'],
+    }),
+    ('IRanges', '2.42.0', {
+        'checksums': ['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'],
+    }),
+    ('matrixStats', '1.5.0', {
+        'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'],
+    }),
+    ('iterators', '1.0.14', {
+        'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'],
+    }),
+    ('codetools', '0.2-20', {
+        'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'],
+    }),
+    ('foreach', '1.5.2', {
+        'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'],
+    }),
+    ('doParallel', '1.0.17', {
+        'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'],
+    }),
+    ('ComplexHeatmap', '2.24.0', {
+        'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'],
+    }),
 ]
 
 modextrapaths = {'R_LIBS_SITE': ''}

From dfd5b936195655c136bf513640c5a5196a7785ea Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:18:59 +0200
Subject: [PATCH 16/45] update sklearn singularity definition

---
 envs/sklearn_singularity.def | 57 ++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def
index 939a3bb..56bcf37 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity.def
@@ -1,33 +1,54 @@
-Bootstrap: docker 
-From: ubuntu:jammy-20240911.1
+Bootstrap: docker
+From: ubuntu:noble-20250404
 
 %labels
-
-    AUTHOR izaskun.mallona@gmail.com
+    Author izaskun.mallona@gmail.com
+    Author ben.uzh@proton.me
 
 %post
+    PYTHON_VERSION=3.12.6
+    PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
     
-    # Install python3.12
+    # Update and enable deb-src
+    apt-get update
+    echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
+    echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
 
-    # virtualenv
+    
+    # Get build dependencies for Python
+    apt-get build-dep -y python3
+
+    # Extra dependencies
+    apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
+    
+    # Calculate half the number of available cores
+    HALF_NPROC=$(( $(nproc) / 2 ))
+    # Ensure at least one core is used
+    CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
+    
+    # Download and build Python with optimizations
+    wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
+    tar -xf Python-${PYTHON_VERSION}.tgz
+    cd Python-${PYTHON_VERSION}*/
+    # Enable all possible optimizations
+    ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
+    make -j ${CORES_TO_USE}
+    make altinstall
+    
+    # Create virtualenv using the locally built Python
     cd /opt
-    python3.12 -m venv "default"
+    /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default"
     . default/bin/activate
-
-    pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43"
+    
+    # Install required packages
+    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
+      "isodate" "pydantic-core"  \
+      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
 %environment
 
     . /opt/default/bin/activate
+

From 0056b7fce71ab1e5efc9456502c2114ea4d597d7 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:25:21 +0200
Subject: [PATCH 17/45] factorize sklearn singularity pip block

---
 envs/sklearn-pip.apptainer.include | 11 +++++++++++
 envs/sklearn_singularity.def       | 19 +++++++++----------
 2 files changed, 20 insertions(+), 10 deletions(-)
 create mode 100644 envs/sklearn-pip.apptainer.include

diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include
new file mode 100644
index 0000000..b8f48eb
--- /dev/null
+++ b/envs/sklearn-pip.apptainer.include
@@ -0,0 +1,11 @@
+    pip3 install \
+      "clustering-benchmarks==1.1.6" \
+      "fastcluster==1.2.6" \
+      "numpy==1.26.4" \
+      "scipy==1.14.1" \
+      "isodate" \
+      "pydantic-core"  \
+      "genieclust==1.1.6" \
+      "pandas==2.2.3" \
+      "gitpython==3.1.43" \
+      wget"
diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def
index 56bcf37..cb9a2f6 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity.def
@@ -8,25 +8,25 @@ From: ubuntu:noble-20250404
 %post
     PYTHON_VERSION=3.12.6
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
-    
+
     # Update and enable deb-src
     apt-get update
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
 
-    
+
     # Get build dependencies for Python
     apt-get build-dep -y python3
 
     # Extra dependencies
     apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
-    
+
     # Calculate half the number of available cores
     HALF_NPROC=$(( $(nproc) / 2 ))
     # Ensure at least one core is used
     CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
-    
+
     # Download and build Python with optimizations
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
@@ -35,16 +35,15 @@ From: ubuntu:noble-20250404
     ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
     make -j ${CORES_TO_USE}
     make altinstall
-    
+
     # Create virtualenv using the locally built Python
     cd /opt
     /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default"
     . default/bin/activate
-    
-    # Install required packages
-    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
-      "isodate" "pydantic-core"  \
-      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
+
+    # Install required packages with pip
+
+    % include sklearn-pip.apptainer.include
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From cef3a6b6f0c0c0cb564941dee77eb52e9fd207db Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:27:48 +0200
Subject: [PATCH 18/45] extract variable in build script

---
 envs/build_singularity.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c0c3d93..c5cbf6f 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,5 +1,7 @@
 #!/bin/sh
-singularity build sklearn.sif sklearn_singularity.def
-singularity build clustbench.sif clustbench_singularity.def
-singularity build r.sif r_singularity.def
-singularity build fcps.sif fcps_singularity.def
+CMD=singularity
+BUILD="build --fakeroot"  # quoted: unquoted, sh would try to run '--fakeroot'
+$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def
+$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
+$CMD $BUILD r.sif r_singularity.def
+$CMD $BUILD fcps.sif fcps_singularity.def

From 2ee17ca636501521efb2c650a0b76750052692ee Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:37:19 +0200
Subject: [PATCH 19/45] revert include, should use m4

---
 envs/build_singularity.sh                            |  2 +-
 envs/sklearn-pip.apptainer.include                   | 11 -----------
 ...ularity.def => sklearn_singularity_optimized.def} | 12 +++++++++++-
 3 files changed, 12 insertions(+), 13 deletions(-)
 delete mode 100644 envs/sklearn-pip.apptainer.include
 rename envs/{sklearn_singularity.def => sklearn_singularity_optimized.def} (85%)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c5cbf6f..61fbd13 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 CMD=singularity
 BUILD="build --fakeroot"  # quoted: unquoted, sh would try to run '--fakeroot'
-$CMD $BUILD sklearn-optimized.sif sklearn_singularity.def
+$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def
 $CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
 $CMD $BUILD r.sif r_singularity.def
 $CMD $BUILD fcps.sif fcps_singularity.def
diff --git a/envs/sklearn-pip.apptainer.include b/envs/sklearn-pip.apptainer.include
deleted file mode 100644
index b8f48eb..0000000
--- a/envs/sklearn-pip.apptainer.include
+++ /dev/null
@@ -1,11 +0,0 @@
-    pip3 install \
-      "clustering-benchmarks==1.1.6" \
-      "fastcluster==1.2.6" \
-      "numpy==1.26.4" \
-      "scipy==1.14.1" \
-      "isodate" \
-      "pydantic-core"  \
-      "genieclust==1.1.6" \
-      "pandas==2.2.3" \
-      "gitpython==3.1.43" \
-      wget"
diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity_optimized.def
similarity index 85%
rename from envs/sklearn_singularity.def
rename to envs/sklearn_singularity_optimized.def
index cb9a2f6..6d6e165 100644
--- a/envs/sklearn_singularity.def
+++ b/envs/sklearn_singularity_optimized.def
@@ -43,7 +43,17 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    % include sklearn-pip.apptainer.include
+    pip3 install \
+      "clustering-benchmarks==1.1.6" \
+      "fastcluster==1.2.6" \
+      "numpy==1.26.4" \
+      "scipy==1.14.1" \
+      "isodate" \
+      "pydantic-core"  \
+      "genieclust==1.1.6" \
+      "pandas==2.2.3" \
+      "gitpython==3.1.43" \
+      "wget"
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From c4cbe5c2f22ed52a4873d5a14781682a19e87a4f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 12:44:30 +0200
Subject: [PATCH 20/45] update python version

---
 envs/sklearn_singularity_optimized.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/sklearn_singularity_optimized.def b/envs/sklearn_singularity_optimized.def
index 6d6e165..17a131d 100644
--- a/envs/sklearn_singularity_optimized.def
+++ b/envs/sklearn_singularity_optimized.def
@@ -6,7 +6,7 @@ From: ubuntu:noble-20250404
     Author ben.uzh@proton.me
 
 %post
-    PYTHON_VERSION=3.12.6
+    PYTHON_VERSION=3.12.9
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
 
     # Update and enable deb-src

From 21bdd666d47d029d5463f814ab685389ec850f71 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:25:31 +0200
Subject: [PATCH 21/45] do a little bit of cleanup with the multiple envs

---
 ....yml => Clustering_apptainer_optimized.yml | 129 +++---------------
 Clustering_conda.yml                          | 125 +++--------------
 Clustering_envmodules.yml                     |  25 ++--
 envs/build_singularity.sh                     |   7 +-
 ...def => clustbench_apptainer_optimized.def} |  37 +++--
 envs/clustbench_apptainer_vanillapy.def       |  55 ++++++++
 envs/clustbench_singularity.def               |  35 -----
 ...ity.def => fcps_singularity_optimized.def} |  11 +-
 envs/sklearn.yml                              |  11 --
 9 files changed, 145 insertions(+), 290 deletions(-)
 rename Clustering_singularity.yml => Clustering_apptainer_optimized.yml (74%)
 rename envs/{sklearn_singularity_optimized.def => clustbench_apptainer_optimized.def} (71%)
 create mode 100644 envs/clustbench_apptainer_vanillapy.def
 delete mode 100644 envs/clustbench_singularity.def
 rename envs/{fcps_singularity.def => fcps_singularity_optimized.def} (79%)
 delete mode 100644 envs/sklearn.yml

diff --git a/Clustering_singularity.yml b/Clustering_apptainer_optimized.yml
similarity index 74%
rename from Clustering_singularity.yml
rename to Clustering_apptainer_optimized.yml
index c80b498..96e357e 100644
--- a/Clustering_singularity.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -1,38 +1,32 @@
 id: clustering_example_apptainer
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleapptainer
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
 software_backend: apptainer
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
-    envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    conda: envs/clustbench.yml # not used
+    envmodule: na
+    apptainer: envs/clustbench-optimized.sif
+
   rmarkdown:
     description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: envs/rmarkdown.sif
+
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
+    envmodule: na
     apptainer: envs/fcps.sif
-    envmodule: fcps
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -45,10 +39,11 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
-  ## clustbench data ##########################################################
 
   - id: data
+    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
@@ -229,89 +224,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"
diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 7ac1629..61352e1 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -1,38 +1,32 @@
 id: clustering_example_conda
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: http://omnibenchmark.org:9000
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clusteringexampleconda
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
 software_backend: conda
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
     conda: envs/clustbench.yml
     envmodule: clustbench
-    apptainer: envs/clustbench.sif
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: envs/sklearn.sif
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: envs/r.sif
-    envmodule: fcps # not true, but
+    apptainer: na
+
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml
-    apptainer: envs/r.sif # not true, but
-    envmodule: fcps # not true, but
+    envmodule: fcps # not used
+    apptainer: na
+
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
-    apptainer: envs/fcps.sif
     envmodule: fcps
+    apptainer: na
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
@@ -45,6 +39,7 @@ metric_collectors:
     outputs:
       - id: plotting.html
         path: "{input}/{name}/plotting_report.html"
+
 stages:
   ## clustbench data ##########################################################
 
@@ -52,7 +47,7 @@ stages:
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -145,7 +140,7 @@ stages:
           - values: ["--linkage", "centroid"]
       - id: sklearn
         name: "sklearn"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
           #url: /home/imallona/src/clustbench_sklearn
@@ -229,89 +224,3 @@ stages:
     outputs:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
-
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"
diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index f37fd6c..52fb13e 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -1,28 +1,33 @@
 id: clustering_example_envmodules
+
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4
-benchmarker: "Izaskun Mallona, Daniel Incicau"
-benchmark_yaml_spec: 0.5
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
 
 software_backend: envmodules
 
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
+    conda: envs/clustbench.yml # not used
     envmodule: clustbench/0.1.0-foss-2023b
-    conda: envs/clustbench.yml
     apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: rmarkdown
+    apptainer: na
+
   fcps:
     description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
     envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
-    conda: envs/fcps.yml
-    apptainer: na
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    envmodule: rmarkdown # TODO
-    conda: envs/clustbench.yml
     apptainer: na
 
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 61fbd13..784e443 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,7 +1,6 @@
 #!/bin/sh
 CMD=singularity
 BUILD=build --fakeroot
-$CMD $BUILD sklearn_optimized.sif sklearn_singularity_optimized.def
-$CMD $BUILD clustbench-optimized.sif clustbench_singularity.def
-$CMD $BUILD r.sif r_singularity.def
-$CMD $BUILD fcps.sif fcps_singularity.def
+$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
+$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def
+$CMD $BUILD fcps.sif fcps_singularity_optimized.def
diff --git a/envs/sklearn_singularity_optimized.def b/envs/clustbench_apptainer_optimized.def
similarity index 71%
rename from envs/sklearn_singularity_optimized.def
rename to envs/clustbench_apptainer_optimized.def
index 17a131d..d4a316d 100644
--- a/envs/sklearn_singularity_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -15,7 +15,6 @@ From: ubuntu:noble-20250404
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
     apt-get update
 
-
     # Get build dependencies for Python
     apt-get build-dep -y python3
 
@@ -43,17 +42,39 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip3 install \
+    pip install -U pip
+
+    pip install \
       "clustering-benchmarks==1.1.6" \
-      "fastcluster==1.2.6" \
-      "numpy==1.26.4" \
-      "scipy==1.14.1" \
-      "isodate" \
-      "pydantic-core"  \
+      "contourpy==1.3.2" \
+      "cycler==0.12.1" \
+      "cython==3.1.0" \
+      "fonttools==4.58.0" \
       "genieclust==1.1.6" \
+      "joblib==1.5.0" \
+      "kiwisolver==1.4.8" \
+      "matplotlib==3.10.3" \
+      "natsort==8.4.0" \
+      "numpy==2.2.5" \
+      "packaging==25.0" \
       "pandas==2.2.3" \
+      "pillow==11.2.1" \
+      "pyparsing==3.2.3" \
+      "python-dateutil==2.9.0.post0" \
+      "pytz==2025.2" \
+      "scikit-learn==1.6.1" \
+      "scipy==1.15.3" \
+      "six==1.17.0" \
+      "threadpoolctl==3.6.0" \
+      "tzdata==2025.2" \
+      "fastcluster==1.2.6" \
       "gitpython==3.1.43" \
-      wget"
+      "isodate==0.7.2" \
+      "pydantic-core==2.34.1"
+
+    # TODO: can we use something more maintained?
+    pip install --pre "python3-wget==0.0.2-beta1"
+
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
new file mode 100644
index 0000000..1f2b4e3
--- /dev/null
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -0,0 +1,55 @@
+Bootstrap: docker
+From: ubuntu:noble-20250404
+
+%labels
+    Author izaskun.mallona@gmail.com
+    Author ben.uzh@proton.me
+
+%post
+    # Create virtualenv using the default Python
+    mkdir -p /opt && cd /opt
+    python3.12 -m venv "default"
+    . default/bin/activate
+
+    # Install required packages with pip
+
+    pip install -U pip
+
+    pip install \
+      "clustering-benchmarks==1.1.6" \
+      "contourpy==1.3.2" \
+      "cycler==0.12.1" \
+      "cython==3.1.0" \
+      "fonttools==4.58.0" \
+      "genieclust==1.1.6" \
+      "joblib==1.5.0" \
+      "kiwisolver==1.4.8" \
+      "matplotlib==3.10.3" \
+      "natsort==8.4.0" \
+      "numpy==2.2.5" \
+      "packaging==25.0" \
+      "pandas==2.2.3" \
+      "pillow==11.2.1" \
+      "pyparsing==3.2.3" \
+      "python-dateutil==2.9.0.post0" \
+      "pytz==2025.2" \
+      "scikit-learn==1.6.1" \
+      "scipy==1.15.3" \
+      "six==1.17.0" \
+      "threadpoolctl==3.6.0" \
+      "tzdata==2025.2" \
+      "fastcluster==1.2.6" \
+      "gitpython==3.1.43" \
+      "isodate==0.7.2" \
+      "pydantic-core==2.34.1"
+
+    # TODO: can we use something more maintained?
+    pip install --pre "python3-wget==0.0.2-beta1"
+
+
+    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
+
+%environment
+
+    . /opt/default/bin/activate
+
diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def
deleted file mode 100644
index 8c2ae85..0000000
--- a/envs/clustbench_singularity.def
+++ /dev/null
@@ -1,35 +0,0 @@
-Bootstrap: docker 
-From: ubuntu:jammy-20240911.1
-
-%labels
-
-    AUTHOR izaskun.mallona@gmail.com
-
-%post
-    
-    # Install python3.12
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
-
-    # virtualenv
-    cd /opt
-    python3.12 -m venv "default"
-    . default/bin/activate
-    
-    pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \
-      "isodate" "pydantic-core"  \
-      "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43"
-
-    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
-
-%environment
-
-    . /opt/default/bin/activate
diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity_optimized.def
similarity index 79%
rename from envs/fcps_singularity.def
rename to envs/fcps_singularity_optimized.def
index a4a615e..6362b9e 100644
--- a/envs/fcps_singularity.def
+++ b/envs/fcps_singularity_optimized.def
@@ -4,6 +4,7 @@ From: rocker/tidyverse:4.3.3
 %labels
 
     AUTHOR izaskun.mallona@gmail.com
+    AUTHOR ben.uzh@proton.me
 
 %post
 
@@ -13,11 +14,11 @@ From: rocker/tidyverse:4.3.3
         libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
         libgsl-dev
 
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
+    wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz
+    tar -xf Python-3.12.9.tgz
     cd Python-3.12.*/
     ./configure --enable-optimizations
-    make -j 4
+    make -j 8
     make altinstall
 
     # virtualenv
@@ -25,13 +26,15 @@ From: rocker/tidyverse:4.3.3
     python3.12 -m venv "default"
     . default/bin/activate
 
+    # TODO: pin dependencies
     pip install gitpython==3.1.43 isodate pydantic-core
 
     ## no versioning here
+    ## TODO(ben): get same versions as in easyconfig
     Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 
 %environment
-    
+
     . /opt/default/bin/activate
diff --git a/envs/sklearn.yml b/envs/sklearn.yml
deleted file mode 100644
index 258b7ea..0000000
--- a/envs/sklearn.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-name: sklearn
-channels:
-  - conda-forge
-  - nodefaults
-dependencies:
-  - conda-forge::python=3.12.6
-  - conda-forge::scikit-learn
-  - conda-forge::pip
-  - pip:
-    - "pandas"
-    - "argparse"

From e8e0f7eb2313696f7494e65c20d44def8301de63 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:29:28 +0200
Subject: [PATCH 22/45] escape

---
 envs/build_singularity.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 784e443..2dae40a 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 CMD=singularity
-BUILD=build --fakeroot
-$CMD $BUILD clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
-$CMD $BUILD clustbench-optimized.sif clustbench_apptainer_optimized.def
-$CMD $BUILD fcps.sif fcps_singularity_optimized.def
+BUILD='build --fakeroot'
+$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
+$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
+$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def

From a8336fba907ae43ce16678de97a22538addb06e6 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:42:08 +0200
Subject: [PATCH 23/45] install updated python

---
 envs/clustbench_apptainer_vanillapy.def | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 1f2b4e3..5d388bf 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -6,9 +6,19 @@ From: ubuntu:noble-20250404
     Author ben.uzh@proton.me
 
 %post
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update && \
+        apt-get install -y \
+        python3 \
+        python3-venv \
+        python3-pip \
+        ca-certificates \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+
     # Create virtualenv using the default Python
     mkdir -p /opt && cd /opt
-    python3.12 -m venv "default"
+    /usr/bin/python3 -m venv "default"
     . default/bin/activate
 
     # Install required packages with pip
@@ -46,6 +56,8 @@ From: ubuntu:noble-20250404
     # TODO: can we use something more maintained?
     pip install --pre "python3-wget==0.0.2-beta1"
 
+    # Do some cleanup to keep the image slim
+    rm -rf ~/.cache
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From 518c2f6c894b097855a32f7e810783b13e9ec386 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 13:46:47 +0200
Subject: [PATCH 24/45] sync the two build recipes

---
 envs/clustbench_apptainer_optimized.def | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index d4a316d..1e934a8 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -10,6 +10,7 @@ From: ubuntu:noble-20250404
     PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2)
 
     # Update and enable deb-src
+    export DEBIAN_FRONTEND=noninteractive
     apt-get update
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list
     echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list
@@ -19,19 +20,33 @@ From: ubuntu:noble-20250404
     apt-get build-dep -y python3
 
     # Extra dependencies
-    apt-get install -y git python-is-python3 wget zlib1g-dev libbz2-dev libssl-dev libffi-dev
+    apt-get install -y git \
+        python-is-python3 \
+        wget \
+        zlib1g-dev \
+        libbz2-dev \
+        libssl-dev \
+        libffi-dev \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
 
     # Calculate half the number of available cores
     HALF_NPROC=$(( $(nproc) / 2 ))
     # Ensure at least one core is used
     CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 ))
 
-    # Download and build Python with optimizations
+    # Download and build Python from source, with optimizations
+
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/
+
     # Enable all possible optimizations
-    ./configure --enable-optimizations --with-lto --enable-shared LDFLAGS="-Wl,-rpath /usr/local/lib"
+    ./configure \
+        --enable-optimizations \
+        --with-lto \
+        --enable-shared \
+        LDFLAGS="-Wl,-rpath /usr/local/lib"
     make -j ${CORES_TO_USE}
     make altinstall
 
@@ -75,6 +90,8 @@ From: ubuntu:noble-20250404
     # TODO: can we use something more maintained?
     pip install --pre "python3-wget==0.0.2-beta1"
 
+    # Do some cleanup to keep the image slim
+    rm -rf ~/.cache
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From 2f4131f08f967bf0b934a25907e21bb9d54c001c Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:09:01 +0200
Subject: [PATCH 25/45] delete source folder

---
 envs/clustbench_apptainer_optimized.def | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index 1e934a8..eda9ea6 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -37,6 +37,7 @@ From: ubuntu:noble-20250404
 
     # Download and build Python from source, with optimizations
 
+    mkdir ~/src && cd src
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/
@@ -92,6 +93,7 @@ From: ubuntu:noble-20250404
 
     # Do some cleanup to keep the image slim
     rm -rf ~/.cache
+    rm -rf ~/src
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From c72eb273f395e9c5805ee0806cc247a77b783443 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:15:59 +0200
Subject: [PATCH 26/45] add microbenchmark for numpy operations

---
 microbenchmark/microbench.py | 67 ++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 microbenchmark/microbench.py

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
new file mode 100644
index 0000000..3730f9e
--- /dev/null
+++ b/microbenchmark/microbench.py
@@ -0,0 +1,67 @@
+"""
+This script exercises a few common linear algebra operations in numpy.
+It's intended mostly to gauge whether it makes sense to descend into
+compiler optimizations for the Python binary that we ship within the SIF images,
+but it can be easily repurposed for other specific microbenchmarks (e.g., numba or GPU perf gains).
+
+Be aware that here we're profiling simple operations; it would make sense to carefully
+profile the libraries of interest to see where the computational bottlenecks really are.
+
+Usage:
+
+singularity exec clustbench-vanilla.sif python3 microbench.py
+singularity exec clustbench-optimized.sif python3 microbench.py
+"""
+import numpy as np
+import time
+import json
+from statistics import mean, stdev
+
+def run_operation(operation, func, repetitions):
+    timings = []
+    for _ in range(repetitions):
+        start = time.perf_counter()
+        func()
+        elapsed = time.perf_counter() - start
+        timings.append(elapsed)
+    return {
+        'operation': operation,
+        'mean': mean(timings),
+        'stdev': stdev(timings) if len(timings) > 1 else 0.0,  # stdev() needs >= 2 samples
+        'runs': repetitions
+    }
+
+def benchmark(repetitions=50):
+    np.random.seed(42)
+    size = 1000
+
+    # Create random matrices
+    A = np.random.rand(size, size)
+    B = np.random.rand(size, size)
+    C = A @ A.T  # Ensure positive definite for Cholesky
+
+    # Define operations
+    operations = [
+        ('mat_mul', lambda: np.dot(A, B)),
+        ('svd', lambda: np.linalg.svd(A)),
+        ('chol_decomp', lambda: np.linalg.cholesky(C))
+    ]
+
+    results = []
+    for operation, func in operations:
+        try:
+            result = run_operation(operation, func, repetitions)
+        except np.linalg.LinAlgError:
+            result = {
+                'operation': operation,
+                'error': 'Operation failed due to numerical instability'
+            }
+        results.append(result)
+
+    # Output results as JSON
+    print(json.dumps(results, indent=2))
+
+if __name__ == "__main__":
+    import sys
+    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10
+    benchmark(repetitions)

From 937e45599633e3a58af28beece19f038a4fd9513 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:16:41 +0200
Subject: [PATCH 27/45] fix path

---
 envs/clustbench_apptainer_optimized.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index eda9ea6..19726c2 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -37,7 +37,7 @@ From: ubuntu:noble-20250404
 
     # Download and build Python from source, with optimizations
 
-    mkdir ~/src && cd src
+    mkdir ~/src && cd ~/src
     wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz
     tar -xf Python-${PYTHON_VERSION}.tgz
     cd Python-${PYTHON_VERSION}*/

From b0bd85adfed66583b676a3f378f0577701efa5a5 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:24:30 +0200
Subject: [PATCH 28/45] default reps

---
 microbenchmark/microbench.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
index 3730f9e..314e66b 100644
--- a/microbenchmark/microbench.py
+++ b/microbenchmark/microbench.py
@@ -17,6 +17,8 @@
 import json
 from statistics import mean, stdev
 
+DEFAULT_REPETITIONS = 10
+
 def run_operation(operation, func, repetitions):
     timings = []
     for _ in range(repetitions):
@@ -31,7 +33,7 @@ def run_operation(operation, func, repetitions):
         'runs': repetitions
     }
 
-def benchmark(repetitions=50):
+def benchmark(repetitions=DEFAULT_REPETITIONS):
     np.random.seed(42)
     size = 1000
 
@@ -63,5 +65,5 @@ def benchmark(repetitions=50):
 
 if __name__ == "__main__":
     import sys
-    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else 10
+    repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS
     benchmark(repetitions)

From 83f9b07dfcbf25dbfce67d17d92eedbe469e2bd9 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Sun, 11 May 2025 14:40:30 +0200
Subject: [PATCH 29/45] refs

---
 microbenchmark/microbench.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py
index 314e66b..6abc6ee 100644
--- a/microbenchmark/microbench.py
+++ b/microbenchmark/microbench.py
@@ -11,6 +11,8 @@
 
 singularity exec clustbench-vanilla.sif python3 microbench.py
 singularity exec clustbench-optimized.sif python3 microbench.py
+
+References: https://pythonspeed.com/articles/faster-python/
 """
 import numpy as np
 import time

From 744c978643ad5623de1c4b176ab887e7a6127739 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:11:30 +0200
Subject: [PATCH 30/45] duplicate the apptainer clustering yaml

---
 Clustering_apptainer_optimized.yml |  39 +++--
 Clustering_apptainer_vanilla.yml   | 223 +++++++++++++++++++++++++++++
 2 files changed, 241 insertions(+), 21 deletions(-)
 create mode 100644 Clustering_apptainer_vanilla.yml

diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml
index 96e357e..a073683 100644
--- a/Clustering_apptainer_optimized.yml
+++ b/Clustering_apptainer_optimized.yml
@@ -1,6 +1,6 @@
-id: clustering_example_apptainer
-
+id: clustering_example_apptainer_optimized
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+
 version: 1.5
 benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
 benchmark_yaml_spec: 0.4
@@ -10,27 +10,28 @@ software_backend: apptainer
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
+    description: "clustbench on py3.12.9, optimized python build"
     conda: envs/clustbench.yml # not used
     envmodule: na
     apptainer: envs/clustbench-optimized.sif
 
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
+    envmodule: na
+    apptainer: envs/fcps.sif
+
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmarkdown.yml # not used
     envmodule: na
     apptainer: envs/rmarkdown.sif
 
-  fcps:
-    description: "CRAN's FCPS"
-    conda: envs/fcps.yml
-    envmodule: na
-    apptainer: envs/fcps.sif
 
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
@@ -43,11 +44,10 @@ metric_collectors:
 stages:
 
   - id: data
-    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_data
           commit: 366c5a2
@@ -120,16 +120,13 @@ stages:
       - id: data.true_labels
         path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
 
-  ## clustbench methods (fastcluster) ###################################################################
-  
   - id: clustering
     modules:
       - id: fastcluster
         name: "fastcluster algorithm"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_fastcluster
-          # url: /home/imallona/src/clustbench_fastcluster/
           commit: "45e43d3"
         parameters:
           - values: ["--linkage", "complete"]
@@ -138,12 +135,12 @@ stages:
           - values: ["--linkage", "weighted"]
           - values: ["--linkage", "median"]
           - values: ["--linkage", "centroid"]
+
       - id: sklearn
-        name: "sklearn"
-        software_environment: "clustbench"
+        name: sklearn
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_sklearn
-          #url: /home/imallona/src/clustbench_sklearn
           commit: 5877378
         parameters:
           - values: ["--method", "birch"]
@@ -161,8 +158,8 @@ stages:
           - values: ["--linkage", "complete"]
           - values: ["--linkage", "ward"]
       - id: genieclust
-        name: "genieclust"
-        software_environment: "clustbench"
+        name: genieclust
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_genieclust
           commit: 6090043
@@ -201,7 +198,7 @@ stages:
     modules:
       - id: partition_metrics
         name: "clustbench partition metrics"
-        software_environment: "clustbench"
+        software_environment: clustbench
         repository:
           url: https://github.com/imallona/clustbench_metrics
           commit: 9132d45
diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
new file mode 100644
index 0000000..46b8ea4
--- /dev/null
+++ b/Clustering_apptainer_vanilla.yml
@@ -0,0 +1,223 @@
+id: clustering_example_apptainer_vanilla
+
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.5
+benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo"
+benchmark_yaml_spec: 0.4
+
+software_backend: apptainer
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    conda: envs/clustbench.yml # not used
+    envmodule: na
+    apptainer: envs/clustbench-vanilla.sif
+
+  fcps:
+    description: "CRAN's FCPS"
+    conda: envs/fcps.yml # not used
+    envmodule: na
+    apptainer: envs/fcps.sif
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: envs/rmarkdown.sif
+
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters:  # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] #	2	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] #	7	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] #	3	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] #	2, 6	2
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] #	4	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] #	2	1
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] #	2, 4, 5	6
+          - values: ["--dataset_generator", "graves", "--dataset_name", "line"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] #	2, 4	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] #	2	1
+          - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] #	2, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] #	3, 5	2
+          - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] #	3, 5	2
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] #	6	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] #	3	1
+          - values: ["--dataset_generator", "other", "--dataset_name", "square"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] #	7	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] #	4, 5, 6	5
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] #	2	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] #	2	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] #	3, 4	2
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] #	8, 9, 15	3
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] #	3	1
+          - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] #	8	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] #	7	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] #	2	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] #	3	1
+          - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] #	10	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] #	6	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] #	4, 6	2
+          - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] #	2	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] #	4	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] #	3	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] #	5	1
+          - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] #	4	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "weighted"]
+          - values: ["--linkage", "median"]
+          - values: ["--linkage", "centroid"]
+
+      - id: sklearn
+        name: sklearn
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: "5877378"  # quoted: an all-digit short SHA otherwise loads as an integer
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+          # - values: ["--method", "spectral"] ## too slow
+          - values: ["--method", "gm"]
+      - id: agglomerative
+        name: agglomerative
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: "5454368"  # quoted: an all-digit short SHA otherwise loads as an integer
+        parameters:
+          - values: ["--linkage", "average"]
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: genieclust
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: "6090043"  # quoted: an all-digit short SHA otherwise loads as an integer
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+          - values: ["--method", "gic"]
+          - values: ["--method", "ica"]
+      - id: fcps
+        name: "fcps"
+        software_environment: "fcps"
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda
+          - values: ["--method", "FCPS_Minimax"]
+          - values: ["--method", "FCPS_MinEnergy"]
+          - values: ["--method", "FCPS_HDBSCAN_2"]
+          - values: ["--method", "FCPS_HDBSCAN_4"]
+          - values: ["--method", "FCPS_HDBSCAN_8"]
+          - values: ["--method", "FCPS_Diana"]
+          - values: ["--method", "FCPS_Fanny"]
+          - values: ["--method", "FCPS_Hardcl"]
+          - values: ["--method", "FCPS_Softcl"]
+          - values: ["--method", "FCPS_Clara"]
+          - values: ["--method", "FCPS_PAM"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 9132d45
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+          - values: ["--metric", "adjusted_mi_score"]
+          - values: ["--metric", "adjusted_rand_score"]
+          - values: ["--metric", "fm_score"]
+          - values: ["--metric", "mi_score"]
+          - values: ["--metric", "normalized_clustering_accuracy"]  # NOTE(review): same metric as the first entry of this list; confirm the repeat is intended
+          - values: ["--metric", "normalized_mi_score"]
+          - values: ["--metric", "normalized_pivoted_accuracy"]
+          - values: ["--metric", "pair_sets_index"]
+          - values: ["--metric", "rand_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"

From ec18dcf21ce6d23a1c20f88a5431d9a2c040abae Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:15:36 +0200
Subject: [PATCH 31/45] update the oras yaml. not working, just to keep in sync

---
 Clustering_oras.yml | 128 ++++++++------------------------------------
 1 file changed, 22 insertions(+), 106 deletions(-)

diff --git a/Clustering_oras.yml b/Clustering_oras.yml
index 6640461..c6f0d7e 100644
--- a/Clustering_oras.yml
+++ b/Clustering_oras.yml
@@ -1,36 +1,37 @@
-id: clustering_example
+id: clustering_example_oras
 description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs.
-version: 1.2
+version: 1.5
+
 benchmarker: "Izaskun Mallona, Daniel Incicau"
-storage: https://play.min.io
-benchmark_yaml_spec: 0.04
-storage_api: S3
-storage_bucket_name: clustering_example
+benchmark_yaml_spec: 0.4
+
+#storage: https://play.min.io
+#storage_api: S3
+#storage_bucket_name: clustering_example
+
 software_backend: apptainer
+
 software_environments:
+
   clustbench:
     description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml
+    conda: envs/clustbench.yml # not used
     envmodule: clustbench
     apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest
-  sklearn:
-    description: "Daniel's on py3.12.6"
-    conda: envs/sklearn.yml
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest
-    envmodule: clustbench # not true, but
-  R:
-    description: "Daniel's R with readr, dplyr, mclust, caret"
-    conda: envs/r.yml
-    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest
-    envmodule: fcps # not true, but
+
   fcps:
     description: "CRAN's FCPS"
-    conda: envs/fcps.yml
+    conda: envs/fcps.yml # not used
+    envmodule: na
     apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest
-    envmodule: fcps
-stages:
 
-  ## clustbench data ##########################################################
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml # not used
+    envmodule: na
+    apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest
+
+stages:
 
   - id: data
     modules:
@@ -214,88 +215,3 @@ stages:
       - id: metrics.scores
         path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
 
-  # ## daniel's data ###########################################################################
-  
-  # - id: danielsdata
-  #   modules:
-  #     - id: iris_manual
-  #       name: "Iris Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/iris.git
-  #         commit: 47c63f0
-  #     - id: penguins
-  #       name: "Penguins Dataset"
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/penguins.git
-  #         commit: 9032478
-  #   outputs:
-  #     - id: data.features
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv"
-  #     - id: data.labels
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv"
-  
-  # ## daniel's distances ########################################################################
-  
-  # - id: distances
-  #   modules:
-  #     - id: D1
-  #       software_environment: "sklearn"
-  #       parameters:
-  #         - values: ["--measure", "cosine"]
-  #         - values: ["--measure", "euclidean"]
-  #         - values: ["--measure", "manhattan"]
-  #         - values: ["--measure", "chebyshev"]
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/distance.git
-  #         commit: dd99d4f
-  #   inputs:
-  #     - entries:
-  #         - data.features
-  #   outputs:
-  #     - id: distances
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv"
-        
-  # ## daniel's methods ###################################################################
-  
-  # - id: danielmethods
-  #   modules:
-  #     - id: kmeans
-  #       software_environment: "sklearn"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/kmeans.git
-  #         commit: 049c8b1
-  #     - id: ward
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ward.git
-  #         commit: 976e3f3
-  #   inputs:
-  #     - entries:
-  #         - distances
-  #   outputs:
-  #     - id: methods.clusters
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv"
-
-  # ## daniel's metrics ###################################################################
-
-  # - id: danielsmetrics
-  #   modules:
-  #     - id: ari
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/ari.git
-  #         commit: 72708f0
-  #     - id: accuracy
-  #       software_environment: "R"
-  #       repository:
-  #         url: https://github.com/omnibenchmark-example/accuracy.git
-  #         commit: e26b32f
-  #   inputs:
-  #     - entries:
-  #         - methods.clusters
-  #         - data.labels
-  #   outputs:
-  #     - id: metrics.mapping
-  #       path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt"

From cf52a2c7b3595e25488b1f0a007e3d30045fb74b Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:21:02 +0200
Subject: [PATCH 32/45] update the rmarkdown environment

---
 Clustering_envmodules.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml
index 52fb13e..a2112d4 100644
--- a/Clustering_envmodules.yml
+++ b/Clustering_envmodules.yml
@@ -18,7 +18,7 @@ software_environments:
   rmarkdown:
     description: "R with some plotting dependencies"
     conda: envs/rmakrkdown.yml # not used
-    envmodule: rmarkdown
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
     apptainer: na
 
   fcps:
@@ -44,7 +44,6 @@ metric_collectors:
 stages:
 
   - id: data
-    ## clustbench data
     modules:
       - id: clustbench
         name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"

From 934ce8baa625f2877a79958f6091fbd4eae4b96f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 15:34:49 +0200
Subject: [PATCH 33/45] update makefile

---
 Clustering_conda.yml                |  12 +--
 Clustering_conda_smoketest.yml      | 129 +++++++++++++++++++++++++++
 Clustering_envmodules_smoketest.yml | 131 ++++++++++++++++++++++++++++
 Makefile                            |  23 ++++-
 envs/rmarkdown.yml                  |   4 +-
 5 files changed, 289 insertions(+), 10 deletions(-)
 create mode 100644 Clustering_conda_smoketest.yml
 create mode 100644 Clustering_envmodules_smoketest.yml

diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 61352e1..7822761 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -15,18 +15,18 @@ software_environments:
     envmodule: clustbench
     apptainer: na
 
-  rmarkdown:
-    description: "R with some plotting dependencies"
-    conda: envs/rmarkdown.yml
-    envmodule: fcps # not used
-    apptainer: na
-
   fcps:
     description: "CRAN's FCPS"
     conda: envs/fcps.yml
     envmodule: fcps
     apptainer: na
 
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    conda: envs/rmarkdown.yml
+    envmodule: fcps # not used
+    apptainer: na
+
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml
new file mode 100644
index 0000000..15215d7
--- /dev/null
+++ b/Clustering_conda_smoketest.yml
@@ -0,0 +1,129 @@
+id: clustering_example_conda_smoketest  # was a copy of the envmodules id; this definition uses the conda backend
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: conda
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    conda: envs/rmarkdown.yml
+    apptainer: na
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml
new file mode 100644
index 0000000..3fa8e81
--- /dev/null
+++ b/Clustering_envmodules_smoketest.yml
@@ -0,0 +1,131 @@
+id: clustering_example_envmodules
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: envmodules
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.6"
+    envmodule: clustbench/0.1.0-foss-2023b
+    conda: envs/clustbench.yml
+    apptainer: na
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: fcps/1.3.4-foss-2023a-r-4.3.2
+    conda: envs/fcps.yml
+    apptainer: na
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2
+    conda: envs/rmarkdown.yml  # was envs/clustbench.yml; aligned with the rmarkdown entry of the conda smoketest
+    apptainer: na
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  ## clustbench methods (fastcluster) ###################################################################
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: 5877378
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: 5454368
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: 6090043
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"
diff --git a/Makefile b/Makefile
index 875a375..e8e942d 100644
--- a/Makefile
+++ b/Makefile
@@ -6,9 +6,26 @@ prepare_apptainer_env:
 prepare_envmodules_env:
 	cd envs && eb clustbench.eb --robot
 	cd envs && eb fcps.eb --robot
-run_with_apptainer_backend:
-	 ${OB_CMD} -b Clustering_singularity.yml
-	 mv out out_apptainer
+	cd envs && eb rmarkdown.eb --robot
+
+# short versions, to debug runs & environments
+run_with_apptainer_backend_short:
+	 ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml
+	 mv out out_apptainer_short
+run_with_conda_backend_short:
+	 ${OB_CMD} -b Clustering_conda_smoketest.yml
+	 mv out out_conda_short
+run_with_envmodules_backend_short:
+	 ${OB_CMD} -b Clustering_envmodules_smoketest.yml
+	 mv out out_lmod_short
+
+# full versions (expect hours)
+run_with_apptainer_backend_vanilla:
+	 ${OB_CMD} -b Clustering_apptainer_vanilla.yml
+	 mv out out_apptainer_vanilla
+run_with_apptainer_backend_optimized:
+	 ${OB_CMD} -b Clustering_apptainer_optimized.yml
+	 mv out out_apptainer_optimized
 run_with_conda_backend:
 	 ${OB_CMD} -b Clustering_conda.yml
 	 mv out out_conda
diff --git a/envs/rmarkdown.yml b/envs/rmarkdown.yml
index e57969e..ed5c65e 100644
--- a/envs/rmarkdown.yml
+++ b/envs/rmarkdown.yml
@@ -7,6 +7,8 @@ dependencies:
   - conda-forge::python=3.12.6
   - conda-forge::r-argparse
   - conda-forge::r-rmarkdown
+  - conda-forge::r-cairo
+  - conda-forge::r-svglite
   - conda-forge::r-ggplot2
-  - conda-forge::r-tidyr  
+  - conda-forge::r-tidyr
   - bioconda::bioconductor-complexheatmap

From 3890cb48664570aa7a9878dacb299146d69ced5d Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:15:38 +0200
Subject: [PATCH 34/45] add apptainer definition for rmarkdown

---
 envs/build_singularity.sh |  4 +++-
 envs/rmarkdown.def        | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 envs/rmarkdown.def

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index 2dae40a..c34208b 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -2,5 +2,7 @@
 CMD=singularity
 BUILD='build --fakeroot'
 $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
-$CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
+# enable this if you want to compare with the custom python compilation
+# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
 $CMD ${BUILD} fcps.sif fcps_singularity_optimized.def
+$CMD ${BUILD} rmarkdown.sif rmarkdown.def
diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
new file mode 100644
index 0000000..ce7ca1e
--- /dev/null
+++ b/envs/rmarkdown.def
@@ -0,0 +1,38 @@
+Bootstrap: docker
+From: rocker/tidyverse:4.4
+
+%labels
+
+    AUTHOR izaskun.mallona@gmail.com
+    AUTHOR ben.uzh@proton.me
+
+%post
+
+    # Install python (3.12 as of noble)
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update
+    apt-get install -y git \
+        python-is-python3 \
+        python3.12 \
+        python3-virtualenv \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+
+    # virtualenv
+    cd /opt
+    python3.12 -m venv "default"
+    . default/bin/activate
+
+    pip install \
+        "gitpython==3.1.43" \
+        "isodate==0.7.2" \
+        "pydantic-core==2.34.1"
+
+    # Install R packages
+    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))'
+
+    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
+
+%environment
+
+    . /opt/default/bin/activate

From c80adc10844d9251572f00352795be17c01a61a3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:29:08 +0200
Subject: [PATCH 35/45] remove unneeded dependencies

---
 envs/rmarkdown.def | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
index ce7ca1e..8dc75b6 100644
--- a/envs/rmarkdown.def
+++ b/envs/rmarkdown.def
@@ -14,7 +14,7 @@ From: rocker/tidyverse:4.4
     apt-get install -y git \
         python-is-python3 \
         python3.12 \
-        python3-virtualenv \
+        python3.12-venv \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
 
@@ -29,7 +29,7 @@ From: rocker/tidyverse:4.4
         "pydantic-core==2.34.1"
 
     # Install R packages
-    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse", "rmarkdown"))'
+    Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
 

From b19a489cec78d49c57b1c2b9e6cf1c3b0604c1ca Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:31:01 +0200
Subject: [PATCH 36/45] update makefile

---
 Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index e8e942d..f342949 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,9 @@
 MAX_CORES ?= 10
+TIMEOUT ?= 4h
+
 # by default, we want to run all snakemake rules even if there are failures (-k)
-OB_CMD=ob run benchmark -k --local --task-timeout "4h" --cores ${MAX_CORES}
+OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES}
+
 prepare_apptainer_env:
 	cd envs && ./build_singularity.sh
 prepare_envmodules_env:
@@ -14,7 +17,7 @@ run_with_apptainer_backend_short:
 	 mv out out_apptainer_short
 run_with_conda_backend_short:
 	 ${OB_CMD} -b Clustering_conda_smoketest.yml
-	 mv out out_conda
+	 mv out out_conda_short
 run_with_envmodules_backend_short:
 	 ${OB_CMD} -b Clustering_envmodules_smoketest.yml
 	 mv out out_lmod_short

From ebd69b79937e55a68e968d829910c5c6f3d80b70 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:38:13 +0200
Subject: [PATCH 37/45] cleanup r/fcps deps

---
 Clustering_conda.yml                          |  2 +-
 envs/build_singularity.sh                     |  2 +-
 ...cps_singularity_optimized.def => fcps.def} | 29 ++++++++-------
 envs/fcps.eb                                  |  3 +-
 envs/r.yml                                    | 12 ------
 envs/r_singularity.def                        | 37 -------------------
 6 files changed, 19 insertions(+), 66 deletions(-)
 rename envs/{fcps_singularity_optimized.def => fcps.def} (59%)
 delete mode 100644 envs/r.yml
 delete mode 100644 envs/r_singularity.def

diff --git a/Clustering_conda.yml b/Clustering_conda.yml
index 7822761..9e74ee5 100644
--- a/Clustering_conda.yml
+++ b/Clustering_conda.yml
@@ -30,7 +30,7 @@ software_environments:
 metric_collectors:
   - id: plotting
     name: "Single-backend metric collector."
-    software_environment: "rmarkdown"
+    software_environment: rmarkdown
     repository:
       url: https://github.com/imallona/clustering_report
       commit: 1d6bdf5
diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index c34208b..f8596a7 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -4,5 +4,5 @@ BUILD='build --fakeroot'
 $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
-$CMD ${BUILD} fcps.sif fcps_singularity_optimized.def
+$CMD ${BUILD} fcps.sif fcps.def
 $CMD ${BUILD} rmarkdown.sif rmarkdown.def
diff --git a/envs/fcps_singularity_optimized.def b/envs/fcps.def
similarity index 59%
rename from envs/fcps_singularity_optimized.def
rename to envs/fcps.def
index 6362b9e..f4eefcb 100644
--- a/envs/fcps_singularity_optimized.def
+++ b/envs/fcps.def
@@ -1,5 +1,5 @@
 Bootstrap: docker
-From: rocker/tidyverse:4.3.3
+From: rocker/tidyverse:4.4
 
 %labels
 
@@ -8,29 +8,32 @@ From: rocker/tidyverse:4.3.3
 
 %post
 
-    # Install python3.12
+    # Install python (3.12 as of noble)
+    export DEBIAN_FRONTEND=noninteractive
+    apt-get update
+    apt-get install -y git \
+        python-is-python3 \
+        python3.12 \
+        python3.12-venv \
+        && apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
     apt-get update
     apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
         libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
         libgsl-dev
 
-    wget https://www.python.org/ftp/python/3.12.9/Python-3.12.9.tgz
-    tar -xf Python-3.12.9.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 8
-    make altinstall
-
     # virtualenv
     cd /opt
     python3.12 -m venv "default"
     . default/bin/activate
 
-    # TODO: pin dependencies
-    pip install gitpython==3.1.43 isodate pydantic-core
+    pip install \
+        "gitpython==3.1.43" \
+        "isodate==0.7.2" \
+        "pydantic-core==2.34.1"
 
-    ## no versioning here
-    ## TODO(ben): get same versions as in easyconfig
+    # Install R packages
+    ## FIXME no versioning here
     Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))'
 
     echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
diff --git a/envs/fcps.eb b/envs/fcps.eb
index 54c8c7d..4d86bdd 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -15,12 +15,11 @@ dependencies = [
     ('R', '4.3.2'),
     ('Boost', '1.82.0'), 
     ('GSL', '2.7'),
-#    ('arrow-R', '14.0.1', versionsuffix),  # required by RcisTarget
 ]
 
 exts_default_options = {
     'source_urls': [
-	'https://bioconductor.org/packages/release/bioc/src/contrib/',
+       'https://bioconductor.org/packages/release/bioc/src/contrib/',
         'https://cran.r-project.org/src/contrib/Archive/%(name)s',  # package archive
         'https://cran.r-project.org/src/contrib/',  # current version of packages
         'https://cran.freestatistics.org/src/contrib',  # mirror alternative for current packages
diff --git a/envs/r.yml b/envs/r.yml
deleted file mode 100644
index 456e139..0000000
--- a/envs/r.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: r_for_metrics
-channels:
-  - conda-forge
-  - nodefaults
-dependencies:
-  - conda-forge::python=3.12.6
-  - conda-forge::r-mclust
-  - conda-forge::r-caret
-  - conda-forge::r-dplyr
-  - conda-forge::r-readr
-  - conda-forge::r-argparse
-  
diff --git a/envs/r_singularity.def b/envs/r_singularity.def
deleted file mode 100644
index f1f9ec9..0000000
--- a/envs/r_singularity.def
+++ /dev/null
@@ -1,37 +0,0 @@
-Bootstrap: docker
-From: rocker/tidyverse:4.4
-
-%labels
-
-    AUTHOR izaskun.mallona@gmail.com
-
-%post
-
-    # Install python3.12
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git
-
-    wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz
-    tar -xf Python-3.12.6.tgz
-    cd Python-3.12.*/
-    ./configure --enable-optimizations
-    make -j 4
-    make altinstall
-
-    # virtualenv
-    cd /opt
-    python3.12 -m venv "default"
-    . default/bin/activate
-
-    pip install gitpython==3.1.43 isodate pydantic-core
-
-    # Install R packages
-    
-    Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))'
-
-    echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT
-
-%environment
-
-    . /opt/default/bin/activate

From 1afaa2f2830f11563973a1ef9720753b0a47ceec Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:44:36 +0200
Subject: [PATCH 38/45] cleanup image

---
 envs/build_singularity.sh | 2 +-
 envs/fcps.def             | 4 ----
 envs/rmarkdown.def        | 2 ++
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh
index f8596a7..83203c8 100755
--- a/envs/build_singularity.sh
+++ b/envs/build_singularity.sh
@@ -5,4 +5,4 @@ $CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def
 # enable this if you want to compare with the custom python compilation
 # $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def
 $CMD ${BUILD} fcps.sif fcps.def
-$CMD ${BUILD} rmarkdown.sif rmarkdown.def
+$CMD ${BUILD} rmarkdown.sif rmarkdown.def  # TODO: very similar to the fcps image; consider merging the two and dropping this build
diff --git a/envs/fcps.def b/envs/fcps.def
index f4eefcb..922d7f8 100644
--- a/envs/fcps.def
+++ b/envs/fcps.def
@@ -17,10 +17,6 @@ From: rocker/tidyverse:4.4
         python3.12-venv \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
-    apt-get update
-    apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \
-        libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \
-        libgsl-dev
 
     # virtualenv
     cd /opt
diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def
index 8dc75b6..aa20cc1 100644
--- a/envs/rmarkdown.def
+++ b/envs/rmarkdown.def
@@ -1,6 +1,8 @@
 Bootstrap: docker
 From: rocker/tidyverse:4.4
 
+# TODO: we could merge this one with fcps.def, no need to duplicate the image.
+
 %labels
 
     AUTHOR izaskun.mallona@gmail.com

From 9e2168a754e7a93e11867f68f4548f1415301c79 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 16:54:57 +0200
Subject: [PATCH 39/45] update readme

---
 envs/README.md | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/envs/README.md b/envs/README.md
index 69aa5c1..3cab925 100644
--- a/envs/README.md
+++ b/envs/README.md
@@ -1,10 +1,9 @@
 We distribute `Clustering.yml` runs with different backends.
 
-- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip)
-- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files.
-- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images.
-- `Clustering_envmodules.yml`. Easybuilt with default optimization.
-
+- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip)
+- `Clustering_apptainer.yml`. Singularity semi-reproducible, local SIF files.
+- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry.
+- `Clustering_envmodules.yml`. Easybuild backend with default optimization.
 
 ## Conda
 
@@ -12,8 +11,7 @@ We distribute `Clustering.yml` runs with different backends.
 
 - `clustbench.yml`
 - `fcps.yml`
-- `r.yml`
-- `sklearn.yml`
+- `rmarkdown.yml`
 
 ### How to build
 
@@ -23,24 +21,25 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin
 
 ### Files
 
-- `clustbench_singularity.def`
-- `fcps_singularity.def`
-- `r_singularity.def`
-- `sklearn_singularity.def`
+The apptainer images are based on ubuntu-noble docker images.
+
+The "optimized" one does a custom python 3.12 compilation; the vanillapy one ships the default py3.12 interpreter from the official ubuntu docker image.
+
+- `clustbench_apptainer_optimized.def`
+- `clustbench_apptainer_vanillapy.def`
+- `fcps.def`
+- `rmarkdown.def`
 
 ### How to build
 
-- `build_singularity.sh`
+- `make prepare_apptainer_env` from the root folder.
 
 ## Aptainer semi-reproducible and remote
 
-No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry.
+TODO: push to the registry (how?)
 
-## Apptainer (reproducible) with easybuild
-
-Doing...
+No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry.
 
-Lorem ipsum.
 
 ## envmodules - reproducible builds with easybuild
 
@@ -48,11 +47,11 @@ Lorem ipsum.
 
 - `clustbench.eb`
 - `fcps.eb`
+- `rmarkdown.eb`
+- `rmarkdown-python.eb`
 
 ### How to build
 
-1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2
-2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this
-3. `python3-wget` from pypi doesn't look very well maintaned
-4. `eb fcps.eb --robot`
-5. `eb clustbench.eb --robot`
+- `make prepare_envmodules_env` from the root folder.
+- `python3-wget` from pypi doesn't look very well maintained
+

From 6199c0a11bbc88a944d07e4b79bf329fc9c55990 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:00:25 +0200
Subject: [PATCH 40/45] fixes

---
 envs/clustbench.eb | 5 -----
 envs/fcps.eb       | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/envs/clustbench.eb b/envs/clustbench.eb
index 0e86911..daae6dd 100644
--- a/envs/clustbench.eb
+++ b/envs/clustbench.eb
@@ -13,9 +13,6 @@ dependencies = [
     ('SciPy-bundle', '2023.11'),
     ('matplotlib', '3.8.2'),
     ('scikit-learn', '1.4.0'),
-# FIXME: I think this is not needed -- ben
-#    ('meson-python', '0.15.0'),
-#    ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b
 ]
 
 exts_list = [
@@ -48,5 +45,3 @@ exts_list = [
 ]
 
 moduleclass = 'bio'
-
-
diff --git a/envs/fcps.eb b/envs/fcps.eb
index 4d86bdd..692bf0b 100644
--- a/envs/fcps.eb
+++ b/envs/fcps.eb
@@ -13,7 +13,7 @@ builddependencies = [('pkgconf', '1.9.5')]
 
 dependencies = [
     ('R', '4.3.2'),
-    ('Boost', '1.82.0'), 
+    ('Boost', '1.82.0'),
     ('GSL', '2.7'),
 ]
 

From b017cb02a71b83766f831b9bf5b4d483eb8dbe9f Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:18:33 +0200
Subject: [PATCH 41/45] apptainer smoketest

---
 Clustering_apptainer_vanilla.yml           |   4 +-
 Clustering_apptainer_vanilla_smoketest.yml | 129 +++++++++++++++++++++
 2 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 Clustering_apptainer_vanilla_smoketest.yml

diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml
index 46b8ea4..6bc5edd 100644
--- a/Clustering_apptainer_vanilla.yml
+++ b/Clustering_apptainer_vanilla.yml
@@ -10,9 +10,9 @@ software_backend: apptainer
 software_environments:
 
   clustbench:
-    description: "clustbench on py3.12.6"
-    conda: envs/clustbench.yml # not used
+    description: "clustbench on py3.12.3, default python"
     envmodule: na
+    conda: envs/clustbench.yml # not used
     apptainer: envs/clustbench-vanilla.sif
 
   fcps:
diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_vanilla_smoketest.yml
new file mode 100644
index 0000000..99aff2e
--- /dev/null
+++ b/Clustering_apptainer_vanilla_smoketest.yml
@@ -0,0 +1,129 @@
+id: clustering_example_apptainer
+description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4
+benchmarker: "Izaskun Mallona, Daniel Incicau"
+benchmark_yaml_spec: 0.5
+
+software_backend: apptainer
+
+software_environments:
+
+  clustbench:
+    description: "clustbench on py3.12.3, default python"
+    envmodule: na
+    conda: envs/clustbench.yml # not used
+    apptainer: envs/clustbench-vanilla.sif
+
+  fcps:
+    description: "CRAN's FCPS"
+    envmodule: na
+    conda: envs/fcps.yml # not used
+    apptainer: envs/fcps.sif
+
+  rmarkdown:
+    description: "R with some plotting dependencies"
+    envmodule: na
+    conda: envs/rmarkdown.yml # not used
+    apptainer: envs/rmarkdown.sif
+
+metric_collectors:
+  - id: plotting
+    name: "Single-backend metric collector."
+    software_environment: rmarkdown
+    repository:
+      url: https://github.com/imallona/clustering_report
+      commit: 1d6bdf5
+    inputs:
+      - metrics.scores
+    outputs:
+      - id: plotting.html
+        path: "{input}/{name}/plotting_report.html"
+
+stages:
+  - id: data
+    modules:
+      - id: clustbench
+        name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_data
+          commit: 366c5a2
+        parameters: # comments depict the possible cardinalities and the number of curated labelsets
+          - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] #	2	1
+    outputs:
+      - id: data.matrix
+        path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz"
+      - id: data.true_labels
+        path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz"
+
+  - id: clustering
+    modules:
+      - id: fastcluster
+        name: "fastcluster algorithm"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_fastcluster
+          commit: "45e43d3"
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: sklearn
+        name: "sklearn"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_sklearn
+          commit: "5877378"  # quoted: an all-digit short SHA would otherwise load as an integer
+        parameters:
+          - values: ["--method", "birch"]
+          - values: ["--method", "kmeans"]
+      - id: agglomerative
+        name: "agglomerative"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_agglomerative
+          commit: "5454368"  # quoted: an all-digit short SHA would otherwise load as an integer
+        parameters:
+          - values: ["--linkage", "complete"]
+          - values: ["--linkage", "ward"]
+      - id: genieclust
+        name: "genieclust"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_genieclust
+          commit: "6090043"  # quoted: an all-digit short SHA would otherwise load as an integer
+        parameters:
+          - values: ["--method", "genie", "--gini_threshold", 0.5]
+      - id: fcps
+        name: "fcps"
+        software_environment: fcps
+        repository:
+          url: https://github.com/imallona/clustbench_fcps
+          commit: 272fa5f
+        parameters:
+          - values: ["--method", "FCPS_Minimax"]
+    inputs:
+      - entries:
+          - data.matrix
+          - data.true_labels
+    outputs:
+      - id: clustering.predicted_ks_range
+        path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz"
+
+  - id: metrics
+    modules:
+      - id: partition_metrics
+        name: "clustbench partition metrics"
+        software_environment: clustbench
+        repository:
+          url: https://github.com/imallona/clustbench_metrics
+          commit: 8184cd4
+        parameters:
+          - values: ["--metric", "normalized_clustering_accuracy"]
+          - values: ["--metric", "adjusted_fm_score"]
+    inputs:
+      - entries:
+          - clustering.predicted_ks_range
+          - data.true_labels
+    outputs:
+      - id: metrics.scores
+        path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"

From 98777a52be5fc9500e715a42dd1f4e146bc467b6 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:21:29 +0200
Subject: [PATCH 42/45] add git in the image

---
 envs/clustbench_apptainer_vanillapy.def | 1 +
 1 file changed, 1 insertion(+)

diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 5d388bf..63f764a 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -13,6 +13,7 @@ From: ubuntu:noble-20250404
         python3-venv \
         python3-pip \
         ca-certificates \
+        git \
         && apt-get clean && \
         rm -rf /var/lib/apt/lists/*
 

From f4ae29d1600097a42fc906557a085dea97ed8cf0 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Mon, 12 May 2025 17:33:56 +0200
Subject: [PATCH 43/45] try to debug fastcluster problem

---
 envs/clustbench_apptainer_optimized.def | 4 ++--
 envs/clustbench_apptainer_vanillapy.def | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def
index 19726c2..8fc7e08 100644
--- a/envs/clustbench_apptainer_optimized.def
+++ b/envs/clustbench_apptainer_optimized.def
@@ -58,7 +58,7 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip install -U pip
+    pip install -U pip wheel
 
     pip install \
       "clustering-benchmarks==1.1.6" \
@@ -83,7 +83,7 @@ From: ubuntu:noble-20250404
       "six==1.17.0" \
       "threadpoolctl==3.6.0" \
       "tzdata==2025.2" \
-      "fastcluster==1.2.6" \
+      "fastcluster==1.3.0" \
       "gitpython==3.1.43" \
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"
diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def
index 63f764a..ff9dd91 100644
--- a/envs/clustbench_apptainer_vanillapy.def
+++ b/envs/clustbench_apptainer_vanillapy.def
@@ -24,7 +24,7 @@ From: ubuntu:noble-20250404
 
     # Install required packages with pip
 
-    pip install -U pip
+    pip install -U pip wheel
 
     pip install \
       "clustering-benchmarks==1.1.6" \
@@ -49,7 +49,7 @@ From: ubuntu:noble-20250404
       "six==1.17.0" \
       "threadpoolctl==3.6.0" \
       "tzdata==2025.2" \
-      "fastcluster==1.2.6" \
+      "fastcluster==1.3.0" \
       "gitpython==3.1.43" \
       "isodate==0.7.2" \
       "pydantic-core==2.34.1"

From 72cdc598acfd10c2fd73bee49f7b66fdd6a62591 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 14 May 2025 13:26:23 +0200
Subject: [PATCH 44/45] fail if the exit code fails

---
 .github/workflows/benchmark.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2a55846..e22b368 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -59,7 +59,7 @@ jobs:
 
       - name: Run benchmark
         shell: bash -l {0}
-        continue-on-error: true
+        continue-on-error: false
         run: |
           echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error
 
@@ -98,7 +98,7 @@ jobs:
 
       - name: Deploy to GitHub Pages
         uses: actions/deploy-pages@v4
-          
+
       - name: Create Job Summary
         if: always()
         run: |
@@ -106,4 +106,3 @@ jobs:
           echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY
           echo "### All Outputs" >> $GITHUB_STEP_SUMMARY
           echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY
-    

From 01243de1b555e2e5d4e7b31228d66d8a335edcb3 Mon Sep 17 00:00:00 2001
From: ben <ben.uzh@proton.me>
Date: Wed, 14 May 2025 13:29:16 +0200
Subject: [PATCH 45/45] use conda short for test

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index e22b368..b6cb977 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -61,7 +61,7 @@ jobs:
         shell: bash -l {0}
         continue-on-error: false
         run: |
-          echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error
+          echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error
 
   upload-artifact:
     name: Benchmark Artifact