diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1a1e82..cb744c4 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,7 +18,6 @@ jobs: run-benchmark: name: Run Benchmark runs-on: ubuntu-latest - ## runs-on: self-hosted steps: - name: Check out repository uses: actions/checkout@v4 @@ -49,27 +48,26 @@ jobs: shell: bash -l {0} run: | mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope + pip install git+https://github.com/omnibenchmark/omnibenchmark.git@main - name: Load benchmark cache id: cache-benchmark uses: actions/cache@v3 with: path: out/ - key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering.yaml') }} + key: benchmark-${{ runner.os }}-${{ hashFiles('Clustering_conda_short.yml') }} - name: Run benchmark shell: bash -l {0} - continue-on-error: true + continue-on-error: false run: | - echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error + ob run benchmark -b Clustering_conda_short.yml --local --cores 3 --continue-on-error --yes upload-artifact: name: Benchmark Artifact runs-on: ubuntu-latest - ## runs-on: self-hosted needs: run-benchmark - if: always() + if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark' steps: - name: Check out repository uses: actions/checkout@v4 @@ -100,7 +98,7 @@ jobs: - name: Deploy to GitHub Pages uses: actions/deploy-pages@v4 - + - name: Create Job Summary if: always() run: | @@ -108,4 +106,3 @@ jobs: echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY echo "### All Outputs" >> $GITHUB_STEP_SUMMARY echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY - diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d38534 --- /dev/null +++ b/.gitignore @@ 
-0,0 +1,10 @@ +# image build artifacts +envs/*.sif + +# snakemake +snakemake.log +.snakemake/ + +# vim swaps +*.swp +*.swo diff --git a/Clustering.yaml b/Clustering.yaml deleted file mode 100644 index 0007ea5..0000000 --- a/Clustering.yaml +++ /dev/null @@ -1,232 +0,0 @@ -id: clustering_example -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. -version: 1.2 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example -software_backend: conda -software_environments: - clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but - fcps: - description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps -metric_collectors: - - id: plotting - name: "Single-backend metric collector." 
- software_environment: "rmarkdown" - repository: - url: https://github.com/imallona/clustering_report - commit: 1d6bdf5 - inputs: - - metrics.scores - outputs: - - id: plotting.html - path: "{input}/{name}/plotting_report.html" - -stages: - ## clustbench data ########################################################## - - - id: data - modules: - - id: clustbench - name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets - - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 - # - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 - # - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - # - values: ["--dataset_generator", "graves", "--dataset_name", 
"ring_outliers"] # 2, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - # - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - # - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - # - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 - # - values: ["--dataset_generator", "uci", "--dataset_name", 
"yeast"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - # - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - # - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 - outputs: - - id: data.matrix - path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - - id: data.true_labels - path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) 
################################################################### - - - id: clustering - modules: - - id: fastcluster - name: "fastcluster algorithm" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ - commit: "45e43d3" - parameters: - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - # - values: ["--linkage", "average"] - # - values: ["--linkage", "weighted"] - # - values: ["--linkage", "median"] - # - values: ["--linkage", "centroid"] - - id: sklearn - name: "sklearn" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn - commit: 5877378 - parameters: - - values: ["--method", "birch"] - - values: ["--method", "kmeans"] - # - values: ["--method", "spectral"] ## too slow - # - values: ["--method", "gm"] - - id: agglomerative - name: "agglomerative" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_agglomerative - commit: 5454368 - parameters: - # - values: ["--linkage", "average"] - - values: ["--linkage", "complete"] - - values: ["--linkage", "ward"] - - id: genieclust - name: "genieclust" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_genieclust - commit: 6090043 - parameters: - - values: ["--method", "genie", "--gini_threshold", 0.5] - - values: ["--method", "gic"] - # - values: ["--method", "ica"] - - id: fcps - name: "fcps" - software_environment: "fcps" - repository: - url: https://github.com/imallona/clustbench_fcps - commit: 272fa5f - parameters: - # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda - - values: ["--method", "FCPS_Minimax"] - - values: ["--method", "FCPS_MinEnergy"] - # - values: ["--method", "FCPS_HDBSCAN_2"] - # - values: ["--method", "FCPS_HDBSCAN_4"] - # - values: ["--method", "FCPS_HDBSCAN_8"] - # 
- values: ["--method", "FCPS_Diana"] - # - values: ["--method", "FCPS_Fanny"] - # - values: ["--method", "FCPS_Hardcl"] - # - values: ["--method", "FCPS_Softcl"] - # - values: ["--method", "FCPS_Clara"] - # - values: ["--method", "FCPS_PAM"] - inputs: - - entries: - - data.matrix - - data.true_labels - outputs: - - id: clustering.predicted_ks_range - path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - - id: metrics - modules: - - id: partition_metrics - name: "clustbench partition metrics" - software_environment: "clustbench" - repository: - url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 - parameters: - - values: ["--metric", "normalized_clustering_accuracy"] - - values: ["--metric", "adjusted_fm_score"] - # - values: ["--metric", "adjusted_mi_score"] - # - values: ["--metric", "adjusted_rand_score"] - # - values: ["--metric", "fm_score"] - # - values: ["--metric", "mi_score"] - # - values: ["--metric", "normalized_clustering_accuracy"] - # - values: ["--metric", "normalized_mi_score"] - # - values: ["--metric", "normalized_pivoted_accuracy"] - # - values: ["--metric", "pair_sets_index"] - # - values: ["--metric", "rand_score"] - inputs: - - entries: - - clustering.predicted_ks_range - - data.true_labels - outputs: - - id: metrics.scores - path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_oras.yml b/Clustering_apptainer.yml similarity index 67% rename from Clustering_oras.yml rename to Clustering_apptainer.yml index 6640461..e075d81 100644 --- a/Clustering_oras.yml +++ b/Clustering_apptainer.yml @@ -1,46 +1,45 @@ -id: clustering_example -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. 
-version: 1.2 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example -software_backend: apptainer +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest - envmodule: fcps + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" stages: - - ## clustbench data ########################################################## - - id: data modules: - id: clustbench - name: "clustbench datasets" - software_environment: "clustbench" + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -102,23 +101,19 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 outputs: - id: data.matrix path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: 
["--linkage", "complete"] @@ -128,11 +123,10 @@ stages: - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -150,8 +144,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -185,12 +179,11 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -213,89 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances 
######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry. 
+software_backend: apptainer diff --git a/Clustering_apptainer_optimized.yml b/Clustering_apptainer_optimized.yml new file mode 100644 index 0000000..d536ddc --- /dev/null +++ b/Clustering_apptainer_optimized.yml @@ -0,0 +1,211 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 +software_environments: + clustbench: + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b + apptainer: envs/clustbench-optimized.sif + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: envs/fcps.sif + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - 
values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: 
["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", 
"--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", 
"genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: ["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. 
Using apptainer (locally built image, optimized python) +software_backend: apptainer diff --git a/Clustering_apptainer_optimized_short.yml b/Clustering_apptainer_optimized_short.yml new file mode 100644 index 0000000..5bbd791 --- /dev/null +++ b/Clustering_apptainer_optimized_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: envs/clustbench-optimized.sif + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: envs/fcps.sif + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python) +software_backend: apptainer diff --git a/Clustering_apptainer_short.yml b/Clustering_apptainer_short.yml new file mode 100644 index 0000000..71bdd6f --- /dev/null +++ b/Clustering_apptainer_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend 
metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry. +software_backend: apptainer diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml new file mode 100644 index 0000000..cd4ba56 --- /dev/null +++ b/Clustering_apptainer_vanilla.yml @@ -0,0 +1,211 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 +software_environments: + clustbench: + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml + envmodule: clustbench/0.1.0-foss-2023b + apptainer: envs/clustbench.sif + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: envs/fcps.sif + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: 
["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + 
- values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: 
["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: ["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + 
name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image) +software_backend: apptainer diff --git a/Clustering_apptainer_vanilla_short.yml b/Clustering_apptainer_vanilla_short.yml new file mode 100644 index 0000000..01a1fe2 --- /dev/null +++ b/Clustering_apptainer_vanilla_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: envs/clustbench.sif + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: envs/fcps.sif + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: envs/fcps.sif +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image) +software_backend: apptainer diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7ac1629..5fd45d2 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,42 +1,27 @@ -id: clustering_example_conda -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleconda -software_backend: conda +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -46,17 +31,15 @@ metric_collectors: - id: plotting.html path: "{input}/{name}/plotting_report.html" stages: - ## clustbench data ########################################################## - - id: data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -124,17 +107,13 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -144,11 +123,10 @@ stages: - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: 
/home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -166,8 +144,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -201,12 +179,11 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -229,89 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: 
https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. 
+software_backend: conda diff --git a/Clustering_conda_short.yml b/Clustering_conda_short.yml new file mode 100644 index 0000000..fd9ae01 --- /dev/null +++ b/Clustering_conda_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. +software_backend: conda diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 3c2b8bd..805e130 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,42 +1,27 @@ -id: clustering_example_envmodules -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleenvmodules -software_backend: envmodules +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 software_environments: clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -46,17 +31,15 @@ metric_collectors: - id: plotting.html path: "{input}/{name}/plotting_report.html" stages: - ## clustbench data ########################################################## - - id: data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 @@ -124,17 +107,13 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -144,11 +123,10 @@ stages: - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: 
/home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -166,8 +144,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -201,12 +179,11 @@ stages: outputs: - id: clustering.predicted_ks_range path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" - - id: metrics modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -229,89 +206,6 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: 
https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. 
+software_backend: envmodules diff --git a/Clustering_envmodules_short.yml b/Clustering_envmodules_short.yml new file mode 100644 index 0000000..e3dc0fd --- /dev/null +++ b/Clustering_envmodules_short.yml @@ -0,0 +1,121 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 +software_environments: + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: 
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. +software_backend: envmodules diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..71b6860 --- /dev/null +++ b/Makefile @@ -0,0 +1,78 @@ +MAX_CORES ?= 10 +TIMEOUT ?= 4h +YQ_MERGE=yq eval-all 'select(fileIndex==1) * select(fileIndex==0)' +YQ_REPOS=yq '.stages[].modules[] | .id + ": " + .repository.url + "@" + .repository.commit' + +# by default, we want to run all snakemake rules even if there are failures (-k) +OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} --yes + +APPTR = apptainer +APPTV = apptainer_vanilla +APPTO = apptainer_optimized +CONDA = conda +ENVMD = envmodules + +BASE = base.yml +BASE_SHORT = smoketest/base.yml + +# Install dependencies to generate files (requires go in the system) +deps: + go install github.com/mikefarah/yq/v4@latest + +# Generate all the yaml files from base + overrides +.SILENT: generate +generate: + ${YQ_MERGE} overrides/${APPTR}.yml ${BASE} > Clustering_${APPTR}.yml + ${YQ_MERGE} overrides/${APPTV}.yml ${BASE} > Clustering_${APPTV}.yml + ${YQ_MERGE} overrides/${APPTO}.yml ${BASE} > 
Clustering_${APPTO}.yml + ${YQ_MERGE} overrides/${CONDA}.yml ${BASE} > Clustering_${CONDA}.yml + ${YQ_MERGE} overrides/${ENVMD}.yml ${BASE} > Clustering_${ENVMD}.yml + ${YQ_MERGE} overrides/${APPTR}.yml ${BASE_SHORT} > Clustering_${APPTR}_short.yml + ${YQ_MERGE} overrides/${APPTV}.yml ${BASE_SHORT} > Clustering_${APPTV}_short.yml + ${YQ_MERGE} overrides/${APPTO}.yml ${BASE_SHORT} > Clustering_${APPTO}_short.yml + ${YQ_MERGE} overrides/${CONDA}.yml ${BASE_SHORT} > Clustering_${CONDA}_short.yml + ${YQ_MERGE} overrides/${ENVMD}.yml ${BASE_SHORT} > Clustering_${ENVMD}_short.yml + echo "[+] The following files have been generated:" + ls Clustering_*.yml + echo "[+] You can use 'make clean' to delete them" + +clean: + rm Clustering_*.yml + +prepare_apptainer_env: + cd envs && ./build_singularity.sh +prepare_envmodules_env: + cd envs && eb clustbench.eb --robot + cd envs && eb fcps.eb --robot + cd envs && eb rmarkdown.eb --robot + +# short versions, to debug runs & environments +run_with_apptainer_backend_short: + ${OB_CMD} -b Clustering_${APPTR}_short.yml + mv out out_${APPTR}_short-$(shell date +'%Y%m%d%H%M') +run_with_apptainer_backend_vanilla_short: + ${OB_CMD} -b Clustering_${APPTV}_short.yml + mv out out_${APPTV}_short-$(shell date +'%Y%m%d%H%M') +run_with_conda_backend_short: + ${OB_CMD} -b Clustering_${CONDA}_short.yml + mv out out_${CONDA}_short-$(shell date +'%Y%m%d%H%M') +run_with_envmodules_backend_short: + ${OB_CMD} -b Clustering_${ENVMD}_short.yml + mv out out_${ENVMD}_short-$(shell date +'%Y%m%d%H%M') + +# full versions (expect hours) +run_with_apptainer_backend: + ${OB_CMD} -b Clustering_${APPTR}.yml + mv out out_${APPTR}-$(shell date +'%Y%m%d%H%M') +run_with_apptainer_backend_vanilla: + ${OB_CMD} -b Clustering_${APPTV}.yml + mv out out_${APPTV}-$(shell date +'%Y%m%d%H%M') +run_with_conda_backend: + ${OB_CMD} -b Clustering_${CONDA}.yml + mv out out_${CONDA}-$(shell date +'%Y%m%d%H%M') +run_with_envmodules_backend: + ${OB_CMD} -b Clustering_${ENVMD}.yml + 
mv out out_${ENVMD}-$(shell date +'%Y%m%d%H%M') + +extract_modules: + @${YQ_REPOS} base.yml diff --git a/README.md b/README.md index a75c594..7859113 100644 --- a/README.md +++ b/README.md @@ -2,29 +2,30 @@ A clustering example for omnibenchmark # How to run -1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) -2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` -3. Move to the cloned repository `cd clustering_example` -4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). +1. Install omnibenchmark: `pip install "omnibenchmark>=0.2.0"` +2. Clone the benchmark definition in this repository with `git clone https://github.com/omnibenchmark/clustering_example` +3. Move into the cloned folder: `cd clustering_example` +4. Run locally, with the desired degree of parallelism: + `ob run benchmark -b Clustering_conda.yml --local --cores 6`. + Choose your `Clustering_*.yml` specification based on the backend you want to run (conda, easybuild or apptainer). [More details about the available backends and how to build or enable them](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). -# Clustbench attribution +# Software backends and variants -by Marek Gagolewski, modified by Izaskun Mallona +* All needed recipes can be found under `envs`: conda, apptainer, easybuild (lmod modules) +* The `_short` variants are meant for [quick testing](https://en.wikipedia.org/wiki/Smoke_testing_(software)) +* The default `apptainer` container fetches images from an online registry. 
+* `apptainer-vanilla` makes reference to a container image with stock python (`3.12`) +* `apptainer-optimized` makes reference to a container image with a custom compiled python (`3.12.9`), just to check if optimization flags have a noticeable effect. +* `envmodules` will need you to previously build the `.eb` easyconfigs with easybuild. We plan to make these modules publicly available in the future. +* `conda` environments will fetch software from the configured conda channels and pypi. Does not compile anything, fetches pre-built binaries (assuming there's a build in those channels for your architecture, that is) -# Data disclaimer +[More info in the envs/ folder](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). -Some datasets are commented out to speed up calculations. - -From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082): - -> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1. - -A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h. # Summary - Data. 
Example datasets (not a comprehensive list, it's >79 of them): - - https://github.com/imallona/clustbench_data + - https://github.com/imallona/clustbench_data - args: ["--dataset_generator", "mnist", "--dataset_name", "fashion"] - args: ["--dataset_generator", "other", "--dataset_name", "iris"] - args: ["--dataset_generator", "mnist", "--dataset_name", "digits"] @@ -37,7 +38,7 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo - args: ["--linkage", "weighted"] - args: ["--linkage", "median"] - args: ["--linkage", "centroid"] - - https://github.com/imallona/clustbench_sklearn + - https://github.com/imallona/clustbench_sklearn - args: ["--method", "birch"] - args: ["--method", "kmeans"] - args: ["--method", "spectral"] ## too slow @@ -84,14 +85,29 @@ A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blo - https://github.com/omnibenchmark-example/ward.git - https://github.com/omnibenchmark-example/ari.git - https://github.com/omnibenchmark-example/accuracy.git - - -# Software backends -In `envs`: conda, apptainer, easybuild (lmod modules) + +# Omnibenchmark YAML generation + +The current repo has base templates for different runs. +Install [yq](https://github.com/mikefarah/yq) and run `make generate` if you want to modify the base template in your tests. # Warnings Mind we try to run clusterings specifying the true number of clusters +- 2. But sometimes the true number is k=3. Then we do `k=2, k=2, k=3, k=5, k=6` filling with k=2s as needed, and recomputing the same values multiple times (so runtimes are comparable across datasets, regardless of their true number of clusters). Also, we have modules by Daniel not fully incorporated into Gagolewski's flow. + +# Data disclaimer + +Some datasets are commented out to speed up calculations. 
+ +From [Are cluster validity measures (in) valid?](https://www.sciencedirect.com/science/article/pii/S0020025521010082): + +> The original benchmark battery consists of 79 data instances, however 16 datasets are accompanied by labels that yield ; they were omitted for their computation would be too lengthy (namely: mnist/digits, mnist/fashion, other/chameleon_t7_10k, other/chameleon_t8_8k, sipu/a1, sipu/a2, sipu/a3, sipu/birch1, sipu/birch2, sipu/d31, sipu/s1, sipu/s2, sipu/s3, sipu/s4, sipu/worms_2, sipu/worms_64). Also uci/glass has been removed as one of its 25-near-neighbour graph’s connected components was too small for the NN-based methods to succeed. This leaves us with 62 datasets in total, see Table 1. + +A yaml such as [0a88c91](https://github.com/omnibenchmark/clustering_example/blob/0a88c910bbda62d1b593f4215a682770227f39ff/Clustering.yaml) with 30 cores should run half of the stuff in ~4 h and reach 97% completion in ~8h. + +# Clustbench attribution + +by Marek Gagolewski, modified by Izaskun Mallona diff --git a/Clustering_singularity.yml b/base.yml similarity index 69% rename from Clustering_singularity.yml rename to base.yml index c80b498..bfadca6 100644 --- a/Clustering_singularity.yml +++ b/base.yml @@ -1,42 +1,32 @@ -id: clustering_example_apptainer -description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleapptainer -software_backend: apptainer +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_environments: + clustbench: - description: "clustbench on py3.12.6" + description: "clustbench on py3.12.9, optimized python build" conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + envmodule: clustbench/0.1.0-foss-2023b + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -45,17 +35,18 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data - commit: 366c5a2 + commit: 31ac323 + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 @@ -125,16 +116,13 @@ stages: - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -143,12 +131,12 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] + - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -166,8 +154,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] 
- id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -206,7 +194,7 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -230,88 +218,3 @@ stages: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # 
modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/envs/README.md b/envs/README.md index 69aa5c1..335a9d7 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,58 +1,61 @@ We distribute `Clustering.yml` runs with different backends. -- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip) -- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images. -- `Clustering_envmodules.yml`. Easybuilt with default optimization. +- `Clustering_envmodules.yml`. Easybuild backend with default optimization. +- `Clustering_apptainer.yml`. Apptainer, pinned, prebuilt remote images from [omnibenchmark's registry](https://quay.io/organization/omnibenchmark). +- `Clustering_apptainer_vanilla.yml`. Singularity, pinned, from local SIF images. +- `Clustering_apptainer_optimized.yml`. Singularity, pinned, from local SIF images. This image compiles a custom python with optimization flags. 
+- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) +The `_short` variants are meant to run smoketests and check that there are no operational problems (abnormal terminations, etc.) when running the environments. -## Conda + +## envmodules - reproducible builds with easybuild ### Files -- `clustbench.yml` -- `fcps.yml` -- `r.yml` -- `sklearn.yml` +- `clustbench.eb` +- `fcps.eb` +- `rmarkdown.eb` +- `rmarkdown-python.eb` ### How to build -No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clustering_conda.yml --local` do it. +- `make prepare_envmodules_env` from the root folder. -## Apptainer semi-reproducible and local +## Apptainer, pinned, with registry pull -### Files +No need to prepare/build anything, since it fetches the apptainer images from a remote registry. -- `clustbench_singularity.def` -- `fcps_singularity.def` -- `r_singularity.def` -- `sklearn_singularity.def` +```bash +make run_with_apptainer_backend +``` -### How to build +## Apptainer, pinned, local build -- `build_singularity.sh` +### Files -## Aptainer semi-reproducible and remote +The apptainer images are based on ubuntu-noble docker images. -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. +The "optimized" flavor does a custom python 3.12 compilation; the "vanillapy" flavor uses the stock py3.12 interpreter from the official ubuntu docker image. -## Apptainer (reproducible) with easybuild +- `clustbench_apptainer_optimized.def` +- `clustbench_apptainer_vanillapy.def` +- `fcps.def` -Doing... +### How to build the SIF images -Lorem ipsum. +- `make prepare_apptainer_env` from the root folder. -## envmodules - reproducible builds with easybuild +## Conda ### Files -- `clustbench.eb` -- `fcps.eb` +- `clustbench.yml` +- `fcps.yml` +- `rmarkdown.yml` ### How to build -1. 
Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2 -2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this -3. `python3-wget` from pypi doesn't look very well maintaned -4. `eb fcps.eb --robot` -5. `eb clustbench.eb --robot` +No need to `ob software conda pin / prepare`. Just use `ob run benchmark -b Clustering_conda.yml --local`. + + diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh old mode 100644 new mode 100755 index 86e053f..099c4c1 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,9 +1,11 @@ -#!/bin/bash - -sudo singularity build sklearn.sif sklearn_singularity.def - -sudo singularity build clustbench.sif clustbench_singularity.def - -sudo singularity build r.sif r_singularity.def - -sudo singularity build fcps.sif fcps_singularity.def +#!/bin/sh +# Builds singularity images. 
+# Installation guide: check https://apptainer.org/docs/user/latest/quick_start.html#installation +# Additionally, you will need: +# apt install fakeroot uidmap +CMD=singularity +BUILD='build --fakeroot' +# enable this if you want to compare with the custom python compilation +# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD ${BUILD} clustbench.sif clustbench_apptainer_vanillapy.def +$CMD ${BUILD} fcps.sif fcps.def diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 22597fb..7064c67 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -1,108 +1,42 @@ -## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4 - easyblock = 'PythonBundle' name = 'clustbench' -version = '1' +version = '0.1.0' -homepage = 'https://python.org/' +homepage = 'https://omnibenchmark.org' description = "Bundle of Python packages for ob clustering_example" toolchain = {'name': 'foss', 'version': '2023b'} - dependencies = [ ('Python', '3.11.5'), - ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ('SciPy-bundle', '2023.11'), - ('meson-python', '0.15.0'), ('matplotlib', '3.8.2'), - ('scikit-learn', '1.4.0') - + ('scikit-learn', '1.4.0'), ] -sanity_pip_check = True -use_pip = True - -exts_default_options = { - 'sanity_pip_check': True, - 'use_pip' : True -} - -## https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz -## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip -## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz -## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz -## 
https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -source_urls = [PYPI_SOURCE, - 'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/', - 'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/', - 'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/', - 'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/', - 'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa', - 'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/', - 'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/', - 'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/', - 'https://github.com/pybind/pybind11/archive/', - 'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/', - 'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz', - 'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz', - 
'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz', - 'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz', - 'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz', - 'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz', - 'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz'] - - -## caution download genieclust here, not pypi, they differ and pypi's it's not installable! -## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/ -## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz -## todo automate this within the easyconfig! 
- exts_list = [ ('natsort', '8.4.0', { 'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'], }), - ('cython', '3.0.11', { - 'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'], - }), ('hypothesis', '6.124.7', { 'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'], }), - ('numpy', '1.26.4', { - 'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'], - }), ('fastcluster', '1.2.6', { 'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'], }), - ('genieclust', '1.1.6', { - 'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'], - }), ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], }), - ('python3-wget', '0.0.2-beta1', { - 'modulename': 'wget', - 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], + ('genieclust', '1.1.6', { + 'download_dep_fail': False, + 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', + 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], }), - ('clustering_benchmarks', '1.1.5', { + ('clustering_benchmarks', '1.1.6', { 'modulename': 'clustbench', - 'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'], + 'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'], }), ] -sanity_check_paths = { - 'files': [], - 'dirs': ['lib/python3.11/site-packages/clustbench/'] -} - moduleclass = 'bio' - - diff --git a/envs/clustbench.yml b/envs/clustbench.yml index 6cb6201..f894c22 100644 --- a/envs/clustbench.yml +++ b/envs/clustbench.yml @@ -6,9 +6,7 @@ dependencies: - conda-forge::python=3.12.6 - conda-forge::pip - pip: - #- "clustering-benchmarks==1.1.5" - - 
'https://github.com/gagolews/clustering-benchmarks/releases/download/v1.1.5/clustering_benchmarks-1.1.5.tar.gz' - - "wget" + - "clustering-benchmarks==1.1.6" - "fastcluster==1.2.6" - "numpy==1.26.4" - "scipy==1.14.1" diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def new file mode 100644 index 0000000..846bae3 --- /dev/null +++ b/envs/clustbench_apptainer_optimized.def @@ -0,0 +1,100 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + PYTHON_VERSION=3.12.9 + PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) + + # Update and enable deb-src + export DEBIAN_FRONTEND=noninteractive + apt-get update + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list + apt-get update + + # Get build dependencies for Python + apt-get build-dep -y python3 + + # Extra dependencies + apt-get install -y git \ + python-is-python3 \ + wget \ + zlib1g-dev \ + libbz2-dev \ + libssl-dev \ + libffi-dev \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # Calculate half the number of available cores + HALF_NPROC=$(( $(nproc) / 2 )) + # Ensure at least one core is used + CORES_TO_USE=$(( HALF_NPROC > 0 ? 
HALF_NPROC : 1 )) + + # Download and build Python from source, with optimizations + + mkdir ~/src && cd ~/src + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz + tar -xf Python-${PYTHON_VERSION}.tgz + cd Python-${PYTHON_VERSION}*/ + + # Enable all possible optimizations + ./configure \ + --enable-optimizations \ + --with-lto \ + --enable-shared \ + LDFLAGS="-Wl,-rpath /usr/local/lib" + make -j ${CORES_TO_USE} + make altinstall + + # Create virtualenv using the locally built Python + cd /opt + /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" + . default/bin/activate + + # Install required packages with pip + + pip install -U pip wheel + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.3.0" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Do some cleanup to keep the image slim + rm -rf ~/.cache + rm -rf ~/src + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . 
/opt/default/bin/activate + diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def new file mode 100644 index 0000000..a40366c --- /dev/null +++ b/envs/clustbench_apptainer_vanillapy.def @@ -0,0 +1,65 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + export DEBIAN_FRONTEND=noninteractive + apt-get update && \ + apt-get install -y \ + python3 \ + python3-venv \ + python3-pip \ + ca-certificates \ + git \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # Create virtualenv using the default Python + mkdir -p /opt && cd /opt + /usr/bin/python3 -m venv "default" + . default/bin/activate + + # Install required packages with pip + + pip install -U pip wheel + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.3.0" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Do some cleanup to keep the image slim + rm -rf ~/.cache + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . 
/opt/default/bin/activate + diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def deleted file mode 100644 index 8c2ae85..0000000 --- a/envs/clustbench_singularity.def +++ /dev/null @@ -1,35 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/fcps.def b/envs/fcps.def new file mode 100644 index 0000000..a4996d6 --- /dev/null +++ b/envs/fcps.def @@ -0,0 +1,39 @@ +Bootstrap: docker +From: rocker/tidyverse:4.4 + +%labels + + AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me + +%post + + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3.12-venv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # virtualenv + cd /opt + python3.12 -m venv "default" + . 
default/bin/activate + + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Install R packages + ## FIXME no versioning here + Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "caret", "DataVisualizations", "FCPS", "cclust"))' + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate diff --git a/envs/fcps.eb b/envs/fcps.eb index ee3db52..692bf0b 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -1,6 +1,3 @@ -## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699 -## Caution boost easyconfig needs update https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2) - easyblock = 'Bundle' name = 'fcps' @@ -16,17 +13,13 @@ builddependencies = [('pkgconf', '1.9.5')] dependencies = [ ('R', '4.3.2'), - ('Boost', '1.82.0'), + ('Boost', '1.82.0'), ('GSL', '2.7'), -# ('arrow-R', '14.0.1', versionsuffix), # required by RcisTarget ] exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/', - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s', - 'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/', - 'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages @@ -192,13 +185,15 @@ exts_list = [ ('cluster', '2.1.8', { 
'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'], }), - ('graph', '1.84.1', { - 'checksums': ['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'], + ('graph', '1.86.0', { + 'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'], }), ('mclust', '6.1.1', { 'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'], }), - ('cclust', '0.6-26'), + ('cclust', '0.6-26', { + 'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'], + }), ('flowClust', '3.40.0', { 'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'", 'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'], @@ -240,4 +235,4 @@ sanity_check_paths = { 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'], } -moduleclass = 'bio' \ No newline at end of file +moduleclass = 'bio' diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity.def deleted file mode 100644 index a4a615e..0000000 --- a/envs/fcps_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.3.3 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ - libgsl-dev - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - ## no versioning here - Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' - - echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/r.yml b/envs/r.yml deleted file mode 100644 index 456e139..0000000 --- a/envs/r.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: r_for_metrics -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-mclust - - conda-forge::r-caret - - conda-forge::r-dplyr - - conda-forge::r-readr - - conda-forge::r-argparse - diff --git a/envs/r_singularity.def b/envs/r_singularity.def deleted file mode 100644 index f1f9ec9..0000000 --- a/envs/r_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - # Install R packages - - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb new file mode 100644 index 0000000..a9edb00 --- /dev/null +++ b/envs/rmarkdown-python.eb @@ -0,0 +1,28 @@ +easyblock = 'Bundle' + +# This is a dummy bundle that installs: +# 1. rmarkdown: an R bundle that we also package +# 2. Python-3.12.3-GCCcore-13.3.0 +# This is a dependency for the clustering_benchmark metric collector. 
+ +name = 'rmarkdown-python' +version = '0.1.0' + +local_rver = '4.4.2' +local_pyver = '3.12.3' +versionsuffix = f'-r-{local_rver}-py-{local_pyver}' + +homepage = 'https://omnibenchmark.org' +description = 'Rmarkdown bundle with specific Python dependency' + +toolchain = {'name': 'system', 'version': '1.0'} + +dependencies = [ + ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'), + ('Python', local_pyver, '-GCCcore-13.3.0'), +] + +sanity_check_paths = { + 'files': [], + 'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2'] +} diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb new file mode 100644 index 0000000..067eadd --- /dev/null +++ b/envs/rmarkdown.eb @@ -0,0 +1,197 @@ +easyblock = 'Bundle' + +# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/ +# and build only what's left out. + +name = 'rmarkdown' +version = '0.1.0' +versionsuffix = '-r-%(rver)s' + +homepage = 'https://omnibenchmark.org' +description = 'rmarkdown bundle for clustbench reports' + +toolchain = {'name': 'gfbf', 'version': '2024a'} + +dependencies = [ + ('R', '4.4.2'), +] + +exts_default_options = { + 'source_urls': [ + 'https://cloud.r-project.org/src/contrib/', + 'https://cran.r-project.org/src/contrib/', # current version of packages + 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive + 'https://www.bioconductor.org/packages/release/bioc/src/contrib/', # bioconductor + ], + 'sources': ['%(name)s_%(version)s.tar.gz'], +} + +exts_defaultclass = 'RPackage' + +exts_list = [ + ('rlang', '1.1.6', { + 'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'], + }), + ('glue', '1.8.0', { + 'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'], + }), + ('cli', '3.6.4', { + 'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'], + }), + ('lifecycle', '1.0.4', { + 'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'], + }), + 
('vctrs', '0.6.5', { + 'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'], + }), + ('utf8', '1.2.4', { + 'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'], + }), + ('lattice', '0.22-5', { + 'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'], + }), + ('pkgconfig', '2.0.3', { + 'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'], + }), + ('pillar', '1.10.2', { + 'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'], + }), + ('magrittr', '2.0.3', { + 'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'], + }), + ('fansi', '1.0.6', { + 'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'], + }), + ('viridisLite', '0.4.2', { + 'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'], + }), + ('RColorBrewer', '1.1-3', { + 'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'], + }), + ('R6', '2.6.1', { + 'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'], + }), + ('labeling', '0.4.3', { + 'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'], + }), + ('farver', '2.1.2', { + 'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'], + }), + ('Matrix', '1.7-3', { + 'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'], + }), + ('nlme', '3.1-168', { + 'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'], + }), + ('withr', '3.0.2', { + 'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'], + }), + ('tibble', '3.2.1', { + 'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'], + }), + ('colorspace', '2.1-1', { + 'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'], + }), + ('munsell', '0.5.1', { + 
'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'], + }), + ('scales', '1.3.0', { + 'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'], + }), + ('mgcv', '1.9-1', { + 'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'], + }), + ('MASS', '7.3-65', { + 'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'], + }), + ('isoband', '0.2.7', { + 'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'], + }), + ('gtable', '0.3.6', { + 'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'], + }), + ('ggplot2', '3.5.2', { + 'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'], + }), + ('findpython', '1.0.9', { + 'checksums': ['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'], + }), + ('argparse', '2.2.5', { + 'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'], + }), + (name, '2.29', { + 'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'], + }), + ('generics', '0.1.3', { + 'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'], + }), + ('tidyselect', '1.2.1', { + 'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'], + }), + ('dplyr', '1.1.4', { + 'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'], + }), + ('tidyr', '1.3.1', { + 'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'], + }), + ('shape', '1.4.6.1', { + 'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'], + }), + ('GlobalOptions', '0.1.2', { + 'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'], + }), + ('circlize', '0.4.16', { + 'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'], + }), + ('rjson', '0.2.23', { + 'checksums': 
['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'], + }), + ('GetoptLong', '1.0.5', { + 'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'], + }), + ('cluster', '2.1.8.1', { + 'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'], + }), + ('clue', '0.3-66', { + 'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'], + }), + ('png', '0.1-8', { + 'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'], + }), + ('BiocGenerics', '0.54.0', { + 'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'], + }), + ('S4Vectors', '0.46.0', { + 'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'], + }), + ('IRanges', '2.42.0', { + 'checksums': ['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'], + }), + ('matrixStats', '1.5.0', { + 'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'], + }), + ('iterators', '1.0.14', { + 'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'], + }), + ('codetools', '0.2-20', { + 'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'], + }), + ('foreach', '1.5.2', { + 'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'], + }), + ('doParallel', '1.0.17', { + 'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'], + }), + ('ComplexHeatmap', '2.24.0', { + 'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'], + }), +] + +modextrapaths = {'R_LIBS_SITE': ''} + +sanity_check_paths = { + 'files': [], + 'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'], +} + +moduleclass = 'bio' diff --git a/envs/rmarkdown.yml b/envs/rmarkdown.yml index e57969e..ed5c65e 100644 --- a/envs/rmarkdown.yml +++ b/envs/rmarkdown.yml @@ -7,6 +7,8 @@ dependencies: - conda-forge::python=3.12.6 - 
conda-forge::r-argparse - conda-forge::r-rmarkdown + - conda-forge::r-cairo + - conda-forge::r-svglite - conda-forge::r-ggplot2 - - conda-forge::r-tidyr + - conda-forge::r-tidyr - bioconda::bioconductor-complexheatmap diff --git a/envs/sklearn.yml b/envs/sklearn.yml deleted file mode 100644 index 258b7ea..0000000 --- a/envs/sklearn.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: sklearn -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::scikit-learn - - conda-forge::pip - - pip: - - "pandas" - - "argparse" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def deleted file mode 100644 index 939a3bb..0000000 --- a/envs/sklearn_singularity.def +++ /dev/null @@ -1,33 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43" - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . 
/opt/default/bin/activate diff --git a/envs/upload_to_registry.sh b/envs/upload_to_registry.sh new file mode 100644 index 0000000..7e45e5a --- /dev/null +++ b/envs/upload_to_registry.sh @@ -0,0 +1,12 @@ +#!/bin/sh +USER=user +REGISTRY=quay.io +ORGANIZATION=omnibenchmark +CLUSTBENCH_REPO=clustbench-vanilla +CLUSTBENCH_TAG=0.1.0 +FCPS_REPO=fcps +FCPS_TAG=0.1.0 + +singularity registry login --username "${USER}" docker://${REGISTRY}  # ${USER}, not {$USER}: the latter passes literal braces as part of the username +singularity push ${CLUSTBENCH_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${CLUSTBENCH_REPO}:${CLUSTBENCH_TAG} +singularity push ${FCPS_REPO}.sif oras://${REGISTRY}/${ORGANIZATION}/${FCPS_REPO}:${FCPS_TAG} diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py new file mode 100644 index 0000000..6abc6ee --- /dev/null +++ b/microbenchmark/microbench.py @@ -0,0 +1,71 @@ +""" +This script exercises a few common linear algebra operations in numpy. +It's intended mostly to gauge whether it makes sense to descend into +compiler optimizations for the Python binary that we ship within the SIF images, +but it can be easily repurposed for other specific microbenchmarks (e.g., numba or GPU perf gains). + +Be aware that here we're profiling simple operations; it would make sense to carefully +profile the libraries of interest to see where the computational bottlenecks really are.
+ +Usage: + +singularity exec clustbench-vanilla.sif python3 microbench.py +singularity exec clustbench-optimized.sif python3 microbench.py + +References: https://pythonspeed.com/articles/faster-python/ +""" +import numpy as np +import time +import json +from statistics import mean, stdev + +DEFAULT_REPETITIONS = 10 + +def run_operation(operation, func, repetitions): + timings = [] + for _ in range(repetitions): + start = time.perf_counter() + func() + elapsed = time.perf_counter() - start + timings.append(elapsed) + return { + 'operation': operation, + 'mean': mean(timings), + 'stdev': stdev(timings) if len(timings) > 1 else None,  # sample stdev is undefined for a single run (StatisticsError); reported as JSON null + 'runs': repetitions + } + +def benchmark(repetitions=DEFAULT_REPETITIONS): + np.random.seed(42) + size = 1000 + + # Create random matrices + A = np.random.rand(size, size) + B = np.random.rand(size, size) + C = A @ A.T # Ensure positive definite for Cholesky + + # Define operations + operations = [ + ('mat_mul', lambda: np.dot(A, B)), + ('svd', lambda: np.linalg.svd(A)), + ('chol_decomp', lambda: np.linalg.cholesky(C)) + ] + + results = [] + for operation, func in operations: + try: + result = run_operation(operation, func, repetitions) + except np.linalg.LinAlgError: + result = { + 'operation': operation, + 'error': 'Operation failed due to numerical instability' + } + results.append(result) + + # Output results as JSON + print(json.dumps(results, indent=2)) + +if __name__ == "__main__": + import sys + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS + benchmark(repetitions) diff --git a/overrides/apptainer.yml b/overrides/apptainer.yml new file mode 100644 index 0000000..93b6c3e --- /dev/null +++ b/overrides/apptainer.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_oras +description: Clustering benchmark on Gagolewski's. Using apptainer from omnibenchmark ORAS registry.
+software_backend: apptainer diff --git a/overrides/apptainer_optimized.yml b/overrides/apptainer_optimized.yml new file mode 100644 index 0000000..ae4b5ad --- /dev/null +++ b/overrides/apptainer_optimized.yml @@ -0,0 +1,12 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_optimized_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image, optimized python) +software_backend: apptainer + +software_environments: + clustbench: + apptainer: envs/clustbench-optimized.sif + fcps: + apptainer: envs/fcps.sif + rmarkdown: + apptainer: envs/fcps.sif diff --git a/overrides/apptainer_vanilla.yml b/overrides/apptainer_vanilla.yml new file mode 100644 index 0000000..f0d3bc4 --- /dev/null +++ b/overrides/apptainer_vanilla.yml @@ -0,0 +1,12 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_apptainer_vanilla_local +description: Clustering benchmark on Gagolewski's. Using apptainer (locally built image) +software_backend: apptainer + +software_environments: + clustbench: + apptainer: envs/clustbench.sif + fcps: + apptainer: envs/fcps.sif + rmarkdown: + apptainer: envs/fcps.sif diff --git a/overrides/conda.yml b/overrides/conda.yml new file mode 100644 index 0000000..5f4a1ac --- /dev/null +++ b/overrides/conda.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_conda +description: Clustering benchmark on Gagolewski's. Using conda. +software_backend: conda diff --git a/overrides/envmodules.yml b/overrides/envmodules.yml new file mode 100644 index 0000000..a34d58e --- /dev/null +++ b/overrides/envmodules.yml @@ -0,0 +1,4 @@ +# this file has been generated automatically - DO NOT EDIT BY HAND +id: clustering_benchmark_envmodules +description: Clustering benchmark on Gagolewski's. Using envmodules. 
+software_backend: envmodules diff --git a/smoketest/base.yml b/smoketest/base.yml new file mode 100644 index 0000000..db885fe --- /dev/null +++ b/smoketest/base.yml @@ -0,0 +1,127 @@ +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.5 + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: oras://quay.io/omnibenchmark/clustbench-vanilla:0.1.0 + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: oras://quay.io/omnibenchmark/fcps:0.1.0 + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 31ac323 + + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: 
https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: "5877378"  # quoted: an all-digit short SHA would otherwise load as an integer + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: "5454368" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: "6090043" + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz"