diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c1a1e82..b6cb977 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,7 +18,6 @@ jobs: run-benchmark: name: Run Benchmark runs-on: ubuntu-latest - ## runs-on: self-hosted steps: - name: Check out repository uses: actions/checkout@v4 @@ -49,7 +48,7 @@ jobs: shell: bash -l {0} run: | mamba install -y pip - pip install git+https://github.com/omnibenchmark/omnibenchmark.git@reduce_install_scope + pip install git+https://github.com/omnibenchmark/omnibenchmark.git@dev - name: Load benchmark cache id: cache-benchmark @@ -60,16 +59,15 @@ jobs: - name: Run benchmark shell: bash -l {0} - continue-on-error: true + continue-on-error: false run: | - echo "y" | ob run benchmark -b Clustering.yaml --local --cores 3 --continue-on-error + echo "y" | ob run benchmark -b Clustering_conda_smoketest.yml --local --cores 3 --continue-on-error upload-artifact: name: Benchmark Artifact runs-on: ubuntu-latest - ## runs-on: self-hosted needs: run-benchmark - if: always() + if: github.ref == 'refs/heads/main' && github.repository_owner == 'omnibenchmark' steps: - name: Check out repository uses: actions/checkout@v4 @@ -100,7 +98,7 @@ jobs: - name: Deploy to GitHub Pages uses: actions/deploy-pages@v4 - + - name: Create Job Summary if: always() run: | @@ -108,4 +106,3 @@ jobs: echo "- [Plotting Report](https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }})" >> $GITHUB_STEP_SUMMARY echo "### All Outputs" >> $GITHUB_STEP_SUMMARY echo "- [Complete Benchmark Output](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)" >> $GITHUB_STEP_SUMMARY - diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4d38534 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# image build artifacts +envs/*.sif + +# snakemake +snakemake.log +.snakemake/ + +# vim swaps +*.swp +*.swo diff --git a/Clustering.yaml 
b/Clustering.yaml index 0007ea5..689be2c 100644 --- a/Clustering.yaml +++ b/Clustering.yaml @@ -2,10 +2,10 @@ id: clustering_example description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. version: 1.2 benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +# storage: https://play.min.io +# storage_api: S3 +# storage_bucket_name: clustering_example software_backend: conda software_environments: clustbench: diff --git a/Clustering_singularity.yml b/Clustering_apptainer_optimized.yml similarity index 70% rename from Clustering_singularity.yml rename to Clustering_apptainer_optimized.yml index c80b498..a073683 100644 --- a/Clustering_singularity.yml +++ b/Clustering_apptainer_optimized.yml @@ -1,42 +1,37 @@ -id: clustering_example_apptainer +id: clustering_example_apptainer_optimized description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleapptainer + +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: apptainer + software_environments: + clustbench: - description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + description: "clustbench on py3.12.9, optimized python build" + conda: envs/clustbench.yml # not used + envmodule: na + apptainer: envs/clustbench-optimized.sif + fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml + conda: envs/fcps.yml # not used + envmodule: na apptainer: envs/fcps.sif - envmodule: fcps + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + + metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -45,14 +40,14 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -125,16 +120,13 @@ stages: - id: data.true_labels path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" - ## clustbench methods (fastcluster) ################################################################### - - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster - # url: /home/imallona/src/clustbench_fastcluster/ commit: "45e43d3" parameters: - values: ["--linkage", "complete"] @@ -143,12 +135,12 @@ stages: - values: ["--linkage", "weighted"] - values: ["--linkage", "median"] - values: ["--linkage", "centroid"] + - id: sklearn - name: "sklearn" - software_environment: "clustbench" + name: sklearn + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -166,8 +158,8 @@ stages: - values: ["--linkage", "complete"] - values: ["--linkage", "ward"] - id: genieclust - name: "genieclust" - software_environment: "clustbench" + name: genieclust + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -206,7 +198,7 @@ stages: modules: - id: 
partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics commit: 9132d45 @@ -229,89 +221,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: 
https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_apptainer_vanilla.yml b/Clustering_apptainer_vanilla.yml new file mode 100644 index 0000000..6bc5edd --- /dev/null +++ b/Clustering_apptainer_vanilla.yml @@ -0,0 +1,223 @@ +id: clustering_example_apptainer_vanilla + +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: na + conda: envs/clustbench.yml # not used + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + conda: envs/fcps.yml # not used + envmodule: na + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: envs/rmarkdown.sif + + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." 
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 + - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 + - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 + - values: 
["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 + - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 + - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 + - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 + - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "wine"] # 3 1 + - values: ["--dataset_generator", "uci", "--dataset_name", "yeast"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 + 
- values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "mk4"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 + - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "x3"] # 4 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z1"] # 3 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z2"] # 5 1 + - values: ["--dataset_generator", "wut", "--dataset_name", "z3"] # 4 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: 
["--linkage", "ward"] + - values: ["--linkage", "average"] + - values: ["--linkage", "weighted"] + - values: ["--linkage", "median"] + - values: ["--linkage", "centroid"] + + - id: sklearn + name: sklearn + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: "5877378" + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + # - values: ["--method", "spectral"] ## too slow + - values: ["--method", "gm"] + - id: agglomerative + name: "agglomerative" + software_environment: "clustbench" + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: "5454368" + parameters: + - values: ["--linkage", "average"] + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: genieclust + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: "6090043" + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - values: ["--method", "gic"] + - values: ["--method", "ica"] + - id: fcps + name: "fcps" + software_environment: "fcps" + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + # - values: ["--method", "FCPS_AdaptiveDensityPeak"] # not in conda + - values: ["--method", "FCPS_Minimax"] + - values: ["--method", "FCPS_MinEnergy"] + - values: ["--method", "FCPS_HDBSCAN_2"] + - values: ["--method", "FCPS_HDBSCAN_4"] + - values: ["--method", "FCPS_HDBSCAN_8"] + - values: ["--method", "FCPS_Diana"] + - values: ["--method", "FCPS_Fanny"] + - values: ["--method", "FCPS_Hardcl"] + - values: ["--method", "FCPS_Softcl"] + - values: ["--method", "FCPS_Clara"] + - values: ["--method", "FCPS_PAM"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics +
name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 9132d45 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + - values: ["--metric", "adjusted_mi_score"] + - values: ["--metric", "adjusted_rand_score"] + - values: ["--metric", "fm_score"] + - values: ["--metric", "mi_score"] + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "normalized_mi_score"] + - values: ["--metric", "normalized_pivoted_accuracy"] + - values: ["--metric", "pair_sets_index"] + - values: ["--metric", "rand_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_apptainer_vanilla_smoketest.yml b/Clustering_apptainer_vanilla_smoketest.yml new file mode 100644 index 0000000..99aff2e --- /dev/null +++ b/Clustering_apptainer_vanilla_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_apptainer_vanilla_smoketest +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: apptainer + +software_environments: + + clustbench: + description: "clustbench on py3.12.3, default python" + envmodule: na + conda: envs/clustbench.yml # not used + apptainer: envs/clustbench-vanilla.sif + + fcps: + description: "CRAN's FCPS" + envmodule: na + conda: envs/fcps.yml # not used + apptainer: envs/fcps.sif + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: na + conda: envs/rmarkdown.yml # not used + apptainer: envs/rmarkdown.sif + +metric_collectors: + - id: plotting + name: "Single-backend metric collector."
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: "5877378" + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: "5454368" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: "6090043" + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url:
https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_conda.yml b/Clustering_conda.yml index 7ac1629..9e74ee5 100644 --- a/Clustering_conda.yml +++ b/Clustering_conda.yml @@ -1,42 +1,36 @@ id: clustering_example_conda + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. 
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleconda +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: conda + software_environments: + clustbench: description: "clustbench on py3.12.6" conda: envs/clustbench.yml envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but - rmarkdown: - description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + apptainer: na + fcps: description: "CRAN's FCPS" conda: envs/fcps.yml - apptainer: envs/fcps.sif envmodule: fcps + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml + envmodule: fcps # not used + apptainer: na + metric_collectors: - id: plotting name: "Single-backend metric collector." 
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -45,6 +39,7 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: ## clustbench data ########################################################## @@ -52,7 +47,7 @@ stages: modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 @@ -145,7 +140,7 @@ stages: - values: ["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn #url: /home/imallona/src/clustbench_sklearn @@ -229,89 +224,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - 
values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_conda_smoketest.yml b/Clustering_conda_smoketest.yml new file mode 100644 index 0000000..15215d7 --- /dev/null +++ b/Clustering_conda_smoketest.yml @@ -0,0 +1,129 @@ +id: clustering_example_conda_smoketest +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
+version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: conda + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/rmarkdown.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector." + software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: "5877378" + parameters: + - values:
["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: "5454368" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: "6090043" + parameters: + - values: ["--method", "genie", "--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_envmodules.yml b/Clustering_envmodules.yml index 3c2b8bd..a2112d4 100644 --- a/Clustering_envmodules.yml +++ b/Clustering_envmodules.yml @@ -1,42 +1,37 @@ id: clustering_example_envmodules + description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2.
-version: 1.4 -benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: http://omnibenchmark.org:9000 -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clusteringexampleenvmodules +version: 1.5 +benchmarker: "Izaskun Mallona, Daniel Incicau, Ben Carrillo" +benchmark_yaml_spec: 0.4 + software_backend: envmodules + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml - envmodule: clustbench - apptainer: envs/clustbench.sif - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: envs/sklearn.sif - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: envs/r.sif - envmodule: fcps # not true, but + conda: envs/clustbench.yml # not used + envmodule: clustbench/0.1.0-foss-2023b + apptainer: na + rmarkdown: description: "R with some plotting dependencies" - conda: envs/rmarkdown.yml - apptainer: envs/r.sif # not true, but - envmodule: fcps # not true, but + conda: envs/rmarkdown.yml # not used + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + apptainer: na + fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml - apptainer: envs/fcps.sif - envmodule: fcps + conda: envs/fcps.yml # not used + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + apptainer: na + + metric_collectors: - id: plotting name: "Single-backend metric collector."
- software_environment: "rmarkdown" + software_environment: rmarkdown repository: url: https://github.com/imallona/clustering_report commit: 1d6bdf5 @@ -45,53 +40,143 @@ metric_collectors: outputs: - id: plotting.html path: "{input}/{name}/plotting_report.html" + stages: - ## clustbench data ########################################################## - id: data modules: - id: clustbench name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_data commit: 366c5a2 - parameters: # comments depict the possible cardinalities and the number of curated labelsets + parameters: # comments depict the possible cardinalities and the number of curated labelsets - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "chainlink"] # 2 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "engytime"] # 2 2 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "chainlink", + ] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "engytime", + ] # 2 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "hepta"] # 7 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "lsun"] # 3 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "target"] # 2, 6 2 - values: ["--dataset_generator", "fcps", "--dataset_name", "tetra"] # 4 1 - - values: ["--dataset_generator", "fcps", "--dataset_name", "twodiamonds"] # 2 1 + - values: [ + "--dataset_generator", + "fcps", + "--dataset_name", + "twodiamonds", + ] # 2 1 - values: ["--dataset_generator", "fcps", "--dataset_name", "wingnut"] # 2 1 - values: ["--dataset_generator", "graves", "--dataset_name", "dense"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "fuzzyx"] # 2, 4, 5 6 + - values: 
[ + "--dataset_generator", + "graves", + "--dataset_name", + "fuzzyx", + ] # 2, 4, 5 6 - values: ["--dataset_generator", "graves", "--dataset_name", "line"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "parabolic"] # 2, 4 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "parabolic", + ] # 2, 4 2 - values: ["--dataset_generator", "graves", "--dataset_name", "ring"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_noisy"] # 2 1 - - values: ["--dataset_generator", "graves", "--dataset_name", "ring_outliers"] # 2, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_noisy"] # 3, 5 2 - - values: ["--dataset_generator", "graves", "--dataset_name", "zigzag_outliers"] # 3, 5 2 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t4_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "chameleon_t5_8k"] # 6 1 - - values: ["--dataset_generator", "other", "--dataset_name", "hdbscan"] # 6 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_noisy", + ] # 2 1 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "ring_outliers", + ] # 2, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_noisy", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "graves", + "--dataset_name", + "zigzag_outliers", + ] # 3, 5 2 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t4_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "chameleon_t5_8k", + ] # 6 1 + - values: [ + "--dataset_generator", + "other", + "--dataset_name", + "hdbscan", + ] # 6 1 - values: ["--dataset_generator", "other", "--dataset_name", "iris"] # 3 1 - 
values: ["--dataset_generator", "other", "--dataset_name", "iris5"] # 3 1 - values: ["--dataset_generator", "other", "--dataset_name", "square"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "aggregation"] # 7 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "compound"] # 4, 5, 6 5 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "aggregation", + ] # 7 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "compound", + ] # 4, 5, 6 5 - values: ["--dataset_generator", "sipu", "--dataset_name", "flame"] # 2 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "jain"] # 2 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "pathbased"] # 3, 4 2 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "pathbased", + ] # 3, 4 2 - values: ["--dataset_generator", "sipu", "--dataset_name", "r15"] # 8, 9, 15 3 - values: ["--dataset_generator", "sipu", "--dataset_name", "spiral"] # 3 1 - - values: ["--dataset_generator", "sipu", "--dataset_name", "unbalance"] # 8 1 + - values: [ + "--dataset_generator", + "sipu", + "--dataset_name", + "unbalance", + ] # 8 1 - values: ["--dataset_generator", "uci", "--dataset_name", "ecoli"] # 8 1 - - values: ["--dataset_generator", "uci", "--dataset_name", "ionosphere"] # 2 1 + - values: [ + "--dataset_generator", + "uci", + "--dataset_name", + "ionosphere", + ] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "sonar"] # 2 1 - values: ["--dataset_generator", "uci", "--dataset_name", "statlog"] # 7 1 - values: ["--dataset_generator", "uci", "--dataset_name", "wdbc"] # 2 1 @@ -100,8 +185,18 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "circles"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "cross"] # 4 1 - values: ["--dataset_generator", "wut", "--dataset_name", "graph"] # 10 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "isolation"] # 3 1 - - values: 
["--dataset_generator", "wut", "--dataset_name", "labirynth"] # 6 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "isolation", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "labirynth", + ] # 6 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk2"] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "mk3"] # 3 1 @@ -109,9 +204,24 @@ stages: - values: ["--dataset_generator", "wut", "--dataset_name", "olympic"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "smile"] # 4, 6 2 - values: ["--dataset_generator", "wut", "--dataset_name", "stripes"] # 2 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trajectories"] # 4 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "trapped_lovers"] # 3 1 - - values: ["--dataset_generator", "wut", "--dataset_name", "twosplashes"] # 2 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "trajectories", + ] # 4 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "trapped_lovers", + ] # 3 1 + - values: [ + "--dataset_generator", + "wut", + "--dataset_name", + "twosplashes", + ] # 2 1 - values: ["--dataset_generator", "wut", "--dataset_name", "windows"] # 5 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x1"] # 3 1 - values: ["--dataset_generator", "wut", "--dataset_name", "x2"] # 3 1 @@ -126,12 +236,12 @@ stages: path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" ## clustbench methods (fastcluster) ################################################################### - + - id: clustering modules: - id: fastcluster name: "fastcluster algorithm" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_fastcluster # url: /home/imallona/src/clustbench_fastcluster/ @@ -145,10 +255,9 @@ stages: - values: 
["--linkage", "centroid"] - id: sklearn name: "sklearn" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_sklearn - #url: /home/imallona/src/clustbench_sklearn commit: 5877378 parameters: - values: ["--method", "birch"] @@ -157,7 +266,7 @@ stages: - values: ["--method", "gm"] - id: agglomerative name: "agglomerative" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_agglomerative commit: 5454368 @@ -167,7 +276,7 @@ stages: - values: ["--linkage", "ward"] - id: genieclust name: "genieclust" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_genieclust commit: 6090043 @@ -177,7 +286,7 @@ stages: - values: ["--method", "ica"] - id: fcps name: "fcps" - software_environment: "fcps" + software_environment: fcps repository: url: https://github.com/imallona/clustbench_fcps commit: 272fa5f @@ -206,10 +315,10 @@ stages: modules: - id: partition_metrics name: "clustbench partition metrics" - software_environment: "clustbench" + software_environment: clustbench repository: url: https://github.com/imallona/clustbench_metrics - commit: 9132d45 + commit: 8184cd4 parameters: - values: ["--metric", "normalized_clustering_accuracy"] - values: ["--metric", "adjusted_fm_score"] @@ -229,89 +338,3 @@ stages: outputs: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins - # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: 
https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: 
"{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Clustering_envmodules_smoketest.yml b/Clustering_envmodules_smoketest.yml new file mode 100644 index 0000000..3fa8e81 --- /dev/null +++ b/Clustering_envmodules_smoketest.yml @@ -0,0 +1,131 @@ +id: clustering_example_envmodules +description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. +version: 1.4 +benchmarker: "Izaskun Mallona, Daniel Incicau" +benchmark_yaml_spec: 0.5 + +software_backend: envmodules + +software_environments: + + clustbench: + description: "clustbench on py3.12.6" + envmodule: clustbench/0.1.0-foss-2023b + conda: envs/clustbench.yml + apptainer: na + + fcps: + description: "CRAN's FCPS" + envmodule: fcps/1.3.4-foss-2023a-r-4.3.2 + conda: envs/fcps.yml + apptainer: na + + rmarkdown: + description: "R with some plotting dependencies" + envmodule: rmarkdown/0.1.0-gfbf-2024a-r-4.4.2 + conda: envs/rmarkdown.yml + apptainer: na + +metric_collectors: + - id: plotting + name: "Single-backend metric collector."
+ software_environment: rmarkdown + repository: + url: https://github.com/imallona/clustering_report + commit: 1d6bdf5 + inputs: + - metrics.scores + outputs: + - id: plotting.html + path: "{input}/{name}/plotting_report.html" + +stages: + - id: data + modules: + - id: clustbench + name: "clustbench datasets, from https://www.sciencedirect.com/science/article/pii/S0020025521010082#t0005 Table1" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_data + commit: 366c5a2 + parameters: # comments depict the possible cardinalities and the number of curated labelsets + - values: ["--dataset_generator", "fcps", "--dataset_name", "atom"] # 2 1 + outputs: + - id: data.matrix + path: "{input}/{stage}/{module}/{params}/{dataset}.data.gz" + - id: data.true_labels + path: "{input}/{stage}/{module}/{params}/{dataset}.labels0.gz" + + ## clustbench methods (fastcluster) ################################################################### + + - id: clustering + modules: + - id: fastcluster + name: "fastcluster algorithm" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_fastcluster + commit: "45e43d3" + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: sklearn + name: "sklearn" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_sklearn + commit: 5877378 + parameters: + - values: ["--method", "birch"] + - values: ["--method", "kmeans"] + - id: agglomerative + name: "agglomerative" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_agglomerative + commit: 5454368 + parameters: + - values: ["--linkage", "complete"] + - values: ["--linkage", "ward"] + - id: genieclust + name: "genieclust" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_genieclust + commit: 6090043 + parameters: + - values: ["--method", "genie", 
"--gini_threshold", 0.5] + - id: fcps + name: "fcps" + software_environment: fcps + repository: + url: https://github.com/imallona/clustbench_fcps + commit: 272fa5f + parameters: + - values: ["--method", "FCPS_Minimax"] + inputs: + - entries: + - data.matrix + - data.true_labels + outputs: + - id: clustering.predicted_ks_range + path: "{input}/{stage}/{module}/{params}/{dataset}_ks_range.labels.gz" + + - id: metrics + modules: + - id: partition_metrics + name: "clustbench partition metrics" + software_environment: clustbench + repository: + url: https://github.com/imallona/clustbench_metrics + commit: 8184cd4 + parameters: + - values: ["--metric", "normalized_clustering_accuracy"] + - values: ["--metric", "adjusted_fm_score"] + inputs: + - entries: + - clustering.predicted_ks_range + - data.true_labels + outputs: + - id: metrics.scores + path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" diff --git a/Clustering_oras.yml b/Clustering_oras.yml index 6640461..c6f0d7e 100644 --- a/Clustering_oras.yml +++ b/Clustering_oras.yml @@ -1,36 +1,37 @@ -id: clustering_example +id: clustering_example_oras description: Clustering benchmark on Gagolewski's, true number of clusters plus minus 2. Caution dirty apptainer sifs. 
-version: 1.2 +version: 1.5 + benchmarker: "Izaskun Mallona, Daniel Incicau" -storage: https://play.min.io -benchmark_yaml_spec: 0.04 -storage_api: S3 -storage_bucket_name: clustering_example +benchmark_yaml_spec: 0.4 + +#storage: https://play.min.io +#storage_api: S3 +#storage_bucket_name: clustering_example + software_backend: apptainer + software_environments: + clustbench: description: "clustbench on py3.12.6" - conda: envs/clustbench.yml + conda: envs/clustbench.yml # not used envmodule: clustbench apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/clustbench:latest - sklearn: - description: "Daniel's on py3.12.6" - conda: envs/sklearn.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/r:latest - envmodule: clustbench # not true, but - R: - description: "Daniel's R with readr, dplyr, mclust, caret" - conda: envs/r.yml - apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/sklearn:latest - envmodule: fcps # not true, but + fcps: description: "CRAN's FCPS" - conda: envs/fcps.yml + conda: envs/fcps.yml # not used + envmodule: na apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/fcps:latest - envmodule: fcps -stages: - ## clustbench data ########################################################## + rmarkdown: + description: "R with some plotting dependencies" + conda: envs/rmarkdown.yml # not used + envmodule: na + apptainer: oras://registry.renkulab.io/izaskun.mallona/clustering_example/rmarkdown:latest + +stages: - id: data modules: @@ -214,88 +215,3 @@ stages: - id: metrics.scores path: "{input}/{stage}/{module}/{params}/{dataset}.scores.gz" - # ## daniel's data ########################################################################### - - # - id: danielsdata - # modules: - # - id: iris_manual - # name: "Iris Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/iris.git - # commit: 47c63f0 - # - id: penguins 
- # name: "Penguins Dataset" - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/penguins.git - # commit: 9032478 - # outputs: - # - id: data.features - # path: "{input}/{stage}/{module}/{params}/{dataset}.features.csv" - # - id: data.labels - # path: "{input}/{stage}/{module}/{params}/{dataset}.labels.csv" - - # ## daniel's distances ######################################################################## - - # - id: distances - # modules: - # - id: D1 - # software_environment: "sklearn" - # parameters: - # - values: ["--measure", "cosine"] - # - values: ["--measure", "euclidean"] - # - values: ["--measure", "manhattan"] - # - values: ["--measure", "chebyshev"] - # repository: - # url: https://github.com/omnibenchmark-example/distance.git - # commit: dd99d4f - # inputs: - # - entries: - # - data.features - # outputs: - # - id: distances - # path: "{input}/{stage}/{module}/{params}/{dataset}.distances.csv" - - # ## daniel's methods ################################################################### - - # - id: danielmethods - # modules: - # - id: kmeans - # software_environment: "sklearn" - # repository: - # url: https://github.com/omnibenchmark-example/kmeans.git - # commit: 049c8b1 - # - id: ward - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ward.git - # commit: 976e3f3 - # inputs: - # - entries: - # - distances - # outputs: - # - id: methods.clusters - # path: "{input}/{stage}/{module}/{params}/{dataset}.clusters.csv" - - # ## daniel's metrics ################################################################### - - # - id: danielsmetrics - # modules: - # - id: ari - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/ari.git - # commit: 72708f0 - # - id: accuracy - # software_environment: "R" - # repository: - # url: https://github.com/omnibenchmark-example/accuracy.git - # commit: e26b32f - # inputs: - # - entries: - # - 
methods.clusters - # - data.labels - # outputs: - # - id: metrics.mapping - # path: "{input}/{stage}/{module}/{params}/{dataset}.metrics.txt" diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f342949 --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +MAX_CORES ?= 10 +TIMEOUT ?= 4h + +# by default, we want to run all snakemake rules even if there are failures (-k) +OB_CMD=ob run benchmark -k --local --task-timeout ${TIMEOUT} --cores ${MAX_CORES} + +prepare_apptainer_env: + cd envs && ./build_singularity.sh +prepare_envmodules_env: + cd envs && eb clustbench.eb --robot + cd envs && eb fcps.eb --robot + cd envs && eb rmarkdown.eb --robot + +# short versions, to debug runs & environments +run_with_apptainer_backend_short: + ${OB_CMD} -b Clustering_apptainer_vanilla_smoketest.yml + mv out out_apptainer_short +run_with_conda_backend_short: + ${OB_CMD} -b Clustering_conda_smoketest.yml + mv out out_conda_short +run_with_envmodules_backend_short: + ${OB_CMD} -b Clustering_envmodules_smoketest.yml + mv out out_lmod_short + +# full versions (expect hours) +run_with_apptainer_backend_vanilla: + ${OB_CMD} -b Clustering_apptainer_vanilla.yml + mv out out_apptainer_vanilla +run_with_apptainer_backend_optimized: + ${OB_CMD} -b Clustering_apptainer_optimized.yml + mv out out_apptainer_optimized +run_with_conda_backend: + ${OB_CMD} -b Clustering_conda.yml + mv out out_conda +run_with_envmodules_backend: + ${OB_CMD} -b Clustering_envmodules.yml + mv out out_lmod diff --git a/README.md b/README.md index a75c594..89d7c05 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ A clustering example for omnibenchmark # How to run 1. Install omnibenchmark using [our tutorial](https://omnibenchmark.org/tutorial/) -2. Clone the benchmark definition / this repository with `git clone git@github.com:omnibenchmark/clustering_example.git` -3. Move to the cloned repository `cd clustering_example` +2.
Clone the benchmark definition / this repository with `git clone https://github.com/omnibenchmark/clustering_example` +3. Move into the cloned folder: `cd clustering_example` 4. Run locally, somewhat in parallel `ob run benchmark -b CLUSTERING.YAML --local --threads 6`. Choose `Clustering.yml` specification based on whether running it with conda, easybuild, apptainer, etc. [More details about the available backends](https://github.com/omnibenchmark/clustering_example/blob/main/envs/README.md). # Clustbench attribution diff --git a/envs/README.md b/envs/README.md index 69aa5c1..3cab925 100644 --- a/envs/README.md +++ b/envs/README.md @@ -1,10 +1,9 @@ We distribute `Clustering.yml` runs with different backends. -- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, pip) -- `Clustering_singularity.yml`. Singularity semi-reproducible, local SIF files. -- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images. -- `Clustering_envmodules.yml`. Easybuilt with default optimization. - +- `Clustering_conda.yml`. Conda semi-reproducible (no pinning, using pip) +- `Clustering_apptainer_vanilla.yml` / `Clustering_apptainer_optimized.yml`. Apptainer (Singularity) semi-reproducible, local SIF files. +- `Clustering_oras.yml`. Singularity semi-reproducible, prebuilt remote images from an ORAS registry. +- `Clustering_envmodules.yml`. Easybuild backend with default optimization. ## Conda @@ -12,8 +11,7 @@ We distribute `Clustering.yml` runs with different backends. - `clustbench.yml` - `fcps.yml` -- `r.yml` -- `sklearn.yml` +- `rmarkdown.yml` ### How to build @@ -23,24 +21,25 @@ No need to `ob software conda pin / prepare`; let `ob run benchmark -b Clusterin ### Files -- `clustbench_singularity.def` -- `fcps_singularity.def` -- `r_singularity.def` -- `sklearn_singularity.def` +The apptainer images are based on ubuntu-noble docker images. + +The "optimized" one does a custom python 3.12 compilation; the vanillapy ships the default py3.12 interpreter from the official ubuntu docker image.
+ +- `clustbench_apptainer_optimized.def` +- `clustbench_apptainer_vanillapy.def` +- `fcps.def` +- `rmarkdown.def` ### How to build -- `build_singularity.sh` +- `make prepare_apptainer_env` from the root folder. ## Aptainer semi-reproducible and remote -No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. +TODO: push to the registry (how?) -## Apptainer (reproducible) with easybuild - -Doing... +No need to prepare/build anything; let `ob run benchmark -b Clustering_oras.yml --local` do it using pre-built images from https://gitlab.renkulab.io/izaskun.mallona/clustering_example/container_registry. -Lorem ipsum. ## envmodules - reproducible builds with easybuild @@ -48,11 +47,11 @@ Lorem ipsum. - `clustbench.eb` - `fcps.eb` +- `rmarkdown.eb` +- `rmarkdown-python.eb` ### How to build -1. Mind https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2 -2. Mind `clustbench` is only installable from https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz and not from pypi's tgz (!), download it locally and ideally update the easyconfig to automate this -3. `python3-wget` from pypi doesn't look very well maintaned -4. `eb fcps.eb --robot` -5. `eb clustbench.eb --robot` +- `make prepare_envmodules_env` from the root folder. 
+- `python3-wget` from pypi doesn't look very well maintained + diff --git a/envs/build_singularity.sh b/envs/build_singularity.sh old mode 100644 new mode 100755 index 86e053f..83203c8 --- a/envs/build_singularity.sh +++ b/envs/build_singularity.sh @@ -1,9 +1,8 @@ -#!/bin/bash - -sudo singularity build sklearn.sif sklearn_singularity.def - -sudo singularity build clustbench.sif clustbench_singularity.def - -sudo singularity build r.sif r_singularity.def - -sudo singularity build fcps.sif fcps_singularity.def +#!/bin/sh +CMD=singularity +BUILD='build --fakeroot' +$CMD ${BUILD} clustbench-vanilla.sif clustbench_apptainer_vanillapy.def +# enable this if you want to compare with the custom python compilation +# $CMD ${BUILD} clustbench-optimized.sif clustbench_apptainer_optimized.def +$CMD ${BUILD} fcps.sif fcps.def +$CMD ${BUILD} rmarkdown.sif rmarkdown.def # this one is very similar to fcps, remove diff --git a/envs/clustbench.eb b/envs/clustbench.eb index 22597fb..daae6dd 100644 --- a/envs/clustbench.eb +++ b/envs/clustbench.eb @@ -1,108 +1,47 @@ -## largely as https://github.com/easybuilders/easybuild-easyconfigs/blob/949c266db9e17440ec2829eb8ffdbdb87ceaf543/easybuild/easyconfigs/c/cooler/cooler-0.10.2-foss-2023b.eb#L4 - easyblock = 'PythonBundle' name = 'clustbench' -version = '1' +version = '0.1.0' -homepage = 'https://python.org/' +homepage = 'https://omnibenchmark.org' description = "Bundle of Python packages for ob clustering_example" toolchain = {'name': 'foss', 'version': '2023b'} - dependencies = [ ('Python', '3.11.5'), - ('Python-bundle-PyPI', '2023.10'), ## so GCC 13.2.0 like foss-2023b ('SciPy-bundle', '2023.11'), - ('meson-python', '0.15.0'), ('matplotlib', '3.8.2'), - ('scikit-learn', '1.4.0') - + ('scikit-learn', '1.4.0'), ] -sanity_pip_check = True -use_pip = True - -exts_default_options = { - 'sanity_pip_check': True, - 'use_pip' : True -} - -##
https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/clustering_benchmarks-1.1.5.tar.gz -## https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip -## https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/fastcluster-1.2.6.tar.gz -## https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/genieclust-1.1.6.tar.gz -## https://files.pythonhosted.org/packages/a2/45/eaaacaa4f4f2931a80d40e453df275d9af7c07616c5d753272d3055fb79e/genieclust-1.1.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - -source_urls = [PYPI_SOURCE, - 'https://files.pythonhosted.org/packages/27/fe/e78538f4cd7b1b28e9c625eabd21f314004d00644a8347d0b01473e72ffa/', - 'https://files.pythonhosted.org/packages/68/7c/d465bab9f98b75c5c1f5e80165dd82847a504ced655d162b585df08a717b/', - 'https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/', - 'https://files.pythonhosted.org/packages/5d/b8/f143d907d93bd4a3dd51d07c4e79b37bedbfc2177f4949bfa0d6ba0af647/', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa/', - 'https://files.pythonhosted.org/packages/68/dd/fa2e1a45fce2d09f4aea3cee169760e672c8262325aa5796c49d543dc7e6/', - 'https://files.pythonhosted.org/packages/84/4d/b720d6000f4ca77f030bd70f12550820f0766b568e43f11af7f7ad9061aa', - 'https://files.pythonhosted.org/packages/67/66/91d242ea8dd1729addd36069318ba2cd03874872764f316c3bb51b633ed2/', - 'https://files.pythonhosted.org/packages/e2/a9/a0c57aee75f77794adaf35322f8b6404cbd0f89ad45c87197a937764b7d0/', - 'https://files.pythonhosted.org/packages/d2/c1/72b9622fcb32ff98b054f724e213c7f70d6898baa714f4516288456ceaba/', - 
'https://github.com/pybind/pybind11/archive/', - 'https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/', - 'https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/python3-wget-0.0.2-beta1.tar.gz', - 'https://files.pythonhosted.org/packages/6a/ef/6e3736663ee67369f7f5b697674bfbd3efc91e7096ddd4452bbbc80065ff/hypothesis-6.124.7.tar.gz', - 'https://files.pythonhosted.org/packages/03/c6/14a17e10813b8db20d1e800ff9a3a898e65d25f2b0e9d6a94616f1e3362c/numpy-1.23.0.tar.gz', - 'https://files.pythonhosted.org/packages/f6/d8/ab692a75f584d13c6542c3994f75def5bce52ded9399f52e230fe402819d/numpy-1.22.4.zip', - 'https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz', - 'https://files.pythonhosted.org/packages/64/33/60135848598c076ce4b231e1b1895170f45fbcaeaa2c9d5e38b04db70c35/joblib-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz', - 'https://files.pythonhosted.org/packages/9e/a5/4ae3b3a0755f7b35a280ac90b28817d1f380318973cff14075ab41ef50d9/scikit_learn-1.6.1.tar.gz', - 'https://files.pythonhosted.org/packages/ef/e5/c09d20723bfd91315f6f4ddc77912b0dcc09588b4ca7ad2ffa204607ad7f/scikit-learn-1.4.2.tar.gz', - 'https://files.pythonhosted.org/packages/ee/5e/16e17bedcf54d5b618dc0771690deda77178e5c310402881c3d2d6c5f27c/hurry.filesize-0.9.tar.gz'] - - -## caution download genieclust here, not pypi, they differ and pypi's it's not installable! -## cd /home/imallona/.local/easybuild/sources/c/clustbench/extensions/ -## wget wget https://github.com/gagolews/genieclust/archive/refs/tags/v1.1.6.tar.gz -O genieclust-1.1.6.tar.gz -## todo automate this within the easyconfig! 
- exts_list = [ ('natsort', '8.4.0', { 'checksums': ['45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581'], }), - ('cython', '3.0.11', { - 'checksums': ['7146dd2af8682b4ca61331851e6aebce9fe5158e75300343f80c07ca80b1faff'], - }), ('hypothesis', '6.124.7', { 'checksums': ['8ed6c6ae47e7d26d869c1dc3dee04e8fc50c95240715bb9915ded88d6d920f0e'], }), - ('numpy', '1.26.4', { - 'checksums': ['2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010'], - }), ('fastcluster', '1.2.6', { 'checksums': ['aab886efa7b6bba7ac124f4498153d053e5a08b822d2254926b7206cdf5a8aa6'], }), - ('genieclust', '1.1.6', { - 'checksums': ['fb5b4ff68eef9e73496afa5949e726c8522c72e51f092716a6a598b03d5c09d6'], - }), ('hurry.filesize', '0.9', { 'checksums': ['f5368329adbef86accd3bc9490522340bb79260455ae89b1a42c10f63801b9a6'], }), ('python3-wget', '0.0.2-beta1', { 'modulename': 'wget', + 'source_urls': ['https://files.pythonhosted.org/packages/84/a4/d9da2989a3d937e94616ef07f0630c507f6baa77ad37f94ceb06b36cacc1/'], 'checksums': ['bbe7f44b3c28c4f7126aff20e8a438e78f6e4f1878d8b0c4940e87363813c17d'], }), - ('clustering_benchmarks', '1.1.5', { + ('genieclust', '1.1.6', { + 'download_dep_fail': False, + 'install_src': 'https://files.pythonhosted.org/packages/2a/09/d1fd7b02cfabe76262d0f88d74fa71dc93e857525f8249539ec5ab174292/genieclust-1.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl', + 'checksums': ['4c159f507b84b6d6d171883223648d837c520a9bcce650944a6ee0cb320e2151'], + }), + ('clustering_benchmarks', '1.1.6', { 'modulename': 'clustbench', - 'checksums': ['1732c262fb13be2f88814ef9a19c60108e91a7f6cfb9b960a42feaa299034ea3'], + 'checksums': ['8c3ac0aed7c4c4925df6e5000db29aed6359341bd1ef2e516f230e13d8b66a0c'], }), ] -sanity_check_paths = { - 'files': [], - 'dirs': ['lib/python3.11/site-packages/clustbench/'] -} - moduleclass = 'bio' - - diff --git a/envs/clustbench_apptainer_optimized.def b/envs/clustbench_apptainer_optimized.def new file mode 100644 index 0000000..8fc7e08 --- 
/dev/null +++ b/envs/clustbench_apptainer_optimized.def @@ -0,0 +1,103 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + PYTHON_VERSION=3.12.9 + PYTHON_MAJOR_VERSION=$(echo $PYTHON_VERSION | cut -d. -f1,2) + + # Update and enable deb-src + export DEBIAN_FRONTEND=noninteractive + apt-get update + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble main restricted universe multiverse" >> /etc/apt/sources.list + echo "deb-src http://archive.ubuntu.com/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list + apt-get update + + # Get build dependencies for Python + apt-get build-dep -y python3 + + # Extra dependencies + apt-get install -y git \ + python-is-python3 \ + wget \ + zlib1g-dev \ + libbz2-dev \ + libssl-dev \ + libffi-dev \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # Calculate half the number of available cores + HALF_NPROC=$(( $(nproc) / 2 )) + # Ensure at least one core is used + CORES_TO_USE=$(( HALF_NPROC > 0 ? HALF_NPROC : 1 )) + + # Download and build Python from source, with optimizations + + mkdir ~/src && cd ~/src + wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz + tar -xf Python-${PYTHON_VERSION}.tgz + cd Python-${PYTHON_VERSION}*/ + + # Enable all possible optimizations + ./configure \ + --enable-optimizations \ + --with-lto \ + --enable-shared \ + LDFLAGS="-Wl,-rpath /usr/local/lib" + make -j ${CORES_TO_USE} + make altinstall + + # Create virtualenv using the locally built Python + cd /opt + /usr/local/bin/python${PYTHON_MAJOR_VERSION} -m venv "default" + . 
default/bin/activate + + # Install required packages with pip + + pip install -U pip wheel + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.3.0" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + + # Do some cleanup to keep the image slim + rm -rf ~/.cache + rm -rf ~/src + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate + diff --git a/envs/clustbench_apptainer_vanillapy.def b/envs/clustbench_apptainer_vanillapy.def new file mode 100644 index 0000000..ff9dd91 --- /dev/null +++ b/envs/clustbench_apptainer_vanillapy.def @@ -0,0 +1,68 @@ +Bootstrap: docker +From: ubuntu:noble-20250404 + +%labels + Author izaskun.mallona@gmail.com + Author ben.uzh@proton.me + +%post + export DEBIAN_FRONTEND=noninteractive + apt-get update && \ + apt-get install -y \ + python3 \ + python3-venv \ + python3-pip \ + ca-certificates \ + git \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # Create virtualenv using the default Python + mkdir -p /opt && cd /opt + /usr/bin/python3 -m venv "default" + . 
default/bin/activate + + # Install required packages with pip + + pip install -U pip wheel + + pip install \ + "clustering-benchmarks==1.1.6" \ + "contourpy==1.3.2" \ + "cycler==0.12.1" \ + "cython==3.1.0" \ + "fonttools==4.58.0" \ + "genieclust==1.1.6" \ + "joblib==1.5.0" \ + "kiwisolver==1.4.8" \ + "matplotlib==3.10.3" \ + "natsort==8.4.0" \ + "numpy==2.2.5" \ + "packaging==25.0" \ + "pandas==2.2.3" \ + "pillow==11.2.1" \ + "pyparsing==3.2.3" \ + "python-dateutil==2.9.0.post0" \ + "pytz==2025.2" \ + "scikit-learn==1.6.1" \ + "scipy==1.15.3" \ + "six==1.17.0" \ + "threadpoolctl==3.6.0" \ + "tzdata==2025.2" \ + "fastcluster==1.3.0" \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # TODO: can we use something more maintained? + pip install --pre "python3-wget==0.0.2-beta1" + + # Do some cleanup to keep the image slim + rm -rf ~/.cache + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate + diff --git a/envs/clustbench_singularity.def b/envs/clustbench_singularity.def deleted file mode 100644 index 8c2ae85..0000000 --- a/envs/clustbench_singularity.def +++ /dev/null @@ -1,35 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install "clustering-benchmarks==1.1.5" "wget" "fastcluster==1.2.6" "numpy==1.26.4" "scipy==1.14.1" \ - "isodate" "pydantic-core" \ - "genieclust==1.1.6" "pandas==2.2.3" "gitpython==3.1.43" - - echo '. 
/opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/fcps.def b/envs/fcps.def new file mode 100644 index 0000000..922d7f8 --- /dev/null +++ b/envs/fcps.def @@ -0,0 +1,39 @@ +Bootstrap: docker +From: rocker/tidyverse:4.4 + +%labels + + AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me + +%post + + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3.12-venv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # virtualenv + cd /opt + python3.12 -m venv "default" + . default/bin/activate + + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Install R packages + ## FIXME no versioning here + Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . 
/opt/default/bin/activate diff --git a/envs/fcps.eb b/envs/fcps.eb index ee3db52..692bf0b 100644 --- a/envs/fcps.eb +++ b/envs/fcps.eb @@ -1,6 +1,3 @@ -## largely as in https://github.com/easybuilders/easybuild-easyconfigs/commit/e9a36171c68414f933ab1afa03b32422491f0f96#diff-3f2a92ab6ab59ddaccf4bc61b59bdd3f6717b95fd019131a57f51eefc831a699 -## Caution boost easyconfig needs update https://raw.githubusercontent.com/easybuilders/easybuild-easyconfigs/refs/heads/develop/easybuild/easyconfigs/b/Boost/Boost-1.82.0-GCC-12.3.0.eb (https://github.com/easybuilders/easybuild-easyconfigs/commit/e29210626f076e3a207f1abf3759ea124e28f8b2) - easyblock = 'Bundle' name = 'fcps' @@ -16,17 +13,13 @@ builddependencies = [('pkgconf', '1.9.5')] dependencies = [ ('R', '4.3.2'), - ('Boost', '1.82.0'), + ('Boost', '1.82.0'), ('GSL', '2.7'), -# ('arrow-R', '14.0.1', versionsuffix), # required by RcisTarget ] exts_default_options = { 'source_urls': [ - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/', - 'https://bioconductor.org/packages/3.18/bioc/src/contrib/Archive/%(name)s', - 'https://bioconductor.org/packages/3.18/data/annotation/src/contrib/', - 'https://bioconductor.org/packages/3.18/data/experiment/src/contrib/', + 'https://bioconductor.org/packages/release/bioc/src/contrib/', 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive 'https://cran.r-project.org/src/contrib/', # current version of packages 'https://cran.freestatistics.org/src/contrib', # mirror alternative for current packages @@ -192,13 +185,15 @@ exts_list = [ ('cluster', '2.1.8', { 'checksums': ['c32a462e34694c99d58da953efa74882b5427f8c5db7cb226ae15c54ce6060ca'], }), - ('graph', '1.84.1', { - 'checksums': ['cd2a91c93c81c09d9c59853c417e8a9cdde39b0589bacdce4ca916b6ee5f45a7'], + ('graph', '1.86.0', { + 'checksums': ['ac9e196dfcb43848a851ea2d339cff41f8f16c7e80e76282c8fe7b822df8f367'], }), ('mclust', '6.1.1', { 'checksums': ['ddd7018e5e6ea7f92c7fc9872b391491b7e91c2cd89ef1dcaf4408afb5116775'], 
}), - ('cclust', '0.6-26'), + ('cclust', '0.6-26', { + 'checksums': ['92ec3c55a1864e4e1a4706bfdef8ad00727c720213ac656c718e867286b29857'], + }), ('flowClust', '3.40.0', { 'installopts': "--configure-args='--with-gsl=${EBROOTGSL} --enable-bundled-gsl=false'", 'checksums': ['7e699b06e378e32144704dbec18289109980b0f5eca166180f2c30007b83e0f5'], @@ -240,4 +235,4 @@ sanity_check_paths = { 'dirs': ['FCPS', 'dbscan', 'energy', 'protoclust'], } -moduleclass = 'bio' \ No newline at end of file +moduleclass = 'bio' diff --git a/envs/fcps_singularity.def b/envs/fcps_singularity.def deleted file mode 100644 index a4a615e..0000000 --- a/envs/fcps_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.3.3 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git \ - libgsl-dev - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - ## no versioning here - Rscript -e 'BiocManager::install(c( "dbscan", "cluster", "protoclust", "energy", "argparse", "mclust", "DataVisualizations", "FCPS", "cclust"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . 
/opt/default/bin/activate diff --git a/envs/r.yml b/envs/r.yml deleted file mode 100644 index 456e139..0000000 --- a/envs/r.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: r_for_metrics -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::r-mclust - - conda-forge::r-caret - - conda-forge::r-dplyr - - conda-forge::r-readr - - conda-forge::r-argparse - diff --git a/envs/r_singularity.def b/envs/r_singularity.def deleted file mode 100644 index f1f9ec9..0000000 --- a/envs/r_singularity.def +++ /dev/null @@ -1,37 +0,0 @@ -Bootstrap: docker -From: rocker/tidyverse:4.4 - -%labels - - AUTHOR izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip install gitpython==3.1.43 isodate pydantic-core - - # Install R packages - - Rscript -e 'BiocManager::install(c("mclust", "caret", "readr", "argparse"))' - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/envs/rmarkdown-python.eb b/envs/rmarkdown-python.eb new file mode 100644 index 0000000..a9edb00 --- /dev/null +++ b/envs/rmarkdown-python.eb @@ -0,0 +1,28 @@ +easyblock = 'Bundle' + +# This is a dummy bundle that installs: +# 1. rmarkdown: an R bundle that we also package +# 2. Python-3.12.3-GCCcore-13.3.0 +# This is a dependency for the clustering_benchmark metric collector. 
+ +name = 'rmarkdown-python' +version = '0.1.0' + +local_rver = '4.4.2' +local_pyver = '3.12.3' +versionsuffix = f'-r-{local_rver}-py-{local_pyver}' + +homepage = 'https://omnibenchmark.org' +description = 'Rmarkdown bundle with specific Python dependency' + +toolchain = {'name': 'system', 'version': '1.0'} + +dependencies = [ + ('rmarkdown', '0.1.0', f'-gfbf-2024a-r-{local_rver}'), + ('Python', local_pyver, '-GCCcore-13.3.0'), +] + +sanity_check_paths = { + 'files': [], + 'dirs': ['../../rmarkdown/0.1.0-gfbf-2024a-r-4.4.2'] +} diff --git a/envs/rmarkdown.def b/envs/rmarkdown.def new file mode 100644 index 0000000..aa20cc1 --- /dev/null +++ b/envs/rmarkdown.def @@ -0,0 +1,40 @@ +Bootstrap: docker +From: rocker/tidyverse:4.4 + +# TODO: we could merge this one with fcps.def, no need to duplicate the image. + +%labels + + AUTHOR izaskun.mallona@gmail.com + AUTHOR ben.uzh@proton.me + +%post + + # Install python (3.12 as of noble) + export DEBIAN_FRONTEND=noninteractive + apt-get update + apt-get install -y git \ + python-is-python3 \ + python3.12 \ + python3.12-venv \ + && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + # virtualenv + cd /opt + python3.12 -m venv "default" + . default/bin/activate + + pip install \ + "gitpython==3.1.43" \ + "isodate==0.7.2" \ + "pydantic-core==2.34.1" + + # Install R packages + Rscript -e 'BiocManager::install(c("mclust", "caret", "argparse"))' + + echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT + +%environment + + . /opt/default/bin/activate diff --git a/envs/rmarkdown.eb b/envs/rmarkdown.eb new file mode 100644 index 0000000..067eadd --- /dev/null +++ b/envs/rmarkdown.eb @@ -0,0 +1,197 @@ +easyblock = 'Bundle' + +# TODO(ben): Try to use https://www.eessi.io/docs/available_software/detail/R-bundle-CRAN/ +# and build only what's left out. 
+ +name = 'rmarkdown' +version = '0.1.0' +versionsuffix = '-r-%(rver)s' + +homepage = 'https://omnibenchmark.org' +description = 'rmarkdown bundle for clustbench reports' + +toolchain = {'name': 'gfbf', 'version': '2024a'} + +dependencies = [ + ('R', '4.4.2'), +] + +exts_default_options = { + 'source_urls': [ + 'https://cloud.r-project.org/src/contrib/', + 'https://cran.r-project.org/src/contrib/', # current version of packages + 'https://cran.r-project.org/src/contrib/Archive/%(name)s', # package archive + 'https://www.bioconductor.org/packages/release/bioc/src/contrib/', # bioconductor + ], + 'sources': ['%(name)s_%(version)s.tar.gz'], +} + +exts_defaultclass = 'RPackage' + +exts_list = [ + ('rlang', '1.1.6', { + 'checksums': ['18544c876f4e18ec554edecc308362a52fbc7e0805c4794cf59bcc4d0b57f330'], + }), + ('glue', '1.8.0', { + 'checksums': ['c86f364ba899b8662f5da3e1a75f43ae081ab04e0d51171d052356e7ee4b72a0'], + }), + ('cli', '3.6.4', { + 'checksums': ['0c39539ce173bcbf7abaca64e8d2c87ffec8257c144c31b793c4cf2dd9cf7620'], + }), + ('lifecycle', '1.0.4', { + 'checksums': ['ada4d3c7e84b0c93105e888647c5754219a8334f6e1f82d5afaf83d4855b91cc'], + }), + ('vctrs', '0.6.5', { + 'checksums': ['43167d2248fd699594044b5c8f1dbb7ed163f2d64761e08ba805b04e7ec8e402'], + }), + ('utf8', '1.2.4', { + 'checksums': ['418f824bbd9cd868d2d8a0d4345545c62151d321224cdffca8b1ffd98a167b7d'], + }), + ('lattice', '0.22-5', { + 'checksums': ['ba1fbe5e18a133507dca9851b7f933002bdb6d1f3ea5f410a0a441103b6da5f1'], + }), + ('pkgconfig', '2.0.3', { + 'checksums': ['330fef440ffeb842a7dcfffc8303743f1feae83e8d6131078b5a44ff11bc3850'], + }), + ('pillar', '1.10.2', { + 'checksums': ['2cdbe3fe1b28b62530880ab26fc3c874e0dd5060767ae1a8ee5685f65e56d645'], + }), + ('magrittr', '2.0.3', { + 'checksums': ['a2bff83f792a1acb801bfe6330bb62724c74d5308832f2cb6a6178336ace55d2'], + }), + ('fansi', '1.0.6', { + 'checksums': ['ea9dc690dfe50a7fad7c5eb863c157d70385512173574c56f4253b6dfe431863'], + }), + ('viridisLite', '0.4.2', { + 
'checksums': ['893f111d31deccd2cc959bc9db7ba2ce9020a2dd1b9c1c009587e449c4cce1a1'], + }), + ('RColorBrewer', '1.1-3', { + 'checksums': ['4f42f5423c45688b39f492c7892d93f37b4541831c8ffb140364d2bd89031ac0'], + }), + ('R6', '2.6.1', { + 'checksums': ['59c6eba8b1b912eb7e104f65053235604be853425ee67c152ac4e86a1f2073b4'], + }), + ('labeling', '0.4.3', { + 'checksums': ['c62f4fc2cc74377d7055903c5f1913b7295f7587456fe468592738a483e264f2'], + }), + ('farver', '2.1.2', { + 'checksums': ['528823b95daab4566137711f1c842027a952bea1b2ae6ff098e2ca512b17fe25'], + }), + ('Matrix', '1.7-3', { + 'checksums': ['6642e9db8cddf32a051972fd5a634bf7edbdc925c5c2d139bf71e92df00fb44e'], + }), + ('nlme', '3.1-168', { + 'checksums': ['23b78468344cb6775dee5e0d9c8133032d64f08ebaba20776508a0443a897362'], + }), + ('withr', '3.0.2', { + 'checksums': ['0a3a05f493d275cca4bf13c8c1b95a1a4eed7f83b2493f41fde02ce3fc92c1a3'], + }), + ('tibble', '3.2.1', { + 'checksums': ['65a72d0c557fd6e7c510d150c935ed6ced5db7d05fc20236b370f11428372131'], + }), + ('colorspace', '2.1-1', { + 'checksums': ['e721cee5f4d6e4b0fc8eb18265e316b4f856fd3be02f0775a26032663758cd0b'], + }), + ('munsell', '0.5.1', { + 'checksums': ['03a2fd9ac40766cded96dfe33b143d872d0aaa262a25482ce19161ca959429a6'], + }), + ('scales', '1.3.0', { + 'checksums': ['b33e0f6b44259551ce02befd52eac53602509fbfdd903920620c658c50f35888'], + }), + ('mgcv', '1.9-1', { + 'checksums': ['700fbc37bedd3a49505b9bc4949faee156d9cfb4f669d797d06a10a15a5bdb32'], + }), + ('MASS', '7.3-65', { + 'checksums': ['b07ef1e3c364ce56269b4a8a7759cc9f87c876554f91293437bb578cfe38172f'], + }), + ('isoband', '0.2.7', { + 'checksums': ['7693223343b45b86de2b5b638ff148f0dafa6d7b1237e822c5272902f79cdf61'], + }), + ('gtable', '0.3.6', { + 'checksums': ['d305a5fa11278b649d2d8edc5288bf28009be888a42be58ff8714018e49de0ef'], + }), + ('ggplot2', '3.5.2', { + 'checksums': ['0a30024a2ff3e569412223c8f14563ed504f3e0851de03e42d1b5f73fe1f06bf'], + }), + ('findpython', '1.0.9', { + 'checksums': 
['b6a15e0cdfcdd4b1cfc76f7e4eaad0125d4d52889711200075280e9b2a2cb7cb'], + }), + ('argparse', '2.2.5', { + 'checksums': ['53c8a9eb51041084eb3d9c271b14ebcb32dc2f50cf16afa5c54c504a97229ea4'], + }), + (name, '2.29', { + 'checksums': ['6662ac85316c869caad6e3b95468cad97f6eef106d47b066db8d40c05a490928'], + }), + ('generics', '0.1.3', { + 'checksums': ['75046163bfa8b8a4f4214c1b689e796207f6447182f2e5062cf570302387d053'], + }), + ('tidyselect', '1.2.1', { + 'checksums': ['169e97ba0bbfbcdf4a80534322751f87a04370310c40e27f04aac6525d45903c'], + }), + ('dplyr', '1.1.4', { + 'checksums': ['cf730414d5d4ab387b4e9890a4b1df9d17a3903488e8da8df1cf2e11e44558cb'], + }), + ('tidyr', '1.3.1', { + 'checksums': ['e820c261cb5543f572f49276a7bdc7302aa4215da4bf850b1b939a315353835d'], + }), + ('shape', '1.4.6.1', { + 'checksums': ['43f9bd0f997fd6cf1838efd8b2509c9a6396513f4e54a20360481634affd22a4'], + }), + ('GlobalOptions', '0.1.2', { + 'checksums': ['47890699668cfa9900a829c51f8a32e02a7a7764ad07cfac972aad66f839753e'], + }), + ('circlize', '0.4.16', { + 'checksums': ['16dc32c7704906d13a9e5281bb396e92fb89a6b17fa5e201953240726b650b67'], + }), + ('rjson', '0.2.23', { + 'checksums': ['55034575c854ed657e6701da278c0fdea251479624d06a963b2e58461a5f0f48'], + }), + ('GetoptLong', '1.0.5', { + 'checksums': ['8c237986ed3dfb72d956ad865ef7768644eebf144675ad66140acfd1aca9d701'], + }), + ('cluster', '2.1.8.1', { + 'checksums': ['4b95b78e09b17ddca72edc0bb180c753c004ed2f61c3eb12e0451ac77f441e57'], + }), + ('clue', '0.3-66', { + 'checksums': ['aa86dd58c05635eb394c9ede0dd15a4f24af4815f299451bbc7895c0f737c2fb'], + }), + ('png', '0.1-8', { + 'checksums': ['5a36fabb6d62ba2533d3fc4cececd07891942cfb76fe689ec0d550d08762f61c'], + }), + ('BiocGenerics', '0.54.0', { + 'checksums': ['413d6f74cbc671147f63eefc46b718af815d6497535c2198925d9306e00c41b9'], + }), + ('S4Vectors', '0.46.0', { + 'checksums': ['c34249c6a367a2a1e94158d9e60294f2b901e485d93717250a417569be187a40'], + }), + ('IRanges', '2.42.0', { + 'checksums': 
['0abb01ee93111c5fc678f9aa2f93d00d8d1548263cb60daa52645a6061b603fc'], + }), + ('matrixStats', '1.5.0', { + 'checksums': ['12996c5f3e6fc202a43e1087f16a71b7fa93d7e908f512542c7ee89cf95dcc15'], + }), + ('iterators', '1.0.14', { + 'checksums': ['cef3075a0930e1408c764e4da56bbadd4f7d14315809df8f38dd51f80ccc677b'], + }), + ('codetools', '0.2-20', { + 'checksums': ['3be6f375ec178723ddfd559d1e8e85bfeee04a5fbaf9f53f2f844e1669fea863'], + }), + ('foreach', '1.5.2', { + 'checksums': ['56338d8753f9f68f262cf532fd8a6d0fe25a71a2ff0107f3ce378feb926bafe4'], + }), + ('doParallel', '1.0.17', { + 'checksums': ['b96a25ad105a654d70c7b4ca27290dc9967bc47f4668b2763927a886b178abd7'], + }), + ('ComplexHeatmap', '2.24.0', { + 'checksums': ['2a015ad26c5a5f003ee203d77cc8d3eea5461bcf2db7ce102da1bef7db082650'], + }), +] + +modextrapaths = {'R_LIBS_SITE': ''} + +sanity_check_paths = { + 'files': [], + 'dirs': ['argparse', 'rmarkdown', 'ggplot2', 'tidyr', 'ComplexHeatmap'], +} + +moduleclass = 'bio' diff --git a/envs/rmarkdown.yml b/envs/rmarkdown.yml index e57969e..ed5c65e 100644 --- a/envs/rmarkdown.yml +++ b/envs/rmarkdown.yml @@ -7,6 +7,8 @@ dependencies: - conda-forge::python=3.12.6 - conda-forge::r-argparse - conda-forge::r-rmarkdown + - conda-forge::r-cairo + - conda-forge::r-svglite - conda-forge::r-ggplot2 - - conda-forge::r-tidyr + - conda-forge::r-tidyr - bioconda::bioconductor-complexheatmap diff --git a/envs/sklearn.yml b/envs/sklearn.yml deleted file mode 100644 index 258b7ea..0000000 --- a/envs/sklearn.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: sklearn -channels: - - conda-forge - - nodefaults -dependencies: - - conda-forge::python=3.12.6 - - conda-forge::scikit-learn - - conda-forge::pip - - pip: - - "pandas" - - "argparse" diff --git a/envs/sklearn_singularity.def b/envs/sklearn_singularity.def deleted file mode 100644 index 939a3bb..0000000 --- a/envs/sklearn_singularity.def +++ /dev/null @@ -1,33 +0,0 @@ -Bootstrap: docker -From: ubuntu:jammy-20240911.1 - -%labels - - AUTHOR 
izaskun.mallona@gmail.com - -%post - - # Install python3.12 - apt-get update - apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev \ - libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev python-is-python3 git - - wget https://www.python.org/ftp/python/3.12.6/Python-3.12.6.tgz - tar -xf Python-3.12.6.tgz - cd Python-3.12.*/ - ./configure --enable-optimizations - make -j 4 - make altinstall - - # virtualenv - cd /opt - python3.12 -m venv "default" - . default/bin/activate - - pip3 install -U scikit-learn pandas argparse numpy scipy "isodate" "pydantic-core" "gitpython==3.1.43" - - echo '. /opt/default/bin/activate' >> $SINGULARITY_ENVIRONMENT - -%environment - - . /opt/default/bin/activate diff --git a/microbenchmark/microbench.py b/microbenchmark/microbench.py new file mode 100644 index 0000000..6abc6ee --- /dev/null +++ b/microbenchmark/microbench.py @@ -0,0 +1,71 @@ +""" +This script exercises a few common linear algebra operations in numpy. +It's intended mostly to gauge whether it makes sense to descend into +compiler optimizations for the Python binary that we ship within the SIF images, +but it can be easily repurposed for other specific microbenchmarks (i.e., numba or GPU perf gains). + +Be aware that here we're profiling simple operations; it would make sense to carefully +profile the libraries of interest to see where the computational bottlenecks really are. 
+ +Usage: + +singularity exec clustbench-vanilla.sif python3 microbench.py +singularity exec clustbench-optimized.sif python3 microbench.py + +References: https://pythonspeed.com/articles/faster-python/ +""" +import numpy as np +import time +import json +from statistics import mean, stdev + +DEFAULT_REPETITIONS = 10 + +def run_operation(operation, func, repetitions): + timings = [] + for _ in range(repetitions): + start = time.perf_counter() + func() + elapsed = time.perf_counter() - start + timings.append(elapsed) + return { + 'operation': operation, + 'mean': mean(timings), + 'stdev': stdev(timings), + 'runs': repetitions + } + +def benchmark(repetitions=DEFAULT_REPETITIONS): + np.random.seed(42) + size = 1000 + + # Create random matrices + A = np.random.rand(size, size) + B = np.random.rand(size, size) + C = A @ A.T # Ensure positive definite for Cholesky + + # Define operations + operations = [ + ('mat_mul', lambda: np.dot(A, B)), + ('svd', lambda: np.linalg.svd(A)), + ('chol_decomp', lambda: np.linalg.cholesky(C)) + ] + + results = [] + for operation, func in operations: + try: + result = run_operation(operation, func, repetitions) + except np.linalg.LinAlgError: + result = { + 'operation': operation, + 'error': 'Operation failed due to numerical instability' + } + results.append(result) + + # Output results as JSON + print(json.dumps(results, indent=2)) + +if __name__ == "__main__": + import sys + repetitions = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_REPETITIONS + benchmark(repetitions)