Commit
Laaber + Schultz's method of estimating slowdown
Showing 20 changed files with 960 additions and 4,679 deletions.
@@ -0,0 +1,2 @@
data/
out/
281 changes: 281 additions & 0 deletions
hail/notebooks/benchmark/minimal-detectable-slowdown.ipynb
@@ -0,0 +1,281 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook explores variability in hail's python (macro-)benchmarks when\n",
"said benchmarks are executed on the hail batch service. The analyses within\n",
"are based on the methods proposed in [1], albeit slightly modified for\n",
"long-running benchmarks. The goals of these analyses are\n",
"\n",
"- to determine if we can reliably detect slowdowns of 5% or less when running\n",
"  benchmarks on hail batch;\n",
"- to identify configurations (number of batch jobs × iterations) that let us\n",
"  detect slowdowns efficiently (i.e. without excessive time and money).\n",
"\n",
"[1] Laaber et al., Software Microbenchmarking in the Cloud. How Bad Is It Really?\n",
"    https://dl.acm.org/doi/10.1007/s10664-019-09681-1"
]
},
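{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before touching the data, the next cell sketches the core idea behind the\n",
"minimal-detectable-slowdown (MDS) analyses at the end of this notebook:\n",
"bootstrap a confidence interval of the mean for a baseline and for a\n",
"slowed-down variant, and call the slowdown detectable when the intervals do\n",
"not overlap. The sketch runs on simulated timings and assumes only `numpy`;\n",
"none of its names belong to the benchmark tooling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only; see the markdown cell above.\n",
"import numpy as np\n",
"\n",
"rng = np.random.default_rng(0)\n",
"\n",
"\n",
"def bootstrap_ci(samples, n_resamples=1000, level=0.95):\n",
"    # percentile-bootstrap confidence interval of the mean\n",
"    means = np.array([rng.choice(samples, size=len(samples), replace=True).mean() for _ in range(n_resamples)])\n",
"    return np.quantile(means, [(1 - level) / 2, (1 + level) / 2])\n",
"\n",
"\n",
"# simulated timings: a noisy baseline and a variant slowed down by 5%\n",
"baseline = rng.normal(loc=10.0, scale=0.3, size=30)\n",
"slowed = baseline * 1.05\n",
"\n",
"# the slowdown is 'detected' when the two intervals do not overlap\n",
"lo_a, hi_a = bootstrap_ci(baseline)\n",
"lo_b, hi_b = bootstrap_ci(slowed)\n",
"print('detected:', hi_a < lo_b or hi_b < lo_a)"
]
},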
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from benchmark.tools.impex import dump_tsv, import_timings\n",
"from benchmark.tools.plotting import plot_mean_time_per_instance, plot_trial_against_time\n",
"from benchmark.tools.statistics import (\n",
"    bootstrap_mean_confidence_interval,\n",
"    laaber_mds,\n",
"    schultz_mds,\n",
"    variability,\n",
")\n",
"from IPython.display import clear_output\n",
"from plotly.io import renderers\n",
"\n",
"import hail as hl\n",
"\n",
"renderers.default = 'notebook_connected'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hl.init(backend='spark', idempotent=True, local_tmpdir='/tmp/mds')\n",
"hl._set_flags(use_new_shuffle='1', lower='1')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import benchmark data\n",
"# ---------------------\n",
"#\n",
"# Benchmarks under `hail/python/benchmarks` are executed with a custom pytest\n",
"# plugin and their results are output as JSON Lines (.jsonl).\n",
"# Unscrupulously, we use hail to analyse itself.\n",
"\n",
"with hl.TemporaryFilename(dir='/tmp') as tsvfile:\n",
"    timings = Path(tsvfile)\n",
"    dump_tsv(Path('data/1k.jsonl'), timings)\n",
"    ht = import_timings(timings)\n",
"    ht = ht.checkpoint('out/imported.ht', overwrite=True)\n",
"\n",
"benchmarks = ht.aggregate(hl.agg.collect_as_set(ht.name))\n",
"print(*benchmarks, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep benchmarks for which all 60 instances reported results.\n",
"t = ht\n",
"t = t.filter(hl.len(t.instances) == 60)\n",
"names = t.aggregate(hl.array(hl.agg.collect_as_set(t.name)))\n",
"print(*names, sep='\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plotting time against iteration for all instances provides a visual way of\n",
"# identifying the number of burn-in iterations required to reach a steady state.\n",
"# Note that in some cases a steady state is never reached.\n",
"\n",
"for fig in plot_trial_against_time(ht, names=names):\n",
"    clear_output(wait=True)\n",
"    print(fig.labels.title)\n",
"    fig.show()\n",
"    input()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is an iterative process. Select the minimum number of burn-in iterations\n",
"# required for each benchmark. Replot and verify that the graph is more-or-less\n",
"# flat. This may not be possible in all cases.\n",
"\n",
"\n",
"def filter_burn_in_iterations(ht: hl.Table) -> hl.Table:\n",
"    ht = ht.annotate_globals(\n",
"        first_stable_index={\n",
"            'benchmark_join_partitions_table[100-10]': 15,\n",
"            'benchmark_union_partitions_table[10-10]': 4,\n",
"            'benchmark_join_partitions_table[1000-1000]': 15,\n",
"            'benchmark_write_range_table[10000000-1000]': 5,\n",
"            'benchmark_matrix_table_array_arithmetic': 15,\n",
"            'benchmark_table_aggregate_array_sum': 5,\n",
"            'benchmark_matrix_table_cols_show': 10,\n",
"            'benchmark_pc_relate': hl.missing(hl.tint),\n",
"            'benchmark_write_profile_mt': 20,\n",
"            'benchmark_table_aggregate_approx_cdf': 28,\n",
"            'benchmark_table_aggregate_counter': 12,\n",
"            'benchmark_table_show': 10,\n",
"            'benchmark_export_range_matrix_table_entry_field_p100': 5,\n",
"            'benchmark_group_by_collect_per_row': 8,\n",
"            'benchmark_export_range_matrix_table_row_p100': 20,\n",
"            'benchmark_import_gvcf_force_count': 10,\n",
"            'benchmark_matrix_table_take_col': 30,\n",
"            'benchmark_ndarray_matmul_int64': 23,\n",
"            'benchmark_sample_qc': 14,\n",
"            'benchmark_shuffle_key_rows_by_mt': 10,\n",
"            'benchmark_union_partitions_table[100-100]': 40,\n",
"        },\n",
"    )\n",
"\n",
"    return ht.select(\n",
"        instances=ht.instances.map(\n",
"            lambda instance: instance.annotate(\n",
"                trials=instance.trials.filter(lambda t: t.iteration >= ht.first_stable_index[ht.name])\n",
"            )\n",
"        ),\n",
"    )\n",
"\n",
"\n",
"ht = filter_burn_in_iterations(ht)\n",
"plot_trial_against_time(ht)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# As a final cleaning step, filter out trials whose times differ from each\n",
"# instance's median by more than some multiplicative factor.\n",
"\n",
"\n",
"def filter_outliers(ht: hl.Table, factor: hl.Float64Expression) -> hl.Table:\n",
"    # Drop trials where max(time, median) / min(time, median) >= factor\n",
"    return ht.select(\n",
"        instances=ht.instances.map(\n",
"            lambda instance: instance.annotate(\n",
"                trials=hl.bind(\n",
"                    lambda median: instance.trials.filter(\n",
"                        lambda t: hl.max([t.time, median]) / hl.min([t.time, median]) < factor\n",
"                    ),\n",
"                    hl.median(instance.trials.map(lambda t: t.time)),\n",
"                )\n",
"            ),\n",
"        ),\n",
"    )\n",
"\n",
"\n",
"ht = filter_outliers(ht, hl.float64(10))\n",
"plot_trial_against_time(ht)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# These plots show the mean time per instance. They provide a visual way of\n",
"# spotting differences in underlying machine type: multiple distinct bands\n",
"# suggest the benchmarks ran on heterogeneous hardware.\n",
"\n",
"plot_mean_time_per_instance(ht)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the per-trial times, then checkpoint the pruned table.\n",
"ht = ht.select(instances=ht.instances.trials.time).checkpoint('out/pruned.ht', overwrite=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Laaber et al., section 4: variability of the measurements\n",
"\n",
"variability(ht).show()"
]
},
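{
"cell_type": "markdown",
"metadata": {},
"source": [
"For intuition, the next cell sketches one common variability measure, the\n",
"coefficient of variation (standard deviation over mean), on toy data. It is\n",
"a hypothetical stand-in: `variability`'s actual output may include more (or\n",
"different) statistics."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of a variability measure; not the real `variability`.\n",
"import numpy as np\n",
"\n",
"# toy data: per-trial times for two instances of one benchmark\n",
"instances = [[10.1, 10.3, 9.9], [10.6, 10.4, 10.5]]\n",
"pooled = np.concatenate([np.asarray(ts) for ts in instances])\n",
"\n",
"# coefficient of variation: relative spread of the pooled timings\n",
"print(f'cv = {pooled.std(ddof=1) / pooled.mean():.3f}')"
]
},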
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Laaber et al., section 5: bootstrapping confidence intervals of the mean\n",
"\n",
"bootstrap_mean_confidence_interval(ht, 1000, 0.95).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Laaber et al.: minimal detectable slowdown (MDS)\n",
"\n",
"laaber = laaber_mds(ht).checkpoint('out/laaber-mds.ht', overwrite=True)\n",
"schultz = schultz_mds(ht).checkpoint('out/schultz-mds.ht', overwrite=True)"
]
},
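{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the MDS notion concrete: the sketch below reuses `bootstrap_ci` and\n",
"`baseline` from the simulated example near the top of the notebook, scanning\n",
"slowdown factors for the smallest one that the interval-overlap test flags.\n",
"`laaber_mds` and `schultz_mds` differ in how they resample and compare, but\n",
"they answer the same question. Illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the smallest injected slowdown the overlap test detects.\n",
"def minimal_detectable_slowdown(samples, factors):\n",
"    for f in sorted(factors):\n",
"        lo_a, hi_a = bootstrap_ci(samples)\n",
"        lo_b, hi_b = bootstrap_ci(samples * f)\n",
"        if hi_a < lo_b or hi_b < lo_a:\n",
"            return f\n",
"    return None\n",
"\n",
"\n",
"print('mds factor:', minimal_detectable_slowdown(baseline, [1.01, 1.02, 1.05, 1.10, 1.25]))"
]
},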
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"mds = laaber.select(laaber=laaber.row_value, schultz=schultz[laaber.key])\n",
"mds.show(100_000)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}