Commit dd7255d

[benchmark] notebook for interactively computing benchmark config
1 parent 5a2655b

File tree: 10 files changed, +1060 -230 lines

hail/notebooks/benchmark/.gitignore

Lines changed: 2 additions & 0 deletions

in/
out/

hail/notebooks/benchmark/minimal-detectable-slowdown.ipynb

Lines changed: 528 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions

import tempfile
from pathlib import Path

import pytest

from benchmark.tools.impex import dump_tsv, import_timings
from benchmark.tools.statistics import analyze_benchmarks


@pytest.mark.benchmark()
def benchmark_analyze_benchmarks(local_tmpdir, onethreetwo, onethreethree):
    inputs = (onethreetwo, onethreethree)
    inputs = ((v, Path(tempfile.mktemp(dir=local_tmpdir))) for v in inputs)
    inputs = ((dump_tsv(v, t), t)[-1] for v, t in inputs)

    tables = (import_timings(v) for v in inputs)
    tables = (t.select(instances=t.instances.trials.time) for t in tables)
    tables = (t._key_by_assert_sorted(*t.key.drop('version')) for t in tables)
    tables = (t.checkpoint(tempfile.mktemp(suffix='.mt', dir=local_tmpdir)) for t in tables)

    results = analyze_benchmarks(
        *tables,
        n_bootstrap_iterations=1000,
        confidence=0.95,
    )

    results._force_count()
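
A sketch of the equivalent interactive workflow (presumably what the new notebook drives): the fixture names above suggest timing dumps for two Hail versions, 0.2.132 and 0.2.133. The file paths below are hypothetical and the steps simply mirror the benchmark function, not a documented API.

    import tempfile
    from pathlib import Path

    from benchmark.tools.impex import dump_tsv, import_timings
    from benchmark.tools.statistics import analyze_benchmarks

    tables = []
    for jsonl in (Path('in/0.2.132.jsonl'), Path('in/0.2.133.jsonl')):  # hypothetical timing dumps
        tsv = Path(tempfile.mktemp(suffix='.tsv'))
        dump_tsv(jsonl, tsv)  # flatten nested runs into one TSV row per run
        ht = import_timings(tsv)  # re-nest into instances/trials keyed by (path, name, version)
        ht = ht.select(instances=ht.instances.trials.time)  # keep only the timing values
        ht = ht._key_by_assert_sorted(*ht.key.drop('version'))  # align the two versions on (path, name)
        tables.append(ht)

    results = analyze_benchmarks(*tables, n_bootstrap_iterations=1000, confidence=0.95)
    # Assuming analyze_benchmarks returns a Hail table, as the _force_count() call above suggests:
    results.show()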

hail/python/benchmark/tools/compare.py

Lines changed: 0 additions & 146 deletions
This file was deleted.

hail/python/benchmark/tools/impex.py

Lines changed: 99 additions & 0 deletions

import json
from pathlib import Path
from typing import Any, Generator, List, Sequence

import hail as hl
from benchmark.tools import maybe, prune


class __types:
    trun = hl.tstruct(
        iteration=hl.tint,  # 0-based
        is_burn_in=hl.tbool,  # ignore for a/b testing
        time=hl.tfloat,  # seconds
        failure=hl.tstr,  # exception message dumped to a string, optional
        timed_out=hl.tbool,  # whether or not the failure was caused by a timeout
        task_memory=hl.tfloat,  # don't think this works yet sadly.
    )

    ttrial = hl.tstruct(
        path=hl.tstr,
        name=hl.tstr,
        version=hl.tstr,
        uname=hl.tdict(hl.tstr, hl.tstr),
        batch_id=hl.tint,
        job_id=hl.tint,
        trial=hl.tint,
        attempt_id=hl.tstr,
        start=hl.tstr,
        end=hl.tstr,
        **trun,
    )


def __write_tsv_row(os, row: Sequence[str]) -> None:
    if len(row) > 0:
        os.write('\t'.join(row))
        os.write('\n')


def dump_tsv(jsonl: Path, tsv: Path) -> None:
    def explode(trial: dict) -> Generator[List[Any], Any, None]:
        trial['uname'] = json.dumps(trial['uname'])
        for run in trial['runs']:
            flattened = prune({**trial, **run, 'failure': maybe(json.dumps, run.get('failure')), 'runs': None})
            yield [maybe(str, flattened.get(f), 'NA') for f in __types.ttrial]

    with (
        jsonl.open(encoding='utf-8') as in_,
        tsv.open('w', encoding='utf-8') as out,
    ):
        __write_tsv_row(out, [n for n in __types.ttrial])
        for line in in_:
            trial = json.loads(line)
            for row in explode(trial):
                __write_tsv_row(out, row)


def import_timings(timings_tsv: Path) -> hl.Table:
    ht = hl.import_table(str(timings_tsv), types=__types.ttrial)
    trial_key = [t for t in __types.ttrial.fields if t not in set(('uname', *__types.trun.fields))]
    ht = ht.group_by(*trial_key).aggregate(
        runs=hl.sorted(
            hl.agg.collect(ht.row_value.select(*__types.trun)),
            lambda t: t.iteration,
        ),
    )

    # Rename terms to be consistent with those of Laaber et al.:
    # - "trial" (ie batch job) -> "instance"
    # - "run" (benchmark invocation) -> "trial"
    #
    # Note we don't run benchmarks multiple times per trial as these are
    # "macro"-benchmarks. This is one area where we differ from Laaber et al.
    ht = ht.select(
        instance=hl.struct(
            instance=ht.trial,
            batch_id=ht.batch_id,
            job_id=ht.job_id,
            attempt_id=ht.attempt_id,
            start=ht.start,
            end=ht.end,
            trials=hl.filter(
                lambda t: (
                    hl.is_missing(t.failure)
                    | (hl.is_defined(t.failure) & (hl.len(t.failure) == 0))
                    | ~t.timed_out
                    | ~t.is_burn_in
                ),
                ht.runs,
            ),
        ),
    )

    return ht.group_by(ht.path, ht.name, ht.version).aggregate(
        instances=hl.sorted(
            hl.agg.collect(ht.instance),
            key=lambda i: (i.instance, i.attempt_id),
        )
    )
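
For reference, a sketch of the round trip this module implements. The record below is invented but follows the __types.ttrial / __types.trun schema above: one JSON object per line of the input JSONL, each holding trial-level fields plus a nested 'runs' array; dump_tsv writes one TSV row per run, and import_timings re-nests the rows into per-instance trials.

    import json
    import tempfile
    from pathlib import Path

    from benchmark.tools.impex import dump_tsv, import_timings

    # Hypothetical trial record; field names follow __types.ttrial / __types.trun.
    record = {
        'path': 'benchmarks/matrix_table_benchmarks.py',
        'name': 'benchmark_mt_scan',
        'version': '0.2.133',
        'uname': {'system': 'Linux'},
        'batch_id': 1,
        'job_id': 7,
        'trial': 0,
        'attempt_id': 'a1',
        'start': '2024-01-01T00:00:00Z',
        'end': '2024-01-01T00:05:00Z',
        'runs': [
            {'iteration': 0, 'is_burn_in': True, 'time': 1.41, 'failure': None, 'timed_out': False},
            {'iteration': 1, 'is_burn_in': False, 'time': 1.32, 'failure': None, 'timed_out': False},
        ],
    }

    jsonl = Path(tempfile.mktemp(suffix='.jsonl'))
    jsonl.write_text(json.dumps(record) + '\n', encoding='utf-8')

    tsv = Path(tempfile.mktemp(suffix='.tsv'))
    dump_tsv(jsonl, tsv)      # one TSV row per element of 'runs', trial-level fields repeated
    ht = import_timings(tsv)  # grouped back into instances -> trials, keyed by (path, name, version)
    ht.describe()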
Lines changed: 35 additions & 0 deletions

from collections.abc import Generator
from typing import Any, List, Optional

import hail as hl
from hail.ggplot import GGPlot, aes, geom_line, geom_point, ggplot, ggtitle


def __agg_names(ht: hl.Table) -> List[str]:
    return ht.aggregate(hl.array(hl.agg.collect_as_set(ht.name)))


def plot_trial_against_time(
    ht: hl.Table,
    names: Optional[List[str]] = None,
) -> Generator[GGPlot, Any, Any]:
    for name in names or __agg_names(ht):
        k = ht.filter(ht.name == name)
        k = k.explode(k.instances, name='__instance')
        k = k.select(**k.__instance)
        k = k.explode(k.trials, name='trial')
        yield (
            ggplot(k, aes(x=k.trial.iteration, y=k.trial.time, color=hl.str(k.instance))) + geom_line() + ggtitle(name)
        )


def plot_mean_time_per_instance(
    ht: hl.Table,
    names: Optional[List[str]] = None,
) -> Generator[GGPlot, Any, Any]:
    for name in names or __agg_names(ht):
        k = ht.filter(ht.name == name)
        k = k.explode(k.instances, name='__instance')
        k = k.select(**k.__instance)
        k = k.annotate(s=k.trials.aggregate(lambda t: hl.agg.stats(t.time)))
        yield (ggplot(k, aes(x=k.instance, y=k.s.mean)) + geom_point() + ggtitle(name))
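
A minimal usage sketch, assuming the plotting helpers above are importable in the current session; the TSV path and benchmark name are hypothetical. Each generator yields one plot per benchmark name, so a notebook cell can iterate and call show().

    from pathlib import Path

    from benchmark.tools.impex import import_timings

    ht = import_timings(Path('in/timings.tsv'))  # hypothetical TSV produced by dump_tsv

    # One line plot per benchmark: per-trial wall time, colored by instance.
    for p in plot_trial_against_time(ht, names=['benchmark_mt_scan']):  # hypothetical name
        p.show()

    # One scatter plot per benchmark: mean trial time on each instance.
    for p in plot_mean_time_per_instance(ht):
        p.show()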
