Skip to content

Commit

Permalink
Merge pull request #15 from joshmoore/multi-chunks
Browse files Browse the repository at this point in the history
Load multiple, random chunks
  • Loading branch information
joshmoore authored Feb 25, 2021
2 parents de52a06 + 1f6d1ab commit b8b6547
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 83 deletions.
7 changes: 4 additions & 3 deletions .env
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
HOST=nginx
XY=65536
XY=32768
Z=1
C=32
T=1
XC=1024
XC=256
ZC=1
ROOT=/uod/idr-scratch/ngff-latency-benchmark
DIR=${ROOT}/${XY}-Z-${Z}-T-${T}-C-${C}-XYC-${XC}-ZC-${ZC}
BASE=IMS_XY-${XY}-Z-${Z}-T-${T}-C-${C}-XYC-${XC}-ZC-${ZC}
TEST_REPEATS=1
ROUNDS=5
TEST_REPEATS=10
2 changes: 2 additions & 0 deletions benchmark/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@ COPY plot_results.py /benchmarks/plot_results.py

# see https://github.com/zarr-developers/zarr-python/pull/699
RUN conda run -n benchmark pip install git+https://github.com/joshmoore/zarr-python@key-sep#egg=zarr
RUN conda run -n benchmark pip install pytest-profiling
RUN conda run -n benchmark pip install seaborn

ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "benchmark", "bash", "/benchmark/benchmark.sh"]
65 changes: 54 additions & 11 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import time

import fsspec
import h5py
import pytest
import random
import requests
import s3fs
import tifffile
import zarr
from copy import deepcopy
from os import environ

# for product
from functools import reduce # Required in Python 3
import operator

DIR = environ.get("DIR", "data")
BASE = environ.get("BASE", "retina_large")
HOST = environ.get("HOST", "localhost")
Expand All @@ -20,11 +25,48 @@
}


class ChunkChoices:
    """Random selection of chunk positions to be loaded by the benchmarks.

    The dataset geometry is read from the ``Z``, ``T``, ``ZC``, ``XY``, ``C``
    and ``XC`` environment variables. Every chunk position is enumerated as
    a 1-based ``(t, c, z, y, x)`` tuple and ``ROUNDS`` of them are drawn at
    random, without replacement, into ``chunk_choices``.
    """

    def __init__(self):
        def from_env(name):
            # int(None) raises TypeError when the variable is not set.
            return int(environ.get(name))

        self.z = from_env("Z")
        self.t = from_env("T")
        self.zc = from_env("ZC")
        self.xy = from_env("XY")
        self.c = from_env("C")
        self.xc = from_env("XC")

        # X is the outermost loop and T the innermost; the same XY // XC
        # count is used for both the X and the Y axis.
        every_chunk = [
            (t + 1, c + 1, z + 1, y + 1, x + 1)
            for x in range(self.xy // self.xc)
            for y in range(self.xy // self.xc)
            for z in range(self.z // self.zc)
            for c in range(self.c)
            for t in range(self.t)
        ]
        self.chunk_choices = random.sample(every_chunk, ROUNDS)

    def pop(self):
        """Remove and return the last remaining chunk position."""
        return self.chunk_choices.pop()


CHOICES = ChunkChoices()


class Fixture:

def __init__(self, benchmark):
    """Run ``self.run`` through ``benchmark.pedantic`` for ``ROUNDS`` rounds.

    ``benchmark`` only needs a ``pedantic`` method here; presumably it is
    the pytest-benchmark fixture (verify against the calling tests).
    """
    # Work on a private copy so popping choices does not alter CHOICES.
    self.choices = deepcopy(CHOICES)
    pedantic_options = {"setup": self.setup, "rounds": ROUNDS}
    benchmark.pedantic(self.run, **pedantic_options)

def prod(self, seq):
    """Return the product of all items in ``seq`` (1 for an empty ``seq``)."""
    result = 1
    for factor in seq:
        result = result * factor
    return result

def load(self, data, chunk_shape, chunk_index):
X = list() # eXtents
for i in range(len(chunk_shape)): # zarr=5, HDF5=3
shape = chunk_shape[i]
index = chunk_index[i]
X.append(slice(shape*(index-1), shape*index))
return len(data[tuple(X)]) == self.prod(chunk_shape)

@classmethod
def methods(cls):
    """Return the class attributes ``local``, ``http`` and ``s3`` as a tuple."""
    local, http, s3 = cls.local, cls.http, cls.s3
    return (local, http, s3)
Expand Down Expand Up @@ -82,7 +124,7 @@ def setup(self):
def run(self):
data = self.group["0"]
chunks = data.chunks
len(data[0:chunks[0], 0:chunks[1], 0:chunks[2], 0:chunks[3], 0:chunks[4]])
self.load(data, chunks, self.choices.pop())

ZarrFixture(benchmark)

Expand All @@ -98,12 +140,12 @@ def setup(self):
self.f = fs.open(filename)

def run(self):
self.tif = tifffile.TiffFile(self.f)
fh = self.tif.filehandle
for page in self.tif.pages:
fh.seek(page.dataoffsets[0])
fh.read(page.databytecounts[0])
return
with tifffile.TiffFile(self.f) as tif:
store = tif.aszarr()
group = zarr.group(store=store)
data = group["0"]
chunks = data.chunks
self.load(data, chunks, self.choices.pop())

TiffFixture(benchmark)

Expand All @@ -117,11 +159,12 @@ class HDF5Fixture(Fixture):

def setup(self):
self.f = fs.open(filename)
self.file = h5py.File(self.f)

def run(self):
self.ims = h5py.File(self.f)
data = self.ims["DataSet"]["ResolutionLevel 0"]["TimePoint 0"]["Channel 0"]["Data"]
t, c, *idx = self.choices.pop()
data = self.file["DataSet"]["ResolutionLevel 0"][f"TimePoint {t-1}"][f"Channel {c-1}"]["Data"]
chunks = data.chunks
len(data[0:chunks[0], 0:chunks[1], 0:chunks[2]])
self.load(data, chunks, idx)

HDF5Fixture(benchmark)
169 changes: 105 additions & 64 deletions benchmark/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from collections import defaultdict

json_path = os.environ.get("BENCHMARK_DATA", "benchmark_data")
Expand All @@ -14,10 +17,11 @@

print('base_path', json_path)

named_data = defaultdict(list)

three_col = []
two_col = []

test_repeats = int(os.getenv('TEST_REPEATS'))
test_repeats = int(os.getenv('TEST_REPEATS', "1"))
json_files = ["%s_benchmark_data.json" % r for r in range(test_repeats)]
print('json_files', json_files)

Expand All @@ -30,69 +34,106 @@
benchmarks = json.load(json_file)['benchmarks']

for bm in benchmarks:
label = bm['name'].replace('test_', '')
m = re.match(r"test_(1_byte|\w+)_(\w+)\[(\w+)\]", bm['name'])
if not m:
raise Exception(bm['name'])
typ = m.group(1).replace("1_byte", "overhead")
src = m.group(3)

if typ == "overhead" and src == "local":
# 10e-5 skews the view.
continue

if test_repeats == 1:
# Ran tests once: plot every data point
named_data[label] = bm['stats']['data']
vals = bm['stats']['data']
for run, val in enumerate(vals):
three_col.append(
{
"type": typ,
"source": src,
"seconds": val,
}
)
two_col.append(
{
"name": f"{typ}-{src}",
"seconds": val,
}
)
else:
# Repeats: take mean value from each
named_data[label].append(bm['stats']['mean'])


# print(named_data.keys())
# ['1_byte_overhead[local]', '1_byte_overhead[http]', '1_byte_overhead[boto3]', '1_byte_overhead[s3]',
# 'zarr_chunk[local]', 'zarr_chunk[http]', 'zarr_chunk[boto3]', 'zarr_chunk[s3]',
# 'tiff_tile[local]', 'tiff_tile[http]', 'tiff_tile[s3]',
# 'hdf5_chunk[local]', 'hdf5_chunk[http]', 'hdf5_chunk[s3]',
# 'download_1[local]', 'download_1[http]', 'download_1[s3]',
# 'download_2[local]', 'download_2[http]', 'download_2[boto3]', 'download_2[3fs]']

# plot [hdf5/tiff/zarr] for s3, remote, local
to_plot = [
'hdf5_chunk[s3]', 'tiff_tile[s3]', 'zarr_chunk[s3]', '1_byte_overhead[s3]',
'hdf5_chunk[http]', 'tiff_tile[http]', 'zarr_chunk[http]', '1_byte_overhead[http]',
'hdf5_chunk[local]', 'tiff_tile[local]', 'zarr_chunk[local]', '1_byte_overhead[local]',
]
labels = [
'hdf5 (s3)', 'tiff (s3)', 'zarr (s3)', 'overhead (s3)',
'hdf5 (remote)', 'tiff (remote)', 'zarr (remote)', 'overhead (remote)',
'hdf5 (local)', 'tiff (local)', 'zarr (local]', 'overhead (local)',
]
data = [named_data[key] for key in to_plot]

def get_color(label):
if 'hdf5' in label:
return 'blue'
elif 'tiff' in label:
return 'green'
elif 'overhead' in label:
return 'yellow'
else:
return 'pink'
colors = [get_color(label) for label in labels]

fig1, ax1 = plt.subplots(figsize=(10, 5), dpi=100)
ax1.set_title(f'ngff benchmark ({xy}x{xy}) n={test_repeats}')
boxplot = ax1.boxplot(
data,
labels=labels,
positions=range(len(labels), 0, -1), # reverse order
patch_artist=True, # fill with color
# showfliers=False,
vert=False)

for color, patch in zip(colors, boxplot['boxes']):
patch.set_facecolor(color)
patch.set_edgecolor('grey')
for feature in ['caps', 'whiskers']:
for line in boxplot[feature]:
line.set_color('grey')
for line in boxplot['means']:
line.set_color('black')
for circle in boxplot['fliers']:
circle.set_color('grey')
ax1.set_xscale('log')
ax1.set_xlabel('Chunk loading time (secs)')

plt.tight_layout()
plt.savefig(plot_path)
val = bm['stats']['mean']
three_col.append(
{
"type": typ,
"source": src,
"seconds": val,
}
)
two_col.append(
{
"name": f"{typ}-{src}",
"seconds": val,
}
)

# Turn the collected rows into tables: df3 keeps "type" and "source" as
# separate columns, df2 carries a combined "name" column instead.
df3 = pd.DataFrame(three_col)
df2 = pd.DataFrame(two_col)  # not used by the plot below

types = ("overhead", "zarr", "tiff", "hdf5")
sources = ("local", "http", "s3")
orders = {"type": types, "source": sources}

pal_points = "colorblind"
pal_violins = "pastel"

# One panel per source; each panel gets its own y axis (sharey=False).
g = sns.FacetGrid(
    df3,
    col="source",
    col_order=sources,
    sharey=False,
    height=5,
    aspect=0.6,
)

# Boxen plot of the timings per type ...
g = g.map(
    sns.boxenplot,
    "type",
    "seconds",
    order=types,
    width=0.6,
    k_depth=2,
    palette=pal_violins,
    dodge=True,
    showfliers=False,
)

# ... overlaid with the individual samples as jittered points.
g = g.map(
    sns.stripplot,
    "type",
    "seconds",
    dodge=True,
    order=types,
    jitter=0.2,
    size=3,
    palette=pal_points,
)

g.despine(left=True)
g.set(yscale="log", ylim=(0.0009, 1))

# Set axis labels & ticks
axes = g.fig.get_axes()
for ax in axes:
    # Presumably the facet title reads "source = <name>"; drop the prefix
    # and the space left behind so the x label is just the source name.
    label = ax.get_title().replace("source =", "").strip()
    ax.set_xlabel(label)
    ax.set_xticklabels(types)
    ax.set_title("")
    for col in ax.collections:
        # White edges so the drawn shapes have no visible outline
        col.set_edgecolor("white")

axes[0].set_ylabel("Seconds")
axes[0].spines["left"].set_visible(True)

g.savefig(plot_path, dpi=600)
6 changes: 6 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,15 @@ services:
- "./benchmark:/benchmark:ro"
environment:
- XY
- Z
- C
- T
- XC
- ZC
- BASE
- DIR
- HOST
- ROUNDS
- TEST_REPEATS

# from: https://docs.min.io/docs/deploy-minio-on-docker-compose.html
Expand Down
6 changes: 1 addition & 5 deletions upload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,4 @@ mc config host add benchmark http://minio1:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECR
mc mb -p benchmark/data
mc policy set public benchmark/data

cd $DIR
mc cp -r *.ims benchmark/data/
mc cp -r *.tiff benchmark/data/
mc cp -r *.zarr benchmark/data/
mc cp -r 1-byte benchmark/data/
time mc mirror --overwrite ${DIR} benchmark/data/

0 comments on commit b8b6547

Please sign in to comment.