Merge pull request #15 from joshmoore/multi-chunks

Load multiple, random chunks
ome · Feb 25, 2021 · b8b6547 · b8b6547
2 parents de52a06 + 1f6d1ab
commit b8b6547
Show file tree

Hide file tree

Showing 6 changed files with 172 additions and 83 deletions.
diff --git a/.env b/.env
@@ -1,13 +1,14 @@
 MINIO_ACCESS_KEY=minioadmin
 MINIO_SECRET_KEY=minioadmin
 HOST=nginx
-XY=65536
+XY=32768
 Z=1
 C=32
 T=1
-XC=1024
+XC=256
 ZC=1
 ROOT=/uod/idr-scratch/ngff-latency-benchmark
 DIR=${ROOT}/${XY}-Z-${Z}-T-${T}-C-${C}-XYC-${XC}-ZC-${ZC}
 BASE=IMS_XY-${XY}-Z-${Z}-T-${T}-C-${C}-XYC-${XC}-ZC-${ZC}
-TEST_REPEATS=1
+ROUNDS=5
+TEST_REPEATS=10
diff --git a/benchmark/Dockerfile b/benchmark/Dockerfile
@@ -11,5 +11,7 @@ COPY plot_results.py /benchmarks/plot_results.py
 
 # see https://github.com/zarr-developers/zarr-python/pull/699
 RUN conda run -n benchmark pip install git+https://github.com/joshmoore/zarr-python@key-sep#egg=zarr
+RUN conda run -n benchmark pip install pytest-profiling
+RUN conda run -n benchmark pip install seaborn
 
 ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "benchmark", "bash", "/benchmark/benchmark.sh"]
diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
@@ -1,14 +1,19 @@
 import time
-
 import fsspec
 import h5py
 import pytest
+import random
 import requests
 import s3fs
 import tifffile
 import zarr
+from copy import deepcopy
 from os import environ
 
+# reduce and operator are used by Fixture.prod to multiply the items of a sequence
+from functools import reduce  # Required in Python 3
+import operator
+
 DIR = environ.get("DIR", "data")
 BASE = environ.get("BASE", "retina_large")
 HOST = environ.get("HOST", "localhost")
@@ -20,11 +25,48 @@
 }
 
 
+class ChunkChoices:
+
+    def __init__(self):
+        self.z = int(environ.get("Z"))
+        self.t = int(environ.get("T"))
+        self.zc = int(environ.get("ZC"))
+        self.xy = int(environ.get("XY"))
+        self.c = int(environ.get("C"))
+        self.xc = int(environ.get("XC"))
+        chunk_indexes = list()
+        for ix in range(self.xy // self.xc):
+            for iy in range(self.xy // self.xc):
+                for iz in range(self.z // self.zc):
+                    for ic in range(self.c):
+                        for it in range(self.t):
+                            chunk_indexes.append((it+1, ic+1, iz+1, iy+1, ix+1))
+        self.chunk_choices = random.sample(chunk_indexes, ROUNDS)
+
+    def pop(self):
+        return self.chunk_choices.pop()
+
+
+CHOICES = ChunkChoices()
+
+
 class Fixture:
 
     def __init__(self, benchmark):
+        self.choices = deepcopy(CHOICES)
         benchmark.pedantic(self.run, setup=self.setup, rounds=ROUNDS)
 
+    def prod(self, seq):
+        return reduce(operator.mul, seq, 1)
+
+    def load(self, data, chunk_shape, chunk_index):
+            X = list()  # eXtents
+            for i in range(len(chunk_shape)):  # zarr=5, HDF5=3
+                shape = chunk_shape[i]
+                index = chunk_index[i]
+                X.append(slice(shape*(index-1), shape*index))
+            return len(data[tuple(X)]) == self.prod(chunk_shape)
+
     @classmethod
     def methods(cls):
         return (cls.local, cls.http, cls.s3)
@@ -82,7 +124,7 @@ def setup(self):
         def run(self):
             data = self.group["0"]
             chunks = data.chunks
-            len(data[0:chunks[0], 0:chunks[1], 0:chunks[2], 0:chunks[3], 0:chunks[4]])
+            self.load(data, chunks, self.choices.pop())
 
     ZarrFixture(benchmark)
 
@@ -98,12 +140,12 @@ def setup(self):
             self.f = fs.open(filename)
 
         def run(self):
-            self.tif = tifffile.TiffFile(self.f)
-            fh = self.tif.filehandle
-            for page in self.tif.pages:
-                fh.seek(page.dataoffsets[0])
-                fh.read(page.databytecounts[0])
-                return
+            with tifffile.TiffFile(self.f) as tif:
+                store = tif.aszarr()
+                group = zarr.group(store=store)
+                data = group["0"]
+                chunks = data.chunks
+                self.load(data, chunks, self.choices.pop())
 
     TiffFixture(benchmark)
 
@@ -117,11 +159,12 @@ class HDF5Fixture(Fixture):
 
        def setup(self):
             self.f = fs.open(filename)
+            self.file = h5py.File(self.f)
 
        def run(self):
-            self.ims = h5py.File(self.f)
-            data = self.ims["DataSet"]["ResolutionLevel 0"]["TimePoint 0"]["Channel 0"]["Data"]
+            t, c, *idx = self.choices.pop()
+            data = self.file["DataSet"]["ResolutionLevel 0"][f"TimePoint {t-1}"][f"Channel {c-1}"]["Data"]
             chunks = data.chunks
-            len(data[0:chunks[0], 0:chunks[1], 0:chunks[2]])
+            self.load(data, chunks, idx)
 
     HDF5Fixture(benchmark)
diff --git a/benchmark/plot_results.py b/benchmark/plot_results.py
@@ -3,6 +3,9 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import os
+import pandas as pd
+import re
+import seaborn as sns
 from collections import defaultdict
 
 json_path = os.environ.get("BENCHMARK_DATA", "benchmark_data")
@@ -14,10 +17,11 @@
 
 print('base_path', json_path)
 
-named_data = defaultdict(list)
 
+three_col = []
+two_col = []
 
-test_repeats = int(os.getenv('TEST_REPEATS'))
+test_repeats = int(os.getenv('TEST_REPEATS', "1"))
 json_files = ["%s_benchmark_data.json" % r for r in range(test_repeats)]
 print('json_files', json_files)
 
@@ -30,69 +34,106 @@
                 benchmarks = json.load(json_file)['benchmarks']
 
                 for bm in benchmarks:
-                    label = bm['name'].replace('test_', '')
+                    m = re.match(r"test_(1_byte|\w+)_(\w+)\[(\w+)\]", bm['name'])
+                    if not m:
+                        raise Exception(bm['name'])
+                    typ = m.group(1).replace("1_byte", "overhead")
+                    src = m.group(3)
+
+                    if typ == "overhead" and src == "local":
+                        # Local overhead values (around 10e-5) skew the view, so they are skipped.
+                        continue
+
                     if test_repeats == 1:
                         # Ran tests once: plot every data point
-                        named_data[label] = bm['stats']['data']
+                        vals = bm['stats']['data']
+                        for run, val in enumerate(vals):
+                            three_col.append(
+                                {
+                                    "type": typ,
+                                    "source": src,
+                                    "seconds": val,
+                                }
+                            )
+                            two_col.append(
+                                {
+                                    "name": f"{typ}-{src}",
+                                    "seconds": val,
+                                }
+                            )
                     else:
                         # Repeats: take mean value from each
-                        named_data[label].append(bm['stats']['mean'])
-
-
-# print(named_data.keys())
-# ['1_byte_overhead[local]', '1_byte_overhead[http]', '1_byte_overhead[boto3]', '1_byte_overhead[s3]',
-# 'zarr_chunk[local]', 'zarr_chunk[http]', 'zarr_chunk[boto3]', 'zarr_chunk[s3]',
-# 'tiff_tile[local]', 'tiff_tile[http]', 'tiff_tile[s3]',
-# 'hdf5_chunk[local]', 'hdf5_chunk[http]', 'hdf5_chunk[s3]',
-# 'download_1[local]', 'download_1[http]', 'download_1[s3]',
-# 'download_2[local]', 'download_2[http]', 'download_2[boto3]', 'download_2[3fs]']
-
-# plot [hdf5/tiff/zarr] for s3, remote, local
-to_plot = [
-    'hdf5_chunk[s3]', 'tiff_tile[s3]', 'zarr_chunk[s3]', '1_byte_overhead[s3]',
-    'hdf5_chunk[http]', 'tiff_tile[http]', 'zarr_chunk[http]', '1_byte_overhead[http]',
-    'hdf5_chunk[local]', 'tiff_tile[local]', 'zarr_chunk[local]', '1_byte_overhead[local]',
-]
-labels = [
-    'hdf5 (s3)', 'tiff (s3)', 'zarr (s3)', 'overhead (s3)',
-    'hdf5 (remote)', 'tiff (remote)', 'zarr (remote)', 'overhead (remote)',
-    'hdf5 (local)', 'tiff (local)', 'zarr (local]', 'overhead (local)',
-]
-data = [named_data[key] for key in to_plot]
-
-def get_color(label):
-    if 'hdf5' in label:
-        return 'blue'
-    elif 'tiff' in label:
-        return 'green'
-    elif 'overhead' in label:
-        return 'yellow'
-    else:
-        return 'pink'
-colors = [get_color(label) for label in labels]
-
-fig1, ax1 = plt.subplots(figsize=(10, 5), dpi=100)
-ax1.set_title(f'ngff benchmark ({xy}x{xy}) n={test_repeats}')
-boxplot = ax1.boxplot(
-    data,
-    labels=labels,
-    positions=range(len(labels), 0, -1),  # reverse order
-    patch_artist=True,  # fill with color
-    # showfliers=False,
-    vert=False)
-
-for color, patch in zip(colors, boxplot['boxes']):
-    patch.set_facecolor(color)
-    patch.set_edgecolor('grey')
-for feature in ['caps', 'whiskers']:
-    for line in boxplot[feature]:
-        line.set_color('grey')
-for line in boxplot['means']:
-    line.set_color('black')
-for circle in boxplot['fliers']:
-    circle.set_color('grey')
-ax1.set_xscale('log')
-ax1.set_xlabel('Chunk loading time (secs)')
-
-plt.tight_layout()
-plt.savefig(plot_path)
+                        val = bm['stats']['mean']
+                        three_col.append(
+                            {
+                                "type": typ,
+                                "source": src,
+                                "seconds": val,
+                            }
+                        )
+                        two_col.append(
+                            {
+                                "name": f"{typ}-{src}",
+                                "seconds": val,
+                            }
+                        )
+
+df3 = pd.DataFrame.from_dict(three_col)
+df2 = pd.DataFrame.from_dict(two_col)
+
+types = ("overhead", "zarr", "tiff", "hdf5")
+sources = ("local", "http", "s3")
+orders = {"type": types, "source": sources}
+
+pal_points = "colorblind"
+pal_violins = "pastel"
+
+g = sns.FacetGrid(
+    df3,
+    col="source",
+    col_order=sources,
+    sharey=False,
+    height=5,
+    aspect=0.6,
+)
+
+g = g.map(
+    sns.boxenplot,
+    "type",
+    "seconds",
+    order=types,
+    width=0.6,
+    k_depth=2,
+    palette=pal_violins,
+    dodge=True,
+    showfliers=False,
+)
+
+g = g.map(
+    sns.stripplot,
+    "type",
+    "seconds",
+    dodge=True,
+    order=types,
+    jitter=0.2,
+    size=3,
+    palette=pal_points,
+)
+
+g.despine(left=True)
+g.set(yscale ='log', ylim=(0.0009, 1))
+
+# Set the x-axis label and tick labels of each facet, and clear its title
+for ax in g.fig.get_axes():
+    label = ax.get_title().replace("source =", "")
+    ax.set_xlabel(label)
+    ax.set_xticklabels(types)
+    ax.set_title("")
+    for col in ax.collections:
+        # Hide the outlines of the plotted collections by drawing their edges in white
+        col.set_edgecolor("white")
+
+g.fig.get_axes()[0].set_ylabel("Seconds")
+g.fig.get_axes()[0].spines["left"].set_visible(True)
+
+g.savefig(plot_path, dpi=600)
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -46,9 +46,15 @@ services:
       - "./benchmark:/benchmark:ro"
     environment:
       - XY
+      - Z
+      - C
+      - T
+      - XC
+      - ZC
       - BASE
       - DIR
       - HOST
+      - ROUNDS
       - TEST_REPEATS
 
 # from:   https://docs.min.io/docs/deploy-minio-on-docker-compose.html

diff --git a/upload.sh b/upload.sh
@@ -14,8 +14,4 @@ mc config host add benchmark http://minio1:9000 ${MINIO_ACCESS_KEY} ${MINIO_SECR
 mc mb -p benchmark/data
 mc policy set public benchmark/data
 
-cd $DIR
-mc cp -r *.ims benchmark/data/
-mc cp -r *.tiff benchmark/data/
-mc cp -r *.zarr benchmark/data/
-mc cp -r 1-byte benchmark/data/
+time mc mirror --overwrite ${DIR} benchmark/data/