diff --git a/README.rst b/README.rst index e6d553d3..2c0662b5 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,6 @@ - -zfex -- efficient, portable erasure coding tool -=============================================== +zfex — efficient, portable erasure coding tool +================================================ Generate redundant blocks of information such that if some of the blocks are lost then the original data can be recovered from the remaining blocks. This @@ -9,6 +8,8 @@ package includes command-line tools, C API, Python API, and Haskell API. |build| |test-intel| |test-arm| |haskell-api| |tools| |pypi| +|intel-benchmark| + Intro and Licence ----------------- @@ -351,3 +352,7 @@ Enjoy! .. |tools| image:: https://github.com/WojciechMigda/zfex/actions/workflows/tools.yml/badge.svg :alt: Tools :target: https://github.com/WojciechMigda/zfex/actions/workflows/tools.yml + +.. |intel-benchmark| image:: bench/images/bench_intel_k7_m10_1M.png + :alt: Intel benchmark chart + :target: bench/Results.rst diff --git a/bench/Results.rst b/bench/Results.rst new file mode 100644 index 00000000..1a6e838e --- /dev/null +++ b/bench/Results.rst @@ -0,0 +1,85 @@ +Benchmark results +================= + +All benchmarks were executed using the ``bench_zfex`` binary compiled for a given target. Executions were performed using the attached scripts, ``legacy_zfec.sh`` and ``zfex.sh``. + +Between different runs, the results with the lowest difference between the ``best`` and ``worst`` values were selected, and their ``mean`` value was used. + +Intel x64 +--------- + +This benchmark was run on a virtualized instance of Intel(R) Xeon(R), clocked at 2.2 GHz. 
+ +:: + + Architecture: x86_64 + CPU op-mode(s): 32-bit, 64-bit + Byte Order: Little Endian + Address sizes: 46 bits physical, 48 bits virtual + CPU(s): 4 + On-line CPU(s) list: 0-3 + Thread(s) per core: 2 + Core(s) per socket: 2 + Socket(s): 1 + NUMA node(s): 1 + Vendor ID: GenuineIntel + CPU family: 6 + Model: 79 + Model name: Intel(R) Xeon(R) CPU @ 2.20GHz + Stepping: 0 + CPU MHz: 2200.222 + BogoMIPS: 4400.44 + Hypervisor vendor: KVM + Virtualization type: full + L1d cache: 64 KiB + L1i cache: 64 KiB + L2 cache: 512 KiB + L3 cache: 55 MiB + NUMA node0 CPU(s): 0-3 + Vulnerability Itlb multihit: Not affected + Vulnerability L1tf: Mitigation; PTE Inversion + Vulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown + Vulnerability Meltdown: Mitigation; PTI + Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown + Vulnerability Retbleed: Mitigation; IBRS + Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp + Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization + Vulnerability Spectre v2: Mitigation; IBRS, IBPB conditional, RSB filling + Vulnerability Srbds: Not affected + Vulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown + Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities + +Compiler used was: + +:: + + gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0 + +For legacy ``zfec`` two results were picked, one for code compiled with ``-O2`` 
optimization, which is very likely how the binaries shipped in precompiled wheel packages are built, and the other one compiled with ``-O3 -march=native`` flags, which gave the best results. + +The ``zfex`` benchmark was run for ``fec_encode_simd`` in five different configurations, one with ``-O2`` optimization and the other ones with ``-O3`` optimization. On top of that, different values of the unrolling parameter were used. + + +k=7 m=10 size=1000000 +~~~~~~~~~~~~~~~~~~~~~ + +|intel-7-10| + +.. |intel-7-10| image:: images/bench_intel_k7_m10_1M.png + :scale: 100% + :alt: Intel benchmark, k=7 m=10 size=1000000 + :target: images/bench_intel_k7_m10_1M.png + +Legacy ``zfec`` had both results just below 600 MB/sec. ``zfex`` in all cases ran faster, achieving the best performance with ``-DZFEX_UNROLL_ADDMUL_SIMD=8`` unrolling, running over 6 times faster at ~3800 MB/sec. + +k=223 m=255 size=43488 +~~~~~~~~~~~~~~~~~~~~~~ + +|intel-223-255| + +.. |intel-223-255| image:: images/bench_intel_k223_m255_43488.png + :scale: 100% + :alt: Intel benchmark, k=223 m=255 size=43488 + :target: images/bench_intel_k223_m255_43488.png + +Legacy ``zfec`` had both results slightly above 50 MB/sec. ``zfex`` in all cases ran faster, achieving the best performance with ``-DZFEX_UNROLL_ADDMUL_SIMD=4`` unrolling, giving a more than 5-fold speed-up. 
diff --git a/bench/images/bench_intel_k223_m255_43488.png b/bench/images/bench_intel_k223_m255_43488.png
new file mode 100644
index 00000000..9d3bf66c
Binary files /dev/null and b/bench/images/bench_intel_k223_m255_43488.png differ
diff --git a/bench/images/bench_intel_k7_m10_1M.png b/bench/images/bench_intel_k7_m10_1M.png
new file mode 100644
index 00000000..a1389a4f
Binary files /dev/null and b/bench/images/bench_intel_k7_m10_1M.png differ
diff --git a/bench/tools/plot_intel_k223_m255_43488.py b/bench/tools/plot_intel_k223_m255_43488.py
new file mode 100755
index 00000000..cdbb4bb4
--- /dev/null
+++ b/bench/tools/plot_intel_k223_m255_43488.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""Plot the Intel encoding benchmark for k=223, m=255, size=43488.
+
+Draws legacy zfec and SIMD zfex encoding throughput as horizontal bars,
+writes the chart to ``bench_intel_k223_m255_43488.png`` in the current
+directory and then opens it in an interactive window.
+"""
+
+import sys
+
+import matplotlib.pyplot as plt
+
+# Mean throughput in MB/sec, keyed by the compiler flags of each build.
+ZFEC_RESULTS = {
+    '-O2': 51.804,
+    '-O3 -march=native': 53.105,
+}
+ZFEX_RESULTS = {
+    '-O2 -DZFEX_UNROLL_ADDMUL_SIMD=1': 169.997,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=1': 251.576,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=2': 275.800,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=4': 279.742,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=8': 261.318,
+}
+
+# Gap, in x-axis units (MB/sec), between the end of a bar and its caption.
+LABEL_OFFSET = 6
+
+
+def _draw_bars(ax, results, color, legend_label):
+    """Draw one captioned horizontal bar per entry of *results*.
+
+    ax           -- matplotlib Axes to draw on
+    results      -- dict mapping compiler flags to speed in MB/sec
+    color        -- fill colour shared by all bars of this series
+    legend_label -- name under which the series appears in the legend
+    """
+    flags, speeds = zip(*results.items())
+    bars = ax.barh(flags, speeds, height=0.5, color=color, label=legend_label)
+
+    for bar, caption in zip(bars, flags):
+        # Anchor the caption just past the bar's right edge, centred on it.
+        caption_x = bar.get_x() + bar.get_width() + LABEL_OFFSET
+        caption_y = bar.get_y() + bar.get_height() / 2
+        ax.text(caption_x, caption_y, caption,
+                ha='left', va='center', fontsize=9)
+
+
+def main():
+    """Render the chart, save it as PNG, show it, and return exit status 0."""
+    fig, ax = plt.subplots()
+
+    # Legacy zfec is drawn first so that it ends up on top after inversion.
+    _draw_bars(ax, ZFEC_RESULTS, 'brown', 'zfec::fec_encode')
+    _draw_bars(ax, ZFEX_RESULTS, 'green', 'zfex::fec_encode_simd')
+
+    ax.set_xlim([0, 600])
+    # The flags are written next to the bars, so hide the y tick labels.
+    ax.yaxis.set_ticklabels([])
+    ax.invert_yaxis()
+    ax.set_xlabel('Speed, MB/sec')
+    ax.set_title("Encoding benchmark of legacy zfec vs. SIMD zfex\n"
+                 "k=223, m=255, size=43488\n"
+                 "Intel(R) Xeon(R) CPU @ 2.20GHz")
+    ax.legend(loc='upper right')
+
+    fig.savefig('bench_intel_k223_m255_43488.png')
+    plt.show()
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/bench/tools/plot_intel_k7_m10_1M.py b/bench/tools/plot_intel_k7_m10_1M.py
new file mode 100755
index 00000000..59fca9a2
--- /dev/null
+++ b/bench/tools/plot_intel_k7_m10_1M.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+"""Plot the Intel encoding benchmark for k=7, m=10, size=1000000.
+
+Draws legacy zfec and SIMD zfex encoding throughput as horizontal bars,
+writes the chart to ``bench_intel_k7_m10_1M.png`` in the current
+directory and then opens it in an interactive window.
+"""
+
+import sys
+
+import matplotlib.pyplot as plt
+
+# Mean throughput in MB/sec, keyed by the compiler flags of each build.
+ZFEC_RESULTS = {
+    '-O2': 557.010,
+    '-O3 -march=native': 596.444,
+}
+ZFEX_RESULTS = {
+    '-O2 -DZFEX_UNROLL_ADDMUL_SIMD=1': 2096.781,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=1': 2933.161,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=2': 3237.656,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=4': 3585.507,
+    '-O3 -DZFEX_UNROLL_ADDMUL_SIMD=8': 3810.970,
+}
+
+# Gap, in x-axis units (MB/sec), between the end of a bar and its caption.
+LABEL_OFFSET = 60
+
+
+def _draw_bars(ax, results, color, legend_label):
+    """Draw one captioned horizontal bar per entry of *results*.
+
+    ax           -- matplotlib Axes to draw on
+    results      -- dict mapping compiler flags to speed in MB/sec
+    color        -- fill colour shared by all bars of this series
+    legend_label -- name under which the series appears in the legend
+    """
+    flags, speeds = zip(*results.items())
+    bars = ax.barh(flags, speeds, height=0.5, color=color, label=legend_label)
+
+    for bar, caption in zip(bars, flags):
+        # Anchor the caption just past the bar's right edge, centred on it.
+        caption_x = bar.get_x() + bar.get_width() + LABEL_OFFSET
+        caption_y = bar.get_y() + bar.get_height() / 2
+        ax.text(caption_x, caption_y, caption,
+                ha='left', va='center', fontsize=9)
+
+
+def main():
+    """Render the chart, save it as PNG, show it, and return exit status 0."""
+    fig, ax = plt.subplots()
+
+    # Legacy zfec is drawn first so that it ends up on top after inversion.
+    _draw_bars(ax, ZFEC_RESULTS, 'brown', 'zfec::fec_encode')
+    _draw_bars(ax, ZFEX_RESULTS, 'green', 'zfex::fec_encode_simd')
+
+    ax.set_xlim([0, 8000])
+    # The flags are written next to the bars, so hide the y tick labels.
+    ax.yaxis.set_ticklabels([])
+    ax.invert_yaxis()
+    ax.set_xlabel('Speed, MB/sec')
+    ax.set_title("Encoding benchmark of legacy zfec vs. SIMD zfex\n"
+                 "k=7, m=10, size=1000000\n"
+                 "Intel(R) Xeon(R) CPU @ 2.20GHz")
+    ax.legend(loc='upper right')
+
+    fig.savefig('bench_intel_k7_m10_1M.png')
+    plt.show()
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())