-
Notifications
You must be signed in to change notification settings - Fork 6
/
analysis.R
140 lines (121 loc) · 7 KB
/
analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
library(tidyverse)
library(readr)
# Load the benchmark results and prepare them in a single pipeline.
df <- read_csv("./benchmarks.csv") %>%
  # Drop the XXHash and CityHashNet rows recorded on the Core runtime: those
  # libraries are not .NET Core compatible and report a median elapsed time
  # of 0ns there.
  filter(Runtime != "Core" | (Method != "XXHash" & Method != "CityHashNet")) %>%
  # Throughput in MB/s: bytes per nanosecond scaled by 10^9 ns per second,
  # divided by 10^6 bytes per MB.
  mutate(Throughput = PayloadLength * 10^9 / (Mean * 10^6))
# Start by contrasting a cryptographic hash function (MD5, albeit a weak one)
# with a non-cryptographic one (FarmHash). Only 32bit hashes from the
# net-legacy-32bit job are kept.
fm <- df %>%
  filter(
    Method == "FarmHash" | Method == "Md5",
    Job == "net-legacy-32bit",
    Kind == "32bit hash"
  ) %>%
  select(Method, PayloadLength, Median, StdErr, StdDev, Throughput)

# One group of bars per payload size, one bar per hash function.
ggplot(fm, aes(x = as.factor(PayloadLength), y = Throughput, fill = Method)) +
  geom_col(position = "dodge") +
  labs(
    x = "Payload (bytes)",
    y = "Throughput MB/s",
    title = "Cryptographic vs Non-Cryptographic Hash Function",
    subtitle = "Using 32bit CLR with 32bit Farmhash as a baseline"
  )
# ggsave() writes the most recently built plot.
ggsave("crypt-vs-non-crypt.png", width = 12, height = 7, dpi = 100)
# Next: how does the 64bit FarmHash function perform across the jobs that run
# on the Clr runtime?
clr_run <- df %>%
  filter(Method == "FarmHash", Runtime == "Clr", Kind == "64bit hash") %>%
  select(
    Method, Job, Runtime, PayloadLength,
    Median, StdErr, StdDev, Throughput
  )

# One group of bars per payload size, one bar per job.
ggplot(clr_run, aes(x = as.factor(PayloadLength), y = Throughput, fill = Job)) +
  geom_col(position = "dodge") +
  labs(
    x = "Payload (bytes)",
    y = "Throughput MB/s",
    title = "Throughput by Jit Runtime",
    subtitle = "64bit Farmhash Performance on the Clr"
  )
ggsave("throughput-by-jit.png", width = 12, height = 7, dpi = 100)
# Repeat the same per-job comparison for the Mono runtime.
mono_run <- df %>%
  filter(Method == "FarmHash", Runtime == "Mono", Kind == "64bit hash") %>%
  select(
    Method, Job, Runtime, PayloadLength,
    Median, StdErr, StdDev, Throughput
  )

# One group of bars per payload size, one bar per job.
ggplot(mono_run, aes(x = as.factor(PayloadLength), y = Throughput, fill = Job)) +
  geom_col(position = "dodge") +
  labs(
    x = "Payload (bytes)",
    y = "Throughput MB/s",
    title = "Throughput by Jit Runtime",
    subtitle = "64bit Farmhash Performance on the Mono"
  )
ggsave("mono-throughput.png", width = 12, height = 7, dpi = 100)
# Put Core, Mono, and Clr side by side: every FarmHash row is kept, and the
# plot gets one facet row per hash kind.
runtime_df <- df %>%
  filter(Method == "FarmHash") %>%
  select(
    Method, Job, Runtime, Kind, PayloadLength,
    Median, StdErr, StdDev, Throughput
  )

ggplot(
  runtime_df,
  aes(x = as.factor(PayloadLength), y = Throughput, fill = Job)
) +
  geom_col(position = "dodge") +
  facet_grid(Kind ~ .) +
  labs(
    x = "Payload (bytes)",
    y = "Throughput MB/s",
    title = "Throughput by Runtime",
    subtitle = "32 and 64bit Farmhash Performance across Core, Mono, and Clr"
  )
ggsave("runtime-throughput.png", width = 12, height = 7, dpi = 100)
# Heatmap of relative throughput for every hashing library, on every job, for
# every payload size, for both hash kinds. Relative is a row's throughput
# divided by the best throughput within its (Job, Kind, PayloadLength) group,
# so 1 marks the fastest library in that group.
df2 <- df %>%
  group_by(Job, Kind, PayloadLength) %>%
  mutate(Relative = Throughput / max(Throughput)) %>%
  ungroup() %>%
  select(Job, Kind, Method, PayloadLength, Relative, Throughput) %>%
  # Add a row for every combination missing from the data; its Relative is NA
  # and the tile is drawn in the grey na.value colour with an "NA" label.
  complete(Job, Kind, Method, PayloadLength)

ggplot(df2, aes(x = Method, y = as.factor(PayloadLength))) +
  geom_tile(aes(fill = Relative), color = "white") +
  geom_text(
    aes(
      label = ifelse(
        is.na(Relative),
        "NA",
        format(round(Relative, 2), digits = 3)
      )
    ),
    size = 2.5
  ) +
  facet_grid(Job ~ Kind) +
  scale_x_discrete(position = "top") +
  scale_fill_gradient(
    name = "",
    low = "white",
    high = "steelblue",
    na.value = "#D8D8D8",
    breaks = c(0, 1),
    labels = c("lowest", "highest")
  ) +
  labs(
    x = "Hash Library",
    y = "Payload Size (bytes)",
    title = "Non-cryptographic Hash Functions with Relative Throughput",
    subtitle = "32bit and 64bit hash functions on Mono, Ryu, Core, and Legacy Jits",
    caption = "Shaded by payload and facet. For instance, SparrowXXHash has 60% of the throughput of Farmhash.Sharp\nwhen calculating the 64bit hash and both given a 4 byte payload on the .NET Ryu platform (64bits)"
  ) +
  theme(
    legend.position = "bottom",
    axis.text.x.top = element_text(angle = 45, hjust = 0, vjust = 0),
    plot.caption = element_text(hjust = 0)
  )
ggsave("relative-throughput.png", width = 12, height = 10, dpi = 100)
# The previous heatmap shades each tile relative to the best result within its
# own job, hash kind, and payload size, so it cannot show which configuration
# is fastest overall for a given payload size. This heatmap answers that:
# tiles are shaded relative to the best throughput per payload size across all
# jobs and kinds, and each tile is labelled with its throughput in GB/s.
df3 <- df %>%
  # MB/s -> GB/s
  mutate(Throughput = Throughput / 1000) %>%
  group_by(PayloadLength) %>%
  mutate(Relative = Throughput / max(Throughput)) %>%
  ungroup() %>%
  select(Job, Kind, Method, PayloadLength, Relative, Throughput) %>%
  # Add a row for every combination missing from the data; those tiles are
  # drawn in the grey na.value colour with an "NA" label.
  complete(Job, Kind, Method, PayloadLength)

ggplot(df3, aes(x = Method, y = as.factor(PayloadLength))) +
  geom_tile(aes(fill = Relative), color = "white") +
  geom_text(
    aes(
      label = ifelse(
        is.na(Relative),
        "NA",
        format(round(Throughput, 1), digits = 3)
      )
    ),
    size = 2.5
  ) +
  facet_grid(Job ~ Kind) +
  scale_x_discrete(position = "top") +
  scale_fill_gradient(
    name = "",
    low = "white",
    high = "steelblue",
    na.value = "#D8D8D8",
    breaks = c(0, 1),
    labels = c("lowest", "highest")
  ) +
  labs(
    x = "Hash Library",
    y = "Payload Size (bytes)",
    title = "Non-cryptographic Hash Functions with Throughput (GB/s)",
    subtitle = "32bit and 64bit hash functions on Mono, Ryu, Core, and Legacy Jits",
    caption = "Shaded by payload. For instance, for payloads of 4 bytes, the fastest is\nFarmhash.Sharp on .NET Ryu 64bit calculating 64bit hashes (1.3 GB/s)"
  ) +
  theme(
    axis.text.x.top = element_text(angle = 45, hjust = 0, vjust = 0),
    legend.position = "bottom",
    plot.caption = element_text(hjust = 0)
  )
ggsave("absolute-throughput.png", width = 12, height = 10, dpi = 100)
# C++ reference data. The C++ benchmark does not output a csv, so these
# numbers are hand-copied from a C++ benchmark run: six payload sizes for each
# of the two C++ builds.
# NOTE(review): the values are presumably MB/s so that they are comparable with
# df$Throughput -- confirm against the C++ benchmark output.
cpp <- rep(c("farmhash-ha", "farmhash"), each = 6)
cpp_loads <- rep(c(4, 11, 25, 100, 1000, 10000), 2)
cpp_through <- c(1138, 2943, 6263, 6773, 15310, 28434,
                 1715, 4164, 8301, 7029, 17075, 20601)

# Build the frame with its final column names directly, as a tibble, so it has
# the same class and schema as the rows selected from df below.
cpp_df <- tibble(
  Job = cpp,
  PayloadLength = cpp_loads,
  Throughput = cpp_through
)

# The .NET side of the comparison: 64bit FarmHash on the net-ryu-64bit job.
net_df <- df %>%
  filter(
    Job == "net-ryu-64bit",
    Method == "FarmHash",
    Kind == "64bit hash"
  ) %>%
  select(Job, PayloadLength, Throughput)

# Stack both sources and scale each payload size by its fastest entry, so 1.0
# marks the winner for that payload. ungroup() keeps the grouping from leaking
# into whatever uses new_df afterwards.
new_df <- bind_rows(net_df, cpp_df) %>%
  group_by(PayloadLength) %>%
  mutate(Relative = Throughput / max(Throughput)) %>%
  ungroup()

ggplot(new_df, aes(x = as.factor(PayloadLength), y = Relative, fill = Job)) +
  geom_col(position = "dodge") +
  labs(
    x = "Payload (bytes)",
    y = "Relative Throughput (1.0 is highest throughput)",
    title = "Throughput of C++ Farmhash vs Farmhash.Sharp",
    subtitle = "Where farmhash-ha uses hardware acceleration"
  )
ggsave("c-sharp-vs-cpp.png", width = 12, height = 7, dpi = 100)