From 2ee5761c72827f618d51f554ccd3ff3c24f35e0b Mon Sep 17 00:00:00 2001 From: Leonid Evdokimov Date: Fri, 30 Aug 2024 12:36:33 +0300 Subject: [PATCH] Add weighted average to SpeedSmall test output It addresses the question at https://github.com/rurban/smhasher/pull/113 What is the "real" average cycles/hash value for a given hash function? We can't know, but we can estimate it better if we assume that the function timing does not depend on the input (that's not true for hashes based on multiplication) and that we know the distribution of key lengths in advance (that might be somewhat known for certain classes of inputs, but the distribution varies measurably across classes). --- Platform.cpp | 16 ++++++++++++--- Platform.h | 1 + SpeedTest.cpp | 2 +- SpeedTest.h | 2 ++ main.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 64 insertions(+), 12 deletions(-) diff --git a/Platform.cpp b/Platform.cpp index e0f7b5ef..a49255d7 100644 --- a/Platform.cpp +++ b/Platform.cpp @@ -1,11 +1,21 @@ #include "Platform.h" #include +#include -void testRDTSC ( void ) +long getenvlong(const char *name, long minval, long defval, long maxval) { - int64_t temp = rdtsc(); - printf("%ld",(long)temp); + assert(minval <= defval && defval <= maxval); + const char *s = getenv(name); + if (!s) + return defval; + char *tail; + long l = strtol(s, &tail, 0); + if (*tail) + return defval; + if (l < minval) l = minval; + if (l > maxval) l = maxval; + return l; } #if defined(_WIN32) diff --git a/Platform.h b/Platform.h index eced588b..ef835549 100644 --- a/Platform.h +++ b/Platform.h @@ -21,6 +21,7 @@ void SetThreadAffinity ( std::thread &t, int cpu ); # endif #endif void SetAffinity ( int cpu ); +long getenvlong(const char *name, long minval, long defval, long maxval); // That's not UINT64_MAX as it's converted to int64_t sometimes.
constexpr uint64_t timer_inf = INT64_MAX; diff --git a/SpeedTest.cpp b/SpeedTest.cpp index 9ba3a42a..6523e1c8 100644 --- a/SpeedTest.cpp +++ b/SpeedTest.cpp @@ -240,7 +240,7 @@ double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int block double t; - if(blocksize < 100) + if(blocksize <= TIMEHASH_SMALL_LEN_MAX) { t = (double)timehash_small(hash,block,blocksize,itrial); } diff --git a/SpeedTest.h b/SpeedTest.h index 25ffef30..f26cef3c 100644 --- a/SpeedTest.h +++ b/SpeedTest.h @@ -2,6 +2,8 @@ #include "Types.h" +constexpr int TIMEHASH_SMALL_LEN_MAX = 255; + void BulkSpeedTest ( pfHash hash, uint32_t seed ); double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose ); double HashMapSpeedTest ( pfHash pfhash, int hashbits, std::vector words, diff --git a/main.cpp b/main.cpp index 8a1d982c..1fac739a 100644 --- a/main.cpp +++ b/main.cpp @@ -12,6 +12,7 @@ #include #endif +#include #include #include #include @@ -86,6 +87,8 @@ bool MomentChi2Test ( struct HashInfo *info, int inputSize ); //----------------------------------------------------------------------------- // This is the list of all hashes that SMHasher can test. +#define COUNT_OF(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x]))))) + const char* quality_str[3] = { "SKIP", "POOR", "GOOD" }; // sorted by quality and speed. the last is the list of internal secrets to be tested against bad seeds. 
@@ -979,6 +982,17 @@ void SelfTest(bool verbose) { //---------------------------------------------------------------------------- +static std::pair CalcWAvg(const double *x, const double *dist, size_t imin, size_t imax) +{ + double sum = 0, prb = 0; + for (size_t i = imin; i <= imax; i++) { + sum += dist[i] * x[i]; + prb += dist[i]; + } + sum /= prb; + return std::make_pair(sum, prb); +} + template < typename hashtype > void test ( hashfunc hash, HashInfo* info ) { @@ -1027,7 +1041,6 @@ void test ( hashfunc hash, HashInfo* info ) if(g_testSpeedBulk || g_testSpeedSmall || g_testAll) { - double sum = 0.0; printf("[[[ Speed Tests ]]]\n\n"); if (timer_counts_ns()) printf("WARNING: no cycle counter, cycle == 1ns\n"); @@ -1050,17 +1063,43 @@ void test ( hashfunc hash, HashInfo* info ) } if (g_testSpeedSmall || g_testAll) { - const char* const envsmin = getenv("SMHASHER_SMALLKEY_MIN"); - const char* const envsmax = getenv("SMHASHER_SMALLKEY_MAX"); - const int minkey = max(min(envsmin ? atoi(envsmin) : 1, 255), 1); - const int maxkey = max(min(envsmax ? atoi(envsmax) : 32, 255), minkey); + const int dflmax = g_testExtra ? 
64 : 32; + const int minkey = getenvlong("SMHASHER_SMALLKEY_MIN", 1, 1, TIMEHASH_SMALL_LEN_MAX); + const int maxkey = getenvlong("SMHASHER_SMALLKEY_MAX", minkey, dflmax, TIMEHASH_SMALL_LEN_MAX); + double cph[TIMEHASH_SMALL_LEN_MAX + 1]; for(int i = minkey; i <= maxkey; i++) { volatile int j = i; - sum += TinySpeedTest(hashfunc(info->hash),sizeof(hashtype),j,info->verification,true); + cph[j] = TinySpeedTest(hashfunc(info->hash),sizeof(hashtype),j,info->verification,true); + } + { + double sum = 0; + for (int i = minkey; i <= maxkey; i++) + sum += cph[i]; + sum /= (maxkey - minkey + 1); + g_speed = sum; + printf("Average %8.3f cycles/hash\n",sum); + } + if (const char *weights = getenv("SMHASHER_SMALLKEY_WEIGHTS")) + { + // The weighted average is only meaningful under the assumption that hash speed does not depend on the input, + // which is not true because the multiplication instruction has a certain amount of variance. + std::vector lenprob; + std::istringstream ssws(weights); + double sum = 0.0; + for (double f; ssws >> f; ) { + lenprob.push_back(f); + sum += f; + } + for (size_t i = 0; i < lenprob.size(); i++) + lenprob[i] /= sum; + if (maxkey < lenprob.size()) { + const auto m = CalcWAvg(cph, lenprob.data(), minkey, maxkey); + printf("Weighted average, %5.1f%% of weights %8.3f cycles/hash\n",100.*m.second,m.first); + } else { + printf("Weighted average needs %u more weights (SKIP it)\n", (unsigned)(maxkey - lenprob.size() + 1)); + } } - g_speed = sum = sum / (maxkey - minkey + 1); - printf("Average %6.3f cycles/hash\n",sum); printf("\n"); fflush(NULL); }