Skip to content

Commit

Permalink
Add weighted average to SpeedSmall test output
Browse files Browse the repository at this point in the history
It addresses the question at rurban#113

What is the "real" average cycles/hash value for a given hash function?

We can't know, but we can estimate it better if we assume that the
function timing does not depend on the input (that's not true for hashes
based on multiplication) and that we know the distribution of key lengths
in advance (it might be roughly known for certain classes of inputs,
but the distribution varies measurably across classes).
  • Loading branch information
darkk committed Aug 30, 2024
1 parent 0c6a1ef commit 2ee5761
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 12 deletions.
16 changes: 13 additions & 3 deletions Platform.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
#include "Platform.h"

#include <stdio.h>
#include <assert.h>

void testRDTSC ( void )
// Read the environment variable `name` as an integer and clamp it to
// [minval, maxval].
//
//   name   - environment variable to look up
//   minval - smallest value that may be returned
//   defval - value returned when the variable is unset, empty, or not a
//            complete integer literal; must satisfy minval <= defval <= maxval
//   maxval - largest value that may be returned
//
// The value is parsed with strtol() using base 0, so decimal, 0x-prefixed
// hexadecimal and 0-prefixed octal literals are accepted.
long getenvlong(const char *name, long minval, long defval, long maxval)
{
    assert(minval <= defval && defval <= maxval);

    const char *s = getenv(name);
    if (!s)
        return defval;

    char *tail;
    long l = strtol(s, &tail, 0);

    // tail == s: nothing was converted (e.g. the variable is set to an empty
    // string); without this check such a value would be read as 0.
    // *tail != 0: trailing characters follow the number.
    if (tail == s || *tail)
        return defval;

    // Out-of-range values saturate at the nearest bound rather than falling
    // back to the default.
    if (l < minval) l = minval;
    if (l > maxval) l = maxval;
    return l;
}

#if defined(_WIN32)
Expand Down
1 change: 1 addition & 0 deletions Platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ void SetThreadAffinity ( std::thread &t, int cpu );
# endif
#endif
void SetAffinity ( int cpu );
// Reads the environment variable `name` as an integer (strtol, base
// auto-detected) and clamps it to [minval, maxval]; defval is returned when
// the variable is unset or its value has trailing non-numeric characters.
long getenvlong(const char *name, long minval, long defval, long maxval);

// That's not UINT64_MAX as it's converted to int64_t sometimes.
constexpr uint64_t timer_inf = INT64_MAX;
Expand Down
2 changes: 1 addition & 1 deletion SpeedTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int block

double t;

if(blocksize < 100)
if(blocksize <= TIMEHASH_SMALL_LEN_MAX)
{
t = (double)timehash_small(hash,block,blocksize,itrial);
}
Expand Down
2 changes: 2 additions & 0 deletions SpeedTest.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "Types.h"

// Largest block size that SpeedTest() times with timehash_small(); also the
// upper bound for the SMHASHER_SMALLKEY_MIN / SMHASHER_SMALLKEY_MAX settings
// and the size (minus one) of the per-length results array in main.cpp.
constexpr int TIMEHASH_SMALL_LEN_MAX = 255;

void BulkSpeedTest ( pfHash hash, uint32_t seed );
double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose );
double HashMapSpeedTest ( pfHash pfhash, int hashbits, std::vector<std::string> words,
Expand Down
55 changes: 47 additions & 8 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <chrono>
#endif

#include <sstream>
#include <stdio.h>
#include <stdint.h>
#include <time.h>
Expand Down Expand Up @@ -86,6 +87,8 @@ bool MomentChi2Test ( struct HashInfo *info, int inputSize );
//-----------------------------------------------------------------------------
// This is the list of all hashes that SMHasher can test.

// Number of elements in the array `x`: sizeof(x) divided by the size of its
// first element. The second divisor is 1 when sizeof(x) is an exact multiple
// of the element size and 0 otherwise, so an argument whose size is not a
// whole number of elements triggers a division by zero instead of yielding a
// silently wrong count. `0[x]` is the built-in subscript spelled backwards;
// presumably chosen so that class types with an overloaded operator[] are
// rejected -- TODO confirm intent.
#define COUNT_OF(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x])))))

const char* quality_str[3] = { "SKIP", "POOR", "GOOD" };

// sorted by quality and speed. the last is the list of internal secrets to be tested against bad seeds.
Expand Down Expand Up @@ -979,6 +982,17 @@ void SelfTest(bool verbose) {

//----------------------------------------------------------------------------

// Weighted mean of x[imin..imax] (both ends inclusive), using dist[i] as the
// weight of x[i].
//
// Returns {weighted mean, total weight of the range}. The mean is the
// weighted sum divided by that total weight; there is no guard against a
// total weight of zero.
static std::pair<double, double> CalcWAvg(const double *x, const double *dist, size_t imin, size_t imax)
{
    double weighted = 0.0;
    double total = 0.0;

    size_t idx = imin;
    while (idx <= imax) {
        const double w = dist[idx];
        weighted += w * x[idx];
        total += w;
        ++idx;
    }

    return std::pair<double, double>(weighted / total, total);
}

template < typename hashtype >
void test ( hashfunc<hashtype> hash, HashInfo* info )
{
Expand Down Expand Up @@ -1027,7 +1041,6 @@ void test ( hashfunc<hashtype> hash, HashInfo* info )

if(g_testSpeedBulk || g_testSpeedSmall || g_testAll)
{
double sum = 0.0;
printf("[[[ Speed Tests ]]]\n\n");
if (timer_counts_ns())
printf("WARNING: no cycle counter, cycle == 1ns\n");
Expand All @@ -1050,17 +1063,43 @@ void test ( hashfunc<hashtype> hash, HashInfo* info )
}

if (g_testSpeedSmall || g_testAll) {
const char* const envsmin = getenv("SMHASHER_SMALLKEY_MIN");
const char* const envsmax = getenv("SMHASHER_SMALLKEY_MAX");
const int minkey = max(min(envsmin ? atoi(envsmin) : 1, 255), 1);
const int maxkey = max(min(envsmax ? atoi(envsmax) : 32, 255), minkey);
const int dflmax = g_testExtra ? 64 : 32;
const int minkey = getenvlong("SMHASHER_SMALLKEY_MIN", 1, 1, TIMEHASH_SMALL_LEN_MAX);
const int maxkey = getenvlong("SMHASHER_SMALLKEY_MAX", minkey, dflmax, TIMEHASH_SMALL_LEN_MAX);
double cph[TIMEHASH_SMALL_LEN_MAX + 1];
for(int i = minkey; i <= maxkey; i++)
{
volatile int j = i;
sum += TinySpeedTest(hashfunc<hashtype>(info->hash),sizeof(hashtype),j,info->verification,true);
cph[j] = TinySpeedTest(hashfunc<hashtype>(info->hash),sizeof(hashtype),j,info->verification,true);
}
{
double sum = 0;
for (int i = minkey; i <= maxkey; i++)
sum += cph[i];
sum /= (maxkey - minkey + 1);
g_speed = sum;
printf("Average %8.3f cycles/hash\n",sum);
}
if (const char *weights = getenv("SMHASHER_SMALLKEY_WEIGHTS"))
{
// The weighted average is meaningful only under the assumption that hash speed does not
// depend on the input, which is not strictly true: the multiplication instruction has a certain amount of timing variance.
std::vector<double> lenprob;
std::istringstream ssws(weights);
double sum = 0.0;
for (double f; ssws >> f; ) {
lenprob.push_back(f);
sum += f;
}
for (size_t i = 0; i < lenprob.size(); i++)
lenprob[i] /= sum;
if (maxkey < lenprob.size()) {
const auto m = CalcWAvg(cph, lenprob.data(), minkey, maxkey);
printf("Weighted average, %5.1f%% of weights %8.3f cycles/hash\n",100.*m.second,m.first);
} else {
printf("Weighted average needs %u more weights (SKIP it)\n", (unsigned)(maxkey - lenprob.size() + 1));
}
}
g_speed = sum = sum / (maxkey - minkey + 1);
printf("Average %6.3f cycles/hash\n",sum);
printf("\n");
fflush(NULL);
}
Expand Down

0 comments on commit 2ee5761

Please sign in to comment.