-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
173 lines (147 loc) · 3.86 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// #define _CRT_SECURE_NO_WARNINGS
#include <cilk/cilk.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <climits>
#include <ctime>
#include <mutex>
#include <random>
#include <vector>
// Number of bit positions in a simhash; sizes the per-bit accumulator and
// bounds the bit loops in simhash_tokens().
#define SIMHASH_BIT 64
// NUMWORDS:   how many words run_case() allocates and hashes per repetition.
// WORDLENGTH: characters per generated word, not counting the terminating '\0'.
enum { NUMWORDS = 1000000, WORDLENGTH = 16 };
/**
@brief
One mixing step of a multiply-by-33 hash.

@param hash  Running hash value accumulated so far.
@param s     Next value to fold into the hash.
@return      hash * 33 + s, wrapping on unsigned overflow.
*/
unsigned long long base_hash(unsigned long long hash, unsigned long long s) {
    // (hash << 5) + hash is hash * 33 in unsigned (modular) arithmetic.
    const unsigned long long scaled = hash * 33ULL;
    return scaled + s;
}
/**
@brief
Calculates a hash for the given token.

Folds the first `length` bytes of `arKey` into a running hash, one base_hash()
step per byte, starting from the seed 5381.

@param arKey   Pointer to the token bytes; at least `length` bytes must be readable.
@param length  Number of bytes to hash.
@return        The accumulated hash; 5381 when `length` is 0.
*/
unsigned long long hash_token(const char* arKey, unsigned int length)
{
    unsigned long long ret = 5381;
    // Every step consumes the previous result, so the bytes have to be folded
    // strictly in order; a plain loop visits each of the `length` bytes exactly
    // once and never reads past the end of the token.
    for (unsigned int i = 0; i < length; ++i) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended on
        // platforms where plain char is signed.
        ret = base_hash(ret, static_cast<unsigned char>(arKey[i]));
    }
    return ret;
}
/**
@brief
Calculates a simhash of tokenized sequence
*/
unsigned long long simhash_tokens(char const* const* tokens, unsigned int length)
{
float hash_vector[SIMHASH_BIT];
memset(hash_vector, 0, SIMHASH_BIT * sizeof(float));
std::mutex token_hash_m;
std::mutex hash_vector_m;
// #pragma simd
for (unsigned i = 0; i < length; i++) {
unsigned long long token_hash = 0;
int current_bit = 0;
// Calculating some hash value, can be any
token_hash = hash_token(tokens[i], strlen(tokens[i]));
// Comprised hashvalue
cilk_for (int j = SIMHASH_BIT - 1; j >= 0; j--) {
current_bit = token_hash & 0x1;
if (current_bit == 1) {
hash_vector_m.lock();
hash_vector[j] += 1;
hash_vector_m.unlock();
}
else {
hash_vector_m.lock();
hash_vector[j] -= 1;
hash_vector_m.unlock();
}
token_hash_m.lock();
token_hash = token_hash >> 1;
token_hash_m.unlock();
}
}
// Accumulated simhash sum
unsigned simhash = 0;
for (int i = 0; i < SIMHASH_BIT; i++) {
if (hash_vector[i] > 0) {
simhash = (simhash << 1) + 0x1;
}
else {
simhash = simhash << 1;
}
}
return simhash;
}
/**
@brief
Debug helper: writes the first `x` strings of `t` to stdout, one per line.

@param t  Array of NUL-terminated strings.
@param x  Number of rows to print.
@param y  Unused.
*/
void print2D(char** t, int x, int y) {
    (void)y;
    int row = 0;
    while (row < x) {
        puts(t[row]);
        ++row;
    }
}
/**
@brief
Runs a test case of `sz` timed repetitions.

Each repetition fills NUMWORDS random words of WORDLENGTH characters and times
one simhash_tokens() call over them with clock(). The mean time per repetition
is then printed in clock ticks.

@param sz  Number of repetitions; a mean of 0 is printed when it is 0.
*/
void run_case(size_t sz) {
    // One contiguous buffer holds every word plus its terminator; the pointer
    // table over it is what simhash_tokens() consumes.
    const size_t stride = WORDLENGTH + 1;
    std::vector<char> storage(static_cast<size_t>(NUMWORDS) * stride);
    std::vector<char*> words(NUMWORDS);
    for (size_t j = 0; j < words.size(); ++j) {
        words[j] = storage.data() + j * stride;
    }

    double total_ticks = 0;
    for (size_t i = 0; i < sz; ++i) {
        for (char* word : words) {
            for (int k = 0; k < WORDLENGTH; ++k) {
                // 1..127: never emit '\0', which would cut the word short.
                word[k] = static_cast<char>(1 + rand() % 127);
            }
            word[WORDLENGTH] = '\0';
        }
        // print2D(words.data(), NUMWORDS, WORDLENGTH);

        // NOTE(review): clock() reports processor time, not wall-clock time;
        // confirm that is the intended metric if the hashed code ever runs on
        // several threads.
        const auto start = clock();
        // The volatile sink keeps the call from being discarded as unused work.
        volatile unsigned long long sink = simhash_tokens(words.data(), NUMWORDS);
        const auto stop = clock();
        (void)sink;

        total_ticks += static_cast<double>(stop - start);
    }

    // Arithmetic mean over all repetitions.
    const double mean_ticks = sz ? total_ticks / static_cast<double>(sz) : 0.0;
    printf("It took simhash %f ticks to run\n", mean_ticks);
}
/**
@brief
Calls run_case() once for every entry in the table of repetition counts.
*/
void driver()
{
    // Further candidates, currently disabled: 1000, 10000, 10000, 100000
    const size_t sizes[] = { 100 };
    for (const size_t repetitions : sizes) {
        run_case(repetitions);
    }
}
/**
@brief
Program entry point: seeds rand() and computes the simhash of a one-word sample.

@return 0 always.
*/
int main() {
    srand(2);  // fixed seed
    const char* words[] = { "help" };
    // Keep the result in the function's own return type; an int would drop the
    // upper bits of the fingerprint.
    const unsigned long long h1 = simhash_tokens(words, 1);
    (void)h1;  // currently not printed
    // printf("%llu\n", h1);
    // NOTE(review): driver() is not invoked from here — confirm whether the
    // benchmark is meant to run.
    return 0;
}