forked from benhoyt/countwords
-
Notifications
You must be signed in to change notification settings - Fork 1
/
optimized.c
148 lines (135 loc) · 4.26 KB
/
optimized.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include <ctype.h>
#include <search.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUF_SIZE 65536
#define HASH_LEN 65536 // must be a power of 2
#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL
// Used both for hash table buckets and array for sorting.
typedef struct {
char* word;
int word_len;
int count;
} count;
// Comparison function for qsort() ordering by count descending.
int cmp_count(const void* p1, const void* p2) {
int c1 = ((count*)p1)->count;
int c2 = ((count*)p2)->count;
if (c1 == c2) return 0;
if (c1 < c2) return 1;
return -1;
}
count* table;
int num_unique = 0;
// Increment count of word in hash table (or insert new word).
void increment(char* word, int word_len, uint64_t hash) {
// Make 64-bit hash in range for items slice.
int index = (int)(hash & (uint64_t)(HASH_LEN-1));
// Look up key, using direct match and linear probing if not found.
while (1) {
if (table[index].word == NULL) {
// Found empty slot, add new item (copying key).
char* word_copy = malloc(word_len);
if (word_copy == NULL) {
fprintf(stderr, "out of memory\n");
exit(1);
}
memmove(word_copy, word, word_len);
table[index].word = word_copy;
table[index].word_len = word_len;
table[index].count = 1;
num_unique++;
return;
}
if (table[index].word_len == word_len &&
memcmp(table[index].word, word, word_len) == 0) {
// Found matching slot, increment existing count.
table[index].count++;
return;
}
// Slot already holds another key, try next slot (linear probe).
index++;
if (index >= HASH_LEN) {
index = 0;
}
}
}
int main() {
// Allocate hash table buckets.
table = calloc(HASH_LEN, sizeof(count));
if (table == NULL) {
fprintf(stderr, "out of memory\n");
return 1;
}
char buf[BUF_SIZE];
int offset = 0;
while (1) {
// Read file in chunks, processing one chunk at a time.
size_t num_read = fread(buf+offset, 1, BUF_SIZE-offset, stdin);
if (num_read+offset == 0) {
break;
}
// Find last space or linefeed in buf and process up to there.
int space;
for (space = offset+num_read-1; space>=0; space--) {
char c = buf[space];
if (c <= ' ') {
break;
}
}
int num_process = (space >= 0) ? space : (int)num_read+offset;
// Scan chars to process: tokenize, lowercase, and hash as we go.
int i = 0;
while (1) {
// Skip whitespace before word.
for (; i < num_process; i++) {
char c = buf[i];
if (c > ' ') {
break;
}
}
// Look for end of word, lowercase and hash as we go.
uint64_t hash = FNV_OFFSET;
int start = i;
for (; i < num_process; i++) {
char c = buf[i];
if (c <= ' ') {
break;
}
if (c >= 'A' && c <= 'Z') {
c += ('a' - 'A');
buf[i] = c;
}
hash *= FNV_PRIME;
hash ^= (uint64_t)c;
}
if (i <= start) {
break;
}
// Got a word, increment count in hash table.
increment(buf+start, i-start, hash);
}
// Move down remaining partial word.
if (space >= 0) {
offset = (offset+num_read-1) - space;
memmove(buf, buf+space+1, offset);
} else {
offset = 0;
}
}
count* ordered = calloc(num_unique, sizeof(count));
for (int i=0, i_unique=0; i<HASH_LEN; i++) {
if (table[i].word != NULL) {
ordered[i_unique++] = table[i];
}
}
qsort(ordered, num_unique, sizeof(count), cmp_count);
for (int i=0; i<num_unique; i++) {
printf("%.*s %d\n",
ordered[i].word_len, ordered[i].word, ordered[i].count);
}
return 0;
}