diff --git a/setup.py b/setup.py
index d0c04c5..9b54b5c 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
 
 setup(
     name='pybloof',
-    version='0.7.1',
+    version='0.7.3',
     author='Jake Heinz',
     author_email='me@jh.gg',
     url="http://github.com/jhgg/pybloof",
diff --git a/src/_pybloof.pyx b/src/_pybloof.pyx
index 29129ec..f48d877 100644
--- a/src/_pybloof.pyx
+++ b/src/_pybloof.pyx
@@ -8,6 +8,8 @@
 import base64
 from libc.string cimport memcpy
 
+cdef unsigned int high = 0xFFFFFFFF
+
 cdef array.array char_array_template = array.array('b', [])
 
 cdef extern from "MurmurHash3.h" nogil:
@@ -47,16 +49,20 @@ cdef void _get_hash_buckets(key, unsigned long long * _bucket_indexes, unsigned
 
     for i in range(hash_count):
         _bucket_indexes[i] = llabs((hash1 + i * hash2) % max)\
+
+
 
 @cython.boundscheck(False)
-cdef void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes, unsigned int hash_count,
+@cython.cdivision(True)
+cdef inline void _get_hash_buckets_for_long(long long key, unsigned long long * _bucket_indexes, unsigned int hash_count,
                                      unsigned long max):
     cdef unsigned long result[2]
     cdef unsigned long hash1, hash2
     cdef unsigned long i
+
     MurmurHash3_x64_128_long(key, 0, &result)
     hash1 = result[0]
-    MurmurHash3_x64_128_long(key, result[1] & 0xFFFFFFFF, result)
+    MurmurHash3_x64_128_long(key, result[1] & high, result)
     hash2 = result[0]
 
     for i in range(hash_count):
@@ -67,6 +73,42 @@ cdef char* fmt = '!III'
 cdef ssize_t header_size = sizeof(unsigned int) * 3
 DEF MAX_HASHES = 32
 
+
+@cython.boundscheck(False)
+cdef int _uniques_in_range(unsigned int start, unsigned int stop, int[:] bitarray,
+                           unsigned long long * _bucket_indexes, unsigned int size,
+                           unsigned int hash_count, int[:] flags, unsigned int[:] uniques):
+    """Probe every integer in [start, stop) against an unpacked bit table.
+
+    bitarray holds one int per filter bit (non-zero means set); flags gets a
+    0/1 verdict per probed integer, in order; uniques collects, packed from
+    index 0, the integers whose buckets are all set. Returns how many
+    entries of uniques were written.
+    """
+    # Declared as a C integer so the loop variable is not a Python object.
+    cdef unsigned int item
+    cdef unsigned int i
+    cdef unsigned int idx = 0
+    cdef unsigned int off = 0
+    cdef int is_in
+
+    for item in range(start, stop):
+        is_in = 1
+        _get_hash_buckets_for_long(item, _bucket_indexes, hash_count, size)
+        for i in range(hash_count):
+            # NOTE(review): bounds checking is off, so this relies on every
+            # bucket index being < size -- confirm in _get_hash_buckets_for_long.
+            if not bitarray[_bucket_indexes[i]]:
+                is_in = 0
+                break
+        flags[idx] = is_in
+        if is_in:
+            uniques[off] = item
+            off += 1
+        idx += 1
+    return off
+
+
 cdef class _BloomFilter:
     cdef unsigned int _size
     cdef unsigned int _hashes
@@ -213,9 +255,50 @@ cdef class UIntBloomFilter(_BloomFilter):
 
         return True
 
+    @cython.boundscheck(False)
+    cdef _uniques_in_range(self, unsigned int start, unsigned int stop):
+        """Run the range probe and return (flags, uniq, off).
+
+        flags and uniq are arrays sized to the range; only the first off
+        entries of uniq are meaningful. An empty or inverted range gives
+        empty arrays and off == 0.
+        """
+        cdef unsigned long long _bucket_indexes[MAX_HASHES]
+        cdef unsigned int idx
+        # Unsigned subtraction would wrap around when stop < start.
+        cdef unsigned int count = stop - start if stop > start else 0
+        cdef int off
+        cdef array.array flags = array.clone(array.array('i'), count, True)
+        # 'I' (unsigned) so probed values above INT_MAX are stored intact.
+        cdef array.array uniq = array.clone(array.array('I'), count, True)
+        cdef array.array bitarray
+
+        if count == 0:
+            return flags, uniq, 0
+
+        bitarray = array.clone(array.array('i'), self._size, False)
+        # bytearray indexing yields ints on Python 2 and 3 alike.
+        # NOTE(review): assumes unpack() marks set bits with byte 0xFF -- confirm.
+        unpacked = bytearray(self._bitarray.unpack())
+        for idx in range(self._size):
+            bitarray[idx] = unpacked[idx] == 255
+        off = _uniques_in_range(start, stop, bitarray, _bucket_indexes,
+                                self._size, self._hashes, flags, uniq)
+        return flags, uniq, off
+
     def __contains__(self, unsigned int item):
         return self.contains(item)
 
+    def uniques_in_range(self, start, stop):
+        """Probe each integer in [start, stop) against the filter.
+
+        Returns a (flags, uniques) pair: a list with a 0/1 entry per probed
+        integer, in order, and the set of integers that tested positive.
+        """
+        flags, uniq, off = self._uniques_in_range(start, stop)
+        return list(flags), set(uniq[:off])
+
+
 cdef class StringBloomFilter(_BloomFilter):
     cpdef add(self, item):
         cdef unsigned long long _bucket_indexes[MAX_HASHES]