From 7a8e8383c13252b24517c389d7bb0d307d32a82c Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Thu, 2 May 2024 11:14:26 +0100 Subject: [PATCH 01/38] Added initial QF builder functions --- .../quotientfilter/QuotientFilterBuilder.java | 131 ++++++++++++++++++ .../QuotientFilterBuilderTest.java | 113 +++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java create mode 100644 src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java new file mode 100644 index 000000000..0d1812d95 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; +import java.util.concurrent.ThreadLocalRandom; + +import org.apache.datasketches.common.SketchesArgumentException; + +/** + *

This class provides methods to help estimate the correct parameters when + * creating a Quotient filter, and methods to create the filter using those values.

+ * + *

The underlying math is described in the + * + * Wikipedia article on Quotient filters.

+ */ +public final class QuotientFilterBuilder { + + /* + This function is used to suggest the number of bits per entry for a given number of entries. + The fingerprint length is related to the targetFalsePositiveProb roughly by 2^(-fingerprint_length). + Hence, the length of the fingerprint can be stored in at most 8 bits. + This, after rounding up, is the same as the more sophisticated expression which involves the capacity + from https://en.wikipedia.org/wiki/Quotient_filter#Probability_of_false_positives. + * @param targetFalsePositiveProb A desired false positive probability per item + * @return The suggested fingerprint length in bits + */ + public static byte suggestFingerprintLength(double targetFalsePositiveProb) { + if (targetFalsePositiveProb <= 0. || targetFalsePositiveProb >= 1.) { + + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } + return (byte) Math.ceil(-Math.log(targetFalsePositiveProb) / Math.log(2)); + } + + /** + * This method suggests the number of slots in the filter for a given input size, assuming 90% capacity. + * There is no load factor checking internally within the filter, so this method is used to map between the + * number of items we insert into a sketch and the number of slots we need to allocate. + * A design feature of Niv's implementation is that 2^j +2*j slots are allocated. This asymptotically approaches + * 2^j slots as j grows, and the canonical number of slots is 2^j. Therefore, we will only check against + * 0.9*2^j slots. + * The load factor is 0.9 to get some space-utility advantages over the bloom filter. 
+ */ + public static byte suggestLgNumSlots(long maxDistinctItems) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / Math.log(2)); + if (result < 31) { + return result; + } else { + // Largest address space for a Java array is 2^31 - 1 + throw new SketchesArgumentException("Largest address space for a Java array is 2^31 - 1"); + } + } + + /** + * Returns the largest number of unique items that can be inserted into the filter. + * We use a predefined load factor of 0.9 compared to the number of slots as 2^j. + * @param lgNumSlots The log-base-2 of the number of slots in the filter; must be in the range [1, 30] + * @return The maximum number of items that can be inserted into the filter + * @throws SketchesArgumentException if lgNumSlots is less than 1 or greater than 30 + */ + public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) { + if (lgNumSlots <= 0) { + throw new SketchesArgumentException("lgNumSlots must be at least 1."); + } else if (lgNumSlots >= 31) { + // The bound is on the exponent (number of slots = 2^lgNumSlots), not on the slot count itself. + throw new SketchesArgumentException("lgNumSlots must be less than 31."); + } + return (long) Math.floor(0.9 * Math.pow(2, lgNumSlots)); + } + + + /** + * This method suggests the parameters for a Quotient filter based on the maximum number of distinct items and the target false positive probability. + * It first validates the inputs, then calculates the log-base-2 of the number of slots and the fingerprint length. + * The results are returned as a QFPair object. + * + * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. + * @param targetFalsePositiveProb The desired false positive probability per item. + * @return A QFPair object containing the suggested number of slots (lgNumSlots) and the suggested fingerprint length. + * @throws SketchesArgumentException if the input parameters are not valid. 
+ */ + public static QFPair suggestParamsFromMaxDistinctsFPP(long maxDistinctItems, double targetFalsePositiveProb) { + validateAccuracyInputs(maxDistinctItems, targetFalsePositiveProb); + byte lgNumSlots = suggestLgNumSlots(maxDistinctItems); + byte fingerprintLength = suggestFingerprintLength(targetFalsePositiveProb); + return new QFPair(lgNumSlots, fingerprintLength); + } + + private static void validateAccuracyInputs(final long maxDistinctItems, final double targetFalsePositiveProb) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } + } + + /** + * Helper class to return a pair of parameters for a Quotient filter: + * the log-base-2 of the number of slots (lgNumSlots) and the fingerprint length. + * These parameters are used to configure the Quotient filter. + */ + public static class QFPair { + public final byte lgNumSlots; + public final byte fingerprintLength; + + public QFPair(byte lgNumSlots, byte fingerprintLength) { + this.lgNumSlots = lgNumSlots; + this.fingerprintLength = fingerprintLength; + } + } + +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java new file mode 100644 index 000000000..4fc38b2bc --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; +public class QuotientFilterBuilderTest { + + @Test + public void testSuggestFingerprintLengthFromFPP(){ + // invalid false positive rate + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(0.)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(1.)); + + // manually computed values based on formula using ceil(log2(1/targetFalsePositiveProb)) + double[] fpps = {0.1, 0.01, 0.001, 0.0001, 1E-5, 1E-6, 1E-7, 1E-8}; + byte[] results = {4, 7, 10, 14, 17, 20, 24, 27, 30}; + for (int i = 0; i < fpps.length; i++) { + assertEquals(QuotientFilterBuilder.suggestFingerprintLength(fpps[i]), results[i]); + } + } + + @Test + public static void testSuggestLgNumSlots(){ + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + + // invalid number of items + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); + 
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); + + long[] numItems = {1, 100, 1000, 1000000L}; + int[] results = {1, 7, 11, 21} ; + + for (int i = 0; i < numItems.length; i++) { + long num = numItems[i]; + byte result = qfb.suggestLgNumSlots(num); + assertEquals(result, results[i]); + } + } + + @Test + public static void testSuggestMaxNumItems(){ + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + + // invalid number of slots + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); + + + byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + + for (int i = 0; i < lgNumSlots.length; i++) { + long result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]); + assertEquals(result, results[i]); + } + } + + @Test + public static void testSuggestParamsFromMaxDistinctsFPP(){ + + // invalid number of slots + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.)); + + + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + byte lgNumSlots ; + byte fingerprintLength ; + long[] numItems = {1L, 900L, 500_000_000L} ; + double[] fpp = {1E-10, 1E-2, 1e-7} ; + + // expected outcomes + byte[] 
expected_lgNumSlots = {1, 10, 30} ; + byte[] expected_fingerprintLength = {34, 7, 24} ; + + for (int i = 0; i < numItems.length; i++) { + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlots[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + } + } + + + +} From 2080ec3cc5d3f9c81874ec63e8c49559377058ac Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Tue, 7 May 2024 12:39:40 +0100 Subject: [PATCH 02/38] Factored in QF code and added QF unit tests --- .../filters/quotientfilter/Bitmap.java | 35 + .../filters/quotientfilter/Filter.java | 129 +++ .../filters/quotientfilter/Iterator.java | 83 ++ .../quotientfilter/QuickBitVector.java | 329 ++++++ .../quotientfilter/QuickBitVectorWrapper.java | 62 ++ .../quotientfilter/QuotientFilter.java | 945 ++++++++++++++++++ .../filters/quotientfilter/DeletionTests.java | 138 +++ .../quotientfilter/QuotientFilterTest.java | 258 +++++ 8 files changed, 1979 insertions(+) create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java create mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java create mode 100644 src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java create mode 100644 src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java 
b/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java new file mode 100644 index 000000000..658e15f0d --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.filters.quotientfilter; + +public abstract class Bitmap { + + public abstract long size(); + public abstract void set(long bit_index, boolean value); + public abstract void setFromTo(long from, long to, long value); + public abstract boolean get(long bit_index); + public abstract long getFromTo(long from, long to); + + /** + * Reports whether bit {@code index} of {@code fingerprint} is set, where index 0 is the least significant bit. + * @param index bit position to test + * @param fingerprint value whose bit is inspected + * @return true if the selected bit is 1 + */ + public static boolean get_fingerprint_bit(long index, long fingerprint) { + // 1L keeps the shift in 64-bit arithmetic; an int literal would use only the low 5 bits of index. + long mask = 1L << index; + long and = fingerprint & mask; + return and != 0; + } +} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java new file mode 100644 index 000000000..55bb0f4bc --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -0,0 +1,129 @@ +package org.apache.datasketches.filters.quotientfilter; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + + +import org.apache.datasketches.memory.XxHash; + + +public abstract class Filter { + + //HashType hash_type; + + //abstract boolean rejuvenate(long key); + //abstract boolean expand(); + //protected abstract boolean _delete(long large_hash); + abstract protected boolean _insert(long large_hash, boolean insert_only_if_no_match); + abstract protected boolean _search(long large_hash); + + + //public boolean delete(long input) { +// return _delete(get_hash(input)); +// } + +// public boolean delete(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// //return _delete(HashFunctions.xxhash(input_buffer)); +// return _delete(XxHash.hashLong(input_buffer)); +// } + +// public boolean delete(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _delete(HashFunctions.xxhash(input_buffer)); +// } +// + public boolean insert(long input, boolean insert_only_if_no_match) { + //System.out.println("The ABC input is " + input); + long hash = 
get_hash(input); + //System.out.println("The ABC hash is " + hash); + return _insert(hash, insert_only_if_no_match); + } +// +// public boolean insert(String input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return _insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// +// public boolean insert(byte[] input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// + public boolean search(long input) { + return _search(get_hash(input)); + } +// +// public boolean search(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return _search(HashFunctions.xxhash(input_buffer)); +// } +// +// public boolean search(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _search(HashFunctions.xxhash(input_buffer)); +// } +// + long get_hash(long input) { +// long hash = 0; +// if (hash_type == HashType.arbitrary) { +// hash = HashFunctions.normal_hash((int)input); +// } +// else if (hash_type == HashType.xxh) { +// hash = HashFunctions.xxhash(input); +// } +// else { +// System.exit(1); +// } +// return hash; + return XxHash.hashLong(input, 0L) ; // CD edit for datasketches hash function using same seed. + } + + public long get_space_use() { return 0 ; } +// public int get_bits_per_entry() { return 0 ; } +// +// public abstract long get_num_entries(boolean include_all_internal_filters); +// +// public double get_utilization() { +// return 0; +// } +// +// public double measure_num_bits_per_entry() { +// return 0; +// } +// +// static void print_int_in_binary(int num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// int mask = (int)Math.pow(2, i); +// int masked = num & mask; +// str += masked > 0 ? 
"1" : "0"; +// } +// System.out.println(str); +// } +// +// static void print_long_in_binary(long num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// long mask = (long)Math.pow(2, i); +// long masked = num & mask; +// str += masked > 0 ? "1" : "0"; +// } +// System.out.println(str); +// } +// +// String get_fingerprint_str(long fp, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// str += Bitmap.get_fingerprint_bit(i, fp) ? "1" : "0"; +// } +// return str; +// } +// +// public void pretty_print() { +// +// } + + +} + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java new file mode 100644 index 000000000..05725657b --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -0,0 +1,83 @@ +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayDeque; +import java.util.Queue; + +public class Iterator { + + QuotientFilter qf; + long index; + long bucket_index; + long fingerprint; + Queue s; + + Iterator(QuotientFilter new_qf) { + qf = new_qf; + s = new ArrayDeque(); + //s = new ArrayDeque(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + void clear() { + s.clear(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + boolean next() { + + if (index == qf.get_logical_num_slots_plus_extensions()) { + return false; + } + + long slot = qf.get_slot(index); + boolean occupied = (slot & 1) != 0; + boolean continuation = (slot & 2) != 0; + boolean shifted = (slot & 4) != 0; + + + while (!occupied && !continuation && !shifted && index < qf.get_logical_num_slots_plus_extensions()) { + index++; + if (index == qf.get_logical_num_slots_plus_extensions()) { + return false; + } + slot = qf.get_slot(index); + occupied = (slot & 1) != 0; + continuation = (slot & 2) != 0; + shifted = (slot & 4) != 0; + } + + if (occupied && 
!continuation && !shifted) { + s.clear(); + s.add(index); + bucket_index = index; + } + else if (occupied && continuation && shifted) { + s.add(index); + } + else if (!occupied && !continuation && shifted) { + s.remove(); + bucket_index = s.peek(); + } + else if (!occupied && continuation && shifted) { + // do nothing + } + else if (occupied && !continuation && shifted) { + s.add(index); + s.remove(); + bucket_index = s.peek(); + } + fingerprint = slot >> 3; + index++; + return true; + } + + void print() { + System.out.println("original slot: " + index + " " + bucket_index); + } + + +} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java new file mode 100644 index 000000000..17f753e83 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; + +/* +Copyright © 1999 CERN - European Organization for Nuclear Research. 
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose +is hereby granted without fee, provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear in supporting documentation. +CERN makes no representations about the suitability of this software for any purpose. +It is provided "as is" without expressed or implied warranty. +*/ + +/** + * Implements quick non polymorphic non bounds checking low level bitvector operations. + * Includes some operations that interpret sub-bitstrings as long integers. + *

+ * WARNING: Methods of this class do not check preconditions. + * Provided with invalid parameters these method may return (or set) invalid values without throwing any exception. + * You should only use this class when performance is critical and you are absolutely sure that indexes are within bounds. + *

+ * A bitvector is modelled as a long array, i.e. long[] bits holds bits of a bitvector. + * Each long value holds 64 bits. + * The i-th bit is stored in bits[i/64] at + * bit position i % 64 (where bit position 0 refers to the least + * significant bit and 63 refers to the most significant bit). + * + * @author wolfgang.hoschek@cern.ch + * @version 1.0, 09/24/99 + * @see //BitVector + * @see //BitMatrix + * @see java.util.BitSet + */ +//package bitmap_implementations; + +public class QuickBitVector extends Object { + protected final static int ADDRESS_BITS_PER_UNIT = 6; // 64=2^6 + protected final static int BITS_PER_UNIT = 64; // = 1 << ADDRESS_BITS_PER_UNIT + protected final static int BIT_INDEX_MASK = 63; // = BITS_PER_UNIT - 1; + + private static final long[] pows = precomputePows(); //precompute bitmasks for speed + /** + * Makes this class non instantiable, but still inheritable. + */ + protected QuickBitVector() { + } + /** + * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0. + * In other words, returns a bit mask having 0,1,2,3,...,64 bits set. + * If to-from+1==0 then returns zero (0L). + * Precondition (not checked): to-from+1 >= 0 && to-from+1 <= 64. + * + * @param from index of start bit (inclusive) + * @param to index of end bit (inclusive). + * @return the bit mask having all bits between from and to set to 1. + */ + public static final long bitMaskWithBitsSetFromTo(long from, long to) { + return pows[(int)(to-from+1)] << from; + + // This turned out to be slower: + // 0xffffffffffffffffL == ~0L == -1L == all 64 bits set. + // int width; + // return (width=to-from+1) == 0 ? 0L : (0xffffffffffffffffL >>> (BITS_PER_UNIT-width)) << from; + } + /** + * Changes the bit with index bitIndex in the bitvector bits to the "clear" (false) state. + * + * @param bits the bitvector. + * @param bitIndex the index of the bit to be cleared. 
+ */ + public static void clear(long[] bits, long bitIndex) { + bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] &= ~(1L << (bitIndex & BIT_INDEX_MASK)); + } + /** + * Returns from the bitvector the value of the bit with the specified index. + * The value is true if the bit with the index bitIndex + * is currently set; otherwise, returns false. + * + * @param bits the bitvector. + * @param bitIndex the bit index. + * @return the value of the bit with the specified index. + */ + public static boolean get(long[] bits, long bitIndex) { + return ((bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0); + } + /** + * Returns a long value representing bits of a bitvector from index from to index to. + * Bits are returned as a long value with the return value having bit 0 set to bit from, ..., bit to-from set to bit to. + * All other bits of return value are set to 0. + * If from > to then returns zero (0L). + * Precondition (not checked): to-from+1 <= 64. + * @param bits the bitvector. + * @param from index of start bit (inclusive). + * @param to index of end bit (inclusive). + * @return the specified bits as long value. + */ + public static long getLongFromTo(long[] bits, long from, long to) { + if (from>to) return 0L; + + final int fromIndex = (int)(from >> ADDRESS_BITS_PER_UNIT); //equivalent to from/64 + final int toIndex = (int)(to >> ADDRESS_BITS_PER_UNIT); + final int fromOffset = (int)(from & BIT_INDEX_MASK); //equivalent to from%64 + final int toOffset = (int)(to & BIT_INDEX_MASK); + //this is equivalent to the above, but slower: + //final int fromIndex=from/BITS_PER_UNIT; + //final int toIndex=to/BITS_PER_UNIT; + //final int fromOffset=from%BITS_PER_UNIT; + //final int toOffset=to%BITS_PER_UNIT; + + + long mask; + if (fromIndex==toIndex) { //range does not cross unit boundaries; value to retrieve is contained in one single long value. 
+ mask=bitMaskWithBitsSetFromTo(fromOffset, toOffset); + return (bits[fromIndex] & mask) >>> fromOffset; + + } + + //range crosses unit boundaries; value to retrieve is spread over two long values. + //get part from first long value + mask=bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK); + final long x1=(bits[fromIndex] & mask) >>> fromOffset; + + //get part from second long value + mask=bitMaskWithBitsSetFromTo(0, toOffset); + final long x2=(bits[toIndex] & mask) << (BITS_PER_UNIT-fromOffset); + + //combine + return x1|x2; + } + /** + Returns the index of the least significant bit in state "true". + Returns 32 if no bit is in state "true". + Examples: +

+     0x80000000 --> 31
+     0x7fffffff --> 0
+     0x00000001 --> 0
+     0x00000000 --> 32
+     
+ */ + static public int leastSignificantBit(int value) { + int i=-1; + while (++i < 32 && (((1<<i) & value)) == 0); + return i; + } + /** + * Constructs a low level bitvector that holds {@code size} elements, with each element taking {@code bitsPerElement} bits. + * The backing array is sized in whole 64-bit units, i.e. the bit count is rounded up to the next multiple of 64. + * @param size the number of elements to be stored in the bitvector (must be >= 0). + * @param bitsPerElement the number of bits one single element takes. + * @return a low level bitvector. + */ + public static long[] makeBitVector(long size, int bitsPerElement) { + long nBits = size*bitsPerElement; + // Index of the 64-bit unit holding the last bit; for nBits == 0 this is -1, giving an empty array. + int unitIndex = (int)((nBits-1) >> ADDRESS_BITS_PER_UNIT); + long[] bitVector = new long[unitIndex + 1]; + return bitVector; + } + /** + Returns the index of the most significant bit in state "true". + Returns -1 if no bit is in state "true". + Examples: +
+     0x80000000 --> 31
+     0x7fffffff --> 30
+     0x00000001 --> 0
+     0x00000000 --> -1
+     
+ */ + static public int mostSignificantBit(int value) { + int i=32; + while (--i >=0 && (((1< correct. + * to speedup calculations in subsequent methods. + */ + private static long[] precomputePows() { + long[] pows=new long[BITS_PER_UNIT+1]; + long value = ~0L; + for (int i=BITS_PER_UNIT+1; --i >= 1; ) { + pows[i]=value >>> (BITS_PER_UNIT-i); + //System.out.println((i)+":"+pows[i]); + } + pows[0]=0L; + //System.out.println((0)+":"+pows[0]); + return pows; + + //OLD STUFF + /* + for (int i=BITS_PER_UNIT+1; --i >= 0; ) { + pows[i]=value; + value = value >>> 1; + System.out.println((i)+":"+pows[i]); + } + */ + + /* + long[] pows=new long[BITS_PER_UNIT]; + for (int i=0; ibitIndex in the bitvector bits to the state specified by value. + * + * @param bits the bitvector. + * @param bitIndex the index of the bit to be changed. + * @param value the value to be stored in the bit. + */ + public static void put(long[] bits, long bitIndex, boolean value) { + if (value) + set(bits, bitIndex); + else + clear(bits, bitIndex); + } + /** + * Sets bits of a bitvector from index from to index to to the bits of value. + * Bit from is set to bit 0 of value, ..., bit to is set to bit to-from of value. + * All other bits stay unaffected. + * If from > to then does nothing. + * Precondition (not checked): to-from+1 <= 64. + * + * @param bits the bitvector. + * @param value the value to be copied into the bitvector. + * @param from index of start bit (inclusive). + * @param to index of end bit (inclusive). 
+ */ + public static void putLongFromTo(long[] bits, long value, long from, long to) { + if (from>to) return; + + final int fromIndex=(int)(from >> ADDRESS_BITS_PER_UNIT); //equivalent to from/64 + final int toIndex=(int)(to >> ADDRESS_BITS_PER_UNIT); + final int fromOffset=(int)(from & BIT_INDEX_MASK); //equivalent to from%64 + final int toOffset=(int)(to & BIT_INDEX_MASK); + /* + this is equivalent to the above, but slower: + int fromIndex=from/BITS_PER_UNIT; + int toIndex=to/BITS_PER_UNIT; + int fromOffset=from%BITS_PER_UNIT; + int toOffset=to%BITS_PER_UNIT; + */ + + //make sure all unused bits to the left are cleared. + long mask; + mask=bitMaskWithBitsSetFromTo(to-from+1, BIT_INDEX_MASK); + long cleanValue=value & (~mask); + + long shiftedValue; + + if (fromIndex==toIndex) { //range does not cross unit boundaries; should go into one single long value. + shiftedValue=cleanValue << fromOffset; + mask=bitMaskWithBitsSetFromTo(fromOffset, toOffset); + bits[fromIndex] = (bits[fromIndex] & (~mask)) | shiftedValue; + return; + + } + + //range crosses unit boundaries; value should go into two long values. + //copy into first long value. + shiftedValue=cleanValue << fromOffset; + mask=bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK); + bits[fromIndex] = (bits[fromIndex] & (~mask)) | shiftedValue; + + //copy into second long value. + shiftedValue=cleanValue >>> (BITS_PER_UNIT - fromOffset); + mask=bitMaskWithBitsSetFromTo(0, toOffset); + bits[toIndex] = (bits[toIndex] & (~mask)) | shiftedValue; + } + /** + * Changes the bit with index bitIndex in the bitvector bits to the "set" (true) state. + * + * @param bits the bitvector. + * @param bitIndex the index of the bit to be set. + */ + public static void set(long[] bits, long bitIndex) { + bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] |= 1L << (bitIndex & BIT_INDEX_MASK); + } + /** + * Returns the index of the unit that contains the given bitIndex. 
 */
protected static long unit(long bitIndex) {
    return bitIndex >> ADDRESS_BITS_PER_UNIT;
    //equivalent to bitIndex/64 for non-negative bitIndex (the arithmetic shift rounds towards
    //negative infinity, so the two differ for negative values)
}
}
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java
new file mode 100644
index 000000000..a4c24a3ff
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java
@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
+ */ + +package org.apache.datasketches.filters.quotientfilter; + +public class QuickBitVectorWrapper extends Bitmap { + + long[] bs; + + public QuickBitVectorWrapper(int bits_per_entry, long num_entries) { + bs = QuickBitVector.makeBitVector(num_entries, bits_per_entry); + } + + @Override + public long size() { + return (long)bs.length * Long.BYTES * 8L; + } + + @Override + public void set(long bit_index, boolean value) { + if (value) { + QuickBitVector.set(bs, bit_index); + } + else { + QuickBitVector.clear(bs, bit_index); + } + } + + @Override + public void setFromTo(long from, long to, long value) { + QuickBitVector.putLongFromTo(bs, value, from, to - 1); + } + + @Override + public boolean get(long bit_index) { + return QuickBitVector.get(bs, bit_index); + } + + @Override + public long getFromTo(long from, long to) { + return QuickBitVector.getLongFromTo(bs, from, to - 1); + } + + +} + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java new file mode 100644 index 000000000..9e0f08ce5 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -0,0 +1,945 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +import org.apache.datasketches.filters.quotientfilter.Bitmap; +import org.apache.datasketches.memory.XxHash; + +public class QuotientFilter extends Filter { + + int bitPerEntry; + int fingerprintLength; + int power_of_two_size; + int num_extension_slots; + int num_existing_entries; + Bitmap filter; + + // These three fields are used to prevent throwing exceptions when the buffer space of the filter is exceeded + long last_empty_slot; + long last_cluster_start; + public long backward_steps; + + double expansion_threshold; + long max_entries_before_expansion; + boolean expand_autonomously; + boolean is_full; + + // statistics, computed in the compute_statistics method. method should be called before these are used + long num_runs; + long num_clusters; + public double avg_run_length; + public double avg_cluster_length; + + int original_fingerprint_size; + int num_expansions; + + + public QuotientFilter(int power_of_two, int bits_per_entry) { + power_of_two_size = power_of_two; + bitPerEntry = bits_per_entry; + fingerprintLength = bits_per_entry - 3; + long init_size = 1L << power_of_two; + //System.out.println("Init size: " + init_size); + num_extension_slots = power_of_two * 2; + // System.out.println("Extension slots: " + num_extension_slots); + + filter = make_filter(init_size, bits_per_entry); + + expansion_threshold = 0.8; + max_entries_before_expansion = (int) (init_size * expansion_threshold); + expand_autonomously = false; + is_full = false; + + original_fingerprint_size = fingerprintLength; + num_expansions = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; + + last_empty_slot = init_size + num_extension_slots - 1; + last_cluster_start = 0; + backward_steps = 0; + //measure_num_bits_per_entry(); + } + + 
// Newly added helper: resets the bookkeeping that keeps insertions inside the buffer,
// for a filter with init_size logical slots.
void update(long init_size) {
    last_empty_slot = init_size + num_extension_slots - 1;
    last_cluster_start = 0;
    backward_steps = 0;
}

// Rejuvenation is not implemented by this filter; always returns false.
public boolean rejuvenate(long key) {
    return false;
}

// returns the entry counter maintained by the insert methods
public long get_num_existing_entries() {
    return num_existing_entries;
}

public long get_max_entries_before_expansion() {
    return max_entries_before_expansion;
}

public boolean expand_autonomously() {
    return expand_autonomously;
}

public void set_expand_autonomously(boolean val) {
    expand_autonomously = val;
}

// allocates the bitmap for init_size logical slots plus the extension slots
Bitmap make_filter(long init_size, int bits_per_entry) {
    return new QuickBitVectorWrapper(bits_per_entry, init_size + num_extension_slots);
}

public int get_fingerprint_length() {
    return fingerprintLength;
}

// Wraps an existing bitmap. Only the geometry and the buffer bookkeeping are initialised here;
// the expansion-related fields keep their Java default values.
QuotientFilter(int power_of_two, int bits_per_entry, Bitmap bitmap) {
    power_of_two_size = power_of_two;
    bitPerEntry = bits_per_entry;
    fingerprintLength = bits_per_entry - 3;
    filter = bitmap;
    num_extension_slots = power_of_two * 2;

    // newly added: same buffer bookkeeping as the public constructor
    long init_size = 1L << power_of_two;
    last_empty_slot = init_size + num_extension_slots - 1;
    last_cluster_start = 0;
    backward_steps = 0;
}

// This filter cannot grow: an expansion request marks it as full and reports failure.
boolean expand() {
    is_full = true;
    return false;
}

// measures the number of bits per entry for the filter
public double measure_num_bits_per_entry() {
    return measure_num_bits_per_entry(this, new ArrayList<>());
}

// measures the number of bits per entry for the filter
// it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects
// The result is total bits (logical + extension slots of all filters) divided by the total number
// of non-empty slots; with no entries at all the division yields Infinity.
protected static double measure_num_bits_per_entry(QuotientFilter current, ArrayList<QuotientFilter> other_filters) {
    double num_entries = current.get_num_entries(false);
    long num_bits = current.bitPerEntry * ((1L << current.power_of_two_size) + current.num_extension_slots);
    for (QuotientFilter q : other_filters) {
        num_entries += q.get_num_entries(false);
        num_bits += q.bitPerEntry * ((1L << q.power_of_two_size) + q.num_extension_slots);
    }
    return num_bits / num_entries;
}

// scans the quotient filter and returns the number of non-empty slots
// (the include_all_internal_filters argument is not used by this implementation)
public long get_num_entries(boolean include_all_internal_filters) {
    long slots = get_physcial_num_slots();
    long num_entries = 0;
    for (long i = 0; i < slots; i++) {
        if (is_occupied(i) || is_continuation(i) || is_shifted(i)) {
            num_entries++;
        }
    }
    return num_entries;
}

// returns the fraction of occupied slots in the filter, relative to the logical slot count
public double get_utilization() {
    long num_logical_slots = 1L << power_of_two_size;
    long num_entries = get_num_entries(false);
    return num_entries / (double) num_logical_slots;
}

// returns how many whole slots fit into the bitmap
public long get_physcial_num_slots() {
    long bits = filter.size();
    return bits / bitPerEntry;
}

// returns the number of slots in the filter including the extension/buffer slots at the end
public long get_logical_num_slots_plus_extensions() {
    return (1L << power_of_two_size) + num_extension_slots;
}

// returns the number of slots in the filter without the extension/buffer slots
public long get_logical_num_slots() {
    return 1L << power_of_two_size;
}

// sets the metadata flag bits for a given slot index
void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted,
        long index) {
    set_occupied(index, is_occupied);
    set_continuation(index, is_continuation);
    set_shifted(index, is_shifted);
}

// sets the fingerprint for a given slot index
// (the fingerprint starts right after the three flag bits of the slot)
void set_fingerprint(long index, long fingerprint) {
    long start = index * bitPerEntry + 3;
    filter.setFromTo(start, start + fingerprintLength, fingerprint);
}

// Builds a readable dump of the bitmap, bit by bit.
// if vertical is on, each line will represent a slot
public String get_pretty_str(boolean vertical) {
    StringBuilder sbr = new StringBuilder();

    long logic_slots = get_logical_num_slots();
    long all_slots = get_logical_num_slots_plus_extensions();
    long num_bits = filter.size();

    for (long i = 0; i < num_bits; i++) {
        long remainder = i % bitPerEntry;
        if (remainder == 0) {
            // first bit of a slot
            long slot_num = i / bitPerEntry;
            sbr.append(" ");
            if (vertical) {
                if (slot_num == logic_slots) {
                    // separator before the extension slots
                    sbr.append("\n ---------");
                } else if (slot_num == all_slots) {
                    // separator after the last extension slot
                    sbr.append("\n d***********b");
                }
                sbr.append("\n").append(String.format("%-10d", slot_num)).append("\t");
            }
        }
        if (remainder == 3) {
            // gap between the three flag bits and the fingerprint
            sbr.append(" ");
        }
        sbr.append(filter.get(i) ? "1" : "0");
    }
    sbr.append("\n");
    return sbr.toString();
}

// print a nice representation of the filter that can be humanly read.
+ public void pretty_print() { + System.out.print(get_pretty_str(true)); + } + + // return a fingerprint in a given slot index + long get_fingerprint(long index) { + return filter.getFromTo(index * bitPerEntry + 3, index * bitPerEntry + 3 + fingerprintLength); + } + + // return an entire slot representation, including metadata flags and fingerprint + long get_slot(long index) { + return filter.getFromTo(index * bitPerEntry, (index + 1) * bitPerEntry); + } + + // compare a fingerprint input to the fingerprint in some slot index + protected boolean compare(long index, long fingerprint) { + return get_fingerprint(index) == fingerprint; + } + + // modify the flags and fingerprint of a given slot + void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, + long index, long fingerprint) { + modify_slot(is_occupied, is_continuation, is_shifted, index); + set_fingerprint(index, fingerprint); + } + + // summarize some statistical measures about the filter + public void print_filter_summary() { + long num_entries = get_num_entries(false); + long slots = (1L << power_of_two_size) + num_extension_slots; + long num_bits = slots * bitPerEntry; + System.out.println("slots:\t" + slots); + System.out.println("entries:\t" + num_entries); + System.out.println("bits\t:" + num_bits); + System.out.println("bits/entry\t:" + num_bits / (double)num_entries); + System.out.println("FP length:\t" + fingerprintLength); + System.out.println("Is full?\t" + is_full); + double capacity = num_entries / (double)(slots) ; + System.out.println("Capacity\t" + capacity); + compute_statistics(); + //System.out.println("num runs: \t\t" + num_runs); + //System.out.println("avg run length: \t" + avg_run_length); + //System.out.println("num clusters: \t\t" + num_clusters); + //System.out.println("avg cluster length: \t" + avg_cluster_length); + } + + @Override + public long get_space_use(){ + /* + Returns the number of bits used for the filter + */ + long slots = (1L << 
power_of_two_size); // + num_extension_slots; + long num_bits = slots * bitPerEntry; + return num_bits ; + } + + public int get_bits_per_entry() { + return bitPerEntry; + } + + boolean is_occupied(long index) { + return filter.get(index * bitPerEntry); + } + + boolean is_continuation(long index) { + return filter.get(index * bitPerEntry + 1); + } + + boolean is_shifted(long index) { + return filter.get(index * bitPerEntry + 2); + } + + void set_occupied(long index, boolean val) { + filter.set(index * bitPerEntry, val); + } + + void set_continuation(long index, boolean val) { + filter.set(index * bitPerEntry + 1, val); + } + + void set_shifted(long index, boolean val) { + filter.set(index * bitPerEntry + 2, val); + } + + boolean is_slot_empty(long index) { + return !is_occupied(index) && !is_continuation(index) && !is_shifted(index); + } + + // scan the cluster leftwards until finding the start of the cluster and returning its slot index + // used by deletes + long find_cluster_start(long index) { + long current_index = index; + while (is_shifted(current_index)) { + current_index--; + } + return current_index; + } + + // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides + // since the run might have been shifted to the right due to collisions + long find_run_start(long index) { + long current_index = index; + int runs_to_skip_counter = 1; + while (is_shifted(current_index)) { + if (is_occupied(current_index)) { + runs_to_skip_counter++; + } + current_index--; + } + last_cluster_start = current_index - 1; + while (true) { + if (!is_continuation(current_index)) { + runs_to_skip_counter--; + if (runs_to_skip_counter == 0) { + return current_index; + } + } + current_index++; + } + } + + // given the start of a run, scan the run and return the index of the first matching fingerprint + long find_first_fingerprint_in_run(long index, long fingerprint) { + assert(!is_continuation(index)); + do { + if (compare(index, 
fingerprint)) { + //System.out.println("found matching FP at index " + index); + return index; + } + index++; + } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + return -1; + } + + // delete the last matching fingerprint in the run + long decide_which_fingerprint_to_delete(long index, long fingerprint) { + assert(!is_continuation(index)); + long matching_fingerprint_index = -1; + do { + if (compare(index, fingerprint)) { + //System.out.println("found matching FP at index " + index); + matching_fingerprint_index = index; + } + index++; + } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + return matching_fingerprint_index; + } + + // given the start of a run, find the last slot index that still belongs to this run + long find_run_end(long index) { + while(index < get_logical_num_slots_plus_extensions() - 1 && is_continuation(index+1)) { + index++; + } + return index; + } + + // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it + boolean search(long fingerprint, long index) { + boolean does_run_exist = is_occupied(index); + if (!does_run_exist) { + return false; + } + long run_start_index = find_run_start(index); + long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); + return found_index > -1; + } + + // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. + // This method is only used for testing purposes. + Set get_all_fingerprints(long bucket_index) { + boolean does_run_exist = is_occupied(bucket_index); + HashSet set = new HashSet(); + if (!does_run_exist) { + return set; + } + long run_index = find_run_start(bucket_index); + do { + set.add(get_fingerprint(run_index)); + run_index++; + } while (is_continuation(run_index)); + return set; + } + + // Swaps the fingerprint in a given slot with a new one. 
Return the pre-existing fingerprint + long swap_fingerprints(long index, long new_fingerprint) { + long existing = get_fingerprint(index); + set_fingerprint(index, new_fingerprint); + return existing; + } + + // finds the first empty slot after the given slot index + long find_first_empty_slot(long index) { + while (!is_slot_empty(index)) { + index++; + } + return index; + } + + // moves backwards to find the first empty slot + // used as a part of the mechanism to prevent exceptions when exceeding the quotient filter's bounds + long find_backward_empty_slot(long index) { + while (index >= 0 && !is_slot_empty(index)) { + backward_steps++; + index--; + } + return index; + } + + // return the first slot to the right where the current run starting at the index parameter ends + long find_new_run_location(long index) { + if (!is_slot_empty(index)) { + index++; + } + while (is_continuation(index)) { + index++; + } + return index; + } + + boolean insert_new_run(long canonical_slot, long long_fp) { + long first_empty_slot = find_first_empty_slot(canonical_slot); // finds the first empty slot to the right of the canonical slot that is empty + long preexisting_run_start_index = find_run_start(canonical_slot); // scans the cluster leftwards and then to the right until reaching our run's would be location + long start_of_this_new_run = find_new_run_location(preexisting_run_start_index); // If there is already a run at the would-be location, find its end and insert the new run after it + boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); + + // modify some metadata flags to mark the new run + set_occupied(canonical_slot, true); + if (first_empty_slot != canonical_slot) { + set_shifted(start_of_this_new_run, true); + } + set_continuation(start_of_this_new_run, false); + + // if the slot was initially empty, we can just terminate, as there is nothing to push to the right + if (slot_initially_empty) { + set_fingerprint(start_of_this_new_run, long_fp); + if 
(start_of_this_new_run == last_empty_slot) { + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + num_existing_entries++; + return true; + } + + // push all entries one slot to the right + // if we inserted this run in the middle of a cluster + long current_index = start_of_this_new_run; + boolean is_this_slot_empty; + boolean temp_continuation = false; + do { + if (current_index >= get_logical_num_slots_plus_extensions()) { + return false; + } + + is_this_slot_empty = is_slot_empty(current_index); + long_fp = swap_fingerprints(current_index, long_fp); + + if (current_index > start_of_this_new_run) { + set_shifted(current_index, true); + } + + if (current_index > start_of_this_new_run) { + boolean current_continuation = is_continuation(current_index); + set_continuation(current_index, temp_continuation); + temp_continuation = current_continuation; + } + current_index++; + if (current_index == last_empty_slot) { // TODO get this out of the while loop + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + } while (!is_this_slot_empty); + num_existing_entries++; + return true; + } + + boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { + //System.out.println("Inserting Fingerprint " + long_fp); + //System.out.println("Inserting @ index " + index); + //System.out.println("BoolMatch? 
" + insert_only_if_no_match); + //System.out.println("**********"); + //System.out.println("Num items: " + num_existing_entries); + //System.out.println("Max items: " + max_entries_before_expansion); + + if (index > last_empty_slot) { + return false; + } + boolean does_run_exist = is_occupied(index); + if (!does_run_exist) { + boolean val = insert_new_run(index, long_fp); + return val; + } + + long run_start_index = find_run_start(index); + if (does_run_exist && insert_only_if_no_match) { + long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); + if (found_index > -1) { + return false; + } + } + return insert_fingerprint_and_push_all_else(long_fp, run_start_index); + } + + // insert an fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. + boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) { + long current_index = run_start_index; + boolean is_this_slot_empty; + boolean finished_first_run = false; + boolean temp_continuation = false; + + do { + if (current_index >= get_logical_num_slots_plus_extensions()) { + return false; + } + is_this_slot_empty = is_slot_empty(current_index); + if (current_index > run_start_index) { + set_shifted(current_index, true); + } + if (current_index > run_start_index && !finished_first_run && !is_continuation(current_index)) { + finished_first_run = true; + set_continuation(current_index, true); + long_fp = swap_fingerprints(current_index, long_fp); + } + else if (finished_first_run) { + boolean current_continuation = is_continuation(current_index); + set_continuation(current_index, temp_continuation); + temp_continuation = current_continuation; + long_fp = swap_fingerprints(current_index, long_fp); + } + if (current_index == last_empty_slot) { + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + current_index++; + } while (!is_this_slot_empty); + num_existing_entries++; + return true; + } + + boolean delete(long 
fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { + long run_end = find_run_end(matching_fingerprint_index); + + // the run has only one entry, we need to disable its is_occupied flag + // we just remember we need to do this here, and we do it later to not interfere with counts + boolean turn_off_occupied = run_start_index == run_end; + + // First thing to do is move everything else in the run back by one slot + for (long i = matching_fingerprint_index; i < run_end; i++) { + long f = get_fingerprint(i + 1); + set_fingerprint(i, f); + } + + // for each slot, we want to know by how much the entry there is shifted + // we can do this by counting the number of continuation flags set to true + // and the number of occupied flags set to false from the start of the cluster to the given cell + // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted + long cluster_start = find_cluster_start(canonical_slot); + long num_shifted_count = 0; + long num_non_occupied = 0; + for (long i = cluster_start; i <= run_end; i++) { + if (is_continuation(i)) { + num_shifted_count++; + } + if (!is_occupied(i)) { + num_non_occupied++; + } + } + + set_fingerprint(run_end, 0); + set_shifted(run_end, false); + set_continuation(run_end, false); + + // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. + // the inner for loop iterates over cells of particular runs, pushing entries one slot back. + do { + // we first check if the next run actually exists and if it is shifted. + // only if both conditions hold, we need to shift it back one slot. 
+ //boolean does_next_run_exist = !is_slot_empty(run_end + 1); + //boolean is_next_run_shifted = is_shifted(run_end + 1); + //if (!does_next_run_exist || !is_next_run_shifted) { + if (run_end >= get_logical_num_slots_plus_extensions()-1 || + is_slot_empty(run_end + 1) || !is_shifted(run_end + 1)) { + if (turn_off_occupied) { + // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts + set_occupied(canonical_slot, false); + + } + if (run_end > last_empty_slot) { + last_empty_slot = run_end; + } + return true; + } + + // we now find the start and end of the next run + long next_run_start = run_end + 1; + run_end = find_run_end(next_run_start); + + // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot + // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place + if ( is_occupied(next_run_start - 1) && num_shifted_count - num_non_occupied == 1 ) { + set_shifted(next_run_start - 1, false); + } + else { + set_shifted(next_run_start - 1, true); + } + + for (long i = next_run_start; i <= run_end; i++) { + long f = get_fingerprint(i); + set_fingerprint(i - 1, f); + if (is_continuation(i)) { + set_continuation(i-1, true); + } + if (!is_occupied(i)) { + num_non_occupied++; + } + } + num_shifted_count += run_end - next_run_start; + set_fingerprint(run_end, 0); + set_shifted(run_end, false); + set_continuation(run_end, false); + } while (true); + } + + boolean delete(long fingerprint, long canonical_slot) { + if (canonical_slot >= get_logical_num_slots()) { + return false; + } + // if the run doesn't exist, the key can't have possibly been inserted + boolean does_run_exist = is_occupied(canonical_slot); + if (!does_run_exist) { + return false; + } + long run_start_index = find_run_start(canonical_slot); + + long matching_fingerprint_index = 
decide_which_fingerprint_to_delete(run_start_index, fingerprint); + + if (matching_fingerprint_index == -1) { + // we didn't find a matching fingerprint + return false; + } + + return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); + + } + + + + /* + Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index + */ + long get_slot_index(long large_hash) { + long slot_index_mask = (1L << power_of_two_size) - 1; + long slot_index = large_hash & slot_index_mask; + //System.out.format("\n**get_slot_index(): [total_hash:index_hash:int_index] --> [%016x:%016x:%016x]\n", large_hash, (int)large_hash, slot_index); + return slot_index; + } + + long gen_fingerprint(long large_hash) { + long fingerprint_mask = (1L << fingerprintLength) - 1L; + fingerprint_mask = fingerprint_mask << power_of_two_size; + long fingerprint = (large_hash & fingerprint_mask) >> power_of_two_size; + //System.out.format("\n**gen_fingerprint(): [total_hash:fingerprint_hash:int_fingerprint] --> [%016x:%016x:%016x]\n", large_hash, ((int)(large_hash>>32)), fingerprint); + return fingerprint; + } + +// void print_key(int input) { +// long large_hash = HashFunctions.normal_hash(input); +// long slot_index = get_slot_index(large_hash); +// long fingerprint = gen_fingerprint(large_hash); +// +// System.out.println("num : " + input); +// System.out.print("hash : "); +// print_long_in_binary(large_hash, fingerprintLength + power_of_two_size); +// //print_int_in_binary(slot_index_mask, 31); +// System.out.print("bucket: "); +// print_long_in_binary(slot_index, power_of_two_size); +// System.out.print("FP : "); +// //print_int_in_binary(fingerprint_mask, 31); +// print_long_in_binary(fingerprint, fingerprintLength); +// System.out.println(); +// +// } +// +// void set_expansion_threshold(double thresh) { +// expansion_threshold = thresh; +// max_entries_before_expansion = (long)(Math.pow(2, power_of_two_size) * expansion_threshold); +// } +// + 
/* + This is the main insertion function accessed externally. + It calls the underlying filter _insert function which hashes the input + item internally. + Hence, the `large_hash` argument is already a hash key that has been generated + by the hashing library (eg xxhash). + */ + protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { + //System.out.println("Inserting long hash " + large_hash); + if (is_full) { + return false; + } + long slot_index = get_slot_index(large_hash); + long fingerprint = gen_fingerprint(large_hash); + + /*print_long_in_binary(large_hash, 64); + print_long_in_binary(slot_index, 32); + print_long_in_binary((int)fingerprint, 64); + System.out.println(slot_index + " " + fingerprint ); + System.out.println(); */ + + boolean success = insert(fingerprint, slot_index, false); + /*if (!success) { + System.out.println("insertion failure"); + System.out.println(input + "\t" + slot_index + "\t" + get_fingerprint_str(fingerprint, fingerprintLength)); + pretty_print(); + System.exit(1); + }*/ + +// if (expand_autonomously && num_existing_entries >= max_entries_before_expansion) { +// boolean expanded = expand(); +// if (expanded) { +// num_expansions++; +// } +// } + return success; + } +// +// protected boolean _delete(long large_hash) { +// long slot_index = get_slot_index(large_hash); +// long fp_long = gen_fingerprint(large_hash); +// boolean success = delete(fp_long, slot_index); +// if (success) { +// num_existing_entries--; +// } +// return success; +// } +// + protected boolean _search(long large_hash) { + long slot_index = get_slot_index(large_hash); + long fingerprint = gen_fingerprint(large_hash); + return search(fingerprint, slot_index); + } + + + + public boolean get_bit_at_offset(int offset) { + return filter.get(offset); + } + + public void compute_statistics() { + num_runs = 0; + num_clusters = 0; + double sum_run_lengths = 0; + double sum_cluster_lengths = 0; + + int current_run_length = 0; + int 
current_cluster_length = 0; + + long num_slots = get_logical_num_slots_plus_extensions(); + for (long i = 0; i < num_slots; i++) { + + boolean occupied = is_occupied(i); + boolean continuation = is_continuation(i); + boolean shifted = is_shifted(i); + + if ( !occupied && !continuation && !shifted ) { // empty slot + sum_cluster_lengths += current_cluster_length; + current_cluster_length = 0; + sum_run_lengths += current_run_length; + current_run_length = 0; + } + else if ( !occupied && !continuation && shifted ) { // start of new run + num_runs++; + sum_run_lengths += current_run_length; + current_run_length = 1; + current_cluster_length++; + } + else if ( !occupied && continuation && !shifted ) { + // not used + } + else if ( !occupied && continuation && shifted ) { // continuation of run + current_cluster_length++; + current_run_length++; + } + else if ( occupied && !continuation && !shifted ) { // start of new cluster & run + num_runs++; + num_clusters++; + sum_cluster_lengths += current_cluster_length; + sum_run_lengths += current_run_length; + current_cluster_length = 1; + current_run_length = 1; + } + else if (occupied && !continuation && shifted ) { // start of new run + num_runs++; + sum_run_lengths += current_run_length; + current_run_length = 1; + current_cluster_length++; + } + else if (occupied && continuation && !shifted ) { + // not used + } + else if (occupied && continuation && shifted ) { // continuation of run + current_cluster_length++; + current_run_length++; + } + } + avg_run_length = sum_run_lengths / num_runs; + avg_cluster_length = sum_cluster_lengths / num_clusters; + } + +// +// void ar_sum1(ArrayList ar, int index) +// { +// int s = ar.size(); +// if (s <= index) +// { +// for (int i = s; i measure_cluster_length() +// { +// ArrayList ar = new ArrayList(); +// +// num_runs = 0; +// num_clusters = 0; +// +// int current_run_length = 0; +// int current_cluster_length = 0; +// +// int cnt = 0; +// +// for (int i = 0; i < 
get_logical_num_slots_plus_extensions(); i++) { +// +// boolean occupied = is_occupied(i); +// boolean continuation = is_continuation(i); +// boolean shifted = is_shifted(i); +// +// if (!occupied && !continuation && !shifted ) { // empty slot +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// current_cluster_length = 0; +// current_run_length = 0; +// } +// else if (!occupied && !continuation && shifted ) { // start of new run +// num_runs++; +// current_run_length = 1; +// current_cluster_length++; +// } +// else if (!occupied && continuation && shifted ) { // continuation of run +// current_cluster_length++; +// current_run_length++; +// } +// else if (occupied && !continuation && !shifted ) { // start of new cluster & run +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// num_runs++; +// num_clusters++; +// //if(current_cluster_length == 0) cnt++; +// current_cluster_length = 1; +// current_run_length = 1; +// } +// else if (occupied && !continuation && shifted ) { // start of new run +// num_runs++; +// current_run_length = 1; +// current_cluster_length++; +// } +// else if (occupied && continuation && shifted ) { // continuation of run +// current_cluster_length++; +// current_run_length++; +// } +// } +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// //System.out.println("CNT = " + cnt); +// return ar; +// } +// +// /* +// @charlied +// Returns the fraction of the filter that is occupied by inserted items. +// Extension slots are omitted from the calculation of the load factor because they are used to accomodate +// items in the filter at the top end of the filter. +// Asymptotically, these will make little-to-no difference to the load in these calculations as the slots +// contributed 2*j / (2^j) --> 0 entries. 
+// */ +// public double get_load() { +// return num_existing_entries / (double) get_logical_num_slots(); +// } + +} + + diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java new file mode 100644 index 000000000..4b2a5da3f --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -0,0 +1,138 @@ +package org.apache.datasketches.filters.quotientfilter; +import org.testng.annotations.Test; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertEquals; + +import java.util.BitSet; + +public class DeletionTests { + + /** + * This test checks the functionality of deleting items from the QuotientFilter. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. Checking that the remaining keys have returned to their canonical slots. + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. 
+ */ + @Test + static public void BasicDeletions() { + int bits_per_entry = 8; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + long fp1 = 1 << 4; + long fp2 = 1 << 3; + long fp3 = 1 << 2; + long fp4 = 31; + + qf.insert(fp4, 1, false); + qf.insert(fp1, 1, false); + qf.insert(fp1, 1, false); + qf.insert(fp2, 2, false); + qf.insert(fp1, 1, false); + qf.insert(fp1, 1, false); + qf.insert(fp3, 4, false); + + + qf.delete(31, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + + BitSet result = new BitSet(num_entries * bits_per_entry); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, false, false, fp2); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, false, false, fp3); + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } + + /** + * This test checks the functionality of deleting items from the QuotientFilter. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. Checking that the remaining keys have returned to their canonical slots. + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. 
+ */ + @Test + static public void DeletionsWithSameFingerprint() { + int bits_per_entry = 8; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + + // All keys have the same fingerprint but are mapped into (mostly) different slots + qf.insert(0, 1, false); + qf.insert(0, 1, false); + qf.insert(0, 2, false); + qf.insert(0, 2, false); + qf.insert(0, 3, false); + qf.insert(0, 3, false); + qf.insert(0, 3, false); + qf.insert(0, 6, false); + qf.insert(0, 6, false); + qf.insert(0, 6, false); + qf.insert(0, 7, false); + + + qf.delete(0, 2); + qf.delete(0, 3); + + BitSet result = new BitSet(num_entries * bits_per_entry); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, true, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, true, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); + + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } + + @Test + /** + * This is a test for deleting items from the QuotientFilter even when an overflow is caused + * by multiple insertions. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. 
Checking that the remaining keys have returned to their canonical slots. + * + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. + */ + static public void DeletionsWithOverflow() { + int bits_per_entry = 8; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + qf.insert(0, 1, false); + qf.insert(0, 1, false); + qf.insert(0, 2, false); + qf.insert(0, 2, false); + qf.insert(0, 3, false); + qf.insert(0, 4, false); + qf.insert(0, 4, false); + qf.insert(0, 5, false); + + //qf.pretty_print(); + qf.delete(0, 3); + //qf.pretty_print(); + + BitSet result = new BitSet(num_entries * bits_per_entry); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, false, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, true, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, false, true, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, true, 0); + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java new file mode 100644 index 000000000..d03978bf9 --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -0,0 +1,258 @@ +package org.apache.datasketches.filters.quotientfilter; +import org.testng.annotations.Test; +import static 
org.testng.Assert.assertTrue; +import static org.testng.Assert.assertEquals; + +import java.util.BitSet; +import java.util.HashSet; +import java.util.Random; + + +public class QuotientFilterTest { + /* + * This test is based on the example from https://en.wikipedia.org/wiki/Quotient_filter + * in "Algorithm Description" section. + * It performs the same insertions and query as the example and verifies that it gets the same results. + * The insertion keys are: b, e, f, c, d, a which are hashed into slots as: + * (b,1), (e,4), (f, 7), (c,1), (d,2), (a,1) + */ + @Test + static public void WikiInsertionTest() { + int bits_per_entry = 8; // 8 bits per entry => 5 bits fingerprints, resolved internally in the filter. + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + // this test does not need different fingerprints as it is testing the slot locations and metadata bits. + long fingerprint0 = 0; + long fingerprint1 = (1 << bits_per_entry) - 1; + + /* + The expected sketch is + 0 000 00000 + 1 100 00000 + 2 111 00000 + 3 011 00000 + 4 101 00000 + 5 001 11111 + 6 000 00000 + 7 100 00000 + */ + qf.insert(fingerprint0, 1, false); + qf.insert(fingerprint1, 4, false); // 11111 is inserted at slot 45 but pushed to slot 5 + qf.insert(fingerprint0, 7, false); + qf.insert(fingerprint0, 1, false); + qf.insert(fingerprint0, 2, false); + qf.insert(fingerprint0, 1, false); + assertEquals(qf.num_existing_entries, 6); + + + + // these are the expected resulting is_occupied, is_continuation, and is_shifted bits + // for all slots contiguously. 
We do not store the fingerprints here + BitSet result = new BitSet(num_entries * bits_per_entry); + result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 2, true, true, true, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 3, false, true, true, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 4, true, false, true, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 5, false, false, true, fingerprint1); + result = set_slot_in_test(result, bits_per_entry, 6, false, false, false, fingerprint0); + result = set_slot_in_test(result, bits_per_entry, 7, true, false, false, fingerprint0); + assertTrue(check_equality(qf, result, true)); + } + + /* + * This test is based on the Figure 2. from https://vldb.org/pvldb/vol5/p1627_michaelabender_vldb2012.pdf. + * It performs the same insertions as in Figure 2 and checks for the same result. 
+ */ + @Test + static public void PaperInsertionTest() { + int bits_per_entry = 8; + int num_entries_power = 4; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(4, 8); + + // (key, slot): {(a, 1), (b,1), (c ,3), (d, 3), (e, 3), (f, 4), (g, 6), (h, 6)} + qf.insert(0, 1, false); + qf.insert(0, 1, false); + qf.insert(0, 3, false); + qf.insert(0, 3, false); + qf.insert(0, 3, false); + qf.insert(0, 4, false); + qf.insert(0, 6, false); + qf.insert(0, 6, false); + + BitSet result = new BitSet(num_entries * bits_per_entry); + result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); + result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); + result = set_slot_in_test(result, bits_per_entry, 2, false, true, true, 0); + result = set_slot_in_test(result, bits_per_entry, 3, true, false, false, 0); + result = set_slot_in_test(result, bits_per_entry, 4, true, true, true, 0); + result = set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); + result = set_slot_in_test(result, bits_per_entry, 6, true, false, true, 0); + result = set_slot_in_test(result, bits_per_entry, 7, false, false, true, 0); + result = set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); + assertTrue(check_equality(qf, result, false)); + } + + // test we don't get any false negatives for quotient filter + @Test + static public void FalseNegativeTest() { + int bits_per_entry = 10; + int num_entries_power = 10; + QuotientFilter filter = new QuotientFilter(num_entries_power, bits_per_entry); + int num_entries = (int) (Math.pow(2, num_entries_power) * 0.9 ); + assertTrue(test_no_false_negatives(filter, num_entries)); + } + + + /* + * Adds two entries to the end of the filter, causing an overflow into the extension slots. + * Checks this can be handled by the internal data structure and then deletes one of the keys from the filter. 
+ */ + @Test + static public void OverflowTest() { + int bits_per_entry = 8; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + int fingerprint_size = bits_per_entry - 3; + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + long fp2 = 1 << fingerprint_size - 1; + qf.insert(fp2, num_entries - 1, false); + qf.insert(fp2, num_entries - 1, false); + qf.delete(fp2, num_entries - 1); + boolean found = qf.search(fp2, num_entries - 1); + assertTrue(found); + } + + /** + * This method tests the functionality of the QuotientFilter and Iterator classes. It creates a QuotientFilter and inserts + * six entries into it. An Iterator is then used to traverse the entries in the QuotientFilter. The method checks if the + * bucket index of each visited entry matches the expected bucket index. If there's a mismatch, an error message is printed + * and the program exits, indicating a test failure. + */ + @Test + static public void testQuotientFilterInsertionAndIteration() { + + int bits_per_entry = 8; + int num_entries_power = 4; + //int num_entries = (int)Math.pow(2, num_entries_power); + //int fingerprint_size = bits_per_entry - 3; + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + qf.insert(0, 2, false); + qf.insert(0, 3, false); + qf.insert(0, 3, false); + qf.insert(0, 4, false); + qf.insert(0, 23, false); // last key in the filter + qf.insert(0, 24, false); // outside the bounds, logical slot 14 does not exist logically, even if it might exist physically + + Iterator it = new Iterator(qf); + int[] arr = new int[] {2, 3, 3, 4, 23}; + int arr_index = 0; + while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} + } + + @Test + static public void testQuotientFilterIterator() { + + int bits_per_entry = 8; + int num_entries_power = 4; + QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + qf.insert(0, 1, false); + qf.insert(0, 4, false); + qf.insert(0, 7, 
false); + qf.insert(0, 1, false); + qf.insert(0, 2, false); + qf.insert(0, 1, false); + qf.insert(0, 15, false); + + Iterator it = new Iterator(qf); + int[] arr = new int[] {1, 1, 1, 2, 4, 7, 15}; + int arr_index = 0; + while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} + } + + + // Helper functions + + /** + * This method sets the values of a slot in a BitSet based on the provided parameters. + * The slot is defined by the number of bits per entry and the slot index. + * The values to be set include whether the slot is occupied, whether it is a continuation of a previous entry, + * whether it is shifted, and the fingerprint. + * + * @param result The BitSet where the slot values will be set. + * @param bits_per_entry The number of bits per entry in the BitSet. + * @param slot The index of the slot to be set. + * @param is_occupied Whether the slot is occupied. + * @param is_continuation Whether the slot is a continuation of a previous entry. + * @param is_shifted Whether the slot is shifted. + * @param fingerprint The fingerprint to be set in the slot. + * @return The BitSet after setting the slot values. 
+ */ + static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, long fingerprint) { + int index = bits_per_entry * slot; + result.set(index++, is_occupied); + result.set(index++, is_continuation); + result.set(index++, is_shifted); + for (int i = 0; i < bits_per_entry - 3; i++) { + result.set(index++, Bitmap.get_fingerprint_bit(i, fingerprint) ); + } + return result; + } + + static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, String fingerprint) { + long l_fingerprint = 0; + for (int i = 0; i < fingerprint.length(); i++) { + char c = fingerprint.charAt(i); + if (c == '1') { + l_fingerprint |= (1 << i); + } + } + return set_slot_in_test(result, bits_per_entry, slot, is_occupied, is_continuation, is_shifted, l_fingerprint); + } + + static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) { + for (int i = 0; i < bs.size(); i++) { + if (check_also_fingerprints || (i % qf.bitPerEntry == 0 || i % qf.bitPerEntry == 1 || i % qf.bitPerEntry == 2)) { + if (qf.get_bit_at_offset(i) != bs.get(i)) { + return false; + } + } + } + return true; + } + + /* + Helper functino to test that no false negatives are returned. 
+ */ + static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { + HashSet added = new HashSet(); + int seed = 5; + Random rand = new Random(seed); + + for (int i = 0; i < num_entries; i++) { + int rand_num = rand.nextInt(); + boolean success = filter.insert(rand_num, false); + if (success) { + added.add(rand_num); + } + else { + System.out.println("insertion failed"); + } + } + + for (Integer i : added) { + boolean found = filter.search((long)i); + if (!found) { + return false ; + } + } + return true; + } + +} From 532a77a3cf73138cdc296ebd358083c43c2c7789 Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 8 May 2024 14:10:20 +0100 Subject: [PATCH 03/38] Tidied java doc github build issue --- .../filters/quotientfilter/QuotientFilterBuilder.java | 10 ++++++---- .../filters/quotientfilter/DeletionTests.java | 1 - 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java index 0d1812d95..1f98c82f2 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -23,12 +23,12 @@ import org.apache.datasketches.common.SketchesArgumentException; /** - *

This class provides methods to help estimate the correct parameters when - * creating a Quotient filter, and methods to create the filter using those values.

+ * This class provides methods to help estimate the correct parameters when + * creating a Quotient filter, and methods to create the filter using those values. * - *

The underlying math is described in the + * The underlying math is described in the * - * Wikipedia article on Quotient filters.

+ * Wikipedia article on Quotient filters. */ public final class QuotientFilterBuilder { @@ -57,6 +57,8 @@ public static byte suggestFingerprintLength(double targetFalsePositiveProb) { * 2^j slots as j grows, and the canonical number of slots is 2^j. Therefore, we will only check against * 0.9*2^j slots. * The load factor is 0.9 to get some space-utility advantages over the bloom filter. + * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. + * @return The log-base-2 of the number of slots in the filter. */ public static byte suggestLgNumSlots(long maxDistinctItems) { if (maxDistinctItems <= 0) { diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 4b2a5da3f..04ca81510 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -1,7 +1,6 @@ package org.apache.datasketches.filters.quotientfilter; import org.testng.annotations.Test; import static org.testng.Assert.assertTrue; -import static org.testng.Assert.assertEquals; import java.util.BitSet; From 9397e9c78acd426b47d05597a177f78c2506fcbe Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 8 May 2024 14:22:47 +0100 Subject: [PATCH 04/38] Initial javadoc correction --- .../quotientfilter/QuickBitVector.java | 70 +++++++++++-------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java index 17f753e83..c9ba8dc99 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -36,7 +36,7 @@ * Provided with invalid parameters these 
method may return (or set) invalid values without throwing any exception. * You should only use this class when performance is critical and you are absolutely sure that indexes are within bounds. *

- * A bitvector is modelled as a long array, i.e. long[] bits holds bits of a bitvector. + * A bitvector is modelled as a long array, i.e. long[] bits holds bits of a bitvector. * Each long value holds 64 bits. * The i-th bit is stored in bits[i/64] at * bit position i % 64 (where bit position 0 refers to the least @@ -44,8 +44,8 @@ * * @author wolfgang.hoschek@cern.ch * @version 1.0, 09/24/99 - * @see //BitVector - * @see //BitMatrix + * @see BitVector + * @see BitMatrix * @see java.util.BitSet */ //package bitmap_implementations; @@ -64,12 +64,12 @@ protected QuickBitVector() { /** * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0. * In other words, returns a bit mask having 0,1,2,3,...,64 bits set. - * If to-from+1==0 then returns zero (0L). - * Precondition (not checked): to-from+1 >= 0 && to-from+1 <= 64. + * If to-from+1==0 then returns zero (0L). + * Precondition (not checked): to-from+1 >= 0 && to-from+1 <= 64. * * @param from index of start bit (inclusive) * @param to index of end bit (inclusive). - * @return the bit mask having all bits between from and to set to 1. + * @return the bit mask having all bits between from and to set to 1. */ public static final long bitMaskWithBitsSetFromTo(long from, long to) { return pows[(int)(to-from+1)] << from; @@ -80,7 +80,7 @@ public static final long bitMaskWithBitsSetFromTo(long from, long to) { // return (width=to-from+1) == 0 ? 0L : (0xffffffffffffffffL >>> (BITS_PER_UNIT-width)) << from; } /** - * Changes the bit with index bitIndex in the bitvector bits to the "clear" (false) state. + * Changes the bit with index bitIndex in the bitvector bits to the "clear" (false) state. * * @param bits the bitvector. * @param bitIndex the index of the bit to be cleared. @@ -90,8 +90,8 @@ public static void clear(long[] bits, long bitIndex) { } /** * Returns from the bitvector the value of the bit with the specified index. 
- * The value is true if the bit with the index bitIndex - * is currently set; otherwise, returns false. + * The value is true if the bit with the index bitIndex + * is currently set; otherwise, returns false. * * @param bits the bitvector. * @param bitIndex the bit index. @@ -101,11 +101,11 @@ public static boolean get(long[] bits, long bitIndex) { return ((bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0); } /** - * Returns a long value representing bits of a bitvector from index from to index to. + * Returns a long value representing bits of a bitvector from index from to index to. * Bits are returned as a long value with the return value having bit 0 set to bit from, ..., bit to-from set to bit to. * All other bits of return value are set to 0. - * If from > to then returns zero (0L). - * Precondition (not checked): to-from+1 <= 64. + * If from > to then returns zero (0L). + * Precondition (not checked): to-from+1 <= 64. * @param bits the bitvector. * @param from index of start bit (inclusive). * @param to index of end bit (inclusive). @@ -161,7 +161,7 @@ static public int leastSignificantBit(int value) { return i; } /** - * Constructs a low level bitvector that holds size elements, with each element taking bitsPerElement bits. + * Constructs a low level bitvector that holds size elements, with each element taking bitsPerElement bits. * CD. THIS METHOD ESSENTIALLY ROUNDS TO THE NEXT MULTIPLE OF 64 BITS. * @param size the number of elements to be stored in the bitvector (must be >= 0). * @param bitsPerElement the number of bits one single element takes. @@ -184,28 +184,36 @@ public static long[] makeBitVector(long size, int bitsPerElement) { //System.out.println("Num slots available: " + (bitVector.length * 64) / bitsPerElement); return bitVector; } + /** - Returns the index of the most significant bit in state "true". - Returns -1 if no bit is in state "true". - Examples: -

-     0x80000000 --> 31
-     0x7fffffff --> 30
-     0x00000001 --> 0
-     0x00000000 --> -1
-     
+ * Returns the index of the most significant bit in state "true". + * Returns -1 if no bit is in state "true". + * + * Examples: + *
+     * 0x80000000 --> 31
+     * 0x7fffffff --> 30
+     * 0x00000001 --> 0
+     * 0x00000000 --> -1
+     * 
+ * + * @param value The integer value for which the most significant bit index is to be found. + * @return The index of the most significant bit in state "true". Returns -1 if no bit is in state "true". */ static public int mostSignificantBit(int value) { int i=32; while (--i >=0 && (((1<bitIndex in the bitvector bits to the state specified by value. + * Sets the bit with index bitIndex in the bitvector bits to the state specified by value. * * @param bits the bitvector. * @param bitIndex the index of the bit to be changed. @@ -261,8 +269,8 @@ public static void put(long[] bits, long bitIndex, boolean value) { * Sets bits of a bitvector from index from to index to to the bits of value. * Bit from is set to bit 0 of value, ..., bit to is set to bit to-from of value. * All other bits stay unaffected. - * If from > to then does nothing. - * Precondition (not checked): to-from+1 <= 64. + * If from > to then does nothing. + * Precondition (not checked): to-from+1 <= 64. * * @param bits the bitvector. * @param value the value to be copied into the bitvector. @@ -311,7 +319,7 @@ public static void putLongFromTo(long[] bits, long value, long from, long to) { bits[toIndex] = (bits[toIndex] & (~mask)) | shiftedValue; } /** - * Changes the bit with index bitIndex in the bitvector bits to the "set" (true) state. + * Changes the bit with index bitIndex in the bitvector bits to the "set" (true) state. * * @param bits the bitvector. * @param bitIndex the index of the bit to be set. @@ -321,9 +329,11 @@ public static void set(long[] bits, long bitIndex) { } /** * Returns the index of the unit that contains the given bitIndex. + * + * @param bitIndex The index of the bit to be checked. + * @return The index of the unit that contains the given bitIndex. 
*/ protected static long unit(long bitIndex) { - return bitIndex >> ADDRESS_BITS_PER_UNIT; - //equivalent to bitIndex/64 + return bitIndex >> ADDRESS_BITS_PER_UNIT; // equivalent to bitIndex/64 } } From 25288bd579665b623d05a91624956710066dea1c Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 8 May 2024 14:33:33 +0100 Subject: [PATCH 05/38] Corrected some javadocs? --- .../filters/quotientfilter/QuickBitVector.java | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java index c9ba8dc99..e63696a6c 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -44,8 +44,6 @@ * * @author wolfgang.hoschek@cern.ch * @version 1.0, 09/24/99 - * @see BitVector - * @see BitMatrix * @see java.util.BitSet */ //package bitmap_implementations; @@ -65,7 +63,7 @@ protected QuickBitVector() { * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0. * In other words, returns a bit mask having 0,1,2,3,...,64 bits set. * If to-from+1==0 then returns zero (0L). - * Precondition (not checked): to-from+1 >= 0 && to-from+1 <= 64. + * Precondition (not checked): to-from+1 &ge 0 AND to-from+1 &le 64. * * @param from index of start bit (inclusive) * @param to index of end bit (inclusive). @@ -104,8 +102,8 @@ public static boolean get(long[] bits, long bitIndex) { * Returns a long value representing bits of a bitvector from index from to index to. * Bits are returned as a long value with the return value having bit 0 set to bit from, ..., bit to-from set to bit to. * All other bits of return value are set to 0. - * If from > to then returns zero (0L). - * Precondition (not checked): to-from+1 <= 64. + * If from > to then returns zero (0L). 
+ * Precondition (not checked): to-from+1 &le 64. * @param bits the bitvector. * @param from index of start bit (inclusive). * @param to index of end bit (inclusive). @@ -163,7 +161,7 @@ static public int leastSignificantBit(int value) { /** * Constructs a low level bitvector that holds size elements, with each element taking bitsPerElement bits. * CD. THIS METHOD ESSENTIALLY ROUNDS TO THE NEXT MULTIPLE OF 64 BITS. - * @param size the number of elements to be stored in the bitvector (must be >= 0). + * @param size the number of elements to be stored in the bitvector (must be &ge 0). * @param bitsPerElement the number of bits one single element takes. * @return a low level bitvector. */ @@ -191,10 +189,10 @@ public static long[] makeBitVector(long size, int bitsPerElement) { * * Examples: *
-     * 0x80000000 --> 31
-     * 0x7fffffff --> 30
-     * 0x00000001 --> 0
-     * 0x00000000 --> -1
+     * 0x80000000 : 31
+     * 0x7fffffff : 30
+     * 0x00000001 : 0
+     * 0x00000000 : -1
      * 
* * @param value The integer value for which the most significant bit index is to be found. From 4ab30ded46d3112e9bdb362edda8574279ccf1c8 Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 8 May 2024 14:39:25 +0100 Subject: [PATCH 06/38] javadocs edits --- .../quotientfilter/QuickBitVector.java | 39 +++++-------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java index e63696a6c..3f26c1066 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -63,7 +63,7 @@ protected QuickBitVector() { * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0. * In other words, returns a bit mask having 0,1,2,3,...,64 bits set. * If to-from+1==0 then returns zero (0L). - * Precondition (not checked): to-from+1 &ge 0 AND to-from+1 &le 64. + * Precondition (not checked): to-from+1 ≥ 0 AND to-from+1 ≤ 64. * * @param from index of start bit (inclusive) * @param to index of end bit (inclusive). @@ -102,8 +102,8 @@ public static boolean get(long[] bits, long bitIndex) { * Returns a long value representing bits of a bitvector from index from to index to. * Bits are returned as a long value with the return value having bit 0 set to bit from, ..., bit to-from set to bit to. * All other bits of return value are set to 0. - * If from > to then returns zero (0L). - * Precondition (not checked): to-from+1 &le 64. + * If from > to then returns zero (0L). + * Precondition (not checked): to-from+1 ≤ 64. * @param bits the bitvector. * @param from index of start bit (inclusive). * @param to index of end bit (inclusive). @@ -147,10 +147,10 @@ public static long getLongFromTo(long[] bits, long from, long to) { Returns 32 if no bit is in state "true". Examples:
-     0x80000000 --> 31
-     0x7fffffff --> 0
-     0x00000001 --> 0
-     0x00000000 --> 32
+     0x80000000 : 31
+     0x7fffffff : 0
+     0x00000001 : 0
+     0x00000000 : 32
      
*/ static public int leastSignificantBit(int value) { @@ -161,7 +161,7 @@ static public int leastSignificantBit(int value) { /** * Constructs a low level bitvector that holds size elements, with each element taking bitsPerElement bits. * CD. THIS METHOD ESSENTIALLY ROUNDS TO THE NEXT MULTIPLE OF 64 BITS. - * @param size the number of elements to be stored in the bitvector (must be &ge 0). + * @param size the number of elements to be stored in the bitvector (must be ≥ 0). * @param bitsPerElement the number of bits one single element takes. * @return a low level bitvector. */ @@ -216,7 +216,7 @@ protected static long offset(long bitIndex) { /** * Initializes a table with numbers having 1,2,3,...,64 bits set. * pows[i] has bits [0..i-1] set. - * pows[64] == -1L == ~0L == has all 64 bits set --> correct. + * pows[64] == -1L == ~0L == has all 64 bits set : correct. * to speedup calculations in subsequent methods. */ private static long[] precomputePows() { @@ -227,28 +227,7 @@ private static long[] precomputePows() { //System.out.println((i)+":"+pows[i]); } pows[0]=0L; - //System.out.println((0)+":"+pows[0]); return pows; - - //OLD STUFF - /* - for (int i=BITS_PER_UNIT+1; --i >= 0; ) { - pows[i]=value; - value = value >>> 1; - System.out.println((i)+":"+pows[i]); - } - */ - - /* - long[] pows=new long[BITS_PER_UNIT]; - for (int i=0; i Date: Wed, 8 May 2024 14:43:17 +0100 Subject: [PATCH 07/38] javadoc edits --- .../quotientfilter/QuickBitVector.java | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java index 3f26c1066..8e4ff6d35 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -142,16 +142,21 @@ public static long getLongFromTo(long[] bits, 
long from, long to) { //combine return x1|x2; } + /** - Returns the index of the least significant bit in state "true". - Returns 32 if no bit is in state "true". - Examples: -
-     0x80000000 : 31
-     0x7fffffff : 0
-     0x00000001 : 0
-     0x00000000 : 32
-     
+ * Returns the index of the least significant bit in state "true". + * Returns 32 if no bit is in state "true". + * + * Examples: + *
+     * 0x80000000 : 31
+     * 0x7fffffff : 0
+     * 0x00000001 : 0
+     * 0x00000000 : 32
+     * 
+ * + * @param value The integer value for which the least significant bit index is to be found. + * @return The index of the least significant bit in state "true". Returns 32 if no bit is in state "true". */ static public int leastSignificantBit(int value) { int i=-1; @@ -246,8 +251,8 @@ public static void put(long[] bits, long bitIndex, boolean value) { * Sets bits of a bitvector from index from to index to to the bits of value. * Bit from is set to bit 0 of value, ..., bit to is set to bit to-from of value. * All other bits stay unaffected. - * If from > to then does nothing. - * Precondition (not checked): to-from+1 <= 64. + * If from > to then does nothing. + * Precondition (not checked): to-from+1 ≤ 64. * * @param bits the bitvector. * @param value the value to be copied into the bitvector. From 90701eb15857e79041c4231c897134a74e55b8a9 Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 8 May 2024 14:48:17 +0100 Subject: [PATCH 08/38] Added licenses --- .../filters/quotientfilter/Filter.java | 20 +++++++++++++++++++ .../filters/quotientfilter/Iterator.java | 20 +++++++++++++++++++ .../filters/quotientfilter/DeletionTests.java | 19 ++++++++++++++++++ .../quotientfilter/QuotientFilterTest.java | 19 ++++++++++++++++++ 4 files changed, 78 insertions(+) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java index 55bb0f4bc..8f0796738 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -1,3 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + package org.apache.datasketches.filters.quotientfilter; import java.nio.ByteBuffer; diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java index 05725657b..ea5039aa6 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -1,3 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + package org.apache.datasketches.filters.quotientfilter; import java.util.ArrayDeque; diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 04ca81510..3f42d3657 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.datasketches.filters.quotientfilter; import org.testng.annotations.Test; import static org.testng.Assert.assertTrue; diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index d03978bf9..00b085fef 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.datasketches.filters.quotientfilter; import org.testng.annotations.Test; import static org.testng.Assert.assertTrue; From a9f9027982871fede90ce9bc5678022d6dad77e9 Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 15 May 2024 10:45:40 +0100 Subject: [PATCH 09/38] Added bitvector tests --- .../quotientfilter/QuickBitVector.java | 27 +++++--- .../quotientfilter/BitVectorTests.java | 63 +++++++++++++++++++ 2 files changed, 80 insertions(+), 10 deletions(-) create mode 100644 src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java index 8e4ff6d35..ca387ebc9 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java @@ -166,6 +166,7 @@ static public int leastSignificantBit(int value) { /** * Constructs a low level bitvector that holds size elements, with each element taking bitsPerElement bits. * CD. THIS METHOD ESSENTIALLY ROUNDS TO THE NEXT MULTIPLE OF 64 BITS. 
+ * This function gets the smallest number of longs that stores the requisite bits. * @param size the number of elements to be stored in the bitvector (must be ≥ 0). * @param bitsPerElement the number of bits one single element takes. * @return a low level bitvector. @@ -180,7 +181,6 @@ public static long[] makeBitVector(long size, int bitsPerElement) { //System.out.println("Safe Right shift " + safe_right_shift); int unitIndex = (int)((nBits-1) >> ADDRESS_BITS_PER_UNIT); // How many multiples of 64 bits do we need to store nBits bits? //System.out.println(ADDRESS_BITS_PER_UNIT); - //System.out.println("unitIndex " + unitIndex); long[] bitVector = new long[unitIndex + 1]; //System.out.println("length " + bitVector.length); //System.out.println("Total bits: " + (bitVector.length * 64)); @@ -223,17 +223,22 @@ protected static long offset(long bitIndex) { * pows[i] has bits [0..i-1] set. * pows[64] == -1L == ~0L == has all 64 bits set : correct. * to speedup calculations in subsequent methods. + * Output: -1, 2^63-1, 2^62-1, ..., 2^1-1. */ + private static long[] precomputePows() { long[] pows=new long[BITS_PER_UNIT+1]; long value = ~0L; + + // decrement i before executing the loop and only enter loop if the decremented value is at least 1. + // this means that the loop starts at i = 64 and iterates down to i = 1. for (int i=BITS_PER_UNIT+1; --i >= 1; ) { pows[i]=value >>> (BITS_PER_UNIT-i); - //System.out.println((i)+":"+pows[i]); } pows[0]=0L; return pows; } + /** * Sets the bit with index bitIndex in the bitvector bits to the state specified by value. * @@ -247,6 +252,7 @@ public static void put(long[] bits, long bitIndex, boolean value) { else clear(bits, bitIndex); } + /** * Sets bits of a bitvector from index from to index to to the bits of value. * Bit from is set to bit 0 of value, ..., bit to is set to bit to-from of value. @@ -254,6 +260,12 @@ public static void put(long[] bits, long bitIndex, boolean value) { * If from > to then does nothing. 
* Precondition (not checked): to-from+1 ≤ 64. * + * this function is equivalent to the slower code below: + * int fromIndex=from/BITS_PER_UNIT; + * int toIndex=to/BITS_PER_UNIT; + * int fromOffset=from%BITS_PER_UNIT; + * int toOffset=to%BITS_PER_UNIT; + * * @param bits the bitvector. * @param value the value to be copied into the bitvector. * @param from index of start bit (inclusive). @@ -264,15 +276,8 @@ public static void putLongFromTo(long[] bits, long value, long from, long to) { final int fromIndex=(int)(from >> ADDRESS_BITS_PER_UNIT); //equivalent to from/64 final int toIndex=(int)(to >> ADDRESS_BITS_PER_UNIT); - final int fromOffset=(int)(from & BIT_INDEX_MASK); //equivalent to from%64 + final int fromOffset=(int)(from & BIT_INDEX_MASK); //equivalent to from % 64 final int toOffset=(int)(to & BIT_INDEX_MASK); - /* - this is equivalent to the above, but slower: - int fromIndex=from/BITS_PER_UNIT; - int toIndex=to/BITS_PER_UNIT; - int fromOffset=from%BITS_PER_UNIT; - int toOffset=to%BITS_PER_UNIT; - */ //make sure all unused bits to the left are cleared. long mask; @@ -300,6 +305,7 @@ public static void putLongFromTo(long[] bits, long value, long from, long to) { mask=bitMaskWithBitsSetFromTo(0, toOffset); bits[toIndex] = (bits[toIndex] & (~mask)) | shiftedValue; } + /** * Changes the bit with index bitIndex in the bitvector bits to the "set" (true) state. * @@ -309,6 +315,7 @@ public static void putLongFromTo(long[] bits, long value, long from, long to) { public static void set(long[] bits, long bitIndex) { bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] |= 1L << (bitIndex & BIT_INDEX_MASK); } + /** * Returns the index of the unit that contains the given bitIndex. 
* diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java new file mode 100644 index 000000000..a90d59d62 --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java @@ -0,0 +1,63 @@ +package org.apache.datasketches.filters.quotientfilter; + +import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; + +public class BitVectorTests { + + /** + * This test method initializes a QuickBitVectorWrapper with various combinations of bits per entry and number of entries. + * It then calculates the expected length of the bit vector and asserts that the actual size of the bit vector matches the expected length. + * + * Example Input-Output Pairs: + * 1. Input: bitsPerEntry = 2, numEntries = 8 (1L << 3) + * Output: expectedLengthBits = 64 + * + * 2. Input: bitsPerEntry = 3, numEntries = 16 (1L << 4) + * Output: 64 + * + * 3. Input: bitsPerEntry = 33, numEntries = 8 (1L << 3) + * Output: expectedLengthBits = 320 + */ + @Test + static public void testSize(){ + int[] bitsPerEntry = {2, 3, 4, 5, 6, 7, 8, 9, 10, 23, 24, 25, 31, 32, 33}; + long[] numEntries = {1L << 3, 1L<<4, 1L<<8, 1L << 16}; + long nBits ; + long expectedLengthBits ; + + for (int i = 0; i < bitsPerEntry.length; i++){ + for (int j = 0; j < numEntries.length; j++) { + QuickBitVectorWrapper bv = new QuickBitVectorWrapper(bitsPerEntry[i], numEntries[j]); + nBits = bitsPerEntry[i] * numEntries[j]; + expectedLengthBits = 64 * ((nBits % 64 == 0) ? (nBits / 64) : (1 + nBits / 64)); + assertEquals(bv.size(), expectedLengthBits); + } + } + } + + /* + This test amends a few entries in the BitVector and checks that they are appropriately set. 
+ */ + @Test + static public void testSettersAndGetters(){ + QuickBitVectorWrapper bv = new QuickBitVectorWrapper(6, 16); + + // All entries should be False before any updates + for (int i = 0; i < bv.size(); i++){ + assertFalse(bv.get(i), "All entries should be False"); + } + + // Set some values + bv.set(0, true); + assertTrue(bv.get(0), "Value at index 0 should be True"); + + bv.set(32, true) ; + assertTrue(bv.get(32), "Value at index 32 should be True"); + + bv.setFromTo(64, 128, ~0L); + assertTrue(bv.getFromTo(64, 128) == -1L, "Values from 64 to 128 should be set to 1") ; + } +} From 03f9c522d0088347d792ee035dae29e8a9b6bc98 Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Wed, 15 May 2024 10:51:49 +0100 Subject: [PATCH 10/38] Corrected license header --- .../quotientfilter/BitVectorTests.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java index a90d59d62..487e36576 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.datasketches.filters.quotientfilter; import org.testng.annotations.Test; From 940eda8b4b17429e7379dc7157cdae46ecb4e61a Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 10:35:53 -0700 Subject: [PATCH 11/38] circular table, no extension slots --- .../filters/quotientfilter/Iterator.java | 7 +- .../quotientfilter/QuotientFilter.java | 201 +++++------------- .../quotientfilter/QuotientFilterTest.java | 82 ++++--- 3 files changed, 97 insertions(+), 193 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java index ea5039aa6..6b76a1789 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -49,7 +49,7 @@ void clear() { boolean next() { - if (index == qf.get_logical_num_slots_plus_extensions()) { + if (index == qf.get_num_slots()) { return false; } @@ -59,9 +59,9 @@ boolean next() { boolean shifted = (slot & 4) != 0; - while (!occupied && !continuation && !shifted && index < qf.get_logical_num_slots_plus_extensions()) { + while (!occupied && !continuation && !shifted && index < qf.get_num_slots()) { index++; - if (index == qf.get_logical_num_slots_plus_extensions()) { + if (index == qf.get_num_slots()) { return false; } slot = qf.get_slot(index); @@ -99,5 +99,4 @@ void print() { System.out.println("original slot: " + index + " " + bucket_index); } - } diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9e0f08ce5..3af8affcd 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ 
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -17,30 +17,20 @@ * under the License. */ - package org.apache.datasketches.filters.quotientfilter; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; -import org.apache.datasketches.filters.quotientfilter.Bitmap; -import org.apache.datasketches.memory.XxHash; - public class QuotientFilter extends Filter { int bitPerEntry; int fingerprintLength; int power_of_two_size; - int num_extension_slots; int num_existing_entries; Bitmap filter; - // These three fields are used to prevent throwing exceptions when the buffer space of the filter is exceeded - long last_empty_slot; - long last_cluster_start; - public long backward_steps; - double expansion_threshold; long max_entries_before_expansion; boolean expand_autonomously; @@ -60,11 +50,7 @@ public QuotientFilter(int power_of_two, int bits_per_entry) { power_of_two_size = power_of_two; bitPerEntry = bits_per_entry; fingerprintLength = bits_per_entry - 3; - long init_size = 1L << power_of_two; - //System.out.println("Init size: " + init_size); - num_extension_slots = power_of_two * 2; - // System.out.println("Extension slots: " + num_extension_slots); - + final long init_size = 1L << power_of_two; filter = make_filter(init_size, bits_per_entry); expansion_threshold = 0.8; @@ -75,19 +61,6 @@ public QuotientFilter(int power_of_two, int bits_per_entry) { original_fingerprint_size = fingerprintLength; num_expansions = 0; //hash_type = XxHash.hashLong ; //HashType.xxh; - - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; - //measure_num_bits_per_entry(); - } - - //nuevo - void update(long init_size) - { - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; } public boolean rejuvenate(long key) { @@ -114,7 +87,7 @@ Bitmap make_filter(long init_size, int bits_per_entry) { // System.out.println(init_size ) ; // 
System.out.println(num_extension_slots); // System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); - return new QuickBitVectorWrapper(bits_per_entry, init_size + num_extension_slots); + return new QuickBitVectorWrapper(bits_per_entry, init_size); } public int get_fingerprint_length() { @@ -126,13 +99,6 @@ public int get_fingerprint_length() { bitPerEntry = bits_per_entry; fingerprintLength = bits_per_entry - 3; filter = bitmap; - num_extension_slots = power_of_two * 2; - - //nuevo - long init_size = 1L << power_of_two; - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; } boolean expand() { @@ -155,18 +121,17 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array for (QuotientFilter q : other_filters) { //q.print_filter_summary(); //System.out.println(); - long q_num_entries = q.get_num_entries(false); - num_entries += q_num_entries; + num_entries += q.get_num_entries(false); } long init_size = 1L << current.power_of_two_size; - long num_bits = current.bitPerEntry * init_size + current.num_extension_slots * current.bitPerEntry; + long num_bits = current.bitPerEntry * init_size; for (QuotientFilter q : other_filters) { init_size = 1L << q.power_of_two_size; - num_bits += q.bitPerEntry * init_size + q.num_extension_slots * q.bitPerEntry; + num_bits += q.bitPerEntry * init_size; } //System.out.println("total entries: \t\t" + num_entries); //System.out.println("total bits: \t\t" + num_bits); - double bits_per_entry = num_bits / num_entries; + final double bits_per_entry = num_bits / num_entries; //System.out.println("total bits/entry: \t" + bits_per_entry); //System.out.println(); return bits_per_entry; @@ -174,10 +139,8 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array // scans the quotient filter and returns the number of non-empty slots public long get_num_entries(boolean include_all_internal_filters) { - //long 
bits = filter.size(); - long slots = get_physcial_num_slots(); long num_entries = 0; - for (long i = 0; i < slots; i++) { + for (long i = 0; i < get_num_slots(); i++) { if (is_occupied(i) || is_continuation(i) || is_shifted(i)) { num_entries++; } @@ -193,21 +156,15 @@ public double get_utilization() { return util; } - public long get_physcial_num_slots() { - long bits = filter.size(); - return bits / bitPerEntry; - } - - // returns the number of physical slots in the filter (including the extention/buffer slots at the end) - public long get_logical_num_slots_plus_extensions() { - return (1L << power_of_two_size) + num_extension_slots; - } - // returns the number of slots in the filter without the extension/buffer slots - public long get_logical_num_slots() { + public long get_num_slots() { return 1L << power_of_two_size; } + long getMask() { + return get_num_slots() - 1; + } + // sets the metadata flag bits for a given slot index void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, long index) { @@ -226,8 +183,7 @@ void set_fingerprint(long index, long fingerprint) { public String get_pretty_str(boolean vertical) { StringBuffer sbr = new StringBuffer(); - long logic_slots = get_logical_num_slots(); - long all_slots = get_logical_num_slots_plus_extensions(); + long num_slots = get_num_slots(); for (long i = 0; i < filter.size(); i++) { long remainder = i % bitPerEntry; @@ -235,10 +191,8 @@ public String get_pretty_str(boolean vertical) { long slot_num = i/bitPerEntry; sbr.append(" "); if (vertical) { - if (slot_num == logic_slots ){//|| slot_num == all_slots) { + if (slot_num == num_slots) { sbr.append("\n ---------"); - } else if (slot_num == all_slots) { - sbr.append("\n d***********b"); } //sbr.append("\n" + slot_num + " "); sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); @@ -283,7 +237,7 @@ void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifte // summarize some statistical measures about the 
filter public void print_filter_summary() { long num_entries = get_num_entries(false); - long slots = (1L << power_of_two_size) + num_extension_slots; + long slots = get_num_slots(); long num_bits = slots * bitPerEntry; System.out.println("slots:\t" + slots); System.out.println("entries:\t" + num_entries); @@ -300,14 +254,13 @@ public void print_filter_summary() { //System.out.println("avg cluster length: \t" + avg_cluster_length); } + /* + Returns the number of bits used for the filter + */ @Override - public long get_space_use(){ - /* - Returns the number of bits used for the filter - */ - long slots = (1L << power_of_two_size); // + num_extension_slots; - long num_bits = slots * bitPerEntry; - return num_bits ; + public long get_space_use() { + long num_bits = get_num_slots() * bitPerEntry; + return num_bits; } public int get_bits_per_entry() { @@ -347,7 +300,7 @@ boolean is_slot_empty(long index) { long find_cluster_start(long index) { long current_index = index; while (is_shifted(current_index)) { - current_index--; + current_index = (current_index - 1) & getMask(); } return current_index; } @@ -361,9 +314,8 @@ long find_run_start(long index) { if (is_occupied(current_index)) { runs_to_skip_counter++; } - current_index--; + current_index = (current_index - 1) & getMask(); } - last_cluster_start = current_index - 1; while (true) { if (!is_continuation(current_index)) { runs_to_skip_counter--; @@ -371,7 +323,7 @@ long find_run_start(long index) { return current_index; } } - current_index++; + current_index = (current_index + 1) & getMask(); } } @@ -383,8 +335,8 @@ long find_first_fingerprint_in_run(long index, long fingerprint) { //System.out.println("found matching FP at index " + index); return index; } - index++; - } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + index = (index + 1) & getMask(); + } while (is_continuation(index)); return -1; } @@ -397,16 +349,16 @@ long decide_which_fingerprint_to_delete(long index, long 
fingerprint) { //System.out.println("found matching FP at index " + index); matching_fingerprint_index = index; } - index++; - } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + index = (index + 1) & getMask(); + } while (is_continuation(index)); return matching_fingerprint_index; } // given the start of a run, find the last slot index that still belongs to this run long find_run_end(long index) { - while(index < get_logical_num_slots_plus_extensions() - 1 && is_continuation(index+1)) { - index++; - } + do { + index = (index + 1) & getMask(); + } while(is_continuation(index)); return index; } @@ -432,7 +384,7 @@ Set get_all_fingerprints(long bucket_index) { long run_index = find_run_start(bucket_index); do { set.add(get_fingerprint(run_index)); - run_index++; + run_index = (run_index + 1) & getMask(); } while (is_continuation(run_index)); return set; } @@ -447,17 +399,7 @@ long swap_fingerprints(long index, long new_fingerprint) { // finds the first empty slot after the given slot index long find_first_empty_slot(long index) { while (!is_slot_empty(index)) { - index++; - } - return index; - } - - // moves backwards to find the first empty slot - // used as a part of the mechanism to prevent exceptions when exceeding the quotient filter's bounds - long find_backward_empty_slot(long index) { - while (index >= 0 && !is_slot_empty(index)) { - backward_steps++; - index--; + index = (index + 1) & getMask(); } return index; } @@ -465,10 +407,10 @@ long find_backward_empty_slot(long index) { // return the first slot to the right where the current run starting at the index parameter ends long find_new_run_location(long index) { if (!is_slot_empty(index)) { - index++; + index = (index + 1) & getMask(); } while (is_continuation(index)) { - index++; + index = (index + 1) & getMask(); } return index; } @@ -489,9 +431,6 @@ boolean insert_new_run(long canonical_slot, long long_fp) { // if the slot was initially empty, we can just terminate, as 
there is nothing to push to the right if (slot_initially_empty) { set_fingerprint(start_of_this_new_run, long_fp); - if (start_of_this_new_run == last_empty_slot) { - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } num_existing_entries++; return true; } @@ -502,26 +441,16 @@ boolean insert_new_run(long canonical_slot, long long_fp) { boolean is_this_slot_empty; boolean temp_continuation = false; do { - if (current_index >= get_logical_num_slots_plus_extensions()) { - return false; - } - is_this_slot_empty = is_slot_empty(current_index); long_fp = swap_fingerprints(current_index, long_fp); - if (current_index > start_of_this_new_run) { + if (current_index != start_of_this_new_run) { set_shifted(current_index, true); - } - - if (current_index > start_of_this_new_run) { boolean current_continuation = is_continuation(current_index); set_continuation(current_index, temp_continuation); temp_continuation = current_continuation; } - current_index++; - if (current_index == last_empty_slot) { // TODO get this out of the while loop - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } + current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); num_existing_entries++; return true; @@ -535,7 +464,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Num items: " + num_existing_entries); //System.out.println("Max items: " + max_entries_before_expansion); - if (index > last_empty_slot) { + if (index >= get_num_slots()) { return false; } boolean does_run_exist = is_occupied(index); @@ -554,7 +483,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { return insert_fingerprint_and_push_all_else(long_fp, run_start_index); } - // insert an fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. 
+ // insert a fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) { long current_index = run_start_index; boolean is_this_slot_empty; @@ -562,14 +491,11 @@ boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) boolean temp_continuation = false; do { - if (current_index >= get_logical_num_slots_plus_extensions()) { - return false; - } is_this_slot_empty = is_slot_empty(current_index); - if (current_index > run_start_index) { + if (current_index != run_start_index) { set_shifted(current_index, true); } - if (current_index > run_start_index && !finished_first_run && !is_continuation(current_index)) { + if (current_index != run_start_index && !finished_first_run && !is_continuation(current_index)) { finished_first_run = true; set_continuation(current_index, true); long_fp = swap_fingerprints(current_index, long_fp); @@ -580,17 +506,14 @@ else if (finished_first_run) { temp_continuation = current_continuation; long_fp = swap_fingerprints(current_index, long_fp); } - if (current_index == last_empty_slot) { - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } - current_index++; + current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); num_existing_entries++; return true; } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - long run_end = find_run_end(matching_fingerprint_index); + final long run_end = find_run_end(matching_fingerprint_index); // the run has only one entry, we need to disable its is_occupied flag // we just remember we need to do this here, and we do it later to not interfere with counts @@ -630,43 +553,37 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long //boolean does_next_run_exist = !is_slot_empty(run_end + 1); //boolean is_next_run_shifted = 
is_shifted(run_end + 1); //if (!does_next_run_exist || !is_next_run_shifted) { - if (run_end >= get_logical_num_slots_plus_extensions()-1 || - is_slot_empty(run_end + 1) || !is_shifted(run_end + 1)) { + final long next_run_start = (run_end + 1) & getMask(); + if (is_slot_empty(next_run_start) || !is_shifted(next_run_start)) { if (turn_off_occupied) { // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts set_occupied(canonical_slot, false); - - } - if (run_end > last_empty_slot) { - last_empty_slot = run_end; } return true; } - // we now find the start and end of the next run - long next_run_start = run_end + 1; - run_end = find_run_end(next_run_start); + final long next_run_end = find_run_end(next_run_start); // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if ( is_occupied(next_run_start - 1) && num_shifted_count - num_non_occupied == 1 ) { - set_shifted(next_run_start - 1, false); + if (is_occupied(run_end) && num_shifted_count - num_non_occupied == 1) { + set_shifted(run_end, false); } else { - set_shifted(next_run_start - 1, true); + set_shifted(run_end, true); } - for (long i = next_run_start; i <= run_end; i++) { + for (long i = next_run_start; i != ((next_run_end + 1) & getMask()); i++) { long f = get_fingerprint(i); - set_fingerprint(i - 1, f); + set_fingerprint((i - 1) & getMask(), f); if (is_continuation(i)) { - set_continuation(i-1, true); + set_continuation((i - 1) & getMask(), true); } if (!is_occupied(i)) { num_non_occupied++; } + num_shifted_count++; } - num_shifted_count += run_end - next_run_start; set_fingerprint(run_end, 0); set_shifted(run_end, false); set_continuation(run_end, false); @@ -674,9 +591,6 @@ boolean delete(long fingerprint, long 
canonical_slot, long run_start_index, long } boolean delete(long fingerprint, long canonical_slot) { - if (canonical_slot >= get_logical_num_slots()) { - return false; - } // if the run doesn't exist, the key can't have possibly been inserted boolean does_run_exist = is_occupied(canonical_slot); if (!does_run_exist) { @@ -692,11 +606,8 @@ boolean delete(long fingerprint, long canonical_slot) { } return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); - } - - /* Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index */ @@ -792,8 +703,6 @@ protected boolean _search(long large_hash) { return search(fingerprint, slot_index); } - - public boolean get_bit_at_offset(int offset) { return filter.get(offset); } @@ -807,7 +716,7 @@ public void compute_statistics() { int current_run_length = 0; int current_cluster_length = 0; - long num_slots = get_logical_num_slots_plus_extensions(); + long num_slots = get_num_slots(); for (long i = 0; i < num_slots; i++) { boolean occupied = is_occupied(i); @@ -941,5 +850,3 @@ else if (occupied && continuation && shifted ) { // continuation of run // } } - - diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 00b085fef..6fc24b10b 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -37,48 +37,47 @@ public class QuotientFilterTest { */ @Test static public void WikiInsertionTest() { - int bits_per_entry = 8; // 8 bits per entry => 5 bits fingerprints, resolved internally in the filter. + int bits_per_entry = 6; // 6 bits per entry => 3 bits fingerprint, resolved internally in the filter. 
int num_entries_power = 3; - int num_entries = (int)Math.pow(2, num_entries_power); QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - // this test does not need different fingerprints as it is testing the slot locations and metadata bits. - long fingerprint0 = 0; - long fingerprint1 = (1 << bits_per_entry) - 1; - - /* - The expected sketch is - 0 000 00000 - 1 100 00000 - 2 111 00000 - 3 011 00000 - 4 101 00000 - 5 001 11111 - 6 000 00000 - 7 100 00000 - */ - qf.insert(fingerprint0, 1, false); - qf.insert(fingerprint1, 4, false); // 11111 is inserted at slot 45 but pushed to slot 5 - qf.insert(fingerprint0, 7, false); - qf.insert(fingerprint0, 1, false); - qf.insert(fingerprint0, 2, false); - qf.insert(fingerprint0, 1, false); + final int A = 1; + final int B = 2; + final int C = 3; + final int D = 4; + final int E = 5; + final int F = 6; + + qf.insert(B, 1, false); + qf.insert(E, 4, false); + qf.insert(F, 7, false); + qf.insert(C, 1, false); + qf.insert(D, 2, false); + qf.insert(A, 1, false); assertEquals(qf.num_existing_entries, 6); + assertEquals(getState(qf, 0), 0); + assertEquals(qf.get_fingerprint(0), 0); + assertEquals(getState(qf, 1), 0b100); + assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example + assertEquals(getState(qf, 2), 0b111); + assertEquals(qf.get_fingerprint(2), C); + assertEquals(getState(qf, 3), 0b011); + assertEquals(qf.get_fingerprint(3), A); + assertEquals(getState(qf, 4), 0b101); + assertEquals(qf.get_fingerprint(4), D); + assertEquals(getState(qf, 5), 0b001); + assertEquals(qf.get_fingerprint(5), E); + assertEquals(getState(qf, 6), 0); + assertEquals(qf.get_fingerprint(6), 0); + assertEquals(getState(qf, 7), 0b100); + assertEquals(qf.get_fingerprint(7), F); + } - - // these are the expected resulting is_occupied, is_continuation, and is_shifted bits - // for all slots contiguously. 
We do not store the fingerprints here - BitSet result = new BitSet(num_entries * bits_per_entry); - result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 2, true, true, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 3, false, true, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 4, true, false, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 5, false, false, true, fingerprint1); - result = set_slot_in_test(result, bits_per_entry, 6, false, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 7, true, false, false, fingerprint0); - assertTrue(check_equality(qf, result, true)); + static public int getState(QuotientFilter filter, int slot) { + return (filter.is_occupied(slot) ? 1 : 0) << 2 + | (filter.is_continuation(slot) ? 1 : 0) << 1 + | (filter.is_shifted(slot) ? 1 : 0); } /* @@ -165,13 +164,13 @@ static public void testQuotientFilterInsertionAndIteration() { qf.insert(0, 3, false); qf.insert(0, 3, false); qf.insert(0, 4, false); - qf.insert(0, 23, false); // last key in the filter - qf.insert(0, 24, false); // outside the bounds, logical slot 14 does not exist logically, even if it might exist physically + qf.insert(0, 15, false); // last slot in the filter + qf.insert(0, 16, false); // outside the bounds Iterator it = new Iterator(qf); - int[] arr = new int[] {2, 3, 3, 4, 23}; + int[] arr = new int[] {2, 3, 3, 4, 15}; int arr_index = 0; - while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} + while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} } @Test @@ -247,7 +246,7 @@ static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check } /* - Helper functino to test that no false negatives are returned. 
+ Helper function to test that no false negatives are returned. */ static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { HashSet added = new HashSet(); @@ -273,5 +272,4 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent } return true; } - } From cf3eb3c426b9b417d86334b519c7ae0ef5accc6b Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 11:53:05 -0700 Subject: [PATCH 12/38] removed unnecessary scan to compute the number of entries --- .../quotientfilter/QuotientFilter.java | 37 ++++++------------- .../quotientfilter/QuotientFilterTest.java | 2 +- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 3af8affcd..9a7ebf549 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -28,7 +28,7 @@ public class QuotientFilter extends Filter { int bitPerEntry; int fingerprintLength; int power_of_two_size; - int num_existing_entries; + int num_entries; Bitmap filter; double expansion_threshold; @@ -67,8 +67,8 @@ public boolean rejuvenate(long key) { return false; } - public long get_num_existing_entries() { - return num_existing_entries; + public long get_num_entries() { + return num_entries; } public long get_max_entries_before_expansion() { @@ -117,11 +117,11 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array //System.out.println("--------------------------"); //current.print_filter_summary(); //System.out.println(); - double num_entries = current.get_num_entries(false); + double num_entries = current.get_num_entries(); for (QuotientFilter q : other_filters) { //q.print_filter_summary(); //System.out.println(); - num_entries += q.get_num_entries(false); 
+ num_entries += q.get_num_entries(); } long init_size = 1L << current.power_of_two_size; long num_bits = current.bitPerEntry * init_size; @@ -137,21 +137,9 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array return bits_per_entry; } - // scans the quotient filter and returns the number of non-empty slots - public long get_num_entries(boolean include_all_internal_filters) { - long num_entries = 0; - for (long i = 0; i < get_num_slots(); i++) { - if (is_occupied(i) || is_continuation(i) || is_shifted(i)) { - num_entries++; - } - } - return num_entries; - } - // returns the fraction of occupied slots in the filter public double get_utilization() { long num_logical_slots = 1L << power_of_two_size; - long num_entries = get_num_entries(false); double util = num_entries / (double) num_logical_slots; return util; } @@ -236,7 +224,6 @@ void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifte // summarize some statistical measures about the filter public void print_filter_summary() { - long num_entries = get_num_entries(false); long slots = get_num_slots(); long num_bits = slots * bitPerEntry; System.out.println("slots:\t" + slots); @@ -431,7 +418,7 @@ boolean insert_new_run(long canonical_slot, long long_fp) { // if the slot was initially empty, we can just terminate, as there is nothing to push to the right if (slot_initially_empty) { set_fingerprint(start_of_this_new_run, long_fp); - num_existing_entries++; + num_entries++; return true; } @@ -452,7 +439,7 @@ boolean insert_new_run(long canonical_slot, long long_fp) { } current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); - num_existing_entries++; + num_entries++; return true; } @@ -461,7 +448,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Inserting @ index " + index); //System.out.println("BoolMatch? 
" + insert_only_if_no_match); //System.out.println("**********"); - //System.out.println("Num items: " + num_existing_entries); + //System.out.println("Num items: " + num_entries); //System.out.println("Max items: " + max_entries_before_expansion); if (index >= get_num_slots()) { @@ -508,7 +495,7 @@ else if (finished_first_run) { } current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); - num_existing_entries++; + num_entries++; return true; } @@ -678,7 +665,7 @@ protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { System.exit(1); }*/ -// if (expand_autonomously && num_existing_entries >= max_entries_before_expansion) { +// if (expand_autonomously && num_entries >= max_entries_before_expansion) { // boolean expanded = expand(); // if (expanded) { // num_expansions++; @@ -692,7 +679,7 @@ protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { // long fp_long = gen_fingerprint(large_hash); // boolean success = delete(fp_long, slot_index); // if (success) { -// num_existing_entries--; +// num_entries--; // } // return success; // } @@ -846,7 +833,7 @@ else if (occupied && continuation && shifted ) { // continuation of run // contributed 2*j / (2^j) --> 0 entries. 
// */ // public double get_load() { -// return num_existing_entries / (double) get_logical_num_slots(); +// return num_entries / (double) get_logical_num_slots(); // } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 6fc24b10b..247ef7bda 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -54,7 +54,7 @@ static public void WikiInsertionTest() { qf.insert(C, 1, false); qf.insert(D, 2, false); qf.insert(A, 1, false); - assertEquals(qf.num_existing_entries, 6); + assertEquals(qf.get_num_entries(), 6); assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); From 68be9a8273ece5110e34f640781b0714f590e92c Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 22:47:19 -0700 Subject: [PATCH 13/38] fixed deletions --- .../quotientfilter/QuotientFilter.java | 46 +++++++++---------- .../filters/quotientfilter/DeletionTests.java | 9 ++-- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9a7ebf549..93a6761c1 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -171,18 +171,14 @@ void set_fingerprint(long index, long fingerprint) { public String get_pretty_str(boolean vertical) { StringBuffer sbr = new StringBuffer(); - long num_slots = get_num_slots(); + long numBits = get_num_slots() * bitPerEntry; - for (long i = 0; i < filter.size(); i++) { + for (long i = 0; i < numBits; i++) { long remainder = i % bitPerEntry; if (remainder == 0) { long slot_num = 
i/bitPerEntry; sbr.append(" "); if (vertical) { - if (slot_num == num_slots) { - sbr.append("\n ---------"); - } - //sbr.append("\n" + slot_num + " "); sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); } } @@ -343,9 +339,9 @@ long decide_which_fingerprint_to_delete(long index, long fingerprint) { // given the start of a run, find the last slot index that still belongs to this run long find_run_end(long index) { - do { + while (is_continuation((index + 1) & getMask())) { index = (index + 1) & getMask(); - } while(is_continuation(index)); + } return index; } @@ -451,17 +447,16 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Num items: " + num_entries); //System.out.println("Max items: " + max_entries_before_expansion); - if (index >= get_num_slots()) { + if (index >= get_num_slots() || num_entries == get_num_slots()) { return false; } boolean does_run_exist = is_occupied(index); if (!does_run_exist) { - boolean val = insert_new_run(index, long_fp); - return val; + return insert_new_run(index, long_fp); } long run_start_index = find_run_start(index); - if (does_run_exist && insert_only_if_no_match) { + if (insert_only_if_no_match) { long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); if (found_index > -1) { return false; @@ -500,15 +495,15 @@ else if (finished_first_run) { } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - final long run_end = find_run_end(matching_fingerprint_index); + long run_end = find_run_end(matching_fingerprint_index); // the run has only one entry, we need to disable its is_occupied flag // we just remember we need to do this here, and we do it later to not interfere with counts boolean turn_off_occupied = run_start_index == run_end; // First thing to do is move everything else in the run back by one slot - for (long i = matching_fingerprint_index; i < run_end; i++) { - long f = 
get_fingerprint(i + 1); + for (long i = matching_fingerprint_index; i != run_end; i = (i + 1) & getMask()) { + long f = get_fingerprint((i + 1) & getMask()); set_fingerprint(i, f); } @@ -519,7 +514,7 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long long cluster_start = find_cluster_start(canonical_slot); long num_shifted_count = 0; long num_non_occupied = 0; - for (long i = cluster_start; i <= run_end; i++) { + for (long i = cluster_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { if (is_continuation(i)) { num_shifted_count++; } @@ -540,8 +535,7 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long //boolean does_next_run_exist = !is_slot_empty(run_end + 1); //boolean is_next_run_shifted = is_shifted(run_end + 1); //if (!does_next_run_exist || !is_next_run_shifted) { - final long next_run_start = (run_end + 1) & getMask(); - if (is_slot_empty(next_run_start) || !is_shifted(next_run_start)) { + if (is_slot_empty((run_end + 1) & getMask()) || !is_shifted((run_end + 1) & getMask())) { if (turn_off_occupied) { // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts set_occupied(canonical_slot, false); @@ -549,18 +543,20 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long return true; } - final long next_run_end = find_run_end(next_run_start); + // we now find the start and end of the next run + final long next_run_start = (run_end + 1) & getMask(); + run_end = find_run_end(next_run_start); // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if (is_occupied(run_end) && num_shifted_count - num_non_occupied == 1) { - set_shifted(run_end, false); + if 
(is_occupied((next_run_start - 1) & getMask()) && num_shifted_count - num_non_occupied == 1) { + set_shifted((next_run_start - 1) & getMask(), false); } else { - set_shifted(run_end, true); + set_shifted((next_run_start - 1) & getMask(), true); } - for (long i = next_run_start; i != ((next_run_end + 1) & getMask()); i++) { + for (long i = next_run_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { long f = get_fingerprint(i); set_fingerprint((i - 1) & getMask(), f); if (is_continuation(i)) { @@ -569,7 +565,9 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long if (!is_occupied(i)) { num_non_occupied++; } - num_shifted_count++; + if (i != next_run_start) { + num_shifted_count++; + } } set_fingerprint(run_end, 0); set_shifted(run_end, false); diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 3f42d3657..1ed94b8a9 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -91,11 +91,10 @@ static public void DeletionsWithSameFingerprint() { qf.insert(0, 3, false); qf.insert(0, 3, false); qf.insert(0, 6, false); - qf.insert(0, 6, false); + qf.insert(0, 6, false); // these are ignored qf.insert(0, 6, false); qf.insert(0, 7, false); - qf.delete(0, 2); qf.delete(0, 3); @@ -106,9 +105,9 @@ static public void DeletionsWithSameFingerprint() { result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 0); result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, true, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, 
bits_per_entry, 8, false, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, false, 0); +// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); +// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } From da8656fae76419f8c92dc9c08d5a52ef1ff5713b Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Tue, 28 May 2024 12:06:14 +0100 Subject: [PATCH 14/38] Updated the overflow test to check the wrapping. --- .../filters/quotientfilter/QuotientFilterTest.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 247ef7bda..c30d7317e 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -139,8 +139,14 @@ static public void OverflowTest() { long fp2 = 1 << fingerprint_size - 1; qf.insert(fp2, num_entries - 1, false); + assertEquals(qf.get_fingerprint(7), fp2); + assertEquals(getState(qf, 7), 0b100); qf.insert(fp2, num_entries - 1, false); + assertEquals(qf.get_fingerprint(0), fp2); + assertEquals(getState(qf, 0), 0b011); qf.delete(fp2, num_entries - 1); + assertEquals(qf.get_fingerprint(0), 0); + assertEquals(getState(qf, 0), 0b000); boolean found = qf.search(fp2, num_entries - 1); assertTrue(found); } @@ -166,6 +172,7 @@ static public void testQuotientFilterInsertionAndIteration() { qf.insert(0, 4, false); qf.insert(0, 15, false); // last slot in the filter qf.insert(0, 16, false); // outside the bounds + qf.pretty_print() ; Iterator it = new Iterator(qf); 
int[] arr = new int[] {2, 3, 3, 4, 15}; From 87abf9cf55022c0c4d59e5e1b8984c9546170573 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 28 May 2024 14:19:25 -0700 Subject: [PATCH 15/38] no need to find the first empty slot --- .../filters/quotientfilter/QuotientFilter.java | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 93a6761c1..35c6f005f 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -379,14 +379,6 @@ long swap_fingerprints(long index, long new_fingerprint) { return existing; } - // finds the first empty slot after the given slot index - long find_first_empty_slot(long index) { - while (!is_slot_empty(index)) { - index = (index + 1) & getMask(); - } - return index; - } - // return the first slot to the right where the current run starting at the index parameter ends long find_new_run_location(long index) { if (!is_slot_empty(index)) { @@ -399,14 +391,13 @@ long find_new_run_location(long index) { } boolean insert_new_run(long canonical_slot, long long_fp) { - long first_empty_slot = find_first_empty_slot(canonical_slot); // finds the first empty slot to the right of the canonical slot that is empty long preexisting_run_start_index = find_run_start(canonical_slot); // scans the cluster leftwards and then to the right until reaching our run's would be location long start_of_this_new_run = find_new_run_location(preexisting_run_start_index); // If there is already a run at the would-be location, find its end and insert the new run after it boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); // modify some metadata flags to mark the new run set_occupied(canonical_slot, true); - if (first_empty_slot != 
canonical_slot) { + if (start_of_this_new_run != canonical_slot) { set_shifted(start_of_this_new_run, true); } set_continuation(start_of_this_new_run, false); From 62776da548c15819454d990b21c4043452f7c73e Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 28 May 2024 19:26:18 -0700 Subject: [PATCH 16/38] use the same method to shift --- .../quotientfilter/QuotientFilter.java | 34 ++++--------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 35c6f005f..db1da48ad 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -408,26 +408,7 @@ boolean insert_new_run(long canonical_slot, long long_fp) { num_entries++; return true; } - - // push all entries one slot to the right - // if we inserted this run in the middle of a cluster - long current_index = start_of_this_new_run; - boolean is_this_slot_empty; - boolean temp_continuation = false; - do { - is_this_slot_empty = is_slot_empty(current_index); - long_fp = swap_fingerprints(current_index, long_fp); - - if (current_index != start_of_this_new_run) { - set_shifted(current_index, true); - boolean current_continuation = is_continuation(current_index); - set_continuation(current_index, temp_continuation); - temp_continuation = current_continuation; - } - current_index = (current_index + 1) & getMask(); - } while (!is_this_slot_empty); - num_entries++; - return true; + return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false); } boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { @@ -453,14 +434,13 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { return false; } } - return insert_fingerprint_and_push_all_else(long_fp, 
run_start_index); + return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); } - // insert a fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. - boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) { + // insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right. + boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index, boolean is_same_run) { long current_index = run_start_index; boolean is_this_slot_empty; - boolean finished_first_run = false; boolean temp_continuation = false; do { @@ -468,12 +448,12 @@ boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) if (current_index != run_start_index) { set_shifted(current_index, true); } - if (current_index != run_start_index && !finished_first_run && !is_continuation(current_index)) { - finished_first_run = true; + if (current_index != run_start_index && is_same_run && !is_continuation(current_index)) { + is_same_run = false; set_continuation(current_index, true); long_fp = swap_fingerprints(current_index, long_fp); } - else if (finished_first_run) { + else if (!is_same_run) { boolean current_continuation = is_continuation(current_index); set_continuation(current_index, temp_continuation); temp_continuation = current_continuation; From 97575b0c8edcfbac29ca9bb52672cf134c1f5972 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 30 May 2024 17:07:29 -0700 Subject: [PATCH 17/38] fixed find_run_start() and removed adjustment in insert_new_run() --- .../quotientfilter/QuotientFilter.java | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index db1da48ad..f890f53bc 100644 --- 
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -291,23 +291,20 @@ long find_cluster_start(long index) { // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides // since the run might have been shifted to the right due to collisions long find_run_start(long index) { - long current_index = index; - int runs_to_skip_counter = 1; - while (is_shifted(current_index)) { - if (is_occupied(current_index)) { - runs_to_skip_counter++; - } - current_index = (current_index - 1) & getMask(); + int num_runs_to_skip = 0; + while (is_shifted(index)) { + index = (index - 1) & getMask(); + if (is_occupied(index)) { + num_runs_to_skip++; } - while (true) { - if (!is_continuation(current_index)) { - runs_to_skip_counter--; - if (runs_to_skip_counter == 0) { - return current_index; - } - } - current_index = (current_index + 1) & getMask(); + } + while (num_runs_to_skip > 0) { + index = (index + 1) & getMask(); + if (!is_continuation(index)) { + num_runs_to_skip--; } + } + return index; } // given the start of a run, scan the run and return the index of the first matching fingerprint @@ -391,10 +388,9 @@ long find_new_run_location(long index) { } boolean insert_new_run(long canonical_slot, long long_fp) { - long preexisting_run_start_index = find_run_start(canonical_slot); // scans the cluster leftwards and then to the right until reaching our run's would be location - long start_of_this_new_run = find_new_run_location(preexisting_run_start_index); // If there is already a run at the would-be location, find its end and insert the new run after it + long start_of_this_new_run = find_run_start(canonical_slot); boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); - + // modify some metadata flags to mark the new run set_occupied(canonical_slot, true); if (start_of_this_new_run != canonical_slot) { From 
8a8bfbc97735614b53e246282553ab3ec9a5235e Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 30 May 2024 17:10:05 -0700 Subject: [PATCH 18/38] removed whitespace --- .../datasketches/filters/quotientfilter/QuotientFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index f890f53bc..0a8923acf 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -390,7 +390,7 @@ long find_new_run_location(long index) { boolean insert_new_run(long canonical_slot, long long_fp) { long start_of_this_new_run = find_run_start(canonical_slot); boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); - + // modify some metadata flags to mark the new run set_occupied(canonical_slot, true); if (start_of_this_new_run != canonical_slot) { From 31fd85d4e3c40dd45245eed97b2e1d81bbf57c77 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 30 May 2024 17:21:03 -0700 Subject: [PATCH 19/38] removed unnecessary method --- .../filters/quotientfilter/QuotientFilter.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 0a8923acf..e2db67c58 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -376,17 +376,6 @@ long swap_fingerprints(long index, long new_fingerprint) { return existing; } - // return the first slot to the right where the current run starting at the index parameter ends - long find_new_run_location(long index) { - if (!is_slot_empty(index)) { 
- index = (index + 1) & getMask(); - } - while (is_continuation(index)) { - index = (index + 1) & getMask(); - } - return index; - } - boolean insert_new_run(long canonical_slot, long long_fp) { long start_of_this_new_run = find_run_start(canonical_slot); boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); From 158b91b92a647581881bc8d453c2370aafe3fd7a Mon Sep 17 00:00:00 2001 From: jmalkin <786705+jmalkin@users.noreply.github.com> Date: Thu, 30 May 2024 22:26:35 -0700 Subject: [PATCH 20/38] move BloomFilter's underlying BitArray into common directory, update its API so QuotientFilter can use it --- .../filters/bloomfilter/BitArray.java | 135 ------- .../filters/bloomfilter/BloomFilter.java | 3 + .../datasketches/filters/common/BitArray.java | 302 ++++++++++++++++ .../DirectBitArray.java | 98 +++++- .../DirectBitArrayR.java | 86 ++++- .../{bloomfilter => common}/HeapBitArray.java | 128 +++++-- .../filters/quotientfilter/Bitmap.java | 35 -- .../quotientfilter/QuickBitVector.java | 328 ------------------ .../quotientfilter/QuickBitVectorWrapper.java | 62 ---- .../quotientfilter/QuotientFilter.java | 36 +- .../quotientfilter/QuotientFilterBuilder.java | 2 +- .../DirectBitArrayRTest.java | 26 +- .../DirectBitArrayTest.java | 68 +++- .../HeapBitArrayTest.java | 55 ++- .../quotientfilter/BitVectorTests.java | 82 ----- .../quotientfilter/QuotientFilterTest.java | 45 ++- 16 files changed, 756 insertions(+), 735 deletions(-) delete mode 100644 src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java create mode 100644 src/main/java/org/apache/datasketches/filters/common/BitArray.java rename src/main/java/org/apache/datasketches/filters/{bloomfilter => common}/DirectBitArray.java (62%) rename src/main/java/org/apache/datasketches/filters/{bloomfilter => common}/DirectBitArrayR.java (58%) rename src/main/java/org/apache/datasketches/filters/{bloomfilter => common}/HeapBitArray.java (57%) delete mode 100644 
src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java delete mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java delete mode 100644 src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java rename src/test/java/org/apache/datasketches/filters/{bloomfilter => common}/DirectBitArrayRTest.java (85%) rename src/test/java/org/apache/datasketches/filters/{bloomfilter => common}/DirectBitArrayTest.java (78%) rename src/test/java/org/apache/datasketches/filters/{bloomfilter => common}/HeapBitArrayTest.java (79%) delete mode 100644 src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java deleted file mode 100644 index bfa696cad..000000000 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.filters.bloomfilter; - -import static org.apache.datasketches.common.Util.LS; - -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.memory.Buffer; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.memory.WritableMemory; - -/** - * This class holds an array of bits suitable for use in a Bloom Filter - * - *

Rounds the number of bits up to the smallest multiple of 64 (one long) - * that is not smaller than the specified number. - */ -abstract class BitArray { - // MAX_BITS using longs, based on array indices being capped at Integer.MAX_VALUE - protected static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; - - protected BitArray() {} - - static BitArray heapify(final Buffer mem, final boolean isEmpty) { - return HeapBitArray.heapify(mem, isEmpty); - } - - static BitArray wrap(final Memory mem, final boolean isEmpty) { - return DirectBitArrayR.wrap(mem, isEmpty); - } - - static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) { - return DirectBitArray.writableWrap(wmem, isEmpty); - } - - boolean isEmpty() { - return !isDirty() && getNumBitsSet() == 0; - } - - abstract boolean hasMemory(); - - abstract boolean isDirect(); - - abstract boolean isReadOnly(); - - abstract boolean getBit(final long index); - - abstract boolean getAndSetBit(final long index); - - abstract void setBit(final long index); - - abstract long getNumBitsSet(); - - abstract void reset(); - - abstract long getCapacity(); - - abstract int getArrayLength(); - - abstract void union(final BitArray other); - - abstract void intersect(final BitArray other); - - abstract void invert(); - - // prints the raw BitArray as 0s and 1s, one long per row - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < getArrayLength(); ++i) { - sb.append(i + ": ") - .append(printLong(getLong(i))) - .append(LS); - } - return sb.toString(); - } - - long getSerializedSizeBytes() { - // We only really need an int for array length but this will keep everything - // aligned to 8 bytes. - // Always write array length, but write numBitsSet only if empty - return Long.BYTES * (isEmpty() ? 
1L : (2L + getArrayLength())); - } - - // returns the number of bytes needed for a non-empty BitArray of the requested size - static long getSerializedSizeBytes(final long numBits) { - if (numBits <= 0) { - throw new SketchesArgumentException("Requested number of bits must be strictly positive"); - } - if (numBits > MAX_BITS) { - throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. " - + "Requested: " + numBits + ", maximum: " + MAX_BITS); - } - final int numLongs = (int) Math.ceil(numBits / 64.0); - return Long.BYTES * (numLongs + 2L); - } - - abstract protected boolean isDirty(); - - // used to get a long from the array regardless of underlying storage - // NOT used to query individual bits - abstract protected long getLong(final int arrayIndex); - - // used to set a long in the array regardless of underlying storage - // NOT used to set individual bits - abstract protected void setLong(final int arrayIndex, final long value); - - // prints a long as a series of 0s and 1s as little endian - protected static String printLong(final long val) { - final StringBuilder sb = new StringBuilder(); - for (int j = 0; j < Long.SIZE; ++j) { - sb.append((val & (1L << j)) != 0 ? 
"1" : "0"); - if (j % 8 == 7) { sb.append(" "); } - } - return sb.toString(); - } - -} diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java index 3ea73b9bd..10829d7b7 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java @@ -26,6 +26,9 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.filters.common.BitArray; +import org.apache.datasketches.filters.common.DirectBitArray; +import org.apache.datasketches.filters.common.HeapBitArray; import org.apache.datasketches.memory.Buffer; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableBuffer; diff --git a/src/main/java/org/apache/datasketches/filters/common/BitArray.java b/src/main/java/org/apache/datasketches/filters/common/BitArray.java new file mode 100644 index 000000000..2fd2a49e1 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/common/BitArray.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.common; + +import static org.apache.datasketches.common.Util.LS; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.memory.Buffer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class holds an array of bits suitable for use in a Bloom Filter + * + *

Rounds the number of bits up to the smallest multiple of 64 (one long) + * that is not smaller than the specified number. + */ +public abstract class BitArray { + + /** + * The maximum number of bits that can be represented using longs, + * based on array indices being capped at Integer.MAX_VALUE + * and allowing room for encoding both the size and the number of bits set. + */ + protected static final long MAX_BITS = (Integer.MAX_VALUE - 1) * (long) Long.SIZE; + + /** + * Constructs a new BitArray. + */ + BitArray() {} + + /** + * Creates a BitArray from a given Buffer. + * + * @param mem The Buffer to heapify. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The heapified BitArray. + */ + public static BitArray heapify(final Buffer mem, final boolean isEmpty) { + return HeapBitArray.heapify(mem, isEmpty); + } + + /** + * Creates a BitArray from a given Memory. + * + * @param mem The Memory to wrap. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The wrapped BitArray. + */ + public static BitArray wrap(final Memory mem, final boolean isEmpty) { + return DirectBitArrayR.wrap(mem, isEmpty); + } + + /** + * Creates a writable BitArray from a given WritableMemory. + * + * @param wmem The WritableMemory to wrap. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The writable wrapped BitArray. + */ + public static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) { + return DirectBitArray.writableWrap(wmem, isEmpty); + } + + /** + * Checks if the BitArray is empty. + * + * @return True if the BitArray is empty, false otherwise. + */ + public boolean isEmpty() { + return !isDirty() && getNumBitsSet() == 0; + } + + /** + * Checks if the BitArray has a backing Memory. + * + * @return True if the BitArray has a backing Memory, false otherwise. + */ + public abstract boolean hasMemory(); + + /** + * Checks if the BitArray is direct. 
+ * + * @return True if the BitArray is direct, false otherwise. + */ + public abstract boolean isDirect(); + + /** + * Checks if the BitArray is read-only. + * + * @return True if the BitArray is read-only, false otherwise. + */ + public abstract boolean isReadOnly(); + + /** + * Gets the value of a bit at the specified index. + * + * @param index The index of the bit. + * @return The value of the bit at the specified index. + */ + public abstract boolean getBit(final long index); + + /** + * Gets a specified number of bits starting at the given index. Limited + * to a single long (64 bits). + * + * @param index The starting index. + * @param numBits The number of bits to return. + * @return The value of the requested bits, starting at bit 0 of the result. + */ + public abstract long getBits(final long index, final int numBits); + + /** + * Gets the value of a bit at the specified index and sets it to true. + * + * @param index The index of the bit. + * @return The previous value of the bit at the specified index. + */ + public abstract boolean getAndSetBit(final long index); + + /** + * Assigns the value of a bit at the specified index to true. + * + * @param index The index of the bit. + */ + public abstract void setBit(final long index); + + /** + * Assigns the value of a bit at the specified index to false. + * + * @param index The index of the bit. + */ + public abstract void clearBit(final long index); + + /** + * Assigns the given value of a bit at the specified index. + * + * @param index The index of the bit. + * @param value The value to set the bit to. + */ + public abstract void assignBit(final long index, final boolean value); + + /** + /** + * Sets {@code numBits} starting from {@code index} to the specified value. + * Limited to a single long (64 bits).
+ * + * @param index the starting index of the range (inclusive) + * @param numBits the number of bits to write + * @param bits the value to set the bits to, starting with bit 0 + */ + public abstract void setBits(final long index, final int numBits, final long bits); + + /** + * Gets the number of bits that are set to true in the BitArray. + * + * @return The number of bits set to true. + */ + public abstract long getNumBitsSet(); + + /** + * Resets the BitArray, setting all bits to false. + */ + public abstract void reset(); + + /** + * Gets the capacity of the BitArray in bits. + * + * @return The capacity of the BitArray in bits + */ + public abstract long getCapacity(); + + /** + * Gets the length of the underlying array in longs. + * + * @return The length of the underlying array in longs. + */ + public abstract int getArrayLength(); + + /** + * Performs a union operation with another BitArray. + * + * @param other The other BitArray to perform the union with. + */ + public abstract void union(final BitArray other); + + /** + * Performs an intersection operation with another BitArray. + * + * @param other The other BitArray to perform the intersection with. + */ + public abstract void intersect(final BitArray other); + + /** + * Inverts the BitArray, flipping all bits. + */ + public abstract void invert(); + + /** + * Returns a string representation of the BitArray. + * + * @return A string representation of the BitArray. + */ + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < getArrayLength(); ++i) { + sb.append(i + ": ") + .append(printLong(getLong(i))) + .append(LS); + } + return sb.toString(); + } + + /** + * Gets the serialized size of the BitArray in bytes. + * + * @return The serialized size of the BitArray in bytes. + */ + public long getSerializedSizeBytes() { + // We only really need an int for array length but this will keep everything + // aligned to 8 bytes. 
+ // Always write array length, but write numBitsSet only if not empty + return Long.BYTES * (isEmpty() ? 1L : (2L + getArrayLength())); + } + + /** + * Gets the serialized size of a non-empty BitArray of the specified size in bytes. + * + * @param numBits The number of bits in the BitArray. + * @return The serialized size of the BitArray in bytes. + * @throws SketchesArgumentException If the requested number of bits is not strictly positive + * or exceeds the maximum allowed. + */ + public static long getSerializedSizeBytes(final long numBits) { + if (numBits <= 0) { + throw new SketchesArgumentException("Requested number of bits must be strictly positive"); + } + if (numBits > MAX_BITS) { + throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. " + + "Requested: " + numBits + ", maximum: " + MAX_BITS); + } + final int numLongs = (int) Math.ceil(numBits / 64.0); + return Long.BYTES * (numLongs + 2L); + } + + /** + * Checks if the BitArray has changes not reflected in state variables. + * + * @return True if the BitArray is dirty, false otherwise. + */ + abstract boolean isDirty(); + + /** + * Gets the long value at the specified array index. + * + * @param arrayIndex The index of the long value in the array. + * @return The long value at the specified array index. + */ + abstract long getLong(final int arrayIndex); + + /** + * Sets the long value at the specified array index. + * + * @param arrayIndex The index of the long value in the array. + * @param value The value to set the long to. + */ + abstract void setLong(final int arrayIndex, final long value); + + /** + * Returns a string representation of a long value as a series of 0s and 1s (little endian). + * + * @param val The long value to print. + * @return A string representation of the long value. + */ + public static String printLong(final long val) { + final StringBuilder sb = new StringBuilder(); + for (int j = 0; j < Long.SIZE; ++j) { + sb.append((val & (1L << j)) != 0 ?
"1" : "0"); + if (j % 8 == 7) { sb.append(" "); } + } + return sb.toString(); + } + +} diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java similarity index 62% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java index 77c24f027..ac1d6eaf3 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java @@ -17,21 +17,21 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.WritableMemory; -final class DirectBitArray extends DirectBitArrayR { +public final class DirectBitArray extends DirectBitArrayR { - DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) { + public DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) { super(dataLength, 0, wmem); // we'll set numBitsSet_ ourselves so pass 0 // can recompute later if needed numBitsSet_ = storedNumBitsSet; } - DirectBitArray(final int dataLength, final WritableMemory wmem) { + public DirectBitArray(final int dataLength, final WritableMemory wmem) { super(dataLength, 0, wmem); wmem_.putInt(0, dataLength_); @@ -39,7 +39,7 @@ final class DirectBitArray extends DirectBitArrayR { wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES); } - static DirectBitArray initialize(final long numBits, final WritableMemory wmem) { + public static DirectBitArray initialize(final long numBits, final WritableMemory wmem) { if (numBits <= 0) { throw new SketchesArgumentException("Number of bits must be strictly positive. 
Found: " + numBits); } @@ -58,7 +58,7 @@ static DirectBitArray initialize(final long numBits, final WritableMemory wmem) return new DirectBitArray(arrayLength, wmem); } - static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) { + public static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) { final int arrayLength = mem.getInt(0); final long storedNumBitsSet = isEmpty ? 0L : mem.getLong(NUM_BITS_OFFSET); @@ -81,7 +81,7 @@ static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmp } @Override - long getNumBitsSet() { + public long getNumBitsSet() { // update numBitsSet and store in array if (isDirty()) { numBitsSet_ = 0; @@ -95,17 +95,17 @@ long getNumBitsSet() { } @Override - protected boolean isDirty() { + public boolean isDirty() { return numBitsSet_ == -1; } @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0; } @Override - protected long getLong(final int arrayIndex) { + public long getLong(final int arrayIndex) { return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3)); } @@ -115,21 +115,83 @@ public boolean isReadOnly() { } @Override - void reset() { + public void reset() { setNumBitsSet(0); wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES); } @Override - void setBit(final long index) { + public void setBit(final long index) { final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); final byte val = wmem_.getByte(memoryOffset); - wmem_.setBits(memoryOffset, (byte) (val | (1 << (index & 0x07)))); + wmem_.putByte(memoryOffset, (byte) (val | (1 << (index & 0x07)))); setNumBitsSet(-1); // mark dirty } @Override - boolean getAndSetBit(final long index) { + public void clearBit(final long index) { + final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); + final byte val = wmem_.getByte(memoryOffset); + wmem_.putByte(memoryOffset, (byte) (val & ~(1 
<< (index & 0x07)))); + setNumBitsSet(-1); // mark dirty + } + + @Override + public void assignBit(final long index, final boolean value) { + if (value) { + setBit(index); + } else { + clearBit(index); + } + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + + // TODO: since Memory provides byte offsets even when reading a long, we can be sure + // that the result always fits in a single long. We can potentially optimize this, but + // need to handle cases where a long would read beyond the end of the Memory. + + final long endBit = index + numBits - 1; + + // these are indices into a long[] array, need to adjust to byte offsets + // when calling wmem_.getLong() + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + + setNumBitsSet(-1); // mark dirty + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? 
-1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + final long mask = toMask - fromMask; + final long maskedVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & ~mask; + wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedVal | ((bits << fromOffset) & mask)); + return; + } + + // spans longs, need to set bits in two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = -1L - ((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + final long maskedFromVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & ~fromMask; + final long maskedToVal = wmem_.getLong(DATA_OFFSET + (toIndex << 3)) & ~toMask; + + wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedFromVal | ((bits << fromOffset) & fromMask)); + wmem_.putLong(DATA_OFFSET + (toIndex << 3), maskedToVal | ((bits >>> splitBit) & toMask)); + } + + @Override + public boolean getAndSetBit(final long index) { final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); final byte mask = (byte) (1 << (index & 0x07)); final byte val = wmem_.getByte(memoryOffset); @@ -143,7 +205,7 @@ boolean getAndSetBit(final long index) { } @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -158,7 +220,7 @@ void intersect(final BitArray other) { } @Override - void union(final BitArray other) { + public void union(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -173,7 +235,7 @@ void union(final BitArray other) { } @Override - void invert() { + public void invert() { if (isDirty()) { numBitsSet_ = 0; for (int i = 0; i < dataLength_; ++i) { @@ -191,7 +253,7 @@ void invert() { } @Override - protected void setLong(final int arrayIndex, final long value) { + void 
setLong(final int arrayIndex, final long value) { wmem_.putLong(DATA_OFFSET + (arrayIndex << 3), value); } diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java similarity index 58% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java index 8acc36be2..e446ae6ea 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesReadOnlyException; @@ -35,7 +35,7 @@ public class DirectBitArrayR extends BitArray { final protected WritableMemory wmem_; // for inheritance; we won't write to it protected long numBitsSet_; // could be final here but writable direct will update it - protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) { + public DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) { super(); dataLength_ = dataLength; @@ -53,7 +53,7 @@ protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, fin // assumes we have a region with only the portion of Memory // the BitArray cares about - static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { + public static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { final int arrayLength = mem.getInt(0); final long storedNumBitsSet = isEmpty ? 
0L : mem.getLong(NUM_BITS_OFFSET); @@ -71,34 +71,73 @@ static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { } @Override - long getCapacity() { + public long getCapacity() { return (long) dataLength_ * Long.SIZE; } @Override - long getNumBitsSet() { + public long getNumBitsSet() { return numBitsSet_; } @Override - protected boolean isDirty() { + public boolean isDirty() { // read-only so necessarily false return false; } @Override - int getArrayLength() { + public int getArrayLength() { return dataLength_; } @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { if (isEmpty()) { return false; } return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0; } @Override - protected long getLong(final int arrayIndex) { + public long getBits(final long index, final int numBits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (isEmpty()) { return 0L; } + + // TODO: since Memory provides byte offsets even when reading a long, we can be sure + // that the result always fits in a single long. We can potentially optimize this, but + // need to handle cases where a long would read beyond the end of the Memory. + + final long endBit = index + numBits - 1; + + // these are indices into a long[] array, need to adjust to byte offsets + // when calling wmem_.getLong() + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? 
-1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + return (wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & (toMask - fromMask)) >>> fromOffset; + } + + // spans longs, need to combine bits from two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = -1L - ((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + long result = (wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & fromMask) >>> fromOffset; + result |= (wmem_.getLong(DATA_OFFSET + (toIndex << 3)) & toMask) << splitBit; + return result; + } + + @Override + long getLong(final int arrayIndex) { if (isEmpty()) { return 0L; } return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3)); } @@ -119,37 +158,52 @@ public boolean isReadOnly() { } @Override - void reset() { + public void reset() { throw new SketchesReadOnlyException("Attempt to call reset() on read-only memory"); } @Override - void setBit(final long index) { + public void setBit(final long index) { + throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory"); + } + + @Override + public void clearBit(final long index) { + throw new SketchesReadOnlyException("Attempt to call clearBit() on read-only memory"); + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + throw new SketchesReadOnlyException("Attempt to call setBits() on read-only memory"); + } + + @Override + public void assignBit(final long index, final boolean value) { throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory"); } @Override - boolean getAndSetBit(final long index) { + public boolean getAndSetBit(final long index) { throw new SketchesReadOnlyException("Attempt to call getAndSetBit() on read-only memory"); } @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { throw new SketchesReadOnlyException("Attempt to call intersect() on read-only memory"); } @Override - void 
union(final BitArray other) { + public void union(final BitArray other) { throw new SketchesReadOnlyException("Attempt to call union() on read-only memory"); } @Override - void invert() { + public void invert() { throw new SketchesReadOnlyException("Attempt to call invert() on read-only memory"); } @Override - protected void setLong(final int arrayIndex, final long value) { + void setLong(final int arrayIndex, final long value) { throw new SketchesReadOnlyException("Attempt to call setLong() on read-only memory"); } } diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java similarity index 57% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java rename to src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java index 4048b6775..184cc83c1 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import java.util.Arrays; @@ -31,13 +31,13 @@ *

Rounds the number of bits up to the smallest multiple of 64 (one long) * that is not smaller than the specified number. */ -final class HeapBitArray extends BitArray { +public final class HeapBitArray extends BitArray { private long numBitsSet_; // if -1, need to recompute value private boolean isDirty_; final private long[] data_; // creates an array of a given size - HeapBitArray(final long numBits) { + public HeapBitArray(final long numBits) { super(); if (numBits <= 0) { @@ -54,7 +54,7 @@ final class HeapBitArray extends BitArray { } // uses the provided array - HeapBitArray(final long numBitsSet, final long[] data) { + public HeapBitArray(final long numBitsSet, final long[] data) { super(); data_ = data; @@ -64,7 +64,7 @@ final class HeapBitArray extends BitArray { // reads a serialized image, but the BitArray is not fully self-describing so requires // a flag to indicate whether the array is empty - static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { + public static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { final int numLongs = buffer.getInt(); if (numLongs < 0) { throw new SketchesArgumentException("Possible corruption: Must have strictly positive array size. Found: " + numLongs); @@ -85,40 +85,124 @@ static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { } @Override - protected boolean isDirty() { + public boolean isDirty() { return isDirty_; } @Override - boolean hasMemory() { + public boolean hasMemory() { return false; } @Override - boolean isDirect() { + public boolean isDirect() { return false; } @Override - boolean isReadOnly() { return false; } + public boolean isReadOnly() { return false; } // queries a single bit in the array @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { return (data_[(int) index >>> 6] & (1L << index)) != 0 ? 
true : false; } + @Override + public long getBits(final long index, final int numBits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (numBits == 0) { return 0; } + + final long endBit = index + numBits - 1; + + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + return (data_[fromIndex] & (toMask - fromMask)) >>> fromOffset; + } + + // spans longs, need to combine bits from two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = -1L - ((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + long result = (data_[fromIndex] & fromMask) >>> fromOffset; + result |= (data_[toIndex] & toMask) << splitBit; + return result; + } + // sets a single bit in the array without querying, meaning the method // cannot properly track the number of bits set so set isDirty = true @Override - void setBit(final long index) { + public void setBit(final long index) { data_[(int) index >>> 6] |= 1L << index; isDirty_ = true; } + @Override + public void clearBit(final long index) { + data_[(int) index >>> 6] &= ~(1L << index); + isDirty_ = true; + } + + // assigns a single bit in the array without querying + @Override + public void assignBit(final long index, final boolean value) { + if (value) { + setBit(index); + } else { + clearBit(index); + } + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 
64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (numBits == 0) { return; } + + isDirty_ = true; + final long endBit = index + numBits - 1; + + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + final long mask = toMask - fromMask; + data_[fromIndex] = (data_[fromIndex] & ~mask) | ((bits << fromOffset) & mask); + return; + } + + // spans longs, need to set bits in two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = -1L - ((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + data_[fromIndex] = (data_[fromIndex] & ~fromMask) | ((bits << fromOffset) & fromMask); + data_[toIndex] = (data_[toIndex] & ~toMask) | ((bits >>> splitBit) & toMask); + } + // returns existing value of bit @Override - boolean getAndSetBit(final long index) { + public boolean getAndSetBit(final long index) { final int offset = (int) index >>> 6; final long mask = 1L << index; if ((data_[offset] & mask) != 0) { @@ -134,7 +218,7 @@ boolean getAndSetBit(final long index) { // O(1) if only getAndSetBit() has been used // O(data_.length) if setBit() has ever been used @Override - long getNumBitsSet() { + public long getNumBitsSet() { if (isDirty_) { numBitsSet_ = 0; for (final long val : data_) { @@ -145,14 +229,14 @@ long getNumBitsSet() { } @Override - long getCapacity() { return (long) data_.length * Long.SIZE; } + public long getCapacity() { return (long) data_.length * Long.SIZE; } @Override - int getArrayLength() { return data_.length; } + public int getArrayLength() { return data_.length; } // applies logical OR @Override - void union(final BitArray 
other) { + public void union(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot union bit arrays with unequal lengths"); } @@ -168,7 +252,7 @@ void union(final BitArray other) { // applies logical AND @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -184,7 +268,7 @@ void intersect(final BitArray other) { // applies bitwise inversion @Override - void invert() { + public void invert() { if (isDirty_) { numBitsSet_ = 0; for (int i = 0; i < data_.length; ++i) { @@ -200,7 +284,7 @@ void invert() { } } - void writeToBuffer(final WritableBuffer wbuf) { + public void writeToBuffer(final WritableBuffer wbuf) { wbuf.putInt(data_.length); wbuf.putInt(0); // unused @@ -211,18 +295,18 @@ void writeToBuffer(final WritableBuffer wbuf) { } @Override - protected long getLong(final int arrayIndex) { + public long getLong(final int arrayIndex) { return data_[arrayIndex]; } @Override - protected void setLong(final int arrayIndex, final long value) { + public void setLong(final int arrayIndex, final long value) { data_[arrayIndex] = value; } // clears the array @Override - void reset() { + public void reset() { Arrays.fill(data_, 0); numBitsSet_ = 0; isDirty_ = false; diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java deleted file mode 100644 index 658e15f0d..000000000 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Bitmap.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.filters.quotientfilter; - -public abstract class Bitmap { - - public abstract long size(); - public abstract void set(long bit_index, boolean value); - public abstract void setFromTo(long from, long to, long value); - public abstract boolean get(long bit_index); - public abstract long getFromTo(long from, long to); - - public static boolean get_fingerprint_bit(long index, long fingerprint) { - long mask = 1 << index; - long and = fingerprint & mask; - return and != 0; - } -} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java deleted file mode 100644 index ca387ebc9..000000000 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.filters.quotientfilter; - -/* -Copyright � 1999 CERN - European Organization for Nuclear Research. -Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose -is hereby granted without fee, provided that the above copyright notice appear in all copies and -that both that copyright notice and this permission notice appear in supporting documentation. -CERN makes no representations about the suitability of this software for any purpose. -It is provided "as is" without expressed or implied warranty. -*/ - -/** - * Implements quick non polymorphic non bounds checking low level bitvector operations. - * Includes some operations that interpret sub-bitstrings as long integers. - *

- * WARNING: Methods of this class do not check preconditions. - * Provided with invalid parameters these method may return (or set) invalid values without throwing any exception. - * You should only use this class when performance is critical and you are absolutely sure that indexes are within bounds. - *

- * A bitvector is modelled as a long array, i.e. long[] bits holds bits of a bitvector. - * Each long value holds 64 bits. - * The i-th bit is stored in bits[i/64] at - * bit position i % 64 (where bit position 0 refers to the least - * significant bit and 63 refers to the most significant bit). - * - * @author wolfgang.hoschek@cern.ch - * @version 1.0, 09/24/99 - * @see java.util.BitSet - */ -//package bitmap_implementations; - -public class QuickBitVector extends Object { - protected final static int ADDRESS_BITS_PER_UNIT = 6; // 64=2^6 - protected final static int BITS_PER_UNIT = 64; // = 1 << ADDRESS_BITS_PER_UNIT - protected final static int BIT_INDEX_MASK = 63; // = BITS_PER_UNIT - 1; - - private static final long[] pows = precomputePows(); //precompute bitmasks for speed - /** - * Makes this class non instantiable, but still inheritable. - */ - protected QuickBitVector() { - } - /** - * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0. - * In other words, returns a bit mask having 0,1,2,3,...,64 bits set. - * If to-from+1==0 then returns zero (0L). - * Precondition (not checked): to-from+1 ≥ 0 AND to-from+1 ≤ 64. - * - * @param from index of start bit (inclusive) - * @param to index of end bit (inclusive). - * @return the bit mask having all bits between from and to set to 1. - */ - public static final long bitMaskWithBitsSetFromTo(long from, long to) { - return pows[(int)(to-from+1)] << from; - - // This turned out to be slower: - // 0xffffffffffffffffL == ~0L == -1L == all 64 bits set. - // int width; - // return (width=to-from+1) == 0 ? 0L : (0xffffffffffffffffL >>> (BITS_PER_UNIT-width)) << from; - } - /** - * Changes the bit with index bitIndex in the bitvector bits to the "clear" (false) state. - * - * @param bits the bitvector. - * @param bitIndex the index of the bit to be cleared. 
- */ - public static void clear(long[] bits, long bitIndex) { - bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] &= ~(1L << (bitIndex & BIT_INDEX_MASK)); - } - /** - * Returns from the bitvector the value of the bit with the specified index. - * The value is true if the bit with the index bitIndex - * is currently set; otherwise, returns false. - * - * @param bits the bitvector. - * @param bitIndex the bit index. - * @return the value of the bit with the specified index. - */ - public static boolean get(long[] bits, long bitIndex) { - return ((bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0); - } - /** - * Returns a long value representing bits of a bitvector from index from to index to. - * Bits are returned as a long value with the return value having bit 0 set to bit from, ..., bit to-from set to bit to. - * All other bits of return value are set to 0. - * If from > to then returns zero (0L). - * Precondition (not checked): to-from+1 ≤ 64. - * @param bits the bitvector. - * @param from index of start bit (inclusive). - * @param to index of end bit (inclusive). - * @return the specified bits as long value. - */ - public static long getLongFromTo(long[] bits, long from, long to) { - if (from>to) return 0L; - - final int fromIndex = (int)(from >> ADDRESS_BITS_PER_UNIT); //equivalent to from/64 - final int toIndex = (int)(to >> ADDRESS_BITS_PER_UNIT); - final int fromOffset = (int)(from & BIT_INDEX_MASK); //equivalent to from%64 - final int toOffset = (int)(to & BIT_INDEX_MASK); - //this is equivalent to the above, but slower: - //final int fromIndex=from/BITS_PER_UNIT; - //final int toIndex=to/BITS_PER_UNIT; - //final int fromOffset=from%BITS_PER_UNIT; - //final int toOffset=to%BITS_PER_UNIT; - - - long mask; - if (fromIndex==toIndex) { //range does not cross unit boundaries; value to retrieve is contained in one single long value. 
- mask=bitMaskWithBitsSetFromTo(fromOffset, toOffset); - return (bits[fromIndex] & mask) >>> fromOffset; - - } - - //range crosses unit boundaries; value to retrieve is spread over two long values. - //get part from first long value - mask=bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK); - final long x1=(bits[fromIndex] & mask) >>> fromOffset; - - //get part from second long value - mask=bitMaskWithBitsSetFromTo(0, toOffset); - final long x2=(bits[toIndex] & mask) << (BITS_PER_UNIT-fromOffset); - - //combine - return x1|x2; - } - - /** - * Returns the index of the least significant bit in state "true". - * Returns 32 if no bit is in state "true". - * - * Examples: - *

-     * 0x80000000 : 31
-     * 0x7fffffff : 0
-     * 0x00000001 : 0
-     * 0x00000000 : 32
-     * 
- * - * @param value The integer value for which the least significant bit index is to be found. - * @return The index of the least significant bit in state "true". Returns 32 if no bit is in state "true". - */ - static public int leastSignificantBit(int value) { - int i=-1; - while (++i < 32 && (((1<> ADDRESS_BITS_PER_UNIT) ; // This line basically does (nBits-1) / 2^ADDRESS... - long safe_right_shift = ((nBits-1) >>> ADDRESS_BITS_PER_UNIT) ; // This line basically does (nBits-1) / 2^ADDRESS... - // System.out.println("Right shift " + right_shift); - //System.out.println("Safe Right shift " + safe_right_shift); - int unitIndex = (int)((nBits-1) >> ADDRESS_BITS_PER_UNIT); // How many multiples of 64 bits do we need to store nBits bits? - //System.out.println(ADDRESS_BITS_PER_UNIT); - long[] bitVector = new long[unitIndex + 1]; - //System.out.println("length " + bitVector.length); - //System.out.println("Total bits: " + (bitVector.length * 64)); - //System.out.println("Num slots available: " + (bitVector.length * 64) / bitsPerElement); - return bitVector; - } - - /** - * Returns the index of the most significant bit in state "true". - * Returns -1 if no bit is in state "true". - * - * Examples: - *
-     * 0x80000000 : 31
-     * 0x7fffffff : 30
-     * 0x00000001 : 0
-     * 0x00000000 : -1
-     * 
- * - * @param value The integer value for which the most significant bit index is to be found. - * @return The index of the most significant bit in state "true". Returns -1 if no bit is in state "true". - */ - static public int mostSignificantBit(int value) { - int i=32; - while (--i >=0 && (((1<= 1; ) { - pows[i]=value >>> (BITS_PER_UNIT-i); - } - pows[0]=0L; - return pows; - } - - /** - * Sets the bit with index bitIndex in the bitvector bits to the state specified by value. - * - * @param bits the bitvector. - * @param bitIndex the index of the bit to be changed. - * @param value the value to be stored in the bit. - */ - public static void put(long[] bits, long bitIndex, boolean value) { - if (value) - set(bits, bitIndex); - else - clear(bits, bitIndex); - } - - /** - * Sets bits of a bitvector from index from to index to to the bits of value. - * Bit from is set to bit 0 of value, ..., bit to is set to bit to-from of value. - * All other bits stay unaffected. - * If from > to then does nothing. - * Precondition (not checked): to-from+1 ≤ 64. - * - * this function is equivalent to the slower code below: - * int fromIndex=from/BITS_PER_UNIT; - * int toIndex=to/BITS_PER_UNIT; - * int fromOffset=from%BITS_PER_UNIT; - * int toOffset=to%BITS_PER_UNIT; - * - * @param bits the bitvector. - * @param value the value to be copied into the bitvector. - * @param from index of start bit (inclusive). - * @param to index of end bit (inclusive). - */ - public static void putLongFromTo(long[] bits, long value, long from, long to) { - if (from>to) return; - - final int fromIndex=(int)(from >> ADDRESS_BITS_PER_UNIT); //equivalent to from/64 - final int toIndex=(int)(to >> ADDRESS_BITS_PER_UNIT); - final int fromOffset=(int)(from & BIT_INDEX_MASK); //equivalent to from % 64 - final int toOffset=(int)(to & BIT_INDEX_MASK); - - //make sure all unused bits to the left are cleared. 
- long mask; - mask=bitMaskWithBitsSetFromTo(to-from+1, BIT_INDEX_MASK); - long cleanValue=value & (~mask); - - long shiftedValue; - - if (fromIndex==toIndex) { //range does not cross unit boundaries; should go into one single long value. - shiftedValue=cleanValue << fromOffset; - mask=bitMaskWithBitsSetFromTo(fromOffset, toOffset); - bits[fromIndex] = (bits[fromIndex] & (~mask)) | shiftedValue; - return; - - } - - //range crosses unit boundaries; value should go into two long values. - //copy into first long value. - shiftedValue=cleanValue << fromOffset; - mask=bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK); - bits[fromIndex] = (bits[fromIndex] & (~mask)) | shiftedValue; - - //copy into second long value. - shiftedValue=cleanValue >>> (BITS_PER_UNIT - fromOffset); - mask=bitMaskWithBitsSetFromTo(0, toOffset); - bits[toIndex] = (bits[toIndex] & (~mask)) | shiftedValue; - } - - /** - * Changes the bit with index bitIndex in the bitvector bits to the "set" (true) state. - * - * @param bits the bitvector. - * @param bitIndex the index of the bit to be set. - */ - public static void set(long[] bits, long bitIndex) { - bits[(int)(bitIndex >> ADDRESS_BITS_PER_UNIT)] |= 1L << (bitIndex & BIT_INDEX_MASK); - } - - /** - * Returns the index of the unit that contains the given bitIndex. - * - * @param bitIndex The index of the bit to be checked. - * @return The index of the unit that contains the given bitIndex. 
- */ - protected static long unit(long bitIndex) { - return bitIndex >> ADDRESS_BITS_PER_UNIT; // equivalent to bitIndex/64 - } -} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java deleted file mode 100644 index a4c24a3ff..000000000 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVectorWrapper.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.filters.quotientfilter; - -public class QuickBitVectorWrapper extends Bitmap { - - long[] bs; - - public QuickBitVectorWrapper(int bits_per_entry, long num_entries) { - bs = QuickBitVector.makeBitVector(num_entries, bits_per_entry); - } - - @Override - public long size() { - return (long)bs.length * Long.BYTES * 8L; - } - - @Override - public void set(long bit_index, boolean value) { - if (value) { - QuickBitVector.set(bs, bit_index); - } - else { - QuickBitVector.clear(bs, bit_index); - } - } - - @Override - public void setFromTo(long from, long to, long value) { - QuickBitVector.putLongFromTo(bs, value, from, to - 1); - } - - @Override - public boolean get(long bit_index) { - return QuickBitVector.get(bs, bit_index); - } - - @Override - public long getFromTo(long from, long to) { - return QuickBitVector.getLongFromTo(bs, from, to - 1); - } - - -} - diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 93a6761c1..8671dd18f 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -23,13 +23,16 @@ import java.util.HashSet; import java.util.Set; +import org.apache.datasketches.filters.common.BitArray; +import org.apache.datasketches.filters.common.HeapBitArray; + public class QuotientFilter extends Filter { int bitPerEntry; int fingerprintLength; int power_of_two_size; int num_entries; - Bitmap filter; + BitArray filter; double expansion_threshold; long max_entries_before_expansion; @@ -83,18 +86,19 @@ public void set_expand_autonomously(boolean val) { expand_autonomously = val; } - Bitmap make_filter(long init_size, int bits_per_entry) { + BitArray make_filter(long init_size, int bits_per_entry) { // System.out.println(init_size ) ; // System.out.println(num_extension_slots); 
// System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); - return new QuickBitVectorWrapper(bits_per_entry, init_size); + //return new QuickBitVectorWrapper(bits_per_entry, init_size); + return new HeapBitArray(init_size * bits_per_entry); } public int get_fingerprint_length() { return fingerprintLength; } - QuotientFilter(int power_of_two, int bits_per_entry, Bitmap bitmap) { + QuotientFilter(int power_of_two, int bits_per_entry, BitArray bitmap) { power_of_two_size = power_of_two; bitPerEntry = bits_per_entry; fingerprintLength = bits_per_entry - 3; @@ -152,7 +156,7 @@ public long get_num_slots() { long getMask() { return get_num_slots() - 1; } - + // sets the metadata flag bits for a given slot index void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, long index) { @@ -163,7 +167,7 @@ void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifte // sets the fingerprint for a given slot index void set_fingerprint(long index, long fingerprint) { - filter.setFromTo(index * bitPerEntry + 3, (long)index * bitPerEntry + 3 + fingerprintLength, fingerprint); + filter.setBits(index * bitPerEntry + 3, fingerprintLength, fingerprint); } // print a nice representation of the filter that can be understood. @@ -185,7 +189,7 @@ public String get_pretty_str(boolean vertical) { if (remainder == 3) { sbr.append(" "); } - sbr.append(filter.get(i) ? "1" : "0"); + sbr.append(filter.getBit(i) ? 
"1" : "0"); } sbr.append("\n"); return sbr.toString(); @@ -198,12 +202,12 @@ public void pretty_print() { // return a fingerprint in a given slot index long get_fingerprint(long index) { - return filter.getFromTo(index * bitPerEntry + 3, index * bitPerEntry + 3 + fingerprintLength); + return filter.getBits(index * bitPerEntry + 3, fingerprintLength); } // return an entire slot representation, including metadata flags and fingerprint long get_slot(long index) { - return filter.getFromTo(index * bitPerEntry, (index + 1) * bitPerEntry); + return filter.getBits(index * bitPerEntry, bitPerEntry); } // compare a fingerprint input to the fingerprint in some slot index @@ -251,27 +255,27 @@ public int get_bits_per_entry() { } boolean is_occupied(long index) { - return filter.get(index * bitPerEntry); + return filter.getBit(index * bitPerEntry); } boolean is_continuation(long index) { - return filter.get(index * bitPerEntry + 1); + return filter.getBit(index * bitPerEntry + 1); } boolean is_shifted(long index) { - return filter.get(index * bitPerEntry + 2); + return filter.getBit(index * bitPerEntry + 2); } void set_occupied(long index, boolean val) { - filter.set(index * bitPerEntry, val); + filter.assignBit(index * bitPerEntry, val); } void set_continuation(long index, boolean val) { - filter.set(index * bitPerEntry + 1, val); + filter.assignBit(index * bitPerEntry + 1, val); } void set_shifted(long index, boolean val) { - filter.set(index * bitPerEntry + 2, val); + filter.assignBit(index * bitPerEntry + 2, val); } boolean is_slot_empty(long index) { @@ -689,7 +693,7 @@ protected boolean _search(long large_hash) { } public boolean get_bit_at_offset(int offset) { - return filter.get(offset); + return filter.getBit(offset); } public void compute_statistics() { diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java index 1f98c82f2..a39712195 
100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -18,7 +18,7 @@ */ package org.apache.datasketches.filters.quotientfilter; -import java.util.concurrent.ThreadLocalRandom; +//import java.util.concurrent.ThreadLocalRandom; import org.apache.datasketches.common.SketchesArgumentException; diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java similarity index 85% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java index 521019e62..ea02ad21a 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -99,6 +99,27 @@ public void basicOperationTest() { assertTrue(dba.isReadOnly()); } + @Test + public void getBitsFromToTest() { + final HeapBitArray hba = new HeapBitArray(128); + hba.setBit(1); // will override, but this forces non-empty + hba.setLong(0, 0x5555555555555555L); + hba.setLong(1, 0xFFFFFFFFFC003FFFL); + final Memory mem = bitArrayToMemory(hba); + DirectBitArrayR dba = DirectBitArrayR.wrap(mem, hba.isEmpty()); + + // single, full long test + assertEquals(dba.getBits(0, 64), 0x5555555555555555L); + + // subset of single long, mostly ones with a stretch of zeros + assertEquals(dba.getBits(64, 64), 0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(78, 12), 0); + assertEquals(dba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(dba.getBits(60, 20), 0x3FFF5); + } + @Test public void countBitsWhenDirty() { // like basicOperationTest but with setBit which does @@ -159,6 +180,9 @@ public void checkInvalidMethods() { // all of these try to modify a read-only memory assertThrows(SketchesReadOnlyException.class, () -> dba.setBit(14)); + assertThrows(SketchesReadOnlyException.class, () -> dba.clearBit(7)); + assertThrows(SketchesReadOnlyException.class, () -> dba.assignBit(924, false)); + assertThrows(SketchesReadOnlyException.class, () -> dba.setBits(100, 30, 0xFF)); assertThrows(SketchesReadOnlyException.class, () -> dba.getAndSetBit(100)); assertThrows(SketchesReadOnlyException.class, () -> dba.reset()); assertThrows(SketchesReadOnlyException.class, () -> dba.invert()); diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java similarity index 78% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java 
rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java index a45bcbb82..4cc229c50 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -68,6 +68,8 @@ public void tooSmallCapacityTest() { } // no text of max size because the BitArray allows up to Integer.MAX_VALUE + // bits, which is the maximum size of an array in Java -- can't use it all, + // (need 2 longs for preamble) but also can't allocate that large to test on most machines @Test public void initializeTooSmallTest() { @@ -134,6 +136,70 @@ public void basicWritableWrapTest() { dba.setBit(100); assertTrue(dba.getAndSetBit(100)); assertEquals(dba.getNumBitsSet(), 8); + + dba.reset(); + assertTrue(dba.isEmpty()); + assertEquals(dba.getNumBitsSet(), 0); + + dba.setBit(0); + dba.setLong(0, -1); + assertTrue(dba.getBit(60)); + dba.clearBit(60); + assertFalse(dba.getBit(60)); + + assertTrue(dba.getBit(35)); + dba.assignBit(35, false); + assertFalse(dba.getBit(35)); + dba.assignBit(35, true); + assertTrue(dba.getBit(35)); + } + + @Test + public void getBitsFromToTest() { + final int numBits = 128; + final WritableMemory wmem = WritableMemory.writableWrap(new byte[32]); + final DirectBitArray dba = DirectBitArray.initialize(numBits, wmem); + + // single, full long test + dba.setBit(0); // useless but forces non-empty when using setLong() + dba.setLong(0, 0x5555555555555555L); + assertEquals(dba.getBits(0, 64), 0x5555555555555555L); + assertEquals(dba.getBits(64, 64), 0); + + // subset of single long, mostly ones with a stretch of zeros + dba.setLong(1, 0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(64, 64), 
0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(78, 12), 0); + assertEquals(dba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(dba.getBits(60, 20), 0x3FFF5); + } + + @Test + public void setBitsFromToTest() { + final int numBits = 128; + WritableMemory wmem = WritableMemory.writableWrap(new byte[32]); + DirectBitArray ba = DirectBitArray.initialize(numBits, wmem); + + // within a single long + ba.setBits(0, 64, 0x80000000DAB8C730L); + assertEquals(ba.getLong(0), 0x80000000DAB8C730L); + assertEquals(ba.getLong(1), 0); + + ba.setBits(40, 8, 0xA6); + assertEquals(ba.getLong(0), 0x8000A600DAB8C730L); + + // spanning longs + ba.setBits(60, 20, 0x3FFF5); + assertEquals(ba.getLong(0), 0x5000A600DAB8C730L); + assertEquals(ba.getLong(1), 0x3FFFL); + + // found specific failure with this test + wmem = WritableMemory.writableWrap(new byte[1272]); + ba = DirectBitArray.initialize(10000, wmem); + ba.setBits(601 * 10 + 3, 7, 125); + assertEquals(ba.getBits(601 * 10 + 3, 7), 125); } @Test diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java similarity index 79% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java rename to src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java index 0e91788ea..a55f98a30 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -75,9 +75,62 @@ public void basicOperationTest() { assertTrue(ba.isEmpty()); assertEquals(ba.getNumBitsSet(), 0); + ba.setLong(0, -1); + assertTrue(ba.getBit(60)); + ba.clearBit(60); + assertFalse(ba.getBit(60)); + + assertTrue(ba.getBit(35)); + ba.assignBit(35, false); + assertFalse(ba.getBit(35)); + ba.assignBit(35, true); + assertTrue(ba.getBit(35)); + assertTrue(String.valueOf(ba).length() > 0); } + @Test + public void getBitsFromToTest() { + final HeapBitArray ba = new HeapBitArray(128); + + // single, full long test + ba.setLong(0, 0x5555555555555555L); + assertEquals(ba.getBits(0, 64), 0x5555555555555555L); + assertEquals(ba.getBits(64, 64), 0); + + // subset of single long, mostly ones with a stretch of zeros + ba.setLong(1, 0xFFFFFFFFFC003FFFL); + assertEquals(ba.getBits(64, 64), 0xFFFFFFFFFC003FFFL); + assertEquals(ba.getBits(78, 12), 0); + assertEquals(ba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(ba.getBits(60, 20), 0x3FFF5); + } + + @Test + public void setBitsFromToTest() { + HeapBitArray ba = new HeapBitArray(128); + + // within a single long + ba.setBits(0, 64, 0x80000000DAB8C730L); + assertEquals(ba.getLong(0), 0x80000000DAB8C730L); + assertEquals(ba.getLong(1), 0); + + ba.setBits(40, 8, 0xA6); + assertEquals(ba.getLong(0), 0x8000A600DAB8C730L); + + // spanning longs + ba.setBits(60, 20, 0x3FFF5); + assertEquals(ba.getLong(0), 0x5000A600DAB8C730L); + assertEquals(ba.getLong(1), 0x3FFFL); + + // found specific failure with this test + ba = new HeapBitArray(10000); + ba.setBits(601 * 10 + 3, 7, 125); + assertEquals(ba.getBits(601 * 10 + 3, 7), 125); + } + @Test public void bitAddresOutOfBoundsTest() { final HeapBitArray ba = new HeapBitArray(1024); diff --git 
a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java deleted file mode 100644 index 487e36576..000000000 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.filters.quotientfilter; - -import org.testng.annotations.Test; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.assertFalse; - -public class BitVectorTests { - - /** - * This test method initializes a QuickBitVectorWrapper with various combinations of bits per entry and number of entries. - * It then calculates the expected length of the bit vector and asserts that the actual size of the bit vector matches the expected length. - * - * Example Input-Output Pairs: - * 1. Input: bitsPerEntry = 2, numEntries = 8 (1L << 3) - * Output: expectedLengthBits = 64 - * - * 2. Input: bitsPerEntry = 3, numEntries = 16 (1L << 4) - * Output: 64 - * - * 3. 
Input: bitsPerEntry = 33, numEntries = 8 (1L << 3) - * Output: expectedLengthBits = 320 - */ - @Test - static public void testSize(){ - int[] bitsPerEntry = {2, 3, 4, 5, 6, 7, 8, 9, 10, 23, 24, 25, 31, 32, 33}; - long[] numEntries = {1L << 3, 1L<<4, 1L<<8, 1L << 16}; - long nBits ; - long expectedLengthBits ; - - for (int i = 0; i < bitsPerEntry.length; i++){ - for (int j = 0; j < numEntries.length; j++) { - QuickBitVectorWrapper bv = new QuickBitVectorWrapper(bitsPerEntry[i], numEntries[j]); - nBits = bitsPerEntry[i] * numEntries[j]; - expectedLengthBits = 64 * ((nBits % 64 == 0) ? (nBits / 64) : (1 + nBits / 64)); - assertEquals(bv.size(), expectedLengthBits); - } - } - } - - /* - This test amends a few entries in the BitVector and checks that they are appropriately set. - */ - @Test - static public void testSettersAndGetters(){ - QuickBitVectorWrapper bv = new QuickBitVectorWrapper(6, 16); - - // All entries should be False before any updates - for (int i = 0; i < bv.size(); i++){ - assertFalse(bv.get(i), "All entries should be False"); - } - - // Set some values - bv.set(0, true); - assertTrue(bv.get(0), "Value at index 0 should be True"); - - bv.set(32, true) ; - assertTrue(bv.get(32), "Value at index 32 should be True"); - - bv.setFromTo(64, 128, ~0L); - assertTrue(bv.getFromTo(64, 128) == -1L, "Values from 64 to 128 should be set to 1") ; - } -} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index c30d7317e..884914757 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -22,12 +22,20 @@ import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertEquals; +import java.util.ArrayList; import java.util.BitSet; import java.util.HashSet; import java.util.Random; public 
class QuotientFilterTest { + // this method had been in Bitmap, but was used only to test the QuotientFilter + public static boolean get_fingerprint_bit(long index, long fingerprint) { + long mask = 1 << index; + long and = fingerprint & mask; + return and != 0; + } + /* * This test is based on the example from https://en.wikipedia.org/wiki/Quotient_filter * in "Algorithm Description" section. @@ -36,7 +44,7 @@ public class QuotientFilterTest { * (b,1), (e,4), (f, 7), (c,1), (d,2), (a,1) */ @Test - static public void WikiInsertionTest() { + public void WikiInsertionTest() { int bits_per_entry = 6; // 6 bits per entry => 3 bits fingerprint, resolved internally in the filter. int num_entries_power = 3; QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); @@ -74,7 +82,7 @@ static public void WikiInsertionTest() { assertEquals(qf.get_fingerprint(7), F); } - static public int getState(QuotientFilter filter, int slot) { + public int getState(QuotientFilter filter, int slot) { return (filter.is_occupied(slot) ? 1 : 0) << 2 | (filter.is_continuation(slot) ? 1 : 0) << 1 | (filter.is_shifted(slot) ? 1 : 0); @@ -85,7 +93,7 @@ static public int getState(QuotientFilter filter, int slot) { * It performs the same insertions as in Figure 2 and checks for the same result. 
*/ @Test - static public void PaperInsertionTest() { + public void PaperInsertionTest() { int bits_per_entry = 8; int num_entries_power = 4; int num_entries = (int)Math.pow(2, num_entries_power); @@ -116,7 +124,7 @@ static public void PaperInsertionTest() { // test we don't get any false negatives for quotient filter @Test - static public void FalseNegativeTest() { + public void FalseNegativeTest() { int bits_per_entry = 10; int num_entries_power = 10; QuotientFilter filter = new QuotientFilter(num_entries_power, bits_per_entry); @@ -130,7 +138,7 @@ static public void FalseNegativeTest() { * Checks this can be handled by the internal data structure and then deletes one of the keys from the filter. */ @Test - static public void OverflowTest() { + public void OverflowTest() { int bits_per_entry = 8; int num_entries_power = 3; int num_entries = (int)Math.pow(2, num_entries_power); @@ -158,7 +166,7 @@ static public void OverflowTest() { * and the program exits, indicating a test failure. */ @Test - static public void testQuotientFilterInsertionAndIteration() { + public void testQuotientFilterInsertionAndIteration() { int bits_per_entry = 8; int num_entries_power = 4; @@ -166,22 +174,24 @@ static public void testQuotientFilterInsertionAndIteration() { //int fingerprint_size = bits_per_entry - 3; QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - qf.insert(0, 2, false); - qf.insert(0, 3, false); - qf.insert(0, 3, false); - qf.insert(0, 4, false); - qf.insert(0, 15, false); // last slot in the filter - qf.insert(0, 16, false); // outside the bounds + qf.insert(0x1F, 0, false); + qf.insert(0x1F, 2, false); + qf.insert(0x1F, 3, false); + qf.insert(0x1F, 3, false); + qf.insert(0x1F, 4, false); + qf.insert(0x1F, 15, false); // last slot in the filter + qf.insert(0x1F, 16, false); // outside the bounds qf.pretty_print() ; Iterator it = new Iterator(qf); - int[] arr = new int[] {2, 3, 3, 4, 15}; + //int[] arr = new int[] {2, 3, 3, 4, 15}; + int[] arr = 
new int[] {0, 2, 3, 3, 4, 15}; int arr_index = 0; while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} } @Test - static public void testQuotientFilterIterator() { + public void testQuotientFilterIterator() { int bits_per_entry = 8; int num_entries_power = 4; @@ -225,7 +235,7 @@ static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slo result.set(index++, is_continuation); result.set(index++, is_shifted); for (int i = 0; i < bits_per_entry - 3; i++) { - result.set(index++, Bitmap.get_fingerprint_bit(i, fingerprint) ); + result.set(index++, get_fingerprint_bit(i, fingerprint) ); } return result; } @@ -256,7 +266,8 @@ static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check Helper function to test that no false negatives are returned. */ static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { - HashSet added = new HashSet(); + //HashSet added = new HashSet(); + ArrayList added = new ArrayList(); int seed = 5; Random rand = new Random(seed); @@ -274,7 +285,7 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent for (Integer i : added) { boolean found = filter.search((long)i); if (!found) { - return false ; + return false; } } return true; From f7c7bd0dd6e8538db417b6a3603857de3dd13870 Mon Sep 17 00:00:00 2001 From: jmalkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 31 May 2024 11:29:57 -0700 Subject: [PATCH 21/38] Cleanup based on review feedback --- .../org/apache/datasketches/filters/common/BitArray.java | 5 ++++- src/main/java/org/apache/datasketches/theta/Sketches.java | 2 +- .../filters/quotientfilter/QuotientFilterTest.java | 7 ++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/common/BitArray.java b/src/main/java/org/apache/datasketches/filters/common/BitArray.java index 2fd2a49e1..8320a369f 100644 --- 
a/src/main/java/org/apache/datasketches/filters/common/BitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/BitArray.java @@ -27,7 +27,10 @@ import org.apache.datasketches.memory.WritableMemory; /** - * This class holds an array of bits suitable for use in a Bloom Filter + * This class holds an array of bits and should be suitable for use in + * the various membership filters. The representation is not compressed and + * is designed to fit in a single array, meaning that the maximum number + * of bits is limited by the maximum size of an array of longs in Java. * *

Rounds the number of bits up to the smallest multiple of 64 (one long) * that is not smaller than the specified number. diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java index 4b1461876..c204751f2 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ b/src/main/java/org/apache/datasketches/theta/Sketches.java @@ -80,7 +80,7 @@ public static int getMaxAnotBResultBytes(final int maxNomEntries) { /** * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. Note that this assumes the worse case of the sketch in + * number of actual entries. Note that this assumes the worst case of the sketch in * estimation mode, which requires storing theta and count. * @param numberOfEntries the actual number of entries stored with the CompactSketch. * @return the maximum number of storage bytes required for a CompactSketch with the given number diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 884914757..64f92ddc8 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -174,7 +174,6 @@ public void testQuotientFilterInsertionAndIteration() { //int fingerprint_size = bits_per_entry - 3; QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - qf.insert(0x1F, 0, false); qf.insert(0x1F, 2, false); qf.insert(0x1F, 3, false); qf.insert(0x1F, 3, false); @@ -184,8 +183,7 @@ public void testQuotientFilterInsertionAndIteration() { qf.pretty_print() ; Iterator it = new Iterator(qf); - //int[] arr = new int[] {2, 3, 3, 4, 15}; - int[] arr = new int[] {0, 2, 3, 3, 4, 15}; + int[] arr = new int[] {2, 3, 3, 4, 15}; int arr_index = 0; while 
(it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} } @@ -266,8 +264,7 @@ static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check Helper function to test that no false negatives are returned. */ static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { - //HashSet added = new HashSet(); - ArrayList added = new ArrayList(); + HashSet added = new HashSet(); int seed = 5; Random rand = new Random(seed); From 16dbc167f7abb86f293e604ea2da84ace1693988 Mon Sep 17 00:00:00 2001 From: jmalkin <786705+jmalkin@users.noreply.github.com> Date: Fri, 31 May 2024 13:38:10 -0700 Subject: [PATCH 22/38] slightly simplify mask logic in bitarray --- .../datasketches/filters/common/DirectBitArray.java | 6 +++--- .../datasketches/filters/common/DirectBitArrayR.java | 2 +- .../apache/datasketches/filters/common/HeapBitArray.java | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java index ac1d6eaf3..25521672e 100644 --- a/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java @@ -180,13 +180,13 @@ public void setBits(final long index, final int numBits, final long bits) { // spans longs, need to set bits in two longs final long splitBit = Long.SIZE - (fromOffset); - final long fromMask = -1L - ((1L << fromOffset) - 1); + final long fromMask = (1L << fromOffset) - 1; // inverse mask in this case final long toMask = (1L << (toOffset + 1)) - 1; - final long maskedFromVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & ~fromMask; + final long maskedFromVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & fromMask; final long maskedToVal = wmem_.getLong(DATA_OFFSET + (toIndex << 3)) & ~toMask; - wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedFromVal | ((bits << 
fromOffset) & fromMask)); + wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedFromVal | ((bits << fromOffset) & ~fromMask)); wmem_.putLong(DATA_OFFSET + (toIndex << 3), maskedToVal | ((bits >>> splitBit) & toMask)); } diff --git a/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java index e446ae6ea..6d0d4bad3 100644 --- a/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java @@ -128,7 +128,7 @@ public long getBits(final long index, final int numBits) { // spans longs, need to combine bits from two longs final long splitBit = Long.SIZE - (fromOffset); - final long fromMask = -1L - ((1L << fromOffset) - 1); + final long fromMask = ~((1L << fromOffset) - 1); final long toMask = (1L << (toOffset + 1)) - 1; long result = (wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & fromMask) >>> fromOffset; diff --git a/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java index 184cc83c1..ca81ae073 100644 --- a/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java @@ -133,7 +133,7 @@ public long getBits(final long index, final int numBits) { // spans longs, need to combine bits from two longs final long splitBit = Long.SIZE - (fromOffset); - final long fromMask = -1L - ((1L << fromOffset) - 1); + final long fromMask = ~((1L << fromOffset) - 1); final long toMask = (1L << (toOffset + 1)) - 1; long result = (data_[fromIndex] & fromMask) >>> fromOffset; @@ -193,11 +193,11 @@ public void setBits(final long index, final int numBits, final long bits) { // spans longs, need to set bits in two longs final long splitBit = Long.SIZE - (fromOffset); - final long fromMask = -1L - ((1L << fromOffset) - 1); + final long 
fromMask = (1L << fromOffset) - 1; // inverse mask in this case final long toMask = (1L << (toOffset + 1)) - 1; - data_[fromIndex] = (data_[fromIndex] & ~fromMask) | ((bits << fromOffset) & fromMask); - data_[toIndex] = (data_[toIndex] & ~toMask) | ((bits >>> splitBit) & toMask); + data_[fromIndex] = (data_[fromIndex] & fromMask) | ((bits << fromOffset) & ~fromMask); + data_[toIndex] = (data_[toIndex] & ~toMask) | ((bits >>> splitBit) & toMask); } // returns existing value of bit From 2fd83132c5b74a60ae07018f5ce161dc06fe6fc4 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Mon, 3 Jun 2024 13:40:21 -0700 Subject: [PATCH 23/38] no insertions of duplicates --- .../filters/quotientfilter/Filter.java | 6 +- .../quotientfilter/QuotientFilter.java | 41 +++---- .../filters/quotientfilter/DeletionTests.java | 96 ++++++++------- .../quotientfilter/QuotientFilterTest.java | 114 ++++++++++-------- 4 files changed, 127 insertions(+), 130 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java index 8f0796738..bfb1ad25d 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -34,7 +34,7 @@ public abstract class Filter { //abstract boolean rejuvenate(long key); //abstract boolean expand(); //protected abstract boolean _delete(long large_hash); - abstract protected boolean _insert(long large_hash, boolean insert_only_if_no_match); + abstract protected boolean _insert(long large_hash); abstract protected boolean _search(long large_hash); @@ -53,11 +53,11 @@ public abstract class Filter { // return _delete(HashFunctions.xxhash(input_buffer)); // } // - public boolean insert(long input, boolean insert_only_if_no_match) { + public boolean insert(long input) { //System.out.println("The ABC input is " + input); long hash = get_hash(input); 
//System.out.println("The ABC hash is " + hash); - return _insert(hash, insert_only_if_no_match); + return _insert(hash); } // // public boolean insert(String input, boolean insert_only_if_no_match) { diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 40406dc4e..e0a442fc5 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -400,30 +400,19 @@ boolean insert_new_run(long canonical_slot, long long_fp) { return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false); } - boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { - //System.out.println("Inserting Fingerprint " + long_fp); - //System.out.println("Inserting @ index " + index); - //System.out.println("BoolMatch? " + insert_only_if_no_match); - //System.out.println("**********"); - //System.out.println("Num items: " + num_entries); - //System.out.println("Max items: " + max_entries_before_expansion); - - if (index >= get_num_slots() || num_entries == get_num_slots()) { - return false; - } - boolean does_run_exist = is_occupied(index); - if (!does_run_exist) { - return insert_new_run(index, long_fp); - } - - long run_start_index = find_run_start(index); - if (insert_only_if_no_match) { - long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); - if (found_index > -1) { - return false; - } - } - return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); + boolean insert(long long_fp, long index) { + if (index >= get_num_slots() || num_entries == get_num_slots()) { + return false; + } + if (!is_occupied(index)) { + return insert_new_run(index, long_fp); + } + long run_start_index = find_run_start(index); + final long found_index = find_first_fingerprint_in_run(run_start_index, 
long_fp); + if (found_index > -1) { + return false; + } + return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); } // insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right. @@ -601,7 +590,7 @@ long gen_fingerprint(long large_hash) { Hence, the `large_hash` argument is already a hash key that has been generated by the hashing library (eg xxhash). */ - protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { + protected boolean _insert(long large_hash) { //System.out.println("Inserting long hash " + large_hash); if (is_full) { return false; @@ -615,7 +604,7 @@ protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { System.out.println(slot_index + " " + fingerprint ); System.out.println(); */ - boolean success = insert(fingerprint, slot_index, false); + boolean success = insert(fingerprint, slot_index); /*if (!success) { System.out.println("insertion failure"); System.out.println(input + "\t" + slot_index + "\t" + get_fingerprint_str(fingerprint, fingerprintLength)); diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 1ed94b8a9..6e1beb9f2 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -45,13 +45,13 @@ static public void BasicDeletions() { long fp3 = 1 << 2; long fp4 = 31; - qf.insert(fp4, 1, false); - qf.insert(fp1, 1, false); - qf.insert(fp1, 1, false); - qf.insert(fp2, 2, false); - qf.insert(fp1, 1, false); - qf.insert(fp1, 1, false); - qf.insert(fp3, 4, false); + qf.insert(fp4, 1); + qf.insert(fp1, 1); + qf.insert(fp1, 1); + qf.insert(fp2, 2); + qf.insert(fp1, 1); + qf.insert(fp1, 1); + qf.insert(fp3, 4); qf.delete(31, 1); @@ -75,39 +75,36 @@ static public void BasicDeletions() { * The 
expected outcome is that after deletion, the remaining keys should be in their canonical slots. */ @Test - static public void DeletionsWithSameFingerprint() { + static public void Deletions() { int bits_per_entry = 8; int num_entries_power = 3; int num_entries = (int)Math.pow(2, num_entries_power); QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - - // All keys have the same fingerprint but are mapped into (mostly) different slots - qf.insert(0, 1, false); - qf.insert(0, 1, false); - qf.insert(0, 2, false); - qf.insert(0, 2, false); - qf.insert(0, 3, false); - qf.insert(0, 3, false); - qf.insert(0, 3, false); - qf.insert(0, 6, false); - qf.insert(0, 6, false); // these are ignored - qf.insert(0, 6, false); - qf.insert(0, 7, false); - - qf.delete(0, 2); - qf.delete(0, 3); + qf.insert(1, 1); + qf.insert(2, 1); + qf.insert(3, 2); + qf.insert(4, 2); + qf.insert(5, 3); + qf.insert(6, 3); + qf.insert(7, 3); + qf.insert(8, 6); + qf.insert(9, 6); // these are ignored + qf.insert(10, 6); + qf.insert(11, 7); + + qf.delete(3, 2); + qf.delete(5, 3); BitSet result = new BitSet(num_entries * bits_per_entry); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, true, false, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 1); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, 
true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, true, false, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 8); result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, false, 0); -// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); -// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } @@ -123,33 +120,34 @@ static public void DeletionsWithSameFingerprint() { * * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. */ - static public void DeletionsWithOverflow() { + static public void DeletionsWithWrap() { int bits_per_entry = 8; int num_entries_power = 3; int num_entries = (int)Math.pow(2, num_entries_power); QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - qf.insert(0, 1, false); - qf.insert(0, 1, false); - qf.insert(0, 2, false); - qf.insert(0, 2, false); - qf.insert(0, 3, false); - qf.insert(0, 4, false); - qf.insert(0, 4, false); - qf.insert(0, 5, false); + qf.insert(1, 1); + qf.insert(2, 1); + qf.insert(3, 2); + qf.insert(4, 2); + qf.insert(5, 3); + qf.insert(6, 4); + qf.insert(7, 4); + qf.insert(8, 5); //qf.pretty_print(); - qf.delete(0, 3); + qf.delete(5, 3); //qf.pretty_print(); BitSet result = new BitSet(num_entries * bits_per_entry); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, false, 
false, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, true, false, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, false, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 1); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, false, false, true, 3); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, true, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, true, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, true, 8); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 64f92ddc8..4f6486eae 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -56,12 +56,12 @@ public void WikiInsertionTest() { final int E = 5; final int F = 6; - qf.insert(B, 1, false); - qf.insert(E, 4, false); - qf.insert(F, 7, false); - qf.insert(C, 1, false); - qf.insert(D, 2, false); - qf.insert(A, 1, false); + qf.insert(B, 1); + qf.insert(E, 4); + qf.insert(F, 7); + qf.insert(C, 1); + qf.insert(D, 2); + 
qf.insert(A, 1); assertEquals(qf.get_num_entries(), 6); assertEquals(getState(qf, 0), 0); @@ -99,26 +99,35 @@ public void PaperInsertionTest() { int num_entries = (int)Math.pow(2, num_entries_power); QuotientFilter qf = new QuotientFilter(4, 8); - // (key, slot): {(a, 1), (b,1), (c ,3), (d, 3), (e, 3), (f, 4), (g, 6), (h, 6)} - qf.insert(0, 1, false); - qf.insert(0, 1, false); - qf.insert(0, 3, false); - qf.insert(0, 3, false); - qf.insert(0, 3, false); - qf.insert(0, 4, false); - qf.insert(0, 6, false); - qf.insert(0, 6, false); + final int A = 1; + final int B = 2; + final int C = 3; + final int D = 4; + final int E = 5; + final int F = 6; + final int G = 7; + final int H = 8; + + // (key, slot): {(a, 1), (b, 1), (c, 3), (d, 3), (e, 3), (f, 4), (g, 6), (h, 6)} + qf.insert(A, 1); + qf.insert(B, 1); + qf.insert(C, 3); + qf.insert(D, 3); + qf.insert(E, 3); + qf.insert(F, 4); + qf.insert(G, 6); + qf.insert(H, 6); BitSet result = new BitSet(num_entries * bits_per_entry); result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); - result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, 0); - result = set_slot_in_test(result, bits_per_entry, 2, false, true, true, 0); - result = set_slot_in_test(result, bits_per_entry, 3, true, false, false, 0); - result = set_slot_in_test(result, bits_per_entry, 4, true, true, true, 0); - result = set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); - result = set_slot_in_test(result, bits_per_entry, 6, true, false, true, 0); - result = set_slot_in_test(result, bits_per_entry, 7, false, false, true, 0); - result = set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); + result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, A); + result = set_slot_in_test(result, bits_per_entry, 2, false, true, true, B); + result = set_slot_in_test(result, bits_per_entry, 3, true, false, false, C); + result = set_slot_in_test(result, bits_per_entry, 4, true, true, true, D); 
+ result = set_slot_in_test(result, bits_per_entry, 5, false, true, true, E); + result = set_slot_in_test(result, bits_per_entry, 6, true, false, true, F); + result = set_slot_in_test(result, bits_per_entry, 7, false, false, true, G); + result = set_slot_in_test(result, bits_per_entry, 8, false, true, true, H); assertTrue(check_equality(qf, result, false)); } @@ -139,23 +148,24 @@ public void FalseNegativeTest() { */ @Test public void OverflowTest() { - int bits_per_entry = 8; - int num_entries_power = 3; - int num_entries = (int)Math.pow(2, num_entries_power); - int fingerprint_size = bits_per_entry - 3; - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - - long fp2 = 1 << fingerprint_size - 1; - qf.insert(fp2, num_entries - 1, false); - assertEquals(qf.get_fingerprint(7), fp2); - assertEquals(getState(qf, 7), 0b100); - qf.insert(fp2, num_entries - 1, false); + final int bits_per_entry = 8; + final int num_entries_power = 3; + final int num_entries = (int)Math.pow(2, num_entries_power); + final int fingerprint_size = bits_per_entry - 3; + final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + + final long fp1 = 1; + final long fp2 = 1 << fingerprint_size - 1; + qf.insert(fp1, num_entries - 1); + assertEquals(qf.get_fingerprint(num_entries - 1), fp1); + assertEquals(getState(qf, num_entries - 1), 0b100); + qf.insert(fp2, num_entries - 1); assertEquals(qf.get_fingerprint(0), fp2); assertEquals(getState(qf, 0), 0b011); qf.delete(fp2, num_entries - 1); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 0), 0b000); - boolean found = qf.search(fp2, num_entries - 1); + final boolean found = qf.search(fp1, num_entries - 1); assertTrue(found); } @@ -174,16 +184,16 @@ public void testQuotientFilterInsertionAndIteration() { //int fingerprint_size = bits_per_entry - 3; QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - qf.insert(0x1F, 2, false); - qf.insert(0x1F, 3, false); - 
qf.insert(0x1F, 3, false); - qf.insert(0x1F, 4, false); - qf.insert(0x1F, 15, false); // last slot in the filter - qf.insert(0x1F, 16, false); // outside the bounds + qf.insert(0x1F, 2); + qf.insert(0x1F, 3); + qf.insert(0x1F, 3); + qf.insert(0x1F, 4); + qf.insert(0x1F, 15); // last slot in the filter + qf.insert(0x1F, 16); // outside the bounds qf.pretty_print() ; Iterator it = new Iterator(qf); - int[] arr = new int[] {2, 3, 3, 4, 15}; + int[] arr = new int[] {2, 3, 4, 15}; int arr_index = 0; while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} } @@ -195,16 +205,16 @@ public void testQuotientFilterIterator() { int num_entries_power = 4; QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - qf.insert(0, 1, false); - qf.insert(0, 4, false); - qf.insert(0, 7, false); - qf.insert(0, 1, false); - qf.insert(0, 2, false); - qf.insert(0, 1, false); - qf.insert(0, 15, false); + qf.insert(0, 1); + qf.insert(0, 4); + qf.insert(0, 7); + qf.insert(0, 1); + qf.insert(0, 2); + qf.insert(0, 1); + qf.insert(0, 15); Iterator it = new Iterator(qf); - int[] arr = new int[] {1, 1, 1, 2, 4, 7, 15}; + int[] arr = new int[] {1, 2, 4, 7, 15}; int arr_index = 0; while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} } @@ -270,7 +280,7 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent for (int i = 0; i < num_entries; i++) { int rand_num = rand.nextInt(); - boolean success = filter.insert(rand_num, false); + boolean success = filter.insert(rand_num); if (success) { added.add(rand_num); } @@ -279,7 +289,7 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent } } - for (Integer i : added) { + for (Integer i: added) { boolean found = filter.search((long)i); if (!found) { return false; From 5aa9191fe7dc6def62ed784c3f203e64852703d1 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 4 Jun 2024 10:11:48 -0700 Subject: [PATCH 24/38] ordered runs, simplified insertion code 
--- .../quotientfilter/QuotientFilter.java | 116 +++++++----------- .../quotientfilter/QuotientFilterTest.java | 6 +- 2 files changed, 50 insertions(+), 72 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index e0a442fc5..66b914111 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -312,16 +312,19 @@ long find_run_start(long index) { } // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative long find_first_fingerprint_in_run(long index, long fingerprint) { - assert(!is_continuation(index)); - do { - if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); - return index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return -1; + assert(!is_continuation(index)); + do { + final long fingerprintAtIndex = get_fingerprint(index); + if (fingerprintAtIndex == fingerprint) { + return index; + } else if (fingerprintAtIndex > fingerprint) { + return ~index; + } + index = (index + 1) & getMask(); + } while (is_continuation(index)); + return ~index; } // delete the last matching fingerprint in the run @@ -354,7 +357,7 @@ boolean search(long fingerprint, long index) { } long run_start_index = find_run_start(index); long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); - return found_index > -1; + return found_index >= 0; } // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. @@ -373,74 +376,49 @@ Set get_all_fingerprints(long bucket_index) { return set; } - // Swaps the fingerprint in a given slot with a new one. 
Return the pre-existing fingerprint - long swap_fingerprints(long index, long new_fingerprint) { - long existing = get_fingerprint(index); - set_fingerprint(index, new_fingerprint); - return existing; - } - - boolean insert_new_run(long canonical_slot, long long_fp) { - long start_of_this_new_run = find_run_start(canonical_slot); - boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); - - // modify some metadata flags to mark the new run - set_occupied(canonical_slot, true); - if (start_of_this_new_run != canonical_slot) { - set_shifted(start_of_this_new_run, true); - } - set_continuation(start_of_this_new_run, false); - - // if the slot was initially empty, we can just terminate, as there is nothing to push to the right - if (slot_initially_empty) { - set_fingerprint(start_of_this_new_run, long_fp); - num_entries++; - return true; - } - return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false); - } - - boolean insert(long long_fp, long index) { + boolean insert(long fingerprint, long index) { if (index >= get_num_slots() || num_entries == get_num_slots()) { return false; } + final long run_start = find_run_start(index); if (!is_occupied(index)) { - return insert_new_run(index, long_fp); + return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true); } - long run_start_index = find_run_start(index); - final long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); - if (found_index > -1) { + final long found_index = find_first_fingerprint_in_run(run_start, fingerprint); + if (found_index >= 0) { return false; } - return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); + return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start); } - // insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right. 
- boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index, boolean is_same_run) { - long current_index = run_start_index; - boolean is_this_slot_empty; - boolean temp_continuation = false; - - do { - is_this_slot_empty = is_slot_empty(current_index); - if (current_index != run_start_index) { - set_shifted(current_index, true); - } - if (current_index != run_start_index && is_same_run && !is_continuation(current_index)) { - is_same_run = false; - set_continuation(current_index, true); - long_fp = swap_fingerprints(current_index, long_fp); - } - else if (!is_same_run) { - boolean current_continuation = is_continuation(current_index); - set_continuation(current_index, temp_continuation); - temp_continuation = current_continuation; - long_fp = swap_fingerprints(current_index, long_fp); - } - current_index = (current_index + 1) & getMask(); - } while (!is_this_slot_empty); - num_entries++; - return true; + boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { + boolean existing_is_continuation = is_continuation(index); + boolean is_continuation = !is_run_start; + boolean is_shifted = index != canonical; + boolean force_continuation = !is_new_run && is_run_start; + boolean existing_is_empty = is_slot_empty(index); + long existing_fingerprint = get_fingerprint(index); + while (!existing_is_empty) { + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + fingerprint = existing_fingerprint; + is_continuation = existing_is_continuation | force_continuation; + is_shifted = true; + index = (index + 1) & getMask(); + existing_fingerprint = get_fingerprint(index); + existing_is_continuation = is_continuation(index); + existing_is_empty = is_slot_empty(index); + force_continuation = false; + } + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + if 
(is_new_run) { + set_occupied(canonical, true); + } + num_entries++; + return true; } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 4f6486eae..44e9833d2 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -67,11 +67,11 @@ public void WikiInsertionTest() { assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example + assertEquals(qf.get_fingerprint(1), A); // this run is not ordered, which is different from Wikipedia example assertEquals(getState(qf, 2), 0b111); - assertEquals(qf.get_fingerprint(2), C); + assertEquals(qf.get_fingerprint(2), B); assertEquals(getState(qf, 3), 0b011); - assertEquals(qf.get_fingerprint(3), A); + assertEquals(qf.get_fingerprint(3), C); assertEquals(getState(qf, 4), 0b101); assertEquals(qf.get_fingerprint(4), D); assertEquals(getState(qf, 5), 0b001); From f49c7a09c6a3ca4eea0fd3b6540688c4916db9ac Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 4 Jun 2024 11:57:03 -0700 Subject: [PATCH 25/38] removed comment that is no longer valid --- .../datasketches/filters/quotientfilter/QuotientFilterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 44e9833d2..4f68cb1f4 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ 
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -67,7 +67,7 @@ public void WikiInsertionTest() { assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), A); // this run is not ordered, which is different from Wikipedia example + assertEquals(qf.get_fingerprint(1), A); assertEquals(getState(qf, 2), 0b111); assertEquals(qf.get_fingerprint(2), B); assertEquals(getState(qf, 3), 0b011); From 3d60bced8fa565a2a6b43c59a563458785c5ee72 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 5 Jun 2024 11:31:23 -0700 Subject: [PATCH 26/38] commented shifting the rest of the cluster after the insertion point --- .../quotientfilter/QuotientFilter.java | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 66b914111..9489a36f2 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -392,28 +392,47 @@ boolean insert(long fingerprint, long index) { } boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { - boolean existing_is_continuation = is_continuation(index); + // in the first shifted entry set is_continuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean force_continuation = !is_new_run && is_run_start; + + // prepare flags for the current slot boolean is_continuation = !is_run_start; boolean is_shifted = index != canonical; - boolean force_continuation = !is_new_run && is_run_start; - boolean existing_is_empty = is_slot_empty(index); + + // store the existing entry 
in the current slot to be shifted to the next slot + // is_occupied flag belongs to the slot, therefore it is never shifted + // is_shifted flag is always true for all shifted entries, no need to store it long existing_fingerprint = get_fingerprint(index); + boolean existing_is_continuation = is_continuation(index); + boolean existing_is_empty = is_slot_empty(index); + while (!existing_is_empty) { + // set the current slot set_fingerprint(index, fingerprint); set_continuation(index, is_continuation); set_shifted(index, is_shifted); + + // prepare values for the next slot fingerprint = existing_fingerprint; is_continuation = existing_is_continuation | force_continuation; is_shifted = true; + index = (index + 1) & getMask(); + + // save the existing entry to be shifted existing_fingerprint = get_fingerprint(index); existing_is_continuation = is_continuation(index); existing_is_empty = is_slot_empty(index); - force_continuation = false; + + force_continuation = false; // this is needed for the first shift only } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one set_fingerprint(index, fingerprint); set_continuation(index, is_continuation); set_shifted(index, is_shifted); + if (is_new_run) { set_occupied(canonical, true); } From 1ea45c08a35b8a5dbb90de65711f73cb515ed0f9 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 5 Jun 2024 11:38:10 -0700 Subject: [PATCH 27/38] better wording --- .../datasketches/filters/quotientfilter/QuotientFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9489a36f2..d90f243c6 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ 
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -400,9 +400,9 @@ boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long boolean is_continuation = !is_run_start; boolean is_shifted = index != canonical; - // store the existing entry in the current slot to be shifted to the next slot + // remember the existing entry from the current slot to be shifted to the next slot // is_occupied flag belongs to the slot, therefore it is never shifted - // is_shifted flag is always true for all shifted entries, no need to store it + // is_shifted flag is always true for all shifted entries, no need to remember it long existing_fingerprint = get_fingerprint(index); boolean existing_is_continuation = is_continuation(index); boolean existing_is_empty = is_slot_empty(index); From 078c1f42319887ded729db9daf6675a06315687e Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 12 Jun 2024 17:07:56 -0700 Subject: [PATCH 28/38] style changes --- .../filters/quotientfilter/Filter.java | 2 +- .../filters/quotientfilter/Iterator.java | 10 +- .../quotientfilter/QuotientFilter.java | 1315 ++++++++--------- .../quotientfilter/QuotientFilterTest.java | 36 +- 4 files changed, 610 insertions(+), 753 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java index bfb1ad25d..53dfd1c4b 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -99,7 +99,7 @@ long get_hash(long input) { return XxHash.hashLong(input, 0L) ; // CD edit for datasketches hash function using same seed. 
} - public long get_space_use() { return 0 ; } + public long getSpaceUse() { return 0 ; } // public int get_bits_per_entry() { return 0 ; } // // public abstract long get_num_entries(boolean include_all_internal_filters); diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java index 6b76a1789..e04e6cd12 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -49,22 +49,22 @@ void clear() { boolean next() { - if (index == qf.get_num_slots()) { + if (index == qf.getNumSlots()) { return false; } - long slot = qf.get_slot(index); + long slot = qf.getSlot(index); boolean occupied = (slot & 1) != 0; boolean continuation = (slot & 2) != 0; boolean shifted = (slot & 4) != 0; - while (!occupied && !continuation && !shifted && index < qf.get_num_slots()) { + while (!occupied && !continuation && !shifted && index < qf.getNumSlots()) { index++; - if (index == qf.get_num_slots()) { + if (index == qf.getNumSlots()) { return false; } - slot = qf.get_slot(index); + slot = qf.getSlot(index); occupied = (slot & 1) != 0; continuation = (slot & 2) != 0; shifted = (slot & 4) != 0; diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index d90f243c6..bd8c2abd9 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -28,756 +28,613 @@ public class QuotientFilter extends Filter { - int bitPerEntry; - int fingerprintLength; - int power_of_two_size; - int num_entries; - BitArray filter; - - double expansion_threshold; - long max_entries_before_expansion; - boolean expand_autonomously; - boolean is_full; - - // statistics, 
computed in the compute_statistics method. method should be called before these are used - long num_runs; - long num_clusters; - public double avg_run_length; - public double avg_cluster_length; - - int original_fingerprint_size; - int num_expansions; - - - public QuotientFilter(int power_of_two, int bits_per_entry) { - power_of_two_size = power_of_two; - bitPerEntry = bits_per_entry; - fingerprintLength = bits_per_entry - 3; - final long init_size = 1L << power_of_two; - filter = make_filter(init_size, bits_per_entry); - - expansion_threshold = 0.8; - max_entries_before_expansion = (int) (init_size * expansion_threshold); - expand_autonomously = false; - is_full = false; - - original_fingerprint_size = fingerprintLength; - num_expansions = 0; - //hash_type = XxHash.hashLong ; //HashType.xxh; - } - - public boolean rejuvenate(long key) { - return false; - } - - public long get_num_entries() { - return num_entries; - } - - public long get_max_entries_before_expansion() { - return max_entries_before_expansion; - } - - public boolean expand_autonomously() { - return expand_autonomously; - } - - public void set_expand_autonomously(boolean val) { - expand_autonomously = val; - } - - BitArray make_filter(long init_size, int bits_per_entry) { -// System.out.println(init_size ) ; -// System.out.println(num_extension_slots); -// System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); - //return new QuickBitVectorWrapper(bits_per_entry, init_size); - return new HeapBitArray(init_size * bits_per_entry); - } - - public int get_fingerprint_length() { - return fingerprintLength; - } - - QuotientFilter(int power_of_two, int bits_per_entry, BitArray bitmap) { - power_of_two_size = power_of_two; - bitPerEntry = bits_per_entry; - fingerprintLength = bits_per_entry - 3; - filter = bitmap; - } - - boolean expand() { - is_full = true; - return false; - } - - // measures the number of bits per entry for the filter - public double 
measure_num_bits_per_entry() { - return measure_num_bits_per_entry(this, new ArrayList()); - } - - // measures the number of bits per entry for the filter - // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects - protected static double measure_num_bits_per_entry(QuotientFilter current, ArrayList other_filters) { - //System.out.println("--------------------------"); - //current.print_filter_summary(); - //System.out.println(); - double num_entries = current.get_num_entries(); - for (QuotientFilter q : other_filters) { - //q.print_filter_summary(); - //System.out.println(); - num_entries += q.get_num_entries(); - } - long init_size = 1L << current.power_of_two_size; - long num_bits = current.bitPerEntry * init_size; - for (QuotientFilter q : other_filters) { - init_size = 1L << q.power_of_two_size; - num_bits += q.bitPerEntry * init_size; - } - //System.out.println("total entries: \t\t" + num_entries); - //System.out.println("total bits: \t\t" + num_bits); - final double bits_per_entry = num_bits / num_entries; - //System.out.println("total bits/entry: \t" + bits_per_entry); - //System.out.println(); - return bits_per_entry; - } - - // returns the fraction of occupied slots in the filter - public double get_utilization() { - long num_logical_slots = 1L << power_of_two_size; - double util = num_entries / (double) num_logical_slots; - return util; - } - - // returns the number of slots in the filter without the extension/buffer slots - public long get_num_slots() { - return 1L << power_of_two_size; - } - - long getMask() { - return get_num_slots() - 1; - } - - // sets the metadata flag bits for a given slot index - void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, - long index) { - set_occupied(index, is_occupied); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - } - - // sets the fingerprint for a given slot index - void 
set_fingerprint(long index, long fingerprint) { - filter.setBits(index * bitPerEntry + 3, fingerprintLength, fingerprint); - } - - // print a nice representation of the filter that can be understood. - // if vertical is on, each line will represent a slot - public String get_pretty_str(boolean vertical) { - StringBuffer sbr = new StringBuffer(); - - long numBits = get_num_slots() * bitPerEntry; - - for (long i = 0; i < numBits; i++) { - long remainder = i % bitPerEntry; - if (remainder == 0) { - long slot_num = i/bitPerEntry; - sbr.append(" "); - if (vertical) { - sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); - } - } - if (remainder == 3) { - sbr.append(" "); - } - sbr.append(filter.getBit(i) ? "1" : "0"); - } - sbr.append("\n"); - return sbr.toString(); - } - - // print a nice representation of the filter that can be humanly read. - public void pretty_print() { - System.out.print(get_pretty_str(true)); - } - - // return a fingerprint in a given slot index - long get_fingerprint(long index) { - return filter.getBits(index * bitPerEntry + 3, fingerprintLength); - } - - // return an entire slot representation, including metadata flags and fingerprint - long get_slot(long index) { - return filter.getBits(index * bitPerEntry, bitPerEntry); - } - - // compare a fingerprint input to the fingerprint in some slot index - protected boolean compare(long index, long fingerprint) { - return get_fingerprint(index) == fingerprint; - } - - // modify the flags and fingerprint of a given slot - void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, - long index, long fingerprint) { - modify_slot(is_occupied, is_continuation, is_shifted, index); - set_fingerprint(index, fingerprint); - } - - // summarize some statistical measures about the filter - public void print_filter_summary() { - long slots = get_num_slots(); - long num_bits = slots * bitPerEntry; - System.out.println("slots:\t" + slots); - System.out.println("entries:\t" + 
num_entries); - System.out.println("bits\t:" + num_bits); - System.out.println("bits/entry\t:" + num_bits / (double)num_entries); - System.out.println("FP length:\t" + fingerprintLength); - System.out.println("Is full?\t" + is_full); - double capacity = num_entries / (double)(slots) ; - System.out.println("Capacity\t" + capacity); - compute_statistics(); - //System.out.println("num runs: \t\t" + num_runs); - //System.out.println("avg run length: \t" + avg_run_length); - //System.out.println("num clusters: \t\t" + num_clusters); - //System.out.println("avg cluster length: \t" + avg_cluster_length); - } - - /* - Returns the number of bits used for the filter - */ - @Override - public long get_space_use() { - long num_bits = get_num_slots() * bitPerEntry; - return num_bits; - } - - public int get_bits_per_entry() { - return bitPerEntry; - } - - boolean is_occupied(long index) { - return filter.getBit(index * bitPerEntry); - } - - boolean is_continuation(long index) { - return filter.getBit(index * bitPerEntry + 1); - } - - boolean is_shifted(long index) { - return filter.getBit(index * bitPerEntry + 2); - } - - void set_occupied(long index, boolean val) { - filter.assignBit(index * bitPerEntry, val); - } - - void set_continuation(long index, boolean val) { - filter.assignBit(index * bitPerEntry + 1, val); - } - - void set_shifted(long index, boolean val) { - filter.assignBit(index * bitPerEntry + 2, val); - } - - boolean is_slot_empty(long index) { - return !is_occupied(index) && !is_continuation(index) && !is_shifted(index); - } - - // scan the cluster leftwards until finding the start of the cluster and returning its slot index - // used by deletes - long find_cluster_start(long index) { - long current_index = index; - while (is_shifted(current_index)) { - current_index = (current_index - 1) & getMask(); - } - return current_index; - } - - // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides - // since the run might 
have been shifted to the right due to collisions - long find_run_start(long index) { - int num_runs_to_skip = 0; - while (is_shifted(index)) { - index = (index - 1) & getMask(); - if (is_occupied(index)) { - num_runs_to_skip++; + int bitPerEntry_; + int fingerprintLength_; + int powerOfTwoSize_; + int numEntries_; + BitArray bitArray_; + + double expansionThreshold_; + long maxEntriesBeforeExpansion_; + boolean expandAutonomously_; + boolean isFull_; + + // statistics, computed in the compute_statistics method. method should be called before these are used + long numRuns_; + long numClusters_; + public double avgRunLength_; + public double avgClusterLength_; + + int originalFingerprintSize_; + int numExpansions_; + + public QuotientFilter(final int powerOfTwo, final int bitsPerEntry) { + powerOfTwoSize_ = powerOfTwo; + bitPerEntry_ = bitsPerEntry; + fingerprintLength_ = bitsPerEntry - 3; + final long init_size = 1L << powerOfTwo; + bitArray_ = makeFilter(init_size, bitsPerEntry); + + expansionThreshold_ = 0.8; + maxEntriesBeforeExpansion_ = (int) (init_size * expansionThreshold_); + expandAutonomously_ = false; + isFull_ = false; + + originalFingerprintSize_ = fingerprintLength_; + numExpansions_ = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; + } + + public boolean rejuvenate(final long key) { + return false; + } + + public long getNumEntries() { + return numEntries_; + } + + public long getMaxEntriesBeforeExpansion() { + return maxEntriesBeforeExpansion_; + } + + public boolean expandAutonomously() { + return expandAutonomously_; + } + + public void setExpandAutonomously(final boolean val) { + expandAutonomously_ = val; + } + + BitArray makeFilter(final long initSize, final int bitsPerEntry) { + return new HeapBitArray(initSize * bitsPerEntry); + } + + public int getFingerprintLength() { + return fingerprintLength_; + } + + QuotientFilter(final int powerOfTwo, final int bitsPerEntry, final BitArray bitArray) { + powerOfTwoSize_ = powerOfTwo; + bitPerEntry_ 
= bitsPerEntry; + fingerprintLength_ = bitsPerEntry - 3; + bitArray_ = bitArray; + } + + boolean expand() { + isFull_ = true; + return false; + } + + // measures the number of bits per entry for the filter + public double measureNumBitsPerEntry() { + return measureNumBitsPerEntry(this, new ArrayList()); + } + + // measures the number of bits per entry for the filter + // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects + protected static double measureNumBitsPerEntry(final QuotientFilter current, final ArrayList otherFilters) { + //System.out.println("--------------------------"); + //current.print_filter_summary(); + //System.out.println(); + double numEntries = current.getNumEntries(); + for (QuotientFilter q : otherFilters) { + //q.print_filter_summary(); + //System.out.println(); + numEntries += q.getNumEntries(); + } + long init_size = 1L << current.powerOfTwoSize_; + long numBits = current.bitPerEntry_ * init_size; + for (QuotientFilter q : otherFilters) { + init_size = 1L << q.powerOfTwoSize_; + numBits += q.bitPerEntry_ * init_size; + } + //System.out.println("total entries: \t\t" + num_entries); + //System.out.println("total bits: \t\t" + num_bits); + final double bits_per_entry = numBits / numEntries; + //System.out.println("total bits/entry: \t" + bits_per_entry); + //System.out.println(); + return bits_per_entry; + } + + // returns the fraction of occupied slots in the filter + public double getUtilization() { + return numEntries_ / (double) getNumSlots(); + } + + // returns the number of slots in the filter without the extension/buffer slots + public long getNumSlots() { + return 1L << powerOfTwoSize_; + } + + long getMask() { + return getNumSlots() - 1; + } + + // sets the metadata flag bits for a given slot index + void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, final long index) { + setOccupied(index, isOccupied); + 
setContinuation(index, isContinuation); + setShifted(index, isShifted); + } + + // sets the fingerprint for a given slot index + void setFingerprint(final long index, final long fingerprint) { + bitArray_.setBits(index * bitPerEntry_ + 3, fingerprintLength_, fingerprint); + } + + // print a nice representation of the filter that can be understood. + // if vertical is on, each line will represent a slot + public String get_pretty_str(final boolean vertical) { + final StringBuffer sbr = new StringBuffer(); + final long numBits = getNumSlots() * bitPerEntry_; + for (long i = 0; i < numBits; i++) { + final long remainder = i % bitPerEntry_; + if (remainder == 0) { + final long slot = i / bitPerEntry_; + sbr.append(" "); + if (vertical) { + sbr.append("\n" + String.format("%-10d", slot) + "\t"); } } - while (num_runs_to_skip > 0) { - index = (index + 1) & getMask(); - if (!is_continuation(index)) { - num_runs_to_skip--; - } + if (remainder == 3) { + sbr.append(" "); + } + sbr.append(bitArray_.getBit(i) ? "1" : "0"); + } + sbr.append("\n"); + return sbr.toString(); + } + + // print a representation of the filter that can be humanly read. 
+ public void prettyPrint() { + System.out.print(get_pretty_str(true)); + } + + // return a fingerprint in a given slot index + long getFingerprint(final long index) { + return bitArray_.getBits(index * bitPerEntry_ + 3, fingerprintLength_); + } + + // return an entire slot representation, including metadata flags and fingerprint + long getSlot(final long index) { + return bitArray_.getBits(index * bitPerEntry_, bitPerEntry_); + } + + // compare a fingerprint input to the fingerprint in some slot index + protected boolean compare(final long index, final long fingerprint) { + return getFingerprint(index) == fingerprint; + } + + // modify the flags and fingerprint of a given slot + void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, + final long index, final long fingerprint) { + modifySlot(isOccupied, isContinuation, isShifted, index); + setFingerprint(index, fingerprint); + } + + // summarize some statistical measures about the filter + public void printFilterSummary() { + final long slots = getNumSlots(); + final long num_bits = slots * bitPerEntry_; + System.out.println("slots:\t" + slots); + System.out.println("entries:\t" + numEntries_); + System.out.println("bits\t:" + num_bits); + System.out.println("bits/entry\t:" + num_bits / (double)numEntries_); + System.out.println("FP length:\t" + fingerprintLength_); + System.out.println("Is full?\t" + isFull_); + final double capacity = numEntries_ / (double)(slots) ; + System.out.println("Capacity\t" + capacity); + computeStatistics(); + //System.out.println("num runs: \t\t" + num_runs); + //System.out.println("avg run length: \t" + avg_run_length); + //System.out.println("num clusters: \t\t" + num_clusters); + //System.out.println("avg cluster length: \t" + avg_cluster_length); + } + + /* + * Returns the number of bits used for the filter + */ + @Override + public long getSpaceUse() { + return getNumSlots() * bitPerEntry_; + } + + public int getBitsPerEntry() { + return 
bitPerEntry_; + } + + boolean isOccupied(final long index) { + return bitArray_.getBit(index * bitPerEntry_); + } + + boolean isContinuation(final long index) { + return bitArray_.getBit(index * bitPerEntry_ + 1); + } + + boolean isShifted(final long index) { + return bitArray_.getBit(index * bitPerEntry_ + 2); + } + + void setOccupied(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_, val); + } + + void setContinuation(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_ + 1, val); + } + + void setShifted(final long index, final boolean val) { + bitArray_.assignBit(index * bitPerEntry_ + 2, val); + } + + boolean isSlotEmpty(final long index) { + return !isOccupied(index) && !isContinuation(index) && !isShifted(index); + } + + // scan the cluster leftwards until finding the start of the cluster and returning its slot index + // used by deletes + long findClusterStart(long index) { + while (isShifted(index)) { + index = (index - 1) & getMask(); + } + return index; + } + + // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides + // since the run might have been shifted to the right due to collisions + long findRunStart(long index) { + int numRunsToSkip = 0; + while (isShifted(index)) { + index = (index - 1) & getMask(); + if (isOccupied(index)) { + numRunsToSkip++; } - return index; - } - - // given the start of a run, scan the run and return the index of the first matching fingerprint - // if not found returns the insertion position as bitwise complement to make it negative - long find_first_fingerprint_in_run(long index, long fingerprint) { - assert(!is_continuation(index)); - do { - final long fingerprintAtIndex = get_fingerprint(index); - if (fingerprintAtIndex == fingerprint) { - return index; - } else if (fingerprintAtIndex > fingerprint) { - return ~index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return ~index; } - - 
// delete the last matching fingerprint in the run - long decide_which_fingerprint_to_delete(long index, long fingerprint) { - assert(!is_continuation(index)); - long matching_fingerprint_index = -1; - do { - if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); - matching_fingerprint_index = index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return matching_fingerprint_index; + while (numRunsToSkip > 0) { + index = (index + 1) & getMask(); + if (!isContinuation(index)) { + numRunsToSkip--; + } } + return index; + } - // given the start of a run, find the last slot index that still belongs to this run - long find_run_end(long index) { - while (is_continuation((index + 1) & getMask())) { - index = (index + 1) & getMask(); - } + // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative + long findFirstFingerprintInRun(long index, final long fingerprint) { + assert(!isContinuation(index)); + do { + final long fingerprintAtIndex = getFingerprint(index); + if (fingerprintAtIndex == fingerprint) { return index; - } - - // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it - boolean search(long fingerprint, long index) { - boolean does_run_exist = is_occupied(index); - if (!does_run_exist) { - return false; - } - long run_start_index = find_run_start(index); - long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); - return found_index >= 0; - } - - // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. - // This method is only used for testing purposes. 
- Set get_all_fingerprints(long bucket_index) { - boolean does_run_exist = is_occupied(bucket_index); - HashSet set = new HashSet(); - if (!does_run_exist) { - return set; - } - long run_index = find_run_start(bucket_index); - do { - set.add(get_fingerprint(run_index)); - run_index = (run_index + 1) & getMask(); - } while (is_continuation(run_index)); - return set; - } - - boolean insert(long fingerprint, long index) { - if (index >= get_num_slots() || num_entries == get_num_slots()) { - return false; - } - final long run_start = find_run_start(index); - if (!is_occupied(index)) { - return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true); + } else if (fingerprintAtIndex > fingerprint) { + return ~index; } - final long found_index = find_first_fingerprint_in_run(run_start, fingerprint); - if (found_index >= 0) { - return false; + index = (index + 1) & getMask(); + } while (isContinuation(index)); + return ~index; + } + + // delete the last matching fingerprint in the run + long decideWhichFingerprintToDelete(long index, final long fingerprint) { + assert(!isContinuation(index)); + long matchingFingerprintIndex = -1; + do { + if (compare(index, fingerprint)) { + //System.out.println("found matching FP at index " + index); + matchingFingerprintIndex = index; } - return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start); - } - - boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { - // in the first shifted entry set is_continuation flag if inserting at the start of the existing run - // otherwise just shift the existing flag as it is - boolean force_continuation = !is_new_run && is_run_start; - - // prepare flags for the current slot - boolean is_continuation = !is_run_start; - boolean is_shifted = index != canonical; - - // remember the existing entry from the current slot to be shifted to the next slot 
- // is_occupied flag belongs to the slot, therefore it is never shifted - // is_shifted flag is always true for all shifted entries, no need to remember it - long existing_fingerprint = get_fingerprint(index); - boolean existing_is_continuation = is_continuation(index); - boolean existing_is_empty = is_slot_empty(index); - - while (!existing_is_empty) { - // set the current slot - set_fingerprint(index, fingerprint); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - - // prepare values for the next slot - fingerprint = existing_fingerprint; - is_continuation = existing_is_continuation | force_continuation; - is_shifted = true; - - index = (index + 1) & getMask(); - - // save the existing entry to be shifted - existing_fingerprint = get_fingerprint(index); - existing_is_continuation = is_continuation(index); - existing_is_empty = is_slot_empty(index); - - force_continuation = false; // this is needed for the first shift only + index = (index + 1) & getMask(); + } while (isContinuation(index)); + return matchingFingerprintIndex; + } + + // given the start of a run, find the last slot index that still belongs to this run + long findRunEnd(long index) { + while (isContinuation((index + 1) & getMask())) { + index = (index + 1) & getMask(); + } + return index; + } + + // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it + boolean search(long fingerprint, long index) { + final boolean doesRunExist = isOccupied(index); + if (!doesRunExist) { + return false; + } + final long runStartIndex = findRunStart(index); + final long foundIndex = findFirstFingerprintInRun(runStartIndex, fingerprint); + return foundIndex >= 0; + } + + // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. + // This method is only used for testing purposes. 
+ Set getAllFingerprints(final long bucketIndex) { + final boolean doesRunExist = isOccupied(bucketIndex); + final HashSet set = new HashSet(); + if (!doesRunExist) { + return set; + } + long runIndex = findRunStart(bucketIndex); + do { + set.add(getFingerprint(runIndex)); + runIndex = (runIndex + 1) & getMask(); + } while (isContinuation(runIndex)); + return set; + } + + boolean insert(final long fingerprint, final long index) { + if (index >= getNumSlots() || numEntries_ == getNumSlots()) { + return false; + } + final long run_start = findRunStart(index); + if (!isOccupied(index)) { + return insertFingerprintAndPushAllElse(fingerprint, run_start, index, true, true); + } + final long found_index = findFirstFingerprintInRun(run_start, fingerprint); + if (found_index >= 0) { + return false; + } + return insertFingerprintAndPushAllElse(fingerprint, ~found_index, index, false, ~found_index == run_start); + } + + boolean insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, + final boolean isNewRun, final boolean isRunStart) { + // in the first shifted entry set isContinuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean forceContinuation = !isNewRun && isRunStart; + + // prepare flags for the current slot + boolean isContinuation = !isRunStart; + boolean isShifted = index != canonical; + + // remember the existing entry from the current slot to be shifted to the next slot + // isOccupied flag belongs to the slot, therefore it is never shifted + // isShifted flag is always true for all shifted entries, no need to remember it + long existingFingerprint = getFingerprint(index); + boolean existingIsContinuation = isContinuation(index); + + while (!isSlotEmpty(index)) { + // set the current slot + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + // prepare values for the next slot + fingerprint = 
existingFingerprint; + isContinuation = existingIsContinuation | forceContinuation; + isShifted = true; + + index = (index + 1) & getMask(); + + // remember the existing entry to be shifted + existingFingerprint = getFingerprint(index); + existingIsContinuation = isContinuation(index); + + forceContinuation = false; // this is needed for the first shift only + } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + if (isNewRun) { + setOccupied(canonical, true); + } + numEntries_++; + return true; + } + + boolean delete(final long fingerprint, final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { + long runEnd = findRunEnd(matchingFingerprintIndex); + + // the run has only one entry, we need to disable its is_occupied flag + // we just remember we need to do this here, and we do it later to not interfere with counts + boolean turnOffOccupied = runStartIndex == runEnd; + + // First thing to do is move everything else in the run back by one slot + for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getMask()) { + long f = getFingerprint((i + 1) & getMask()); + setFingerprint(i, f); + } + + // for each slot, we want to know by how much the entry there is shifted + // we can do this by counting the number of continuation flags set to true + // and the number of occupied flags set to false from the start of the cluster to the given cell + // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted + long clusterStart = findClusterStart(canonicalSlot); + long numShiftedCount = 0; + long numNonOccupied = 0; + for (long i = clusterStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + if (isContinuation(i)) { + numShiftedCount++; } - // at this point the current slot 
is empty, so just populate with prepared values - // either the incoming fingerprint or the last shifted one - set_fingerprint(index, fingerprint); - set_continuation(index, is_continuation); - set_shifted(index, is_shifted); - - if (is_new_run) { - set_occupied(canonical, true); + if (!isOccupied(i)) { + numNonOccupied++; } - num_entries++; - return true; } - boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - long run_end = find_run_end(matching_fingerprint_index); - - // the run has only one entry, we need to disable its is_occupied flag - // we just remember we need to do this here, and we do it later to not interfere with counts - boolean turn_off_occupied = run_start_index == run_end; - - // First thing to do is move everything else in the run back by one slot - for (long i = matching_fingerprint_index; i != run_end; i = (i + 1) & getMask()) { - long f = get_fingerprint((i + 1) & getMask()); - set_fingerprint(i, f); + setFingerprint(runEnd, 0); + setShifted(runEnd, false); + setContinuation(runEnd, false); + + // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. + // the inner for loop iterates over cells of particular runs, pushing entries one slot back. + do { + // we first check if the next run actually exists and if it is shifted. + // only if both conditions hold, we need to shift it back one slot. 
+ //boolean does_next_run_exist = !is_slot_empty(run_end + 1); + //boolean is_next_run_shifted = is_shifted(run_end + 1); + //if (!does_next_run_exist || !is_next_run_shifted) { + if (isSlotEmpty((runEnd + 1) & getMask()) || !isShifted((runEnd + 1) & getMask())) { + if (turnOffOccupied) { + // if we eliminated a run and now need to turn the isOccupied flag off, we do it at the end to not interfere in our counts + setOccupied(canonicalSlot, false); } + return true; + } - // for each slot, we want to know by how much the entry there is shifted - // we can do this by counting the number of continuation flags set to true - // and the number of occupied flags set to false from the start of the cluster to the given cell - // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted - long cluster_start = find_cluster_start(canonical_slot); - long num_shifted_count = 0; - long num_non_occupied = 0; - for (long i = cluster_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { - if (is_continuation(i)) { - num_shifted_count++; - } - if (!is_occupied(i)) { - num_non_occupied++; - } - } + // we now find the start and end of the next run + final long nextRunStart = (runEnd + 1) & getMask(); + runEnd = findRunEnd(nextRunStart); - set_fingerprint(run_end, 0); - set_shifted(run_end, false); - set_continuation(run_end, false); - - // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. - // the inner for loop iterates over cells of particular runs, pushing entries one slot back. - do { - // we first check if the next run actually exists and if it is shifted. - // only if both conditions hold, we need to shift it back one slot. 
- //boolean does_next_run_exist = !is_slot_empty(run_end + 1); - //boolean is_next_run_shifted = is_shifted(run_end + 1); - //if (!does_next_run_exist || !is_next_run_shifted) { - if (is_slot_empty((run_end + 1) & getMask()) || !is_shifted((run_end + 1) & getMask())) { - if (turn_off_occupied) { - // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts - set_occupied(canonical_slot, false); - } - return true; - } - - // we now find the start and end of the next run - final long next_run_start = (run_end + 1) & getMask(); - run_end = find_run_end(next_run_start); - - // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot - // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if (is_occupied((next_run_start - 1) & getMask()) && num_shifted_count - num_non_occupied == 1) { - set_shifted((next_run_start - 1) & getMask(), false); - } - else { - set_shifted((next_run_start - 1) & getMask(), true); - } - - for (long i = next_run_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { - long f = get_fingerprint(i); - set_fingerprint((i - 1) & getMask(), f); - if (is_continuation(i)) { - set_continuation((i - 1) & getMask(), true); - } - if (!is_occupied(i)) { - num_non_occupied++; - } - if (i != next_run_start) { - num_shifted_count++; - } - } - set_fingerprint(run_end, 0); - set_shifted(run_end, false); - set_continuation(run_end, false); - } while (true); - } + // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot + // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place + if (isOccupied((nextRunStart - 1) & getMask()) && numShiftedCount - numNonOccupied == 1) { + 
setShifted((nextRunStart - 1) & getMask(), false); + } else { + setShifted((nextRunStart - 1) & getMask(), true); + } - boolean delete(long fingerprint, long canonical_slot) { - // if the run doesn't exist, the key can't have possibly been inserted - boolean does_run_exist = is_occupied(canonical_slot); - if (!does_run_exist) { - return false; + for (long i = nextRunStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + long f = getFingerprint(i); + setFingerprint((i - 1) & getMask(), f); + if (isContinuation(i)) { + setContinuation((i - 1) & getMask(), true); } - long run_start_index = find_run_start(canonical_slot); - - long matching_fingerprint_index = decide_which_fingerprint_to_delete(run_start_index, fingerprint); - - if (matching_fingerprint_index == -1) { - // we didn't find a matching fingerprint - return false; + if (!isOccupied(i)) { + numNonOccupied++; } - - return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); - } - - /* - Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index - */ - long get_slot_index(long large_hash) { - long slot_index_mask = (1L << power_of_two_size) - 1; - long slot_index = large_hash & slot_index_mask; - //System.out.format("\n**get_slot_index(): [total_hash:index_hash:int_index] --> [%016x:%016x:%016x]\n", large_hash, (int)large_hash, slot_index); - return slot_index; - } - - long gen_fingerprint(long large_hash) { - long fingerprint_mask = (1L << fingerprintLength) - 1L; - fingerprint_mask = fingerprint_mask << power_of_two_size; - long fingerprint = (large_hash & fingerprint_mask) >> power_of_two_size; - //System.out.format("\n**gen_fingerprint(): [total_hash:fingerprint_hash:int_fingerprint] --> [%016x:%016x:%016x]\n", large_hash, ((int)(large_hash>>32)), fingerprint); - return fingerprint; - } - -// void print_key(int input) { -// long large_hash = HashFunctions.normal_hash(input); -// long slot_index = get_slot_index(large_hash); -// 
long fingerprint = gen_fingerprint(large_hash); -// -// System.out.println("num : " + input); -// System.out.print("hash : "); -// print_long_in_binary(large_hash, fingerprintLength + power_of_two_size); -// //print_int_in_binary(slot_index_mask, 31); -// System.out.print("bucket: "); -// print_long_in_binary(slot_index, power_of_two_size); -// System.out.print("FP : "); -// //print_int_in_binary(fingerprint_mask, 31); -// print_long_in_binary(fingerprint, fingerprintLength); -// System.out.println(); -// -// } -// -// void set_expansion_threshold(double thresh) { -// expansion_threshold = thresh; -// max_entries_before_expansion = (long)(Math.pow(2, power_of_two_size) * expansion_threshold); -// } -// - /* - This is the main insertion function accessed externally. - It calls the underlying filter _insert function which hashes the input - item internally. - Hence, the `large_hash` argument is already a hash key that has been generated - by the hashing library (eg xxhash). - */ - protected boolean _insert(long large_hash) { - //System.out.println("Inserting long hash " + large_hash); - if (is_full) { - return false; + if (i != nextRunStart) { + numShiftedCount++; } - long slot_index = get_slot_index(large_hash); - long fingerprint = gen_fingerprint(large_hash); - - /*print_long_in_binary(large_hash, 64); - print_long_in_binary(slot_index, 32); - print_long_in_binary((int)fingerprint, 64); - System.out.println(slot_index + " " + fingerprint ); - System.out.println(); */ - - boolean success = insert(fingerprint, slot_index); - /*if (!success) { - System.out.println("insertion failure"); - System.out.println(input + "\t" + slot_index + "\t" + get_fingerprint_str(fingerprint, fingerprintLength)); - pretty_print(); - System.exit(1); - }*/ - -// if (expand_autonomously && num_entries >= max_entries_before_expansion) { -// boolean expanded = expand(); -// if (expanded) { -// num_expansions++; -// } -// } - return success; - } -// -// protected boolean _delete(long 
large_hash) { -// long slot_index = get_slot_index(large_hash); -// long fp_long = gen_fingerprint(large_hash); -// boolean success = delete(fp_long, slot_index); -// if (success) { -// num_entries--; -// } -// return success; -// } -// - protected boolean _search(long large_hash) { - long slot_index = get_slot_index(large_hash); - long fingerprint = gen_fingerprint(large_hash); - return search(fingerprint, slot_index); - } - - public boolean get_bit_at_offset(int offset) { - return filter.getBit(offset); + } + setFingerprint(runEnd, 0); + setShifted(runEnd, false); + setContinuation(runEnd, false); + } while (true); + } + + boolean delete(final long fingerprint, final long canonicalSlot) { + // if the run doesn't exist, the key can't have possibly been inserted + boolean doesRunExist = isOccupied(canonicalSlot); + if (!doesRunExist) { + return false; + } + long runStartIndex = findRunStart(canonicalSlot); + long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint); + if (matchingFingerprintIndex == -1) { + // we didn't find a matching fingerprint + return false; + } + return delete(fingerprint, canonicalSlot, runStartIndex, matchingFingerprintIndex); + } + + /* + * Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index + */ + long getSlotIndex(final long largeHash) { + return largeHash & getMask(); + } + + long genFingerprint(final long largeHash) { + long fingerprintMask = (1L << fingerprintLength_) - 1L; + fingerprintMask = fingerprintMask << powerOfTwoSize_; + return (largeHash & fingerprintMask) >> powerOfTwoSize_; + } + + void setExpansionThreshold(double thresh) { + expansionThreshold_ = thresh; + maxEntriesBeforeExpansion_ = (long)(Math.pow(2, powerOfTwoSize_) * expansionThreshold_); + } + + /* + This is the main insertion function accessed externally. + It calls the underlying filter _insert function which hashes the input + item internally. 
+ Hence, the `large_hash` argument is already a hash key that has been generated + by the hashing library (eg xxhash). + */ + protected boolean _insert(long large_hash) { + if (isFull_) { + return false; + } + final long slotIndex = getSlotIndex(large_hash); + final long fingerprint = genFingerprint(large_hash); + boolean success = insert(fingerprint, slotIndex); + + if (expandAutonomously_ && numEntries_ >= maxEntriesBeforeExpansion_) { + final boolean expanded = expand(); + if (expanded) { + numExpansions_++; + } } - - public void compute_statistics() { - num_runs = 0; - num_clusters = 0; - double sum_run_lengths = 0; - double sum_cluster_lengths = 0; - - int current_run_length = 0; - int current_cluster_length = 0; - - long num_slots = get_num_slots(); - for (long i = 0; i < num_slots; i++) { - - boolean occupied = is_occupied(i); - boolean continuation = is_continuation(i); - boolean shifted = is_shifted(i); - - if ( !occupied && !continuation && !shifted ) { // empty slot - sum_cluster_lengths += current_cluster_length; - current_cluster_length = 0; - sum_run_lengths += current_run_length; - current_run_length = 0; - } - else if ( !occupied && !continuation && shifted ) { // start of new run - num_runs++; - sum_run_lengths += current_run_length; - current_run_length = 1; - current_cluster_length++; - } - else if ( !occupied && continuation && !shifted ) { - // not used - } - else if ( !occupied && continuation && shifted ) { // continuation of run - current_cluster_length++; - current_run_length++; - } - else if ( occupied && !continuation && !shifted ) { // start of new cluster & run - num_runs++; - num_clusters++; - sum_cluster_lengths += current_cluster_length; - sum_run_lengths += current_run_length; - current_cluster_length = 1; - current_run_length = 1; - } - else if (occupied && !continuation && shifted ) { // start of new run - num_runs++; - sum_run_lengths += current_run_length; - current_run_length = 1; - current_cluster_length++; - } - else if 
(occupied && continuation && !shifted ) { - // not used - } - else if (occupied && continuation && shifted ) { // continuation of run - current_cluster_length++; - current_run_length++; - } - } - avg_run_length = sum_run_lengths / num_runs; - avg_cluster_length = sum_cluster_lengths / num_clusters; + return success; + } + + protected boolean _delete(final long largeHash) { + final long slotIndex = getSlotIndex(largeHash); + long fingerprint = genFingerprint(largeHash); + boolean success = delete(fingerprint, slotIndex); + if (success) { + numEntries_--; + } + return success; + } + + protected boolean _search(final long largeHash) { + final long slotIndex = getSlotIndex(largeHash); + long fingerprint = genFingerprint(largeHash); + return search(fingerprint, slotIndex); + } + + public boolean getBitAtOffset(final int offset) { + return bitArray_.getBit(offset); + } + + public void computeStatistics() { + numRuns_ = 0; + numClusters_ = 0; + double sumRunLengths = 0; + double sumClusterLengths = 0; + + int currentRunLength = 0; + int currentCluster_length = 0; + + final long numSlots = getNumSlots(); + for (long i = 0; i < numSlots; i++) { + final boolean occupied = isOccupied(i); + final boolean continuation = isContinuation(i); + final boolean shifted = isShifted(i); + + if (!occupied && !continuation && !shifted) { // empty slot + sumClusterLengths += currentCluster_length; + currentCluster_length = 0; + sumRunLengths += currentRunLength; + currentRunLength = 0; + } else if ( !occupied && !continuation && shifted ) { // start of new run + numRuns_++; + sumRunLengths += currentRunLength; + currentRunLength = 1; + currentCluster_length++; + } else if ( !occupied && continuation && !shifted ) { + // not used + } else if ( !occupied && continuation && shifted ) { // continuation of run + currentCluster_length++; + currentRunLength++; + } else if ( occupied && !continuation && !shifted ) { // start of new cluster & run + numRuns_++; + numClusters_++; + sumClusterLengths 
+= currentCluster_length; + sumRunLengths += currentRunLength; + currentCluster_length = 1; + currentRunLength = 1; + } else if (occupied && !continuation && shifted ) { // start of new run + numRuns_++; + sumRunLengths += currentRunLength; + currentRunLength = 1; + currentCluster_length++; + } else if (occupied && continuation && !shifted ) { + // not used + } else if (occupied && continuation && shifted ) { // continuation of run + currentCluster_length++; + currentRunLength++; + } } - -// -// void ar_sum1(ArrayList ar, int index) -// { -// int s = ar.size(); -// if (s <= index) -// { -// for (int i = s; i measure_cluster_length() -// { -// ArrayList ar = new ArrayList(); -// -// num_runs = 0; -// num_clusters = 0; -// -// int current_run_length = 0; -// int current_cluster_length = 0; -// -// int cnt = 0; -// -// for (int i = 0; i < get_logical_num_slots_plus_extensions(); i++) { -// -// boolean occupied = is_occupied(i); -// boolean continuation = is_continuation(i); -// boolean shifted = is_shifted(i); -// -// if (!occupied && !continuation && !shifted ) { // empty slot -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// current_cluster_length = 0; -// current_run_length = 0; -// } -// else if (!occupied && !continuation && shifted ) { // start of new run -// num_runs++; -// current_run_length = 1; -// current_cluster_length++; -// } -// else if (!occupied && continuation && shifted ) { // continuation of run -// current_cluster_length++; -// current_run_length++; -// } -// else if (occupied && !continuation && !shifted ) { // start of new cluster & run -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// num_runs++; -// num_clusters++; -// //if(current_cluster_length == 0) cnt++; -// current_cluster_length = 1; -// current_run_length = 1; -// } -// else if (occupied && !continuation && shifted ) { // start of new run -// num_runs++; -// current_run_length = 1; -// current_cluster_length++; -// } -// else if 
(occupied && continuation && shifted ) { // continuation of run -// current_cluster_length++; -// current_run_length++; -// } -// } -// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); -// //System.out.println("CNT = " + cnt); -// return ar; -// } -// -// /* -// @charlied -// Returns the fraction of the filter that is occupied by inserted items. -// Extension slots are omitted from the calculation of the load factor because they are used to accomodate -// items in the filter at the top end of the filter. -// Asymptotically, these will make little-to-no difference to the load in these calculations as the slots -// contributed 2*j / (2^j) --> 0 entries. -// */ -// public double get_load() { -// return num_entries / (double) get_logical_num_slots(); -// } + avgRunLength_ = sumRunLengths / numRuns_; + avgClusterLength_ = sumClusterLengths / numClusters_; + } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 4f68cb1f4..60db5be84 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -62,30 +62,30 @@ public void WikiInsertionTest() { qf.insert(C, 1); qf.insert(D, 2); qf.insert(A, 1); - assertEquals(qf.get_num_entries(), 6); + assertEquals(qf.getNumEntries(), 6); assertEquals(getState(qf, 0), 0); - assertEquals(qf.get_fingerprint(0), 0); + assertEquals(qf.getFingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), A); + assertEquals(qf.getFingerprint(1), A); assertEquals(getState(qf, 2), 0b111); - assertEquals(qf.get_fingerprint(2), B); + assertEquals(qf.getFingerprint(2), B); assertEquals(getState(qf, 3), 0b011); - assertEquals(qf.get_fingerprint(3), C); + assertEquals(qf.getFingerprint(3), C); assertEquals(getState(qf, 4), 0b101); - 
assertEquals(qf.get_fingerprint(4), D); + assertEquals(qf.getFingerprint(4), D); assertEquals(getState(qf, 5), 0b001); - assertEquals(qf.get_fingerprint(5), E); + assertEquals(qf.getFingerprint(5), E); assertEquals(getState(qf, 6), 0); - assertEquals(qf.get_fingerprint(6), 0); + assertEquals(qf.getFingerprint(6), 0); assertEquals(getState(qf, 7), 0b100); - assertEquals(qf.get_fingerprint(7), F); + assertEquals(qf.getFingerprint(7), F); } public int getState(QuotientFilter filter, int slot) { - return (filter.is_occupied(slot) ? 1 : 0) << 2 - | (filter.is_continuation(slot) ? 1 : 0) << 1 - | (filter.is_shifted(slot) ? 1 : 0); + return (filter.isOccupied(slot) ? 1 : 0) << 2 + | (filter.isContinuation(slot) ? 1 : 0) << 1 + | (filter.isShifted(slot) ? 1 : 0); } /* @@ -157,13 +157,13 @@ public void OverflowTest() { final long fp1 = 1; final long fp2 = 1 << fingerprint_size - 1; qf.insert(fp1, num_entries - 1); - assertEquals(qf.get_fingerprint(num_entries - 1), fp1); + assertEquals(qf.getFingerprint(num_entries - 1), fp1); assertEquals(getState(qf, num_entries - 1), 0b100); qf.insert(fp2, num_entries - 1); - assertEquals(qf.get_fingerprint(0), fp2); + assertEquals(qf.getFingerprint(0), fp2); assertEquals(getState(qf, 0), 0b011); qf.delete(fp2, num_entries - 1); - assertEquals(qf.get_fingerprint(0), 0); + assertEquals(qf.getFingerprint(0), 0); assertEquals(getState(qf, 0), 0b000); final boolean found = qf.search(fp1, num_entries - 1); assertTrue(found); @@ -190,7 +190,7 @@ public void testQuotientFilterInsertionAndIteration() { qf.insert(0x1F, 4); qf.insert(0x1F, 15); // last slot in the filter qf.insert(0x1F, 16); // outside the bounds - qf.pretty_print() ; +// qf.pretty_print(); Iterator it = new Iterator(qf); int[] arr = new int[] {2, 3, 4, 15}; @@ -261,8 +261,8 @@ static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slo static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) { for (int i = 0; i < 
bs.size(); i++) { - if (check_also_fingerprints || (i % qf.bitPerEntry == 0 || i % qf.bitPerEntry == 1 || i % qf.bitPerEntry == 2)) { - if (qf.get_bit_at_offset(i) != bs.get(i)) { + if (check_also_fingerprints || (i % qf.getBitsPerEntry() == 0 || i % qf.getBitsPerEntry() == 1 || i % qf.getBitsPerEntry() == 2)) { + if (qf.getBitAtOffset(i) != bs.get(i)) { return false; } } From 9f9f5496eb45fe82820cbe49433310009be7b5f0 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 18 Jun 2024 11:07:47 -0700 Subject: [PATCH 29/38] expansion and some more cleanup --- .../quotientfilter/QuotientFilter.java | 255 +++++++++--------- .../quotientfilter/QuotientFilterTest.java | 44 ++- 2 files changed, 164 insertions(+), 135 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index bd8c2abd9..a47bb5f6a 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -21,46 +21,34 @@ import java.util.ArrayList; import java.util.HashSet; +import java.util.LinkedList; +import java.util.Queue; import java.util.Set; +import org.apache.datasketches.common.SketchesException; import org.apache.datasketches.filters.common.BitArray; import org.apache.datasketches.filters.common.HeapBitArray; public class QuotientFilter extends Filter { - int bitPerEntry_; - int fingerprintLength_; + public static final double LOAD_FACTOR = 0.9; + + int numBitsPerEntry_; int powerOfTwoSize_; int numEntries_; + int numExpansions_; BitArray bitArray_; - double expansionThreshold_; - long maxEntriesBeforeExpansion_; - boolean expandAutonomously_; - boolean isFull_; - // statistics, computed in the compute_statistics method. 
method should be called before these are used long numRuns_; long numClusters_; public double avgRunLength_; public double avgClusterLength_; - int originalFingerprintSize_; - int numExpansions_; - - public QuotientFilter(final int powerOfTwo, final int bitsPerEntry) { + public QuotientFilter(final int powerOfTwo, final int numBitsPerEntry) { powerOfTwoSize_ = powerOfTwo; - bitPerEntry_ = bitsPerEntry; - fingerprintLength_ = bitsPerEntry - 3; - final long init_size = 1L << powerOfTwo; - bitArray_ = makeFilter(init_size, bitsPerEntry); - - expansionThreshold_ = 0.8; - maxEntriesBeforeExpansion_ = (int) (init_size * expansionThreshold_); - expandAutonomously_ = false; - isFull_ = false; - - originalFingerprintSize_ = fingerprintLength_; + numBitsPerEntry_ = numBitsPerEntry; + bitArray_ = makeFilter(getNumSlots(), numBitsPerEntry); numExpansions_ = 0; //hash_type = XxHash.hashLong ; //HashType.xxh; } @@ -73,16 +61,12 @@ public long getNumEntries() { return numEntries_; } - public long getMaxEntriesBeforeExpansion() { - return maxEntriesBeforeExpansion_; - } - - public boolean expandAutonomously() { - return expandAutonomously_; + public int getNumExpansions() { + return numExpansions_; } - public void setExpandAutonomously(final boolean val) { - expandAutonomously_ = val; + public long getMaxEntriesBeforeExpansion() { + return (long)(getNumSlots() * LOAD_FACTOR); } BitArray makeFilter(final long initSize, final int bitsPerEntry) { @@ -90,19 +74,40 @@ BitArray makeFilter(final long initSize, final int bitsPerEntry) { } public int getFingerprintLength() { - return fingerprintLength_; + return numBitsPerEntry_ - 3; } - QuotientFilter(final int powerOfTwo, final int bitsPerEntry, final BitArray bitArray) { + QuotientFilter(final int powerOfTwo, final int numBitsPerEntry, final BitArray bitArray) { powerOfTwoSize_ = powerOfTwo; - bitPerEntry_ = bitsPerEntry; - fingerprintLength_ = bitsPerEntry - 3; + numBitsPerEntry_ = numBitsPerEntry; bitArray_ = bitArray; } - boolean 
expand() { - isFull_ = true; - return false; + void expand() { + if (getFingerprintLength() < 2) throw new SketchesException("for expansion value must have at least 2 bits"); + QuotientFilter other = new QuotientFilter(powerOfTwoSize_ + 1, numBitsPerEntry_ - 1); + + long i = 0; + if (!isSlotEmpty(i)) { i = findClusterStart(i); } + + Queue fifo = new LinkedList(); + long count = 0; + while (count < numEntries_) { + if (!isSlotEmpty(i)) { + if (isOccupied(i)) { fifo.add(i); } + final long fingerprint = getFingerprint(i); + final long newQuotient = (fifo.element() << 1) | (fingerprint >> other.getFingerprintLength()); + final long newFingerprint = fingerprint & other.getFingerprintMask(); + other.insert(newFingerprint, newQuotient); + count++; + } + i = (i + 1) & getSlotMask(); + if (!fifo.isEmpty() && ! isContinuation(i)) { fifo.remove(); } + } + powerOfTwoSize_++; + numBitsPerEntry_--; + bitArray_ = other.bitArray_; + numExpansions_++; } // measures the number of bits per entry for the filter @@ -122,11 +127,9 @@ protected static double measureNumBitsPerEntry(final QuotientFilter current, fin //System.out.println(); numEntries += q.getNumEntries(); } - long init_size = 1L << current.powerOfTwoSize_; - long numBits = current.bitPerEntry_ * init_size; - for (QuotientFilter q : otherFilters) { - init_size = 1L << q.powerOfTwoSize_; - numBits += q.bitPerEntry_ * init_size; + long numBits = current.getNumBitsPerEntry() * current.getNumSlots(); + for (final QuotientFilter q : otherFilters) { + numBits += q.getNumBitsPerEntry() * q.getNumSlots(); } //System.out.println("total entries: \t\t" + num_entries); //System.out.println("total bits: \t\t" + num_bits); @@ -146,10 +149,14 @@ public long getNumSlots() { return 1L << powerOfTwoSize_; } - long getMask() { + long getSlotMask() { return getNumSlots() - 1; } + long getFingerprintMask() { + return (1L << getFingerprintLength()) - 1; + } + // sets the metadata flag bits for a given slot index void modifySlot(final boolean 
isOccupied, final boolean isContinuation, final boolean isShifted, final long index) { setOccupied(index, isOccupied); @@ -159,18 +166,18 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo // sets the fingerprint for a given slot index void setFingerprint(final long index, final long fingerprint) { - bitArray_.setBits(index * bitPerEntry_ + 3, fingerprintLength_, fingerprint); + bitArray_.setBits(index * numBitsPerEntry_ + 3, getFingerprintLength(), fingerprint); } // print a nice representation of the filter that can be understood. // if vertical is on, each line will represent a slot - public String get_pretty_str(final boolean vertical) { + public String getPrettyStr(final boolean vertical) { final StringBuffer sbr = new StringBuffer(); - final long numBits = getNumSlots() * bitPerEntry_; + final long numBits = getNumSlots() * numBitsPerEntry_; for (long i = 0; i < numBits; i++) { - final long remainder = i % bitPerEntry_; + final long remainder = i % numBitsPerEntry_; if (remainder == 0) { - final long slot = i / bitPerEntry_; + final long slot = i / numBitsPerEntry_; sbr.append(" "); if (vertical) { sbr.append("\n" + String.format("%-10d", slot) + "\t"); @@ -187,17 +194,17 @@ public String get_pretty_str(final boolean vertical) { // print a representation of the filter that can be humanly read. 
public void prettyPrint() { - System.out.print(get_pretty_str(true)); + System.out.print(getPrettyStr(true)); } // return a fingerprint in a given slot index long getFingerprint(final long index) { - return bitArray_.getBits(index * bitPerEntry_ + 3, fingerprintLength_); + return bitArray_.getBits(index * numBitsPerEntry_ + 3, getFingerprintLength()); } // return an entire slot representation, including metadata flags and fingerprint long getSlot(final long index) { - return bitArray_.getBits(index * bitPerEntry_, bitPerEntry_); + return bitArray_.getBits(index * numBitsPerEntry_, numBitsPerEntry_); } // compare a fingerprint input to the fingerprint in some slot index @@ -215,15 +222,14 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo // summarize some statistical measures about the filter public void printFilterSummary() { final long slots = getNumSlots(); - final long num_bits = slots * bitPerEntry_; - System.out.println("slots:\t" + slots); - System.out.println("entries:\t" + numEntries_); - System.out.println("bits\t:" + num_bits); - System.out.println("bits/entry\t:" + num_bits / (double)numEntries_); - System.out.println("FP length:\t" + fingerprintLength_); - System.out.println("Is full?\t" + isFull_); - final double capacity = numEntries_ / (double)(slots) ; - System.out.println("Capacity\t" + capacity); + final long numBits = slots * numBitsPerEntry_; + System.out.println("slots: " + slots); + System.out.println("bits: " + numBits); + System.out.println("bits/entry: " + numBits / (double)numEntries_); + System.out.println("FP length: " + getFingerprintLength()); + System.out.println("entries: " + numEntries_); + System.out.println("expansions: " + numExpansions_); + System.out.println("load: " + numEntries_ / (double)(slots)); computeStatistics(); //System.out.println("num runs: \t\t" + num_runs); //System.out.println("avg run length: \t" + avg_run_length); @@ -236,35 +242,35 @@ public void printFilterSummary() { */ 
@Override public long getSpaceUse() { - return getNumSlots() * bitPerEntry_; + return getNumSlots() * numBitsPerEntry_; } - public int getBitsPerEntry() { - return bitPerEntry_; + public int getNumBitsPerEntry() { + return numBitsPerEntry_; } boolean isOccupied(final long index) { - return bitArray_.getBit(index * bitPerEntry_); + return bitArray_.getBit(index * numBitsPerEntry_); } boolean isContinuation(final long index) { - return bitArray_.getBit(index * bitPerEntry_ + 1); + return bitArray_.getBit(index * numBitsPerEntry_ + 1); } boolean isShifted(final long index) { - return bitArray_.getBit(index * bitPerEntry_ + 2); + return bitArray_.getBit(index * numBitsPerEntry_ + 2); } void setOccupied(final long index, final boolean val) { - bitArray_.assignBit(index * bitPerEntry_, val); + bitArray_.assignBit(index * numBitsPerEntry_, val); } void setContinuation(final long index, final boolean val) { - bitArray_.assignBit(index * bitPerEntry_ + 1, val); + bitArray_.assignBit(index * numBitsPerEntry_ + 1, val); } void setShifted(final long index, final boolean val) { - bitArray_.assignBit(index * bitPerEntry_ + 2, val); + bitArray_.assignBit(index * numBitsPerEntry_ + 2, val); } boolean isSlotEmpty(final long index) { @@ -275,7 +281,7 @@ boolean isSlotEmpty(final long index) { // used by deletes long findClusterStart(long index) { while (isShifted(index)) { - index = (index - 1) & getMask(); + index = (index - 1) & getSlotMask(); } return index; } @@ -285,13 +291,13 @@ long findClusterStart(long index) { long findRunStart(long index) { int numRunsToSkip = 0; while (isShifted(index)) { - index = (index - 1) & getMask(); + index = (index - 1) & getSlotMask(); if (isOccupied(index)) { numRunsToSkip++; } } while (numRunsToSkip > 0) { - index = (index + 1) & getMask(); + index = (index + 1) & getSlotMask(); if (!isContinuation(index)) { numRunsToSkip--; } @@ -302,7 +308,7 @@ long findRunStart(long index) { // given the start of a run, scan the run and return the index of 
the first matching fingerprint // if not found returns the insertion position as bitwise complement to make it negative long findFirstFingerprintInRun(long index, final long fingerprint) { - assert(!isContinuation(index)); + assert !isContinuation(index); do { final long fingerprintAtIndex = getFingerprint(index); if (fingerprintAtIndex == fingerprint) { @@ -310,37 +316,35 @@ long findFirstFingerprintInRun(long index, final long fingerprint) { } else if (fingerprintAtIndex > fingerprint) { return ~index; } - index = (index + 1) & getMask(); + index = (index + 1) & getSlotMask(); } while (isContinuation(index)); return ~index; } // delete the last matching fingerprint in the run long decideWhichFingerprintToDelete(long index, final long fingerprint) { - assert(!isContinuation(index)); + assert !isContinuation(index); long matchingFingerprintIndex = -1; do { if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); matchingFingerprintIndex = index; } - index = (index + 1) & getMask(); + index = (index + 1) & getSlotMask(); } while (isContinuation(index)); return matchingFingerprintIndex; } // given the start of a run, find the last slot index that still belongs to this run long findRunEnd(long index) { - while (isContinuation((index + 1) & getMask())) { - index = (index + 1) & getMask(); + while (isContinuation((index + 1) & getSlotMask())) { + index = (index + 1) & getSlotMask(); } return index; } // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it - boolean search(long fingerprint, long index) { - final boolean doesRunExist = isOccupied(index); - if (!doesRunExist) { + boolean search(final long fingerprint, final long index) { + if (!isOccupied(index)) { return false; } final long runStartIndex = findRunStart(index); @@ -359,7 +363,7 @@ Set getAllFingerprints(final long bucketIndex) { long runIndex = findRunStart(bucketIndex); do { 
set.add(getFingerprint(runIndex)); - runIndex = (runIndex + 1) & getMask(); + runIndex = (runIndex + 1) & getSlotMask(); } while (isContinuation(runIndex)); return set; } @@ -368,18 +372,20 @@ boolean insert(final long fingerprint, final long index) { if (index >= getNumSlots() || numEntries_ == getNumSlots()) { return false; } - final long run_start = findRunStart(index); + final long runStart = findRunStart(index); if (!isOccupied(index)) { - return insertFingerprintAndPushAllElse(fingerprint, run_start, index, true, true); + insertFingerprintAndPushAllElse(fingerprint, runStart, index, true, true); + return true; } - final long found_index = findFirstFingerprintInRun(run_start, fingerprint); - if (found_index >= 0) { + final long foundIndex = findFirstFingerprintInRun(runStart, fingerprint); + if (foundIndex >= 0) { return false; } - return insertFingerprintAndPushAllElse(fingerprint, ~found_index, index, false, ~found_index == run_start); + insertFingerprintAndPushAllElse(fingerprint, ~foundIndex, index, false, ~foundIndex == runStart); + return true; } - boolean insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, + void insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, final boolean isNewRun, final boolean isRunStart) { // in the first shifted entry set isContinuation flag if inserting at the start of the existing run // otherwise just shift the existing flag as it is @@ -406,7 +412,7 @@ boolean insertFingerprintAndPushAllElse(long fingerprint, long index, final long isContinuation = existingIsContinuation | forceContinuation; isShifted = true; - index = (index + 1) & getMask(); + index = (index + 1) & getSlotMask(); // remember the existing entry to be shifted existingFingerprint = getFingerprint(index); @@ -424,7 +430,6 @@ boolean insertFingerprintAndPushAllElse(long fingerprint, long index, final long setOccupied(canonical, true); } numEntries_++; - return true; } boolean delete(final long 
fingerprint, final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { @@ -435,8 +440,8 @@ boolean delete(final long fingerprint, final long canonicalSlot, long runStartIn boolean turnOffOccupied = runStartIndex == runEnd; // First thing to do is move everything else in the run back by one slot - for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getMask()) { - long f = getFingerprint((i + 1) & getMask()); + for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getSlotMask()) { + long f = getFingerprint((i + 1) & getSlotMask()); setFingerprint(i, f); } @@ -447,7 +452,7 @@ boolean delete(final long fingerprint, final long canonicalSlot, long runStartIn long clusterStart = findClusterStart(canonicalSlot); long numShiftedCount = 0; long numNonOccupied = 0; - for (long i = clusterStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + for (long i = clusterStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) { if (isContinuation(i)) { numShiftedCount++; } @@ -468,7 +473,7 @@ boolean delete(final long fingerprint, final long canonicalSlot, long runStartIn //boolean does_next_run_exist = !is_slot_empty(run_end + 1); //boolean is_next_run_shifted = is_shifted(run_end + 1); //if (!does_next_run_exist || !is_next_run_shifted) { - if (isSlotEmpty((runEnd + 1) & getMask()) || !isShifted((runEnd + 1) & getMask())) { + if (isSlotEmpty((runEnd + 1) & getSlotMask()) || !isShifted((runEnd + 1) & getSlotMask())) { if (turnOffOccupied) { // if we eliminated a run and now need to turn the isOccupied flag off, we do it at the end to not interfere in our counts setOccupied(canonicalSlot, false); @@ -477,22 +482,22 @@ boolean delete(final long fingerprint, final long canonicalSlot, long runStartIn } // we now find the start and end of the next run - final long nextRunStart = (runEnd + 1) & getMask(); + final long nextRunStart = (runEnd + 1) & getSlotMask(); runEnd = findRunEnd(nextRunStart); // before we 
start processing the next run, we check whether the previous run we shifted is now back to its canonical slot // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if (isOccupied((nextRunStart - 1) & getMask()) && numShiftedCount - numNonOccupied == 1) { - setShifted((nextRunStart - 1) & getMask(), false); + if (isOccupied((nextRunStart - 1) & getSlotMask()) && numShiftedCount - numNonOccupied == 1) { + setShifted((nextRunStart - 1) & getSlotMask(), false); } else { - setShifted((nextRunStart - 1) & getMask(), true); + setShifted((nextRunStart - 1) & getSlotMask(), true); } - for (long i = nextRunStart; i != ((runEnd + 1) & getMask()); i = (i + 1) & getMask()) { + for (long i = nextRunStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) { long f = getFingerprint(i); - setFingerprint((i - 1) & getMask(), f); + setFingerprint((i - 1) & getSlotMask(), f); if (isContinuation(i)) { - setContinuation((i - 1) & getMask(), true); + setContinuation((i - 1) & getSlotMask(), true); } if (!isOccupied(i)) { numNonOccupied++; @@ -522,22 +527,12 @@ boolean delete(final long fingerprint, final long canonicalSlot) { return delete(fingerprint, canonicalSlot, runStartIndex, matchingFingerprintIndex); } - /* - * Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index - */ - long getSlotIndex(final long largeHash) { - return largeHash & getMask(); - } - - long genFingerprint(final long largeHash) { - long fingerprintMask = (1L << fingerprintLength_) - 1L; - fingerprintMask = fingerprintMask << powerOfTwoSize_; - return (largeHash & fingerprintMask) >> powerOfTwoSize_; + long getSlotFromHash(final long largeHash) { + return (largeHash >> getFingerprintLength()) & getSlotMask(); } - void setExpansionThreshold(double thresh) { - expansionThreshold_ = thresh; - maxEntriesBeforeExpansion_ = (long)(Math.pow(2, powerOfTwoSize_) * 
expansionThreshold_); + long getFingerprintFromHash(final long largeHash) { + return largeHash & getFingerprintMask(); } /* @@ -547,26 +542,20 @@ void setExpansionThreshold(double thresh) { Hence, the `large_hash` argument is already a hash key that has been generated by the hashing library (eg xxhash). */ - protected boolean _insert(long large_hash) { - if (isFull_) { - return false; - } - final long slotIndex = getSlotIndex(large_hash); - final long fingerprint = genFingerprint(large_hash); - boolean success = insert(fingerprint, slotIndex); - - if (expandAutonomously_ && numEntries_ >= maxEntriesBeforeExpansion_) { - final boolean expanded = expand(); - if (expanded) { - numExpansions_++; - } + protected boolean _insert(final long largeHash) { + final long slotIndex = getSlotFromHash(largeHash); + final long fingerprint = getFingerprintFromHash(largeHash); + final boolean success = insert(fingerprint, slotIndex); + + if (numEntries_ == getMaxEntriesBeforeExpansion()) { + expand(); } return success; } protected boolean _delete(final long largeHash) { - final long slotIndex = getSlotIndex(largeHash); - long fingerprint = genFingerprint(largeHash); + final long slotIndex = getSlotFromHash(largeHash); + long fingerprint = getFingerprintFromHash(largeHash); boolean success = delete(fingerprint, slotIndex); if (success) { numEntries_--; @@ -575,8 +564,8 @@ protected boolean _delete(final long largeHash) { } protected boolean _search(final long largeHash) { - final long slotIndex = getSlotIndex(largeHash); - long fingerprint = genFingerprint(largeHash); + final long slotIndex = getSlotFromHash(largeHash); + final long fingerprint = getFingerprintFromHash(largeHash); return search(fingerprint, slotIndex); } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 60db5be84..3e9644dfe 100644 --- 
a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -22,7 +22,6 @@ import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertEquals; -import java.util.ArrayList; import java.util.BitSet; import java.util.HashSet; import java.util.Random; @@ -261,7 +260,7 @@ static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slo static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) { for (int i = 0; i < bs.size(); i++) { - if (check_also_fingerprints || (i % qf.getBitsPerEntry() == 0 || i % qf.getBitsPerEntry() == 1 || i % qf.getBitsPerEntry() == 2)) { + if (check_also_fingerprints || (i % qf.getNumBitsPerEntry() == 0 || i % qf.getNumBitsPerEntry() == 1 || i % qf.getNumBitsPerEntry() == 2)) { if (qf.getBitAtOffset(i) != bs.get(i)) { return false; } @@ -297,4 +296,45 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent } return true; } + + @Test + public void smallExpansion() { + final QuotientFilter qf = new QuotientFilter(5, 12); + final int n = 30; + for (int i = 0; i < n; i++) { qf.insert(i); } + qf.printFilterSummary(); + assertEquals(qf.getNumExpansions(), 1); + assertEquals(qf.getNumEntries(), n); + + // query the same keys + int positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i + n)) { positives++; } } + assertTrue(positives < 2); + } + + @Test + public void expansion() { + final QuotientFilter qf = new QuotientFilter(16, 16); + final int n = 60000; + for (int i = 0; i < n; i++) { qf.insert(i); } +// qf.printFilterSummary(); + assertEquals(qf.getNumExpansions(), 1); + assertTrue(qf.getNumEntries() > n * 0.99); // allow a few hash collisions + + // query the same keys + int 
positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i + n)) { positives++; } } + assertTrue(positives < 6); + } + } From 2064c3cad13f97a5905be49a82e536e1b6259a77 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Mon, 8 Jul 2024 23:41:33 -0700 Subject: [PATCH 30/38] constructor takes fingerprint length and load factor --- .../quotientfilter/QuotientFilter.java | 78 ++++++++++--------- .../filters/quotientfilter/DeletionTests.java | 56 ++++++------- .../quotientfilter/QuotientFilterTest.java | 75 ++++++------------ 3 files changed, 93 insertions(+), 116 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index a47bb5f6a..7a860f857 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -31,10 +31,11 @@ public class QuotientFilter extends Filter { - public static final double LOAD_FACTOR = 0.9; + public static final float DEFAULT_LOAD_FACTOR = 0.8f; - int numBitsPerEntry_; - int powerOfTwoSize_; + int lgQ_; + int numFingerprintBits_; + float loadFactor_; int numEntries_; int numExpansions_; BitArray bitArray_; @@ -45,10 +46,15 @@ public class QuotientFilter extends Filter { public double avgRunLength_; public double avgClusterLength_; - public QuotientFilter(final int powerOfTwo, final int numBitsPerEntry) { - powerOfTwoSize_ = powerOfTwo; - numBitsPerEntry_ = numBitsPerEntry; - bitArray_ = makeFilter(getNumSlots(), numBitsPerEntry); + public QuotientFilter(final int lgQ, final int numFingerprintBits) { + this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR); + } + + public QuotientFilter(final int lgQ, final int numFingerprintBits, final float loadFactor) { 
+ lgQ_ = lgQ; + numFingerprintBits_ = numFingerprintBits; + loadFactor_ = loadFactor; + bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry()); numExpansions_ = 0; //hash_type = XxHash.hashLong ; //HashType.xxh; } @@ -66,7 +72,7 @@ public int getNumExpansions() { } public long getMaxEntriesBeforeExpansion() { - return (long)(getNumSlots() * LOAD_FACTOR); + return (long)(getNumSlots() * loadFactor_); } BitArray makeFilter(final long initSize, final int bitsPerEntry) { @@ -74,23 +80,23 @@ BitArray makeFilter(final long initSize, final int bitsPerEntry) { } public int getFingerprintLength() { - return numBitsPerEntry_ - 3; + return numFingerprintBits_; } - QuotientFilter(final int powerOfTwo, final int numBitsPerEntry, final BitArray bitArray) { - powerOfTwoSize_ = powerOfTwo; - numBitsPerEntry_ = numBitsPerEntry; - bitArray_ = bitArray; - } +// QuotientFilter(final int powerOfTwo, final int numBitsPerEntry, final BitArray bitArray) { +// powerOfTwoSize_ = powerOfTwo; +// numBitsPerEntry_ = numBitsPerEntry; +// bitArray_ = bitArray; +// } void expand() { if (getFingerprintLength() < 2) throw new SketchesException("for expansion value must have at least 2 bits"); - QuotientFilter other = new QuotientFilter(powerOfTwoSize_ + 1, numBitsPerEntry_ - 1); + final QuotientFilter other = new QuotientFilter(lgQ_ + 1, numFingerprintBits_ - 1, loadFactor_); long i = 0; if (!isSlotEmpty(i)) { i = findClusterStart(i); } - Queue fifo = new LinkedList(); + final Queue fifo = new LinkedList(); long count = 0; while (count < numEntries_) { if (!isSlotEmpty(i)) { @@ -104,8 +110,8 @@ void expand() { i = (i + 1) & getSlotMask(); if (!fifo.isEmpty() && ! 
isContinuation(i)) { fifo.remove(); } } - powerOfTwoSize_++; - numBitsPerEntry_--; + lgQ_++; + numFingerprintBits_--; bitArray_ = other.bitArray_; numExpansions_++; } @@ -146,7 +152,7 @@ public double getUtilization() { // returns the number of slots in the filter without the extension/buffer slots public long getNumSlots() { - return 1L << powerOfTwoSize_; + return 1L << lgQ_; } long getSlotMask() { @@ -166,18 +172,18 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo // sets the fingerprint for a given slot index void setFingerprint(final long index, final long fingerprint) { - bitArray_.setBits(index * numBitsPerEntry_ + 3, getFingerprintLength(), fingerprint); + bitArray_.setBits(index * getNumBitsPerEntry() + 3, getFingerprintLength(), fingerprint); } // print a nice representation of the filter that can be understood. // if vertical is on, each line will represent a slot public String getPrettyStr(final boolean vertical) { final StringBuffer sbr = new StringBuffer(); - final long numBits = getNumSlots() * numBitsPerEntry_; + final long numBits = getNumSlots() * getNumBitsPerEntry(); for (long i = 0; i < numBits; i++) { - final long remainder = i % numBitsPerEntry_; + final long remainder = i % getNumBitsPerEntry(); if (remainder == 0) { - final long slot = i / numBitsPerEntry_; + final long slot = i / getNumBitsPerEntry(); sbr.append(" "); if (vertical) { sbr.append("\n" + String.format("%-10d", slot) + "\t"); @@ -199,12 +205,12 @@ public void prettyPrint() { // return a fingerprint in a given slot index long getFingerprint(final long index) { - return bitArray_.getBits(index * numBitsPerEntry_ + 3, getFingerprintLength()); + return bitArray_.getBits(index * getNumBitsPerEntry() + 3, getFingerprintLength()); } // return an entire slot representation, including metadata flags and fingerprint long getSlot(final long index) { - return bitArray_.getBits(index * numBitsPerEntry_, numBitsPerEntry_); + return bitArray_.getBits(index 
* getNumBitsPerEntry(), getNumBitsPerEntry()); } // compare a fingerprint input to the fingerprint in some slot index @@ -222,7 +228,7 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo // summarize some statistical measures about the filter public void printFilterSummary() { final long slots = getNumSlots(); - final long numBits = slots * numBitsPerEntry_; + final long numBits = slots * getNumBitsPerEntry(); System.out.println("slots: " + slots); System.out.println("bits: " + numBits); System.out.println("bits/entry: " + numBits / (double)numEntries_); @@ -242,35 +248,35 @@ public void printFilterSummary() { */ @Override public long getSpaceUse() { - return getNumSlots() * numBitsPerEntry_; + return getNumSlots() * getNumBitsPerEntry(); } public int getNumBitsPerEntry() { - return numBitsPerEntry_; + return numFingerprintBits_ + 3; } boolean isOccupied(final long index) { - return bitArray_.getBit(index * numBitsPerEntry_); + return bitArray_.getBit(index * getNumBitsPerEntry()); } boolean isContinuation(final long index) { - return bitArray_.getBit(index * numBitsPerEntry_ + 1); + return bitArray_.getBit(index * getNumBitsPerEntry() + 1); } boolean isShifted(final long index) { - return bitArray_.getBit(index * numBitsPerEntry_ + 2); + return bitArray_.getBit(index * getNumBitsPerEntry() + 2); } void setOccupied(final long index, final boolean val) { - bitArray_.assignBit(index * numBitsPerEntry_, val); + bitArray_.assignBit(index * getNumBitsPerEntry(), val); } void setContinuation(final long index, final boolean val) { - bitArray_.assignBit(index * numBitsPerEntry_ + 1, val); + bitArray_.assignBit(index * getNumBitsPerEntry() + 1, val); } void setShifted(final long index, final boolean val) { - bitArray_.assignBit(index * numBitsPerEntry_ + 2, val); + bitArray_.assignBit(index * getNumBitsPerEntry() + 2, val); } boolean isSlotEmpty(final long index) { @@ -432,7 +438,7 @@ void insertFingerprintAndPushAllElse(long fingerprint, 
long index, final long ca numEntries_++; } - boolean delete(final long fingerprint, final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { + boolean delete(final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { long runEnd = findRunEnd(matchingFingerprintIndex); // the run has only one entry, we need to disable its is_occupied flag @@ -524,7 +530,7 @@ boolean delete(final long fingerprint, final long canonicalSlot) { // we didn't find a matching fingerprint return false; } - return delete(fingerprint, canonicalSlot, runStartIndex, matchingFingerprintIndex); + return delete(canonicalSlot, runStartIndex, matchingFingerprintIndex); } long getSlotFromHash(final long largeHash) { diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 6e1beb9f2..432e5a6df 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -35,10 +35,10 @@ public class DeletionTests { */ @Test static public void BasicDeletions() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 3; - int num_entries = (int)Math.pow(2, num_entries_power); - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + int num_entries = 1 << num_entries_power; + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); long fp1 = 1 << 4; long fp2 = 1 << 3; @@ -60,9 +60,9 @@ static public void BasicDeletions() { qf.delete(fp1, 1); qf.delete(fp1, 1); - BitSet result = new BitSet(num_entries * bits_per_entry); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, false, false, fp2); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, false, false, fp3); + BitSet result = new BitSet(num_entries * 
qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, false, false, fp2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, false, false, fp3); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } @@ -76,10 +76,10 @@ static public void BasicDeletions() { */ @Test static public void Deletions() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 3; int num_entries = (int)Math.pow(2, num_entries_power); - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); qf.insert(1, 1); qf.insert(2, 1); @@ -96,15 +96,15 @@ static public void Deletions() { qf.delete(3, 2); qf.delete(5, 3); - BitSet result = new BitSet(num_entries * bits_per_entry); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 1); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 2); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, true, false, true, 4); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 6); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 7); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 8); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, false, 0); + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1); + result = 
QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, true, false, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, false, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, true, false, false, 8); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, false, 0); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } @@ -121,10 +121,10 @@ static public void Deletions() { * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. */ static public void DeletionsWithWrap() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 3; int num_entries = (int)Math.pow(2, num_entries_power); - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); qf.insert(1, 1); qf.insert(2, 1); @@ -139,15 +139,15 @@ static public void DeletionsWithWrap() { qf.delete(5, 3); //qf.pretty_print(); - BitSet result = new BitSet(num_entries * bits_per_entry); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 1, true, false, false, 1); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 2, true, true, true, 2); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 3, false, false, true, 3); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, true, true, true, 4); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, true, false, true, 6); - result = 
QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, false, true, true, 7); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, true, 8); + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, false, false, true, 3); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, true, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, true, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, true, 8); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 3e9644dfe..00eb994a9 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -44,9 +44,9 @@ public static boolean get_fingerprint_bit(long index, long fingerprint) { */ @Test public void WikiInsertionTest() { - int bits_per_entry = 6; // 6 bits per entry => 3 bits fingerprint, resolved internally in the filter. + int fingerprint_len_bits = 3; // 3 bits fingerprint => 6 bits per entry, resolved internally in the filter. 
int num_entries_power = 3; - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits, 1.0f); final int A = 1; final int B = 2; @@ -93,10 +93,10 @@ public int getState(QuotientFilter filter, int slot) { */ @Test public void PaperInsertionTest() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 4; int num_entries = (int)Math.pow(2, num_entries_power); - QuotientFilter qf = new QuotientFilter(4, 8); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); final int A = 1; final int B = 2; @@ -117,57 +117,30 @@ public void PaperInsertionTest() { qf.insert(G, 6); qf.insert(H, 6); - BitSet result = new BitSet(num_entries * bits_per_entry); - result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, 0); - result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, A); - result = set_slot_in_test(result, bits_per_entry, 2, false, true, true, B); - result = set_slot_in_test(result, bits_per_entry, 3, true, false, false, C); - result = set_slot_in_test(result, bits_per_entry, 4, true, true, true, D); - result = set_slot_in_test(result, bits_per_entry, 5, false, true, true, E); - result = set_slot_in_test(result, bits_per_entry, 6, true, false, true, F); - result = set_slot_in_test(result, bits_per_entry, 7, false, false, true, G); - result = set_slot_in_test(result, bits_per_entry, 8, false, true, true, H); + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, A); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, false, true, true, B); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, true, false, false, C); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, 
true, true, D); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, false, true, true, E); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, true, false, true, F); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, true, G); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 8, false, true, true, H); assertTrue(check_equality(qf, result, false)); } // test we don't get any false negatives for quotient filter @Test public void FalseNegativeTest() { - int bits_per_entry = 10; + int fingerprint_len_bits = 7; int num_entries_power = 10; - QuotientFilter filter = new QuotientFilter(num_entries_power, bits_per_entry); - int num_entries = (int) (Math.pow(2, num_entries_power) * 0.9 ); + QuotientFilter filter = new QuotientFilter(num_entries_power, fingerprint_len_bits); + int num_entries = (int) ((1 << num_entries_power) * 0.8); assertTrue(test_no_false_negatives(filter, num_entries)); } - /* - * Adds two entries to the end of the filter, causing an overflow into the extension slots. - * Checks this can be handled by the internal data structure and then deletes one of the keys from the filter. 
- */ - @Test - public void OverflowTest() { - final int bits_per_entry = 8; - final int num_entries_power = 3; - final int num_entries = (int)Math.pow(2, num_entries_power); - final int fingerprint_size = bits_per_entry - 3; - final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - - final long fp1 = 1; - final long fp2 = 1 << fingerprint_size - 1; - qf.insert(fp1, num_entries - 1); - assertEquals(qf.getFingerprint(num_entries - 1), fp1); - assertEquals(getState(qf, num_entries - 1), 0b100); - qf.insert(fp2, num_entries - 1); - assertEquals(qf.getFingerprint(0), fp2); - assertEquals(getState(qf, 0), 0b011); - qf.delete(fp2, num_entries - 1); - assertEquals(qf.getFingerprint(0), 0); - assertEquals(getState(qf, 0), 0b000); - final boolean found = qf.search(fp1, num_entries - 1); - assertTrue(found); - } - /** * This method tests the functionality of the QuotientFilter and Iterator classes. It creates a QuotientFilter and inserts * six entries into it. An Iterator is then used to traverse the entries in the QuotientFilter. 
The method checks if the @@ -177,11 +150,9 @@ public void OverflowTest() { @Test public void testQuotientFilterInsertionAndIteration() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 4; - //int num_entries = (int)Math.pow(2, num_entries_power); - //int fingerprint_size = bits_per_entry - 3; - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); qf.insert(0x1F, 2); qf.insert(0x1F, 3); @@ -200,9 +171,9 @@ public void testQuotientFilterInsertionAndIteration() { @Test public void testQuotientFilterIterator() { - int bits_per_entry = 8; + int fingerprint_len_bits = 5; int num_entries_power = 4; - QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); qf.insert(0, 1); qf.insert(0, 4); @@ -299,7 +270,7 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent @Test public void smallExpansion() { - final QuotientFilter qf = new QuotientFilter(5, 12); + final QuotientFilter qf = new QuotientFilter(5, 9); final int n = 30; for (int i = 0; i < n; i++) { qf.insert(i); } qf.printFilterSummary(); @@ -319,7 +290,7 @@ public void smallExpansion() { @Test public void expansion() { - final QuotientFilter qf = new QuotientFilter(16, 16); + final QuotientFilter qf = new QuotientFilter(16, 13); final int n = 60000; for (int i = 0; i < n; i++) { qf.insert(i); } // qf.printFilterSummary(); From ffc9bbc2a5e2e28c0ef0a0ff72041a03625b21c3 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Mon, 15 Jul 2024 16:43:56 -0700 Subject: [PATCH 31/38] implemented merge --- .../quotientfilter/QuotientFilter.java | 53 +++++++++++++----- .../quotientfilter/QuotientFilterTest.java | 54 +++++++++++++++++++ 2 files changed, 94 insertions(+), 13 deletions(-) diff --git 
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 7a860f857..cbee5c64a 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -25,6 +25,7 @@ import java.util.Queue; import java.util.Set; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesException; import org.apache.datasketches.filters.common.BitArray; import org.apache.datasketches.filters.common.HeapBitArray; @@ -83,12 +84,6 @@ public int getFingerprintLength() { return numFingerprintBits_; } -// QuotientFilter(final int powerOfTwo, final int numBitsPerEntry, final BitArray bitArray) { -// powerOfTwoSize_ = powerOfTwo; -// numBitsPerEntry_ = numBitsPerEntry; -// bitArray_ = bitArray; -// } - void expand() { if (getFingerprintLength() < 2) throw new SketchesException("for expansion value must have at least 2 bits"); final QuotientFilter other = new QuotientFilter(lgQ_ + 1, numFingerprintBits_ - 1, loadFactor_); @@ -150,6 +145,14 @@ public double getUtilization() { return numEntries_ / (double) getNumSlots(); } + public int getLgQ() { + return lgQ_; + } + + public float getLoadFactor() { + return loadFactor_; + } + // returns the number of slots in the filter without the extension/buffer slots public long getNumSlots() { return 1L << lgQ_; @@ -229,13 +232,14 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo public void printFilterSummary() { final long slots = getNumSlots(); final long numBits = slots * getNumBitsPerEntry(); - System.out.println("slots: " + slots); - System.out.println("bits: " + numBits); - System.out.println("bits/entry: " + numBits / (double)numEntries_); - System.out.println("FP length: " + getFingerprintLength()); - System.out.println("entries: " + 
numEntries_); - System.out.println("expansions: " + numExpansions_); - System.out.println("load: " + numEntries_ / (double)(slots)); + System.out.println("lgQ: " + lgQ_); + System.out.println("FP length: " + getFingerprintLength()); + System.out.println("load factor: " + getLoadFactor()); + System.out.println("bits: " + numBits); + System.out.println("bits/entry: " + numBits / (double)numEntries_); + System.out.println("entries: " + numEntries_); + System.out.println("expansions: " + numExpansions_); + System.out.println("load: " + numEntries_ / (double)(slots)); computeStatistics(); //System.out.println("num runs: \t\t" + num_runs); //System.out.println("avg run length: \t" + avg_run_length); @@ -632,4 +636,27 @@ public void computeStatistics() { avgClusterLength_ = sumClusterLengths / numClusters_; } + public void merge(final QuotientFilter other) { + if (lgQ_ + numFingerprintBits_ != other.lgQ_ + other.numFingerprintBits_) { + throw new SketchesArgumentException("incompatible sketches in merge"); + } + long i = 0; + if (!other.isSlotEmpty(i)) { i = other.findClusterStart(i); } + + final Queue fifo = new LinkedList(); + long count = 0; + while (count < other.numEntries_) { + if (!other.isSlotEmpty(i)) { + if (other.isOccupied(i)) { fifo.add(i); } + final long quotient = fifo.element(); + final long fingerprint = other.getFingerprint(i); + final long hash = quotient << other.getFingerprintLength() | fingerprint; + System.out.println("q=" + quotient + ", fp=" + fingerprint + ", hash=" + hash); + _insert(hash); + count++; + } + i = (i + 1) & other.getSlotMask(); + if (!fifo.isEmpty() && ! 
other.isContinuation(i)) { fifo.remove(); } + } + } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 00eb994a9..e632744ef 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -18,6 +18,7 @@ */ package org.apache.datasketches.filters.quotientfilter; +import org.apache.datasketches.common.SketchesArgumentException; import org.testng.annotations.Test; import static org.testng.Assert.assertTrue; import static org.testng.Assert.assertEquals; @@ -308,4 +309,57 @@ public void expansion() { assertTrue(positives < 6); } + @Test + public void mergeEmpty() { + final QuotientFilter qf1 = new QuotientFilter(4, 3); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.merge(qf2); + + assertEquals(qf1.getLgQ(), 4); + assertEquals(qf1.getFingerprintLength(), 3); + assertEquals(qf1.getNumEntries(), 0); + } + + @Test + public void merge() { + final QuotientFilter qf1 = new QuotientFilter(16, 13); + final QuotientFilter qf2 = new QuotientFilter(16, 13); + final int n = 50000; + for (int i = 0; i < n / 2; i++) { + qf1.insert(i); + qf1.insert(i + n / 2); + } + qf1.merge(qf2); + + assertEquals(qf1.getNumExpansions(), 0); + assertTrue(qf1.getNumEntries() > n * 0.99); // allow a few hash collisions + + // query the same keys + int positives = 0; + for (int i = 0; i < n; i++) { if (qf1.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf1.search(i + n)) { positives++; } } + assertTrue(positives < 4); + } + + @Test + public void mergeDifferentConfiguration() { + final QuotientFilter qf1 = new QuotientFilter(3, 4); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.insert(4); + qf2.insert(4); + qf1.merge(qf2); + 
assertEquals(qf1.getNumEntries(), 1); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void mergeIncompatible() { + final QuotientFilter qf1 = new QuotientFilter(4, 4); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.merge(qf2); + } + } From ff12ef77d175bf1cf6e2b380552e3270248e4bbb Mon Sep 17 00:00:00 2001 From: c-dickens Date: Tue, 16 Jul 2024 12:58:20 +0100 Subject: [PATCH 32/38] Added load factor change into builder functionality --- .../quotientfilter/QuotientFilter.java | 8 ++--- .../quotientfilter/QuotientFilterBuilder.java | 35 +++++++++++++------ .../QuotientFilterBuilderTest.java | 32 ++++++++++++----- 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 7a860f857..9b0180457 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -18,6 +18,7 @@ */ package org.apache.datasketches.filters.quotientfilter; +import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR; import java.util.ArrayList; import java.util.HashSet; @@ -31,11 +32,11 @@ public class QuotientFilter extends Filter { - public static final float DEFAULT_LOAD_FACTOR = 0.8f; + public static final double DEFAULT_LOAD_FACTOR = 0.8; int lgQ_; int numFingerprintBits_; - float loadFactor_; + double loadFactor_; int numEntries_; int numExpansions_; BitArray bitArray_; @@ -50,13 +51,12 @@ public QuotientFilter(final int lgQ, final int numFingerprintBits) { this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR); } - public QuotientFilter(final int lgQ, final int numFingerprintBits, final float loadFactor) { + public QuotientFilter(final int lgQ, final int numFingerprintBits, final double loadFactor) { lgQ_ = lgQ; 
numFingerprintBits_ = numFingerprintBits; loadFactor_ = loadFactor; bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry()); numExpansions_ = 0; - //hash_type = XxHash.hashLong ; //HashType.xxh; } public boolean rejuvenate(final long key) { diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java index a39712195..79b659d87 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -18,8 +18,7 @@ */ package org.apache.datasketches.filters.quotientfilter; -//import java.util.concurrent.ThreadLocalRandom; - +import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR; import org.apache.datasketches.common.SketchesArgumentException; /** @@ -60,11 +59,11 @@ public static byte suggestFingerprintLength(double targetFalsePositiveProb) { * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. * @return The log-base-2 of the number of slots in the filter. 
*/ - public static byte suggestLgNumSlots(long maxDistinctItems) { + public static byte suggestLgNumSlots(long maxDistinctItems, double loadFactor) { if (maxDistinctItems <= 0) { throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); } - byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / Math.log(2)); + byte result = (byte) Math.ceil(Math.log(maxDistinctItems / loadFactor) / Math.log(2)); if (result < 31) { return result; } else { @@ -73,19 +72,27 @@ public static byte suggestLgNumSlots(long maxDistinctItems) { } } + public static byte suggestLgNumSlots(long maxDistinctItems) { + return suggestLgNumSlots(maxDistinctItems, DEFAULT_LOAD_FACTOR); + } + /* Returns the largest number of unique items that can be inserted into the filter. We use a predefined load factor of 0.9 compared to the number of slots as 2^j. @param lgNumSlots The log-base-2 of the number of slots in the filter @return The maximum number of items that can be inserted into the filter */ - public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) { + public static long suggestMaxNumItemsFromNumSlots(int lgNumSlots, double loadFactor) { if (lgNumSlots <= 0) { throw new SketchesArgumentException("lgNumSlots must be at least 1."); } else if (lgNumSlots >= 31) { throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1."); } - return (long) Math.floor(0.9 * Math.pow(2, lgNumSlots)); + return (long) (loadFactor * (1L<= 1.0) { + throw new SketchesArgumentException("loadFactor must be larger than 0 and less than 1"); + } if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java index 4fc38b2bc..84e04c08b 100644 --- 
a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -46,6 +46,9 @@ public static void testSuggestLgNumSlots(){ QuotientFilterBuilder qfb = new QuotientFilterBuilder(); // invalid number of items + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9f)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); @@ -55,7 +58,9 @@ public static void testSuggestLgNumSlots(){ for (int i = 0; i < numItems.length; i++) { long num = numItems[i]; - byte result = qfb.suggestLgNumSlots(num); + byte result = qfb.suggestLgNumSlots(num, 0.9f); + assertEquals(result, results[i]); + result = qfb.suggestLgNumSlots(num); assertEquals(result, results[i]); } } @@ -70,12 +75,15 @@ public static void testSuggestMaxNumItems(){ assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); - byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; - long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + int[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results_ninety_pc = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + long[] results_eighty_pc = {1, 3, 6, 51, 819, 26214, 26843545, 858993459} ; for (int i = 0; i < lgNumSlots.length; i++) { - long result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]); - assertEquals(result, results[i]); + long result_ninety = 
qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.9); + long result_eighty = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.8); + assertEquals(result_ninety, results_ninety_pc[i]); + assertEquals(result_eighty, results_eighty_pc[i]); } } @@ -96,14 +104,22 @@ public static void testSuggestParamsFromMaxDistinctsFPP(){ double[] fpp = {1E-10, 1E-2, 1e-7} ; // expected outcomes - byte[] expected_lgNumSlots = {1, 10, 30} ; + byte[] expected_lgNumSlotsNinety = {1, 10, 30} ; + byte[] expected_lgNumSlotsEighty = {1, 11, 30} ; byte[] expected_fingerprintLength = {34, 7, 24} ; for (int i = 0; i < numItems.length; i++) { - QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9, fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlotsNinety[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + + // 80% load + pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); lgNumSlots = pair.lgNumSlots; fingerprintLength = pair.fingerprintLength; - assertEquals(expected_lgNumSlots[i], lgNumSlots); + assertEquals(expected_lgNumSlotsEighty[i], lgNumSlots); assertEquals(expected_fingerprintLength[i], fingerprintLength); } } From fb44a362fd383f24e01b7b1aa0623c7c66db1fb9 Mon Sep 17 00:00:00 2001 From: c-dickens Date: Tue, 16 Jul 2024 15:26:01 +0100 Subject: [PATCH 33/38] Updated builder function to account for new load factor --- .../quotientfilter/QuotientFilter.java | 16 ++++---- .../quotientfilter/QuotientFilterBuilder.java | 37 +++++++++++++------ .../QuotientFilterBuilderTest.java | 34 ++++++++++++----- 3 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 
cbee5c64a..b9c313e2a 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -32,11 +32,11 @@ public class QuotientFilter extends Filter { - public static final float DEFAULT_LOAD_FACTOR = 0.8f; + public static final double DEFAULT_LOAD_FACTOR = 0.8f; int lgQ_; int numFingerprintBits_; - float loadFactor_; + double loadFactor_; int numEntries_; int numExpansions_; BitArray bitArray_; @@ -51,7 +51,7 @@ public QuotientFilter(final int lgQ, final int numFingerprintBits) { this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR); } - public QuotientFilter(final int lgQ, final int numFingerprintBits, final float loadFactor) { + public QuotientFilter(final int lgQ, final int numFingerprintBits, final double loadFactor) { lgQ_ = lgQ; numFingerprintBits_ = numFingerprintBits; loadFactor_ = loadFactor; @@ -149,7 +149,7 @@ public int getLgQ() { return lgQ_; } - public float getLoadFactor() { + public double getLoadFactor() { return loadFactor_; } @@ -223,7 +223,7 @@ protected boolean compare(final long index, final long fingerprint) { // modify the flags and fingerprint of a given slot void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, - final long index, final long fingerprint) { + final long index, final long fingerprint) { modifySlot(isOccupied, isContinuation, isShifted, index); setFingerprint(index, fingerprint); } @@ -324,7 +324,7 @@ long findFirstFingerprintInRun(long index, final long fingerprint) { if (fingerprintAtIndex == fingerprint) { return index; } else if (fingerprintAtIndex > fingerprint) { - return ~index; + return ~index; } index = (index + 1) & getSlotMask(); } while (isContinuation(index)); @@ -396,7 +396,7 @@ boolean insert(final long fingerprint, final long index) { } void insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, - final boolean isNewRun, final 
boolean isRunStart) { + final boolean isNewRun, final boolean isRunStart) { // in the first shifted entry set isContinuation flag if inserting at the start of the existing run // otherwise just shift the existing flag as it is boolean forceContinuation = !isNewRun && isRunStart; @@ -659,4 +659,4 @@ public void merge(final QuotientFilter other) { if (!fifo.isEmpty() && ! other.isContinuation(i)) { fifo.remove(); } } } -} +} \ No newline at end of file diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java index a39712195..d0f38b39a 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -18,8 +18,7 @@ */ package org.apache.datasketches.filters.quotientfilter; -//import java.util.concurrent.ThreadLocalRandom; - +import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR; import org.apache.datasketches.common.SketchesArgumentException; /** @@ -60,11 +59,11 @@ public static byte suggestFingerprintLength(double targetFalsePositiveProb) { * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. * @return The log-base-2 of the number of slots in the filter. 
*/ - public static byte suggestLgNumSlots(long maxDistinctItems) { + public static byte suggestLgNumSlots(long maxDistinctItems, double loadFactor) { if (maxDistinctItems <= 0) { throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); } - byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / Math.log(2)); + byte result = (byte) Math.ceil(Math.log(maxDistinctItems / loadFactor) / Math.log(2)); if (result < 31) { return result; } else { @@ -73,19 +72,27 @@ public static byte suggestLgNumSlots(long maxDistinctItems) { } } + public static byte suggestLgNumSlots(long maxDistinctItems) { + return suggestLgNumSlots(maxDistinctItems, DEFAULT_LOAD_FACTOR); + } + /* Returns the largest number of unique items that can be inserted into the filter. We use a predefined load factor of 0.9 compared to the number of slots as 2^j. @param lgNumSlots The log-base-2 of the number of slots in the filter @return The maximum number of items that can be inserted into the filter */ - public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) { + public static long suggestMaxNumItemsFromNumSlots(int lgNumSlots, double loadFactor) { if (lgNumSlots <= 0) { throw new SketchesArgumentException("lgNumSlots must be at least 1."); } else if (lgNumSlots >= 31) { throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1."); } - return (long) Math.floor(0.9 * Math.pow(2, lgNumSlots)); + return (long) (loadFactor * (1L<= 1.0) { + throw new SketchesArgumentException("loadFactor must be larger than 0 and less than 1"); + } if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); } @@ -130,4 +145,4 @@ public QFPair(byte lgNumSlots, byte fingerprintLength) { } } -} +} \ No newline at end of file diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java 
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java index 4fc38b2bc..bbe98d617 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -46,6 +46,9 @@ public static void testSuggestLgNumSlots(){ QuotientFilterBuilder qfb = new QuotientFilterBuilder(); // invalid number of items + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9f)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); @@ -55,7 +58,9 @@ public static void testSuggestLgNumSlots(){ for (int i = 0; i < numItems.length; i++) { long num = numItems[i]; - byte result = qfb.suggestLgNumSlots(num); + byte result = qfb.suggestLgNumSlots(num, 0.9f); + assertEquals(result, results[i]); + result = qfb.suggestLgNumSlots(num); assertEquals(result, results[i]); } } @@ -70,12 +75,15 @@ public static void testSuggestMaxNumItems(){ assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); - byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; - long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + int[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results_ninety_pc = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + long[] results_eighty_pc = {1, 3, 6, 51, 819, 26214, 26843545, 858993459} ; for (int i = 0; i < lgNumSlots.length; i++) { - long 
result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]); - assertEquals(result, results[i]); + long result_ninety = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.9); + long result_eighty = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.8); + assertEquals(result_ninety, results_ninety_pc[i]); + assertEquals(result_eighty, results_eighty_pc[i]); } } @@ -96,18 +104,26 @@ public static void testSuggestParamsFromMaxDistinctsFPP(){ double[] fpp = {1E-10, 1E-2, 1e-7} ; // expected outcomes - byte[] expected_lgNumSlots = {1, 10, 30} ; + byte[] expected_lgNumSlotsNinety = {1, 10, 30} ; + byte[] expected_lgNumSlotsEighty = {1, 11, 30} ; byte[] expected_fingerprintLength = {34, 7, 24} ; for (int i = 0; i < numItems.length; i++) { - QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9, fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlotsNinety[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + + // 80% load + pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); lgNumSlots = pair.lgNumSlots; fingerprintLength = pair.fingerprintLength; - assertEquals(expected_lgNumSlots[i], lgNumSlots); + assertEquals(expected_lgNumSlotsEighty[i], lgNumSlots); assertEquals(expected_fingerprintLength[i], fingerprintLength); } } -} +} \ No newline at end of file From e1f3d9788c6c0156da06b6b2941b27de00f23d77 Mon Sep 17 00:00:00 2001 From: c-dickens Date: Tue, 16 Jul 2024 15:31:07 +0100 Subject: [PATCH 34/38] Revert "Added load factor change into builder functionality" This reverts commit ff12ef77d175bf1cf6e2b380552e3270248e4bbb. 
--- .../quotientfilter/QuotientFilter.java | 8 ++--- .../quotientfilter/QuotientFilterBuilder.java | 35 ++++++------------- .../QuotientFilterBuilderTest.java | 32 +++++------------ 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9b0180457..7a860f857 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -18,7 +18,6 @@ */ package org.apache.datasketches.filters.quotientfilter; -import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR; import java.util.ArrayList; import java.util.HashSet; @@ -32,11 +31,11 @@ public class QuotientFilter extends Filter { - public static final double DEFAULT_LOAD_FACTOR = 0.8; + public static final float DEFAULT_LOAD_FACTOR = 0.8f; int lgQ_; int numFingerprintBits_; - double loadFactor_; + float loadFactor_; int numEntries_; int numExpansions_; BitArray bitArray_; @@ -51,12 +50,13 @@ public QuotientFilter(final int lgQ, final int numFingerprintBits) { this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR); } - public QuotientFilter(final int lgQ, final int numFingerprintBits, final double loadFactor) { + public QuotientFilter(final int lgQ, final int numFingerprintBits, final float loadFactor) { lgQ_ = lgQ; numFingerprintBits_ = numFingerprintBits; loadFactor_ = loadFactor; bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry()); numExpansions_ = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; } public boolean rejuvenate(final long key) { diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java index 79b659d87..a39712195 100644 --- 
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -18,7 +18,8 @@ */ package org.apache.datasketches.filters.quotientfilter; -import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR; +//import java.util.concurrent.ThreadLocalRandom; + import org.apache.datasketches.common.SketchesArgumentException; /** @@ -59,11 +60,11 @@ public static byte suggestFingerprintLength(double targetFalsePositiveProb) { * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. * @return The log-base-2 of the number of slots in the filter. */ - public static byte suggestLgNumSlots(long maxDistinctItems, double loadFactor) { + public static byte suggestLgNumSlots(long maxDistinctItems) { if (maxDistinctItems <= 0) { throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); } - byte result = (byte) Math.ceil(Math.log(maxDistinctItems / loadFactor) / Math.log(2)); + byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / Math.log(2)); if (result < 31) { return result; } else { @@ -72,27 +73,19 @@ public static byte suggestLgNumSlots(long maxDistinctItems, double loadFactor) { } } - public static byte suggestLgNumSlots(long maxDistinctItems) { - return suggestLgNumSlots(maxDistinctItems, DEFAULT_LOAD_FACTOR); - } - /* Returns the largest number of unique items that can be inserted into the filter. We use a predefined load factor of 0.9 compared to the number of slots as 2^j. 
@param lgNumSlots The log-base-2 of the number of slots in the filter @return The maximum number of items that can be inserted into the filter */ - public static long suggestMaxNumItemsFromNumSlots(int lgNumSlots, double loadFactor) { + public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) { if (lgNumSlots <= 0) { throw new SketchesArgumentException("lgNumSlots must be at least 1."); } else if (lgNumSlots >= 31) { throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1."); } - return (long) (loadFactor * (1L<= 1.0) { - throw new SketchesArgumentException("loadFactor must be larger than 0 and less than 1"); - } if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java index 84e04c08b..4fc38b2bc 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -46,9 +46,6 @@ public static void testSuggestLgNumSlots(){ QuotientFilterBuilder qfb = new QuotientFilterBuilder(); // invalid number of items - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f)); - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f)); - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9f)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); @@ -58,9 +55,7 @@ public static void testSuggestLgNumSlots(){ for (int i = 0; i < numItems.length; i++) { long num = numItems[i]; - byte result = qfb.suggestLgNumSlots(num, 0.9f); - assertEquals(result, results[i]); - result = qfb.suggestLgNumSlots(num); + byte result = qfb.suggestLgNumSlots(num); assertEquals(result, results[i]); } } @@ -75,15 +70,12 @@ public static void testSuggestMaxNumItems(){ assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); - int[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; - long[] results_ninety_pc = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; - long[] results_eighty_pc = {1, 3, 6, 51, 819, 26214, 26843545, 858993459} ; + byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; for (int i = 0; i < lgNumSlots.length; i++) { - long result_ninety = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.9); - long result_eighty = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.8); - assertEquals(result_ninety, results_ninety_pc[i]); - assertEquals(result_eighty, results_eighty_pc[i]); + long result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]); + assertEquals(result, results[i]); } } @@ -104,22 +96,14 @@ public static void testSuggestParamsFromMaxDistinctsFPP(){ double[] fpp = {1E-10, 1E-2, 1e-7} ; // expected outcomes - byte[] expected_lgNumSlotsNinety = {1, 10, 30} ; - byte[] expected_lgNumSlotsEighty = {1, 11, 30} ; + byte[] expected_lgNumSlots = {1, 10, 30} ; byte[] expected_fingerprintLength = {34, 7, 24} ; for (int i = 0; i < numItems.length; i++) { - QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9, fpp[i]); - lgNumSlots = pair.lgNumSlots; - fingerprintLength = pair.fingerprintLength; - assertEquals(expected_lgNumSlotsNinety[i], lgNumSlots); - assertEquals(expected_fingerprintLength[i], 
fingerprintLength); - - // 80% load - pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); lgNumSlots = pair.lgNumSlots; fingerprintLength = pair.fingerprintLength; - assertEquals(expected_lgNumSlotsEighty[i], lgNumSlots); + assertEquals(expected_lgNumSlots[i], lgNumSlots); assertEquals(expected_fingerprintLength[i], fingerprintLength); } } From f9514fa3020e8bbc5bc51921b0da5d137717d57d Mon Sep 17 00:00:00 2001 From: c-dickens Date: Tue, 16 Jul 2024 15:41:00 +0100 Subject: [PATCH 35/38] changed 0.9f -> 0.9 --- .../filters/quotientfilter/QuotientFilterBuilderTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java index bbe98d617..3199c60af 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -46,9 +46,9 @@ public static void testSuggestLgNumSlots(){ QuotientFilterBuilder qfb = new QuotientFilterBuilder(); // invalid number of items - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f)); - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f)); - assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9f)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9)); 
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); @@ -58,7 +58,7 @@ public static void testSuggestLgNumSlots(){ for (int i = 0; i < numItems.length; i++) { long num = numItems[i]; - byte result = qfb.suggestLgNumSlots(num, 0.9f); + byte result = qfb.suggestLgNumSlots(num, 0.9); assertEquals(result, results[i]); result = qfb.suggestLgNumSlots(num); assertEquals(result, results[i]); From b742749257c56a24ace431046551e35ba8b7f581 Mon Sep 17 00:00:00 2001 From: c-dickens Date: Tue, 16 Jul 2024 15:42:02 +0100 Subject: [PATCH 36/38] changed load facto 0.8f -> 0.8 --- .../datasketches/filters/quotientfilter/QuotientFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index b9c313e2a..b0eece43a 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -32,7 +32,7 @@ public class QuotientFilter extends Filter { - public static final double DEFAULT_LOAD_FACTOR = 0.8f; + public static final double DEFAULT_LOAD_FACTOR = 0.8; int lgQ_; int numFingerprintBits_; From 22ba835993d933313e827830825345363123c0b6 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 16 Jul 2024 16:10:55 -0700 Subject: [PATCH 37/38] removed debug print --- .../datasketches/filters/quotientfilter/QuotientFilter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 
b0eece43a..19c81f4f7 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -651,7 +651,6 @@ public void merge(final QuotientFilter other) { final long quotient = fifo.element(); final long fingerprint = other.getFingerprint(i); final long hash = quotient << other.getFingerprintLength() | fingerprint; - System.out.println("q=" + quotient + ", fp=" + fingerprint + ", hash=" + hash); _insert(hash); count++; } From 9d3a4cde2eb9f83d511a65d961df8368b4aa07c1 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 16 Jul 2024 16:11:13 -0700 Subject: [PATCH 38/38] fixed merge test --- .../datasketches/filters/quotientfilter/QuotientFilterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index e632744ef..5dc775b0c 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -327,7 +327,7 @@ public void merge() { final int n = 50000; for (int i = 0; i < n / 2; i++) { qf1.insert(i); - qf1.insert(i + n / 2); + qf2.insert(i + n / 2); } qf1.merge(qf2);