diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java deleted file mode 100644 index bfa696cad..000000000 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.filters.bloomfilter; - -import static org.apache.datasketches.common.Util.LS; - -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.memory.Buffer; -import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.memory.WritableMemory; - -/** - * This class holds an array of bits suitable for use in a Bloom Filter - * - *

Rounds the number of bits up to the smallest multiple of 64 (one long) - * that is not smaller than the specified number. - */ -abstract class BitArray { - // MAX_BITS using longs, based on array indices being capped at Integer.MAX_VALUE - protected static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; - - protected BitArray() {} - - static BitArray heapify(final Buffer mem, final boolean isEmpty) { - return HeapBitArray.heapify(mem, isEmpty); - } - - static BitArray wrap(final Memory mem, final boolean isEmpty) { - return DirectBitArrayR.wrap(mem, isEmpty); - } - - static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) { - return DirectBitArray.writableWrap(wmem, isEmpty); - } - - boolean isEmpty() { - return !isDirty() && getNumBitsSet() == 0; - } - - abstract boolean hasMemory(); - - abstract boolean isDirect(); - - abstract boolean isReadOnly(); - - abstract boolean getBit(final long index); - - abstract boolean getAndSetBit(final long index); - - abstract void setBit(final long index); - - abstract long getNumBitsSet(); - - abstract void reset(); - - abstract long getCapacity(); - - abstract int getArrayLength(); - - abstract void union(final BitArray other); - - abstract void intersect(final BitArray other); - - abstract void invert(); - - // prints the raw BitArray as 0s and 1s, one long per row - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < getArrayLength(); ++i) { - sb.append(i + ": ") - .append(printLong(getLong(i))) - .append(LS); - } - return sb.toString(); - } - - long getSerializedSizeBytes() { - // We only really need an int for array length but this will keep everything - // aligned to 8 bytes. - // Always write array length, but write numBitsSet only if empty - return Long.BYTES * (isEmpty() ? 
1L : (2L + getArrayLength())); - } - - // returns the number of bytes needed for a non-empty BitArray of the requested size - static long getSerializedSizeBytes(final long numBits) { - if (numBits <= 0) { - throw new SketchesArgumentException("Requested number of bits must be strictly positive"); - } - if (numBits > MAX_BITS) { - throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. " - + "Requested: " + numBits + ", maximum: " + MAX_BITS); - } - final int numLongs = (int) Math.ceil(numBits / 64.0); - return Long.BYTES * (numLongs + 2L); - } - - abstract protected boolean isDirty(); - - // used to get a long from the array regardless of underlying storage - // NOT used to query individual bits - abstract protected long getLong(final int arrayIndex); - - // used to set a long in the array regardless of underlying storage - // NOT used to set individual bits - abstract protected void setLong(final int arrayIndex, final long value); - - // prints a long as a series of 0s and 1s as little endian - protected static String printLong(final long val) { - final StringBuilder sb = new StringBuilder(); - for (int j = 0; j < Long.SIZE; ++j) { - sb.append((val & (1L << j)) != 0 ? 
"1" : "0"); - if (j % 8 == 7) { sb.append(" "); } - } - return sb.toString(); - } - -} diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java index 3ea73b9bd..10829d7b7 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java @@ -26,6 +26,9 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.filters.common.BitArray; +import org.apache.datasketches.filters.common.DirectBitArray; +import org.apache.datasketches.filters.common.HeapBitArray; import org.apache.datasketches.memory.Buffer; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableBuffer; diff --git a/src/main/java/org/apache/datasketches/filters/common/BitArray.java b/src/main/java/org/apache/datasketches/filters/common/BitArray.java new file mode 100644 index 000000000..8320a369f --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/common/BitArray.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.common; + +import static org.apache.datasketches.common.Util.LS; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.memory.Buffer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class holds an array of bits and should be suitable for use in + * the various membership filters. The representation is not compressed and + * is designed to fit in a single array, meaning that the maximum number + * of bits is limited by the maximum size of an array of longs in Java. + * + *

Rounds the number of bits up to the smallest multiple of 64 (one long) + * that is not smaller than the specified number. + */ +public abstract class BitArray { + + /** + * The maximum number of bits that can be represented using longs, + * based on array indices being capped at Integer.MAX_VALUE + * and allowing room for encoding both the size and the number of bits set. + */ + protected static final long MAX_BITS = (Integer.MAX_VALUE - 1) * (long) Long.SIZE; + + /** + * Constructs a new BitArray. + */ + BitArray() {} + + /** + * Creates a BitArray from a given Buffer. + * + * @param mem The Buffer to heapify. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The heapified BitArray. + */ + public static BitArray heapify(final Buffer mem, final boolean isEmpty) { + return HeapBitArray.heapify(mem, isEmpty); + } + + /** + * Creates a BitArray from a given Memory. + * + * @param mem The Memory to wrap. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The wrapped BitArray. + */ + public static BitArray wrap(final Memory mem, final boolean isEmpty) { + return DirectBitArrayR.wrap(mem, isEmpty); + } + + /** + * Creates a writable BitArray from a given WritableMemory. + * + * @param wmem The WritableMemory to wrap. + * @param isEmpty Indicates whether the BitArray is empty. + * @return The writable wrapped BitArray. + */ + public static BitArray writableWrap(final WritableMemory wmem, final boolean isEmpty) { + return DirectBitArray.writableWrap(wmem, isEmpty); + } + + /** + * Checks if the BitArray is empty. + * + * @return True if the BitArray is empty, false otherwise. + */ + public boolean isEmpty() { + return !isDirty() && getNumBitsSet() == 0; + } + + /** + * Checks if the BitArray has a backing Memory. + * + * @return True if the BitArray has a backing Memory, false otherwise. + */ + public abstract boolean hasMemory(); + + /** + * Checks if the BitArray is direct. 
+ * + * @return True if the BitArray is direct, false otherwise. + */ + public abstract boolean isDirect(); + + /** + * Checks if the BitArray is read-only. + * + * @return True if the BitArray is read-only, false otherwise. + */ + public abstract boolean isReadOnly(); + + /** + * Gets the value of a bit at the specified index. + * + * @param index The index of the bit. + * @return The value of the bit at the specified index. + */ + public abstract boolean getBit(final long index); + + /** + * Gets a specified number of bits starting at the given index. Limited + * to a single long (64 bits). + * + * @param index The starting index. + * @param numBits The number of bits to return. + * @return The value of the requested bits, starting at bit 0 of the result. + */ + public abstract long getBits(final long index, final int numBits); + + /** + * Gets the value of a bit at the specified index and sets it to true. + * + * @param index The index of the bit. + * @return The previous value of the bit at the specified index. + */ + public abstract boolean getAndSetBit(final long index); + + /** + * Assigns the value of a bit at the specified index to true. + * + * @param index The index of the bit. + */ + public abstract void setBit(final long index); + + /** + * Assigns the value of a bit at the specified index to false. + * + * @param index The index of the bit. + */ + public abstract void clearBit(final long index); + + /** + * Assigns the given value of a bit at the specified index. + * + * @param index The index of the bit. + * @param value The value to set the bit to. + */ + public abstract void assignBit(final long index, final boolean value); + + /** + /** + * Sets {@code numBits} starting from {@code index} to the specified value. + * Limited to a single long (64 bits).
+ * + * @param index the starting index of the range (inclusive) + * @param numBits the number of bits to write + * @param bits the value to set the bits to, starting with bit 0 + */ + public abstract void setBits(final long index, final int numBits, final long bits); + + /** + * Gets the number of bits that are set to true in the BitArray. + * + * @return The number of bits set to true. + */ + public abstract long getNumBitsSet(); + + /** + * Resets the BitArray, setting all bits to false. + */ + public abstract void reset(); + + /** + * Gets the capacity of the BitArray in bits. + * + * @return The capacity of the BitArray in bits + */ + public abstract long getCapacity(); + + /** + * Gets the length of the underlying array in longs. + * + * @return The length of the underlying array in longs. + */ + public abstract int getArrayLength(); + + /** + * Performs a union operation with another BitArray. + * + * @param other The other BitArray to perform the union with. + */ + public abstract void union(final BitArray other); + + /** + * Performs an intersection operation with another BitArray. + * + * @param other The other BitArray to perform the intersection with. + */ + public abstract void intersect(final BitArray other); + + /** + * Inverts the BitArray, flipping all bits. + */ + public abstract void invert(); + + /** + * Returns a string representation of the BitArray. + * + * @return A string representation of the BitArray. + */ + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < getArrayLength(); ++i) { + sb.append(i + ": ") + .append(printLong(getLong(i))) + .append(LS); + } + return sb.toString(); + } + + /** + * Gets the serialized size of the BitArray in bytes. + * + * @return The serialized size of the BitArray in bytes. + */ + public long getSerializedSizeBytes() { + // We only really need an int for array length but this will keep everything + // aligned to 8 bytes. 
+ // Always write array length, but write numBitsSet only if not empty + return Long.BYTES * (isEmpty() ? 1L : (2L + getArrayLength())); + } + + /** + * Gets the serialized size of a non-empty BitArray of the specified size in bytes. + * + * @param numBits The number of bits in the BitArray. + * @return The serialized size of the BitArray in bytes. + * @throws SketchesArgumentException If the requested number of bits is not strictly positive + * or exceeds the maximum allowed. + */ + public static long getSerializedSizeBytes(final long numBits) { + if (numBits <= 0) { + throw new SketchesArgumentException("Requested number of bits must be strictly positive"); + } + if (numBits > MAX_BITS) { + throw new SketchesArgumentException("Requested number of bits exceeds maximum allowed. " + + "Requested: " + numBits + ", maximum: " + MAX_BITS); + } + final int numLongs = (int) Math.ceil(numBits / 64.0); + return Long.BYTES * (numLongs + 2L); + } + + /** + * Checks if the BitArray has changes not reflected in state variables. + * + * @return True if the BitArray is dirty, false otherwise. + */ + abstract boolean isDirty(); + + /** + * Gets the long value at the specified array index. + * + * @param arrayIndex The index of the long value in the array. + * @return The long value at the specified array index. + */ + abstract long getLong(final int arrayIndex); + + /** + * Sets the long value at the specified array index. + * + * @param arrayIndex The index of the long value in the array. + * @param value The value to set the long to. + */ + abstract void setLong(final int arrayIndex, final long value); + + /** + * Returns a string representation of a long value as a series of 0s and 1s (little endian). + * + * @param val The long value to print. + * @return A string representation of the long value. + */ + public static String printLong(final long val) { + final StringBuilder sb = new StringBuilder(); + for (int j = 0; j < Long.SIZE; ++j) { + sb.append((val & (1L << j)) != 0 ? 
"1" : "0"); + if (j % 8 == 7) { sb.append(" "); } + } + return sb.toString(); + } + +} diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java similarity index 62% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java index 77c24f027..25521672e 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArray.java @@ -17,21 +17,21 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.WritableMemory; -final class DirectBitArray extends DirectBitArrayR { +public final class DirectBitArray extends DirectBitArrayR { - DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) { + public DirectBitArray(final int dataLength, final long storedNumBitsSet, final WritableMemory wmem) { super(dataLength, 0, wmem); // we'll set numBitsSet_ ourselves so pass 0 // can recompute later if needed numBitsSet_ = storedNumBitsSet; } - DirectBitArray(final int dataLength, final WritableMemory wmem) { + public DirectBitArray(final int dataLength, final WritableMemory wmem) { super(dataLength, 0, wmem); wmem_.putInt(0, dataLength_); @@ -39,7 +39,7 @@ final class DirectBitArray extends DirectBitArrayR { wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES); } - static DirectBitArray initialize(final long numBits, final WritableMemory wmem) { + public static DirectBitArray initialize(final long numBits, final WritableMemory wmem) { if (numBits <= 0) { throw new SketchesArgumentException("Number of bits must be strictly positive. 
Found: " + numBits); } @@ -58,7 +58,7 @@ static DirectBitArray initialize(final long numBits, final WritableMemory wmem) return new DirectBitArray(arrayLength, wmem); } - static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) { + public static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmpty) { final int arrayLength = mem.getInt(0); final long storedNumBitsSet = isEmpty ? 0L : mem.getLong(NUM_BITS_OFFSET); @@ -81,7 +81,7 @@ static DirectBitArray writableWrap(final WritableMemory mem, final boolean isEmp } @Override - long getNumBitsSet() { + public long getNumBitsSet() { // update numBitsSet and store in array if (isDirty()) { numBitsSet_ = 0; @@ -95,17 +95,17 @@ long getNumBitsSet() { } @Override - protected boolean isDirty() { + public boolean isDirty() { return numBitsSet_ == -1; } @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0; } @Override - protected long getLong(final int arrayIndex) { + public long getLong(final int arrayIndex) { return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3)); } @@ -115,21 +115,83 @@ public boolean isReadOnly() { } @Override - void reset() { + public void reset() { setNumBitsSet(0); wmem_.clear(DATA_OFFSET, (long) dataLength_ * Long.BYTES); } @Override - void setBit(final long index) { + public void setBit(final long index) { final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); final byte val = wmem_.getByte(memoryOffset); - wmem_.setBits(memoryOffset, (byte) (val | (1 << (index & 0x07)))); + wmem_.putByte(memoryOffset, (byte) (val | (1 << (index & 0x07)))); setNumBitsSet(-1); // mark dirty } @Override - boolean getAndSetBit(final long index) { + public void clearBit(final long index) { + final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); + final byte val = wmem_.getByte(memoryOffset); + wmem_.putByte(memoryOffset, (byte) (val & ~(1 
<< (index & 0x07)))); + setNumBitsSet(-1); // mark dirty + } + + @Override + public void assignBit(final long index, final boolean value) { + if (value) { + setBit(index); + } else { + clearBit(index); + } + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + + // TODO: since Memory provides byte offsets even when reading a long, we can be sure + // that the result always fits in a single long. We can potentially optimize this, but + // need to handle cases where a long would read beyond the end of the Memory. + + final long endBit = index + numBits - 1; + + // these are indices into a long[] array, need to adjust to byte offsets + // when calling wmem_.getLong() + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + + setNumBitsSet(-1); // mark dirty + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? 
-1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + final long mask = toMask - fromMask; + final long maskedVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & ~mask; + wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedVal | ((bits << fromOffset) & mask)); + return; + } + + // spans longs, need to set bits in two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = (1L << fromOffset) - 1; // inverse mask in this case + final long toMask = (1L << (toOffset + 1)) - 1; + + final long maskedFromVal = wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & fromMask; + final long maskedToVal = wmem_.getLong(DATA_OFFSET + (toIndex << 3)) & ~toMask; + + wmem_.putLong(DATA_OFFSET + (fromIndex << 3), maskedFromVal | ((bits << fromOffset) & ~fromMask)); + wmem_.putLong(DATA_OFFSET + (toIndex << 3), maskedToVal | ((bits >>> splitBit) & toMask)); + } + + @Override + public boolean getAndSetBit(final long index) { final long memoryOffset = DATA_OFFSET + ((int) index >>> 3); final byte mask = (byte) (1 << (index & 0x07)); final byte val = wmem_.getByte(memoryOffset); @@ -143,7 +205,7 @@ boolean getAndSetBit(final long index) { } @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -158,7 +220,7 @@ void intersect(final BitArray other) { } @Override - void union(final BitArray other) { + public void union(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -173,7 +235,7 @@ void union(final BitArray other) { } @Override - void invert() { + public void invert() { if (isDirty()) { numBitsSet_ = 0; for (int i = 0; i < dataLength_; ++i) { @@ -191,7 +253,7 @@ void invert() { } @Override - protected void setLong(final int arrayIndex, final long 
value) { + void setLong(final int arrayIndex, final long value) { wmem_.putLong(DATA_OFFSET + (arrayIndex << 3), value); } diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java similarity index 58% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java rename to src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java index 8acc36be2..6d0d4bad3 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayR.java +++ b/src/main/java/org/apache/datasketches/filters/common/DirectBitArrayR.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesReadOnlyException; @@ -35,7 +35,7 @@ public class DirectBitArrayR extends BitArray { final protected WritableMemory wmem_; // for inheritance; we won't write to it protected long numBitsSet_; // could be final here but writable direct will update it - protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) { + public DirectBitArrayR(final int dataLength, final long storedNumBitsSet, final Memory mem) { super(); dataLength_ = dataLength; @@ -53,7 +53,7 @@ protected DirectBitArrayR(final int dataLength, final long storedNumBitsSet, fin // assumes we have a region with only the portion of Memory // the BitArray cares about - static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { + public static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { final int arrayLength = mem.getInt(0); final long storedNumBitsSet = isEmpty ? 
0L : mem.getLong(NUM_BITS_OFFSET); @@ -71,34 +71,73 @@ static DirectBitArrayR wrap(final Memory mem, final boolean isEmpty) { } @Override - long getCapacity() { + public long getCapacity() { return (long) dataLength_ * Long.SIZE; } @Override - long getNumBitsSet() { + public long getNumBitsSet() { return numBitsSet_; } @Override - protected boolean isDirty() { + public boolean isDirty() { // read-only so necessarily false return false; } @Override - int getArrayLength() { + public int getArrayLength() { return dataLength_; } @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { if (isEmpty()) { return false; } return (wmem_.getByte(DATA_OFFSET + ((int) index >>> 3)) & (1 << (index & 0x7))) != 0; } @Override - protected long getLong(final int arrayIndex) { + public long getBits(final long index, final int numBits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (isEmpty()) { return 0L; } + + // TODO: since Memory provides byte offsets even when reading a long, we can be sure + // that the result always fits in a single long. We can potentially optimize this, but + // need to handle cases where a long would read beyond the end of the Memory. + + final long endBit = index + numBits - 1; + + // these are indices into a long[] array, need to adjust to byte offsets + // when calling wmem_.getLong() + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? 
-1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + return (wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & (toMask - fromMask)) >>> fromOffset; + } + + // spans longs, need to combine bits from two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = ~((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + long result = (wmem_.getLong(DATA_OFFSET + (fromIndex << 3)) & fromMask) >>> fromOffset; + result |= (wmem_.getLong(DATA_OFFSET + (toIndex << 3)) & toMask) << splitBit; + return result; + } + + @Override + long getLong(final int arrayIndex) { if (isEmpty()) { return 0L; } return wmem_.getLong(DATA_OFFSET + (arrayIndex << 3)); } @@ -119,37 +158,52 @@ public boolean isReadOnly() { } @Override - void reset() { + public void reset() { throw new SketchesReadOnlyException("Attempt to call reset() on read-only memory"); } @Override - void setBit(final long index) { + public void setBit(final long index) { + throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory"); + } + + @Override + public void clearBit(final long index) { + throw new SketchesReadOnlyException("Attempt to call clearBit() on read-only memory"); + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + throw new SketchesReadOnlyException("Attempt to call setBits() on read-only memory"); + } + + @Override + public void assignBit(final long index, final boolean value) { throw new SketchesReadOnlyException("Attempt to call setBit() on read-only memory"); } @Override - boolean getAndSetBit(final long index) { + public boolean getAndSetBit(final long index) { throw new SketchesReadOnlyException("Attempt to call getAndSetBit() on read-only memory"); } @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { throw new SketchesReadOnlyException("Attempt to call intersect() on read-only memory"); } @Override - void 
union(final BitArray other) { + public void union(final BitArray other) { throw new SketchesReadOnlyException("Attempt to call union() on read-only memory"); } @Override - void invert() { + public void invert() { throw new SketchesReadOnlyException("Attempt to call invert() on read-only memory"); } @Override - protected void setLong(final int arrayIndex, final long value) { + void setLong(final int arrayIndex, final long value) { throw new SketchesReadOnlyException("Attempt to call setLong() on read-only memory"); } } diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java similarity index 57% rename from src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java rename to src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java index 4048b6775..ca81ae073 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/HeapBitArray.java +++ b/src/main/java/org/apache/datasketches/filters/common/HeapBitArray.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import java.util.Arrays; @@ -31,13 +31,13 @@ *

Rounds the number of bits up to the smallest multiple of 64 (one long) * that is not smaller than the specified number. */ -final class HeapBitArray extends BitArray { +public final class HeapBitArray extends BitArray { private long numBitsSet_; // if -1, need to recompute value private boolean isDirty_; final private long[] data_; // creates an array of a given size - HeapBitArray(final long numBits) { + public HeapBitArray(final long numBits) { super(); if (numBits <= 0) { @@ -54,7 +54,7 @@ final class HeapBitArray extends BitArray { } // uses the provided array - HeapBitArray(final long numBitsSet, final long[] data) { + public HeapBitArray(final long numBitsSet, final long[] data) { super(); data_ = data; @@ -64,7 +64,7 @@ final class HeapBitArray extends BitArray { // reads a serialized image, but the BitArray is not fully self-describing so requires // a flag to indicate whether the array is empty - static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { + public static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { final int numLongs = buffer.getInt(); if (numLongs < 0) { throw new SketchesArgumentException("Possible corruption: Must have strictly positive array size. Found: " + numLongs); @@ -85,40 +85,124 @@ static HeapBitArray heapify(final Buffer buffer, final boolean isEmpty) { } @Override - protected boolean isDirty() { + public boolean isDirty() { return isDirty_; } @Override - boolean hasMemory() { + public boolean hasMemory() { return false; } @Override - boolean isDirect() { + public boolean isDirect() { return false; } @Override - boolean isReadOnly() { return false; } + public boolean isReadOnly() { return false; } // queries a single bit in the array @Override - boolean getBit(final long index) { + public boolean getBit(final long index) { return (data_[(int) index >>> 6] & (1L << index)) != 0 ? 
true : false; } + @Override + public long getBits(final long index, final int numBits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 (inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (numBits == 0) { return 0; } + + final long endBit = index + numBits - 1; + + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + return (data_[fromIndex] & (toMask - fromMask)) >>> fromOffset; + } + + // spans longs, need to combine bits from two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = ~((1L << fromOffset) - 1); + final long toMask = (1L << (toOffset + 1)) - 1; + + long result = (data_[fromIndex] & fromMask) >>> fromOffset; + result |= (data_[toIndex] & toMask) << splitBit; + return result; + } + // sets a single bit in the array without querying, meaning the method // cannot properly track the number of bits set so set isDirty = true @Override - void setBit(final long index) { + public void setBit(final long index) { data_[(int) index >>> 6] |= 1L << index; isDirty_ = true; } + @Override + public void clearBit(final long index) { + data_[(int) index >>> 6] &= ~(1L << index); + isDirty_ = true; + } + + // assigns a single bit in the array without querying + @Override + public void assignBit(final long index, final boolean value) { + if (value) { + setBit(index); + } else { + clearBit(index); + } + } + + @Override + public void setBits(final long index, final int numBits, final long bits) { + if (numBits < 0 || numBits > 64) { + throw new SketchesArgumentException("numBits must be between 0 and 64 
(inclusive)"); + } else if (index + numBits > getCapacity()) { + throw new SketchesArgumentException("End of range exceeds capacity"); + } + if (numBits == 0) { return; } + + isDirty_ = true; + final long endBit = index + numBits - 1; + + final int fromIndex = (int) index >>> 6; + final int toIndex = (int) endBit >>> 6; + final long fromOffset = index & 0x3F; + final long toOffset = endBit & 0x3F; + + // within a single long + if (fromIndex == toIndex) { + final long toMask = (toOffset == 63) ? -1L : (1L << (toOffset + 1)) - 1L; + final long fromMask = (1L << fromOffset) - 1L; + final long mask = toMask - fromMask; + data_[fromIndex] = (data_[fromIndex] & ~mask) | ((bits << fromOffset) & mask); + return; + } + + // spans longs, need to set bits in two longs + final long splitBit = Long.SIZE - (fromOffset); + final long fromMask = (1L << fromOffset) - 1; // inverse mask in this case + final long toMask = (1L << (toOffset + 1)) - 1; + + data_[fromIndex] = (data_[fromIndex] & fromMask) | ((bits << fromOffset) & ~fromMask); + data_[toIndex] = (data_[toIndex] & ~toMask) | ((bits >>> splitBit) & toMask); + } + // returns existing value of bit @Override - boolean getAndSetBit(final long index) { + public boolean getAndSetBit(final long index) { final int offset = (int) index >>> 6; final long mask = 1L << index; if ((data_[offset] & mask) != 0) { @@ -134,7 +218,7 @@ boolean getAndSetBit(final long index) { // O(1) if only getAndSetBit() has been used // O(data_.length) if setBit() has ever been used @Override - long getNumBitsSet() { + public long getNumBitsSet() { if (isDirty_) { numBitsSet_ = 0; for (final long val : data_) { @@ -145,14 +229,14 @@ long getNumBitsSet() { } @Override - long getCapacity() { return (long) data_.length * Long.SIZE; } + public long getCapacity() { return (long) data_.length * Long.SIZE; } @Override - int getArrayLength() { return data_.length; } + public int getArrayLength() { return data_.length; } // applies logical OR @Override - void 
union(final BitArray other) { + public void union(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot union bit arrays with unequal lengths"); } @@ -168,7 +252,7 @@ void union(final BitArray other) { // applies logical AND @Override - void intersect(final BitArray other) { + public void intersect(final BitArray other) { if (getCapacity() != other.getCapacity()) { throw new SketchesArgumentException("Cannot intersect bit arrays with unequal lengths"); } @@ -184,7 +268,7 @@ void intersect(final BitArray other) { // applies bitwise inversion @Override - void invert() { + public void invert() { if (isDirty_) { numBitsSet_ = 0; for (int i = 0; i < data_.length; ++i) { @@ -200,7 +284,7 @@ void invert() { } } - void writeToBuffer(final WritableBuffer wbuf) { + public void writeToBuffer(final WritableBuffer wbuf) { wbuf.putInt(data_.length); wbuf.putInt(0); // unused @@ -211,18 +295,18 @@ void writeToBuffer(final WritableBuffer wbuf) { } @Override - protected long getLong(final int arrayIndex) { + public long getLong(final int arrayIndex) { return data_[arrayIndex]; } @Override - protected void setLong(final int arrayIndex, final long value) { + public void setLong(final int arrayIndex, final long value) { data_[arrayIndex] = value; } // clears the array @Override - void reset() { + public void reset() { Arrays.fill(data_, 0); numBitsSet_ = 0; isDirty_ = false; diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java new file mode 100644 index 000000000..53dfd1c4b --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Filter.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + + +import org.apache.datasketches.memory.XxHash; + + +public abstract class Filter { + + //HashType hash_type; + + //abstract boolean rejuvenate(long key); + //abstract boolean expand(); + //protected abstract boolean _delete(long large_hash); + abstract protected boolean _insert(long large_hash); + abstract protected boolean _search(long large_hash); + + + //public boolean delete(long input) { +// return _delete(get_hash(input)); +// } + +// public boolean delete(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// //return _delete(HashFunctions.xxhash(input_buffer)); +// return _delete(XxHash.hashLong(input_buffer)); +// } + +// public boolean delete(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _delete(HashFunctions.xxhash(input_buffer)); +// } +// + public boolean insert(long input) { + //System.out.println("The ABC input is " + input); + long hash = get_hash(input); + //System.out.println("The ABC hash is " + hash); + return _insert(hash); + } +// +// public boolean insert(String input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return 
_insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// +// public boolean insert(byte[] input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// + public boolean search(long input) { + return _search(get_hash(input)); + } +// +// public boolean search(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return _search(HashFunctions.xxhash(input_buffer)); +// } +// +// public boolean search(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _search(HashFunctions.xxhash(input_buffer)); +// } +// + long get_hash(long input) { +// long hash = 0; +// if (hash_type == HashType.arbitrary) { +// hash = HashFunctions.normal_hash((int)input); +// } +// else if (hash_type == HashType.xxh) { +// hash = HashFunctions.xxhash(input); +// } +// else { +// System.exit(1); +// } +// return hash; + return XxHash.hashLong(input, 0L) ; // CD edit for datasketches hash function using same seed. + } + + public long getSpaceUse() { return 0 ; } +// public int get_bits_per_entry() { return 0 ; } +// +// public abstract long get_num_entries(boolean include_all_internal_filters); +// +// public double get_utilization() { +// return 0; +// } +// +// public double measure_num_bits_per_entry() { +// return 0; +// } +// +// static void print_int_in_binary(int num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// int mask = (int)Math.pow(2, i); +// int masked = num & mask; +// str += masked > 0 ? "1" : "0"; +// } +// System.out.println(str); +// } +// +// static void print_long_in_binary(long num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// long mask = (long)Math.pow(2, i); +// long masked = num & mask; +// str += masked > 0 ? 
"1" : "0"; +// } +// System.out.println(str); +// } +// +// String get_fingerprint_str(long fp, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// str += Bitmap.get_fingerprint_bit(i, fp) ? "1" : "0"; +// } +// return str; +// } +// +// public void pretty_print() { +// +// } + + +} + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java new file mode 100644 index 000000000..e04e6cd12 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayDeque; +import java.util.Queue; + +public class Iterator { + + QuotientFilter qf; + long index; + long bucket_index; + long fingerprint; + Queue s; + + Iterator(QuotientFilter new_qf) { + qf = new_qf; + s = new ArrayDeque(); + //s = new ArrayDeque(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + void clear() { + s.clear(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + boolean next() { + + if (index == qf.getNumSlots()) { + return false; + } + + long slot = qf.getSlot(index); + boolean occupied = (slot & 1) != 0; + boolean continuation = (slot & 2) != 0; + boolean shifted = (slot & 4) != 0; + + + while (!occupied && !continuation && !shifted && index < qf.getNumSlots()) { + index++; + if (index == qf.getNumSlots()) { + return false; + } + slot = qf.getSlot(index); + occupied = (slot & 1) != 0; + continuation = (slot & 2) != 0; + shifted = (slot & 4) != 0; + } + + if (occupied && !continuation && !shifted) { + s.clear(); + s.add(index); + bucket_index = index; + } + else if (occupied && continuation && shifted) { + s.add(index); + } + else if (!occupied && !continuation && shifted) { + s.remove(); + bucket_index = s.peek(); + } + else if (!occupied && continuation && shifted) { + // do nothing + } + else if (occupied && !continuation && shifted) { + s.add(index); + s.remove(); + bucket_index = s.peek(); + } + fingerprint = slot >> 3; + index++; + return true; + } + + void print() { + System.out.println("original slot: " + index + " " + bucket_index); + } + +} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java new file mode 100644 index 000000000..19c81f4f7 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -0,0 +1,661 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Queue; +import java.util.Set; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SketchesException; +import org.apache.datasketches.filters.common.BitArray; +import org.apache.datasketches.filters.common.HeapBitArray; + +public class QuotientFilter extends Filter { + + public static final double DEFAULT_LOAD_FACTOR = 0.8; + + int lgQ_; + int numFingerprintBits_; + double loadFactor_; + int numEntries_; + int numExpansions_; + BitArray bitArray_; + + // statistics, computed in the compute_statistics method. 
method should be called before these are used + long numRuns_; + long numClusters_; + public double avgRunLength_; + public double avgClusterLength_; + + public QuotientFilter(final int lgQ, final int numFingerprintBits) { + this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR); + } + + public QuotientFilter(final int lgQ, final int numFingerprintBits, final double loadFactor) { + lgQ_ = lgQ; + numFingerprintBits_ = numFingerprintBits; + loadFactor_ = loadFactor; + bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry()); + numExpansions_ = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; + } + + public boolean rejuvenate(final long key) { + return false; + } + + public long getNumEntries() { + return numEntries_; + } + + public int getNumExpansions() { + return numExpansions_; + } + + public long getMaxEntriesBeforeExpansion() { + return (long)(getNumSlots() * loadFactor_); + } + + BitArray makeFilter(final long initSize, final int bitsPerEntry) { + return new HeapBitArray(initSize * bitsPerEntry); + } + + public int getFingerprintLength() { + return numFingerprintBits_; + } + + void expand() { + if (getFingerprintLength() < 2) throw new SketchesException("for expansion value must have at least 2 bits"); + final QuotientFilter other = new QuotientFilter(lgQ_ + 1, numFingerprintBits_ - 1, loadFactor_); + + long i = 0; + if (!isSlotEmpty(i)) { i = findClusterStart(i); } + + final Queue fifo = new LinkedList(); + long count = 0; + while (count < numEntries_) { + if (!isSlotEmpty(i)) { + if (isOccupied(i)) { fifo.add(i); } + final long fingerprint = getFingerprint(i); + final long newQuotient = (fifo.element() << 1) | (fingerprint >> other.getFingerprintLength()); + final long newFingerprint = fingerprint & other.getFingerprintMask(); + other.insert(newFingerprint, newQuotient); + count++; + } + i = (i + 1) & getSlotMask(); + if (!fifo.isEmpty() && ! 
isContinuation(i)) { fifo.remove(); } + } + lgQ_++; + numFingerprintBits_--; + bitArray_ = other.bitArray_; + numExpansions_++; + } + + // measures the number of bits per entry for the filter + public double measureNumBitsPerEntry() { + return measureNumBitsPerEntry(this, new ArrayList()); + } + + // measures the number of bits per entry for the filter + // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects + protected static double measureNumBitsPerEntry(final QuotientFilter current, final ArrayList otherFilters) { + //System.out.println("--------------------------"); + //current.print_filter_summary(); + //System.out.println(); + double numEntries = current.getNumEntries(); + for (QuotientFilter q : otherFilters) { + //q.print_filter_summary(); + //System.out.println(); + numEntries += q.getNumEntries(); + } + long numBits = current.getNumBitsPerEntry() * current.getNumSlots(); + for (final QuotientFilter q : otherFilters) { + numBits += q.getNumBitsPerEntry() * q.getNumSlots(); + } + //System.out.println("total entries: \t\t" + num_entries); + //System.out.println("total bits: \t\t" + num_bits); + final double bits_per_entry = numBits / numEntries; + //System.out.println("total bits/entry: \t" + bits_per_entry); + //System.out.println(); + return bits_per_entry; + } + + // returns the fraction of occupied slots in the filter + public double getUtilization() { + return numEntries_ / (double) getNumSlots(); + } + + public int getLgQ() { + return lgQ_; + } + + public double getLoadFactor() { + return loadFactor_; + } + + // returns the number of slots in the filter without the extension/buffer slots + public long getNumSlots() { + return 1L << lgQ_; + } + + long getSlotMask() { + return getNumSlots() - 1; + } + + long getFingerprintMask() { + return (1L << getFingerprintLength()) - 1; + } + + // sets the metadata flag bits for a given slot index + void modifySlot(final boolean isOccupied, final 
boolean isContinuation, final boolean isShifted, final long index) { + setOccupied(index, isOccupied); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + } + + // sets the fingerprint for a given slot index + void setFingerprint(final long index, final long fingerprint) { + bitArray_.setBits(index * getNumBitsPerEntry() + 3, getFingerprintLength(), fingerprint); + } + + // print a nice representation of the filter that can be understood. + // if vertical is on, each line will represent a slot + public String getPrettyStr(final boolean vertical) { + final StringBuffer sbr = new StringBuffer(); + final long numBits = getNumSlots() * getNumBitsPerEntry(); + for (long i = 0; i < numBits; i++) { + final long remainder = i % getNumBitsPerEntry(); + if (remainder == 0) { + final long slot = i / getNumBitsPerEntry(); + sbr.append(" "); + if (vertical) { + sbr.append("\n" + String.format("%-10d", slot) + "\t"); + } + } + if (remainder == 3) { + sbr.append(" "); + } + sbr.append(bitArray_.getBit(i) ? "1" : "0"); + } + sbr.append("\n"); + return sbr.toString(); + } + + // print a representation of the filter that can be humanly read. 
+ public void prettyPrint() { + System.out.print(getPrettyStr(true)); + } + + // return a fingerprint in a given slot index + long getFingerprint(final long index) { + return bitArray_.getBits(index * getNumBitsPerEntry() + 3, getFingerprintLength()); + } + + // return an entire slot representation, including metadata flags and fingerprint + long getSlot(final long index) { + return bitArray_.getBits(index * getNumBitsPerEntry(), getNumBitsPerEntry()); + } + + // compare a fingerprint input to the fingerprint in some slot index + protected boolean compare(final long index, final long fingerprint) { + return getFingerprint(index) == fingerprint; + } + + // modify the flags and fingerprint of a given slot + void modifySlot(final boolean isOccupied, final boolean isContinuation, final boolean isShifted, + final long index, final long fingerprint) { + modifySlot(isOccupied, isContinuation, isShifted, index); + setFingerprint(index, fingerprint); + } + + // summarize some statistical measures about the filter + public void printFilterSummary() { + final long slots = getNumSlots(); + final long numBits = slots * getNumBitsPerEntry(); + System.out.println("lgQ: " + lgQ_); + System.out.println("FP length: " + getFingerprintLength()); + System.out.println("load factor: " + getLoadFactor()); + System.out.println("bits: " + numBits); + System.out.println("bits/entry: " + numBits / (double)numEntries_); + System.out.println("entries: " + numEntries_); + System.out.println("expansions: " + numExpansions_); + System.out.println("load: " + numEntries_ / (double)(slots)); + computeStatistics(); + //System.out.println("num runs: \t\t" + num_runs); + //System.out.println("avg run length: \t" + avg_run_length); + //System.out.println("num clusters: \t\t" + num_clusters); + //System.out.println("avg cluster length: \t" + avg_cluster_length); + } + + /* + * Returns the number of bits used for the filter + */ + @Override + public long getSpaceUse() { + return getNumSlots() * 
getNumBitsPerEntry(); + } + + public int getNumBitsPerEntry() { + return numFingerprintBits_ + 3; + } + + boolean isOccupied(final long index) { + return bitArray_.getBit(index * getNumBitsPerEntry()); + } + + boolean isContinuation(final long index) { + return bitArray_.getBit(index * getNumBitsPerEntry() + 1); + } + + boolean isShifted(final long index) { + return bitArray_.getBit(index * getNumBitsPerEntry() + 2); + } + + void setOccupied(final long index, final boolean val) { + bitArray_.assignBit(index * getNumBitsPerEntry(), val); + } + + void setContinuation(final long index, final boolean val) { + bitArray_.assignBit(index * getNumBitsPerEntry() + 1, val); + } + + void setShifted(final long index, final boolean val) { + bitArray_.assignBit(index * getNumBitsPerEntry() + 2, val); + } + + boolean isSlotEmpty(final long index) { + return !isOccupied(index) && !isContinuation(index) && !isShifted(index); + } + + // scan the cluster leftwards until finding the start of the cluster and returning its slot index + // used by deletes + long findClusterStart(long index) { + while (isShifted(index)) { + index = (index - 1) & getSlotMask(); + } + return index; + } + + // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides + // since the run might have been shifted to the right due to collisions + long findRunStart(long index) { + int numRunsToSkip = 0; + while (isShifted(index)) { + index = (index - 1) & getSlotMask(); + if (isOccupied(index)) { + numRunsToSkip++; + } + } + while (numRunsToSkip > 0) { + index = (index + 1) & getSlotMask(); + if (!isContinuation(index)) { + numRunsToSkip--; + } + } + return index; + } + + // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative + long findFirstFingerprintInRun(long index, final long fingerprint) { + assert !isContinuation(index); + do { + final 
long fingerprintAtIndex = getFingerprint(index); + if (fingerprintAtIndex == fingerprint) { + return index; + } else if (fingerprintAtIndex > fingerprint) { + return ~index; + } + index = (index + 1) & getSlotMask(); + } while (isContinuation(index)); + return ~index; + } + + // delete the last matching fingerprint in the run + long decideWhichFingerprintToDelete(long index, final long fingerprint) { + assert !isContinuation(index); + long matchingFingerprintIndex = -1; + do { + if (compare(index, fingerprint)) { + matchingFingerprintIndex = index; + } + index = (index + 1) & getSlotMask(); + } while (isContinuation(index)); + return matchingFingerprintIndex; + } + + // given the start of a run, find the last slot index that still belongs to this run + long findRunEnd(long index) { + while (isContinuation((index + 1) & getSlotMask())) { + index = (index + 1) & getSlotMask(); + } + return index; + } + + // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it + boolean search(final long fingerprint, final long index) { + if (!isOccupied(index)) { + return false; + } + final long runStartIndex = findRunStart(index); + final long foundIndex = findFirstFingerprintInRun(runStartIndex, fingerprint); + return foundIndex >= 0; + } + + // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. + // This method is only used for testing purposes. 
+ Set getAllFingerprints(final long bucketIndex) { + final boolean doesRunExist = isOccupied(bucketIndex); + final HashSet set = new HashSet(); + if (!doesRunExist) { + return set; + } + long runIndex = findRunStart(bucketIndex); + do { + set.add(getFingerprint(runIndex)); + runIndex = (runIndex + 1) & getSlotMask(); + } while (isContinuation(runIndex)); + return set; + } + + boolean insert(final long fingerprint, final long index) { + if (index >= getNumSlots() || numEntries_ == getNumSlots()) { + return false; + } + final long runStart = findRunStart(index); + if (!isOccupied(index)) { + insertFingerprintAndPushAllElse(fingerprint, runStart, index, true, true); + return true; + } + final long foundIndex = findFirstFingerprintInRun(runStart, fingerprint); + if (foundIndex >= 0) { + return false; + } + insertFingerprintAndPushAllElse(fingerprint, ~foundIndex, index, false, ~foundIndex == runStart); + return true; + } + + void insertFingerprintAndPushAllElse(long fingerprint, long index, final long canonical, + final boolean isNewRun, final boolean isRunStart) { + // in the first shifted entry set isContinuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean forceContinuation = !isNewRun && isRunStart; + + // prepare flags for the current slot + boolean isContinuation = !isRunStart; + boolean isShifted = index != canonical; + + // remember the existing entry from the current slot to be shifted to the next slot + // isOccupied flag belongs to the slot, therefore it is never shifted + // isShifted flag is always true for all shifted entries, no need to remember it + long existingFingerprint = getFingerprint(index); + boolean existingIsContinuation = isContinuation(index); + + while (!isSlotEmpty(index)) { + // set the current slot + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + // prepare values for the next slot + fingerprint = 
existingFingerprint; + isContinuation = existingIsContinuation | forceContinuation; + isShifted = true; + + index = (index + 1) & getSlotMask(); + + // remember the existing entry to be shifted + existingFingerprint = getFingerprint(index); + existingIsContinuation = isContinuation(index); + + forceContinuation = false; // this is needed for the first shift only + } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one + setFingerprint(index, fingerprint); + setContinuation(index, isContinuation); + setShifted(index, isShifted); + + if (isNewRun) { + setOccupied(canonical, true); + } + numEntries_++; + } + + boolean delete(final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) { + long runEnd = findRunEnd(matchingFingerprintIndex); + + // the run has only one entry, we need to disable its is_occupied flag + // we just remember we need to do this here, and we do it later to not interfere with counts + boolean turnOffOccupied = runStartIndex == runEnd; + + // First thing to do is move everything else in the run back by one slot + for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getSlotMask()) { + long f = getFingerprint((i + 1) & getSlotMask()); + setFingerprint(i, f); + } + + // for each slot, we want to know by how much the entry there is shifted + // we can do this by counting the number of continuation flags set to true + // and the number of occupied flags set to false from the start of the cluster to the given cell + // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted + long clusterStart = findClusterStart(canonicalSlot); + long numShiftedCount = 0; + long numNonOccupied = 0; + for (long i = clusterStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) { + if (isContinuation(i)) { + numShiftedCount++; + } + if (!isOccupied(i)) { + numNonOccupied++; + } + } + 
+ setFingerprint(runEnd, 0); + setShifted(runEnd, false); + setContinuation(runEnd, false); + + // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. + // the inner for loop iterates over cells of particular runs, pushing entries one slot back. + do { + // we first check if the next run actually exists and if it is shifted. + // only if both conditions hold, we need to shift it back one slot. + //boolean does_next_run_exist = !is_slot_empty(run_end + 1); + //boolean is_next_run_shifted = is_shifted(run_end + 1); + //if (!does_next_run_exist || !is_next_run_shifted) { + if (isSlotEmpty((runEnd + 1) & getSlotMask()) || !isShifted((runEnd + 1) & getSlotMask())) { + if (turnOffOccupied) { + // if we eliminated a run and now need to turn the isOccupied flag off, we do it at the end to not interfere in our counts + setOccupied(canonicalSlot, false); + } + return true; + } + + // we now find the start and end of the next run + final long nextRunStart = (runEnd + 1) & getSlotMask(); + runEnd = findRunEnd(nextRunStart); + + // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot + // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place + if (isOccupied((nextRunStart - 1) & getSlotMask()) && numShiftedCount - numNonOccupied == 1) { + setShifted((nextRunStart - 1) & getSlotMask(), false); + } else { + setShifted((nextRunStart - 1) & getSlotMask(), true); + } + + for (long i = nextRunStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) { + long f = getFingerprint(i); + setFingerprint((i - 1) & getSlotMask(), f); + if (isContinuation(i)) { + setContinuation((i - 1) & getSlotMask(), true); + } + if (!isOccupied(i)) { + numNonOccupied++; + } + if (i != nextRunStart) { + numShiftedCount++; + } + } + setFingerprint(runEnd, 0); + setShifted(runEnd, false); + 
setContinuation(runEnd, false); + } while (true); + } + + boolean delete(final long fingerprint, final long canonicalSlot) { + // if the run doesn't exist, the key can't have possibly been inserted + boolean doesRunExist = isOccupied(canonicalSlot); + if (!doesRunExist) { + return false; + } + long runStartIndex = findRunStart(canonicalSlot); + long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint); + if (matchingFingerprintIndex == -1) { + // we didn't find a matching fingerprint + return false; + } + return delete(canonicalSlot, runStartIndex, matchingFingerprintIndex); + } + + long getSlotFromHash(final long largeHash) { + return (largeHash >> getFingerprintLength()) & getSlotMask(); + } + + long getFingerprintFromHash(final long largeHash) { + return largeHash & getFingerprintMask(); + } + + /* + This is the main insertion function accessed externally. + It calls the underlying filter _insert function which hashes the input + item internally. + Hence, the `large_hash` argument is already a hash key that has been generated + by the hashing library (eg xxhash). 
+   */
+  protected boolean _insert(final long largeHash) {
+    // the bits above the fingerprint select the slot, the low bits are the fingerprint
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    final boolean success = insert(fingerprint, slotIndex);
+
+    // grow once the entry count reaches the expansion threshold
+    if (numEntries_ == getMaxEntriesBeforeExpansion()) {
+      expand();
+    }
+    return success;
+  }
+
+  /**
+   * Deletes the entry described by the given hash and, when the underlying
+   * delete reports success, decrements the entry count.
+   * @param largeHash hash that is split into a slot index and a fingerprint
+   * @return the result of the underlying delete
+   */
+  protected boolean _delete(final long largeHash) {
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    final boolean success = delete(fingerprint, slotIndex);
+    if (success) {
+      numEntries_--;
+    }
+    return success;
+  }
+
+  /**
+   * Looks up the entry described by the given hash.
+   * @param largeHash hash that is split into a slot index and a fingerprint
+   * @return the result of the underlying search
+   */
+  protected boolean _search(final long largeHash) {
+    final long slotIndex = getSlotFromHash(largeHash);
+    final long fingerprint = getFingerprintFromHash(largeHash);
+    return search(fingerprint, slotIndex);
+  }
+
+  /**
+   * Returns the raw bit stored at the given offset of the underlying bit array.
+   * @param offset bit offset into the bit array
+   * @return true if the bit is set
+   */
+  public boolean getBitAtOffset(final int offset) {
+    return bitArray_.getBit(offset);
+  }
+
+  /**
+   * Recomputes numRuns_, numClusters_, avgRunLength_ and avgClusterLength_ with a
+   * single pass over the (occupied, continuation, shifted) bits of every slot.
+   * The averages are 0 when there are no runs or no clusters.
+   */
+  public void computeStatistics() {
+    numRuns_ = 0;
+    numClusters_ = 0;
+    double sumRunLengths = 0;
+    double sumClusterLengths = 0;
+
+    int currentRunLength = 0;
+    int currentClusterLength = 0;
+
+    final long numSlots = getNumSlots();
+    for (long i = 0; i < numSlots; i++) {
+      final boolean occupied = isOccupied(i);
+      final boolean continuation = isContinuation(i);
+      final boolean shifted = isShifted(i);
+
+      if (continuation) {
+        // continuation without shifted is not a used state and is ignored
+        if (shifted) { // continuation of run
+          currentClusterLength++;
+          currentRunLength++;
+        }
+      } else if (shifted) { // start of new run inside the current cluster
+        numRuns_++;
+        sumRunLengths += currentRunLength;
+        currentRunLength = 1;
+        currentClusterLength++;
+      } else if (occupied) { // start of new cluster & run
+        numRuns_++;
+        numClusters_++;
+        sumClusterLengths += currentClusterLength;
+        sumRunLengths += currentRunLength;
+        currentClusterLength = 1;
+        currentRunLength = 1;
+      } else { // empty slot closes the current run and cluster
+        sumClusterLengths += currentClusterLength;
+        currentClusterLength = 0;
+        sumRunLengths += currentRunLength;
+        currentRunLength = 0;
+      }
+    }
+    // The run and cluster still open at the last slot were never added to the sums.
+    // If they wrap around, their head was already accumulated at the start of the
+    // scan without incrementing the counters, so adding the tail here completes them.
+    sumRunLengths += currentRunLength;
+    sumClusterLengths += currentClusterLength;
+
+    // avoid NaN when the filter holds no runs or clusters
+    avgRunLength_ = numRuns_ == 0 ? 0 : sumRunLengths / numRuns_;
+    avgClusterLength_ = numClusters_ == 0 ? 0 : sumClusterLengths / numClusters_;
+  }
+
+  /**
+   * Inserts the entries of the other filter into this one by rebuilding each
+   * entry's hash from its quotient and fingerprint.
+   * @param other filter whose lgQ_ + numFingerprintBits_ equals that of this filter
+   * @throws SketchesArgumentException if the total number of hash bits differs
+   */
+  public void merge(final QuotientFilter other) {
+    if (lgQ_ + numFingerprintBits_ != other.lgQ_ + other.numFingerprintBits_) {
+      throw new SketchesArgumentException("incompatible sketches in merge");
+    }
+    long i = 0;
+    if (!other.isSlotEmpty(i)) { i = other.findClusterStart(i); }
+
+    // quotients of the occupied slots whose runs have not been fully visited yet
+    final Queue<Long> fifo = new LinkedList<>();
+    long count = 0;
+    while (count < other.numEntries_) {
+      if (!other.isSlotEmpty(i)) {
+        if (other.isOccupied(i)) { fifo.add(i); }
+        final long quotient = fifo.element();
+        final long fingerprint = other.getFingerprint(i);
+        final long hash = (quotient << other.getFingerprintLength()) | fingerprint;
+        _insert(hash);
+        count++;
+      }
+      i = (i + 1) & other.getSlotMask();
+      // the next slot is not a continuation, so the run of the head quotient has ended
+      if (!fifo.isEmpty() && !other.isContinuation(i)) { fifo.remove(); }
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
new file mode 100644
index 000000000..d0f38b39a
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import static org.apache.datasketches.filters.quotientfilter.QuotientFilter.DEFAULT_LOAD_FACTOR;
+import org.apache.datasketches.common.SketchesArgumentException;
+
+/**
+ * This class provides methods to help estimate the correct parameters when
+ * creating a Quotient filter, and methods to create the filter using those values.
+ *
+ * <p>The underlying math is described in the
+ * <a href="https://en.wikipedia.org/wiki/Quotient_filter">
+ * Wikipedia article on Quotient filters</a>.
+ */
+public final class QuotientFilterBuilder {
+
+  /**
+   * Suggests the fingerprint length for a target false positive probability.
+   * The fingerprint length is related to the targetFalsePositiveProb roughly by 2^(-fingerprint_length),
+   * so the result is ceil(log2(1 / targetFalsePositiveProb)).
+   * This, after rounding up, is the same as the more sophisticated expression which involves the capacity
+   * from https://en.wikipedia.org/wiki/Quotient_filter#Probability_of_false_positives.
+   * @param targetFalsePositiveProb A desired false positive probability per item, strictly between 0 and 1
+   * @return The suggested fingerprint length in bits
+   * @throws SketchesArgumentException if the probability is NaN, not strictly between 0 and 1,
+   *     or so small that the length does not fit in a byte
+   */
+  public static byte suggestFingerprintLength(double targetFalsePositiveProb) {
+    // written as a negated conjunction so that NaN is rejected as well
+    if (!(targetFalsePositiveProb > 0. && targetFalsePositiveProb < 1.)) {
+      throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0");
+    }
+    final double numBits = Math.ceil(-Math.log(targetFalsePositiveProb) / Math.log(2));
+    if (numBits > Byte.MAX_VALUE) {
+      // the narrowing cast below would otherwise wrap around to a negative length
+      throw new SketchesArgumentException("targetFalsePositiveProb is too small: " + targetFalsePositiveProb);
+    }
+    return (byte) numBits;
+  }
+
+  /**
+   * This method suggests the number of slots in the filter for a given input size and load factor.
+   * There is no load factor checking internally within the filter, so this method is used to map between the
+   * number of items we insert into a sketch and the number of slots we need to allocate.
+   * The result is ceil(log2(maxDistinctItems / loadFactor)).
+   * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter.
+   * @param loadFactor The fraction of the slots that may be filled; must be strictly positive.
+   * @return The log-base-2 of the number of slots in the filter.
+   * @throws SketchesArgumentException if an argument is not strictly positive or the result would be 31 or more
+   */
+  public static byte suggestLgNumSlots(long maxDistinctItems, double loadFactor) {
+    if (maxDistinctItems <= 0) {
+      throw new SketchesArgumentException("maxDistinctItems must be strictly positive");
+    }
+    // negated so that NaN is rejected; zero or negative values used to yield -1 or 0
+    if (!(loadFactor > 0.)) {
+      throw new SketchesArgumentException("loadFactor must be strictly positive");
+    }
+    // range-check the double before narrowing: casting first lets large values wrap past the check
+    final double lgNumSlots = Math.ceil(Math.log(maxDistinctItems / loadFactor) / Math.log(2));
+    if (lgNumSlots >= 31) {
+      // Largest address space for a Java array is 2^31 - 1
+      throw new SketchesArgumentException("Largest address space for a Java array is 2^31 - 1");
+    }
+    return (byte) lgNumSlots;
+  }
+
+  /**
+   * Suggests the number of slots for a given input size using DEFAULT_LOAD_FACTOR.
+   * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter.
+   * @return The log-base-2 of the number of slots in the filter.
+   */
+  public static byte suggestLgNumSlots(long maxDistinctItems) {
+    return suggestLgNumSlots(maxDistinctItems, DEFAULT_LOAD_FACTOR);
+  }
+
+  /*
+  Returns the largest number of unique items that can be inserted into the filter.
+  We use a predefined load factor of 0.9 compared to the number of slots as 2^j.
+ @param lgNumSlots The log-base-2 of the number of slots in the filter + @return The maximum number of items that can be inserted into the filter + */ + public static long suggestMaxNumItemsFromNumSlots(int lgNumSlots, double loadFactor) { + if (lgNumSlots <= 0) { + throw new SketchesArgumentException("lgNumSlots must be at least 1."); + } else if (lgNumSlots >= 31) { + throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1."); + } + return (long) (loadFactor * (1L<= 1.0) { + throw new SketchesArgumentException("loadFactor must be larger than 0 and less than 1"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } + } + + /** + * Helper class to return a pair of parameters for a Quotient filter: + * the log-base-2 of the number of slots (lgNumSlots) and the fingerprint length. + * These parameters are used to configure the Quotient filter. + */ + public static class QFPair { + public final byte lgNumSlots; + public final byte fingerprintLength; + + public QFPair(byte lgNumSlots, byte fingerprintLength) { + this.lgNumSlots = lgNumSlots; + this.fingerprintLength = fingerprintLength; + } + } + +} \ No newline at end of file diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java index 4b1461876..c204751f2 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ b/src/main/java/org/apache/datasketches/theta/Sketches.java @@ -80,7 +80,7 @@ public static int getMaxAnotBResultBytes(final int maxNomEntries) { /** * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. Note that this assumes the worse case of the sketch in + * number of actual entries. 
Note that this assumes the worst case of the sketch in * estimation mode, which requires storing theta and count. * @param numberOfEntries the actual number of entries stored with the CompactSketch. * @return the maximum number of storage bytes required for a CompactSketch with the given number diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java similarity index 85% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java index 521019e62..ea02ad21a 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayRTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayRTest.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -99,6 +99,27 @@ public void basicOperationTest() { assertTrue(dba.isReadOnly()); } + @Test + public void getBitsFromToTest() { + final HeapBitArray hba = new HeapBitArray(128); + hba.setBit(1); // will override, but this forces non-empty + hba.setLong(0, 0x5555555555555555L); + hba.setLong(1, 0xFFFFFFFFFC003FFFL); + final Memory mem = bitArrayToMemory(hba); + DirectBitArrayR dba = DirectBitArrayR.wrap(mem, hba.isEmpty()); + + // single, full long test + assertEquals(dba.getBits(0, 64), 0x5555555555555555L); + + // subset of single long, mostly ones with a stretch of zeros + assertEquals(dba.getBits(64, 64), 0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(78, 12), 0); + assertEquals(dba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(dba.getBits(60, 20), 0x3FFF5); + } + @Test public void countBitsWhenDirty() { // like basicOperationTest but with setBit 
which does @@ -159,6 +180,9 @@ public void checkInvalidMethods() { // all of these try to modify a read-only memory assertThrows(SketchesReadOnlyException.class, () -> dba.setBit(14)); + assertThrows(SketchesReadOnlyException.class, () -> dba.clearBit(7)); + assertThrows(SketchesReadOnlyException.class, () -> dba.assignBit(924, false)); + assertThrows(SketchesReadOnlyException.class, () -> dba.setBits(100, 30, 0xFF)); assertThrows(SketchesReadOnlyException.class, () -> dba.getAndSetBit(100)); assertThrows(SketchesReadOnlyException.class, () -> dba.reset()); assertThrows(SketchesReadOnlyException.class, () -> dba.invert()); diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java similarity index 78% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java rename to src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java index a45bcbb82..4cc229c50 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/DirectBitArrayTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/DirectBitArrayTest.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -68,6 +68,8 @@ public void tooSmallCapacityTest() { } // no text of max size because the BitArray allows up to Integer.MAX_VALUE + // bits, which is the maximum size of an array in Java -- can't use it all, + // (need 2 longs for preamble) but also can't allocate that large to test on most machines @Test public void initializeTooSmallTest() { @@ -134,6 +136,70 @@ public void basicWritableWrapTest() { dba.setBit(100); assertTrue(dba.getAndSetBit(100)); assertEquals(dba.getNumBitsSet(), 8); + + dba.reset(); + assertTrue(dba.isEmpty()); + assertEquals(dba.getNumBitsSet(), 0); + + dba.setBit(0); + dba.setLong(0, -1); + assertTrue(dba.getBit(60)); + dba.clearBit(60); + assertFalse(dba.getBit(60)); + + assertTrue(dba.getBit(35)); + dba.assignBit(35, false); + assertFalse(dba.getBit(35)); + dba.assignBit(35, true); + assertTrue(dba.getBit(35)); + } + + @Test + public void getBitsFromToTest() { + final int numBits = 128; + final WritableMemory wmem = WritableMemory.writableWrap(new byte[32]); + final DirectBitArray dba = DirectBitArray.initialize(numBits, wmem); + + // single, full long test + dba.setBit(0); // useless but forces non-empty when using setLong() + dba.setLong(0, 0x5555555555555555L); + assertEquals(dba.getBits(0, 64), 0x5555555555555555L); + assertEquals(dba.getBits(64, 64), 0); + + // subset of single long, mostly ones with a stretch of zeros + dba.setLong(1, 0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(64, 64), 0xFFFFFFFFFC003FFFL); + assertEquals(dba.getBits(78, 12), 0); + assertEquals(dba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(dba.getBits(60, 20), 0x3FFF5); + } + + @Test + public void setBitsFromToTest() { + final int numBits = 128; + WritableMemory wmem = WritableMemory.writableWrap(new byte[32]); + DirectBitArray ba = 
DirectBitArray.initialize(numBits, wmem); + + // within a single long + ba.setBits(0, 64, 0x80000000DAB8C730L); + assertEquals(ba.getLong(0), 0x80000000DAB8C730L); + assertEquals(ba.getLong(1), 0); + + ba.setBits(40, 8, 0xA6); + assertEquals(ba.getLong(0), 0x8000A600DAB8C730L); + + // spanning longs + ba.setBits(60, 20, 0x3FFF5); + assertEquals(ba.getLong(0), 0x5000A600DAB8C730L); + assertEquals(ba.getLong(1), 0x3FFFL); + + // found specific failure with this test + wmem = WritableMemory.writableWrap(new byte[1272]); + ba = DirectBitArray.initialize(10000, wmem); + ba.setBits(601 * 10 + 3, 7, 125); + assertEquals(ba.getBits(601 * 10 + 3, 7), 125); } @Test diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java similarity index 79% rename from src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java rename to src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java index 0e91788ea..a55f98a30 100644 --- a/src/test/java/org/apache/datasketches/filters/bloomfilter/HeapBitArrayTest.java +++ b/src/test/java/org/apache/datasketches/filters/common/HeapBitArrayTest.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.datasketches.filters.bloomfilter; +package org.apache.datasketches.filters.common; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; @@ -75,9 +75,62 @@ public void basicOperationTest() { assertTrue(ba.isEmpty()); assertEquals(ba.getNumBitsSet(), 0); + ba.setLong(0, -1); + assertTrue(ba.getBit(60)); + ba.clearBit(60); + assertFalse(ba.getBit(60)); + + assertTrue(ba.getBit(35)); + ba.assignBit(35, false); + assertFalse(ba.getBit(35)); + ba.assignBit(35, true); + assertTrue(ba.getBit(35)); + assertTrue(String.valueOf(ba).length() > 0); } + @Test + public void getBitsFromToTest() { + final HeapBitArray ba = new HeapBitArray(128); + + // single, full long test + ba.setLong(0, 0x5555555555555555L); + assertEquals(ba.getBits(0, 64), 0x5555555555555555L); + assertEquals(ba.getBits(64, 64), 0); + + // subset of single long, mostly ones with a stretch of zeros + ba.setLong(1, 0xFFFFFFFFFC003FFFL); + assertEquals(ba.getBits(64, 64), 0xFFFFFFFFFC003FFFL); + assertEquals(ba.getBits(78, 12), 0); + assertEquals(ba.getBits(77, 14), 8193); + + // spanning longs + assertEquals(ba.getBits(60, 20), 0x3FFF5); + } + + @Test + public void setBitsFromToTest() { + HeapBitArray ba = new HeapBitArray(128); + + // within a single long + ba.setBits(0, 64, 0x80000000DAB8C730L); + assertEquals(ba.getLong(0), 0x80000000DAB8C730L); + assertEquals(ba.getLong(1), 0); + + ba.setBits(40, 8, 0xA6); + assertEquals(ba.getLong(0), 0x8000A600DAB8C730L); + + // spanning longs + ba.setBits(60, 20, 0x3FFF5); + assertEquals(ba.getLong(0), 0x5000A600DAB8C730L); + assertEquals(ba.getLong(1), 0x3FFFL); + + // found specific failure with this test + ba = new HeapBitArray(10000); + ba.setBits(601 * 10 + 3, 7, 125); + assertEquals(ba.getBits(601 * 10 + 3, 7), 125); + } + @Test public void bitAddresOutOfBoundsTest() { final HeapBitArray ba = new HeapBitArray(1024); diff --git 
a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java new file mode 100644 index 000000000..432e5a6df --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; +import org.testng.annotations.Test; +import static org.testng.Assert.assertTrue; + +import java.util.BitSet; + +public class DeletionTests { + + /** + * This test checks the functionality of deleting items from the QuotientFilter. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. Checking that the remaining keys have returned to their canonical slots. + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. 
+ */ + @Test + static public void BasicDeletions() { + int fingerprint_len_bits = 5; + int num_entries_power = 3; + int num_entries = 1 << num_entries_power; + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + long fp1 = 1 << 4; + long fp2 = 1 << 3; + long fp3 = 1 << 2; + long fp4 = 31; + + qf.insert(fp4, 1); + qf.insert(fp1, 1); + qf.insert(fp1, 1); + qf.insert(fp2, 2); + qf.insert(fp1, 1); + qf.insert(fp1, 1); + qf.insert(fp3, 4); + + + qf.delete(31, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + qf.delete(fp1, 1); + + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, false, false, fp2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, false, false, fp3); + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } + + /** + * This test checks the functionality of deleting items from the QuotientFilter. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. Checking that the remaining keys have returned to their canonical slots. + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. 
+ */ + @Test + static public void Deletions() { + int fingerprint_len_bits = 5; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + qf.insert(1, 1); + qf.insert(2, 1); + qf.insert(3, 2); + qf.insert(4, 2); + qf.insert(5, 3); + qf.insert(6, 3); + qf.insert(7, 3); + qf.insert(8, 6); + qf.insert(9, 6); // these are ignored + qf.insert(10, 6); + qf.insert(11, 7); + + qf.delete(3, 2); + qf.delete(5, 3); + + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, true, false, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, false, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, true, false, false, 8); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, false, 0); + + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } + + @Test + /** + * This is a test for deleting items from the QuotientFilter even when an overflow is caused + * by multiple insertions. + * The test works by: + * 1. Inserting multiple keys into a single slot to create an overflow. + * 2. Removing these keys. + * 3. Checking that the remaining keys have returned to their canonical slots. + * + * The expected outcome is that after deletion, the remaining keys should be in their canonical slots. 
+ */ + static public void DeletionsWithWrap() { + int fingerprint_len_bits = 5; + int num_entries_power = 3; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + qf.insert(1, 1); + qf.insert(2, 1); + qf.insert(3, 2); + qf.insert(4, 2); + qf.insert(5, 3); + qf.insert(6, 4); + qf.insert(7, 4); + qf.insert(8, 5); + + //qf.pretty_print(); + qf.delete(5, 3); + //qf.pretty_print(); + + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, 1); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, true, true, true, 2); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, false, false, true, 3); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, true, true, 4); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, true, false, true, 6); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, false, true, true, 7); + result = QuotientFilterTest.set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, true, 8); + assertTrue(QuotientFilterTest.check_equality(qf, result, true)); + } +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java new file mode 100644 index 000000000..3199c60af --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+import org.apache.datasketches.memory.WritableMemory;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.*;
+public class QuotientFilterBuilderTest {
+
+  @Test
+  public void testSuggestFingerprintLengthFromFPP(){
+    // invalid false positive rate
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(0.));
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(1.));
+
+    // manually computed values based on formula using ceil(log2(1/targetFalsePositiveProb))
+    // 1E-9 was missing, which left the last expected value (30) unchecked
+    double[] fpps = {0.1, 0.01, 0.001, 0.0001, 1E-5, 1E-6, 1E-7, 1E-8, 1E-9};
+    byte[] results = {4, 7, 10, 14, 17, 20, 24, 27, 30};
+    // guard against the two tables drifting apart again
+    assertEquals(fpps.length, results.length);
+    for (int i = 0; i < fpps.length; i++) {
+      assertEquals(QuotientFilterBuilder.suggestFingerprintLength(fpps[i]), results[i]);
+    }
+  }
+
+  @Test
+  public static void testSuggestLgNumSlots(){
+    QuotientFilterBuilder qfb = new QuotientFilterBuilder();
+
+    // invalid number of items
+    assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L, 0.9)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); + + long[] numItems = {1, 100, 1000, 1000000L}; + int[] results = {1, 7, 11, 21} ; + + for (int i = 0; i < numItems.length; i++) { + long num = numItems[i]; + byte result = qfb.suggestLgNumSlots(num, 0.9); + assertEquals(result, results[i]); + result = qfb.suggestLgNumSlots(num); + assertEquals(result, results[i]); + } + } + + @Test + public static void testSuggestMaxNumItems(){ + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + + // invalid number of slots + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); + + + int[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results_ninety_pc = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + long[] results_eighty_pc = {1, 3, 6, 51, 819, 26214, 26843545, 858993459} ; + + for (int i = 0; i < lgNumSlots.length; i++) { + long result_ninety = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.9); + long result_eighty = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], 0.8); + assertEquals(result_ninety, results_ninety_pc[i]); + assertEquals(result_eighty, results_eighty_pc[i]); + } + } + + @Test + public static void testSuggestParamsFromMaxDistinctsFPP(){ + + // invalid number of slots + 
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.)); + + + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + byte lgNumSlots ; + byte fingerprintLength ; + long[] numItems = {1L, 900L, 500_000_000L} ; + double[] fpp = {1E-10, 1E-2, 1e-7} ; + + // expected outcomes + byte[] expected_lgNumSlotsNinety = {1, 10, 30} ; + byte[] expected_lgNumSlotsEighty = {1, 11, 30} ; + byte[] expected_fingerprintLength = {34, 7, 24} ; + + for (int i = 0; i < numItems.length; i++) { + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9, fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlotsNinety[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + + // 80% load + pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlotsEighty[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + } + } + + + +} \ No newline at end of file diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java new file mode 100644 index 000000000..5dc775b0c --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.testng.annotations.Test;
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertEquals;
+
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.Random;
+
+
+public class QuotientFilterTest {
+  /**
+   * Tells whether the bit at the given position of a fingerprint is set.
+   * This method had been in Bitmap, but was used only to test the QuotientFilter.
+   * @param index bit position, 0 being the least significant bit
+   * @param fingerprint value to inspect
+   * @return true if the bit at the given position is 1
+   */
+  public static boolean get_fingerprint_bit(long index, long fingerprint) {
+    // 1L, not 1: an int shift masks the distance to 5 bits and breaks for index >= 31
+    final long mask = 1L << index;
+    return (fingerprint & mask) != 0;
+  }
+
+  /*
+   * This test is based on the example from https://en.wikipedia.org/wiki/Quotient_filter
+   * in "Algorithm Description" section.
+   * It performs the same insertions and query as the example and verifies that it gets the same results.
+   * The insertion keys are: b, e, f, c, d, a which are hashed into slots as:
+   * (b,1), (e,4), (f, 7), (c,1), (d,2), (a,1)
+   */
+  @Test
+  public void WikiInsertionTest() {
+    int fingerprint_len_bits = 3; // 3 bits fingerprint => 6 bits per entry, resolved internally in the filter.
+ int num_entries_power = 3; + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits, 1.0f); + + final int A = 1; + final int B = 2; + final int C = 3; + final int D = 4; + final int E = 5; + final int F = 6; + + qf.insert(B, 1); + qf.insert(E, 4); + qf.insert(F, 7); + qf.insert(C, 1); + qf.insert(D, 2); + qf.insert(A, 1); + assertEquals(qf.getNumEntries(), 6); + + assertEquals(getState(qf, 0), 0); + assertEquals(qf.getFingerprint(0), 0); + assertEquals(getState(qf, 1), 0b100); + assertEquals(qf.getFingerprint(1), A); + assertEquals(getState(qf, 2), 0b111); + assertEquals(qf.getFingerprint(2), B); + assertEquals(getState(qf, 3), 0b011); + assertEquals(qf.getFingerprint(3), C); + assertEquals(getState(qf, 4), 0b101); + assertEquals(qf.getFingerprint(4), D); + assertEquals(getState(qf, 5), 0b001); + assertEquals(qf.getFingerprint(5), E); + assertEquals(getState(qf, 6), 0); + assertEquals(qf.getFingerprint(6), 0); + assertEquals(getState(qf, 7), 0b100); + assertEquals(qf.getFingerprint(7), F); + } + + public int getState(QuotientFilter filter, int slot) { + return (filter.isOccupied(slot) ? 1 : 0) << 2 + | (filter.isContinuation(slot) ? 1 : 0) << 1 + | (filter.isShifted(slot) ? 1 : 0); + } + + /* + * This test is based on the Figure 2. from https://vldb.org/pvldb/vol5/p1627_michaelabender_vldb2012.pdf. + * It performs the same insertions as in Figure 2 and checks for the same result. 
+ */ + @Test + public void PaperInsertionTest() { + int fingerprint_len_bits = 5; + int num_entries_power = 4; + int num_entries = (int)Math.pow(2, num_entries_power); + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + final int A = 1; + final int B = 2; + final int C = 3; + final int D = 4; + final int E = 5; + final int F = 6; + final int G = 7; + final int H = 8; + + // (key, slot): {(a, 1), (b, 1), (c, 3), (d, 3), (e, 3), (f, 4), (g, 6), (h, 6)} + qf.insert(A, 1); + qf.insert(B, 1); + qf.insert(C, 3); + qf.insert(D, 3); + qf.insert(E, 3); + qf.insert(F, 4); + qf.insert(G, 6); + qf.insert(H, 6); + + BitSet result = new BitSet(num_entries * qf.getNumBitsPerEntry()); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 0, false, false, false, 0); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 1, true, false, false, A); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 2, false, true, true, B); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 3, true, false, false, C); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 4, true, true, true, D); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 5, false, true, true, E); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 6, true, false, true, F); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 7, false, false, true, G); + result = set_slot_in_test(result, qf.getNumBitsPerEntry(), 8, false, true, true, H); + assertTrue(check_equality(qf, result, false)); + } + + // test we don't get any false negatives for quotient filter + @Test + public void FalseNegativeTest() { + int fingerprint_len_bits = 7; + int num_entries_power = 10; + QuotientFilter filter = new QuotientFilter(num_entries_power, fingerprint_len_bits); + int num_entries = (int) ((1 << num_entries_power) * 0.8); + assertTrue(test_no_false_negatives(filter, num_entries)); + } + + + /** + * This method tests the functionality of the 
QuotientFilter and Iterator classes. It creates a QuotientFilter and inserts + * six entries into it. An Iterator is then used to traverse the entries in the QuotientFilter. The method checks if the + * bucket index of each visited entry matches the expected bucket index. If there's a mismatch, an error message is printed + * and the program exits, indicating a test failure. + */ + @Test + public void testQuotientFilterInsertionAndIteration() { + + int fingerprint_len_bits = 5; + int num_entries_power = 4; + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + qf.insert(0x1F, 2); + qf.insert(0x1F, 3); + qf.insert(0x1F, 3); + qf.insert(0x1F, 4); + qf.insert(0x1F, 15); // last slot in the filter + qf.insert(0x1F, 16); // outside the bounds +// qf.pretty_print(); + + Iterator it = new Iterator(qf); + int[] arr = new int[] {2, 3, 4, 15}; + int arr_index = 0; + while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} + } + + @Test + public void testQuotientFilterIterator() { + + int fingerprint_len_bits = 5; + int num_entries_power = 4; + QuotientFilter qf = new QuotientFilter(num_entries_power, fingerprint_len_bits); + + qf.insert(0, 1); + qf.insert(0, 4); + qf.insert(0, 7); + qf.insert(0, 1); + qf.insert(0, 2); + qf.insert(0, 1); + qf.insert(0, 15); + + Iterator it = new Iterator(qf); + int[] arr = new int[] {1, 2, 4, 7, 15}; + int arr_index = 0; + while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} + } + + + // Helper functions + + /** + * This method sets the values of a slot in a BitSet based on the provided parameters. + * The slot is defined by the number of bits per entry and the slot index. + * The values to be set include whether the slot is occupied, whether it is a continuation of a previous entry, + * whether it is shifted, and the fingerprint. + * + * @param result The BitSet where the slot values will be set. + * @param bits_per_entry The number of bits per entry in the BitSet. 
+ * @param slot The index of the slot to be set. + * @param is_occupied Whether the slot is occupied. + * @param is_continuation Whether the slot is a continuation of a previous entry. + * @param is_shifted Whether the slot is shifted. + * @param fingerprint The fingerprint to be set in the slot. + * @return The BitSet after setting the slot values. + */ + static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, long fingerprint) { + int index = bits_per_entry * slot; + result.set(index++, is_occupied); + result.set(index++, is_continuation); + result.set(index++, is_shifted); + for (int i = 0; i < bits_per_entry - 3; i++) { + result.set(index++, get_fingerprint_bit(i, fingerprint) ); + } + return result; + } + + static public BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, String fingerprint) { + long l_fingerprint = 0; + for (int i = 0; i < fingerprint.length(); i++) { + char c = fingerprint.charAt(i); + if (c == '1') { + l_fingerprint |= (1 << i); + } + } + return set_slot_in_test(result, bits_per_entry, slot, is_occupied, is_continuation, is_shifted, l_fingerprint); + } + + static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) { + for (int i = 0; i < bs.size(); i++) { + if (check_also_fingerprints || (i % qf.getNumBitsPerEntry() == 0 || i % qf.getNumBitsPerEntry() == 1 || i % qf.getNumBitsPerEntry() == 2)) { + if (qf.getBitAtOffset(i) != bs.get(i)) { + return false; + } + } + } + return true; + } + + /* + Helper function to test that no false negatives are returned. 
+ */ + static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { + HashSet added = new HashSet(); + int seed = 5; + Random rand = new Random(seed); + + for (int i = 0; i < num_entries; i++) { + int rand_num = rand.nextInt(); + boolean success = filter.insert(rand_num); + if (success) { + added.add(rand_num); + } + else { + System.out.println("insertion failed"); + } + } + + for (Integer i: added) { + boolean found = filter.search((long)i); + if (!found) { + return false; + } + } + return true; + } + + @Test + public void smallExpansion() { + final QuotientFilter qf = new QuotientFilter(5, 9); + final int n = 30; + for (int i = 0; i < n; i++) { qf.insert(i); } + qf.printFilterSummary(); + assertEquals(qf.getNumExpansions(), 1); + assertEquals(qf.getNumEntries(), n); + + // query the same keys + int positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i + n)) { positives++; } } + assertTrue(positives < 2); + } + + @Test + public void expansion() { + final QuotientFilter qf = new QuotientFilter(16, 13); + final int n = 60000; + for (int i = 0; i < n; i++) { qf.insert(i); } +// qf.printFilterSummary(); + assertEquals(qf.getNumExpansions(), 1); + assertTrue(qf.getNumEntries() > n * 0.99); // allow a few hash collisions + + // query the same keys + int positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf.search(i + n)) { positives++; } } + assertTrue(positives < 6); + } + + @Test + public void mergeEmpty() { + final QuotientFilter qf1 = new QuotientFilter(4, 3); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.merge(qf2); + + assertEquals(qf1.getLgQ(), 4); + assertEquals(qf1.getFingerprintLength(), 3); + assertEquals(qf1.getNumEntries(), 0); + } 
+ + @Test + public void merge() { + final QuotientFilter qf1 = new QuotientFilter(16, 13); + final QuotientFilter qf2 = new QuotientFilter(16, 13); + final int n = 50000; + for (int i = 0; i < n / 2; i++) { + qf1.insert(i); + qf2.insert(i + n / 2); + } + qf1.merge(qf2); + + assertEquals(qf1.getNumExpansions(), 0); + assertTrue(qf1.getNumEntries() > n * 0.99); // allow a few hash collisions + + // query the same keys + int positives = 0; + for (int i = 0; i < n; i++) { if (qf1.search(i)) { positives++; } } + assertEquals(positives, n); + + // query novel keys + positives = 0; + for (int i = 0; i < n; i++) { if (qf1.search(i + n)) { positives++; } } + assertTrue(positives < 4); + } + + @Test + public void mergeDifferentConfiguration() { + final QuotientFilter qf1 = new QuotientFilter(3, 4); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.insert(4); + qf2.insert(4); + qf1.merge(qf2); + assertEquals(qf1.getNumEntries(), 1); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void mergeIncompatible() { + final QuotientFilter qf1 = new QuotientFilter(4, 4); + final QuotientFilter qf2 = new QuotientFilter(4, 3); + qf1.merge(qf2); + } + +}