diff --git a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala index 654c19142..0d71f6fd9 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala @@ -19,15 +19,20 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} import com.microsoft.hyperspace.Hyperspace -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[FileBasedSignatureProvider]] provides the logical plan signature based on files in the * relation. File metadata, eg. size, modification time and path, of each file in the * relation will be used to generate the signature. + * + * Note that while the order of files in a single relation does not affect the signature, + * the order of relations in the plan do affect the signature calculation. + * * If the given logical plan does not have any supported relations, no signature is provided. */ -class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { +class FileBasedSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -35,28 +40,18 @@ class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan of data frame. * @return signature, if the logical plan has supported relations; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - fingerprintVisitor(logicalPlan).map(HashingUtils.md5Hex) - } - - /** - * Visit logical plan and collect info needed for fingerprint. - * - * @param logicalPlan logical plan of data frame. - * @return fingerprint, if the logical plan has supported relations; Otherwise None. - */ - private def fingerprintVisitor(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { val provider = Hyperspace.getContext.sourceProviderManager - var fingerprint = "" + val fb: FingerprintBuilder = fbf.create + var updated = false logicalPlan.foreachUp { case l: LeafNode if provider.isSupportedRelation(l) => - fingerprint ++= provider.getRelation(l).signature + provider.getRelation(l).signature(fb).foreach { f => + fb.add(f) + updated = true + } case _ => } - - fingerprint match { - case "" => None - case _ => Some(fingerprint) - } + if (updated) Some(fb.build()) else None } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala index 6e0badba7..d43db2f5e 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.types.{DataType, StructType} import com.microsoft.hyperspace.{BuildInfo, HyperspaceException} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint // IndexLogEntry-specific fingerprint to be temporarily used where fingerprint is not defined. case class NoOpFingerprint() { @@ -361,7 +362,7 @@ object CoveringIndex { } // IndexLogEntry-specific Signature that stores the signature provider and value. -case class Signature(provider: String, value: String) +case class Signature(provider: String, value: Fingerprint) // IndexLogEntry-specific LogicalPlanFingerprint to store fingerprint of logical plan. case class LogicalPlanFingerprint(properties: LogicalPlanFingerprint.Properties) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala index efe4f4cf5..63b862fbd 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala @@ -18,7 +18,7 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} /** * [[IndexSignatureProvider]] provides signature for a logical plan based on: @@ -29,10 +29,13 @@ import com.microsoft.hyperspace.util.HashingUtils * * If the plan does not comply with [[FileBasedSignatureProvider]] or [[PlanSignatureProvider]] * requirements for signature computation, then no signature will be provided for the plan. + * + * @param fbf [[FingerprintBuilderFactory]] used for building fingerprints */ -class IndexSignatureProvider extends LogicalPlanSignatureProvider { - private val fileBasedSignatureProvider = new FileBasedSignatureProvider - private val planSignatureProvider = new PlanSignatureProvider +class IndexSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { + private val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) + private val planSignatureProvider = new PlanSignatureProvider(fbf) /** * Generate the signature of logical plan. @@ -41,10 +44,10 @@ class IndexSignatureProvider extends LogicalPlanSignatureProvider { * @return signature, if both [[FileBasedSignatureProvider]] and [[PlanSignatureProvider]] * can generate signature for the logical plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { fileBasedSignatureProvider.signature(logicalPlan).flatMap { f => planSignatureProvider.signature(logicalPlan).map { p => - HashingUtils.md5Hex(f + p) + fbf.create.add(f).add(p).build() } } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala index 4c8277cba..8727c4c6c 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala @@ -21,8 +21,12 @@ import scala.util.{Success, Try} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.util.hyperspace.Utils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory, MD5FingerprintBuilderFactory} + /** * This trait contains the interface that provides the signature of logical plan. + * + * The implementation must have a constructor taking [[FingerprintBuilderFactory]] as an argument. */ trait LogicalPlanSignatureProvider { @@ -36,15 +40,17 @@ trait LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if it can be computed w.r.t signature provider assumptions; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] } /** * Factory object for LogicalPlanSignatureProvider. */ object LogicalPlanSignatureProvider { + private val fbf: FingerprintBuilderFactory = new MD5FingerprintBuilderFactory + // Creates a default signature provider. - def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider + def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider(fbf) /** * Creates a parameterized signature provider. @@ -53,7 +59,11 @@ object LogicalPlanSignatureProvider { * @return signature provider. */ def create(name: String): LogicalPlanSignatureProvider = { - Try(Utils.classForName(name).newInstance) match { + Try( + Utils + .classForName(name) + .getConstructor(classOf[FingerprintBuilderFactory]) + .newInstance(fbf)) match { case Success(provider: LogicalPlanSignatureProvider) => provider case _ => throw new IllegalArgumentException( diff --git a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala index 8a780db4b..cef578d69 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala @@ -18,14 +18,16 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[PlanSignatureProvider]] provides signature for a logical plan based on * the type of operators in it. * A plan needs to have at least one operator so its signature can be generated. + * + * @param fbf [[FingerprintBuilderFactory]] used for building fingerprints */ -class PlanSignatureProvider extends LogicalPlanSignatureProvider { +class PlanSignatureProvider(fbf: FingerprintBuilderFactory) extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -33,12 +35,9 @@ class PlanSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if there is at least one operator in the plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - var signature = "" - logicalPlan.foreachUp(p => signature = HashingUtils.md5Hex(signature + p.nodeName)) - signature match { - case "" => None - case _ => Some(signature) - } + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { + val fb: FingerprintBuilder = fbf.create + logicalPlan.foreachUp(node => fb.add(node.nodeName)) + Some(fb.build()) } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala index d9e0f6bf8..2c0c999c5 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala @@ -34,6 +34,7 @@ import com.microsoft.hyperspace.index.IndexLogEntryTags.{HYBRIDSCAN_RELATED_CONF import com.microsoft.hyperspace.index.plans.logical.{BucketUnion, IndexHadoopFsRelation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.HyperspaceConf +import com.microsoft.hyperspace.util.fingerprint.Fingerprint object RuleUtils { @@ -54,7 +55,7 @@ object RuleUtils { indexes: Seq[IndexLogEntry], relation: FileBasedRelation): Seq[IndexLogEntry] = { // Map of a signature provider to a signature generated for the given plan. - val signatureMap = mutable.Map[String, Option[String]]() + val signatureMap = mutable.Map[String, Option[Fingerprint]]() def signatureValid(entry: IndexLogEntry): Boolean = { entry.withCachedTag(relation.plan, IndexLogEntryTags.SIGNATURE_MATCHED) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala index 947573c4d..f1a27c1e2 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala @@ -30,7 +30,7 @@ import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, Relation} import com.microsoft.hyperspace.index.IndexConstants.GLOBBING_PATTERN_KEY import com.microsoft.hyperspace.index.sources.FileBasedRelation -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DefaultFileBasedSource]] @@ -41,14 +41,15 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _) => - val result = filesFromIndex(location).sortBy(_.getPath.toString).foldLeft("") { - (acc: String, f: FileStatus) => - HashingUtils.md5Hex(acc + fingerprint(f)) - } - result + val initialFingerprint = fb.build() + var fingerprint = initialFingerprint + filesFromIndex(location).foreach(fingerprint ^= createFingerprint(_, fb)) + Some(fingerprint).filter(_ != initialFingerprint) } /** @@ -179,9 +180,11 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe } } - private def fingerprint(fileStatus: FileStatus): String = { - fileStatus.getLen.toString + fileStatus.getModificationTime.toString + - fileStatus.getPath.toString + private def createFingerprint(fileStatus: FileStatus, fb: FingerprintBuilder): Fingerprint = { + fb.add(fileStatus.getLen) + .add(fileStatus.getModificationTime) + .add(fileStatus.getPath.toString) + .build() } private def filesFromIndex(index: PartitioningAwareFileIndex): Seq[FileStatus] = { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala index 2cfac9656..705def81c 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala @@ -27,6 +27,7 @@ import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexLogEntry, Relation} import com.microsoft.hyperspace.index.sources.default.DefaultFileBasedRelation import com.microsoft.hyperspace.util.{HyperspaceConf, PathUtils} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DeltaLakeFileBasedSource]] @@ -36,10 +37,12 @@ class DeltaLakeRelation(spark: SparkSession, override val plan: LogicalRelation) /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: TahoeLogFileIndex, _, _, _, _, _) => - location.tableVersion + location.path.toString + Some(fb.add(location.tableVersion).add(location.path.toString).build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala index 7a3f2a99f..9c4a63db7 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.types.StructType import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexConstants, Relation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[IcebergFileBasedSource]] @@ -61,9 +62,14 @@ class IcebergRelation( /** * Computes the signature of the current relation. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints */ - override def signature: String = { - snapshotId.getOrElse(table.currentSnapshot().snapshotId()).toString + table.location() + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = { + Some( + fb.add(snapshotId.getOrElse(table.currentSnapshot().snapshotId()).toString) + .add(table.location) + .build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala index 66e3f2e52..495516690 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, import org.apache.spark.sql.types.StructType import com.microsoft.hyperspace.index.{FileIdTracker, FileInfo, IndexConstants, IndexLogEntry, Relation} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * ::Experimental:: @@ -67,8 +68,14 @@ trait FileBasedRelation extends SourceRelation { * * This API is used when the signature of source needs to be computed, e.g., creating an index, * computing query plan's signature, etc. + * + * If it is not possible to compute the signature (e.g. there are no files left), + * the implementation might return None. + * + * @param fb [[FingerprintBuilder]] used for building fingerprints. + * Use it to compute the signature from discriminating properties of the relation. */ - def signature: String + def signature(fb: FingerprintBuilder): Option[Fingerprint] /** * FileStatus list for all source files that the current relation references to. diff --git a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala b/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala deleted file mode 100644 index 530cae4ca..000000000 --- a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import org.apache.commons.codec.digest.DigestUtils - -/** - * HashingUtils supplies different hashing functions. - */ -object HashingUtils { - - /** - * MD5-based hashing function. - * - * @param input the input to be hashed, for Any type of data. - * @return the hash code string. - */ - def md5Hex(input: Any): String = { - DigestUtils.md5Hex(input.toString) - } -} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala new file mode 100644 index 000000000..17c89ed7e --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala @@ -0,0 +1,64 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import javax.xml.bind.DatatypeConverter + +import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue} + +/** + * Represents a fingerprint, which can be used to identify and compare objects + * without actually comparing them. + * + * The value of a fingerprint is typically calculated by a hash function. To + * function as a fingerprinting function, the hash function must distribute + * the input uniformly into hashed values, and the number of bytes of the hash + * value should be sufficiently large. For example, MD5 produces 128-bit hash + * values (16 bytes), whereas SHA-1 produces 160-bit hash values (20 bytes). + * + * All objects to be identified and compared with each other must have fingerprints + * created from the same hash function. + */ +case class Fingerprint(value: Vector[Byte]) { + + /** + * Returns a human-readable form of this fingerprint in HEX format. + */ + @JsonValue + override def toString: String = { + value.map(_.formatted("%02x")).mkString + } + + /** + * Returns a new fingerprint by applying bitwise XOR to this and the other fingerprints. + */ + def ^(other: Fingerprint): Fingerprint = { + require(value.size == other.value.size) + Fingerprint(value.zip(other.value).map { case (x, y) => (x ^ y).toByte }) + } +} + +object Fingerprint { + + /** + * Creates a Fingerprint from a HEX string. + */ + @JsonCreator + def apply(value: String): Fingerprint = { + Fingerprint(DatatypeConverter.parseHexBinary(value).toVector) + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala new file mode 100644 index 000000000..436a37391 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala @@ -0,0 +1,91 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + +/** + * A builder class for Fingerprint. + * A fingerprint is built by hashing added values sequentially. + * The order of values is significant. + */ +class FingerprintBuilder(private val md: MessageDigest) { + + def add(value: Boolean): FingerprintBuilder = { + md.update((if (value) 1 else 0).toByte) + this + } + + def add(value: Byte): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Short): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putShort(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Int): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putInt(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Long): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putLong(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Float): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putFloat(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Double): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putDouble(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Char): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putChar(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: String): FingerprintBuilder = { + md.update(value.getBytes(StandardCharsets.UTF_8)) + this + } + + def add(value: Array[Byte]): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Fingerprint): FingerprintBuilder = { + md.update(value.value.toArray) + this + } + + /** + * Builds a fingerprint by hashing added values. + * After this call is made, values are reset and this object + * can be used for another fingerprint. + */ + def build(): Fingerprint = Fingerprint(md.digest().toVector) +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala new file mode 100644 index 000000000..08fb7dba9 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala @@ -0,0 +1,49 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils + +/** + * A factory trait for FingerprintBuilder. This trait must be implemented without any state. + * The same instance of this trait can be shared and used by many objects. + */ +trait FingerprintBuilderFactory { + + /** + * Creates a fingerprint builder. + */ + def create: FingerprintBuilder +} + +/** + * MD5-based fingerprint builder. The fingerprint has 16 bytes (128 bits). + */ +class MD5FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getMd5Digest) + } +} + +/** + * SHA1-based fingerprint builder. The fingerprint has 20 bytes (160 bits). + */ +class SHA1FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getSha1Digest) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala index 909758687..c0432de28 100644 --- a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala @@ -28,6 +28,7 @@ import com.microsoft.hyperspace.{HyperspaceException, SampleData, SparkInvolvedS import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, CREATING} import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class RefreshActionTest extends HyperspaceSuite { private val sampleParquetDataLocation = inTempDir("sampleparquet") @@ -78,7 +79,8 @@ class RefreshActionTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala index 6df97afc3..b574f526a 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.SparkFunSuite import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength = 100 @@ -30,6 +31,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private val fileModificationTimeDelta = 10 private val newFilePath = new Path("newPath") + private val fbf = new MD5FingerprintBuilderFactory + test("Logical relations from a same file have the same signature.") { val signature1 = createFileBasedSignature(Seq(createFileStatus(fileLength, fileModificationTime, filePath))) @@ -98,7 +101,7 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui } test("Create FileBasedSignatureProvider.") { - val fileBasedSignatureProvider = new FileBasedSignatureProvider + val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) assert( LogicalPlanSignatureProvider .create(fileBasedSignatureProvider.name) @@ -113,8 +116,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private def createFileStatus(length: Long, modificationTime: Long, path: Path): FileStatus = SignatureProviderTestUtils.createFileStatus(length, modificationTime, path) - private def createFileBasedSignature(files: Seq[FileStatus]): String = - new FileBasedSignatureProvider() + private def createFileBasedSignature(files: Seq[FileStatus]): Fingerprint = + new FileBasedSignatureProvider(fbf) .signature(SignatureProviderTestUtils.createLogicalRelation(spark, files)) match { case Some(s) => s case None => throw HyperspaceException("Invalid plan for signature generation.") diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala index 296ce028c..d430c5728 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import com.microsoft.hyperspace.{Hyperspace, HyperspaceException, SampleData, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.util.FileUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCacheTest extends HyperspaceSuite { val sampleParquetDataLocation = inTempDir("sampleparquet") @@ -45,7 +46,8 @@ class IndexCacheTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala index e6e22ee41..b72e59755 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala @@ -25,6 +25,7 @@ import org.mockito.Mockito.{mock, when} import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index.IndexConstants.{REFRESH_MODE_FULL, REFRESH_MODE_INCREMENTAL} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCollectionManagerTest extends HyperspaceSuite { private val testLogManagerFactory: IndexLogManagerFactory = new IndexLogManagerFactory { @@ -43,7 +44,8 @@ class IndexCollectionManagerTest extends HyperspaceSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint( + LogicalPlanFingerprint.Properties(Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( indexPath.toString, @@ -95,7 +97,8 @@ class IndexCollectionManagerTest extends HyperspaceSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint( + LogicalPlanFingerprint.Properties(Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( str, diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala index 3eec72cb2..1bac36a18 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala @@ -31,6 +31,7 @@ import org.scalatest.BeforeAndAfter import com.microsoft.hyperspace.{BuildInfo, HyperspaceException, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.UNKNOWN_FILE_ID import com.microsoft.hyperspace.util.{JsonUtils, PathUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { var testDir: file.Path = _ @@ -167,7 +168,7 @@ class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { | "properties" : { | "signatures" : [ { | "provider" : "provider", - | "value" : "signatureValue" + | "value" : "abcd" | } ] | }, | "kind" : "LogicalPlan" @@ -219,7 +220,7 @@ class IndexLogEntryTest extends HyperspaceSuite with SQLHelper { null, LogicalPlanFingerprint( LogicalPlanFingerprint - .Properties(Seq(Signature("provider", "signatureValue"))) + .Properties(Seq(Signature("provider", Fingerprint("abcd")))) )) val expected = IndexLogEntry.create( diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala index d02795964..3fdd42a47 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala @@ -26,6 +26,7 @@ import org.scalatest.BeforeAndAfterAll import com.microsoft.hyperspace.{SparkInvolvedSuite, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.HYPERSPACE_LOG import com.microsoft.hyperspace.util.{FileUtils, JsonUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogManagerImplTest extends HyperspaceSuite { val testRoot = inTempDir("indexLogManagerTests") @@ -68,7 +69,7 @@ class IndexLogManagerImplTest extends HyperspaceSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("provider", "signature"))))))), + LogicalPlanFingerprint.Properties(Seq(Signature("provider", Fingerprint("abcd")))))))), Map()) private def getEntry(state: String): LogEntry = { diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala index 559ce5a2c..1afb664e6 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import com.microsoft.hyperspace.SparkInvolvedSuite import com.microsoft.hyperspace.shim.JoinWithoutHint +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength1 = 100 @@ -37,6 +38,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileModificationTime2 = 35000 private val filePath2 = new Path("originalLocation2") + private val fbf = new MD5FingerprintBuilderFactory + test("Verify signature for a plan with same logical relation node.") { val r1 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) val r2 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) @@ -128,8 +131,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { Aggregate(grpExpression, aggExpression, projectNode) } - private def createIndexSignature(logicalPlan: LogicalPlan): String = - new IndexSignatureProvider().signature(logicalPlan) match { + private def createIndexSignature(logicalPlan: LogicalPlan): Fingerprint = + new IndexSignatureProvider(fbf).signature(logicalPlan) match { case Some(s) => s case None => fail("Invalid plan for signature generation.") } diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala index 2a3f17c48..f901f6254 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala @@ -20,6 +20,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import com.microsoft.hyperspace.actions.Constants +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexTest extends SparkFunSuite { val indexConfig1 = IndexConfig("myIndex1", Array("id"), Seq("name")) @@ -35,7 +36,8 @@ class IndexTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( config.indexName, diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala index fc2cc1c87..d5767ce56 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala @@ -29,6 +29,7 @@ import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.Hdfs.Properties import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.MD5FingerprintBuilderFactory trait HyperspaceRuleSuite extends HyperspaceSuite { private val defaultFileNames = Seq("f1.parquet", "f2.parquet") @@ -41,7 +42,8 @@ trait HyperspaceRuleSuite extends HyperspaceSuite { inputFiles: Seq[FileInfo] = Seq(), writeLog: Boolean = true, filenames: Seq[String] = defaultFileNames): IndexLogEntry = { - val signClass = new RuleTestHelper.TestSignatureProvider().getClass.getName + val fbf = new MD5FingerprintBuilderFactory + val signClass = new RuleTestHelper.TestSignatureProvider(fbf).getClass.getName LogicalPlanSignatureProvider.create(signClass).signature(plan) match { case Some(s) => diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala index 708517d10..10f168510 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala @@ -20,15 +20,17 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import com.microsoft.hyperspace.index.LogicalPlanSignatureProvider +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} object RuleTestHelper { - class TestSignatureProvider extends LogicalPlanSignatureProvider { - def signature(plan: LogicalPlan): Option[String] = + class TestSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { + def signature(plan: LogicalPlan): Option[Fingerprint] = plan .collectFirst { case LogicalRelation(HadoopFsRelation(location, _, _, _, _, _), _, _, _) => location.hashCode() } - .map(_.toString) + .map(fbf.create.add(_).build()) } } diff --git a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala deleted file mode 100644 index 10f6f43f9..000000000 --- a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import java.util.UUID - -import org.apache.spark.SparkFunSuite - -class HashingUtilsTest extends SparkFunSuite { - test("For md5Hashing(), same input has the same output hash code.") { - val randomUUID = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID) - val hashCode2 = HashingUtils.md5Hex(randomUUID) - assert(hashCode1.equals(hashCode2)) - } - - test("For md5Hashing(), different input different output hash code.") { - val randomUUID1 = UUID.randomUUID.toString - val randomUUID2 = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID1) - val hashCode2 = HashingUtils.md5Hex(randomUUID2) - assert(!hashCode1.equals(hashCode2)) - } -} diff --git a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala index 686bc6f42..1d9295ab1 100644 --- a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class JsonUtilsTest extends SparkFunSuite { test("Test for JsonUtils.") { @@ -35,7 +36,8 @@ class JsonUtilsTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val index = IndexLogEntry( "myIndex", diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala new file mode 100644 index 000000000..752bdfd87 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala @@ -0,0 +1,110 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils +import org.apache.spark.SparkFunSuite + +class FingerprintBuilderTest extends SparkFunSuite { + + test("Empty MD5 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.build() == Fingerprint("d41d8cd98f00b204e9800998ecf8427e")) + } + + test("MD5 fingerprint with an integer") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).build() == Fingerprint("f1450306517624a57eafbbf8ed995985")) + } + + test("MD5 fingerprint with 0") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(0).build() == Fingerprint("f1d3ff8443297732862df21dc4e57262")) + } + + test("MD5 fingerprint with two integers") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).add(42).build() == Fingerprint("f26c63a4633a5deea0a644cc58050446")) + } + + test("MD5 fingerprint with two integers in a different order") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42).add(1).build() == Fingerprint("3331d54f38cef13e1a2423af3c6fb223")) + } + + test("MD5 fingerprint with a long") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1L).build() == Fingerprint("fa5ad9a8557e5a84cf23e52d3d3adf77")) + } + + test("MD5 fingerprint with a short") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toShort).build() == Fingerprint("441077cc9e57554dd476bdfb8b8b8102")) + } + + test("MD5 fingerprint with a byte") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toByte).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a float") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(2.3f).build() == Fingerprint("472d85449cd4441d8e13540a57aeff49")) + } + + test("MD5 fingerprint with a double") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42.3).build() == Fingerprint("4a4591ddc290c1e7db42b538ad681549")) + } + + test("MD5 fingerprint with a boolean") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(true).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a char") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add('X').build() == Fingerprint("1e160201945908b455769e0318b1a1e9")) + } + + test("MD5 fingerprint with a string") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("hello").build() == Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a string whose bytes are same as 1 (Int)") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("\0\0\0\001").build() == fb.add(1).build()) + } + + test("MD5 fingerprint with a byte array") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert( + fb.add(Array[Byte](104, 101, 108, 108, 111)).build() == + Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(fb.build()).build() == Fingerprint("59adb24ef3cdbe0297f05b395827453f")) + } + + test("Empty SHA1 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getSha1Digest) + assert(fb.build() == Fingerprint("da39a3ee5e6b4b0d3255bfef95601890afd80709")) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala new file mode 100644 index 000000000..1af12c17d --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.spark.SparkFunSuite + +class FingerprintTest extends SparkFunSuite { + + test("toString returns a HEX string.") { + assert(Fingerprint(Vector[Byte](1, 2, 3, 10, 30, 66)).toString == "0102030a1e42") + } + + test("toString returns an empty string.") { + assert(Fingerprint(Vector[Byte]()).toString == "") + } + + test("Bitwise XOR works for fingerprints of the same length.") { + assert( + (Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4))) == + Fingerprint(Vector[Byte]((19 ^ 31).toByte, (121 ^ -4).toByte))) + } + + test("Bitwise XOR fails for fingerprints of different lengths.") { + assertThrows[IllegalArgumentException] { + Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4, 3)) + } + } + + test("Fingerprint can be created from an empty string.") { + assert(Fingerprint("") == Fingerprint(Vector[Byte]())) + } + + test("Fingerprint can be created from a HEX string.") { + assert(Fingerprint("0102fffe") == Fingerprint(Vector[Byte](1, 2, -1, -2))) + } + + test("Fingerprint cannot be created from an invalid string.") { + val ex = intercept[IllegalArgumentException] { + Fingerprint("xyz") + } + assert(ex.getMessage.contains("hexBinary needs to be even-length")) + } +}