From b6d584d09543824a7b6ceb79a20306bc7d8b33a5 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Fri, 2 Apr 2021 04:03:26 +0900 Subject: [PATCH] Refactor hashing and fingerprint functions Fixes #271. With newly added classes for fingerprinting, now it is easy to make a fingerprint from various types of data. FingerprintBuilder can be used to build a fingerprint from data. FileBasedRelation.signature now takes it as an argument, so the implementations don't have to do the hashing themselves. They can just use the provided builder. For unorderd combining, one can use bitwise XOR to combine multiple fingerprints. This way, the order becomes irrelavant. --- .../index/FileBasedSignatureProvider.scala | 35 +++--- .../hyperspace/index/IndexLogEntry.scala | 5 +- .../index/IndexSignatureProvider.scala | 12 +- .../index/LogicalPlanSignatureProvider.scala | 14 ++- .../index/PlanSignatureProvider.scala | 15 +-- .../hyperspace/index/rules/RuleUtils.scala | 3 +- .../default/DefaultFileBasedRelation.scala | 21 ++-- .../sources/delta/DeltaLakeRelation.scala | 7 +- .../sources/iceberg/IcebergRelation.scala | 11 +- .../hyperspace/index/sources/interfaces.scala | 6 +- .../hyperspace/util/HashingUtils.scala | 35 ------ .../util/fingerprint/Fingerprint.scala | 64 ++++++++++ .../util/fingerprint/FingerprintBuilder.scala | 91 +++++++++++++++ .../FingerprintBuilderFactory.scala | 49 ++++++++ .../actions/RefreshActionTest.scala | 4 +- .../FileBasedSignatureProviderTest.scala | 9 +- .../hyperspace/index/IndexCacheTest.scala | 4 +- .../index/IndexCollectionManagerTest.scala | 7 +- .../hyperspace/index/IndexLogEntryTest.scala | 5 +- .../index/IndexLogManagerImplTest.scala | 4 +- .../index/IndexSignatureProviderTest.scala | 7 +- .../hyperspace/index/IndexTest.scala | 4 +- .../index/rules/HyperspaceRuleSuite.scala | 4 +- .../index/rules/RuleTestHelper.scala | 7 +- .../hyperspace/util/HashingUtilsTest.scala | 38 ------ .../hyperspace/util/JsonUtilsTest.scala | 4 +- .../fingerprint/FingerprintBuilderTest.scala | 109 ++++++++++++++++++ .../util/fingerprint/FingerprintTest.scala | 57 +++++++++ 28 files changed, 479 insertions(+), 152 deletions(-) delete mode 100644 src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala create mode 100644 src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala delete mode 100644 src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala create mode 100644 src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala create mode 100644 src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala diff --git a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala index 654c19142..9163aed1e 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/FileBasedSignatureProvider.scala @@ -19,15 +19,20 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} import com.microsoft.hyperspace.Hyperspace -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[FileBasedSignatureProvider]] provides the logical plan signature based on files in the * relation. File metadata, eg. size, modification time and path, of each file in the * relation will be used to generate the signature. + * + * Note that while the order of files in a single relation does not affect the signature, + * the order of relations in the plan do affect the signature calculation. + * * If the given logical plan does not have any supported relations, no signature is provided. */ -class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { +class FileBasedSignatureProvider(fbf: FingerprintBuilderFactory) + extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -35,28 +40,18 @@ class FileBasedSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan of data frame. * @return signature, if the logical plan has supported relations; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - fingerprintVisitor(logicalPlan).map(HashingUtils.md5Hex) - } - - /** - * Visit logical plan and collect info needed for fingerprint. - * - * @param logicalPlan logical plan of data frame. - * @return fingerprint, if the logical plan has supported relations; Otherwise None. - */ - private def fingerprintVisitor(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { val provider = Hyperspace.getContext.sourceProviderManager - var fingerprint = "" + val fb: FingerprintBuilder = fbf.create + var updated = false logicalPlan.foreachUp { case l: LeafNode if provider.isSupportedRelation(l) => - fingerprint ++= provider.getRelation(l).signature + provider.getRelation(l).signature(fb).foreach { f => + fb.add(f) + updated = true + } case _ => } - - fingerprint match { - case "" => None - case _ => Some(fingerprint) - } + if (updated) Some(fb.build()) else None } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala index bc6cbc39f..5f1af9bc0 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala @@ -31,7 +31,8 @@ import org.apache.spark.sql.types.{DataType, StructType} import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.actions.Constants -import com.microsoft.hyperspace.util.{PathUtils, SchemaUtils} +import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint // IndexLogEntry-specific fingerprint to be temporarily used where fingerprint is not defined. case class NoOpFingerprint() { @@ -361,7 +362,7 @@ object CoveringIndex { } // IndexLogEntry-specific Signature that stores the signature provider and value. -case class Signature(provider: String, value: String) +case class Signature(provider: String, value: Fingerprint) // IndexLogEntry-specific LogicalPlanFingerprint to store fingerprint of logical plan. case class LogicalPlanFingerprint(properties: LogicalPlanFingerprint.Properties) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala index efe4f4cf5..ae2da47ed 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/IndexSignatureProvider.scala @@ -18,7 +18,7 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} /** * [[IndexSignatureProvider]] provides signature for a logical plan based on: @@ -30,9 +30,9 @@ import com.microsoft.hyperspace.util.HashingUtils * If the plan does not comply with [[FileBasedSignatureProvider]] or [[PlanSignatureProvider]] * requirements for signature computation, then no signature will be provided for the plan. */ -class IndexSignatureProvider extends LogicalPlanSignatureProvider { - private val fileBasedSignatureProvider = new FileBasedSignatureProvider - private val planSignatureProvider = new PlanSignatureProvider +class IndexSignatureProvider(fbf: FingerprintBuilderFactory) extends LogicalPlanSignatureProvider { + private val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) + private val planSignatureProvider = new PlanSignatureProvider(fbf) /** * Generate the signature of logical plan. @@ -41,10 +41,10 @@ class IndexSignatureProvider extends LogicalPlanSignatureProvider { * @return signature, if both [[FileBasedSignatureProvider]] and [[PlanSignatureProvider]] * can generate signature for the logical plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { fileBasedSignatureProvider.signature(logicalPlan).flatMap { f => planSignatureProvider.signature(logicalPlan).map { p => - HashingUtils.md5Hex(f + p) + fbf.create.add(f).add(p).build() } } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala index 4c8277cba..2c12d9053 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/LogicalPlanSignatureProvider.scala @@ -21,8 +21,12 @@ import scala.util.{Success, Try} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.util.hyperspace.Utils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory, MD5FingerprintBuilderFactory} + /** * This trait contains the interface that provides the signature of logical plan. + * + * The implementation must have a constructor taking [[FingerprintBuilderFactory]] as an argument. */ trait LogicalPlanSignatureProvider { @@ -36,15 +40,17 @@ trait LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if it can be computed w.r.t signature provider assumptions; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] } /** * Factory object for LogicalPlanSignatureProvider. */ object LogicalPlanSignatureProvider { + private val fbf: FingerprintBuilderFactory = new MD5FingerprintBuilderFactory + // Creates a default signature provider. - def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider + def create(): LogicalPlanSignatureProvider = new IndexSignatureProvider(fbf) /** * Creates a parameterized signature provider. @@ -53,7 +59,9 @@ object LogicalPlanSignatureProvider { * @return signature provider. */ def create(name: String): LogicalPlanSignatureProvider = { - Try(Utils.classForName(name).newInstance) match { + Try(Utils.classForName(name) + .getConstructor(classOf[FingerprintBuilderFactory]) + .newInstance(fbf)) match { case Success(provider: LogicalPlanSignatureProvider) => provider case _ => throw new IllegalArgumentException( diff --git a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala index 8a780db4b..3532e32f9 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/PlanSignatureProvider.scala @@ -18,14 +18,14 @@ package com.microsoft.hyperspace.index import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder, FingerprintBuilderFactory} /** * [[PlanSignatureProvider]] provides signature for a logical plan based on * the type of operators in it. * A plan needs to have at least one operator so its signature can be generated. */ -class PlanSignatureProvider extends LogicalPlanSignatureProvider { +class PlanSignatureProvider(fbf: FingerprintBuilderFactory) extends LogicalPlanSignatureProvider { /** * Generate the signature of logical plan. @@ -33,12 +33,9 @@ class PlanSignatureProvider extends LogicalPlanSignatureProvider { * @param logicalPlan logical plan. * @return signature if there is at least one operator in the plan; Otherwise None. */ - def signature(logicalPlan: LogicalPlan): Option[String] = { - var signature = "" - logicalPlan.foreachUp(p => signature = HashingUtils.md5Hex(signature + p.nodeName)) - signature match { - case "" => None - case _ => Some(signature) - } + def signature(logicalPlan: LogicalPlan): Option[Fingerprint] = { + val fb: FingerprintBuilder = fbf.create + logicalPlan.foreachUp(node => fb.add(node.nodeName)) + Some(fb.build()) } } diff --git a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala index 087f6559b..0931c577e 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala @@ -34,6 +34,7 @@ import com.microsoft.hyperspace.index.IndexLogEntryTags.{HYBRIDSCAN_RELATED_CONF import com.microsoft.hyperspace.index.plans.logical.{BucketUnion, IndexHadoopFsRelation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.HyperspaceConf +import com.microsoft.hyperspace.util.fingerprint.Fingerprint object RuleUtils { @@ -54,7 +55,7 @@ object RuleUtils { indexes: Seq[IndexLogEntry], relation: FileBasedRelation): Seq[IndexLogEntry] = { // Map of a signature provider to a signature generated for the given plan. - val signatureMap = mutable.Map[String, Option[String]]() + val signatureMap = mutable.Map[String, Option[Fingerprint]]() def signatureValid(entry: IndexLogEntry): Boolean = { entry.withCachedTag(relation.plan, IndexLogEntryTags.SIGNATURE_MATCHED) { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala index 9b845cbdf..906a274f9 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/default/DefaultFileBasedRelation.scala @@ -30,7 +30,7 @@ import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, Relation} import com.microsoft.hyperspace.index.IndexConstants.GLOBBING_PATTERN_KEY import com.microsoft.hyperspace.index.sources.FileBasedRelation -import com.microsoft.hyperspace.util.HashingUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DefaultFileBasedSource]] @@ -42,13 +42,12 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe /** * Computes the signature of the current relation. */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _) => - val result = filesFromIndex(location).sortBy(_.getPath.toString).foldLeft("") { - (acc: String, f: FileStatus) => - HashingUtils.md5Hex(acc + fingerprint(f)) - } - result + val initialFingerprint = fb.build() + var fingerprint = initialFingerprint + filesFromIndex(location).foreach(fingerprint ^= createFingerprint(_, fb)) + Some(fingerprint).filter(_ != initialFingerprint) } /** @@ -179,9 +178,11 @@ class DefaultFileBasedRelation(spark: SparkSession, override val plan: LogicalRe } } - private def fingerprint(fileStatus: FileStatus): String = { - fileStatus.getLen.toString + fileStatus.getModificationTime.toString + - fileStatus.getPath.toString + private def createFingerprint(fileStatus: FileStatus, fb: FingerprintBuilder): Fingerprint = { + fb.add(fileStatus.getLen) + .add(fileStatus.getModificationTime) + .add(fileStatus.getPath.toString) + .build() } private def filesFromIndex(index: PartitioningAwareFileIndex): Seq[FileStatus] = { diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala index 5a8d591c3..1a82f10ba 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/delta/DeltaLakeRelation.scala @@ -23,9 +23,10 @@ import org.apache.spark.sql.delta.files.TahoeLogFileIndex import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import com.microsoft.hyperspace.Hyperspace -import com.microsoft.hyperspace.index.{Content, FileIdTracker, FileInfo, Hdfs, IndexLogEntry, Relation} +import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexLogEntry, Relation} import com.microsoft.hyperspace.index.sources.default.DefaultFileBasedRelation import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[DeltaLakeFileBasedSource]] @@ -36,9 +37,9 @@ class DeltaLakeRelation(spark: SparkSession, override val plan: LogicalRelation) /** * Computes the signature of the current relation. */ - override def signature: String = plan.relation match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.relation match { case HadoopFsRelation(location: TahoeLogFileIndex, _, _, _, _, _) => - location.tableVersion + location.path.toString + Some(fb.add(location.tableVersion).add(location.path.toString).build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala index acec90b79..105fc4d9a 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/iceberg/IcebergRelation.scala @@ -16,8 +16,6 @@ package com.microsoft.hyperspace.index.sources.iceberg -import java.util.Locale - import collection.JavaConverters._ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.iceberg.{FileScanTask, Schema, Table} @@ -28,15 +26,16 @@ import org.apache.iceberg.spark.SparkSchemaUtil import org.apache.iceberg.spark.source.IcebergSource import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.AttributeReference -import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation, PartitioningAwareFileIndex} +import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.types.StructType -import com.microsoft.hyperspace.index.{Content, FileIdTracker, FileInfo, Hdfs, IndexConstants, Relation} +import com.microsoft.hyperspace.index.{Content, FileIdTracker, Hdfs, IndexConstants, Relation} import com.microsoft.hyperspace.index.sources.FileBasedRelation import com.microsoft.hyperspace.util.PathUtils +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * Implementation for file-based relation used by [[IcebergFileBasedSource]] @@ -47,11 +46,11 @@ class IcebergRelation(spark: SparkSession, override val plan: DataSourceV2Relati /** * Computes the signature of the current relation. */ - override def signature: String = plan.source match { + override def signature(fb: FingerprintBuilder): Option[Fingerprint] = plan.source match { case _: IcebergSource => val table = loadIcebergTable val snapshotId = plan.options.getOrElse("snapshot-id", table.currentSnapshot().snapshotId()) - snapshotId + table.location() + Some(fb.add(snapshotId.toString).add(table.location).build()) } /** diff --git a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala index 9e857c5e7..011d0ddae 100644 --- a/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala +++ b/src/main/scala/com/microsoft/hyperspace/index/sources/interfaces.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, import org.apache.spark.sql.types.StructType import com.microsoft.hyperspace.index.{FileIdTracker, FileInfo, IndexConstants, IndexLogEntry, Relation} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilder} /** * ::Experimental:: @@ -57,8 +58,11 @@ trait FileBasedRelation extends SourceRelation { * * This API is used when the signature of source needs to be computed, e.g., creating an index, * computing query plan's signature, etc. + * + * If it is not possible to compute the signature (e.g. there are no files left), + * the implementation might return None. */ - def signature: String + def signature(fb: FingerprintBuilder): Option[Fingerprint] /** * FileStatus list for all source files that the current relation references to. diff --git a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala b/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala deleted file mode 100644 index 530cae4ca..000000000 --- a/src/main/scala/com/microsoft/hyperspace/util/HashingUtils.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import org.apache.commons.codec.digest.DigestUtils - -/** - * HashingUtils supplies different hashing functions. - */ -object HashingUtils { - - /** - * MD5-based hashing function. - * - * @param input the input to be hashed, for Any type of data. - * @return the hash code string. - */ - def md5Hex(input: Any): String = { - DigestUtils.md5Hex(input.toString) - } -} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala new file mode 100644 index 000000000..cbaa80cda --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/Fingerprint.scala @@ -0,0 +1,64 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import javax.xml.bind.DatatypeConverter + +import com.fasterxml.jackson.annotation.{JsonCreator, JsonValue} + +/** + * Represents a fingerprint, which can be used to identify and compare objects + * without actually comparing them. + * + * The value of a fingerprint is typically calculated by a hash function. To + * function as a fingerprinting function, the hash function must distribute + * the input uniformly into hashed values, and the number of bytes of the hash + * value should be sufficiently large. For example, MD5 produces 128-bit hash + * values (16 bytes), whereas SHA-1 produces 160-bit hash values (20 bytes). + * + * All objects to be identified and compared with each other must have fingerprints + * created from the same hash function. + */ +case class Fingerprint(value: Vector[Byte]) { + + /** + * Returns a human-readable form of this fingerprint in HEX format. + */ + @JsonValue + override def toString: String = { + value.map(_.formatted("%02x")).mkString + } + + /** + * Returns a new fingerprint by applying bitwise XOR to this and the other fingerprints. + */ + def ^(other: Fingerprint): Fingerprint = { + require(value.size == other.value.size) + Fingerprint(value.zip(other.value).map{ case (x, y) => (x ^ y).toByte }) + } +} + +object Fingerprint { + + /** + * Creates a Fingerprint from a HEX string. + */ + @JsonCreator + def apply(value: String): Fingerprint = { + Fingerprint(DatatypeConverter.parseHexBinary(value).toVector) + } +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala new file mode 100644 index 000000000..436a37391 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilder.scala @@ -0,0 +1,91 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + +/** + * A builder class for Fingerprint. + * A fingerprint is built by hashing added values sequentially. + * The order of values is significant. + */ +class FingerprintBuilder(private val md: MessageDigest) { + + def add(value: Boolean): FingerprintBuilder = { + md.update((if (value) 1 else 0).toByte) + this + } + + def add(value: Byte): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Short): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putShort(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Int): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putInt(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Long): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putLong(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Float): FingerprintBuilder = { + md.update(ByteBuffer.allocate(4).putFloat(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Double): FingerprintBuilder = { + md.update(ByteBuffer.allocate(8).putDouble(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: Char): FingerprintBuilder = { + md.update(ByteBuffer.allocate(2).putChar(value).flip().asInstanceOf[ByteBuffer]) + this + } + + def add(value: String): FingerprintBuilder = { + md.update(value.getBytes(StandardCharsets.UTF_8)) + this + } + + def add(value: Array[Byte]): FingerprintBuilder = { + md.update(value) + this + } + + def add(value: Fingerprint): FingerprintBuilder = { + md.update(value.value.toArray) + this + } + + /** + * Builds a fingerprint by hashing added values. + * After this call is made, values are reset and this object + * can be used for another fingerprint. + */ + def build(): Fingerprint = Fingerprint(md.digest().toVector) +} diff --git a/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala new file mode 100644 index 000000000..08fb7dba9 --- /dev/null +++ b/src/main/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderFactory.scala @@ -0,0 +1,49 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils + +/** + * A factory trait for FingerprintBuilder. This trait must be implemented without any state. + * The same instance of this trait can be shared and used by many objects. + */ +trait FingerprintBuilderFactory { + + /** + * Creates a fingerprint builder. + */ + def create: FingerprintBuilder +} + +/** + * MD5-based fingerprint builder. The fingerprint has 16 bytes (128 bits). + */ +class MD5FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getMd5Digest) + } +} + +/** + * SHA1-based fingerprint builder. The fingerprint has 20 bytes (160 bits). + */ +class SHA1FingerprintBuilderFactory extends FingerprintBuilderFactory { + override def create: FingerprintBuilder = { + new FingerprintBuilder(DigestUtils.getSha1Digest) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala index a978718b5..ef69db0c7 100644 --- a/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/actions/RefreshActionTest.scala @@ -28,6 +28,7 @@ import com.microsoft.hyperspace.{HyperspaceException, SampleData, SparkInvolvedS import com.microsoft.hyperspace.actions.Constants.States.{ACTIVE, CREATING} import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.sources.FileBasedSourceProviderManager +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class RefreshActionTest extends SparkFunSuite with SparkInvolvedSuite { private val sampleParquetDataLocation = "src/test/resources/sampleparquet" @@ -78,7 +79,8 @@ class RefreshActionTest extends SparkFunSuite with SparkInvolvedSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala index 6df97afc3..b574f526a 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/FileBasedSignatureProviderTest.scala @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.SparkFunSuite import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength = 100 @@ -30,6 +31,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private val fileModificationTimeDelta = 10 private val newFilePath = new Path("newPath") + private val fbf = new MD5FingerprintBuilderFactory + test("Logical relations from a same file have the same signature.") { val signature1 = createFileBasedSignature(Seq(createFileStatus(fileLength, fileModificationTime, filePath))) @@ -98,7 +101,7 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui } test("Create FileBasedSignatureProvider.") { - val fileBasedSignatureProvider = new FileBasedSignatureProvider + val fileBasedSignatureProvider = new FileBasedSignatureProvider(fbf) assert( LogicalPlanSignatureProvider .create(fileBasedSignatureProvider.name) @@ -113,8 +116,8 @@ class FileBasedSignatureProviderTest extends SparkFunSuite with SparkInvolvedSui private def createFileStatus(length: Long, modificationTime: Long, path: Path): FileStatus = SignatureProviderTestUtils.createFileStatus(length, modificationTime, path) - private def createFileBasedSignature(files: Seq[FileStatus]): String = - new FileBasedSignatureProvider() + private def createFileBasedSignature(files: Seq[FileStatus]): Fingerprint = + new FileBasedSignatureProvider(fbf) .signature(SignatureProviderTestUtils.createLogicalRelation(spark, files)) match { case Some(s) => s case None => throw HyperspaceException("Invalid plan for signature generation.") diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala index a23403ad8..9559db5c8 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCacheTest.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType} import com.microsoft.hyperspace.{Hyperspace, HyperspaceException, SampleData, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.util.FileUtils +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCacheTest extends SparkFunSuite with SparkInvolvedSuite { val sampleParquetDataLocation = "src/test/resources/sampleparquet" @@ -48,7 +49,8 @@ class IndexCacheTest extends SparkFunSuite with SparkInvolvedSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( "index1", diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala index 13dc15a88..5fa297c6b 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexCollectionManagerTest.scala @@ -25,6 +25,7 @@ import org.mockito.Mockito.{mock, when} import com.microsoft.hyperspace.{HyperspaceException, SparkInvolvedSuite} import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index.IndexConstants.{REFRESH_MODE_FULL, REFRESH_MODE_INCREMENTAL} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexCollectionManagerTest extends SparkFunSuite with SparkInvolvedSuite { private val indexSystemPath = "src/test/resources/indexLocation" @@ -44,7 +45,8 @@ class IndexCollectionManagerTest extends SparkFunSuite with SparkInvolvedSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint(LogicalPlanFingerprint.Properties( + Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( indexPath.toString, @@ -97,7 +99,8 @@ class IndexCollectionManagerTest extends SparkFunSuite with SparkInvolvedSuite { Seq(), null, null, - LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature("", ""))))) + LogicalPlanFingerprint(LogicalPlanFingerprint.Properties( + Seq(Signature("", Fingerprint("")))))) val entry = IndexLogEntry( str, diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala index ebe4a5eec..b0d0f1117 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogEntryTest.scala @@ -31,6 +31,7 @@ import org.scalatest.BeforeAndAfter import com.microsoft.hyperspace.{HyperspaceException, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.UNKNOWN_FILE_ID import com.microsoft.hyperspace.util.{JsonUtils, PathUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogEntryTest extends SparkFunSuite with SQLHelper with BeforeAndAfter { var testDir: file.Path = _ @@ -167,7 +168,7 @@ class IndexLogEntryTest extends SparkFunSuite with SQLHelper with BeforeAndAfter | "properties" : { | "signatures" : [ { | "provider" : "provider", - | "value" : "signatureValue" + | "value" : "abcd" | } ] | }, | "kind" : "LogicalPlan" @@ -217,7 +218,7 @@ class IndexLogEntryTest extends SparkFunSuite with SQLHelper with BeforeAndAfter null, LogicalPlanFingerprint( LogicalPlanFingerprint - .Properties(Seq(Signature("provider", "signatureValue"))) + .Properties(Seq(Signature("provider", Fingerprint("abcd")))) )) val expected = IndexLogEntry( diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala index 789c1de32..96b2d3c5b 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexLogManagerImplTest.scala @@ -26,6 +26,7 @@ import org.scalatest.BeforeAndAfterAll import com.microsoft.hyperspace.{SparkInvolvedSuite, TestUtils} import com.microsoft.hyperspace.index.IndexConstants.HYPERSPACE_LOG import com.microsoft.hyperspace.util.{FileUtils, JsonUtils} +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexLogManagerImplTest extends SparkFunSuite @@ -71,7 +72,8 @@ class IndexLogManagerImplTest null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("provider", "signature"))))))), + LogicalPlanFingerprint.Properties( + Seq(Signature("provider", Fingerprint("abcd")))))))), Map()) private def getEntry(state: String): LogEntry = { diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala index 912e67cd6..8fae5064a 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexSignatureProviderTest.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import com.microsoft.hyperspace.SparkInvolvedSuite +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, MD5FingerprintBuilderFactory} class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileLength1 = 100 @@ -36,6 +37,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { private val fileModificationTime2 = 35000 private val filePath2 = new Path("originalLocation2") + private val fbf = new MD5FingerprintBuilderFactory + test("Verify signature for a plan with same logical relation node.") { val r1 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) val r2 = createSimplePlan(fileLength1, fileModificationTime1, filePath1) @@ -127,8 +130,8 @@ class IndexSignatureProviderTest extends SparkFunSuite with SparkInvolvedSuite { Aggregate(grpExpression, aggExpression, projectNode) } - private def createIndexSignature(logicalPlan: LogicalPlan): String = - new IndexSignatureProvider().signature(logicalPlan) match { + private def createIndexSignature(logicalPlan: LogicalPlan): Fingerprint = + new IndexSignatureProvider(fbf).signature(logicalPlan) match { case Some(s) => s case None => fail("Invalid plan for signature generation.") } diff --git a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala index 2a3f17c48..f901f6254 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/IndexTest.scala @@ -20,6 +20,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import com.microsoft.hyperspace.actions.Constants +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class IndexTest extends SparkFunSuite { val indexConfig1 = IndexConfig("myIndex1", Array("id"), Seq("name")) @@ -35,7 +36,8 @@ class IndexTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val entry = IndexLogEntry( config.indexName, diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala index f49d8ca2d..6f30fb8a9 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleSuite.scala @@ -27,6 +27,7 @@ import com.microsoft.hyperspace.HyperspaceException import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ import com.microsoft.hyperspace.index.Hdfs.Properties +import com.microsoft.hyperspace.util.fingerprint.MD5FingerprintBuilderFactory trait HyperspaceRuleSuite extends HyperspaceSuite { private val filenames = Seq("f1.parquet", "f2.parquet") @@ -38,7 +39,8 @@ trait HyperspaceRuleSuite extends HyperspaceSuite { numBuckets: Int = 10, inputFiles: Seq[FileInfo] = Seq(), writeLog: Boolean = true): IndexLogEntry = { - val signClass = new RuleTestHelper.TestSignatureProvider().getClass.getName + val fbf = new MD5FingerprintBuilderFactory + val signClass = new RuleTestHelper.TestSignatureProvider(fbf).getClass.getName LogicalPlanSignatureProvider.create(signClass).signature(plan) match { case Some(s) => diff --git a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala index 708517d10..f29b9eb96 100644 --- a/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala +++ b/src/test/scala/com/microsoft/hyperspace/index/rules/RuleTestHelper.scala @@ -20,15 +20,16 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import com.microsoft.hyperspace.index.LogicalPlanSignatureProvider +import com.microsoft.hyperspace.util.fingerprint.{Fingerprint, FingerprintBuilderFactory} object RuleTestHelper { - class TestSignatureProvider extends LogicalPlanSignatureProvider { - def signature(plan: LogicalPlan): Option[String] = + class TestSignatureProvider(fbf: FingerprintBuilderFactory) extends LogicalPlanSignatureProvider { + def signature(plan: LogicalPlan): Option[Fingerprint] = plan .collectFirst { case LogicalRelation(HadoopFsRelation(location, _, _, _, _, _), _, _, _) => location.hashCode() } - .map(_.toString) + .map(fbf.create.add(_).build()) } } diff --git a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala deleted file mode 100644 index 10f6f43f9..000000000 --- a/src/test/scala/com/microsoft/hyperspace/util/HashingUtilsTest.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (2020) The Hyperspace Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.microsoft.hyperspace.util - -import java.util.UUID - -import org.apache.spark.SparkFunSuite - -class HashingUtilsTest extends SparkFunSuite { - test("For md5Hashing(), same input has the same output hash code.") { - val randomUUID = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID) - val hashCode2 = HashingUtils.md5Hex(randomUUID) - assert(hashCode1.equals(hashCode2)) - } - - test("For md5Hashing(), different input different output hash code.") { - val randomUUID1 = UUID.randomUUID.toString - val randomUUID2 = UUID.randomUUID.toString - val hashCode1 = HashingUtils.md5Hex(randomUUID1) - val hashCode2 = HashingUtils.md5Hex(randomUUID2) - assert(!hashCode1.equals(hashCode2)) - } -} diff --git a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala index 686bc6f42..1d9295ab1 100644 --- a/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala +++ b/src/test/scala/com/microsoft/hyperspace/util/JsonUtilsTest.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructT import com.microsoft.hyperspace.actions.Constants import com.microsoft.hyperspace.index._ +import com.microsoft.hyperspace.util.fingerprint.Fingerprint class JsonUtilsTest extends SparkFunSuite { test("Test for JsonUtils.") { @@ -35,7 +36,8 @@ class JsonUtilsTest extends SparkFunSuite { null, null, LogicalPlanFingerprint( - LogicalPlanFingerprint.Properties(Seq(Signature("signatureProvider", "dfSignature"))))) + LogicalPlanFingerprint.Properties( + Seq(Signature("signatureProvider", Fingerprint("abcd")))))) val index = IndexLogEntry( "myIndex", diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala new file mode 100644 index 000000000..09e7305dc --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintBuilderTest.scala @@ -0,0 +1,109 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.commons.codec.digest.DigestUtils +import org.apache.spark.SparkFunSuite + +class FingerprintBuilderTest extends SparkFunSuite { + + test("Empty MD5 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.build() == Fingerprint("d41d8cd98f00b204e9800998ecf8427e")) + } + + test("MD5 fingerprint with an integer") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).build() == Fingerprint("f1450306517624a57eafbbf8ed995985")) + } + + test("MD5 fingerprint with 0") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(0).build() == Fingerprint("f1d3ff8443297732862df21dc4e57262")) + } + + test("MD5 fingerprint with two integers") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1).add(42).build() == Fingerprint("f26c63a4633a5deea0a644cc58050446")) + } + + test("MD5 fingerprint with two integers in a different order") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42).add(1).build() == Fingerprint("3331d54f38cef13e1a2423af3c6fb223")) + } + + test("MD5 fingerprint with a long") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1L).build() == Fingerprint("fa5ad9a8557e5a84cf23e52d3d3adf77")) + } + + test("MD5 fingerprint with a short") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toShort).build() == Fingerprint("441077cc9e57554dd476bdfb8b8b8102")) + } + + test("MD5 fingerprint with a byte") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(1.toByte).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a float") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(2.3f).build() == Fingerprint("472d85449cd4441d8e13540a57aeff49")) + } + + test("MD5 fingerprint with a double") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(42.3).build() == Fingerprint("4a4591ddc290c1e7db42b538ad681549")) + } + + test("MD5 fingerprint with a boolean") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(true).build() == Fingerprint("55a54008ad1ba589aa210d2629c1df41")) + } + + test("MD5 fingerprint with a char") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add('X').build() == Fingerprint("1e160201945908b455769e0318b1a1e9")) + } + + test("MD5 fingerprint with a string") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("hello").build() == Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a string whose bytes are same as 1 (Int)") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add("\0\0\0\001").build() == fb.add(1).build()) + } + + test("MD5 fingerprint with a byte array") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(Array[Byte](104, 101, 108, 108, 111)).build() == + Fingerprint("5d41402abc4b2a76b9719d911017c592")) + } + + test("MD5 fingerprint with a fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getMd5Digest) + assert(fb.add(fb.build()).build() == Fingerprint("59adb24ef3cdbe0297f05b395827453f")) + } + + test("Empty SHA1 fingerprint") { + val fb = new FingerprintBuilder(DigestUtils.getSha1Digest) + assert(fb.build() == Fingerprint("da39a3ee5e6b4b0d3255bfef95601890afd80709")) + } +} diff --git a/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala new file mode 100644 index 000000000..a1a09b403 --- /dev/null +++ b/src/test/scala/com/microsoft/hyperspace/util/fingerprint/FingerprintTest.scala @@ -0,0 +1,57 @@ +/* + * Copyright (2020) The Hyperspace Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.microsoft.hyperspace.util.fingerprint + +import org.apache.spark.SparkFunSuite + +class FingerprintTest extends SparkFunSuite { + + test("toString returns a HEX string.") { + assert(Fingerprint(Vector[Byte](1, 2, 3, 10, 30, 66)).toString == "0102030a1e42") + } + + test("toString returns an empty string.") { + assert(Fingerprint(Vector[Byte]()).toString == "") + } + + test("Bitwise XOR works for fingerprints of the same length") { + assert( + (Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4))) == + Fingerprint(Vector[Byte]((19 ^ 31).toByte, (121 ^ -4).toByte))) + } + + test("Bitwise XOR fails for fingerprints of different lengths") { + assertThrows[IllegalArgumentException] { + Fingerprint(Vector[Byte](19, 121)) ^ Fingerprint(Vector[Byte](31, -4, 3)) + } + } + + test("Fingerprint can be created from an empty string") { + assert(Fingerprint("") == Fingerprint(Vector[Byte]())) + } + + test("Fingerprint can be created from a HEX string") { + assert(Fingerprint("0102fffe") == Fingerprint(Vector[Byte](1, 2, -1, -2))) + } + + test("Fingerprint cannot be created from an invalid string") { + val ex = intercept[IllegalArgumentException] { + Fingerprint("xyz") + } + assert(ex.getMessage.contains("hexBinary needs to be even-length")) + } +}