linkedin · jverbus · May 30, 2024 · May 30, 2024
diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
@@ -46,7 +46,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
 
     // Validate $(maxFeatures) and $(maxSamples) against input dataset and determine the values
     // actually used to train the model: numFeatures and numSamples
-    val totalNumFeatures = dataset.head.features.length
+    val totalNumFeatures = dataset.head().features.length
     val numFeatures = if ($(maxFeatures) > 1.0) {
       math.floor($(maxFeatures)).toInt
     } else {
@@ -94,7 +94,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       .partitionBy(new HashPartitioner($(numEstimators)))
     val repartitionedFlattenedSampledDataset = repartitionedFlattenedSampledRdd
       .mapPartitions(x => x.map(y => y._2), preservesPartitioning = true)
-      .toDS
+      .toDS()
     logInfo(s"Training ${$(numEstimators)} isolation trees using" +
       s" ${repartitionedFlattenedSampledDataset.rdd.getNumPartitions} partitions.")
 
@@ -106,7 +106,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       // Use a different seed for each partition to ensure each partition has an independent set of
       // random numbers. This ensures each tree is truly trained independently and doing so has a
       // measurable effect on the results.
-      val seed = $(randomSeed) + TaskContext.get.partitionId() + 2
+      val seed = $(randomSeed) + TaskContext.get().partitionId() + 2
       val rnd = new scala.util.Random(seed)
 
       val dataForTree = rnd.shuffle(x.toSeq).slice(0, numSamples).toArray
@@ -124,7 +124,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       // random numbers. This ensures each tree is truly trained independently and doing so has a
       // measurable effect on the results.
       Iterator(IsolationTree
-        .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get.partitionId() + 2, featureIndices))
+        .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get().partitionId() + 2, featureIndices))
     }).collect()
 
     val isolationForestModel = copyValues(

diff --git a/...src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala b/...src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala
@@ -254,7 +254,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi
 
       saveMetadata(instance, path, spark, Some(extraMetadata))
       val dataPath = new Path(path, "data").toString
-      val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex)
+      val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex.toIndexedSeq)
         .flatMap { case (tree, treeID) => EnsembleNodeData.build(tree, treeID) }
       logInfo(s"Saving IsolationForestModel tree data to path ${dataPath}")
       spark.createDataFrame(nodeDataRDD)
@@ -299,7 +299,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi
 
       val uid = instance.uid
       val cls = instance.getClass.getName
-      val params = instance.extractParamMap.toSeq
+      val params = instance.extractParamMap().toSeq
       val jsonParams = render(params.map { case ParamPair(p, v) =>
         p.name -> parse(p.jsonEncode(v))
       }.toList)

diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala
@@ -47,7 +47,7 @@ private[isolationforest] case object IsolationTree extends Logging {
   def fit(data: Array[DataPoint], randomSeed: Long, featureIndices: Array[Int]): IsolationTree = {
 
     logInfo(s"Fitting isolation tree with random seed ${randomSeed} on" +
-    s" ${featureIndices.seq.toString} features (indices) using ${data.length} data points.")
+    s" ${featureIndices.toIndexedSeq.toString} features (indices) using ${data.length} data points.")
 
     def log2(x: Double): Double = math.log10(x) / math.log10(2.0)
     val heightLimit = math.ceil(log2(data.length.toDouble)).toInt
@@ -124,7 +124,7 @@ private[isolationforest] case object IsolationTree extends Logging {
           if (minFeatureValue != maxFeatureValue) {
             foundFeature = true
             featureIndex = featureIndexTrial
-            featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble
+            featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble()
               + minFeatureValue)
           }
         }

diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala
@@ -45,7 +45,7 @@ class BaggedPointTest {
     numCols: Int,
     expectedMean: Double,
     expectedStddev: Double,
-    epsilon: Double) {
+    epsilon: Double): Unit = {
 
     val values = new mutable.ArrayBuffer[Double]()
     data.foreach { row =>
@@ -63,7 +63,7 @@ class BaggedPointTest {
     val spark = getSparkSession
 
     val dataPointArray = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(dataPointArray)
+    val rdd = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
     baggedRDD.collect().foreach { baggedPoint =>
       assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
@@ -80,7 +80,7 @@ class BaggedPointTest {
 
     val seeds = Array(123, 5354, 230, 349867, 23987)
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -101,7 +101,7 @@ class BaggedPointTest {
 
     val seeds = Array(123, 5354, 230, 349867, 23987)
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -121,7 +121,7 @@ class BaggedPointTest {
 
     val seeds = Array(123, 5354, 230, 349867, 23987)
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -142,7 +142,7 @@ class BaggedPointTest {
 
     val seeds = Array(123, 5354, 230, 349867, 23987)
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -168,7 +168,7 @@ class BaggedPointTest {
                                (1, dataPointArray(1)),
                                (1, dataPointArray(1)))
 
-    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
     val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
     val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()
@@ -187,7 +187,7 @@ class BaggedPointTest {
     val dataPointArray = generateDataPoints(10, numRecords)
     val subsampleWeights = Array(1.3, 1.75)
 
-    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
     val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
     val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()

diff --git a/...test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala b/...test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala
@@ -38,14 +38,14 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModel"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
     // Assert that all parameter values are equal
     Assert.assertEquals(
-      isolationForestModel1.extractParamMap.toString,
-      isolationForestModel2.extractParamMap.toString)
+      isolationForestModel1.extractParamMap().toString,
+      isolationForestModel2.extractParamMap().toString)
     Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
     Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
     Assert.assertEquals(
@@ -64,8 +64,8 @@ class IsolationForestModelWriteReadTest extends Logging {
     Assert.assertEquals(auroc1, auroc2)
 
     // Assert the predicted labels are equal
-    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
-    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
     Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq) 
 
     // Compare each tree in the original and saved/loaded model and assert they are equal
@@ -102,14 +102,14 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModelZeroContamination"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
     // Assert that all parameter values are equal
     Assert.assertEquals(
-      isolationForestModel1.extractParamMap.toString,
-      isolationForestModel2.extractParamMap.toString)
+      isolationForestModel1.extractParamMap().toString,
+      isolationForestModel2.extractParamMap().toString)
     Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
     Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
     Assert.assertEquals(
@@ -128,8 +128,8 @@ class IsolationForestModelWriteReadTest extends Logging {
     Assert.assertEquals(auroc1, auroc2)
 
     // Assert the predicted labels are equal and always 0.0
-    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
-    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
     val expectedLabels = Array.fill[Double](predictedLabels1.length)(0.0)
     Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq) 
     Assert.assertEquals(predictedLabels2.toSeq, expectedLabels.toSeq) 
@@ -182,7 +182,7 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModelIdenticalFeatures"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
@@ -197,8 +197,8 @@ class IsolationForestModelWriteReadTest extends Logging {
     val scores2 = isolationForestModel2.transform(data).as[ScoringResult]
 
     Assert.assertEquals(
-      scores1.map(x => x.outlierScore).collect.toSeq,
-      scores2.map(x => x.outlierScore).collect.toSeq)
+      scores1.map(x => x.outlierScore).collect().toSeq,
+      scores2.map(x => x.outlierScore).collect().toSeq)
 
     spark.stop()
   }
@@ -214,14 +214,14 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/emptyIsolationForestModelWriteReadTest"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
     // Assert that all parameter values are equal
     Assert.assertEquals(
-      isolationForestModel1.extractParamMap.toString,
-      isolationForestModel2.extractParamMap.toString)
+      isolationForestModel1.extractParamMap().toString,
+      isolationForestModel2.extractParamMap().toString)
     Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
     Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
     Assert.assertEquals(

diff --git a/...on-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala b/...on-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala
@@ -32,13 +32,13 @@ class IsolationForestTest {
       .setContaminationError(contamination * 0.01)
       .setRandomSeed(1)
 
-    isolationForest1.write.overwrite.save(savePath)
+    isolationForest1.write.overwrite().save(savePath)
     val isolationForest2 = IsolationForest.load(savePath)
     deleteDirectory(new File(savePath))
 
     Assert.assertEquals(
-      isolationForest1.extractParamMap.toString,
-      isolationForest2.extractParamMap.toString)
+      isolationForest1.extractParamMap().toString,
+      isolationForest2.extractParamMap().toString)
 
     spark.stop()
   }
@@ -148,7 +148,7 @@ class IsolationForestTest {
 
     // Calculate area under ROC curve and assert
     val scores = isolationForestModel.transform(data).as[ScoringResult]
-    val predictedLabels = scores.map(x => x.predictedLabel).collect
+    val predictedLabels = scores.map(x => x.predictedLabel).collect()
     val expectedLabels = Array.fill[Double](predictedLabels.length)(0.0)
 
     Assert.assertEquals(
@@ -195,10 +195,10 @@ class IsolationForestTest {
 
     val labeledOutlierScoresMean = labeledOutlierScores
       .map(_.outlierScore)
-      .reduce(_+_) / labeledOutlierScores.count
+      .reduce(_+_) / labeledOutlierScores.count()
     val labeledInlierScoresMean = labeledInlierScores
       .map(_.outlierScore)
-      .reduce(_+_) / labeledInlierScores.count
+      .reduce(_+_) / labeledInlierScores.count()
 
     val uncert = 0.02
     val expectedOutlierScoreMean = 0.61

diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala
@@ -28,7 +28,7 @@ object TestUtils {
     }
 
     // local context with 4 threads
-    SparkSession.builder
+    SparkSession.builder()
       .master("local[4]")
       .appName("testing-spark")
       .config(sparkConf)