Approx quantile bug fix (#4)
* Fixed the rare unexpected behavior reported in #3, which was due to an issue with Spark's approxQuantile method.

* Updated README.md to reflect approxQuantile bugfix changes.

* Addressed comments from @fastier-li about the difference between the contaminationError and effectiveError (now verificationError).
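
For context, here is a minimal sketch of the idea behind the fix (hypothetical helper and DataFrame names, not the project's actual code): the score threshold is obtained from Spark's `approxQuantile`, with the new `contaminationError` parameter passed as the relative error, and the observed contamination is then checked against a verification tolerance.

```scala
import org.apache.spark.sql.DataFrame

// Hedged sketch only; `scoredData` is assumed to already contain an "outlierScore" column.
def thresholdWithVerification(scoredData: DataFrame,
                              contamination: Double,
                              contaminationError: Double): Double = {

  // relativeError = 0.0 forces an exact quantile calculation, which is slow and can fail on
  // large datasets; a small non-zero contaminationError relaxes the calculation.
  val threshold = scoredData.stat
    .approxQuantile("outlierScore", Array(1.0 - contamination), contaminationError)
    .head

  // Verify that the fraction of points at or above the threshold matches the requested
  // contamination. For an exact calculation (contaminationError == 0.0), assume a relative
  // 1% tolerance -- this is the verificationError described above.
  val verificationError =
    if (contaminationError == 0.0) contamination * 0.01 else contaminationError
  val observedContamination =
    scoredData.filter(s"outlierScore >= $threshold").count().toDouble / scoredData.count()

  if (math.abs(observedContamination - contamination) > verificationError) {
    println(s"Observed contamination $observedContamination is outside the expected range" +
      s" $contamination +/- $verificationError")
  }

  threshold
}
```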
jverbus authored Oct 3, 2019
1 parent 4b2b54d commit 07aecef
Showing 6 changed files with 122 additions and 28 deletions.
41 changes: 22 additions & 19 deletions README.md
@@ -37,8 +37,8 @@ build.gradle file before building.

### Add an isolation-forest dependency to your project

Artifacts (built with Scala 2.11.8 and Spark 2.3.0) for this project are
[available on Bintray](https://bintray.com/beta/#/linkedin/maven/isolation-forest).
Please check [Bintray](https://bintray.com/beta/#/linkedin/maven/isolation-forest) for the latest
artifact versions (built with Scala 2.11.8 and Spark 2.3.0).

#### Gradle example

@@ -64,7 +64,7 @@ Second, add the isolation-forest dependency to the module-level build.gradle file.

```
dependencies {
compile 'com.linkedin.isolation-forest:isolation-forest_2.11:0.2.2'
compile 'com.linkedin.isolation-forest:isolation-forest_2.11:0.3.0'
}
```

@@ -114,23 +114,24 @@ Second, declare the isolation-forest dependency in your project's pom.xml file.
<dependency>
<groupId>com.linkedin.isolation-forest</groupId>
<artifactId>isolation-forest_2.11</artifactId>
<version>0.2.2</version>
<version>0.3.0</version>
</dependency>
```

### Model parameters

| Parameter | Default Value | Description |
|---------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| numEstimators | 100 | The number of trees in the ensemble. |
| maxSamples | 256 | The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count. |
| contamination | 0.0 | The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter. |
| maxFeatures | 1.0 | The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count. |
| bootstrap | false | If true, draw sample for each tree with replacement. If false, do not sample with replacement. |
| randomSeed | 1 | The seed used for the random number generator. |
| featuresCol | "features" | The feature vector. This column must exist in the input DataFrame for training and scoring. |
| predictionCol | "predictedLabel" | The predicted label. This column is appended to the input DataFrame upon scoring. |
| scoreCol | "outlierScore" | The outlier score. This column is appended to the input DataFrame upon scoring.
| Parameter | Default Value | Description |
|--------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| numEstimators | 100 | The number of trees in the ensemble. |
| maxSamples | 256 | The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count. |
| contamination | 0.0 | The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter. |
| contaminationError | 0.0 | The error allowed when calculating the threshold required to achieve the specified contamination fraction. The default is 0.0, which forces an exact calculation of the threshold. The exact calculation is slow and can fail for large datasets. If there are issues with the exact calculation, a good choice for this parameter is often 1% of the specified contamination value. |
| maxFeatures | 1.0 | The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count. |
| bootstrap | false | If true, draw sample for each tree with replacement. If false, do not sample with replacement. |
| randomSeed | 1 | The seed used for the random number generator. |
| featuresCol | "features" | The feature vector. This column must exist in the input DataFrame for training and scoring. |
| predictionCol | "predictedLabel" | The predicted label. This column is appended to the input DataFrame upon scoring. |
| scoreCol | "outlierScore" | The outlier score. This column is appended to the input DataFrame upon scoring. |
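
For example, with a `contamination` of 0.05, a `contaminationError` of 0.0005 (1% of the specified contamination value) is typically a reasonable starting point; the training example below follows this pattern.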

### Training and scoring

@@ -175,7 +176,8 @@ val data = assembler
/**
* Train the model
*/


val contamination = 0.1
val isolationForest = new IsolationForest()
.setNumEstimators(100)
.setBootstrap(false)
@@ -184,15 +186,16 @@ val isolationForest = new IsolationForest()
.setFeaturesCol("features")
.setPredictionCol("predictedLabel")
.setScoreCol("outlierScore")
.setContamination(0.1)
.setContamination(contamination)
.setContaminationError(0.01 * contamination)
.setRandomSeed(1)

val isolationForestModel = isolationForest.fit(data)

/**
* Score the training data
*/

val dataWithScores = isolationForestModel.transform(data)

// scala> dataWithScores.printSchema
IsolationForest.scala
@@ -1,6 +1,6 @@
package com.linkedin.relevance.isolationforest

import com.linkedin.relevance.isolationforest.Utils.DataPoint
import com.linkedin.relevance.isolationforest.Utils.{DataPoint, OutlierScore}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vector
@@ -130,24 +130,58 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationForestModel]
val isolationForestModel = copyValues(
new IsolationForestModel(uid, isolationTrees, numSamples).setParent(this))

// Determine and set the model threshold based upon the specified contamination and
// contaminationError parameters.
if ($(contamination) > 0.0) {
// Score all training instances to determine the threshold required to achieve the desired
// level of contamination. The approxQuantile method uses the algorithm in this paper:
// https://dl.acm.org/citation.cfm?id=375670
// The relative error was set to ensure the fraction of samples found to be outliers during
// training is within 1% of the value of the parameter $(contamination).
val outlierScoreThreshold = isolationForestModel
val scores = isolationForestModel
.transform(df)
.stat.approxQuantile($(scoreCol), Array(1 - $(contamination)), $(contamination) * 0.01)
.map(row => OutlierScore(row.getAs[Double]($(scoreCol))))
.cache()
val outlierScoreThreshold = scores
.stat.approxQuantile("score", Array(1 - $(contamination)), $(contaminationError))
.head
isolationForestModel.setOutlierScoreThreshold(outlierScoreThreshold)

// Determine labels for each instance using the newly calculated threshold and verify that the
// fraction of positive labels is in agreement with the user specified contamination. Issue
// a warning if the observed contamination in the scored training data is outside the expected
// bounds.
//
// If the user specifies a non-zero contaminationError model parameter, then the
// verificationError used for the verification calculation is equal to the
// contaminationError parameter value. If the user selects an "exact" calculation of the
// threshold by setting the parameter contaminationError = 0.0, then assume a
// verificationError equal to 1% of the contamination parameter value for the validation
// calculation.
val observedContamination = scores
.map(outlierScore => if(outlierScore.score >= outlierScoreThreshold) 1.0 else 0.0)
.reduce(_ + _) / scores.count()
val verificationError = if (${contaminationError} == 0.0) {
// If the threshold is calculated exactly, then assume a relative 1% error on the specified
// contamination for the verification.
$(contamination) * 0.01
} else {
${contaminationError}
}
if (math.abs(observedContamination - $(contamination)) > verificationError) {

logWarning(s"Observed contamination is ${observedContamination}, which is outside" +
s" the expected range of ${${contamination}} +/- ${verificationError}. If this is" +
s" acceptable to you, then it is OK to proceed. If there is a very large discrepancy" +
s" between observed and expected values, then please try retraining the model with an" +
s" exact threshold calculation (set the contaminationError parameter value to 0.0).")
}
} else {
// Do not set the outlier score threshold, which ensures no outliers are found. This speeds up
// the algorithm runtime by avoiding the approxQuantile calculation.
logWarning(s"Contamination parameter was set to ${$(contamination)}, so all predicted" +
logInfo(s"Contamination parameter was set to ${$(contamination)}, so all predicted" +
" labels will be false. The model and outlier scores are otherwise not affected by this" +
" parameter choice.")
}

isolationForestModel
}

IsolationForestParams.scala
@@ -37,6 +37,16 @@ trait IsolationForestParams extends Params {
def setBootstrap(value: Boolean): this.type = set(bootstrap, value)
final def getBootstrap: Boolean = $(bootstrap)

final val contaminationError = new DoubleParam(this, "contaminationError", "The error" +
" allowed when calculating the threshold required to achieve the specified contamination" +
" fraction. The default is 0.0, which forces an exact calculation of the threshold. The" +
" exact calculation is slow and can fail for large datasets. If there are issues with the" +
" exact calculation, a good choice for this parameter is often 1% of the specified" +
" contamination value.",
ParamValidators.inRange(0.0, 1, lowerInclusive = true, upperInclusive = true))
def setContaminationError(value: Double): this.type = set(contaminationError, value)
final def getContaminationError: Double = $(contaminationError)

final val randomSeed = new LongParam(this, "randomSeed", "The seed used for the random" +
" number generator.", ParamValidators.gt(0.0))
def setRandomSeed(value: Long): this.type = set(randomSeed, value)
@@ -58,6 +68,7 @@ trait IsolationForestParams extends Params {
numEstimators -> 100,
maxSamples -> 256,
contamination -> 0.0,
contaminationError -> 0.0,
maxFeatures -> 1.0,
bootstrap -> false,
randomSeed -> 1,
Utils.scala
@@ -7,6 +7,7 @@ package com.linkedin.relevance.isolationforest
private[isolationforest] object Utils extends Serializable {

case class DataPoint(features: Array[Float])
case class OutlierScore(score: Double)

val EulerConstant = 0.5772156649f

IsolationForestTest.scala
@@ -19,6 +19,7 @@ class IsolationForestTest {

val savePath = System.getProperty("java.io.tmpdir") + "/isolationForestEstimatorWriteReadTest"

val contamination = 0.02
val isolationForest1 = new IsolationForest()
.setNumEstimators(200)
.setBootstrap(true)
@@ -27,7 +28,8 @@
.setFeaturesCol("featuresTestColumn")
.setPredictionCol("predictedLabelTestColumn")
.setScoreCol("outlierScoreTestColumn")
.setContamination(0.02)
.setContamination(contamination)
.setContaminationError(contamination * 0.01)
.setRandomSeed(1)

isolationForest1.write.overwrite.save(savePath)
@@ -50,6 +52,46 @@

val data = loadMammographyData(spark)

// Train a new isolation forest model
val contamination = 0.02
val isolationForest = new IsolationForest()
.setNumEstimators(100)
.setBootstrap(false)
.setMaxSamples(256)
.setMaxFeatures(1.0)
.setFeaturesCol("features")
.setPredictionCol("predictedLabel")
.setScoreCol("outlierScore")
.setContamination(0.02)
.setContaminationError(contamination * 0.01)
.setRandomSeed(1)

// Score all training data instances using the new model
val isolationForestModel = isolationForest.fit(data)

// Calculate area under ROC curve and assert
val scores = isolationForestModel.transform(data).as[ScoringResult]
val metrics = new BinaryClassificationMetrics(scores.rdd.map(x => (x.outlierScore, x.label)))

// Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al.
val aurocExpectation = 0.86
val uncert = 0.02
val auroc = metrics.areaUnderROC()
Assert.assertTrue(auroc === aurocExpectation +- uncert, "expected area under ROC =" +
s" $aurocExpectation +/- $uncert, but observed $auroc")

spark.stop()
}

@Test(description = "isolationForestMammographyExactContaminationDataTest")
def isolationForestMammographyExactContaminationDataTest(): Unit = {

val spark = getSparkSession

import spark.implicits._

val data = loadMammographyData(spark)

// Train a new isolation forest model
val isolationForest = new IsolationForest()
.setNumEstimators(100)
@@ -60,6 +102,7 @@
.setPredictionCol("predictedLabel")
.setScoreCol("outlierScore")
.setContamination(0.02)
.setContaminationError(0.0)
.setRandomSeed(1)

// Score all training data instances using the new model
@@ -125,6 +168,7 @@
val data = loadShuttleData(spark)

// Train a new isolation forest model
val contamination = 0.07
val isolationForest = new IsolationForest()
.setNumEstimators(100)
.setBootstrap(false)
@@ -133,7 +177,8 @@
.setFeaturesCol("features")
.setPredictionCol("predictedLabel")
.setScoreCol("outlierScore")
.setContamination(0.07)
.setContamination(contamination)
.setContaminationError(contamination * 0.01)
.setRandomSeed(1)

// Score all training data instances using the new model
2 changes: 1 addition & 1 deletion version.properties
@@ -1,4 +1,4 @@
#Version of the produced binaries. This file is intended to be checked-in.
#It will be automatically bumped by release automation.
version=0.2.3
version=0.3.0
previousVersion=0.2.2
