 */
 package org.apache.spark.sql.crossdata.execution

+import java.io._
+
 import com.stratio.common.utils.components.logger.impl.Slf4jLoggerComponent
 import com.stratio.crossdata.common.profiler.PerformanceLogger
 import com.stratio.crossdata.connector.NativeScan
 import com.stratio.crossdata.metrics.{MetricsGlossary, MetricsRegister}
+import org.apache.spark.SparkEnv
+import org.apache.spark.io.CompressionCodec
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeSet, GetMapValue, GetStructField, Literal}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeSet, GetMapValue, GetStructField, Literal, UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.crossdata.execution.command.XDExplainCommand
-import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.crossdata.serializers.CustomStreamedRow
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.{DeserializeToObjectExec, InputAdapter, LocalLimitExec, MapPartitionsExec, SerializeFromObjectExec, SparkPlan, WholeStageCodegenExec}
+import org.apache.spark.sql.types.IntegerType

+import scala.collection.mutable.ArrayBuffer
 import scala.util.{Failure, Success, Try}

 case class XDPlan(@transient xdQueryExecution: XDQueryExecution,
@@ -31,6 +38,8 @@ case class XDPlan(@transient xdQueryExecution: XDQueryExecution,

   private lazy val nativeQueryExecutor: Option[NativeScan] = findNativeQueryExecutor(analyzedPlan, isNativeQueriesEnabled)

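+  // Feature flag gating the incremental-limit execution path added below; it defaults to false
+  // (see the getBoolean call), in which case executeCollect keeps the original child.executeCollect() behaviour.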
+  private lazy val applyXDLimitRule = xdQueryExecution.sparkSession.sparkContext.conf.getBoolean("spark.sql.crossdata.limitRule", false)
+
   private lazy val usablePlan: QueryPlan[_] =
     if (nativeQueryExecutor.exists(x => supportedPlan(x, analyzedPlan))) {
       analyzedPlan
@@ -81,6 +90,147 @@ case class XDPlan(@transient xdQueryExecution: XDQueryExecution,
     }
   }

+  /**
+   * Decodes the byte arrays back into UnsafeRows and puts them into a buffer.
+   *
+   * NOTE: Most of this code is copied from [[SparkPlan]].decodeUnsafeRows(bytes).
+   */
+  private def decodeUnsafeRows(bytes: Array[Byte]): Iterator[InternalRow] = {
+    val nFields = schema.length
+
+    val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+    val bis = new ByteArrayInputStream(bytes)
+    val ins = new DataInputStream(codec.compressedInputStream(bis))
+
+    new Iterator[InternalRow] {
+      private var sizeOfNextRow = ins.readInt()
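+      // A negative length is the end-of-stream sentinel written by getByteArrayRdd below.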
+      override def hasNext: Boolean = sizeOfNextRow >= 0
+      override def next(): InternalRow = {
+        val bs = new Array[Byte](sizeOfNextRow)
+        ins.readFully(bs)
+        val row = new UnsafeRow(nFields)
+        row.pointTo(bs, sizeOfNextRow)
+        sizeOfNextRow = ins.readInt()
+        row
+      }
+    }
+  }
+
+  /**
+   * Packs the UnsafeRows into byte arrays for faster serialization.
+   * The byte arrays are in the following format:
+   * [size] [bytes of UnsafeRow] [size] [bytes of UnsafeRow] ... [-1]
+   *
+   * UnsafeRow is highly compressible (at least 8 bytes for any column), so the byte array is also
+   * compressed.
+   *
+   * NOTE: Most of this code is copied from [[SparkPlan]].getByteArrayRdd(n).
+   *
+   * @param executedPlan usable executed plan.
+   * @param limit global limit; -1 means no limit.
+   * @return an RDD with one compressed byte array per partition.
+   */
+  private def getByteArrayRdd(executedPlan: SparkPlan, limit: Int = -1): RDD[Array[Byte]] = {
+    executedPlan.execute().mapPartitionsInternal { iter =>
+      var count = 0
+      val buffer = new Array[Byte](4 << 10) // 4K
+      val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+      val bos = new ByteArrayOutputStream()
+      val out = new DataOutputStream(codec.compressedOutputStream(bos))
+      while (iter.hasNext && (limit < 0 || count < limit)) {
+        val row = iter.next().asInstanceOf[UnsafeRow]
+        out.writeInt(row.getSizeInBytes)
+        row.writeToStream(out, buffer)
+        count += 1
+      }
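+      // A trailing -1 marks the end of this partition's rows for decodeUnsafeRows.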
+      out.writeInt(-1)
+      out.flush()
+      out.close()
+      Iterator(bos.toByteArray)
+    }
+  }
+
+  /**
+   * Fetches data partition by partition from a specific plan until the number of rows reaches a specific limit.
+   *
+   * NOTE: Most of this code is copied from [[SparkPlan.executeTake]](n).
+   *
+   * @param executedPlan usable executed plan.
+   * @param limit global limit.
+   * @return executed plan result.
+   */
+  private def incrementalExecute(executedPlan: SparkPlan, limit: Int): Array[InternalRow] = {
+    if (limit == 0) {
+      return new Array[InternalRow](0)
+    }
+
+    import org.json4s._
+    import org.json4s.jackson.JsonMethods._
+    import org.json4s.jackson.Serialization._
+    implicit val formats: Formats = DefaultFormats
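+    // json4s is only needed for the truncation case below, where the JSON payload carried inside a
+    // collected row is re-parsed, cut down to the remaining limit and re-serialized.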
+
+    val buf = new ArrayBuffer[InternalRow]
+    val childRDD = getByteArrayRdd(executedPlan, limit)
+    val totalParts = childRDD.partitions.length
+    var partsScanned = 0
+    var numOfRows = 0
+    while (numOfRows < limit && partsScanned < totalParts) {
+      // The number of partitions to try in this iteration. It is ok for this number to be
+      // greater than totalParts because we actually cap it at totalParts in runJob.
+      var numPartsToTry = 1L
+      if (partsScanned > 0) {
+        // If we didn't find any rows after the previous iteration, quadruple and retry.
+        // Otherwise, interpolate the number of partitions we need to try, but overestimate
+        // it by 50%. We also cap the estimation in the end.
+        val limitScaleUpFactor = Math.max(sqlContext.conf.limitScaleUpFactor, 2)
+        if (buf.isEmpty) {
+          numPartsToTry = partsScanned * limitScaleUpFactor
+        } else {
+          // the left side of max is >=1 whenever partsScanned >= 2
+          numPartsToTry = Math.max((1.5 * limit * partsScanned / buf.size).toInt - partsScanned, 1)
+          numPartsToTry = Math.min(numPartsToTry, partsScanned * limitScaleUpFactor)
+        }
+      }
+
+      val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt)
+      val sc = sqlContext.sparkContext
+
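+      // On the first pass the already-built childRDD is reused; afterwards the partitions are re-encoded
+      // with the remaining row budget so executors can stop writing rows early.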
+      val jobRDD = if (numOfRows == 0) childRDD else getByteArrayRdd(executedPlan, limit - numOfRows)
+
+      val res = sc.runJob(jobRDD, (it: Iterator[Array[Byte]]) => if (it.hasNext) it.next() else Array.empty[Byte], p)
+
+      res.foreach { pres =>
+        val iter = if (numOfRows < limit) {
+          decodeUnsafeRows(pres)
+        } else {
+          Iterator.empty
+        }
+
+        while (iter.hasNext && numOfRows < limit) {
+          val row = iter.next()
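+          // Each decoded row appears to be a batch wrapper: field 0 holds the number of wrapped elements
+          // and field 1 a JSON string with the serialized CustomStreamedRow values (see the else branch).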
+          val numElements = row.getInt(0)
+
+          if (numOfRows + numElements <= limit) {
+            numOfRows += numElements
+            buf += row
+          } else {
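+            // Taking the whole batch would overshoot the limit: rebuild a smaller batch row that keeps
+            // only the remaining rows, re-serializing the truncated JSON payload through the
+            // SerializeFromObjectExec serializer found in the plan.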
+            collectFirst {
+              case SerializeFromObjectExec(serializer, _) =>
+                val remainingRows = limit - numOfRows
+                val projection = UnsafeProjection.create(serializer)
+                projection.initialize(0)
+                numOfRows += remainingRows
+                buf += projection(InternalRow.apply((remainingRows, write((parse(row.getString(1)).extract[List[CustomStreamedRow]]).take(remainingRows)))))
+            }
+          }
+        }
+      }
+      partsScanned += p.size
+    }
+    buf.toArray
+  }
+
   override def executeCollect(): Array[InternalRow] = {
     nativeQueryExecutor match {
       case Some(nqe) =>
@@ -109,7 +259,13 @@ case class XDPlan(@transient xdQueryExecution: XDQueryExecution,
       case None =>
         logPerformance("[XDPlan][executeSpark]") {
           MetricsRegister.countExecution(MetricsGlossary.Counter.`current_queries_spark_total`, discountWhenFinished = true) {
-            Try(child.executeCollect())
+            child match {
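+              // This shape appears to correspond to a whole-stage-codegen'd plan with a streamed
+              // map-partitions stage over a local limit; only then, and only when
+              // spark.sql.crossdata.limitRule is enabled, is the partition-by-partition execution used.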
+              case WholeStageCodegenExec(SerializeFromObjectExec(_, InputAdapter(MapPartitionsExec(_, _, DeserializeToObjectExec(_, _, WholeStageCodegenExec(LocalLimitExec(limit, _))))))) if applyXDLimitRule =>
+                logger.debug("Resolving query with incremental execution")
+                Try(incrementalExecute(child, limit))
+              case _ =>
+                Try(child.executeCollect())
+            }
           }
         } match {
           case Success(r) => r