Merge branch 'branch-25.02' into nullable_structs_writes
revans2 committed Dec 13, 2024
2 parents c08d955 + 561068c commit 30dd34e
Showing 57 changed files with 993 additions and 372 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license headers
name: license header check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    if: "!contains(github.event.pull_request.title, '[bot]')"
    steps:
      - name: Get checkout depth
        run: |
          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: ${{ env.PR_FETCH_DEPTH }}

      - name: license-header-check
        uses: NVIDIA/spark-rapids-common/license-header-check@main
        with:
          included_file_patterns: |
            *.yml,
            *.yaml,
            *.sh,
            *.xml,
            *.properties,
            *.scala,
            *.py,
            build/*,
            *.cpp,
            *Dockerfile*,
            *Jenkinsfile*,
            *.ini,
            *.java,
            *.fbs
          excluded_file_patterns: |
            *target/*,
            thirdparty/*,
            sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
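The `PR_FETCH_DEPTH` step fetches the PR's own commits plus a ten-commit buffer so the action can diff every changed file against the base branch. The included/excluded lists are plain glob patterns applied to changed paths; below is a minimal Scala sketch of that filtering idea. The `HeaderCheckFilter` object and `shouldCheck` helper are illustrative assumptions, not the action's actual implementation, which lives in NVIDIA/spark-rapids-common and may match differently.

```scala
import java.nio.file.{FileSystems, Paths}

object HeaderCheckFilter {
  /** True when a changed file should get a header check: it matches an
    * include pattern and no exclude pattern. This is approximate --
    * `*` in java.nio globs does not cross `/`, unlike some CI glob dialects. */
  def shouldCheck(file: String, included: Seq[String], excluded: Seq[String]): Boolean = {
    val fs = FileSystems.getDefault
    def hits(pattern: String): Boolean = {
      // Patterns with a slash match the full path; bare patterns match the file name.
      val target =
        if (pattern.contains("/")) Paths.get(file) else Paths.get(file).getFileName
      fs.getPathMatcher(s"glob:$pattern").matches(target)
    }
    included.exists(hits) && !excluded.exists(hits)
  }
}

// HeaderCheckFilter.shouldCheck("dist/pom.xml", Seq("*.xml"), Seq("*target/*"))  // true
```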
21 changes: 12 additions & 9 deletions .github/workflows/mvn-verify-check/populate-daily-cache.sh
@@ -14,27 +14,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-set -x
-max_retry=3; delay=30; i=1
+set -e
+set -o pipefail

if [[ $SCALA_VER == '2.12' ]]; then
    pom='pom.xml'
elif [[ $SCALA_VER == '2.13' ]]; then
    pom='scala2.13/pom.xml'
fi

+max_retry=3; delay=30; i=1
while true; do
+    buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) &&
    {
-        python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \
-            xargs -n 1 -I {} bash -c \
-            "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies"
+        for buildver in "${buildvers[@]}"; do
+            mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies
+        done
    } && {
        # compile base versions to cache scala compiler and compiler bridge
-        mvn $COMMON_MVN_FLAGS --file $pom \
-            process-test-resources -pl sql-plugin-api -am
+        mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am
    } && break || {
        if [[ $i -le $max_retry ]]; then
            echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
        else
            echo "mvn command failed. Exit 1"; exit 1
        fi
    }
done
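The script's `&& break || { ... }` chain retries the whole resolve-and-compile sequence up to three times with a doubling delay, then gives up. The same retry-with-exponential-backoff pattern is shown below as a self-contained Scala sketch; the `Retry` object is illustrative, with defaults mirroring the script's `max_retry=3; delay=30`.

```scala
import scala.util.{Failure, Success, Try}

object Retry {
  /** Run `op`, retrying up to `maxRetry` times with a doubling delay. */
  def withBackoff[T](op: () => T, maxRetry: Int = 3, initialDelaySec: Int = 30): T = {
    var attempt = 1
    var delaySec = initialDelaySec
    while (true) {
      Try(op()) match {
        case Success(value) => return value
        case Failure(e) if attempt <= maxRetry =>
          println(s"command failed (${e.getMessage}). Retry $attempt/$maxRetry.")
          Thread.sleep(delaySec * 1000L)
          attempt += 1
          delaySec *= 2 // exponential backoff: 30s, 60s, 120s
        case Failure(e) => throw e // retries exhausted, surface the failure
      }
    }
    sys.error("unreachable")
  }
}

// Example (hypothetical runner): Retry.withBackoff(() => runMvnResolve())
```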
1 change: 1 addition & 0 deletions .gitignore
@@ -35,4 +35,5 @@ scalastyle-output.xml
scalastyle.txt
target/
cufile.log
+cudf_log.txt
build/*.class
@@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.internal.ExpressionUtils.{column, expression}

object DataGenExprShims {
-  def columnToExpr(c: Column): Expression = c
-  def exprToColumn(e: Expression): Column = e
+  def columnToExpr(c: Column): Expression = expression(c)
+  def exprToColumn(e: Expression): Column = column(e)
}
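The `Column` <-> `Expression` conversions come from the `ExpressionUtils` import above; previously they were applied implicitly, and this change invokes them explicitly. A short usage sketch of why a shim like this exists follows; the `describe` helper is illustrative, not part of the plugin.

```scala
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression

// Version-specific conversion details stay behind the shim, so calling code
// like this compiles unchanged against every supported Spark version:
def describe(c: Column): String = {
  val e: Expression = DataGenExprShims.columnToExpr(c)
  e.sql // Catalyst expressions can render themselves back as SQL text
}
```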
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
+import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
  private[sql] lazy val readMetrics =
    SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

-  override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
-    "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
-    "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
-    "rapidsShuffleSerializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
-    "rapidsShuffleDeserializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
-    "rapidsShuffleWriteTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
-    "rapidsShuffleCombineTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
-    "rapidsShuffleWriteIoTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
-    "rapidsShuffleReadTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
-  ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
+  override lazy val additionalMetrics: Map[String, GpuMetric] = {
+    createAdditionalExchangeMetrics(this) ++
+      GpuMetric.wrap(readMetrics) ++
+      GpuMetric.wrap(writeMetrics)
+  }

  override lazy val allMetrics: Map[String, GpuMetric] = {
    Map(

@@ -98,7 +88,7 @@
  }

  private lazy val serializer: Serializer =
-    new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
+    new GpuColumnarBatchSerializer(allMetrics,
      child.output.map(_.dataType).toArray,
      RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))
90 changes: 90 additions & 0 deletions docs/archive.md
@@ -5,6 +5,96 @@
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.

## Release v24.10.1
### Hardware Requirements:

The plugin is tested on the following architectures:

GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs

### Software Requirements:

OS: Spark RAPIDS is compatible with any Linux distribution with glibc >= 2.28 (check the output of `ldd --version`); glibc 2.28 was released on August 1, 2018.
Tested on Ubuntu 20.04, Ubuntu 22.04, Rocky Linux 8 and Rocky Linux 9.

NVIDIA Driver*: R470+

Runtime:
Scala 2.12, 2.13
Python and a Java Virtual Machine (JVM) compatible with your Spark version.

* Check the Spark documentation for Python and Java version compatibility with your specific
Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1.

Supported Spark versions:
Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4
Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4
Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3
Apache Spark 3.5.0, 3.5.1, 3.5.2

Supported Databricks runtime versions for Azure and AWS:
Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0)
Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2)
Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1)

Supported Dataproc versions (Debian/Ubuntu/Rocky):
GCP Dataproc 2.1
GCP Dataproc 2.2

Supported Dataproc Serverless versions:
Spark runtime 1.1 LTS
Spark runtime 2.0
Spark runtime 2.1
Spark runtime 2.2

*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet
for your hardware's minimum driver version.

*For Cloudera and EMR support, please refer to the
[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ.

### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for the Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html).

### Download RAPIDS Accelerator for Apache Spark v24.10.1

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.

### Verify signature
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark%40nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`

The output of the signature verification should include:

    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
* Optimize scheduling policy for GPU Semaphore
* Support distinct join for right outer joins
* Support MinBy and MaxBy for non-float ordering
* Support ArrayJoin expression
* Optimize Expand and Aggregate expression performance
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).

## Release v24.10.0
### Hardware Requirements:

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
<!--
Copyright (c) 2024, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
33 changes: 18 additions & 15 deletions docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details.

-## Release v24.10.1
+## Release v24.12.0
### Hardware Requirements:

The plugin is tested on the following architectures:
@@ -69,14 +69,14 @@ for your hardware's minimum driver version.
### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for the Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html).

-### Download RAPIDS Accelerator for Apache Spark v24.10.1
+### Download RAPIDS Accelerator for Apache Spark v24.12.0

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
-| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
-| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
-| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
-| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |
+| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0.jar.asc) |
+| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0.jar.asc) |
+| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0-cuda11-arm64.jar.asc) |
+| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.
@@ -85,24 +85,27 @@ CUDA 11.8 through CUDA 12.0.
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark%40nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
-  `gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
+  `gpg --verify rapids-4-spark_2.12-24.12.0.jar.asc rapids-4-spark_2.12-24.12.0.jar`
* Verify the signature for Scala 2.13 jar:
-  `gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`
+  `gpg --verify rapids-4-spark_2.13-24.12.0.jar.asc rapids-4-spark_2.13-24.12.0.jar`

The output of the signature verification should include:

    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
-* Optimize scheduling policy for GPU Semaphore
-* Support distinct join for right outer joins
-* Support MinBy and MaxBy for non-float ordering
-* Support ArrayJoin expression
-* Optimize Expand and Aggregate expression performance
-* Improve JSON related expressions
+* Add repartition-based algorithm fallback in hash aggregate
+* Support the Spark function months_between
+* Support asynchronous writing for Parquet files
+* Add retry support to improve sub hash-join stability
+* Improve JSON scan and from_json
+* Improve performance for CASE WHEN statements comparing a string column against multiple values
+* Fall back to the CPU for ORC boolean writes due to a bug in cuDF's ORC writer
+* Fix a device memory leak in the timestamp operator in the `incompatibleDateFormats` case
+* Fix a host memory leak in GpuBroadcastNestedLoopJoinExecBase when `spillableBuiltBatch` is 0
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

-Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
+Note: There is a known issue in the 24.12.0 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
8 changes: 5 additions & 3 deletions integration_tests/src/main/python/datasourcev2_write_test.py
@@ -18,7 +18,7 @@
from data_gen import gen_df, decimal_gens, non_utc_allow
from marks import *
from spark_session import is_hive_available, is_spark_330_or_later, with_cpu_session, with_gpu_session
-from hive_parquet_write_test import _hive_bucket_gens, _hive_array_gens, _hive_struct_gens
+from hive_parquet_write_test import _hive_bucket_gens_sans_bools, _hive_array_gens, _hive_struct_gens
from hive_parquet_write_test import read_single_bucket

_hive_write_conf = {
Expand All @@ -33,9 +33,11 @@
@allow_non_gpu(*non_utc_allow)
def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format):
num_rows = 2048

+    # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+    # https://github.com/rapidsai/cudf/issues/6763 .
+    # Once the first issue is fixed, add back boolean_gen
    def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
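The bucketed-write test above builds its DDL from the generator list and writes through Hive bucketing. Roughly the statement shape it issues is sketched below as a standalone Scala/Spark snippet; the table name, column types, and bucket count are illustrative, and the bucketing-related write confs the integration test sets are omitted.

```scala
import org.apache.spark.sql.SparkSession

object BucketedWriteDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
    // Hive-bucketed table over the generated columns (all types except boolean):
    spark.sql(
      """CREATE TABLE bucketed_demo (_c0 BIGINT, _c1 STRING)
        |CLUSTERED BY (_c0) INTO 4 BUCKETS
        |STORED AS PARQUET""".stripMargin)
    spark.sql("INSERT INTO bucketed_demo SELECT id, CAST(id AS STRING) FROM range(2048)")
    spark.stop()
  }
}
```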
(The remaining changed files in this commit are not shown in this view.)
