Merge branch 'branch-25.02' into nullable_structs_writes
revans2 committed Dec 13, 2024
2 parents c08d955 + 561068c commit 30dd34e
Showing 57 changed files with 993 additions and 372 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license headers
name: license header check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    if: "!contains(github.event.pull_request.title, '[bot]')"
    steps:
      - name: Get checkout depth
        run: |
          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: ${{ env.PR_FETCH_DEPTH }}

      - name: license-header-check
        uses: NVIDIA/spark-rapids-common/license-header-check@main
        with:
          included_file_patterns: |
            *.yml,
            *.yaml,
            *.sh,
            *.xml,
            *.properties,
            *.scala,
            *.py,
            build/*,
            *.cpp,
            *Dockerfile*,
            *Jenkinsfile*,
            *.ini,
            *.java,
            *.fbs
          excluded_file_patterns: |
            *target/*,
            thirdparty/*,
            sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
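The `PR_FETCH_DEPTH` step fetches the PR's own commits plus a ten-commit buffer so the action can diff every changed file against the base branch. The included/excluded lists are plain glob patterns applied to changed paths; below is a minimal Scala sketch of that filtering idea. The `HeaderCheckFilter` object and `shouldCheck` helper are illustrative assumptions, not the action's actual implementation, which lives in NVIDIA/spark-rapids-common and may match differently.

```scala
import java.nio.file.{FileSystems, Paths}

object HeaderCheckFilter {
  /** True when a changed file should get a header check: it matches an
    * include pattern and no exclude pattern. This is approximate --
    * `*` in java.nio globs does not cross `/`, unlike some CI glob dialects. */
  def shouldCheck(file: String, included: Seq[String], excluded: Seq[String]): Boolean = {
    val fs = FileSystems.getDefault
    def hits(pattern: String): Boolean = {
      // Patterns with a slash match the full path; bare patterns match the file name.
      val target =
        if (pattern.contains("/")) Paths.get(file) else Paths.get(file).getFileName
      fs.getPathMatcher(s"glob:$pattern").matches(target)
    }
    included.exists(hits) && !excluded.exists(hits)
  }
}

// HeaderCheckFilter.shouldCheck("dist/pom.xml", Seq("*.xml"), Seq("*target/*"))  // true
```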
21 changes: 12 additions & 9 deletions .github/workflows/mvn-verify-check/populate-daily-cache.sh
@@ -14,27 +14,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-set -x
-max_retry=3; delay=30; i=1
+set -e
+set -o pipefail

if [[ $SCALA_VER == '2.12' ]]; then
    pom='pom.xml'
elif [[ $SCALA_VER == '2.13' ]]; then
    pom='scala2.13/pom.xml'
fi

+max_retry=3; delay=30; i=1
while true; do
+    buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) &&
    {
-        python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \
-            xargs -n 1 -I {} bash -c \
-            "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies"
+        for buildver in "${buildvers[@]}"; do
+            mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies
+        done
    } && {
        # compile base versions to cache scala compiler and compiler bridge
-        mvn $COMMON_MVN_FLAGS --file $pom \
-            process-test-resources -pl sql-plugin-api -am
+        mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am
    } && break || {
        if [[ $i -le $max_retry ]]; then
            echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
        else
            echo "mvn command failed. Exit 1"; exit 1
        fi
    }
done
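The script's `&& break || { ... }` chain retries the whole resolve-and-compile sequence up to three times with a doubling delay, then gives up. The same retry-with-exponential-backoff pattern is shown below as a self-contained Scala sketch; the `Retry` object is illustrative, with defaults mirroring the script's `max_retry=3; delay=30`.

```scala
import scala.util.{Failure, Success, Try}

object Retry {
  /** Run `op`, retrying up to `maxRetry` times with a doubling delay. */
  def withBackoff[T](op: () => T, maxRetry: Int = 3, initialDelaySec: Int = 30): T = {
    var attempt = 1
    var delaySec = initialDelaySec
    while (true) {
      Try(op()) match {
        case Success(value) => return value
        case Failure(e) if attempt <= maxRetry =>
          println(s"command failed (${e.getMessage}). Retry $attempt/$maxRetry.")
          Thread.sleep(delaySec * 1000L)
          attempt += 1
          delaySec *= 2 // exponential backoff: 30s, 60s, 120s
        case Failure(e) => throw e // retries exhausted, surface the failure
      }
    }
    sys.error("unreachable")
  }
}

// Example (hypothetical runner): Retry.withBackoff(() => runMvnResolve())
```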
1 change: 1 addition & 0 deletions .gitignore
@@ -35,4 +35,5 @@ scalastyle-output.xml
scalastyle.txt
target/
cufile.log
+cudf_log.txt
build/*.class
@@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.internal.ExpressionUtils.{column, expression}

object DataGenExprShims {
-  def columnToExpr(c: Column): Expression = c
-  def exprToColumn(e: Expression): Column = e
+  def columnToExpr(c: Column): Expression = expression(c)
+  def exprToColumn(e: Expression): Column = column(e)
}
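The `Column` <-> `Expression` conversions come from the `ExpressionUtils` import above; previously they were applied implicitly, and this change invokes them explicitly. A short usage sketch of why a shim like this exists follows; the `describe` helper is illustrative, not part of the plugin.

```scala
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression

// Version-specific conversion details stay behind the shim, so calling code
// like this compiles unchanged against every supported Spark version:
def describe(c: Column): String = {
  val e: Expression = DataGenExprShims.columnToExpr(c)
  e.sql // Catalyst expressions can render themselves back as SQL text
}
```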
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
+import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
  private[sql] lazy val readMetrics =
    SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

-  override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
-    "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
-    "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
-    "rapidsShuffleSerializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
-    "rapidsShuffleDeserializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
-    "rapidsShuffleWriteTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
-    "rapidsShuffleCombineTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
-    "rapidsShuffleWriteIoTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
-    "rapidsShuffleReadTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
-  ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
+  override lazy val additionalMetrics: Map[String, GpuMetric] = {
+    createAdditionalExchangeMetrics(this) ++
+      GpuMetric.wrap(readMetrics) ++
+      GpuMetric.wrap(writeMetrics)
+  }

  override lazy val allMetrics: Map[String, GpuMetric] = {
    Map(

@@ -98,7 +88,7 @@
  }

  private lazy val serializer: Serializer =
-    new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
+    new GpuColumnarBatchSerializer(allMetrics,
      child.output.map(_.dataType).toArray,
      RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))
90 changes: 90 additions & 0 deletions docs/archive.md
@@ -5,6 +5,96 @@
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.

## Release v24.10.1
### Hardware Requirements:

The plugin is tested on the following architectures:

GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs

### Software Requirements:

OS: Spark RAPIDS is compatible with any Linux distribution with glibc >= 2.28 (check the output of `ldd --version`); glibc 2.28 was released on August 1, 2018.
Tested on Ubuntu 20.04, Ubuntu 22.04, Rocky Linux 8 and Rocky Linux 9.

NVIDIA Driver*: R470+

Runtime:
Scala 2.12, 2.13
Python and a Java Virtual Machine (JVM) compatible with your Spark version.

* Check the Spark documentation for Python and Java version compatibility with your specific
Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1.

Supported Spark versions:
Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4
Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4
Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3
Apache Spark 3.5.0, 3.5.1, 3.5.2

Supported Databricks runtime versions for Azure and AWS:
Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0)
Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2)
Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1)

Supported Dataproc versions (Debian/Ubuntu/Rocky):
GCP Dataproc 2.1
GCP Dataproc 2.2

Supported Dataproc Serverless versions:
Spark runtime 1.1 LTS
Spark runtime 2.0
Spark runtime 2.1
Spark runtime 2.2

*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet
for your hardware's minimum driver version.

*For Cloudera and EMR support, please refer to the
[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ.

### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for the Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html).

### Download RAPIDS Accelerator for Apache Spark v24.10.1

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.

### Verify signature
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark%40nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`

The output of the signature verification should include:

    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
* Optimize scheduling policy for GPU Semaphore
* Support distinct join for right outer joins
* Support MinBy and MaxBy for non-float ordering
* Support ArrayJoin expression
* Optimize Expand and Aggregate expression performance
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).

## Release v24.10.0
### Hardware Requirements:

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
<!--
Copyright (c) 2024, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
33 changes: 18 additions & 15 deletions docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details.

-## Release v24.10.1
+## Release v24.12.0
### Hardware Requirements:

The plugin is tested on the following architectures:
@@ -69,14 +69,14 @@ for your hardware's minimum driver version.
### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for the Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html).

-### Download RAPIDS Accelerator for Apache Spark v24.10.1
+### Download RAPIDS Accelerator for Apache Spark v24.12.0

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
-| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
-| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
-| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
-| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |
+| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0.jar.asc) |
+| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0.jar.asc) |
+| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.12.0/rapids-4-spark_2.12-24.12.0-cuda11-arm64.jar.asc) |
+| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.12.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.12.0/rapids-4-spark_2.13-24.12.0-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.
@@ -85,24 +85,27 @@ CUDA 11.8 through CUDA 12.0.
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark%40nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
-  `gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
+  `gpg --verify rapids-4-spark_2.12-24.12.0.jar.asc rapids-4-spark_2.12-24.12.0.jar`
* Verify the signature for Scala 2.13 jar:
-  `gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`
+  `gpg --verify rapids-4-spark_2.13-24.12.0.jar.asc rapids-4-spark_2.13-24.12.0.jar`

The output of the signature verification should include:

    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
-* Optimize scheduling policy for GPU Semaphore
-* Support distinct join for right outer joins
-* Support MinBy and MaxBy for non-float ordering
-* Support ArrayJoin expression
-* Optimize Expand and Aggregate expression performance
-* Improve JSON related expressions
+* Add repartition-based algorithm fallback in hash aggregate
+* Support the Spark function months_between
+* Support asynchronous writing for Parquet files
+* Add retry support to improve sub hash-join stability
+* Improve JSON scan and from_json
+* Improve performance for CASE WHEN statements comparing a string column against multiple values
+* Fall back to the CPU for ORC boolean writes due to a bug in cuDF's ORC writer
+* Fix a device memory leak in the timestamp operator in the `incompatibleDateFormats` case
+* Fix a host memory leak in GpuBroadcastNestedLoopJoinExecBase when `spillableBuiltBatch` is 0
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

-Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
+Note: There is a known issue in the 24.12.0 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
8 changes: 5 additions & 3 deletions integration_tests/src/main/python/datasourcev2_write_test.py
@@ -18,7 +18,7 @@
from data_gen import gen_df, decimal_gens, non_utc_allow
from marks import *
from spark_session import is_hive_available, is_spark_330_or_later, with_cpu_session, with_gpu_session
-from hive_parquet_write_test import _hive_bucket_gens, _hive_array_gens, _hive_struct_gens
+from hive_parquet_write_test import _hive_bucket_gens_sans_bools, _hive_array_gens, _hive_struct_gens
from hive_parquet_write_test import read_single_bucket

_hive_write_conf = {
Expand All @@ -33,9 +33,11 @@
@allow_non_gpu(*non_utc_allow)
def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format):
num_rows = 2048

+    # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+    # https://github.com/rapidsai/cudf/issues/6763 .
+    # Once the first issue is fixed, add back boolean_gen
    def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
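The bucketed-write test above builds its DDL from the generator list and writes through Hive bucketing. Roughly the statement shape it issues is sketched below as a standalone Scala/Spark snippet; the table name, column types, and bucket count are illustrative, and the bucketing-related write confs the integration test sets are omitted.

```scala
import org.apache.spark.sql.SparkSession

object BucketedWriteDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
    // Hive-bucketed table over the generated columns (all types except boolean):
    spark.sql(
      """CREATE TABLE bucketed_demo (_c0 BIGINT, _c1 STRING)
        |CLUSTERED BY (_c0) INTO 4 BUCKETS
        |STORED AS PARQUET""".stripMargin)
    spark.sql("INSERT INTO bucketed_demo SELECT id, CAST(id AS STRING) FROM range(2048)")
    spark.stop()
  }
}
```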
(The remaining changed files in this commit are not shown in this view.)
