From b9d265573969de17bd534e63fc6e2dcb1e5f4543 Mon Sep 17 00:00:00 2001 From: julien bignon Date: Wed, 21 Aug 2024 14:05:57 +0200 Subject: [PATCH] Add spark 3.5 --- technologies/job/spark/spark-2.4/context.yaml | 3 +- technologies/job/spark/spark-3.0/context.yaml | 3 +- technologies/job/spark/spark-3.1/context.yaml | 5 +- .../job/spark/spark-3.5/build.gradle.kts | 22 +++ technologies/job/spark/spark-3.5/context.yaml | 5 + .../spark-3.5/innerContexts/jre/Dockerfile | 22 +++ .../spark-3.5/innerContexts/jre/context.yaml | 18 ++ .../spark-3.5/innerContexts/jre/entrypoint.sh | 141 ++++++++++++++ .../jre/spark-3.5-jre-11/build.gradle.kts | 32 +++ .../jre/spark-3.5-jre-11/build.me | 0 .../jre/spark-3.5-jre-11/dockerInfo.yaml | 4 + .../jre/spark-3.5-jre-11/image_test.yaml | 62 ++++++ .../jre/spark-3.5-jre-11/innerContext.yaml | 5 + .../jre/spark-3.5-jre-17/build.gradle.kts | 32 +++ .../jre/spark-3.5-jre-17/build.me | 0 .../jre/spark-3.5-jre-17/dockerInfo.yaml | 4 + .../jre/spark-3.5-jre-17/image_test.yaml | 62 ++++++ .../jre/spark-3.5-jre-17/innerContext.yaml | 5 + .../spark-3.5/innerContexts/python/Dockerfile | 40 ++++ .../innerContexts/python/context.yaml | 22 +++ .../innerContexts/python/entrypoint.sh | 182 ++++++++++++++++++ .../spark-3.5-python-3.12/build.gradle.kts | 35 ++++ .../python/spark-3.5-python-3.12/build.me | 0 .../spark-3.5-python-3.12/dockerInfo.yaml | 4 + .../spark-3.5-python-3.12/image_test.yaml | 67 +++++++ .../spark-3.5-python-3.12/innerContext.yaml | 5 + .../job/spark/spark-aws-3.1/context.yaml | 3 +- 27 files changed, 778 insertions(+), 5 deletions(-) create mode 100644 technologies/job/spark/spark-3.5/build.gradle.kts create mode 100644 technologies/job/spark/spark-3.5/context.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml create mode 100755 technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.gradle.kts create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.me create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/dockerInfo.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/innerContext.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.gradle.kts create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.me create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/dockerInfo.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/image_test.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/innerContext.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/context.yaml create mode 100755 technologies/job/spark/spark-3.5/innerContexts/python/entrypoint.sh create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.gradle.kts create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.me create mode 100644 
technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/dockerInfo.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/image_test.yaml create mode 100644 technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/innerContext.yaml diff --git a/technologies/job/spark/spark-2.4/context.yaml b/technologies/job/spark/spark-2.4/context.yaml index 20d35c94d..573fdbcf7 100644 --- a/technologies/job/spark/spark-2.4/context.yaml +++ b/technologies/job/spark/spark-2.4/context.yaml @@ -2,4 +2,5 @@ id: "2.4" label: "2.4" available: true recommended: false -trustLevel: stable +trustLevel: deprecated +deprecationDate: "2024-09-01T00:00:00Z" \ No newline at end of file diff --git a/technologies/job/spark/spark-3.0/context.yaml b/technologies/job/spark/spark-3.0/context.yaml index 19d9683ab..253406e74 100644 --- a/technologies/job/spark/spark-3.0/context.yaml +++ b/technologies/job/spark/spark-3.0/context.yaml @@ -2,4 +2,5 @@ id: "3.0" label: "3.0" available: true recommended: false -trustLevel: stable +trustLevel: deprecated +deprecationDate: "2024-09-01T00:00:00Z" diff --git a/technologies/job/spark/spark-3.1/context.yaml b/technologies/job/spark/spark-3.1/context.yaml index e95dcf146..cfbc887e7 100644 --- a/technologies/job/spark/spark-3.1/context.yaml +++ b/technologies/job/spark/spark-3.1/context.yaml @@ -1,5 +1,6 @@ id: "3.1" label: "3.1" available: true -recommended: true -trustLevel: stable +recommended: false +trustLevel: deprecated +deprecationDate: "2024-09-01T00:00:00Z" diff --git a/technologies/job/spark/spark-3.5/build.gradle.kts b/technologies/job/spark/spark-3.5/build.gradle.kts new file mode 100644 index 000000000..b814e94a3 --- /dev/null +++ b/technologies/job/spark/spark-3.5/build.gradle.kts @@ -0,0 +1,22 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Copyright 2019-2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import com.bmuschko.gradle.docker.DockerRemoteApiPlugin +import com.saagie.technologies.SaagieTechnologiesGradlePlugin + +apply<DockerRemoteApiPlugin>() +apply<SaagieTechnologiesGradlePlugin>() diff --git a/technologies/job/spark/spark-3.5/context.yaml b/technologies/job/spark/spark-3.5/context.yaml new file mode 100644 index 000000000..f7d916b1b --- /dev/null +++ b/technologies/job/spark/spark-3.5/context.yaml @@ -0,0 +1,5 @@ +id: "3.5" +label: "3.5" +available: true +recommended: true +trustLevel: stable \ No newline at end of file diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile b/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile new file mode 100644 index 000000000..d29d050c1 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile @@ -0,0 +1,22 @@ +ARG jre_major +FROM spark:3.5.2-scala2.12-java$jre_major-ubuntu + +ENV PATH "$PATH:$SPARK_HOME/bin" +ENV LANG C.UTF-8 + +# LIGHT DEPENDENCIES START +USER root +RUN apt update -qq && apt install -yqq --no-install-recommends \ + wget curl unzip krb5-user zip && \ + rm -rf /var/lib/apt/lists/* + +COPY entrypoint.sh /opt/ +RUN chmod 755 /opt/entrypoint.sh + +USER spark + +#See hadoop version used by spark and update if necessary. +#See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get the right version of aws-java-sdk-bundle +RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \ + wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \ + mv *.jar /opt/spark/jars/ diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml new file mode 100644 index 000000000..f73c6948f --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml @@ -0,0 +1,18 @@ +id: java-scala +label: Java/Scala +available: true +trustLevel: stable +job: + features: + - type: COMMAND_LINE + label: Command line + mandatory: true + comment: Linux shell command to launch the job. + defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--class=Main {file} arg1 arg2" + - type: ARTIFACT + label: Package + mandatory: true + comment: "Compatible upload file : .jar" + - type: SCHEDULER + label: Scheduled + mandatory: true \ No newline at end of file diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh b/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh new file mode 100755 index 000000000..59bc8fa18 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Prevent any errors from being silently ignored +set -eo pipefail + +attempt_setup_fake_passwd_entry() { + # Check whether there is a passwd entry for the container UID + local myuid; myuid="$(id -u)" + # If there is no passwd entry for the container UID, attempt to fake one + # You can also refer to the https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523 + # It's to resolve OpenShift random UID case. + # See also: https://github.com/docker-library/postgres/pull/448 + if ! getent passwd "$myuid" &> /dev/null; then + local wrapper + for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do + if [ -s "$wrapper" ]; then + NSS_WRAPPER_PASSWD="$(mktemp)" + NSS_WRAPPER_GROUP="$(mktemp)" + export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP + local mygid; mygid="$(id -g)" + printf 'spark:x:%s:%s:${SPARK_USER_NAME:-anonymous uid}:%s:/bin/false\n' "$myuid" "$mygid" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD" + printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP" + break + fi + done + fi +} + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +for v in "${!SPARK_JAVA_OPT_@}"; do + SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" ) +done + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +if ! [ -z "${PYSPARK_PYTHON+x}" ]; then + export PYSPARK_PYTHON +fi +if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then + export PYSPARK_DRIVER_PYTHON +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z "${SPARK_CONF_DIR+x}" ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! 
[ -z "${SPARK_HOME+x}" ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +# SPARK-43540: add current working directory into executor classpath +SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD" + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms"$SPARK_EXECUTOR_MEMORY" + -Xmx"$SPARK_EXECUTOR_MEMORY" + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url "$SPARK_DRIVER_URL" + --executor-id "$SPARK_EXECUTOR_ID" + --cores "$SPARK_EXECUTOR_CORES" + --app-id "$SPARK_APPLICATION_ID" + --hostname "$SPARK_EXECUTOR_POD_IP" + --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID" + --podName "$SPARK_EXECUTOR_POD_NAME" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + + *) +# BEGIN SAAGIE SPECIFIC CODE + cd /sandbox + mkdir -p /opt/spark/conf/ + cat conf/*.conf > /opt/spark/conf/spark-defaults.conf + if test -f main_script; + then + CMD=(/bin/sh ./main_script) + exec "${CMD[@]}" + else +# END SAAGIE SPECIFIC CODE + #Non-spark-on-k8s command provided, proceeding in pass-through mode... + exec "$@" + fi; + ;; +esac diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.gradle.kts b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.gradle.kts new file mode 100644 index 000000000..3c5de8691 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.gradle.kts @@ -0,0 +1,32 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Copyright 2019-2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import com.bmuschko.gradle.docker.DockerRemoteApiPlugin +import com.saagie.technologies.SaagieTechnologiesGradlePlugin +import com.saagie.technologies.readDockerInfo +import com.saagie.technologies.getVersionForDocker + + +apply<DockerRemoteApiPlugin>() +apply<SaagieTechnologiesGradlePlugin>() + +tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) { + this.buildArgs.put( + "jre_major", + "11" + ) +} diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.me b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/build.me new file mode 100644 index 000000000..e69de29bb diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/dockerInfo.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/dockerInfo.yaml new file mode 100644 index 000000000..1b65a74f2 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/dockerInfo.yaml @@ -0,0 +1,4 @@ +image: saagie/spark +baseTag: 3.5-jre-11 +dynamicVersion: 1.125.0 +version: 3.5-jre-11-1.125.0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml new file mode 100644 index 000000000..fffeadb29 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml @@ -0,0 +1,62 @@ +schemaVersion: "2.0.0" + +metadataTest: + env: + - key: LANG + value: "C.UTF-8" + - key: JAVA_HOME + value: "/opt/java/openjdk" + - key: SPARK_HOME + value: "/opt/spark" + +fileExistenceTests: + - name: "entrypoint.sh" + path: "/opt/entrypoint.sh" + shouldExist: true + permissions: "-rwxr-xr-x" + + - name: "kinit" + path: "/usr/bin/kinit" + shouldExist: true + permissions: "-rwxr-xr-x" + +commandTests: + - name: "Workdir" + command: "pwd" + expectedOutput: ["/opt/spark/work-dir"] + + - name: "Spark version" + command: "/opt/spark/bin/spark-submit" + args: ["--version"] + expectedError: ["version 3.5.*"] + + - name: "krb5-user installation" + command: "kinit" + expectedError: ["kinit: Client's credentials have been revoked while getting initial credentials"] + exitCode: 1 + + - name: "wget" + args: ["--help"] + command: "wget" + exitCode: 0 + + - name: "curl" + args: ["--help"] + command: "curl" + exitCode: 0 + + - name: "unzip" + args: ["--help"] + command: "unzip" + exitCode: 0 + + - name: "tar" + args: ["--help"] + command: "tar" + exitCode: 0 + + - name: "tini" + command: "/usr/bin/tini" + args: ["--version"] + expectedOutput: ["tini version 0.18.0.*"] + exitCode: 0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/innerContext.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/innerContext.yaml new file mode 100644 index 000000000..77102ced8 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/innerContext.yaml @@ -0,0 +1,5 @@ +id: "11" +label: "11" +available: true +trustLevel: stable +recommended: true diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.gradle.kts b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.gradle.kts new file mode 100644 index 000000000..c6d0b303b --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.gradle.kts @@ -0,0 +1,32 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Copyright 2019-2021. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import com.bmuschko.gradle.docker.DockerRemoteApiPlugin +import com.saagie.technologies.SaagieTechnologiesGradlePlugin +import com.saagie.technologies.readDockerInfo +import com.saagie.technologies.getVersionForDocker + + +apply<DockerRemoteApiPlugin>() +apply<SaagieTechnologiesGradlePlugin>() + +tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) { + this.buildArgs.put( + "jre_major", + "17" + ) +} diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.me b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/build.me new file mode 100644 index 000000000..e69de29bb diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/dockerInfo.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/dockerInfo.yaml new file mode 100644 index 000000000..85d9a033d --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/dockerInfo.yaml @@ -0,0 +1,4 @@ +image: saagie/spark +baseTag: 3.5-jre-17 +dynamicVersion: 1.125.0 +version: 3.5-jre-17-1.125.0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/image_test.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/image_test.yaml new file mode 100644 index 000000000..7ac6f0a44 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/image_test.yaml @@ -0,0 +1,62 @@ +schemaVersion: "2.0.0" + +metadataTest: + env: + - key: LANG + value: "C.UTF-8" + - key: JAVA_HOME + value: "/opt/java/openjdk" + - key: SPARK_HOME + value: "/opt/spark" + +fileExistenceTests: + - name: "entrypoint.sh" + path: "/opt/entrypoint.sh" + shouldExist: true + permissions: "-rwxr-xr-x" + + - name: "kinit" + path: "/usr/bin/kinit" + shouldExist: true + permissions: "-rwxr-xr-x" + +commandTests: + - name: "Workdir" + command: "pwd" + expectedOutput: ["/opt/spark/work-dir"] + + - name: "Spark version" + command: "/opt/spark/bin/spark-submit" + args: ["--version"] + expectedError: ["version 3.5.*"] + + - name: "krb5-user installation" + command: "kinit" + expectedError: ["kinit: Client's credentials have been revoked while getting initial credentials"] + exitCode: 1 + + - name: "wget" + args: ["--help"] + command: "wget" + exitCode: 0 + + - name: "curl" + args: ["--help"] + command: "curl" + exitCode: 0 + + - name: "unzip" + args: ["--help"] + command: "unzip" + exitCode: 0 + + - name: "tar" + args: ["--help"] + command: "tar" + exitCode: 0 + + - name: "tini" + command: "/usr/bin/tini" + args: ["--version"] + expectedOutput: ["tini version 0.19.0.*"] + exitCode: 0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/innerContext.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/innerContext.yaml new file mode 100644 index 000000000..577d6be60 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/innerContext.yaml @@ -0,0 +1,5 @@ +id: "17" +label: "17" +available: true 
+trustLevel: stable +recommended: true diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile b/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile new file mode 100644 index 000000000..2fe92ac0f --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile @@ -0,0 +1,40 @@ +ARG base_img + +FROM spark:3.5.2 AS SPARK_BASE + +FROM ${base_img} AS BASE_IMG + +COPY --from=SPARK_BASE /opt/spark /opt/spark +COPY --from=SPARK_BASE /usr/bin/tini /usr/bin/tini + +COPY --from=SPARK_BASE /opt/java/openjdk /opt/java/openjdk + +ENV JAVA_HOME /opt/java/openjdk +ENV LANG C.UTF-8 +ENV SPARK_HOME /opt/spark + +#See https://github.com/apache/spark-docker/blob/master/Dockerfile.template#L19 +ARG spark_uid=185 + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN apt update -qq && apt install -yqq --no-install-recommends \ + gosu && \ + rm -rf /var/lib/apt/lists/* + +RUN pip --no-cache-dir install --upgrade pip \ + && pip --no-cache-dir install pyspark==3.5.2 \ + && rm -rf /root/.cache \ + && rm -rf /root/.cache/pip \ + && rm -rf ~/.cache/pip + +# As long as base image is from saagie, no need to add krb5 or LD_LIBRARY_PATH + +# Move scripts and frequently changing directive to the end of the build +COPY entrypoint.sh /opt/ +RUN chmod 755 /opt/entrypoint.sh + +WORKDIR /opt/spark/work-dir + +ENTRYPOINT [ "/opt/entrypoint.sh" ] \ No newline at end of file diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/context.yaml b/technologies/job/spark/spark-3.5/innerContexts/python/context.yaml new file mode 100644 index 000000000..68c12cc49 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/context.yaml @@ -0,0 +1,22 @@ +id: python +label: Python +available: true +trustLevel: stable +job: + features: + - type: COMMAND_LINE + label: Command line + mandatory: true + comment: Linux shell command to launch the job. + defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--py-files={file} local://__main__.py" + - type: ARTIFACT + label: Package + mandatory: true + comment: "Compatible upload file : .py or .zip" + - type: SCHEDULER + label: Scheduled + mandatory: true + - type: AI_DESCRIPTION_GENERATOR + label: AI description generator enabled + mandatory: true + comment: Activation of the AI-based automatic description generation function. \ No newline at end of file diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/entrypoint.sh b/technologies/job/spark/spark-3.5/innerContexts/python/entrypoint.sh new file mode 100755 index 000000000..ccfbfbfbf --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/entrypoint.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# FROM https://github.com/apache/spark-docker/blob/master/entrypoint.sh.template +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Prevent any errors from being silently ignored +set -eo pipefail + +attempt_setup_fake_passwd_entry() { + # Check whether there is a passwd entry for the container UID + local myuid; myuid="$(id -u)" + # If there is no passwd entry for the container UID, attempt to fake one + # You can also refer to the https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523 + # It's to resolve OpenShift random UID case. + # See also: https://github.com/docker-library/postgres/pull/448 + if ! getent passwd "$myuid" &> /dev/null; then + local wrapper + for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do + if [ -s "$wrapper" ]; then + NSS_WRAPPER_PASSWD="$(mktemp)" + NSS_WRAPPER_GROUP="$(mktemp)" + export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP + local mygid; mygid="$(id -g)" + printf 'spark:x:%s:%s:${SPARK_USER_NAME:-anonymous uid}:%s:/bin/false\n' "$myuid" "$mygid" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD" + printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP" + break + fi + done + fi +} + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +# BEGIN SAAGIE SPECIFIC CODE +cd /sandbox + # parse content and if pyfiles extract minio url and inject it +if [ -f main_script ] && grep -q "\--py-files" main_script; +then + PYSPARK_FILES="`grep -Po '.*--py-files=\K[^ ]+' main_script`" +fi; + +if [ -n "$PYSPARK_FILES" ]; then + PYTHONPATH="$PYTHONPATH:$PYSPARK_FILES" + #Copy and unzip pyfiles + if [[ $PYSPARK_FILES == *[,]* ]];then + echo "PYSPARK_FILES contains comma" + pyfiles=$(echo $PYSPARK_FILES | tr "," "\n") + + for file in $pyfiles + do + echo ">>> [$file]" + wget -nv $file + done + else + echo ">>> [$PYSPARK_FILES]" + wget -nv $PYSPARK_FILES + fi + if [ -f *.zip ] + then + unzip -q *.zip + fi + if [ -f "requirements.txt" ] + then + pip install -r requirements.txt + fi + rm -Rf /opt/spark/work-dir + ln -s /sandbox/ /opt/spark/work-dir +fi +# END SAAGIE SPECIFIC CODE + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +for v in "${!SPARK_JAVA_OPT_@}"; do + SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" ) +done + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +if ! [ -z "${PYSPARK_PYTHON+x}" ]; then + export PYSPARK_PYTHON +fi +if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then + export PYSPARK_DRIVER_PYTHON +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z "${SPARK_CONF_DIR+x}" ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! 
[ -z "${SPARK_HOME+x}" ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +# SPARK-43540: add current working directory into executor classpath +SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD" + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +#SAAGIE disable this part because main_script only ready by root user. +switch_spark_if_root() { +# if [ $(id -u) -eq 0 ]; then +# echo gosu spark +# fi + echo "" +} + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS" + --py-files=/sandbox/* # SAAGIE SPECIFIC CODE + --deploy-mode client + "$@" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms"$SPARK_EXECUTOR_MEMORY" + -Xmx"$SPARK_EXECUTOR_MEMORY" + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url "$SPARK_DRIVER_URL" + --executor-id "$SPARK_EXECUTOR_ID" + --cores "$SPARK_EXECUTOR_CORES" + --app-id "$SPARK_APPLICATION_ID" + --hostname "$SPARK_EXECUTOR_POD_IP" + --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID" + --podName "$SPARK_EXECUTOR_POD_NAME" + ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" + ;; + + *) +# BEGIN SAAGIE SPECIFIC CODE + mkdir -p /opt/spark/conf/ + cat conf/*.conf > /opt/spark/conf/spark-defaults.conf + echo "spark.kubernetes.driver.pod.name $HOSTNAME" >> /opt/spark/conf/spark-defaults.conf + if test -f main_script; + then + CMD=(/bin/sh ./main_script) + exec "${CMD[@]}" + else +# END SAAGIE SPECIFIC CODE + # Non-spark-on-k8s command provided, proceeding in pass-through mode... + exec "$@" + fi; + ;; +esac diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.gradle.kts b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.gradle.kts new file mode 100644 index 000000000..d9f5fe8cd --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.gradle.kts @@ -0,0 +1,35 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Copyright 2019-2021. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import com.bmuschko.gradle.docker.DockerRemoteApiPlugin +import com.saagie.technologies.SaagieTechnologiesGradlePlugin +import com.saagie.technologies.readDockerInfo +import com.saagie.technologies.getVersionForDocker + + +apply<DockerRemoteApiPlugin>() +apply<SaagieTechnologiesGradlePlugin>() + +val dockerInfo = readDockerInfo(projectDir) + +tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) { + this.buildArgs.put( + "base_img", + "saagie/python:3.12-1.183.0" + ) +} + diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.me b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/build.me new file mode 100644 index 000000000..e69de29bb diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/dockerInfo.yaml b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/dockerInfo.yaml new file mode 100644 index 000000000..4896e37e0 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/dockerInfo.yaml @@ -0,0 +1,4 @@ +image: saagie/spark +baseTag: 3.5-py-3.12 +dynamicVersion: 1.139.0_SDKTECHNO-207 +version: 3.5-py-3.12-1.139.0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/image_test.yaml b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/image_test.yaml new file mode 100644 index 000000000..2dc2feb82 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/image_test.yaml @@ -0,0 +1,67 @@ +schemaVersion: "2.0.0" + +metadataTest: + env: + - key: LANG + value: "C.UTF-8" + - key: JAVA_HOME + value: "/opt/java/openjdk" + - key: SPARK_HOME + value: "/opt/spark" + +fileExistenceTests: + - name: "entrypoint.sh" + path: "/opt/entrypoint.sh" + shouldExist: true + permissions: "-rwxr-xr-x" + + - name: "kinit" + path: "/usr/bin/kinit" + shouldExist: true + permissions: "-rwxr-xr-x" + +commandTests: + - name: "Workdir" + command: "pwd" + expectedOutput: ["/opt/spark/work-dir"] + + - name: "Spark version" + command: "/opt/spark/bin/spark-submit" + args: ["--version"] + expectedError: ["version 3.5.*"] + + - name: "python installation" + command: "which" + args: ["python"] + expectedOutput: ["/usr/local/bin/python"] + + - name: "krb5-user installation" + command: "kinit" + expectedError: ["kinit: Program lacks support for encryption type while getting initial credentials"] + exitCode: 1 + + - name: "wget" + args: ["--help"] + command: "wget" + exitCode: 0 + + - name: "curl" + args: ["--help"] + command: "curl" + exitCode: 0 + + - name: "unzip" + args: ["--help"] + command: "unzip" + exitCode: 0 + + - name: "tar" + args: ["--help"] + command: "tar" + exitCode: 0 + + - name: "tini" + command: "/usr/bin/tini" + args: ["--version"] + expectedOutput: ["tini version 0.18.0.*"] + exitCode: 0 diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/innerContext.yaml b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/innerContext.yaml new file mode 100644 index 000000000..48ecd5e61 --- /dev/null +++ b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.12/innerContext.yaml @@ -0,0 +1,5 @@ +id: "3.12" +label: "3.12" +available: true +trustLevel: stable +recommended: true diff --git a/technologies/job/spark/spark-aws-3.1/context.yaml b/technologies/job/spark/spark-aws-3.1/context.yaml index 2f3e61218..02523907a 100644 --- a/technologies/job/spark/spark-aws-3.1/context.yaml +++ 
b/technologies/job/spark/spark-aws-3.1/context.yaml @@ -2,4 +2,5 @@ id: 3.1-aws label: 3.1 AWS available: true recommended: true -trustLevel: stable +trustLevel: deprecated +deprecationDate: "2024-09-01T00:00:00Z" \ No newline at end of file
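
Build note: the images added by this patch are normally built and tested by the repository's Gradle plugin (the build.gradle.kts files above pass the jre_major or base_img build args to DockerBuildImage tasks). As a rough local equivalent only, assuming Docker and the container-structure-test CLI are installed and using the tag recorded in the jre-17 dockerInfo.yaml, the JRE 17 variant could be exercised with:

    # Build the Spark 3.5 / JRE 17 image from the jre inner context
    docker build --build-arg jre_major=17 \
      -t saagie/spark:3.5-jre-17-1.125.0 \
      technologies/job/spark/spark-3.5/innerContexts/jre

    # Run the image tests declared in image_test.yaml against it
    container-structure-test test \
      --image saagie/spark:3.5-jre-17-1.125.0 \
      --config technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-17/image_test.yaml

The python inner context follows the same pattern, except its Gradle task passes a base_img build arg (a saagie/python image) instead of jre_major.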