@@ -21,6 +21,7 @@ source ./tests/integration/utils/azure-utils.sh
21
21
22
22
# Global Variables
NAMESPACE=tests
SERVICE_ACCOUNT=spark
ADMIN_POD_NAME=testpod-admin
# Unique bucket name per test run; the Azure container reuses the same name.
S3_BUCKET=spark-$(uuidgen)
AZURE_CONTAINER=$S3_BUCKET
@@ -154,153 +155,95 @@ run_example_job_in_pod() {
154
155
validate_pi_value $pi
155
156
}
156
157
158
setup_s3_properties_in_pod(){
  # Register the S3 (s3a) related Spark properties on the service account
  # from inside the admin test pod, so that later spark-submit calls pick
  # them up automatically.
  #
  # Reads globals: NAMESPACE, SERVICE_ACCOUNT, S3_BUCKET.
  # Credentials and endpoint are resolved on the host via the s3-utils
  # helpers and passed into the pod through the environment.

  kubectl -n $NAMESPACE exec testpod -- \
      env \
        UU="$SERVICE_ACCOUNT" \
        NN="$NAMESPACE" \
        ACCESS_KEY="$(get_s3_access_key)" \
        SECRET_KEY="$(get_s3_secret_key)" \
        S3_ENDPOINT="$(get_s3_endpoint)" \
        BUCKET="$S3_BUCKET" \
        /bin/bash -c ' \
        spark-client.service-account-registry add-config \
          --username $UU --namespace $NN \
          --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider \
          --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
          --conf spark.hadoop.fs.s3a.path.style.access=true \
          --conf spark.hadoop.fs.s3a.endpoint=$S3_ENDPOINT \
          --conf spark.hadoop.fs.s3a.access.key=$ACCESS_KEY \
          --conf spark.hadoop.fs.s3a.secret.key=$SECRET_KEY \
          --conf spark.sql.catalog.local.warehouse=s3a://$BUCKET/warehouse'
}
229
180
230
- # Fetch out the number of rows inserted
231
- # rev => Reverse the string
232
- # cut -d' ' -f1 => Split by spaces and pick the first part
233
- # rev => Reverse the string back
234
- NUM_ROWS_INSERTED=$( echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev)
181
setup_azure_storage_properties_in_pod(){
  # Register the Azure Storage (abfss) related Spark properties on the
  # service account from inside the admin test pod.
  #
  # Reads globals: NAMESPACE, SERVICE_ACCOUNT, AZURE_CONTAINER.
  # Account name/key are resolved on the host via the azure-utils helpers.

  warehouse_path=$(construct_resource_uri $AZURE_CONTAINER warehouse abfss)

  kubectl -n $NAMESPACE exec testpod -- \
      env \
        UU="$SERVICE_ACCOUNT" \
        NN="$NAMESPACE" \
        ACCOUNT_NAME="$(get_azure_storage_account_name)" \
        SECRET_KEY="$(get_azure_storage_secret_key)" \
        WAREHOUSE="$warehouse_path" \
        /bin/bash -c ' \
        spark-client.service-account-registry add-config \
          --username $UU --namespace $NN \
          --conf spark.hadoop.fs.azure.account.key.$ACCOUNT_NAME.dfs.core.windows.net=$SECRET_KEY \
          --conf spark.sql.catalog.local.warehouse=$WAREHOUSE'
}
242
199
243
200
244
- test_iceberg_example_in_pod_with_azure_using_abfss (){
245
- # Test Iceberg integration in Charmed Spark Rock with Azure Storage
246
-
247
- # First create S3 bucket named 'spark'
248
- create_azure_container $AZURE_CONTAINER
249
-
250
- # Copy 'test-iceberg.py' script to 'spark' bucket
251
- copy_file_to_azure_container $AZURE_CONTAINER ./tests/integration/resources/test-iceberg.py
201
test_iceberg_example_in_pod(){
  # Test Iceberg integration in Charmed Spark Rock.
  #
  # Storage-specific (S3 / Azure) properties must already have been
  # registered on the service account by one of the setup_* helpers.
  #
  # Arguments:
  #   $1: The path (cloud URI) of the test-iceberg.py script
  # Returns:
  #   0 on success, 1 when the job did not run or row count mismatches.
  echo "$0" "$1"

  # Number of rows that are to be inserted during the test.
  NUM_ROWS_TO_INSERT="4"
  script_path=$1

  # Number of driver pods that exist in the namespace already.
  PREVIOUS_DRIVER_PODS_COUNT=$(kubectl get pods --sort-by=.metadata.creationTimestamp -n ${NAMESPACE} | grep driver | wc -l)

  # Submit the job from inside 'testpod'.
  # FIX: the original used UU="$USERNAME", but the USERNAME="spark" local
  # assignment was removed in this refactor, leaving $USERNAME unset at
  # submit time; use the SERVICE_ACCOUNT global instead.
  kubectl -n $NAMESPACE exec testpod -- \
      env \
        UU="$SERVICE_ACCOUNT" \
        NN="$NAMESPACE" \
        IM="$(spark_image)" \
        NUM_ROWS="$NUM_ROWS_TO_INSERT" \
        SCRIPT="$script_path" \
        /bin/bash -c ' \
        spark-client.spark-submit \
          --username $UU --namespace $NN \
          --conf spark.kubernetes.driver.request.cores=100m \
          --conf spark.kubernetes.executor.request.cores=100m \
          --conf spark.kubernetes.container.image=$IM \
          --conf spark.jars.ivy=/tmp \
          --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
          --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
          --conf spark.sql.catalog.spark_catalog.type=hive \
          --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
          --conf spark.sql.catalog.local.type=hadoop \
          --conf spark.sql.defaultCatalog=local \
          $SCRIPT -n $NUM_ROWS'

  # Number of driver pods after the job is completed.
  DRIVER_PODS_COUNT=$(kubectl get pods --sort-by=.metadata.creationTimestamp -n ${NAMESPACE} | grep driver | wc -l)

  # If the number of driver pods is same as before, job has not been run at all!
  if [[ "${PREVIOUS_DRIVER_PODS_COUNT}" == "${DRIVER_PODS_COUNT}" ]]
  then
    echo "ERROR: Sample job has not run!"
    return 1
  fi

  # NOTE(review): the following extraction lines were elided between the
  # diff hunks; reconstructed from the identical removed S3 variant — confirm.
  # Find the ID of the driver pod that ran the job.
  # tail -n 1 => Filter out the last line
  # cut -d' ' -f1 => Split by spaces and pick the first part
  DRIVER_POD_ID=$(kubectl get pods --sort-by=.metadata.creationTimestamp -n ${NAMESPACE} | grep test-iceberg-.*-driver | tail -n 1 | cut -d' ' -f1)

  # Filter out the output log line
  OUTPUT_LOG_LINE=$(kubectl logs ${DRIVER_POD_ID} -n ${NAMESPACE} | grep 'Number of rows inserted:')

  # Fetch out the number of rows inserted
  # rev => Reverse the string
  # cut -d' ' -f1 => Split by spaces and pick the first part
  # rev => Reverse the string back
  NUM_ROWS_INSERTED=$(echo $OUTPUT_LOG_LINE | rev | cut -d' ' -f1 | rev)

  if [ "${NUM_ROWS_INSERTED}" != "${NUM_ROWS_TO_INSERT}" ]; then
    echo "ERROR: ${NUM_ROWS_TO_INSERT} were supposed to be inserted. Found ${NUM_ROWS_INSERTED} rows. Aborting with exit code 1."
    return 1
  fi

  return 0
}
270
+
271
+
272
test_iceberg_example_in_pod_using_s3(){
  # Test Iceberg integration in Charmed Spark Rock using S3:
  # provision a bucket, register S3 Spark properties, run the generic
  # Iceberg test, then tear the bucket down (even on failure).

  # First create the S3 bucket
  create_s3_bucket $S3_BUCKET

  # Now, setup S3 properties in the service account inside the pod
  setup_s3_properties_in_pod

  # Copy 'test-iceberg.py' script to the bucket
  copy_file_to_s3_bucket $S3_BUCKET ./tests/integration/resources/test-iceberg.py
  script_path="s3a://$S3_BUCKET/test-iceberg.py"

  test_iceberg_example_in_pod "$script_path"
  return_value=$?

  # Clean up the bucket before deciding the outcome.
  delete_s3_bucket $S3_BUCKET

  # FIX: fail on ANY non-zero status, not only on exactly 1.
  if [ $return_value -ne 0 ]; then
    exit 1
  fi
}
294
+
295
+
296
test_iceberg_example_in_pod_using_abfss(){
  # Test Iceberg integration in Charmed Spark Rock using Azure Storage
  # (abfss): provision a container, register Azure Spark properties, run
  # the generic Iceberg test, then tear the container down (even on failure).
  # (Comments fixed: the originals were copy-pasted from the S3 variant.)

  # First create the Azure Storage container
  create_azure_container $AZURE_CONTAINER

  # Now, setup Azure Storage properties in the service account inside the pod
  setup_azure_storage_properties_in_pod

  # Copy 'test-iceberg.py' script to the container
  copy_file_to_azure_container $AZURE_CONTAINER ./tests/integration/resources/test-iceberg.py
  script_path=$(construct_resource_uri $AZURE_CONTAINER test-iceberg.py abfss)

  test_iceberg_example_in_pod "$script_path"
  return_value=$?

  # Clean up the container before deciding the outcome.
  delete_azure_container $AZURE_CONTAINER

  # FIX: fail on ANY non-zero status, not only on exactly 1.
  if [ $return_value -ne 0 ]; then
    exit 1
  fi
}
326
318
327
319
@@ -642,13 +634,13 @@ echo -e "##################################"
642
634
echo -e "RUN EXAMPLE THAT USES ICEBERG LIBRARIES"
echo -e "##################################"

(setup_user_context && test_iceberg_example_in_pod_using_s3 && cleanup_user_success) || cleanup_user_failure_in_pod


echo -e "##################################"
echo -e "RUN EXAMPLE THAT USES AZURE STORAGE"
echo -e "##################################"

(setup_user_context && test_iceberg_example_in_pod_using_abfss && cleanup_user_success) || cleanup_user_failure_in_pod


echo -e "##################################"
echo -e "TEARDOWN TEST POD"
0 commit comments