[#5673] Add test and docs about how to use GCS in Hive (#5676)
### What changes were proposed in this pull request?

1. Release a new Hive CI Docker image that supports GCS.
2. Add a related integration test based on the new image.
 
### Why are the changes needed?

For users' convenience.

Fix: #5673 

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

Added a new integration test, `CatalogHiveGCSIT`.
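
A sketch of how to run it locally (the env vars are the ones the test checks; the Gradle task and filter flags are assumptions and may differ in your setup):

```shell
# CatalogHiveGCSIT is skipped unless both of these are set (see isGCSConfigured)
export GCS_BUCKET_NAME=my-test-bucket                              # hypothetical bucket
export GCS_SERVICE_ACCOUNT_JSON_PATH=/path/to/service-account.json

# Run only the new IT in the Hive catalog module
./gradlew :catalogs:catalog-hive:test \
  --tests "org.apache.gravitino.catalog.hive.integration.test.CatalogHiveGCSIT"
```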

---------

Co-authored-by: Jerry Shao <[email protected]>
yuqi1129 and jerryshao authored Dec 3, 2024
1 parent 758cd2e commit 71a0d63
Showing 11 changed files with 185 additions and 14 deletions.
1 change: 1 addition & 0 deletions .github/workflows/backend-integration-test.yml
@@ -28,6 +28,7 @@ jobs:
- clients/client-java/**
- clients/client-java-runtime/**
- clients/filesystem-hadoop3/**
- clients/cli/**
- common/**
- conf/**
- core/**
2 changes: 1 addition & 1 deletion build.gradle.kts
@@ -174,7 +174,7 @@ allprojects {
param.environment("PROJECT_VERSION", project.version)

// Gravitino CI Docker image
param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.15")
param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.16")
param.environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:kerberos-hive-0.1.5")
param.environment("GRAVITINO_CI_DORIS_DOCKER_IMAGE", "apache/gravitino-ci:doris-0.1.5")
param.environment("GRAVITINO_CI_TRINO_DOCKER_IMAGE", "apache/gravitino-ci:trino-0.1.6")
1 change: 1 addition & 0 deletions catalogs/catalog-hive/build.gradle.kts
@@ -130,6 +130,7 @@ dependencies {
testImplementation(libs.testcontainers.localstack)
testImplementation(libs.hadoop2.aws)
testImplementation(libs.hadoop3.abs)
testImplementation(libs.hadoop3.gcs)

// You need this to run the test CatalogHiveABSIT as it requires the hadoop3 environment introduced by hadoop3.abs
// (the protocol `abfss` was first introduced in Hadoop 3.2.0). However, as there already exists
@@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.gravitino.catalog.hive.integration.test;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.integration.test.container.HiveContainer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.condition.EnabledIf;
import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
import org.testcontainers.utility.MountableFile;

@EnabledIf(value = "isGCSConfigured", disabledReason = "Google Cloud Storage (GCS) is not configured.")
public class CatalogHiveGCSIT extends CatalogHiveIT {

private static final String GCS_BUCKET_NAME = System.getenv("GCS_BUCKET_NAME");
private static final String GCS_ACCOUNT_JSON_FILE =
System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");
private static final String GCS_ACCOUNT_JSON_FILE_IN_CONTAINER = "/tmp/gcs-service-account.json";

@Override
protected void startNecessaryContainer() {
Map<String, String> hiveContainerEnv =
ImmutableMap.of(
"SERVICE_ACCOUNT_FILE",
GCS_ACCOUNT_JSON_FILE_IN_CONTAINER,
HiveContainer.HIVE_RUNTIME_VERSION,
HiveContainer.HIVE3);

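    // Note: this reuses the S3-flavored Hive CI container; GCS support in the image is enabled
    // through the SERVICE_ACCOUNT_FILE env var above and the key file copied into the container below.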
containerSuite.startHiveContainerWithS3(hiveContainerEnv);

HIVE_METASTORE_URIS =
String.format(
"thrift://%s:%d",
containerSuite.getHiveContainerWithS3().getContainerIpAddress(),
HiveContainer.HIVE_METASTORE_PORT);

containerSuite
.getHiveContainerWithS3()
.getContainer()
.copyFileToContainer(
MountableFile.forHostPath(GCS_ACCOUNT_JSON_FILE), GCS_ACCOUNT_JSON_FILE_IN_CONTAINER);
}

@Override
protected void initFileSystem() throws IOException {
Configuration conf = new Configuration();

conf.set("fs.gs.auth.service.account.enable", "true");
conf.set("fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE);

String path = String.format("gs://%s/", GCS_BUCKET_NAME);
fileSystem = FileSystem.get(URI.create(path), conf);
}

@Override
protected void initSparkSession() {
sparkSession =
SparkSession.builder()
.master("local[1]")
.appName("Hive Catalog integration test")
.config("hive.metastore.uris", HIVE_METASTORE_URIS)
.config(
"spark.sql.warehouse.dir",
String.format("gs://%s/user/hive/warehouse", GCS_BUCKET_NAME))
.config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE)
.config("spark.sql.storeAssignmentPolicy", "LEGACY")
.config("mapreduce.input.fileinputformat.input.dir.recursive", "true")
.enableHiveSupport()
.getOrCreate();
}

@Override
protected Map<String, String> createSchemaProperties() {
Map<String, String> properties = new HashMap<>();
properties.put("key1", "val1");
properties.put("key2", "val2");
properties.put(
"location", String.format("gs://%s/test-%s", GCS_BUCKET_NAME, System.currentTimeMillis()));
return properties;
}

private static boolean isGCSConfigured() {
return StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME"))
&& StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH"));
}
}
4 changes: 4 additions & 0 deletions clients/cli/build.gradle.kts
@@ -34,6 +34,10 @@ dependencies {
testImplementation(libs.junit.jupiter.api)
testImplementation(libs.junit.jupiter.params)
testImplementation(libs.mockito.core)
testImplementation(libs.mysql.driver)
testImplementation(libs.postgresql.driver)
testImplementation(libs.testcontainers)

testImplementation(project(":core")) {
exclude("org.apache.logging.log4j")
}
5 changes: 5 additions & 0 deletions dev/docker/hive/Dockerfile
@@ -150,6 +150,11 @@ RUN ln -s /opt/hadoop-${HADOOP2_VERSION} ${HADOOP2_HOME}
ADD packages/hadoop-${HADOOP3_VERSION}.tar.gz /opt/
RUN ln -s /opt/hadoop-${HADOOP3_VERSION} ${HADOOP3_HOME}


# Add GCS connector for hadoop2 and hadoop3
ADD packages/gcs-connector-hadoop2-2.2.23-shaded.jar ${HADOOP2_HOME}/share/hadoop/common/lib/gcs-connector-hadoop2-2.2.23-shaded.jar
ADD packages/gcs-connector-hadoop3-2.2.23-shaded.jar ${HADOOP3_HOME}/share/hadoop/common/lib/gcs-connector-hadoop3-2.2.23-shaded.jar

# Add hadoop configuration to temporary directory
ADD core-site.xml ${HADOOP_TMP_CONF_DIR}/core-site.xml
ADD hadoop-env.sh ${HADOOP_TMP_CONF_DIR}/hadoop-env.sh
12 changes: 12 additions & 0 deletions dev/docker/hive/hive-dependency.sh
@@ -33,9 +33,13 @@ RANGER_VERSION="2.4.0" # Notice: Currently only tested Ranger plugin 2.4.0 in th

HADOOP2_PACKAGE_NAME="hadoop-${HADOOP2_VERSION}.tar.gz"
HADOOP2_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP2_VERSION}/${HADOOP2_PACKAGE_NAME}"
HADOOP2_GCS_PACKAGE_NAME="gcs-connector-hadoop2-2.2.23-shaded.jar"
HADOOP2_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop2-2.2.23-shaded.jar"

HADOOP3_PACKAGE_NAME="hadoop-${HADOOP3_VERSION}.tar.gz"
HADOOP3_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP3_VERSION}/${HADOOP3_PACKAGE_NAME}"
HADOOP3_GCS_PACKAGE_NAME="gcs-connector-hadoop3-2.2.23-shaded.jar"
HADOOP3_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar"

HIVE2_PACKAGE_NAME="apache-hive-${HIVE2_VERSION}-bin.tar.gz"
HIVE2_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE2_VERSION}/${HIVE2_PACKAGE_NAME}"
@@ -91,3 +95,11 @@ fi
if [ ! -f "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ${RANGER_HIVE_DOWNLOAD_URL}
fi

if [ ! -f "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ${HADOOP2_GCS_DOWNLOAD_URL}
fi

if [ ! -f "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ${HADOOP3_GCS_DOWNLOAD_URL}
fi
10 changes: 10 additions & 0 deletions dev/docker/hive/hive-site.xml
@@ -73,4 +73,14 @@
<value>ABS_ACCOUNT_KEY</value>
</property>

<property>
<name>fs.gs.auth.service.account.enable</name>
<value>true</value>
</property>

<property>
<name>fs.gs.auth.service.account.json.keyfile</name>
<value>SERVICE_ACCOUNT_FILE</value>
</property>

</configuration>
9 changes: 7 additions & 2 deletions dev/docker/hive/start.sh
@@ -31,8 +31,8 @@ else
ln -s ${HADOOP2_HOME} ${HADOOP_HOME}
fi

cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib

# Copy Hadoop and Hive configuration file and update hostname
cp -f ${HADOOP_TMP_CONF_DIR}/* ${HADOOP_CONF_DIR}
@@ -54,6 +54,11 @@ if [[ -n "${ABS_ACCOUNT_NAME}" && -n "${ABS_ACCOUNT_KEY}" ]]; then
sed -i "s|ABS_ACCOUNT_KEY|${ABS_ACCOUNT_KEY}|g" ${HIVE_CONF_DIR}/hive-site.xml
fi

# Check whether GCS is configured
if [[ -n "$SERVICE_ACCOUNT_FILE" ]]; then
sed -i "s|SERVICE_ACCOUNT_FILE|${SERVICE_ACCOUNT_FILE}|g" ${HIVE_CONF_DIR}/hive-site.xml
fi

# Link mysql-connector-java after deciding where HIVE_HOME symbolic link points to.
ln -s /opt/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}.jar ${HIVE_HOME}/lib

6 changes: 5 additions & 1 deletion docs/docker-image-details.md
@@ -168,8 +168,12 @@ Changelog
You can use this kind of image to test the Apache Hive catalog.

Changelog
- apache/gravitino-ci:hive-0.1.16
- Add GCS-related configurations in the `hive-site.xml` file.
- Add the GCS connector jar to `${HADOOP_HOME}/share/hadoop/common/lib/`.

- apache/gravitino-ci:hive-0.1.15
- Add ADLS related configurations in the `hive-site.xml` file.
- Add Azure Blob Storage(ADLS) related configurations in the `hive-site.xml` file.

- apache/gravitino-ci:hive-0.1.14
- Add amazon S3 related configurations in the `hive-site.xml` file.
@@ -11,14 +11,13 @@ license: "This software is licensed under the Apache License version 2."

Since Hive 2.x, Hive has supported S3 as a storage backend, enabling users to store and manage data in Amazon S3 directly through Hive. Gravitino enhances this capability by supporting the Hive catalog with S3, allowing users to efficiently manage the storage locations of files located in S3. This integration simplifies data operations and enables seamless access to S3 data from Hive queries.

For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)), the integration is similar to S3. The only difference is the configuration properties for ADLS(see below).
For ADLS (also known as Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)) and GCS (Google Cloud Storage), the integration is similar to S3. The only difference is the set of configuration properties for ADLS and GCS (see below).

The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3 and ADLS as a storage backend, including configuration details and examples for creating databases and tables.
The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3, ADLS, and GCS as storage backends, including configuration details and examples for creating databases and tables.

## Hive metastore configuration


The following will mainly focus on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS with minor changes in the configuration properties.
The following section mainly focuses on configuring the Hive metastore to use S3 as a storage backend. The same configuration applies to ADLS and GCS with minor changes to the configuration properties.

### Example Configuration Changes

@@ -45,15 +44,14 @@ Below are the essential properties to add or modify in the `hive-site.xml` file
definition and table definition, as shown in the examples below. After explicitly setting this
property, you can omit the location property in the schema and table definitions.
It's also applicable for ADLS.
It's also applicable for Azure Blob Storage(ADLS) and GCS.
-->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>S3_BUCKET_PATH</value>
</property>


<!-- The following are for Azure Blob Storage(ADLS) -->
<!-- The following two configurations are for Azure Blob Storage(ADLS) -->
<property>
<name>fs.abfss.impl</name>
<value>org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem</value>
@@ -64,6 +62,18 @@ It's also applicable for ADLS.
<value>ABS_ACCOUNT_KEY</value>
</property>

<!-- The following two configurations are only for Google Cloud Storage(GCS) -->
<property>
<name>fs.gs.auth.service.account.enable</name>
<value>true</value>
</property>

<!-- SERVICE_ACCOUNT_FILE should be a local or remote file that can be accessed by the Hive server -->
<property>
<name>fs.gs.auth.service.account.json.keyfile</name>
<value>SERVICE_ACCOUNT_FILE</value>
</property>

```
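
The placeholders above (for example `S3_BUCKET_PATH`, `ABS_ACCOUNT_KEY`, and `SERVICE_ACCOUNT_FILE`) must be replaced with real values before the metastore is started. A minimal sketch using `sed`, mirroring what the CI image's `start.sh` does (the bucket path and key-file location shown are assumptions):

```shell
# Substitute the placeholders in hive-site.xml before starting the Hive services
sed -i "s|S3_BUCKET_PATH|s3a://my-bucket/user/hive/warehouse|g" ${HIVE_CONF_DIR}/hive-site.xml
sed -i "s|SERVICE_ACCOUNT_FILE|/etc/hive/gcs-service-account.json|g" ${HIVE_CONF_DIR}/hive-site.xml
```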

### Adding Required JARs
@@ -78,7 +88,6 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib

Alternatively, you can download the required JARs from the Maven repository and place them in the Hive classpath. It is crucial to verify that the JARs are compatible with the version of Hadoop you are using to avoid compatibility issues.
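
The GCS connector is not bundled in the Hadoop tools directory, so it has to be fetched separately. A minimal sketch, assuming Hadoop 3 and connector version 2.2.23 (pick the build that matches your Hadoop line); copying it into `${HIVE_HOME}/lib` mirrors the pattern used for the AWS and Azure jars above:

```shell
# Download the shaded GCS connector and make it visible to both Hadoop and Hive
curl -L -o gcs-connector-hadoop3-2.2.23-shaded.jar \
  https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar
cp gcs-connector-hadoop3-2.2.23-shaded.jar ${HADOOP_HOME}/share/hadoop/common/lib/
cp gcs-connector-hadoop3-2.2.23-shaded.jar ${HIVE_HOME}/lib/
```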


### Restart Hive metastore

Once all configurations have been correctly set, restart the Hive cluster to apply the changes. This step is essential to ensure that the new configurations take effect and that the Hive services can communicate with S3.
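
A minimal sketch, assuming the services are launched directly with the `hive` launcher; clusters managed by systemd, Ambari, or similar tooling should use their own restart mechanism:

```shell
# Restart the metastore (and HiveServer2, if used) so the updated hive-site.xml is picked up
${HIVE_HOME}/bin/hive --service metastore &
${HIVE_HOME}/bin/hive --service hiveserver2 &
```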
@@ -105,6 +114,9 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
# The following line is for Azure Blob Storage(ADLS)
# "location": "abfss://[email protected]/path"
# The following line is for Google Cloud Storage(GCS)
# "location": "gs://bucket-name/path"
}
}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```
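
To sanity-check the call, you can list the schemas back from the same collection endpoint (a sketch, assuming the server also serves list requests at the path used for creation above):

```shell
curl -X GET -H "Accept: application/vnd.gravitino.v1+json" \
  -H "Content-Type: application/json" \
  http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```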
@@ -129,6 +141,9 @@ Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
// The following line is for Azure Blob Storage(ADLS)
// .put("location", "abfss://[email protected]/path")

// The following line is for Google Cloud Storage(GCS)
// .put("location", "gs://bucket-name/path")

.build();
Schema schema = supportsSchemas.createSchema("hive_schema",
"This is a schema",
@@ -225,13 +240,17 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.endpoint", getS3Endpoint)
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
## This two is for Azure Blob Storage(ADLS) only
// These two configurations are for Azure Blob Storage(ADLS) only
.config(
String.format(
"spark.sql.catalog.{hive_catalog_name}.fs.azure.account.key.%s.dfs.core.windows.net",
ABS_USER_ACCOUNT_NAME),
ABS_USER_ACCOUNT_KEY)
.config("spark.sql.catalog.{hive_catalog_name}.fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
// These two configurations are for Google Cloud Storage(GCS) only
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.enable", "true")
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.json.keyfile", "SERVICE_ACCOUNT_FILE")
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.path.style.access", "true")
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.connection.ssl.enabled", "false")
@@ -249,6 +268,7 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
:::Note
Please download the [Hadoop AWS jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws) and [AWS Java SDK jar](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) and place them on the Spark classpath. If these JARs are missing, Spark will not be able to access the S3 storage.
Azure Blob Storage(ADLS) requires the [Hadoop Azure jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure) and [Azure cloud sdk jar](https://mvnrepository.com/artifact/com.azure/azure-storage-blob) to be placed on the Spark classpath.
For Google Cloud Storage(GCS), you need to download the [Hadoop GCS jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and place it on the Spark classpath.
:::
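
One way to put the connector on the Spark classpath for an ad-hoc session is `--jars` (a sketch; the jar and key-file paths are assumptions):

```shell
spark-sql \
  --jars /path/to/gcs-connector-hadoop3-2.2.23-shaded.jar \
  --conf spark.hadoop.fs.gs.auth.service.account.enable=true \
  --conf spark.hadoop.fs.gs.auth.service.account.json.keyfile=/path/to/gcs-service-account.json
```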
By following these instructions, you can effectively manage and access your S3-stored data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
By following these instructions, you can effectively manage and access your S3, ADLS, or GCS data through both the Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
