diff --git a/.github/workflows/backend-integration-test.yml b/.github/workflows/backend-integration-test.yml
index 495424c1636..1958163f863 100644
--- a/.github/workflows/backend-integration-test.yml
+++ b/.github/workflows/backend-integration-test.yml
@@ -28,6 +28,7 @@ jobs:
             - clients/client-java/**
             - clients/client-java-runtime/**
             - clients/filesystem-hadoop3/**
+            - clients/cli/**
             - common/**
             - conf/**
             - core/**
diff --git a/build.gradle.kts b/build.gradle.kts
index 49aa2fe89c0..cc29ff4affc 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -174,7 +174,7 @@ allprojects {
       param.environment("PROJECT_VERSION", project.version)
 
       // Gravitino CI Docker image
-      param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.15")
+      param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.16")
       param.environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:kerberos-hive-0.1.5")
       param.environment("GRAVITINO_CI_DORIS_DOCKER_IMAGE", "apache/gravitino-ci:doris-0.1.5")
       param.environment("GRAVITINO_CI_TRINO_DOCKER_IMAGE", "apache/gravitino-ci:trino-0.1.6")
diff --git a/catalogs/catalog-hive/build.gradle.kts b/catalogs/catalog-hive/build.gradle.kts
index b328413dfd3..b471fccead1 100644
--- a/catalogs/catalog-hive/build.gradle.kts
+++ b/catalogs/catalog-hive/build.gradle.kts
@@ -130,6 +130,7 @@ dependencies {
   testImplementation(libs.testcontainers.localstack)
   testImplementation(libs.hadoop2.aws)
   testImplementation(libs.hadoop3.abs)
+  testImplementation(libs.hadoop3.gcs)
 
   // You need this to run test CatalogHiveABSIT as it required hadoop3 environment introduced by hadoop3.abs
   // (The protocol `abfss` was first introduced in Hadoop 3.2.0), However, as the there already exists
diff --git a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java
new file mode 100644
index 00000000000..c69cf013ee5
--- /dev/null
+++ b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.gravitino.catalog.hive.integration.test;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.gravitino.integration.test.container.HiveContainer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.condition.EnabledIf;
+import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
+import org.testcontainers.utility.MountableFile;
+
+@EnabledIf(value = "isGCSConfigured", disabledReason = "Google Cloud Storage(GCS) is not prepared.")
+public class CatalogHiveGCSIT extends CatalogHiveIT {
+
+  private static final String GCS_BUCKET_NAME = System.getenv("GCS_BUCKET_NAME");
+  private static final String GCS_ACCOUNT_JSON_FILE =
+      System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");
+  private static final String GCS_ACCOUNT_JSON_FILE_IN_CONTAINER = "/tmp/gcs-service-account.json";
+
+  @Override
+  protected void startNecessaryContainer() {
+    Map<String, String> hiveContainerEnv =
+        ImmutableMap.of(
+            "SERVICE_ACCOUNT_FILE",
+            GCS_ACCOUNT_JSON_FILE_IN_CONTAINER,
+            HiveContainer.HIVE_RUNTIME_VERSION,
+            HiveContainer.HIVE3);
+
+    containerSuite.startHiveContainerWithS3(hiveContainerEnv);
+
+    HIVE_METASTORE_URIS =
+        String.format(
+            "thrift://%s:%d",
+            containerSuite.getHiveContainerWithS3().getContainerIpAddress(),
+            HiveContainer.HIVE_METASTORE_PORT);
+
+    containerSuite
+        .getHiveContainerWithS3()
+        .getContainer()
+        .copyFileToContainer(
+            MountableFile.forHostPath(GCS_ACCOUNT_JSON_FILE), "/tmp/gcs-service-account.json");
+  }
+
+  @Override
+  protected void initFileSystem() throws IOException {
+    Configuration conf = new Configuration();
+
+    conf.set("fs.gs.auth.service.account.enable", "true");
+    conf.set("fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE);
+
+    String path = String.format("gs://%s/", GCS_BUCKET_NAME);
+    fileSystem = FileSystem.get(URI.create(path), conf);
+  }
+
+  @Override
+  protected void initSparkSession() {
+    sparkSession =
+        SparkSession.builder()
+            .master("local[1]")
+            .appName("Hive Catalog integration test")
+            .config("hive.metastore.uris", HIVE_METASTORE_URIS)
+            .config(
+                "spark.sql.warehouse.dir",
+                String.format("gs://%s/user/hive/warehouse", GCS_BUCKET_NAME))
+            .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE)
+            .config("spark.sql.storeAssignmentPolicy", "LEGACY")
+            .config("mapreduce.input.fileinputformat.input.dir.recursive", "true")
+            .enableHiveSupport()
+            .getOrCreate();
+  }
+
+  @Override
+  protected Map<String, String> createSchemaProperties() {
+    Map<String, String> properties = new HashMap<>();
+    properties.put("key1", "val1");
+    properties.put("key2", "val2");
+    properties.put(
+        "location", String.format("gs://%s/test-%s", GCS_BUCKET_NAME, System.currentTimeMillis()));
+    return properties;
+  }
+
+  private static boolean isGCSConfigured() {
+    return StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME"))
+        && StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH"));
+  }
+}
diff --git a/clients/cli/build.gradle.kts b/clients/cli/build.gradle.kts
index 8d42712eaee..71608ee911d 100644
--- a/clients/cli/build.gradle.kts
+++ b/clients/cli/build.gradle.kts
@@ -34,6 +34,10 @@ dependencies {
   testImplementation(libs.junit.jupiter.api)
   testImplementation(libs.junit.jupiter.params)
   testImplementation(libs.mockito.core)
+  testImplementation(libs.mysql.driver)
+  testImplementation(libs.postgresql.driver)
+  testImplementation(libs.testcontainers)
+
   testImplementation(project(":core")) {
     exclude("org.apache.logging.log4j")
   }
diff --git a/dev/docker/hive/Dockerfile b/dev/docker/hive/Dockerfile
index 1b4d4dd7f08..cd79e256250 100644
--- a/dev/docker/hive/Dockerfile
+++ b/dev/docker/hive/Dockerfile
@@ -150,6 +150,11 @@ RUN ln -s /opt/hadoop-${HADOOP2_VERSION} ${HADOOP2_HOME}
 ADD packages/hadoop-${HADOOP3_VERSION}.tar.gz /opt/
 RUN ln -s /opt/hadoop-${HADOOP3_VERSION} ${HADOOP3_HOME}
 
+
+# Add gcs connector for hadoop2 and hadoop3
+ADD packages/gcs-connector-hadoop2-2.2.23-shaded.jar ${HADOOP2_HOME}/share/hadoop/common/lib/gcs-connector-hadoop2-2.2.23-shaded.jar
+ADD packages/gcs-connector-hadoop3-2.2.23-shaded.jar ${HADOOP3_HOME}/share/hadoop/common/lib/gcs-connector-hadoop3-2.2.23-shaded.jar
+
 # Add hadoop configuration to temporary directory
 ADD core-site.xml ${HADOOP_TMP_CONF_DIR}/core-site.xml
 ADD hadoop-env.sh ${HADOOP_TMP_CONF_DIR}/hadoop-env.sh
diff --git a/dev/docker/hive/hive-dependency.sh b/dev/docker/hive/hive-dependency.sh
index 2038dd001d2..e93361c3c9f 100755
--- a/dev/docker/hive/hive-dependency.sh
+++ b/dev/docker/hive/hive-dependency.sh
@@ -33,9 +33,13 @@ RANGER_VERSION="2.4.0" # Notice: Currently only tested Ranger plugin 2.4.0 in th
 HADOOP2_PACKAGE_NAME="hadoop-${HADOOP2_VERSION}.tar.gz"
 HADOOP2_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP2_VERSION}/${HADOOP2_PACKAGE_NAME}"
+HADOOP2_GCS_PACKAGE_NAME="gcs-connector-hadoop2-2.2.23-shaded.jar"
+HADOOP2_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop2-2.2.23-shaded.jar"
 
 HADOOP3_PACKAGE_NAME="hadoop-${HADOOP3_VERSION}.tar.gz"
 HADOOP3_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP3_VERSION}/${HADOOP3_PACKAGE_NAME}"
+HADOOP3_GCS_PACKAGE_NAME="gcs-connector-hadoop3-2.2.23-shaded.jar"
+HADOOP3_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar"
 
 HIVE2_PACKAGE_NAME="apache-hive-${HIVE2_VERSION}-bin.tar.gz"
 HIVE2_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE2_VERSION}/${HIVE2_PACKAGE_NAME}"
@@ -91,3 +95,11 @@ fi
 if [ ! -f "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ]; then
   curl -L -s -o "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ${RANGER_HIVE_DOWNLOAD_URL}
 fi
+
+if [ ! -f "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ]; then
+  curl -L -s -o "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ${HADOOP2_GCS_DOWNLOAD_URL}
+fi
+
+if [ ! -f "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ]; then
+  curl -L -s -o "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ${HADOOP3_GCS_DOWNLOAD_URL}
+fi
\ No newline at end of file
diff --git a/dev/docker/hive/hive-site.xml b/dev/docker/hive/hive-site.xml
index c6a247e1a39..1750539b78c 100644
--- a/dev/docker/hive/hive-site.xml
+++ b/dev/docker/hive/hive-site.xml
@@ -73,4 +73,14 @@
     <value>ABS_ACCOUNT_KEY</value>
   </property>
 
+  <property>
+    <name>fs.gs.auth.service.account.enable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.gs.auth.service.account.json.keyfile</name>
+    <value>SERVICE_ACCOUNT_FILE</value>
+  </property>
+
 </configuration>
diff --git a/dev/docker/hive/start.sh b/dev/docker/hive/start.sh
index 86ced409741..93ab35e307a 100644
--- a/dev/docker/hive/start.sh
+++ b/dev/docker/hive/start.sh
@@ -31,8 +31,8 @@ else
   ln -s ${HADOOP2_HOME} ${HADOOP_HOME}
 fi
 
-  cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
-  cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib
+cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
+cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib
 
 # Copy Hadoop and Hive configuration file and update hostname
 cp -f ${HADOOP_TMP_CONF_DIR}/* ${HADOOP_CONF_DIR}
@@ -54,6 +54,11 @@ if [[ -n "${ABS_ACCOUNT_NAME}" && -n "${ABS_ACCOUNT_KEY}" ]]; then
   sed -i "s|ABS_ACCOUNT_KEY|${ABS_ACCOUNT_KEY}|g" ${HIVE_CONF_DIR}/hive-site.xml
 fi
 
+# whether GCS is set
+if [[ -n "$SERVICE_ACCOUNT_FILE" ]]; then
+  sed -i "s|SERVICE_ACCOUNT_FILE|${SERVICE_ACCOUNT_FILE}|g" ${HIVE_CONF_DIR}/hive-site.xml
+fi
+
 # Link mysql-connector-java after deciding where HIVE_HOME symbolic link points to.
 ln -s /opt/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}.jar ${HIVE_HOME}/lib
diff --git a/docs/docker-image-details.md b/docs/docker-image-details.md
index fed00d83c60..4e0a8109325 100644
--- a/docs/docker-image-details.md
+++ b/docs/docker-image-details.md
@@ -168,8 +168,12 @@ Changelog
 You can use this kind of image to test the catalog of Apache Hive.
 
 Changelog
+- apache/gravitino-ci:hive-0.1.16
+  - Add GCS-related configurations in the `hive-site.xml` file.
+  - Add the GCS connector jar to `${HADOOP_HOME}/share/hadoop/common/lib/`.
+
 - apache/gravitino-ci:hive-0.1.15
-  - Add ADLS related configurations in the `hive-site.xml` file.
+  - Add Azure Blob Storage (ADLS) related configurations in the `hive-site.xml` file.
 
 - apache/gravitino-ci:hive-0.1.14
   - Add amazon S3 related configurations in the `hive-site.xml` file.
diff --git a/docs/hive-catalog-with-s3-and-adls.md b/docs/hive-catalog-with-cloud-storage.md
similarity index 84%
rename from docs/hive-catalog-with-s3-and-adls.md
rename to docs/hive-catalog-with-cloud-storage.md
index 41b8eef77d2..49a018907b4 100644
--- a/docs/hive-catalog-with-s3-and-adls.md
+++ b/docs/hive-catalog-with-cloud-storage.md
@@ -11,14 +11,13 @@ license: "This software is licensed under the Apache License version 2."
 
 Since Hive 2.x, Hive has supported S3 as a storage backend, enabling users to store and manage data in Amazon S3 directly through Hive. Gravitino enhances this capability by supporting the Hive catalog with S3, allowing users to efficiently manage the storage locations of files located in S3. This integration simplifies data operations and enables seamless access to S3 data from Hive queries.
 
-For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)), the integration is similar to S3. The only difference is the configuration properties for ADLS(see below).
+For ADLS (also known as Azure Blob Storage (ABS) or Azure Data Lake Storage Gen2) and GCS (Google Cloud Storage), the integration is similar to S3. The only difference is the configuration properties for ADLS and GCS (see below).
 
-The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3 and ADLS as a storage backend, including configuration details and examples for creating databases and tables.
+The following sections will guide you through the necessary steps to configure the Hive catalog to use S3, ADLS, or GCS as a storage backend, including configuration details and examples for creating databases and tables.
 
 ## Hive metastore configuration
 
-
-The following will mainly focus on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS with minor changes in the configuration properties.
+The following mainly focuses on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS and GCS with minor changes to the configuration properties.
 
 ### Example Configuration Changes
@@ -45,15 +44,14 @@ Below are the essential properties to add or modify in the `hive-site.xml` file
   definition and table definition, as shown in the examples below.
 
   After explicitly setting this property, you can omit the location property in the schema and table definitions.
 
-  It's also applicable for ADLS.
+  It's also applicable for Azure Blob Storage (ADLS) and GCS.
   -->
   <property>
     <name>hive.metastore.warehouse.dir</name>
     <value>S3_BUCKET_PATH</value>
   </property>
-
-
+
   <property>
     <name>fs.abfss.impl</name>
     <value>org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem</value>
   </property>
 
     <value>ABS_ACCOUNT_KEY</value>
   </property>
 
+  <property>
+    <name>fs.gs.auth.service.account.enable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.gs.auth.service.account.json.keyfile</name>
+    <value>SERVICE_ACCOUNT_FILE</value>
+  </property>
+
 ```
 
 ### Adding Required JARs
 
 cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
 cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib
 
 Alternatively, you can download the required JARs from the Maven repository and place them in the Hive classpath. It is crucial to verify that the JARs are compatible with the version of Hadoop you are using to avoid any compatibility issue.
-
 ### Restart Hive metastore
 
 Once all configurations have been correctly set, restart the Hive cluster to apply the changes. This step is essential to ensure that the new configurations take effect and that the Hive services can communicate with S3.
@@ -105,6 +114,9 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
     # The following line is for Azure Blob Storage(ADLS)
     # "location": "abfss://container-name@user-account-name.dfs.core.windows.net/path"
+
+    # The following line is for Google Cloud Storage(GCS)
+    # "location": "gs://bucket-name/path"
   }
 }' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
 ```
@@ -129,6 +141,9 @@ Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
     // The following line is for Azure Blob Storage(ADLS)
     // .put("location", "abfss://container-name@user-account-name.dfs.core.windows.net/path")
+    // The following line is for Google Cloud Storage(GCS)
+    // .put("location", "gs://bucket-name/path")
+
     .build();
 
 Schema schema = supportsSchemas.createSchema("hive_schema",
     "This is a schema",
@@ -225,13 +240,17 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
         .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.endpoint", getS3Endpoint)
         .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
 
-        ## This two is for Azure Blob Storage(ADLS) only
+        // These two are for Azure Blob Storage(ADLS) only
         .config(
             String.format(
                 "spark.sql.catalog.{hive_catalog_name}.fs.azure.account.key.%s.dfs.core.windows.net",
                 ABS_USER_ACCOUNT_NAME),
             ABS_USER_ACCOUNT_KEY)
         .config("spark.sql.catalog.{hive_catalog_name}.fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
+
+        // These two are for Google Cloud Storage(GCS) only
+        .config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.enable", "true")
+        .config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.json.keyfile", "SERVICE_ACCOUNT_FILE")
 
         .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.path.style.access", "true")
         .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.connection.ssl.enabled", "false")
@@ -249,6 +268,7 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
 :::Note
 Please download [Hadoop AWS jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws), [aws java sdk jar](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) and place them in the classpath of the Spark. If the JARs are missing, Spark will not be able to access the S3 storage.
 Azure Blob Storage(ADLS) requires the [Hadoop Azure jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure), [Azure cloud sdk jar](https://mvnrepository.com/artifact/com.azure/azure-storage-blob) to be placed in the classpath of the Spark.
+For Google Cloud Storage(GCS), you need to download the [Hadoop GCS jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and place it in the classpath of Spark.
 :::
 
-By following these instructions, you can effectively manage and access your S3-stored data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
\ No newline at end of file
+By following these instructions, you can effectively manage and access your S3, ADLS, or GCS data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
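
A quick way to sanity-check the GCS wiring outside the full integration test is to hit the bucket directly with Hadoop's `FileSystem` API, using the same two `fs.gs.*` properties this patch adds to `hive-site.xml` and the same environment variables that `CatalogHiveGCSIT` reads. This is only a minimal sketch under those assumptions (shaded `gcs-connector` jar on the classpath; the class name and the bucket listing are illustrative, not part of the patch):

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GcsSmokeTest {
  public static void main(String[] args) throws Exception {
    // Same environment variables that CatalogHiveGCSIT reads.
    String bucket = System.getenv("GCS_BUCKET_NAME");
    String keyFile = System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");

    // The two properties this PR adds to hive-site.xml, applied to a plain Hadoop Configuration.
    Configuration conf = new Configuration();
    conf.set("fs.gs.auth.service.account.enable", "true");
    conf.set("fs.gs.auth.service.account.json.keyfile", keyFile);

    // Resolves to the GCS connector only if the shaded gcs-connector jar is on the classpath.
    String root = String.format("gs://%s/", bucket);
    FileSystem fs = FileSystem.get(URI.create(root), conf);
    for (FileStatus status : fs.listStatus(new Path(root))) {
      System.out.println(status.getPath());
    }
  }
}
```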
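Once a SparkSession is built the way `CatalogHiveGCSIT#initSparkSession()` does (warehouse directory on `gs://`, service-account keyfile configured), creating GCS-backed databases and tables follows the same pattern the renamed doc shows for S3 and ADLS. A hedged sketch under those assumptions; the metastore URI placeholder and the `gcs_db`/`gcs_table` names are made up for illustration:

```java
import org.apache.spark.sql.SparkSession;

public class GcsHiveWarehouseExample {
  public static void main(String[] args) {
    String bucket = System.getenv("GCS_BUCKET_NAME");
    String keyFile = System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");

    // Mirrors CatalogHiveGCSIT#initSparkSession(): warehouse on GCS, service-account key for auth.
    SparkSession spark =
        SparkSession.builder()
            .master("local[1]")
            .appName("Hive on GCS example")
            .config("hive.metastore.uris", "thrift://hive-metastore-host:9083") // placeholder URI
            .config("spark.sql.warehouse.dir", String.format("gs://%s/user/hive/warehouse", bucket))
            .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", keyFile)
            .enableHiveSupport()
            .getOrCreate();

    // Same flow the doc describes for S3/ADLS, with a gs:// location; names are illustrative.
    spark.sql(String.format(
        "CREATE DATABASE IF NOT EXISTS gcs_db LOCATION 'gs://%s/test-gcs-db'", bucket));
    spark.sql("CREATE TABLE IF NOT EXISTS gcs_db.gcs_table (id INT, name STRING) STORED AS PARQUET");
    spark.sql("INSERT INTO gcs_db.gcs_table VALUES (1, 'gravitino')");
    spark.sql("SELECT * FROM gcs_db.gcs_table").show();

    spark.stop();
  }
}
```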