[#5673] Add test and docs about how to use GCS in Hive (#5676)
### What changes were proposed in this pull request?

1. Release a new Hive CI Docker image that supports GCS.
2. Add a related integration test based on the new image.
 
### Why are the changes needed?

For users' convenience.

Fix: #5673 

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

Added a new integration test, `CatalogHiveGCSIT`.
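
A sketch of how to run it locally (the env vars are the ones the test checks; the Gradle task and filter flags are assumptions and may differ in your setup):

```shell
# CatalogHiveGCSIT is skipped unless both of these are set (see isGCSConfigured)
export GCS_BUCKET_NAME=my-test-bucket                              # hypothetical bucket
export GCS_SERVICE_ACCOUNT_JSON_PATH=/path/to/service-account.json

# Run only the new IT in the Hive catalog module
./gradlew :catalogs:catalog-hive:test \
  --tests "org.apache.gravitino.catalog.hive.integration.test.CatalogHiveGCSIT"
```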

---------

Co-authored-by: Jerry Shao <[email protected]>
yuqi1129 and jerryshao authored Dec 3, 2024
1 parent 758cd2e commit 71a0d63
Showing 11 changed files with 185 additions and 14 deletions.
1 change: 1 addition & 0 deletions .github/workflows/backend-integration-test.yml
@@ -28,6 +28,7 @@ jobs:
- clients/client-java/**
- clients/client-java-runtime/**
- clients/filesystem-hadoop3/**
- clients/cli/**
- common/**
- conf/**
- core/**
2 changes: 1 addition & 1 deletion build.gradle.kts
@@ -174,7 +174,7 @@ allprojects {
param.environment("PROJECT_VERSION", project.version)

// Gravitino CI Docker image
param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.15")
param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:hive-0.1.16")
param.environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", "apache/gravitino-ci:kerberos-hive-0.1.5")
param.environment("GRAVITINO_CI_DORIS_DOCKER_IMAGE", "apache/gravitino-ci:doris-0.1.5")
param.environment("GRAVITINO_CI_TRINO_DOCKER_IMAGE", "apache/gravitino-ci:trino-0.1.6")
1 change: 1 addition & 0 deletions catalogs/catalog-hive/build.gradle.kts
@@ -130,6 +130,7 @@ dependencies {
testImplementation(libs.testcontainers.localstack)
testImplementation(libs.hadoop2.aws)
testImplementation(libs.hadoop3.abs)
testImplementation(libs.hadoop3.gcs)

// You need this to run the test CatalogHiveABSIT as it requires the hadoop3 environment introduced by hadoop3.abs
// (the protocol `abfss` was first introduced in Hadoop 3.2.0). However, as there already exists
@@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.gravitino.catalog.hive.integration.test;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.integration.test.container.HiveContainer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.condition.EnabledIf;
import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
import org.testcontainers.utility.MountableFile;

@EnabledIf(value = "isGCSConfigured", disabledReason = "Google Cloud Storage (GCS) is not configured.")
public class CatalogHiveGCSIT extends CatalogHiveIT {

private static final String GCS_BUCKET_NAME = System.getenv("GCS_BUCKET_NAME");
private static final String GCS_ACCOUNT_JSON_FILE =
System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");
private static final String GCS_ACCOUNT_JSON_FILE_IN_CONTAINER = "/tmp/gcs-service-account.json";

@Override
protected void startNecessaryContainer() {
Map<String, String> hiveContainerEnv =
ImmutableMap.of(
"SERVICE_ACCOUNT_FILE",
GCS_ACCOUNT_JSON_FILE_IN_CONTAINER,
HiveContainer.HIVE_RUNTIME_VERSION,
HiveContainer.HIVE3);

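    // Note: this reuses the S3-flavored Hive CI container; GCS support in the image is enabled
    // through the SERVICE_ACCOUNT_FILE env var above and the key file copied into the container below.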
containerSuite.startHiveContainerWithS3(hiveContainerEnv);

HIVE_METASTORE_URIS =
String.format(
"thrift://%s:%d",
containerSuite.getHiveContainerWithS3().getContainerIpAddress(),
HiveContainer.HIVE_METASTORE_PORT);

containerSuite
.getHiveContainerWithS3()
.getContainer()
.copyFileToContainer(
MountableFile.forHostPath(GCS_ACCOUNT_JSON_FILE), GCS_ACCOUNT_JSON_FILE_IN_CONTAINER);
}

@Override
protected void initFileSystem() throws IOException {
Configuration conf = new Configuration();

conf.set("fs.gs.auth.service.account.enable", "true");
conf.set("fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE);

String path = String.format("gs://%s/", GCS_BUCKET_NAME);
fileSystem = FileSystem.get(URI.create(path), conf);
}

@Override
protected void initSparkSession() {
sparkSession =
SparkSession.builder()
.master("local[1]")
.appName("Hive Catalog integration test")
.config("hive.metastore.uris", HIVE_METASTORE_URIS)
.config(
"spark.sql.warehouse.dir",
String.format("gs://%s/user/hive/warehouse", GCS_BUCKET_NAME))
.config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE)
.config("spark.sql.storeAssignmentPolicy", "LEGACY")
.config("mapreduce.input.fileinputformat.input.dir.recursive", "true")
.enableHiveSupport()
.getOrCreate();
}

@Override
protected Map<String, String> createSchemaProperties() {
Map<String, String> properties = new HashMap<>();
properties.put("key1", "val1");
properties.put("key2", "val2");
properties.put(
"location", String.format("gs://%s/test-%s", GCS_BUCKET_NAME, System.currentTimeMillis()));
return properties;
}

private static boolean isGCSConfigured() {
return StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME"))
&& StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH"));
}
}
4 changes: 4 additions & 0 deletions clients/cli/build.gradle.kts
@@ -34,6 +34,10 @@ dependencies {
testImplementation(libs.junit.jupiter.api)
testImplementation(libs.junit.jupiter.params)
testImplementation(libs.mockito.core)
testImplementation(libs.mysql.driver)
testImplementation(libs.postgresql.driver)
testImplementation(libs.testcontainers)

testImplementation(project(":core")) {
exclude("org.apache.logging.log4j")
}
5 changes: 5 additions & 0 deletions dev/docker/hive/Dockerfile
@@ -150,6 +150,11 @@ RUN ln -s /opt/hadoop-${HADOOP2_VERSION} ${HADOOP2_HOME}
ADD packages/hadoop-${HADOOP3_VERSION}.tar.gz /opt/
RUN ln -s /opt/hadoop-${HADOOP3_VERSION} ${HADOOP3_HOME}


# Add GCS connector for hadoop2 and hadoop3
ADD packages/gcs-connector-hadoop2-2.2.23-shaded.jar ${HADOOP2_HOME}/share/hadoop/common/lib/gcs-connector-hadoop2-2.2.23-shaded.jar
ADD packages/gcs-connector-hadoop3-2.2.23-shaded.jar ${HADOOP3_HOME}/share/hadoop/common/lib/gcs-connector-hadoop3-2.2.23-shaded.jar

# Add hadoop configuration to temporary directory
ADD core-site.xml ${HADOOP_TMP_CONF_DIR}/core-site.xml
ADD hadoop-env.sh ${HADOOP_TMP_CONF_DIR}/hadoop-env.sh
12 changes: 12 additions & 0 deletions dev/docker/hive/hive-dependency.sh
@@ -33,9 +33,13 @@ RANGER_VERSION="2.4.0" # Notice: Currently only tested Ranger plugin 2.4.0 in th

HADOOP2_PACKAGE_NAME="hadoop-${HADOOP2_VERSION}.tar.gz"
HADOOP2_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP2_VERSION}/${HADOOP2_PACKAGE_NAME}"
HADOOP2_GCS_PACKAGE_NAME="gcs-connector-hadoop2-2.2.23-shaded.jar"
HADOOP2_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop2-2.2.23-shaded.jar"

HADOOP3_PACKAGE_NAME="hadoop-${HADOOP3_VERSION}.tar.gz"
HADOOP3_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP3_VERSION}/${HADOOP3_PACKAGE_NAME}"
HADOOP3_GCS_PACKAGE_NAME="gcs-connector-hadoop3-2.2.23-shaded.jar"
HADOOP3_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar"

HIVE2_PACKAGE_NAME="apache-hive-${HIVE2_VERSION}-bin.tar.gz"
HIVE2_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE2_VERSION}/${HIVE2_PACKAGE_NAME}"
@@ -91,3 +95,11 @@ fi
if [ ! -f "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ${RANGER_HIVE_DOWNLOAD_URL}
fi

if [ ! -f "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ${HADOOP2_GCS_DOWNLOAD_URL}
fi

if [ ! -f "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ]; then
curl -L -s -o "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ${HADOOP3_GCS_DOWNLOAD_URL}
fi
10 changes: 10 additions & 0 deletions dev/docker/hive/hive-site.xml
@@ -73,4 +73,14 @@
<value>ABS_ACCOUNT_KEY</value>
</property>

<property>
<name>fs.gs.auth.service.account.enable</name>
<value>true</value>
</property>

<property>
<name>fs.gs.auth.service.account.json.keyfile</name>
<value>SERVICE_ACCOUNT_FILE</value>
</property>

</configuration>
9 changes: 7 additions & 2 deletions dev/docker/hive/start.sh
@@ -31,8 +31,8 @@ else
ln -s ${HADOOP2_HOME} ${HADOOP_HOME}
fi

cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib

# Copy Hadoop and Hive configuration file and update hostname
cp -f ${HADOOP_TMP_CONF_DIR}/* ${HADOOP_CONF_DIR}
@@ -54,6 +54,11 @@ if [[ -n "${ABS_ACCOUNT_NAME}" && -n "${ABS_ACCOUNT_KEY}" ]]; then
sed -i "s|ABS_ACCOUNT_KEY|${ABS_ACCOUNT_KEY}|g" ${HIVE_CONF_DIR}/hive-site.xml
fi

# Check whether GCS is configured
if [[ -n "$SERVICE_ACCOUNT_FILE" ]]; then
sed -i "s|SERVICE_ACCOUNT_FILE|${SERVICE_ACCOUNT_FILE}|g" ${HIVE_CONF_DIR}/hive-site.xml
fi

# Link mysql-connector-java after deciding where HIVE_HOME symbolic link points to.
ln -s /opt/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}.jar ${HIVE_HOME}/lib

6 changes: 5 additions & 1 deletion docs/docker-image-details.md
@@ -168,8 +168,12 @@ Changelog
You can use this kind of image to test the Apache Hive catalog.

Changelog
- apache/gravitino-ci:hive-0.1.16
- Add GCS-related configurations in the `hive-site.xml` file.
- Add the GCS connector jar to `${HADOOP_HOME}/share/hadoop/common/lib/`.

- apache/gravitino-ci:hive-0.1.15
- Add ADLS related configurations in the `hive-site.xml` file.
- Add Azure Blob Storage(ADLS) related configurations in the `hive-site.xml` file.

- apache/gravitino-ci:hive-0.1.14
- Add amazon S3 related configurations in the `hive-site.xml` file.
@@ -11,14 +11,13 @@ license: "This software is licensed under the Apache License version 2."

Since Hive 2.x, Hive has supported S3 as a storage backend, enabling users to store and manage data in Amazon S3 directly through Hive. Gravitino enhances this capability by supporting the Hive catalog with S3, allowing users to efficiently manage the storage locations of files located in S3. This integration simplifies data operations and enables seamless access to S3 data from Hive queries.

For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)), the integration is similar to S3. The only difference is the configuration properties for ADLS(see below).
For ADLS (also known as Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)) and GCS (Google Cloud Storage), the integration is similar to S3. The only difference is the set of configuration properties for ADLS and GCS (see below).

The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3 and ADLS as a storage backend, including configuration details and examples for creating databases and tables.
The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3, ADLS, and GCS as storage backends, including configuration details and examples for creating databases and tables.

## Hive metastore configuration


The following will mainly focus on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS with minor changes in the configuration properties.
The following section mainly focuses on configuring the Hive metastore to use S3 as a storage backend. The same configuration applies to ADLS and GCS with minor changes to the configuration properties.

### Example Configuration Changes

@@ -45,15 +44,14 @@ Below are the essential properties to add or modify in the `hive-site.xml` file
definition and table definition, as shown in the examples below. After explicitly setting this
property, you can omit the location property in the schema and table definitions.
It's also applicable for ADLS.
It's also applicable for Azure Blob Storage(ADLS) and GCS.
-->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>S3_BUCKET_PATH</value>
</property>


<!-- The following are for Azure Blob Storage(ADLS) -->
<!-- The following two configurations are for Azure Blob Storage(ADLS) -->
<property>
<name>fs.abfss.impl</name>
<value>org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem</value>
@@ -64,6 +62,18 @@ It's also applicable for ADLS.
<value>ABS_ACCOUNT_KEY</value>
</property>

<!-- The following two configurations are only for Google Cloud Storage(GCS) -->
<property>
<name>fs.gs.auth.service.account.enable</name>
<value>true</value>
</property>

<!-- SERVICE_ACCOUNT_FILE should be a local or remote file that can be accessed by the Hive server -->
<property>
<name>fs.gs.auth.service.account.json.keyfile</name>
<value>SERVICE_ACCOUNT_FILE</value>
</property>

```
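
The placeholders above (for example `S3_BUCKET_PATH`, `ABS_ACCOUNT_KEY`, and `SERVICE_ACCOUNT_FILE`) must be replaced with real values before the metastore is started. A minimal sketch using `sed`, mirroring what the CI image's `start.sh` does (the bucket path and key-file location shown are assumptions):

```shell
# Substitute the placeholders in hive-site.xml before starting the Hive services
sed -i "s|S3_BUCKET_PATH|s3a://my-bucket/user/hive/warehouse|g" ${HIVE_CONF_DIR}/hive-site.xml
sed -i "s|SERVICE_ACCOUNT_FILE|/etc/hive/gcs-service-account.json|g" ${HIVE_CONF_DIR}/hive-site.xml
```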

### Adding Required JARs
@@ -78,7 +88,6 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib

Alternatively, you can download the required JARs from the Maven repository and place them in the Hive classpath. It is crucial to verify that the JARs are compatible with the version of Hadoop you are using to avoid compatibility issues.
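
The GCS connector is not bundled in the Hadoop tools directory, so it has to be fetched separately. A minimal sketch, assuming Hadoop 3 and connector version 2.2.23 (pick the build that matches your Hadoop line); copying it into `${HIVE_HOME}/lib` mirrors the pattern used for the AWS and Azure jars above:

```shell
# Download the shaded GCS connector and make it visible to both Hadoop and Hive
curl -L -o gcs-connector-hadoop3-2.2.23-shaded.jar \
  https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar
cp gcs-connector-hadoop3-2.2.23-shaded.jar ${HADOOP_HOME}/share/hadoop/common/lib/
cp gcs-connector-hadoop3-2.2.23-shaded.jar ${HIVE_HOME}/lib/
```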


### Restart Hive metastore

Once all configurations have been correctly set, restart the Hive cluster to apply the changes. This step is essential to ensure that the new configurations take effect and that the Hive services can communicate with S3.
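
A minimal sketch, assuming the services are launched directly with the `hive` launcher; clusters managed by systemd, Ambari, or similar tooling should use their own restart mechanism:

```shell
# Restart the metastore (and HiveServer2, if used) so the updated hive-site.xml is picked up
${HIVE_HOME}/bin/hive --service metastore &
${HIVE_HOME}/bin/hive --service hiveserver2 &
```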
@@ -105,6 +114,9 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \
# The following line is for Azure Blob Storage(ADLS)
# "location": "abfss://[email protected]/path"
# The following line is for Google Cloud Storage(GCS)
# "location": "gs://bucket-name/path"
}
}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```
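
To sanity-check the call, you can list the schemas back from the same collection endpoint (a sketch, assuming the server also serves list requests at the path used for creation above):

```shell
curl -X GET -H "Accept: application/vnd.gravitino.v1+json" \
  -H "Content-Type: application/json" \
  http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```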
@@ -129,6 +141,9 @@ Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()
// The following line is for Azure Blob Storage(ADLS)
// .put("location", "abfss://[email protected]/path")

// The following line is for Google Cloud Storage(GCS)
// .put("location", "gs://bucket-name/path")

.build();
Schema schema = supportsSchemas.createSchema("hive_schema",
"This is a schema",
@@ -225,13 +240,17 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.endpoint", getS3Endpoint)
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
## This two is for Azure Blob Storage(ADLS) only
// These two configurations are for Azure Blob Storage(ADLS) only
.config(
String.format(
"spark.sql.catalog.{hive_catalog_name}.fs.azure.account.key.%s.dfs.core.windows.net",
ABS_USER_ACCOUNT_NAME),
ABS_USER_ACCOUNT_KEY)
.config("spark.sql.catalog.{hive_catalog_name}.fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
// These two configurations are for Google Cloud Storage(GCS) only
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.enable", "true")
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.json.keyfile", "SERVICE_ACCOUNT_FILE")
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.path.style.access", "true")
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.connection.ssl.enabled", "false")
@@ -249,6 +268,7 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a
:::Note
Please download the [Hadoop AWS jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws) and [AWS Java SDK jar](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) and place them on the Spark classpath. If these JARs are missing, Spark will not be able to access the S3 storage.
Azure Blob Storage(ADLS) requires the [Hadoop Azure jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure) and [Azure cloud sdk jar](https://mvnrepository.com/artifact/com.azure/azure-storage-blob) to be placed on the Spark classpath.
For Google Cloud Storage(GCS), you need to download the [Hadoop GCS jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and place it on the Spark classpath.
:::
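
One way to put the connector on the Spark classpath for an ad-hoc session is `--jars` (a sketch; the jar and key-file paths are assumptions):

```shell
spark-sql \
  --jars /path/to/gcs-connector-hadoop3-2.2.23-shaded.jar \
  --conf spark.hadoop.fs.gs.auth.service.account.enable=true \
  --conf spark.hadoop.fs.gs.auth.service.account.json.keyfile=/path/to/gcs-service-account.json
```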
By following these instructions, you can effectively manage and access your S3-stored data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
By following these instructions, you can effectively manage and access your S3, ADLS, or GCS data through both the Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
