### What changes were proposed in this pull request?

1. Release a new Hive Docker image that supports GCS.
2. Add a related test based on the new image.

### Why are the changes needed?

For users' convenience.

Fix: #5673

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

New IT: `CatalogHiveGCSIT`.

Co-authored-by: Jerry Shao <[email protected]>
Showing 11 changed files with 185 additions and 14 deletions.
...ve/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java (109 additions, 0 deletions)
@@ -0,0 +1,109 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.gravitino.catalog.hive.integration.test;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.integration.test.container.HiveContainer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.condition.EnabledIf;
import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
import org.testcontainers.utility.MountableFile;

@EnabledIf(value = "isGCSConfigured", disabledReason = "Google Cloud Storage(GCS) is not prepared.")
public class CatalogHiveGCSIT extends CatalogHiveIT {

  private static final String GCS_BUCKET_NAME = System.getenv("GCS_BUCKET_NAME");
  private static final String GCS_ACCOUNT_JSON_FILE =
      System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");
  private static final String GCS_ACCOUNT_JSON_FILE_IN_CONTAINER = "/tmp/gcs-service-account.json";

  @Override
  protected void startNecessaryContainer() {
    Map<String, String> hiveContainerEnv =
        ImmutableMap.of(
            "SERVICE_ACCOUNT_FILE",
            GCS_ACCOUNT_JSON_FILE_IN_CONTAINER,
            HiveContainer.HIVE_RUNTIME_VERSION,
            HiveContainer.HIVE3);

    containerSuite.startHiveContainerWithS3(hiveContainerEnv);

    HIVE_METASTORE_URIS =
        String.format(
            "thrift://%s:%d",
            containerSuite.getHiveContainerWithS3().getContainerIpAddress(),
            HiveContainer.HIVE_METASTORE_PORT);

    containerSuite
        .getHiveContainerWithS3()
        .getContainer()
        .copyFileToContainer(
            MountableFile.forHostPath(GCS_ACCOUNT_JSON_FILE), "/tmp/gcs-service-account.json");
  }

  @Override
  protected void initFileSystem() throws IOException {
    Configuration conf = new Configuration();

    conf.set("fs.gs.auth.service.account.enable", "true");
    conf.set("fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE);

    String path = String.format("gs://%s/", GCS_BUCKET_NAME);
    fileSystem = FileSystem.get(URI.create(path), conf);
  }

  @Override
  protected void initSparkSession() {
    sparkSession =
        SparkSession.builder()
            .master("local[1]")
            .appName("Hive Catalog integration test")
            .config("hive.metastore.uris", HIVE_METASTORE_URIS)
            .config(
                "spark.sql.warehouse.dir",
                String.format(String.format("gs://%s/user/hive/warehouse", GCS_BUCKET_NAME)))
            .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE)
            .config("spark.sql.storeAssignmentPolicy", "LEGACY")
            .config("mapreduce.input.fileinputformat.input.dir.recursive", "true")
            .enableHiveSupport()
            .getOrCreate();
  }

  @Override
  protected Map<String, String> createSchemaProperties() {
    Map<String, String> properties = new HashMap<>();
    properties.put("key1", "val1");
    properties.put("key2", "val2");
    properties.put(
        "location", String.format("gs://%s/test-%s", GCS_BUCKET_NAME, System.currentTimeMillis()));
    return properties;
  }

  private static boolean isGCSConfigured() {
    return StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME"))
        && StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH"));
  }
}
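The new IT is skipped unless GCS credentials are supplied through environment variables, as checked by `isGCSConfigured()`. A minimal sketch of enabling it locally is shown below; the Gradle module and task names are illustrative and may differ in your checkout.

```shell
# Required by CatalogHiveGCSIT; the test is skipped when either variable is blank.
export GCS_BUCKET_NAME=my-test-bucket
export GCS_SERVICE_ACCOUNT_JSON_PATH=/path/to/gcs-service-account.json

# Illustrative invocation only -- the actual Gradle module/task for this IT may differ.
./gradlew :catalogs:catalog-hive:test \
  --tests "org.apache.gravitino.catalog.hive.integration.test.CatalogHiveGCSIT"
```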
@@ -11,14 +11,13 @@ license: "This software is licensed under the Apache License version 2."

Since Hive 2.x, Hive has supported S3 as a storage backend, enabling users to store and manage data in Amazon S3 directly through Hive. Gravitino enhances this capability by supporting the Hive catalog with S3, allowing users to efficiently manage the storage locations of files located in S3. This integration simplifies data operations and enables seamless access to S3 data from Hive queries.

-For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)), the integration is similar to S3. The only difference is the configuration properties for ADLS(see below).
+For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)) and GCS (Google Cloud Storage), the integration is similar to S3. The only difference is the configuration properties for ADLS and GCS (see below).

-The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3 and ADLS as a storage backend, including configuration details and examples for creating databases and tables.
+The following sections will guide you through the necessary steps to configure the Hive catalog to utilize S3, ADLS, and GCS as a storage backend, including configuration details and examples for creating databases and tables.

## Hive metastore configuration

-The following will mainly focus on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS with minor changes in the configuration properties.
+The following will mainly focus on configuring the Hive metastore to use S3 as a storage backend. The same configuration can be applied to ADLS and GCS with minor changes in the configuration properties.

### Example Configuration Changes
@@ -45,15 +44,14 @@ Below are the essential properties to add or modify in the `hive-site.xml` file

definition and table definition, as shown in the examples below. After explicitly setting this
property, you can omit the location property in the schema and table definitions.
-It's also applicable for ADLS.
+It's also applicable for Azure Blob Storage(ADSL) and GCS.
-->
<property>
  <name>hive.metastore.warehouse.dir</name>
  <value>S3_BUCKET_PATH</value>
</property>

-<!-- The following are for Azure Blob Storage(ADLS) -->
+<!-- The following two configurations are for Azure Blob Storage(ADLS) -->
<property>
  <name>fs.abfss.impl</name>
  <value>org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem</value>
@@ -64,6 +62,18 @@ It's also applicable for ADLS.

  <value>ABS_ACCOUNT_KEY</value>
</property>

+<!-- The following two configurations are only for Google Cloud Storage(gcs) -->
+<property>
+  <name>fs.gs.auth.service.account.enable</name>
+  <value>true</value>
+</property>
+
+<!-- SERVICE_ACCOUNT_FILE should be a local file or remote file that can be access by hive server -->
+<property>
+  <name>fs.gs.auth.service.account.json.keyfile</name>
+  <value>SERVICE_ACCOUNT_FILE</value>
+</property>
```

### Adding Required JARs
@@ -78,7 +88,6 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib

Alternatively, you can download the required JARs from the Maven repository and place them in the Hive classpath. It is crucial to verify that the JARs are compatible with the version of Hadoop you are using to avoid any compatibility issue.

### Restart Hive metastore

Once all configurations have been correctly set, restart the Hive cluster to apply the changes. This step is essential to ensure that the new configurations take effect and that the Hive services can communicate with S3.
@@ -105,6 +114,9 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" \

    # The following line is for Azure Blob Storage(ADLS)
    # "location": "abfss://[email protected]/path"
+    # The following line is for Google Cloud Storage(GCS)
+    # "location": "gs://bucket-name/path"
  }
}' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
```
@@ -129,6 +141,9 @@ Map<String, String> schemaProperties = ImmutableMap.<String, String>builder()

    // The following line is for Azure Blob Storage(ADLS)
    // .put("location", "abfss://[email protected]/path")

+    // The following lines for Google Cloud Storage(GCS)
+    // .put("location", "gs://bucket-name/path")
+
    .build();
Schema schema = supportsSchemas.createSchema("hive_schema",
    "This is a schema",
@@ -225,13 +240,17 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a

    .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.endpoint", getS3Endpoint)
    .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
-    ## This two is for Azure Blob Storage(ADLS) only
+    // This two is for Azure Blob Storage(ADLS) only
    .config(
        String.format(
            "spark.sql.catalog.{hive_catalog_name}.fs.azure.account.key.%s.dfs.core.windows.net",
            ABS_USER_ACCOUNT_NAME),
        ABS_USER_ACCOUNT_KEY)
    .config("spark.sql.catalog.{hive_catalog_name}.fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
+    // This two is for Google Cloud Storage(GCS) only
+    .config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.enable", "true")
+    .config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.json.keyfile", "SERVICE_ACCOUNT_FILE")
    .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.path.style.access", "true")
    .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.connection.ssl.enabled", "false")
@@ -249,6 +268,7 @@ To access S3-stored tables using Spark, you need to configure the SparkSession a

:::Note
Please download [Hadoop AWS jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws), [aws java sdk jar](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) and place them in the classpath of the Spark. If the JARs are missing, Spark will not be able to access the S3 storage.
Azure Blob Storage(ADLS) requires the [Hadoop Azure jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure), [Azure cloud sdk jar](https://mvnrepository.com/artifact/com.azure/azure-storage-blob) to be placed in the classpath of the Spark.
+for Google Cloud Storage(GCS), you need to download the [Hadoop GCS jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and place it in the classpath of the Spark.
:::
-By following these instructions, you can effectively manage and access your S3-stored data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
+By following these instructions, you can effectively manage and access your S3, ADLS or GCS data through both Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
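The documentation shows curl, Java, and Spark examples; as a small complement, a hypothetical Hive CLI/Beeline session can query the same GCS-backed tables once the metastore and connector JARs are set up as described above. The JDBC URL and table name below are placeholders; `hive_schema` is the schema from the earlier example.

```shell
# Hypothetical Beeline session; adjust the JDBC URL, schema, and table names to your environment.
beeline -u jdbc:hive2://localhost:10000 -e "SHOW TABLES IN hive_schema"
beeline -u jdbc:hive2://localhost:10000 -e "SELECT * FROM hive_schema.example_tbl LIMIT 10"
```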