This is an automated email from the ASF dual-hosted git repository.

jshao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git


The following commit(s) were added to refs/heads/main by this push:
     new 71a0d63cc [#5673] Add test and docs about how to use GCS in Hive 
(#5676)
71a0d63cc is described below

commit 71a0d63cc2214abbed17e8db64965b445689e911
Author: Qi Yu <[email protected]>
AuthorDate: Tue Dec 3 19:48:46 2024 +0800

    [#5673] Add test and docs about how to use GCS in Hive (#5676)
    
    ### What changes were proposed in this pull request?
    
    1. Release a new Hive Docker image that supports GCS.
    2. Add a related integration test based on the new image.
    
    ### Why are the changes needed?
    
    For users' convenience.
    
    Fix: #5673
    
    ### Does this PR introduce _any_ user-facing change?
    
    N/A
    
    ### How was this patch tested?
    
    new IT `CatalogHiveGCSIT`
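
    The test only runs when GCS is configured through environment variables
    (checked by `isGCSConfigured`). A minimal sketch of enabling it locally;
    the environment variable names come from the test itself, while the Gradle
    invocation is an assumption:

    ```shell
    # Both variables are read by CatalogHiveGCSIT; the test is skipped if either is blank.
    export GCS_BUCKET_NAME=my-test-bucket                      # hypothetical bucket name
    export GCS_SERVICE_ACCOUNT_JSON_PATH=/path/to/gcs-service-account.json

    # Assumed Gradle invocation for the catalog-hive integration tests.
    ./gradlew :catalogs:catalog-hive:test --tests "*CatalogHiveGCSIT"
    ```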
    
    ---------
    
    Co-authored-by: Jerry Shao <[email protected]>
---
 .github/workflows/backend-integration-test.yml     |   1 +
 build.gradle.kts                                   |   2 +-
 catalogs/catalog-hive/build.gradle.kts             |   1 +
 .../hive/integration/test/CatalogHiveGCSIT.java    | 109 +++++++++++++++++++++
 clients/cli/build.gradle.kts                       |   4 +
 dev/docker/hive/Dockerfile                         |   5 +
 dev/docker/hive/hive-dependency.sh                 |  12 +++
 dev/docker/hive/hive-site.xml                      |  10 ++
 dev/docker/hive/start.sh                           |   9 +-
 docs/docker-image-details.md                       |   6 +-
 ...-adls.md => hive-catalog-with-cloud-storage.md} |  40 ++++++--
 11 files changed, 185 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/backend-integration-test.yml 
b/.github/workflows/backend-integration-test.yml
index 495424c16..1958163f8 100644
--- a/.github/workflows/backend-integration-test.yml
+++ b/.github/workflows/backend-integration-test.yml
@@ -28,6 +28,7 @@ jobs:
               - clients/client-java/**
               - clients/client-java-runtime/**
               - clients/filesystem-hadoop3/**
+              - clients/cli/**
               - common/**
               - conf/**
               - core/**
diff --git a/build.gradle.kts b/build.gradle.kts
index 49aa2fe89..cc29ff4af 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -174,7 +174,7 @@ allprojects {
       param.environment("PROJECT_VERSION", project.version)
 
       // Gravitino CI Docker image
-      param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", 
"apache/gravitino-ci:hive-0.1.15")
+      param.environment("GRAVITINO_CI_HIVE_DOCKER_IMAGE", 
"apache/gravitino-ci:hive-0.1.16")
       param.environment("GRAVITINO_CI_KERBEROS_HIVE_DOCKER_IMAGE", 
"apache/gravitino-ci:kerberos-hive-0.1.5")
       param.environment("GRAVITINO_CI_DORIS_DOCKER_IMAGE", 
"apache/gravitino-ci:doris-0.1.5")
       param.environment("GRAVITINO_CI_TRINO_DOCKER_IMAGE", 
"apache/gravitino-ci:trino-0.1.6")
diff --git a/catalogs/catalog-hive/build.gradle.kts 
b/catalogs/catalog-hive/build.gradle.kts
index b328413df..b471fccea 100644
--- a/catalogs/catalog-hive/build.gradle.kts
+++ b/catalogs/catalog-hive/build.gradle.kts
@@ -130,6 +130,7 @@ dependencies {
   testImplementation(libs.testcontainers.localstack)
   testImplementation(libs.hadoop2.aws)
   testImplementation(libs.hadoop3.abs)
+  testImplementation(libs.hadoop3.gcs)
 
   // You need this to run test CatalogHiveABSIT as it required hadoop3 
environment introduced by hadoop3.abs
   // (The protocol `abfss` was first introduced in Hadoop 3.2.0), However, as 
the there already exists
diff --git 
a/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java
 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java
new file mode 100644
index 000000000..c69cf013e
--- /dev/null
+++ 
b/catalogs/catalog-hive/src/test/java/org/apache/gravitino/catalog/hive/integration/test/CatalogHiveGCSIT.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.gravitino.catalog.hive.integration.test;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.gravitino.integration.test.container.HiveContainer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.condition.EnabledIf;
+import org.testcontainers.shaded.com.google.common.collect.ImmutableMap;
+import org.testcontainers.utility.MountableFile;
+
+@EnabledIf(value = "isGCSConfigured", disabledReason = "Google Cloud 
Storage(GCS) is not prepared.")
+public class CatalogHiveGCSIT extends CatalogHiveIT {
+
+  private static final String GCS_BUCKET_NAME = 
System.getenv("GCS_BUCKET_NAME");
+  private static final String GCS_ACCOUNT_JSON_FILE =
+      System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH");
+  private static final String GCS_ACCOUNT_JSON_FILE_IN_CONTAINER = 
"/tmp/gcs-service-account.json";
+
+  @Override
+  protected void startNecessaryContainer() {
+    Map<String, String> hiveContainerEnv =
+        ImmutableMap.of(
+            "SERVICE_ACCOUNT_FILE",
+            GCS_ACCOUNT_JSON_FILE_IN_CONTAINER,
+            HiveContainer.HIVE_RUNTIME_VERSION,
+            HiveContainer.HIVE3);
+
+    containerSuite.startHiveContainerWithS3(hiveContainerEnv);
+
+    HIVE_METASTORE_URIS =
+        String.format(
+            "thrift://%s:%d",
+            containerSuite.getHiveContainerWithS3().getContainerIpAddress(),
+            HiveContainer.HIVE_METASTORE_PORT);
+
+    containerSuite
+        .getHiveContainerWithS3()
+        .getContainer()
+        .copyFileToContainer(
+            MountableFile.forHostPath(GCS_ACCOUNT_JSON_FILE), 
"/tmp/gcs-service-account.json");
+  }
+
+  @Override
+  protected void initFileSystem() throws IOException {
+    Configuration conf = new Configuration();
+
+    conf.set("fs.gs.auth.service.account.enable", "true");
+    conf.set("fs.gs.auth.service.account.json.keyfile", GCS_ACCOUNT_JSON_FILE);
+
+    String path = String.format("gs://%s/", GCS_BUCKET_NAME);
+    fileSystem = FileSystem.get(URI.create(path), conf);
+  }
+
+  @Override
+  protected void initSparkSession() {
+    sparkSession =
+        SparkSession.builder()
+            .master("local[1]")
+            .appName("Hive Catalog integration test")
+            .config("hive.metastore.uris", HIVE_METASTORE_URIS)
+            .config(
+                "spark.sql.warehouse.dir",
+                String.format(String.format("gs://%s/user/hive/warehouse", 
GCS_BUCKET_NAME)))
+            .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", 
GCS_ACCOUNT_JSON_FILE)
+            .config("spark.sql.storeAssignmentPolicy", "LEGACY")
+            .config("mapreduce.input.fileinputformat.input.dir.recursive", 
"true")
+            .enableHiveSupport()
+            .getOrCreate();
+  }
+
+  @Override
+  protected Map<String, String> createSchemaProperties() {
+    Map<String, String> properties = new HashMap<>();
+    properties.put("key1", "val1");
+    properties.put("key2", "val2");
+    properties.put(
+        "location", String.format("gs://%s/test-%s", GCS_BUCKET_NAME, 
System.currentTimeMillis()));
+    return properties;
+  }
+
+  private static boolean isGCSConfigured() {
+    return StringUtils.isNotBlank(System.getenv("GCS_BUCKET_NAME"))
+        && 
StringUtils.isNotBlank(System.getenv("GCS_SERVICE_ACCOUNT_JSON_PATH"));
+  }
+}
diff --git a/clients/cli/build.gradle.kts b/clients/cli/build.gradle.kts
index 8d42712ea..71608ee91 100644
--- a/clients/cli/build.gradle.kts
+++ b/clients/cli/build.gradle.kts
@@ -34,6 +34,10 @@ dependencies {
   testImplementation(libs.junit.jupiter.api)
   testImplementation(libs.junit.jupiter.params)
   testImplementation(libs.mockito.core)
+  testImplementation(libs.mysql.driver)
+  testImplementation(libs.postgresql.driver)
+  testImplementation(libs.testcontainers)
+
   testImplementation(project(":core")) {
     exclude("org.apache.logging.log4j")
   }
diff --git a/dev/docker/hive/Dockerfile b/dev/docker/hive/Dockerfile
index 1b4d4dd7f..cd79e2562 100644
--- a/dev/docker/hive/Dockerfile
+++ b/dev/docker/hive/Dockerfile
@@ -150,6 +150,11 @@ RUN ln -s /opt/hadoop-${HADOOP2_VERSION} ${HADOOP2_HOME}
 ADD packages/hadoop-${HADOOP3_VERSION}.tar.gz /opt/
 RUN ln -s /opt/hadoop-${HADOOP3_VERSION} ${HADOOP3_HOME}
 
+
+# Add gcs connector for hadoop2 and hadoop3
+ADD packages/gcs-connector-hadoop2-2.2.23-shaded.jar 
${HADOOP2_HOME}/share/hadoop/common/lib/gcs-connector-hadoop2-2.2.23-shaded.jar
+ADD packages/gcs-connector-hadoop3-2.2.23-shaded.jar 
${HADOOP3_HOME}/share/hadoop/common/lib/gcs-connector-hadoop3-2.2.23-shaded.jar
+
 # Add hadoop configuration to temporary directory
 ADD core-site.xml ${HADOOP_TMP_CONF_DIR}/core-site.xml
 ADD hadoop-env.sh ${HADOOP_TMP_CONF_DIR}/hadoop-env.sh
diff --git a/dev/docker/hive/hive-dependency.sh 
b/dev/docker/hive/hive-dependency.sh
index 2038dd001..e93361c3c 100755
--- a/dev/docker/hive/hive-dependency.sh
+++ b/dev/docker/hive/hive-dependency.sh
@@ -33,9 +33,13 @@ RANGER_VERSION="2.4.0" # Notice: Currently only tested 
Ranger plugin 2.4.0 in th
 
 HADOOP2_PACKAGE_NAME="hadoop-${HADOOP2_VERSION}.tar.gz"
 
HADOOP2_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP2_VERSION}/${HADOOP2_PACKAGE_NAME}";
+HADOOP2_GCS_PACKAGE_NAME="gcs-connector-hadoop2-2.2.23-shaded.jar"
+HADOOP2_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop2-2.2.23-shaded.jar";
 
 HADOOP3_PACKAGE_NAME="hadoop-${HADOOP3_VERSION}.tar.gz"
 
HADOOP3_DOWNLOAD_URL="https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP3_VERSION}/${HADOOP3_PACKAGE_NAME}";
+HADOOP3_GCS_PACKAGE_NAME="gcs-connector-hadoop3-2.2.23-shaded.jar"
+HADOOP3_GCS_DOWNLOAD_URL="https://github.com/GoogleCloudDataproc/hadoop-connectors/releases/download/v2.2.23/gcs-connector-hadoop3-2.2.23-shaded.jar";
 
 HIVE2_PACKAGE_NAME="apache-hive-${HIVE2_VERSION}-bin.tar.gz"
 
HIVE2_DOWNLOAD_URL="https://archive.apache.org/dist/hive/hive-${HIVE2_VERSION}/${HIVE2_PACKAGE_NAME}";
@@ -91,3 +95,11 @@ fi
 if [ ! -f "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" ]; then
   curl -L -s -o "${hive_dir}/packages/${RANGER_HIVE_PACKAGE_NAME}" 
${RANGER_HIVE_DOWNLOAD_URL}
 fi
+
+if [ ! -f "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" ]; then
+  curl -L -s -o "${hive_dir}/packages/${HADOOP2_GCS_PACKAGE_NAME}" 
${HADOOP2_GCS_DOWNLOAD_URL}
+fi
+
+if [ ! -f "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" ]; then
+  curl -L -s -o "${hive_dir}/packages/${HADOOP3_GCS_PACKAGE_NAME}" 
${HADOOP3_GCS_DOWNLOAD_URL}
+fi
\ No newline at end of file
diff --git a/dev/docker/hive/hive-site.xml b/dev/docker/hive/hive-site.xml
index c6a247e1a..1750539b7 100644
--- a/dev/docker/hive/hive-site.xml
+++ b/dev/docker/hive/hive-site.xml
@@ -73,4 +73,14 @@
     <value>ABS_ACCOUNT_KEY</value>
   </property>
 
+  <property>
+    <name>fs.gs.auth.service.account.enable</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.gs.auth.service.account.json.keyfile</name>
+    <value>SERVICE_ACCOUNT_FILE</value>
+  </property>
+
 </configuration>
diff --git a/dev/docker/hive/start.sh b/dev/docker/hive/start.sh
index 86ced4097..93ab35e30 100644
--- a/dev/docker/hive/start.sh
+++ b/dev/docker/hive/start.sh
@@ -31,8 +31,8 @@ else
   ln -s ${HADOOP2_HOME} ${HADOOP_HOME}
 fi
 
- cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
- cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib
+cp ${HADOOP_HOME}/share/hadoop/tools/lib/*aws* ${HIVE_HOME}/lib
+cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* ${HIVE_HOME}/lib
 
 # Copy Hadoop and Hive configuration file and update hostname
 cp -f ${HADOOP_TMP_CONF_DIR}/* ${HADOOP_CONF_DIR}
@@ -54,6 +54,11 @@ if [[ -n "${ABS_ACCOUNT_NAME}" && -n "${ABS_ACCOUNT_KEY}" 
]]; then
   sed -i "s|ABS_ACCOUNT_KEY|${ABS_ACCOUNT_KEY}|g" 
${HIVE_CONF_DIR}/hive-site.xml
 fi
 
+# Check whether GCS is configured
+if [[ -n "$SERVICE_ACCOUNT_FILE" ]]; then
+  sed -i "s|SERVICE_ACCOUNT_FILE|${SERVICE_ACCOUNT_FILE}|g" 
${HIVE_CONF_DIR}/hive-site.xml
+fi
+
 # Link mysql-connector-java after deciding where HIVE_HOME symbolic link 
points to.
 ln -s 
/opt/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}/mysql-connector-java-${MYSQL_JDBC_DRIVER_VERSION}.jar
 ${HIVE_HOME}/lib
 
diff --git a/docs/docker-image-details.md b/docs/docker-image-details.md
index fed00d83c..4e0a81093 100644
--- a/docs/docker-image-details.md
+++ b/docs/docker-image-details.md
@@ -168,8 +168,12 @@ Changelog
 You can use this kind of image to test the catalog of Apache Hive.
 
 Changelog
+- apache/gravitino-ci:hive-0.1.16
+  - Add GCS related configurations in the `hive-site.xml` file.
+  - Add the GCS connector jar to `${HADOOP_HOME}/share/hadoop/common/lib/`.
+
 - apache/gravitino-ci:hive-0.1.15
-  - Add ADLS related configurations in the `hive-site.xml` file.
+  - Add Azure Blob Storage(ADLS) related configurations in the `hive-site.xml` 
file.
 
 - apache/gravitino-ci:hive-0.1.14 
   - Add amazon S3 related configurations in the `hive-site.xml` file.
diff --git a/docs/hive-catalog-with-s3-and-adls.md 
b/docs/hive-catalog-with-cloud-storage.md
similarity index 84%
rename from docs/hive-catalog-with-s3-and-adls.md
rename to docs/hive-catalog-with-cloud-storage.md
index 41b8eef77..49a018907 100644
--- a/docs/hive-catalog-with-s3-and-adls.md
+++ b/docs/hive-catalog-with-cloud-storage.md
@@ -11,14 +11,13 @@ license: "This software is licensed under the Apache 
License version 2."
 
 Since Hive 2.x, Hive has supported S3 as a storage backend, enabling users to 
store and manage data in Amazon S3 directly through Hive. Gravitino enhances 
this capability by supporting the Hive catalog with S3, allowing users to 
efficiently manage the storage locations of files located in S3. This 
integration simplifies data operations and enables seamless access to S3 data 
from Hive queries.
 
-For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)), the 
integration is similar to S3. The only difference is the configuration 
properties for ADLS(see below). 
+For ADLS (aka. Azure Blob Storage (ABS), or Azure Data Lake Storage (v2)) and 
GCS (Google Cloud Storage), the integration is similar to S3. The only 
difference is the configuration properties for ADLS and GCS (see below). 
 
-The following sections will guide you through the necessary steps to configure 
the Hive catalog to utilize S3 and ADLS as a storage backend, including 
configuration details and examples for creating databases and tables.
+The following sections will guide you through the necessary steps to configure 
the Hive catalog to utilize S3, ADLS, and GCS as a storage backend, including 
configuration details and examples for creating databases and tables.
 
 ## Hive metastore configuration
 
-
-The following will mainly focus on configuring the Hive metastore to use S3 as 
a storage backend. The same configuration can be applied to ADLS with minor 
changes in the configuration properties. 
+The following will mainly focus on configuring the Hive metastore to use S3 as 
a storage backend. The same configuration can be applied to ADLS and GCS with 
minor changes in the configuration properties. 
 
 ### Example Configuration Changes
 
@@ -45,15 +44,14 @@ Below are the essential properties to add or modify in the 
`hive-site.xml` file
 definition and table definition, as shown in the examples below. After 
explicitly setting this
 property, you can omit the location property in the schema and table 
definitions.
 
-It's also applicable for ADLS.
+It's also applicable for Azure Blob Storage(ADLS) and GCS.
 -->
 <property>
   <name>hive.metastore.warehouse.dir</name>
   <value>S3_BUCKET_PATH</value>
 </property>
 
-
-<!-- The following are for Azure Blob Storage(ADLS) -->
+<!-- The following two configurations are for Azure Blob Storage(ADLS) -->
 <property>
   <name>fs.abfss.impl</name>
   <value>org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem</value>
@@ -64,6 +62,18 @@ It's also applicable for ADLS.
   <value>ABS_ACCOUNT_KEY</value>
 </property>
 
+<!-- The following two configurations are only for Google Cloud Storage(GCS) -->
+<property>
+  <name>fs.gs.auth.service.account.enable</name>
+  <value>true</value>
+</property>
+
+<!-- SERVICE_ACCOUNT_FILE should be a local or remote file that can be accessed by the Hive server -->
+<property>
+  <name>fs.gs.auth.service.account.json.keyfile</name>
+  <value>SERVICE_ACCOUNT_FILE</value>
+</property>
+
 ```
 
 ### Adding Required JARs
@@ -78,7 +88,6 @@ cp ${HADOOP_HOME}/share/hadoop/tools/lib/*azure* 
${HIVE_HOME}/lib
 
 Alternatively, you can download the required JARs from the Maven repository 
and place them in the Hive classpath. It is crucial to verify that the JARs are 
compatible with the version of Hadoop you are using to avoid any compatibility 
issue.
 
-
 ### Restart Hive metastore
 
 Once all configurations have been correctly set, restart the Hive cluster to 
apply the changes. This step is essential to ensure that the new configurations 
take effect and that the Hive services can communicate with S3.
@@ -105,6 +114,9 @@ curl -X POST -H "Accept: application/vnd.gravitino.v1+json" 
\
      
      # The following line is for Azure Blob Storage(ADLS)
      # "location": 
"abfss://[email protected]/path"
+     
+     # The following line is for Google Cloud Storage(GCS)
+     # "location": "gs://bucket-name/path"
   }
 }' http://localhost:8090/api/metalakes/metalake/catalogs/catalog/schemas
 ```
@@ -129,6 +141,9 @@ Map<String, String> schemaProperties = 
ImmutableMap.<String, String>builder()
     // The following line is for Azure Blob Storage(ADLS)
     // .put("location", 
"abfss://[email protected]/path")
     
+    // The following line is for Google Cloud Storage(GCS)
+    // .put("location", "gs://bucket-name/path")
+    
     .build();
 Schema schema = supportsSchemas.createSchema("hive_schema",
     "This is a schema",
@@ -225,13 +240,17 @@ To access S3-stored tables using Spark, you need to 
configure the SparkSession a
             .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.endpoint", 
getS3Endpoint)
             .config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.impl", 
"org.apache.hadoop.fs.s3a.S3AFileSystem")
 
-            ## This two is for Azure Blob Storage(ADLS) only
+            // These two are for Azure Blob Storage(ADLS) only
             .config(
                 String.format(
                     
"spark.sql.catalog.{hive_catalog_name}.fs.azure.account.key.%s.dfs.core.windows.net",
                     ABS_USER_ACCOUNT_NAME),
                 ABS_USER_ACCOUNT_KEY)
             .config("spark.sql.catalog.{hive_catalog_name}.fs.abfss.impl", 
"org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem")
+  
+            // These two are for Google Cloud Storage(GCS) only
+            
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.enable",
 "true")
+            
.config("spark.sql.catalog.{hive_catalog_name}.fs.gs.auth.service.account.json.keyfile",
 "SERVICE_ACCOUNT_FILE")
             
             
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.path.style.access", 
"true")
             
.config("spark.sql.catalog.{hive_catalog_name}.fs.s3a.connection.ssl.enabled", 
"false")
@@ -249,6 +268,7 @@ To access S3-stored tables using Spark, you need to 
configure the SparkSession a
 :::Note
 Please download [Hadoop AWS 
jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws), [aws 
java sdk 
jar](https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-bundle) and 
place them in the classpath of the Spark. If the JARs are missing, Spark will 
not be able to access the S3 storage.
 Azure Blob Storage(ADLS) requires the [Hadoop Azure 
jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure), [Azure 
cloud sdk jar](https://mvnrepository.com/artifact/com.azure/azure-storage-blob) 
to be placed in the classpath of the Spark.
+For Google Cloud Storage(GCS), you need to download the [Hadoop GCS jar](https://github.com/GoogleCloudDataproc/hadoop-connectors/releases) and place it in the classpath of Spark.
 :::
 
-By following these instructions, you can effectively manage and access your 
S3-stored data through both Hive CLI and Spark, leveraging the capabilities of 
Gravitino for optimal data management.
\ No newline at end of file
+By following these instructions, you can effectively manage and access your S3, ADLS, or GCS data through both the Hive CLI and Spark, leveraging the capabilities of Gravitino for optimal data management.
\ No newline at end of file
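
As a quick reference for the new `apache/gravitino-ci:hive-0.1.16` image: `start.sh` substitutes the `SERVICE_ACCOUNT_FILE` placeholder in `hive-site.xml` when that environment variable is set, which is how `CatalogHiveGCSIT` wires up GCS. A minimal sketch of running the image by hand, assuming a standard metastore port mapping; the `HIVE_RUNTIME_VERSION` variable name and the local key-file path are assumptions for illustration:

```shell
# Mount a GCS service-account key into the container and tell start.sh where it is;
# start.sh rewrites the SERVICE_ACCOUNT_FILE placeholder in hive-site.xml with this path.
docker run -d --name gravitino-ci-hive-gcs \
  -v /path/to/gcs-service-account.json:/tmp/gcs-service-account.json \
  -e SERVICE_ACCOUNT_FILE=/tmp/gcs-service-account.json \
  -e HIVE_RUNTIME_VERSION=hive3 \
  -p 9083:9083 \
  apache/gravitino-ci:hive-0.1.16
```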
