This is an automated email from the ASF dual-hosted git repository.
jshao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/main by this push:
new bcc592e478 [#6895] feat(GVFS): support specifying the location name in
GVFS (#6937)
bcc592e478 is described below
commit bcc592e478a8e83b6fb9b956751e758d6f1d6b6a
Author: mchades <[email protected]>
AuthorDate: Tue Apr 15 18:52:24 2025 +0800
[#6895] feat(GVFS): support specifying the location name in GVFS (#6937)
### What changes were proposed in this pull request?
support specifying the location name in GVFS
### Why are the changes needed?
since the fileset supports multiple locations, the GVFS should support
specifying the location when performing FS ops
Fix: #6895
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
tests added
---
.../hadoop/SecureHadoopCatalogOperations.java | 10 +--
clients/client-python/gravitino/filesystem/gvfs.py | 37 ++++++++++-
.../gravitino/filesystem/gvfs_config.py | 8 +++
.../tests/integration/test_gvfs_with_hdfs.py | 72 +++++++++++++++++++++-
.../hadoop/GravitinoVirtualFileSystem.java | 36 +++++++++--
.../GravitinoVirtualFileSystemConfiguration.java | 18 ++++++
.../gravitino/filesystem/hadoop/TestGvfsBase.java | 7 ++-
.../test/GravitinoVirtualFileSystemIT.java | 51 +++++++++++++++
docs/how-to-use-gvfs.md | 62 ++++++++++---------
9 files changed, 253 insertions(+), 48 deletions(-)
diff --git
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/SecureHadoopCatalogOperations.java
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/SecureHadoopCatalogOperations.java
index 2982080306..b57d71d226 100644
---
a/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/SecureHadoopCatalogOperations.java
+++
b/catalogs/catalog-hadoop/src/main/java/org/apache/gravitino/catalog/hadoop/SecureHadoopCatalogOperations.java
@@ -19,6 +19,8 @@
package org.apache.gravitino.catalog.hadoop;
+import static org.apache.gravitino.file.Fileset.PROPERTY_DEFAULT_LOCATION_NAME;
+
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import java.io.IOException;
@@ -29,7 +31,6 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import javax.security.auth.Subject;
-import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.Catalog;
import org.apache.gravitino.Entity;
import org.apache.gravitino.EntityStore;
@@ -267,12 +268,7 @@ public class SecureHadoopCatalogOperations
"No storage locations found for fileset: " + filesetIdentifier);
// todo: support multiple storage locations
- Preconditions.checkArgument(
- locations.size() == 1, "Only one storage location is supported for
fileset now");
-
- String path = locations.values().iterator().next();
- Preconditions.checkState(
- StringUtils.isNotBlank(path), "The location of fileset should not be
empty.");
+ String path =
locations.get(fileset.properties().get(PROPERTY_DEFAULT_LOCATION_NAME));
Set<String> providers =
CredentialUtils.getCredentialProvidersByOrder(
diff --git a/clients/client-python/gravitino/filesystem/gvfs.py
b/clients/client-python/gravitino/filesystem/gvfs.py
index 0dc020ee90..8c1d89c757 100644
--- a/clients/client-python/gravitino/filesystem/gvfs.py
+++ b/clients/client-python/gravitino/filesystem/gvfs.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
import logging
+import os
import sys
# Disable C0302: Too many lines in module
@@ -104,10 +105,14 @@ class
GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
access the underlying storage.
"""
+ # Disable R0902: Too many instance attributes
+ # pylint: disable=R0902
+
# Override the parent variable
protocol = PROTOCOL_NAME
_identifier_pattern =
re.compile("^fileset/([^/]+)/([^/]+)/([^/]+)(?:/[^/]+)*/?$")
SLASH = "/"
+ ENV_CURRENT_LOCATION_NAME_ENV_VAR_DEFAULT = "CURRENT_LOCATION_NAME"
def __init__(
self,
@@ -180,6 +185,7 @@ class GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
self._catalog_cache = LRUCache(maxsize=100)
self._catalog_cache_lock = rwlock.RWLockFair()
self._options = options
+ self._current_location_name = self._init_current_location_name()
super().__init__(**kwargs)
@@ -576,6 +582,24 @@ class
GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
**kwargs,
)
+ def _init_current_location_name(self):
+ """Initialize the current location name.
+ get from configuration first, otherwise use the env variable
+ if both are not set, return null which means use the default location
+ :return: The current location name
+ """
+ current_location_name_env_var = (
+
self._options.get(GVFSConfig.GVFS_FILESYSTEM_CURRENT_LOCATION_NAME_ENV_VAR)
+ if self._options
+ else None
+ ) or self.ENV_CURRENT_LOCATION_NAME_ENV_VAR_DEFAULT
+
+ return (
+ self._options.get(GVFSConfig.GVFS_FILESYSTEM_CURRENT_LOCATION_NAME)
+ if self._options
+ else None
+ ) or os.environ.get(current_location_name_env_var)
+
def _convert_actual_path(
self,
actual_path: str,
@@ -702,11 +726,17 @@ class
GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
) = fileset_catalog.as_fileset_catalog().get_file_location(
NameIdentifier.of(identifier.namespace().level(2),
identifier.name()),
sub_path,
+ self._current_location_name,
)
return FilesetContextPair(
actual_file_location,
- self._get_filesystem(actual_file_location, fileset_catalog,
identifier),
+ self._get_filesystem(
+ actual_file_location,
+ fileset_catalog,
+ identifier,
+ self._current_location_name,
+ ),
)
def _extract_identifier(self, path):
@@ -904,13 +934,14 @@ class
GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
actual_file_location: str,
fileset_catalog: Catalog,
name_identifier: NameIdentifier,
+ location_name: str,
):
storage_type = self._recognize_storage_type(actual_file_location)
read_lock = self._cache_lock.gen_rlock()
try:
read_lock.acquire()
cache_value: Tuple[int, AbstractFileSystem] = self._cache.get(
- name_identifier
+ (name_identifier, location_name)
)
if cache_value is not None:
if not self._file_system_expired(cache_value[0]):
@@ -958,7 +989,7 @@ class GravitinoVirtualFileSystem(fsspec.AbstractFileSystem):
raise GravitinoRuntimeException(
f"Storage type: `{storage_type}` doesn't support now."
)
- self._cache[name_identifier] = new_cache_value
+ self._cache[(name_identifier, location_name)] = new_cache_value
return new_cache_value[1]
finally:
write_lock.release()
diff --git a/clients/client-python/gravitino/filesystem/gvfs_config.py
b/clients/client-python/gravitino/filesystem/gvfs_config.py
index 34db72adee..77dde31e06 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_config.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_config.py
@@ -52,3 +52,11 @@ class GVFSConfig:
# The default value of the credential_expired_time_ratio is 0.5
DEFAULT_CREDENTIAL_EXPIRED_TIME_RATIO = 0.5
+
+ # The configuration key for the fileset with multiple locations, on which
the file system will operate.
+ # The default value is "default".
+ GVFS_FILESYSTEM_CURRENT_LOCATION_NAME = "current_location_name"
+
+ # The configuration key for the env variable name that indicates the
current location name. If
+ # not set, the file system will read the location name from
CURRENT_LOCATION_NAME env variable.
+ GVFS_FILESYSTEM_CURRENT_LOCATION_NAME_ENV_VAR =
"current_location_name_env_var"
diff --git a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
index 8b1c367bc5..29ccc71390 100644
--- a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
+++ b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
@@ -43,6 +43,7 @@ from gravitino import (
)
from gravitino.auth.auth_constants import AuthConstants
from gravitino.exceptions.base import GravitinoRuntimeException
+from gravitino.filesystem.gvfs_config import GVFSConfig
from tests.integration.integration_test_env import IntegrationTestEnv
from tests.integration.containers.hdfs_container import HDFSContainer
from tests.integration.base_hadoop_env import BaseHadoopEnvironment
@@ -69,8 +70,11 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
schema_name: str = "test_gvfs_schema"
fileset_name: str = "test_gvfs_fileset"
+ multiple_locations_fileset_name: str =
"test_gvfs_multiple_locations_fileset"
fileset_comment: str = "fileset_comment"
fileset_storage_location: str = ""
+ multiple_locations_fileset_storage_location: str = ""
+ multiple_locations_fileset_storage_location1: str = ""
fileset_properties_key1: str = "fileset_properties_key1"
fileset_properties_value1: str = "fileset_properties_value1"
fileset_properties_key2: str = "fileset_properties_key2"
@@ -84,6 +88,9 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
metalake_name, catalog_name, schema_name
)
fileset_ident: NameIdentifier = NameIdentifier.of(schema_name,
fileset_name)
+ multiple_locations_fileset_ident: NameIdentifier = NameIdentifier.of(
+ schema_name, multiple_locations_fileset_name
+ )
gravitino_admin_client: GravitinoAdminClient = GravitinoAdminClient(
uri="http://localhost:8090"
@@ -128,6 +135,13 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
if cls.hdfs_container is not None:
cls.hdfs_container.close()
+ def tearDown(self):
+ fs = gvfs.GravitinoVirtualFileSystem(
+ server_uri="http://localhost:8090",
+ metalake_name=self.metalake_name,
+ )
+ fs.clear_instance_cache()
+
@classmethod
def _init_test_entities(cls):
cls.gravitino_admin_client.create_metalake(
@@ -160,6 +174,33 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
storage_location=cls.fileset_storage_location,
properties=cls.fileset_properties,
)
+
+ cls.multiple_locations_fileset_storage_location: str = (
+
f"hdfs://{cls.hdfs_container.get_ip()}:9000/{cls.catalog_name}/{cls.schema_name}/"
+ f"{cls.multiple_locations_fileset_name}"
+ )
+ cls.multiple_locations_fileset_storage_location1: str = (
+
f"hdfs://{cls.hdfs_container.get_ip()}:9000/{cls.catalog_name}/{cls.schema_name}/"
+ f"{cls.multiple_locations_fileset_name}_1"
+ )
+ cls.multiple_locations_fileset_gvfs_location = (
+ f"gvfs://fileset/{cls.catalog_name}/{cls.schema_name}/"
+ f"{cls.multiple_locations_fileset_name}"
+ )
+ catalog.as_fileset_catalog().create_multiple_location_fileset(
+ ident=cls.multiple_locations_fileset_ident,
+ fileset_type=Fileset.Type.MANAGED,
+ comment=cls.fileset_comment,
+ storage_locations={
+ "default": cls.multiple_locations_fileset_storage_location,
+ "location1": cls.multiple_locations_fileset_storage_location1,
+ },
+ properties={
+ Fileset.PROPERTY_DEFAULT_LOCATION_NAME: "default",
+ **cls.fileset_properties,
+ },
+ )
+
arrow_hadoop_fs = HadoopFileSystem(host=cls.hdfs_container.get_ip(),
port=9000)
cls.fs = ArrowFSWrapper(arrow_hadoop_fs)
cls.conf: Dict = {"fs.defaultFS":
f"hdfs://{cls.hdfs_container.get_ip()}:9000/"}
@@ -280,6 +321,35 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
file_info = fs.info(info_file)
self.assertEqual(file_info["name"], info_file[len("gvfs://") :])
+ def test_current_location_name(self):
+ fs = gvfs.GravitinoVirtualFileSystem(
+ server_uri="http://localhost:8090",
+ metalake_name=self.metalake_name,
+ options={
+ f"{GVFSConfig.GVFS_FILESYSTEM_CURRENT_LOCATION_NAME}":
"location1"
+ },
+ )
+
+ exist_file = (
+ self.multiple_locations_fileset_gvfs_location +
"/test_exist/test.file"
+ )
+ exist_actual_default_file = (
+ self.multiple_locations_fileset_storage_location +
"/test_exist/test.file"
+ )
+ exist_actual_location1_file = (
+ self.multiple_locations_fileset_storage_location1 +
"/test_exist/test.file"
+ )
+ self.fs.touch(exist_actual_default_file)
+ self.assertTrue(self.fs.exists(exist_actual_default_file))
+ self.assertFalse(self.fs.exists(exist_actual_location1_file))
+ # fs.exists will use location1
+ self.assertFalse(fs.exists(exist_file))
+
+ self.fs.touch(exist_actual_location1_file)
+ self.assertTrue(self.fs.exists(exist_actual_location1_file))
+ # fs.exists will use location1
+ self.assertTrue(fs.exists(exist_file))
+
def test_exist(self):
exist_dir = self.fileset_gvfs_location + "/test_exist"
exist_actual_dir = self.fileset_storage_location + "/test_exist"
@@ -516,7 +586,7 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
)
self.check_makedirs(makedirs_dir, makedirs_actual_dir, fs)
- # test mkdir dir not exist
+ # test mkdirs dir not exist
parent_not_exist_virtual_path = makedirs_dir + "/not_exist/sub_dir"
self.assertFalse(fs.exists(parent_not_exist_virtual_path))
fs.makedirs(parent_not_exist_virtual_path)
diff --git
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
index 67bfe961a2..10000b3da2 100644
---
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
+++
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystem.java
@@ -18,6 +18,8 @@
*/
package org.apache.gravitino.filesystem.hadoop;
+import static
org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CURRENT_LOCATION_NAME;
+
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.Scheduler;
@@ -34,6 +36,7 @@ import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.concurrent.ScheduledThreadPoolExecutor;
@@ -42,8 +45,10 @@ import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import javax.annotation.Nullable;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.FieldUtils;
+import org.apache.commons.lang3.tuple.Pair;
import org.apache.gravitino.Catalog;
import org.apache.gravitino.NameIdentifier;
import org.apache.gravitino.audit.CallerContext;
@@ -88,9 +93,9 @@ public class GravitinoVirtualFileSystem extends FileSystem {
private String metalakeName;
private Cache<NameIdentifier, FilesetCatalog> catalogCache;
private ScheduledThreadPoolExecutor catalogCleanScheduler;
- // Fileset name identifier and its corresponding FileSystem cache, the name
identifier has
- // four levels, the first level is metalake name.
- private Cache<NameIdentifier, FileSystem> internalFileSystemCache;
+ // Fileset nameIdentifier-locationName Pair and its corresponding FileSystem
cache, the name
+ // identifier has four levels, the first level is metalake name.
+ private Cache<Pair<NameIdentifier, String>, FileSystem>
internalFileSystemCache;
private ScheduledThreadPoolExecutor internalFileSystemCleanScheduler;
// The pattern is used to match gvfs path. The scheme prefix
(gvfs://fileset) is optional.
@@ -101,6 +106,9 @@ public class GravitinoVirtualFileSystem extends FileSystem {
Pattern.compile("^(?:gvfs://fileset)?/([^/]+)/([^/]+)/([^/]+)(?>/[^/]+)*/?$");
private static final String SLASH = "/";
private final Map<String, FileSystemProvider> fileSystemProvidersMap =
Maps.newHashMap();
+ private String currentLocationEnvVar;
+
+ @Nullable private String currentLocationName;
private static final Set<String> CATALOG_NECESSARY_PROPERTIES_TO_KEEP =
Sets.newHashSet(
@@ -155,6 +163,13 @@ public class GravitinoVirtualFileSystem extends FileSystem
{
// Register the default local and HDFS FileSystemProvider
fileSystemProvidersMap.putAll(getFileSystemProviders());
+ this.currentLocationEnvVar =
+ configuration.get(
+
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CURRENT_LOCATION_NAME_ENV_VAR,
+ GravitinoVirtualFileSystemConfiguration
+ .FS_GRAVITINO_CURRENT_LOCATION_NAME_ENV_VAR_DEFAULT);
+ this.currentLocationName = initCurrentLocationName(configuration);
+
this.workingDirectory = new Path(name);
this.uri = URI.create(name.getScheme() + "://" + name.getAuthority());
@@ -163,7 +178,7 @@ public class GravitinoVirtualFileSystem extends FileSystem {
}
@VisibleForTesting
- Cache<NameIdentifier, FileSystem> internalFileSystemCache() {
+ Cache<Pair<NameIdentifier, String>, FileSystem> internalFileSystemCache() {
return internalFileSystemCache;
}
@@ -282,7 +297,9 @@ public class GravitinoVirtualFileSystem extends FileSystem {
String actualFileLocation =
filesetCatalog.getFileLocation(
- NameIdentifier.of(identifier.namespace().level(2),
identifier.name()), subPath);
+ NameIdentifier.of(identifier.namespace().level(2),
identifier.name()),
+ subPath,
+ currentLocationName);
Path filePath = new Path(actualFileLocation);
URI uri = filePath.toUri();
@@ -292,7 +309,7 @@ public class GravitinoVirtualFileSystem extends FileSystem {
StringUtils.isNotBlank(scheme), "Scheme of the actual file location
cannot be null.");
FileSystem fs =
internalFileSystemCache.get(
- identifier,
+ Pair.of(identifier, currentLocationName),
ident -> {
try {
FileSystemProvider provider =
fileSystemProvidersMap.get(scheme);
@@ -580,6 +597,13 @@ public class GravitinoVirtualFileSystem extends FileSystem
{
super.close();
}
+ private String initCurrentLocationName(Configuration configuration) {
+ // get from configuration first, otherwise use the env variable
+ // if both are not set, return null which means use the default location
+ return
Optional.ofNullable(configuration.get(FS_GRAVITINO_CURRENT_LOCATION_NAME))
+ .orElse(System.getenv(currentLocationEnvVar));
+ }
+
private static class FilesetContextPair {
private final Path actualFileLocation;
private final FileSystem fileSystem;
diff --git
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
index e2bce73453..544cc0ccad 100644
---
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
+++
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
@@ -98,5 +98,23 @@ public class GravitinoVirtualFileSystemConfiguration {
public static final long
FS_GRAVITINO_FILESET_CACHE_EVICTION_MILLS_AFTER_ACCESS_DEFAULT =
1000L * 60 * 60;
+ /**
+ * The configuration key for the fileset with multiple locations, on which
the file system will
+ * operate. If not set, the file system will operate on the default location.
+ */
+ public static final String FS_GRAVITINO_CURRENT_LOCATION_NAME =
+ "fs.gravitino.current.location.name";
+
+ /**
+ * The configuration key for the env variable name that indicates the
current location name. If
+ * not set, the file system will read the location name from
CURRENT_LOCATION_NAME env variable.
+ */
+ public static final String FS_GRAVITINO_CURRENT_LOCATION_NAME_ENV_VAR =
+ "fs.gravitino.current.location.name.env.var";
+
+ /** The default env variable to read from to get current location name. */
+ public static final String
FS_GRAVITINO_CURRENT_LOCATION_NAME_ENV_VAR_DEFAULT =
+ "CURRENT_LOCATION_NAME";
+
private GravitinoVirtualFileSystemConfiguration() {}
}
diff --git
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
index 9c8a863eaa..6403a0200e 100644
---
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
+++
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
@@ -63,6 +63,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
+import org.testcontainers.shaded.org.apache.commons.lang3.tuple.Pair;
public class TestGvfsBase extends GravitinoMockServerBase {
protected static final String GVFS_IMPL_CLASS =
GravitinoVirtualFileSystem.class.getName();
@@ -159,7 +160,9 @@ public class TestGvfsBase extends GravitinoMockServerBase {
((GravitinoVirtualFileSystem) gravitinoFileSystem)
.internalFileSystemCache()
.getIfPresent(
- NameIdentifier.of(metalakeName, catalogName, schemaName,
"testFSCache")));
+ Pair.of(
+ NameIdentifier.of(metalakeName, catalogName,
schemaName, "testFSCache"),
+ null)));
String anotherFilesetName = "test_new_fs";
Path diffLocalPath =
@@ -213,7 +216,7 @@ public class TestGvfsBase extends GravitinoMockServerBase {
assertNull(
((GravitinoVirtualFileSystem) fs)
.internalFileSystemCache()
- .getIfPresent(NameIdentifier.of("file")));
+ .getIfPresent(Pair.of(NameIdentifier.of("file"), null)));
}
}
diff --git
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java
index 7563815e0a..8de5267f19 100644
---
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java
+++
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemIT.java
@@ -18,8 +18,11 @@
*/
package org.apache.gravitino.filesystem.hadoop.integration.test;
+import static org.apache.gravitino.file.Fileset.LOCATION_NAME_UNKNOWN;
+import static org.apache.gravitino.file.Fileset.PROPERTY_DEFAULT_LOCATION_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -35,6 +38,7 @@ import org.apache.gravitino.Catalog;
import org.apache.gravitino.NameIdentifier;
import org.apache.gravitino.client.GravitinoMetalake;
import org.apache.gravitino.file.Fileset;
+import
org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystemConfiguration;
import org.apache.gravitino.integration.test.container.ContainerSuite;
import org.apache.gravitino.integration.test.container.HiveContainer;
import org.apache.gravitino.integration.test.util.BaseIT;
@@ -118,6 +122,53 @@ public class GravitinoVirtualFileSystemIT extends BaseIT {
return gvfsConf;
}
+ @Test
+ public void testCurrentLocationName() throws IOException {
+ // create multiple locations fileset
+ String filesetName =
GravitinoITUtils.genRandomName("test_location_selector");
+ NameIdentifier filesetIdent = NameIdentifier.of(schemaName, filesetName);
+ Catalog catalog = metalake.loadCatalog(catalogName);
+ String defaultStorageLocation = genStorageLocation(filesetName);
+ String storageLocation1 = genStorageLocation(filesetName + "_1");
+ String locationName1 = "location1";
+ catalog
+ .asFilesetCatalog()
+ .createMultipleLocationFileset(
+ filesetIdent,
+ "fileset comment",
+ Fileset.Type.MANAGED,
+ ImmutableMap.of(
+ LOCATION_NAME_UNKNOWN, defaultStorageLocation, locationName1,
storageLocation1),
+ ImmutableMap.of(PROPERTY_DEFAULT_LOCATION_NAME,
LOCATION_NAME_UNKNOWN));
+
Assertions.assertTrue(catalog.asFilesetCatalog().filesetExists(filesetIdent));
+
+ // set location1 to current location
+ Configuration configuration = new Configuration(conf);
+ configuration.set(
+
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_CURRENT_LOCATION_NAME,
locationName1);
+
+ Path hdfsPath1 = new Path(storageLocation1);
+ try (FileSystem fs =
+
hdfsPath1.getFileSystem(convertGvfsConfigToRealFileSystemConfig(configuration)))
{
+ Path gvfsPath = genGvfsPath(filesetName);
+ try (FileSystem gvfs = gvfsPath.getFileSystem(configuration)) {
+ if (!gvfs.exists(gvfsPath)) {
+ gvfs.mkdirs(gvfsPath);
+ }
+ String fileName = "test.txt";
+ Path createPath = new Path(gvfsPath + "/" + fileName);
+ // GCS need to close the stream to create the file manually.
+ gvfs.create(createPath).close();
+
+ Assertions.assertTrue(gvfs.exists(createPath));
+ Assertions.assertTrue(fs.exists(new Path(storageLocation1 + "/" +
fileName)));
+ Assertions.assertFalse(fs.exists(new Path(defaultStorageLocation + "/"
+ fileName)));
+ }
+ }
+
+ catalog.asFilesetCatalog().dropFileset(filesetIdent);
+ }
+
@Test
public void testCreate() throws IOException {
// create fileset
diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md
index bbe975851e..e31ab902a0 100644
--- a/docs/how-to-use-gvfs.md
+++ b/docs/how-to-use-gvfs.md
@@ -48,23 +48,25 @@ the path mapping and convert automatically.
### Configuration
-| Configuration item | Description
| Default value | Required |
Since version |
-|-------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-------------------------------------|-----------------|
-| `fs.AbstractFileSystem.gvfs.impl` | The Gravitino
Virtual File System abstract class, set it to
`org.apache.gravitino.filesystem.hadoop.Gvfs`.
| (none) |
Yes | 0.5.0 |
-| `fs.gvfs.impl` | The Gravitino
Virtual File System implementation class, set it to
`org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem`.
| (none) | Yes
| 0.5.0 |
-| `fs.gvfs.impl.disable.cache` | Disable the
Gravitino Virtual File System cache in the Hadoop environment. If you need to
proxy multi-user operations, please set this value to `true` and create a
separate File System for each user. | `false` | No
| 0.5.0 |
-| `fs.gravitino.server.uri` | The Gravitino server
URI which GVFS needs to load the fileset metadata.
| (none) | Yes |
0.5.0 |
-| `fs.gravitino.client.metalake` | The metalake to
which the fileset belongs.
| (none) | Yes |
0.5.0 |
-| `fs.gravitino.client.authType` | The auth type to
initialize the Gravitino client to use with the Gravitino Virtual File System.
Currently only supports `simple`, `oauth2` and `kerberos` auth types.
| `simple` | No |
0.5.0 |
-| `fs.gravitino.client.oauth2.serverUri` | The auth server URI
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System.
| (none) | Yes if you use `oauth2` auth type
| 0.5.0 |
-| `fs.gravitino.client.oauth2.credential` | The auth credential
for the Gravitino client when using `oauth2` auth type in the Gravitino Virtual
File System.
| (none) | Yes if you use `oauth2` auth type |
0.5.0 |
-| `fs.gravitino.client.oauth2.path` | The auth server path
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System. Please remove the first slash `/` from the path, for
example `oauth/token`. | (none) | Yes if you use `oauth2` auth
type | 0.5.0 |
-| `fs.gravitino.client.oauth2.scope` | The auth scope for
the Gravitino client when using `oauth2` auth type with the Gravitino Virtual
File System.
| (none) | Yes if you use `oauth2` auth type |
0.5.0 |
-| `fs.gravitino.client.kerberos.principal` | The auth principal
for the Gravitino client when using `kerberos` auth type with the Gravitino
Virtual File System.
| (none) | Yes if you use `kerberos` auth type
| 0.5.1 |
-| `fs.gravitino.client.kerberos.keytabFilePath` | The auth keytab file
path for the Gravitino client when using `kerberos` auth type in the Gravitino
Virtual File System.
| (none) | No |
0.5.1 |
-| `fs.gravitino.fileset.cache.maxCapacity` | The cache capacity
of the Gravitino Virtual File System.
| `20` | No |
0.5.0 |
-| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000` | No
| 0.5.0 |
-| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000` | No
| 0.5.0 |
+| Configuration item | Description
| Default value
| Required | Since version
|
+|-------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|-------------------------------------|------------------|
+| `fs.AbstractFileSystem.gvfs.impl` | The Gravitino
Virtual File System abstract class, set it to
`org.apache.gravitino.filesystem.hadoop.Gvfs`.
| (none) | Yes
| 0.5.0 |
+| `fs.gvfs.impl` | The Gravitino
Virtual File System implementation class, set it to
`org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem`.
| (none) | Yes
| 0.5.0 |
+| `fs.gvfs.impl.disable.cache` | Disable the
Gravitino Virtual File System cache in the Hadoop environment. If you need to
proxy multi-user operations, please set this value to `true` and create a
separate File System for each user.
| `false`
| No
| 0.5.0 |
+| `fs.gravitino.server.uri` | The Gravitino server
URI which GVFS needs to load the fileset metadata.
| (none)
| Yes | 0.5.0
|
+| `fs.gravitino.client.metalake` | The metalake to
which the fileset belongs.
| (none)
| Yes | 0.5.0
|
+| `fs.gravitino.client.authType` | The auth type to
initialize the Gravitino client to use with the Gravitino Virtual File System.
Currently only supports `simple`, `oauth2` and `kerberos` auth types.
| `simple`
| No | 0.5.0
|
+| `fs.gravitino.client.oauth2.serverUri` | The auth server URI
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if you use `oauth2` auth type | 0.5.0
|
+| `fs.gravitino.client.oauth2.credential` | The auth credential
for the Gravitino client when using `oauth2` auth type in the Gravitino Virtual
File System.
| (none)
| Yes if you use `oauth2` auth type | 0.5.0
|
+| `fs.gravitino.client.oauth2.path` | The auth server path
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System. Please remove the first slash `/` from the path, for
example `oauth/token`.
| (none)
| Yes if you use `oauth2` auth type |
0.5.0 |
+| `fs.gravitino.client.oauth2.scope` | The auth scope for
the Gravitino client when using `oauth2` auth type with the Gravitino Virtual
File System.
| (none)
| Yes if you use `oauth2` auth type | 0.5.0
|
+| `fs.gravitino.client.kerberos.principal` | The auth principal
for the Gravitino client when using `kerberos` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if you use `kerberos` auth type | 0.5.1
|
+| `fs.gravitino.client.kerberos.keytabFilePath` | The auth keytab file
path for the Gravitino client when using `kerberos` auth type in the Gravitino
Virtual File System.
| (none)
| No | 0.5.1
|
+| `fs.gravitino.fileset.cache.maxCapacity` | The cache capacity
of the Gravitino Virtual File System.
| `20`
| No | 0.5.0
|
+| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000`
| No | 0.5.0
|
+| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000`
| No | 0.5.0
|
+| `fs.gravitino.current.location.name` | The configuration
used to select the location of the fileset. If this configuration is not set,
the value of environment variable configured by
`fs.gravitino.current.location.env.var` will be checked. If neither is set, the
value of fileset property `default-location-name` will be used as the location
name. | the value of fileset property `default-location-name` | No
| 0.9.0-incubating |
+| `fs.gravitino.current.location.name.env.var` | The environment
variable name to get the current location name.
| `CURRENT_LOCATION_NAME`
| No |
0.9.0-incubating |
Apart from the above properties, to access fileset like S3, GCS, OSS and
custom fileset, extra properties are needed, please see
[S3 GVFS Java client
configurations](./hadoop-catalog-with-s3.md#using-the-gvfs-java-client-to-access-the-fileset),
[GCS GVFS Java client
configurations](./hadoop-catalog-with-gcs.md#using-the-gvfs-java-client-to-access-the-fileset),
[OSS GVFS Java client
configurations](./hadoop-catalog-with-oss.md#using-the-gvfs-java-client-to-access-the-fileset)
and [Azure Blob Storage GVFS Java client
configurations](./hadoop-catalog-with-adls.md#using-the-gvfs-java-client-to-access-the-fileset)
for [...]
@@ -343,18 +345,20 @@ to recompile the native libraries like `libhdfs` and
others, and completely repl
### Configuration
-| Configuration item | Description
| Default value | Required | Since version |
-|-------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-----------------------------------|------------------|
-| `server_uri` | The Gravitino server uri, e.g.
`http://localhost:8090`.
| (none) | Yes |
0.6.0-incubating |
-| `metalake_name` | The metalake name which the fileset belongs
to.
| (none) | Yes | 0.6.0-incubating |
-| `cache_size` | The cache capacity of the Gravitino Virtual
File System.
| `20` | No | 0.6.0-incubating |
[...]
-| `cache_expired_time` | The value of time that the cache expires
after accessing in the Gravitino Virtual File System. The value is in
`seconds`.
| `3600` | No |
0.6.0-incubating |
-| `auth_type` | The auth type to initialize the Gravitino
client to use with the Gravitino Virtual File System. Currently supports
`simple` and `oauth2` auth types.
| `simple` | No |
0.6.0-incubating |
-| `oauth2_server_uri` | The auth server URI for the Gravitino client
when using `oauth2` auth type.
| (none) | Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `oauth2_credential` | The auth credential for the Gravitino client
when using `oauth2` auth type.
| (none) | Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `oauth2_path` | The auth server path for the Gravitino
client when using `oauth2` auth type. Please remove the first slash `/` from
the path, for example `oauth/token`.
| (none) | Yes if you use `oauth2` auth type |
0.7.0-incubating |
-| `oauth2_scope` | The auth scope for the Gravitino client when
using `oauth2` auth type with the Gravitino Virtual File System.
| (none) | Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `credential_expiration_ratio` | The ratio of expiration time for credential
from Gravitino. This is used in the cases where Gravitino Hadoop catalogs have
enable credential vending. if the expiration time of credential fetched from
Gravitino is 1 hour, GVFS client will try to refresh the credential in 1 * 0.9
= 0.5 hour. | 0.5 | No |
0.8.0-incubating |
+| Configuration item | Description
| Default value |
Required | Since version |
+|---------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|-----------------------------------|------------------|
+| `server_uri` | The Gravitino server uri, e.g.
`http://localhost:8090`.
| (none)
| Yes | 0.6.0-incubating |
+| `metalake_name` | The metalake name which the fileset
belongs to.
| (none)
| Yes | 0.6.0-incubating |
+| `cache_size` | The cache capacity of the Gravitino
Virtual File System.
| `20`
| No | 0.6.0-incubating |
[...]
+| `cache_expired_time` | The value of time that the cache expires
after accessing in the Gravitino Virtual File System. The value is in
`seconds`.
| `3600`
| No | 0.6.0-incubating |
+| `auth_type` | The auth type to initialize the Gravitino
client to use with the Gravitino Virtual File System. Currently supports
`simple` and `oauth2` auth types.
| `simple`
| No | 0.6.0-incubating |
+| `oauth2_server_uri` | The auth server URI for the Gravitino
client when using `oauth2` auth type.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
+| `oauth2_credential` | The auth credential for the Gravitino
client when using `oauth2` auth type.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
+| `oauth2_path` | The auth server path for the Gravitino
client when using `oauth2` auth type. Please remove the first slash `/` from
the path, for example `oauth/token`.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
+| `oauth2_scope` | The auth scope for the Gravitino client
when using `oauth2` auth type with the Gravitino Virtual File System.
| (none) |
Yes if you use `oauth2` auth type | 0.7.0-incubating |
+| `credential_expiration_ratio` | The ratio of expiration time for
credentials from Gravitino. This is used in the cases where Gravitino Hadoop
catalogs have enabled credential vending. If the expiration time of the
credential fetched from Gravitino is 1 hour, the GVFS client will try to
refresh the credential in 1 * 0.5 = 0.5 hours. | 0.5
| No | 0.8.0-incubating |
+| `current_location_name` | The configuration used to select the
location of the fileset. If this configuration is not set, the value of
environment variable configured by `current_location_name_env_var` will be
checked. If neither is set, the value of fileset property
`default-location-name` will be used as the location name. | the value of
fileset property `default-location-name` | No |
0.9.0-incubating |
+| `current_location_name_env_var` | The environment variable name to get the
current location name.
| `CURRENT_LOCATION_NAME` |
No | 0.9.0-incubating |
#### Configurations for S3, GCS, OSS and Azure Blob storage fileset