This is an automated email from the ASF dual-hosted git repository.
mchades pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/main by this push:
new d3271df357 [#9312] feat (client-python): Support multiple cluster
fileset in the Gravitino python client (#9570)
d3271df357 is described below
commit d3271df3572a72eebb23afa5cec06e6ef0e5abdc
Author: Yuhui <[email protected]>
AuthorDate: Thu Feb 12 16:40:53 2026 +0800
[#9312] feat (client-python): Support multiple cluster fileset in the
Gravitino python client (#9570)
### What changes were proposed in this pull request?
Support multiple-cluster filesets in the Gravitino Python client
### Why are the changes needed?
Fix: #9312
### Does this PR introduce _any_ user-facing change?
Update docs
### How was this patch tested?
Manually tested
---------
Co-authored-by: Copilot <[email protected]>
---
.../catalog/hadoop/fs/FileSystemUtils.java | 2 +-
.../gravitino/filesystem/gvfs_base_operations.py | 175 +++++++++++++++++++--
.../gravitino/filesystem/gvfs_config.py | 13 ++
.../gravitino/filesystem/gvfs_storage_handler.py | 85 +++-------
.../tests/integration/integration_test_env.py | 7 +-
.../tests/integration/test_gvfs_with_hdfs.py | 4 -
clients/client-python/tests/unittests/mock_base.py | 20 +++
.../tests/unittests/test_filesystem_cache.py | 36 ++---
.../unittests/test_gvfs_user_defined_configs.py | 127 +++++++++++++++
.../tests/unittests/test_storage_handler.py | 5 -
.../GravitinoVirtualFileSystemConfiguration.java | 5 +-
docs/how-to-use-gvfs.md | 130 +++++++++------
12 files changed, 449 insertions(+), 160 deletions(-)
diff --git
a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java
b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java
index f9d38e6975..0c1b00cbee 100644
---
a/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java
+++
b/catalogs/hadoop-common/src/main/java/org/apache/gravitino/catalog/hadoop/fs/FileSystemUtils.java
@@ -49,7 +49,7 @@ public class FileSystemUtils {
ServiceLoader<FileSystemProvider> allFileSystemProviders =
ServiceLoader.load(FileSystemProvider.class);
- // Only get the file system providers that are in the user list and check
if the scheme is
+ // Load all available file system providers from the ServiceLoader and
ensure each scheme is
// unique.
Streams.stream(allFileSystemProviders.iterator())
.forEach(
diff --git a/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
b/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
index 776e980b00..72c37bba0a 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
@@ -31,6 +31,7 @@ from readerwriterlock import rwlock
from gravitino.api.credential.credential import Credential
from gravitino.api.file.fileset import Fileset
+from gravitino.api.schema import Schema
from gravitino.audit.caller_context import CallerContextHolder, CallerContext
from gravitino.audit.fileset_audit_constants import FilesetAuditConstants
from gravitino.audit.fileset_data_operation import FilesetDataOperation
@@ -79,8 +80,7 @@ class FileSystemCacheKey:
scheme: str,
authority: Optional[str],
credentials,
- catalog_props: Dict[str, str],
- options: Dict[str, str],
+ fileset_props: Dict[str, str],
extra_kwargs: Dict,
):
"""
@@ -90,8 +90,7 @@ class FileSystemCacheKey:
scheme: The filesystem scheme (e.g., 's3', 'gs', 'hdfs', 'file')
authority: The authority part of the URI (e.g., bucket name,
host:port)
credentials: The credentials for the filesystem
- catalog_props: The catalog properties
- options: The GVFS options
+ fileset_props: The fileset properties
extra_kwargs: Extra keyword arguments
"""
self._pid = os.getpid()
@@ -105,8 +104,7 @@ class FileSystemCacheKey:
scheme,
authority,
credentials,
- catalog_props,
- options,
+ fileset_props,
extra_kwargs,
)
@@ -232,6 +230,9 @@ class BaseGVFSOperations(ABC):
self._fileset_cache = LRUCache(maxsize=10000)
self._fileset_cache_lock = rwlock.RWLockFair()
+ self._schema_cache = LRUCache(maxsize=1000)
+ self._schema_cache_lock = rwlock.RWLockFair()
+
self._enable_credential_vending = (
False
if options is None
@@ -489,6 +490,31 @@ class BaseGVFSOperations(ABC):
fileset_ident, location_name
)
+ def _merge_fileset_properties(
+ self,
+ catalog: FilesetCatalog,
+ schema: Schema,
+ fileset: Fileset,
+ actual_location: str,
+ ) -> Dict[str, str]:
+ """Merge properties from catalog, schema, fileset, options, and
user-defined configs.
+ :param catalog: The fileset catalog
+ :param schema: The schema
+ :param fileset: The fileset
+ :param actual_location: The actual storage location
+ :return: Merged properties dictionary
+ """
+ fileset_props = dict(catalog.properties() or {})
+ fileset_props.update(schema.properties() or {})
+ fileset_props.update(fileset.properties() or {})
+ if self._options:
+ fileset_props.update(self._options)
+ # Get user-defined configurations for the actual location
+ # Apply after global options so path-specific configs take precedence
+ user_defined_configs = self._get_user_defined_configs(actual_location)
+ fileset_props.update(user_defined_configs)
+ return fileset_props
+
def _get_actual_filesystem_by_location_name(
self, fileset_ident: NameIdentifier, location_name: str
) -> AbstractFileSystem:
@@ -505,6 +531,9 @@ class BaseGVFSOperations(ABC):
self._metalake, fileset_ident.namespace().level(1)
)
catalog = self._get_fileset_catalog(catalog_ident)
+ schema = self._get_fileset_schema(
+ NameIdentifier.parse(str(fileset_ident.namespace()))
+ )
fileset = self._get_fileset(fileset_ident)
# Determine target location name
@@ -521,6 +550,10 @@ class BaseGVFSOperations(ABC):
f"Cannot find the location: {target_location_name} in fileset:
{fileset_ident}"
)
+ fileset_props = self._merge_fileset_properties(
+ catalog, schema, fileset, actual_location
+ )
+
# Set caller context for credential vending
if location_name:
context = {
@@ -541,14 +574,13 @@ class BaseGVFSOperations(ABC):
# This matches how Java GVFS caches by (scheme, authority, config)
actual_fs = self._get_filesystem(
credentials,
- catalog.properties(),
- self._options,
+ fileset_props,
actual_location,
**self._kwargs,
)
self._create_fileset_location_if_needed(
- catalog.properties(), actual_fs, actual_location
+ fileset_props, actual_fs, actual_location
)
return actual_fs
finally:
@@ -615,8 +647,7 @@ class BaseGVFSOperations(ABC):
def _get_filesystem(
self,
credentials: Optional[List[Credential]],
- catalog_props: Dict[str, str],
- options: Dict[str, str],
+ fileset_props: Dict[str, str],
actual_path: str,
**kwargs,
) -> AbstractFileSystem:
@@ -629,8 +660,7 @@ class BaseGVFSOperations(ABC):
the same configuration will share the same filesystem instance.
:param credentials: The credentials for accessing the filesystem
- :param catalog_props: The catalog properties
- :param options: The GVFS options
+ :param fileset_props: The fileset properties
:param actual_path: The actual file path (e.g., 's3://bucket/path',
'gs://bucket/path')
:param kwargs: Additional keyword arguments
:return: The filesystem instance
@@ -645,7 +675,7 @@ class BaseGVFSOperations(ABC):
# This allows multiple filesets pointing to the same storage to share
# the same filesystem instance
cache_key = FileSystemCacheKey(
- scheme, authority, credentials, catalog_props, options, kwargs
+ scheme, authority, credentials, fileset_props, kwargs
)
# Try to get from cache with read lock
@@ -678,8 +708,7 @@ class BaseGVFSOperations(ABC):
actual_path
).get_filesystem_with_expiration(
credentials,
- catalog_props,
- options,
+ fileset_props,
actual_path,
**kwargs,
)
@@ -754,3 +783,117 @@ class BaseGVFSOperations(ABC):
return fileset
finally:
write_lock.release()
+
+ def _get_fileset_schema(self, schema_ident: NameIdentifier):
+ """Get the schema by the schema identifier from the cache or load it
from the server if the cache is disabled.
+ :param schema_ident: The schema identifier
+ :return: The schema
+ """
+ if not self._enable_fileset_metadata_cache:
+ catalog_ident: NameIdentifier = NameIdentifier.of(
+ schema_ident.namespace().level(0),
schema_ident.namespace().level(1)
+ )
+ catalog: FilesetCatalog = self._get_fileset_catalog(catalog_ident)
+ return catalog.as_schemas().load_schema(schema_ident.name())
+
+ read_lock = self._schema_cache_lock.gen_rlock()
+ try:
+ read_lock.acquire()
+ cache_value: Schema = self._schema_cache.get(schema_ident)
+ if cache_value is not None:
+ return cache_value
+ finally:
+ read_lock.release()
+
+ write_lock = self._schema_cache_lock.gen_wlock()
+ try:
+ write_lock.acquire()
+ cache_value: Schema = self._schema_cache.get(schema_ident)
+ if cache_value is not None:
+ return cache_value
+
+ catalog_ident: NameIdentifier = NameIdentifier.of(
+ schema_ident.namespace().level(0),
schema_ident.namespace().level(1)
+ )
+ catalog: FilesetCatalog = self._get_fileset_catalog(catalog_ident)
+ schema = catalog.as_schemas().load_schema(schema_ident.name())
+ self._schema_cache[schema_ident] = schema
+ return schema
+ finally:
+ write_lock.release()
+
+ def _get_base_location(self, actual_location: str) -> str:
+ """Get the base location (scheme + authority) from the actual location
path.
+ :param actual_location: The actual location path (e.g.,
's3://bucket/path')
+ :return: The base location (e.g., 's3://bucket')
+ """
+ parsed_uri = urlparse(actual_location)
+ scheme = parsed_uri.scheme if parsed_uri.scheme else "file"
+ authority = parsed_uri.netloc if parsed_uri.netloc else ""
+ return f"{scheme}://{authority}"
+
+ def _get_user_defined_configs(self, path: str) -> Dict[str, str]:
+ """Get user defined configurations for a specific path based on the
path's base location
+ (scheme://authority).
+
+ The logic:
+ 1. Extract baseLocation (scheme://authority) from the given path
+ 2. Find config entries like "fs_path_config_<name> = <base_location>"
where the
+ base_location matches the extracted baseLocation
+ 3. Extract the name from the matching entry
+ 4. Then find all config entries with prefix "fs_path_config_<name>_"
and extract properties
+
+ Example:
+ fs_path_config_cluster1 = s3://bucket1
+ fs_path_config_cluster1_aws-access-key = XXX1
+ fs_path_config_cluster1_aws-secret-key = XXX2
+ If path is "s3://bucket1/path/fileset1", then baseLocation is
"s3://bucket1",
+ cluster1 matches and we extract:
+ - aws-access-key = XXX1
+ - aws-secret-key = XXX2
+
+ :param path: The path to extract configurations for (e.g.,
's3://bucket/path/to/file')
+ :return: A map of configuration properties for the given path
+ """
+ properties: Dict[str, str] = {}
+ if not path:
+ return properties
+
+ base_location = self._get_base_location(path)
+ location_name = None
+ config_prefix = GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX
+
+ # First pass: find the location name by matching baseLocation
+ # Look for entries like "fs_path_config_<name> = <base_location>"
+ # The key format should be exactly "fs_path_config_<name>" (no
underscore after name)
+ if self._options:
+ for key, value in self._options.items():
+ if not key.startswith(config_prefix):
+ continue
+
+ suffix = key[len(config_prefix) :]
+ # Check if this is a location definition (no underscore after
the name)
+ # Format: "fs_path_config_<name>" (not
"fs_path_config_<name>_<property>")
+ if "_" not in suffix and suffix and value:
+ # This is a location definition: "fs_path_config_<name>"
+ # Extract baseLocation from the value and compare with the
path's baseLocation
+ config_base_location = self._get_base_location(value)
+ if base_location == config_base_location:
+ location_name = suffix
+ break
+
+ # Second pass: extract all properties for the matched location name
+ if location_name:
+ property_prefix = config_prefix + location_name + "_"
+ if self._options:
+ for key, value in self._options.items():
+ # Check if this key is a property for the matched location
+ # e.g., "fs_path_config_cluster1_aws-ak" matches prefix
"fs_path_config_cluster1_"
+ if key.startswith(property_prefix):
+ # Extract the property name after the location prefix
+ # e.g., "fs_path_config_cluster1_aws-ak" -> "aws-ak"
+ property_name = key[len(property_prefix) :]
+ if property_name:
+ properties[property_name] = value
+
+ return properties
diff --git a/clients/client-python/gravitino/filesystem/gvfs_config.py
b/clients/client-python/gravitino/filesystem/gvfs_config.py
index c961aee083..969d8a9052 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_config.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_config.py
@@ -85,3 +85,16 @@ class GVFSConfig:
# The configuration key for whether to enable auto-creation of fileset
location when the
# server-side filesystem ops are disabled and the location does not exist.
The default is true.
GVFS_FILESYSTEM_AUTO_CREATE_LOCATION = "auto_create_location"
+
+ # The configuration prefix for user-defined path-specific configurations.
+ # Note: location names must NOT contain ('.', '_'),
+ # because '_' is used as the separator between prefix, location name, and
property name.
+ # Configuration is defined in two steps:
+ # 1) Map a logical location name to a base location:
+ # fs_path_config_<location_name>=<base_location>
+ # 2) Define properties for that logical location:
+ # fs_path_config_<location_name>_<property_name>=<property_value>
+ # Example:
+ # fs_path_config_my-s3-location=s3://bucket/path
+ # fs_path_config_my-s3-location_aws-access-key=XXX
+ FS_GRAVITINO_PATH_CONFIG_PREFIX = "fs_path_config_"
diff --git a/clients/client-python/gravitino/filesystem/gvfs_storage_handler.py
b/clients/client-python/gravitino/filesystem/gvfs_storage_handler.py
index 842260403e..63c215079a 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_storage_handler.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_storage_handler.py
@@ -81,7 +81,6 @@ class StorageHandler(ABC):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -214,7 +213,6 @@ class LocalStorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -256,7 +254,6 @@ class HDFSStorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -285,7 +282,6 @@ class S3StorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -296,28 +292,16 @@ class S3StorageHandler(StorageHandler):
# Note: the endpoint may not be a real S3 endpoint, it can be a
simulated S3 endpoint, such as minio,
# so though the endpoint is not a required field for S3FileSystem, we
still need to assign the endpoint
# to the S3FileSystem
- s3_endpoint = None
- if options:
- s3_endpoint = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT,
s3_endpoint)
- if options
- else None
- )
- if s3_endpoint is None:
- s3_endpoint = (
- catalog_props.get("s3-endpoint", None) if catalog_props
else None
- )
- else:
- s3_endpoint = (
- catalog_props.get("s3-endpoint", None) if catalog_props else
None
- )
+ s3_endpoint = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_S3_ENDPOINT
+ ) or catalog_props.get("s3-endpoint")
if credentials:
credential = self._get_most_suitable_credential(credentials)
if credential is not None:
expire_time = self._get_expire_time_by_ratio(
credential.expire_time_in_ms(),
- options,
+ catalog_props,
)
if isinstance(credential, S3TokenCredential):
fs = self.get_filesystem(
@@ -339,8 +323,8 @@ class S3StorageHandler(StorageHandler):
# this is the old way to get the s3 file system
# get 'aws_access_key_id' from s3_options, if the key is not found,
throw an exception
- aws_access_key_id = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY) if options
else None
+ aws_access_key_id = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_S3_ACCESS_KEY, None
)
if aws_access_key_id is None:
raise GravitinoRuntimeException(
@@ -348,8 +332,8 @@ class S3StorageHandler(StorageHandler):
)
# get 'aws_secret_access_key' from s3_options, if the key is not
found, throw an exception
- aws_secret_access_key = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY) if options
else None
+ aws_secret_access_key = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_S3_SECRET_KEY
)
if aws_secret_access_key is None:
raise GravitinoRuntimeException(
@@ -408,7 +392,6 @@ class GCSStorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -417,7 +400,7 @@ class GCSStorageHandler(StorageHandler):
if credential is not None:
expire_time = self._get_expire_time_by_ratio(
credential.expire_time_in_ms(),
- options,
+ catalog_props,
)
if isinstance(credential, GCSTokenCredential):
return (
@@ -429,10 +412,8 @@ class GCSStorageHandler(StorageHandler):
)
# get 'service-account-key' from gcs_options, if the key is not found,
throw an exception
- service_account_key_path = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_GCS_SERVICE_KEY_FILE)
- if options
- else None
+ service_account_key_path = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_GCS_SERVICE_KEY_FILE
)
if service_account_key_path is None:
raise GravitinoRuntimeException(
@@ -485,33 +466,20 @@ class OSSStorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
- oss_endpoint = None
# OSS endpoint from client options has a higher priority, override the
endpoint from catalog properties.
- if options:
- oss_endpoint = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_OSS_ENDPOINT,
oss_endpoint)
- if options
- else None
- )
- if oss_endpoint is None:
- oss_endpoint = (
- catalog_props.get("oss-endpoint", None) if catalog_props
else None
- )
- else:
- oss_endpoint = (
- catalog_props.get("oss-endpoint", None) if catalog_props else
None
- )
+ oss_endpoint = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_OSS_ENDPOINT
+ ) or catalog_props.get("oss-endpoint")
if credentials:
credential = self._get_most_suitable_credential(credentials)
if credential is not None:
expire_time = self._get_expire_time_by_ratio(
credential.expire_time_in_ms(),
- options,
+ catalog_props,
)
if isinstance(credential, OSSTokenCredential):
fs = self.get_filesystem(
@@ -534,17 +502,15 @@ class OSSStorageHandler(StorageHandler):
)
# get 'oss_access_key_id' from oss options, if the key is not found,
throw an exception
- oss_access_key_id = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_OSS_ACCESS_KEY) if options
else None
- )
+ oss_access_key_id =
catalog_props.get(GVFSConfig.GVFS_FILESYSTEM_OSS_ACCESS_KEY)
if oss_access_key_id is None:
raise GravitinoRuntimeException(
"OSS access key id is not found in the options."
)
# get 'oss_secret_access_key' from oss options, if the key is not
found, throw an exception
- oss_secret_access_key = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_OSS_SECRET_KEY) if options
else None
+ oss_secret_access_key = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_OSS_SECRET_KEY
)
if oss_secret_access_key is None:
raise GravitinoRuntimeException(
@@ -604,7 +570,6 @@ class ABSStorageHandler(StorageHandler):
self,
credentials: List[Credential],
catalog_props: Dict[str, str],
- options: Dict[str, str],
actual_path: Optional[str] = None,
**kwargs,
) -> Tuple[int, AbstractFileSystem]:
@@ -613,7 +578,7 @@ class ABSStorageHandler(StorageHandler):
if credential is not None:
expire_time = self._get_expire_time_by_ratio(
credential.expire_time_in_ms(),
- options,
+ catalog_props,
)
if isinstance(credential, ADLSTokenCredential):
fs = self.get_filesystem(
@@ -632,10 +597,8 @@ class ABSStorageHandler(StorageHandler):
return expire_time, fs
# get 'abs_account_name' from abs options, if the key is not found,
throw an exception
- abs_account_name = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_AZURE_ACCOUNT_NAME)
- if options
- else None
+ abs_account_name = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_AZURE_ACCOUNT_NAME
)
if abs_account_name is None:
raise GravitinoRuntimeException(
@@ -643,10 +606,8 @@ class ABSStorageHandler(StorageHandler):
)
# get 'abs_account_key' from abs options, if the key is not found,
throw an exception
- abs_account_key = (
- options.get(GVFSConfig.GVFS_FILESYSTEM_AZURE_ACCOUNT_KEY)
- if options
- else None
+ abs_account_key = catalog_props.get(
+ GVFSConfig.GVFS_FILESYSTEM_AZURE_ACCOUNT_KEY
)
if abs_account_key is None:
raise GravitinoRuntimeException(
diff --git a/clients/client-python/tests/integration/integration_test_env.py
b/clients/client-python/tests/integration/integration_test_env.py
index 7b54f1f452..ca943ab6f6 100644
--- a/clients/client-python/tests/integration/integration_test_env.py
+++ b/clients/client-python/tests/integration/integration_test_env.py
@@ -25,6 +25,7 @@ import shutil
import requests
+from gravitino import GravitinoAdminClient
from gravitino.exceptions.base import GravitinoRuntimeException
from tests.integration.config import Config
@@ -64,6 +65,7 @@ class IntegrationTestEnv(unittest.TestCase):
"""Provide real test environment for the Gravitino Server"""
gravitino_startup_script = None
+ gravitino_admin_client: GravitinoAdminClient = None
@classmethod
def setUpClass(cls):
@@ -180,7 +182,10 @@ class IntegrationTestEnv(unittest.TestCase):
if result.stderr:
logger.info("stderr: %s", result.stderr)
- if not check_gravitino_server_status():
+ success = check_gravitino_server_status()
+ if success:
+ cls.gravitino_admin_client =
GravitinoAdminClient("http://localhost:8090")
+ else:
raise GravitinoRuntimeException("ERROR: Can't start Gravitino
server!")
@classmethod
diff --git a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
index a0f36d57eb..9a0987fc79 100644
--- a/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
+++ b/clients/client-python/tests/integration/test_gvfs_with_hdfs.py
@@ -36,7 +36,6 @@ from pyarrow.fs import HadoopFileSystem
from gravitino import (
gvfs,
NameIdentifier,
- GravitinoAdminClient,
GravitinoClient,
Catalog,
Fileset,
@@ -92,9 +91,6 @@ class TestGvfsWithHDFS(IntegrationTestEnv):
schema_name, multiple_locations_fileset_name
)
- gravitino_admin_client: GravitinoAdminClient = GravitinoAdminClient(
- uri="http://localhost:8090"
- )
gravitino_client: GravitinoClient = None
options = {}
diff --git a/clients/client-python/tests/unittests/mock_base.py
b/clients/client-python/tests/unittests/mock_base.py
index 9421d9d1fd..c4e9c2910a 100644
--- a/clients/client-python/tests/unittests/mock_base.py
+++ b/clients/client-python/tests/unittests/mock_base.py
@@ -25,6 +25,7 @@ from gravitino.client.generic_model_catalog import
GenericModelCatalog
from gravitino.dto.fileset_dto import FilesetDTO
from gravitino.dto.audit_dto import AuditDTO
from gravitino.dto.metalake_dto import MetalakeDTO
+from gravitino.dto.schema_dto import SchemaDTO
from gravitino.namespace import Namespace
from gravitino.utils.http_client import HTTPClient
@@ -109,6 +110,21 @@ def mock_load_fileset(name: str, location: str):
)
+def mock_load_schema(name: str):
+ audit_dto = AuditDTO(
+ _creator="test",
+ _create_time="2022-01-01T00:00:00Z",
+ _last_modifier="test",
+ _last_modified_time="2024-04-05T10:10:35.218Z",
+ )
+ return SchemaDTO(
+ _name=name,
+ _comment="this is schema test",
+ _properties={"schema-prop": "schema-val"},
+ _audit=audit_dto,
+ )
+
+
def mock_data(cls):
@patch(
"gravitino.client.gravitino_client_base.GravitinoClientBase.load_metalake",
@@ -122,6 +138,10 @@ def mock_data(cls):
"gravitino.client.fileset_catalog.FilesetCatalog.load_fileset",
return_value=mock_load_fileset("fileset", ""),
)
+ @patch(
+ "gravitino.client.fileset_catalog.FilesetCatalog.load_schema",
+ side_effect=mock_load_schema,
+ )
@patch(
"gravitino.client.gravitino_client_base.GravitinoClientBase.check_version",
return_value=True,
diff --git a/clients/client-python/tests/unittests/test_filesystem_cache.py
b/clients/client-python/tests/unittests/test_filesystem_cache.py
index 00d653ece5..6a0a4be9c7 100644
--- a/clients/client-python/tests/unittests/test_filesystem_cache.py
+++ b/clients/client-python/tests/unittests/test_filesystem_cache.py
@@ -324,16 +324,14 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={"opt1": "val1"},
+ fileset_props={"prop1": "value1", "opt1": "val1"},
extra_kwargs={"arg1": "val1"},
)
key2 = FileSystemCacheKey(
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={"opt1": "val1"},
+ fileset_props={"prop1": "value1", "opt1": "val1"},
extra_kwargs={"arg1": "val1"},
)
@@ -346,8 +344,7 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={},
+ fileset_props={"prop1": "value1"},
extra_kwargs={},
)
@@ -356,8 +353,7 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="gs",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={},
+ fileset_props={"prop1": "value1"},
extra_kwargs={},
)
self.assertNotEqual(base_key, key_diff_scheme)
@@ -367,31 +363,28 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="bucket2",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={},
+ fileset_props={"prop1": "value1"},
extra_kwargs={},
)
self.assertNotEqual(base_key, key_diff_authority)
- # Different catalog properties
+ # Different fileset properties
key_diff_config = FileSystemCacheKey(
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value2"}, # Different value
- options={},
+ fileset_props={"prop1": "value2"}, # Different value
extra_kwargs={},
)
self.assertNotEqual(base_key, key_diff_config)
- # Different options
+ # Different extra kwargs
key_diff_options = FileSystemCacheKey(
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={"prop1": "value1"},
- options={"opt1": "val1"}, # Added option
- extra_kwargs={},
+ fileset_props={"prop1": "value1"},
+ extra_kwargs={"opt1": "val1"}, # Added option
)
self.assertNotEqual(base_key, key_diff_options)
@@ -401,8 +394,7 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={},
- options={},
+ fileset_props={},
extra_kwargs={},
)
@@ -415,8 +407,7 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="bucket1",
credentials=None,
- catalog_props={},
- options={},
+ fileset_props={},
extra_kwargs={},
)
@@ -429,8 +420,7 @@ class TestFileSystemCacheKey(unittest.TestCase):
scheme="s3",
authority="my-bucket",
credentials=None,
- catalog_props={},
- options={},
+ fileset_props={},
extra_kwargs={},
)
diff --git
a/clients/client-python/tests/unittests/test_gvfs_user_defined_configs.py
b/clients/client-python/tests/unittests/test_gvfs_user_defined_configs.py
new file mode 100644
index 0000000000..bd67b64d14
--- /dev/null
+++ b/clients/client-python/tests/unittests/test_gvfs_user_defined_configs.py
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import unittest
+
+from gravitino.filesystem.gvfs_config import GVFSConfig
+from gravitino.filesystem.gvfs_default_operations import DefaultGVFSOperations
+
+
+# pylint: disable=protected-access
+class TestGVFSUserDefinedConfigs(unittest.TestCase):
+ """Test cases for _get_user_defined_configs method."""
+
+ def test_get_user_defined_configs_single_location(self):
+ """Test _get_user_defined_configs with single location."""
+ options = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"s3://bucket1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key2":
"value2",
+ }
+ operations = DefaultGVFSOperations(options=options)
+
+ path = "s3://bucket1/path/to/file"
+ configs = operations._get_user_defined_configs(path)
+
+ self.assertEqual(len(configs), 2)
+ self.assertEqual(configs["key1"], "value1")
+ self.assertEqual(configs["key2"], "value2")
+
+ def test_get_user_defined_configs_multiple_locations(self):
+ """Test _get_user_defined_configs with multiple location
configurations."""
+ options = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"s3://bucket1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key2":
"value2",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster2":
"s3://bucket2",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster2_key1":
"value3",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster2_key2":
"value4",
+ }
+ operations = DefaultGVFSOperations(options=options)
+
+ # Test cluster1
+ path1 = "s3://bucket1/path/to/file1"
+ configs1 = operations._get_user_defined_configs(path1)
+ self.assertEqual(len(configs1), 2)
+ self.assertEqual(configs1["key1"], "value1")
+ self.assertEqual(configs1["key2"], "value2")
+
+ # Test cluster2
+ path2 = "s3://bucket2/path/to/file2"
+ configs2 = operations._get_user_defined_configs(path2)
+ self.assertEqual(len(configs2), 2)
+ self.assertEqual(configs2["key1"], "value3")
+ self.assertEqual(configs2["key2"], "value4")
+
+ def test_get_user_defined_configs_edge_cases(self):
+ """Test _get_user_defined_configs with edge cases."""
+ # Test with empty path
+ options1 = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"s3://bucket1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ }
+ operations1 = DefaultGVFSOperations(options=options1)
+ self.assertEqual(operations1._get_user_defined_configs(""), {})
+ self.assertEqual(operations1._get_user_defined_configs(None), {})
+
+ # Test with no options
+ operations2 = DefaultGVFSOperations(options=None)
+
self.assertEqual(operations2._get_user_defined_configs("s3://bucket1/path"), {})
+
+ # Test with empty options
+ operations3 = DefaultGVFSOperations(options={})
+
self.assertEqual(operations3._get_user_defined_configs("s3://bucket1/path"), {})
+
+ # Test with no matching location
+ options4 = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"s3://bucket1",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ }
+ operations4 = DefaultGVFSOperations(options=options4)
+
self.assertEqual(operations4._get_user_defined_configs("s3://bucket2/path"), {})
+
+ # Test with property without location definition
+ options5 = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ }
+ operations5 = DefaultGVFSOperations(options=options5)
+
self.assertEqual(operations5._get_user_defined_configs("s3://bucket1/path"), {})
+
+ def test_get_user_defined_configs_trailing_slash(self):
+ """Test _get_user_defined_configs with trailing slash in base location
value."""
+ # Config value has trailing slash, path does not
+ options = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"hdfs://cluster1/",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value1",
+ }
+ operations = DefaultGVFSOperations(options=options)
+
+ path = "hdfs://cluster1/path/to/file"
+ configs = operations._get_user_defined_configs(path)
+ self.assertEqual(len(configs), 1)
+ self.assertEqual(configs["key1"], "value1")
+
+ # Config value has sub path, should still match by scheme://authority
+ options2 = {
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1":
"hdfs://cluster1/sub/path",
+ f"{GVFSConfig.FS_GRAVITINO_PATH_CONFIG_PREFIX}cluster1_key1":
"value2",
+ }
+ operations2 = DefaultGVFSOperations(options=options2)
+
+ path2 = "hdfs://cluster1/other/path"
+ configs2 = operations2._get_user_defined_configs(path2)
+ self.assertEqual(len(configs2), 1)
+ self.assertEqual(configs2["key1"], "value2")
diff --git a/clients/client-python/tests/unittests/test_storage_handler.py
b/clients/client-python/tests/unittests/test_storage_handler.py
index a666c9fd12..53871eb33a 100644
--- a/clients/client-python/tests/unittests/test_storage_handler.py
+++ b/clients/client-python/tests/unittests/test_storage_handler.py
@@ -56,9 +56,6 @@ class TestStorageHandler(unittest.TestCase):
):
result = s3_storage_handler.get_filesystem_with_expiration(
[],
- {
- "s3-endpoint": "endpoint_from_catalog",
- },
{
"s3_endpoint": "endpoint_from_client",
"s3_access_key_id": "access_key_from_client",
@@ -77,8 +74,6 @@ class TestStorageHandler(unittest.TestCase):
[],
{
"s3-endpoint": "endpoint_from_catalog",
- },
- {
"s3_access_key_id": "access_key_from_client",
"s3_secret_access_key": "secret_key_from_client",
},
diff --git
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
index 95fc7bafb9..1c07ee7a53 100644
---
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
+++
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
@@ -184,7 +184,10 @@ public class GravitinoVirtualFileSystemConfiguration {
/** The default value for whether to enable auto-creation of fileset
location. */
public static final boolean FS_GRAVITINO_AUTO_CREATE_LOCATION_DEFAULT = true;
- /** The prefix for user-defined location configs: {@code
fs.path.config.<locationName>=<path>}. */
+ /**
+ * The prefix for user-defined location configs: {@code
+ * fs.path.config.<locationName>.<property_name>=<property_value>}.
+ */
public static final String FS_GRAVITINO_PATH_CONFIG_PREFIX =
"fs.path.config.";
private GravitinoVirtualFileSystemConfiguration() {}
diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md
index dc67abf58f..bc07174ad9 100644
--- a/docs/how-to-use-gvfs.md
+++ b/docs/how-to-use-gvfs.md
@@ -48,32 +48,32 @@ the path mapping and convert automatically.
### Configuration
-| Configuration item | Description
| Default value
| Required | Since
version |
-|-------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------|-------------------------------------|------------------|
-| `fs.AbstractFileSystem.gvfs.impl` | The Gravitino
Virtual File System abstract class, set it to
`org.apache.gravitino.filesystem.hadoop.Gvfs`.
| (none) | Yes
| 0.5.0 |
-| `fs.gvfs.impl` | The Gravitino
Virtual File System implementation class, set it to
`org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem`.
| (none) | Yes
| 0.5.0 |
-| `fs.gvfs.impl.disable.cache` | Disable the
Gravitino Virtual File System cache in the Hadoop environment. If you need to
proxy multi-user operations, please set this value to `true` and create a
separate File System for each user.
| `false`
| No
| 0.5.0 |
-| `fs.gravitino.server.uri` | The Gravitino server
URI which GVFS needs to load the fileset metadata.
| (none)
| Yes | 0.5.0
|
-| `fs.gravitino.client.metalake` | The metalake to
which the fileset belongs.
| (none)
| Yes |
0.5.0 |
-| `fs.gravitino.client.authType` | The auth type to
initialize the Gravitino client to use with the Gravitino Virtual File System.
Currently only supports `simple`, `oauth2` and `kerberos` auth types.
| `simple`
| No |
0.5.0 |
-| `fs.gravitino.client.oauth2.serverUri` | The auth server URI
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if you use `oauth2` auth type |
0.5.0 |
-| `fs.gravitino.client.oauth2.credential` | The auth credential
for the Gravitino client when using `oauth2` auth type in the Gravitino Virtual
File System.
| (none)
| Yes if you use `oauth2` auth type | 0.5.0
|
-| `fs.gravitino.client.oauth2.path` | The auth server path
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System. Please remove the first slash `/` from the path, for
example `oauth/token`.
| (none)
| Yes if you use `oauth2` auth
type | 0.5.0 |
-| `fs.gravitino.client.oauth2.scope` | The auth scope for
the Gravitino client when using `oauth2` auth type with the Gravitino Virtual
File System.
| (none)
| Yes if you use `oauth2` auth type |
0.5.0 |
-| `fs.gravitino.client.kerberos.principal` | The auth principal
for the Gravitino client when using `kerberos` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if you use `kerberos` auth type |
0.5.1 |
-| `fs.gravitino.client.kerberos.keytabFilePath` | The auth keytab file
path for the Gravitino client when using `kerberos` auth type in the Gravitino
Virtual File System.
| (none)
| No | 0.5.1
|
-| `fs.gravitino.fileset.cache.maxCapacity` | The cache capacity
of the Gravitino Virtual File System.
| `20`
| No |
0.5.0 |
-| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000`
| No |
0.5.0 |
-| `fs.gravitino.current.location.name` | The configuration
used to select the location of the fileset. If this configuration is not set,
the value of environment variable configured by
`fs.gravitino.current.location.env.var` will be checked. If neither is set, the
value of fileset property `default-location-name` will be used as the location
name. | the value of fileset property `default-location-name` | No
| 0.9.0-incubating |
-| `fs.gravitino.current.location.name.env.var` | The environment
variable name to get the current location name.
| `CURRENT_LOCATION_NAME`
| No |
0.9.0-incubating |
-| `fs.gravitino.operations.class` | The operations class
to provide the FS operations for the Gravitino Virtual File System. Users can
extends `BaseGVFSOperations` to implement their own operations and configure
the class name in this conf to use custom FS operations.
|
`org.apache.gravitino.filesystem.hadoop.DefaultGVFSOperations` | No
| 0.9.0-incubating |
-| `fs.gravitino.hook.class` | The hook class to
inject into the <br/>Gravitino Virtual File System. Users can implement their
own `GravitinoVirtualFileSystemHook` and configure the class name in this conf
to inject custom code.
|
`org.apache.gravitino.filesystem.hadoop.NoOpHook` | No
| 0.9.0-incubating |
-| `fs.gravitino.client.request.header.` | The configuration
key prefix for the Gravitino client request header. You can set the request
header for the Gravitino client.
| (none)
| No |
0.9.0-incubating |
-| `fs.gravitino.enableCredentialVending` | Whether to enable
credential vending for the Gravitino Virtual File System.
| `false`
| No |
0.9.0-incubating |
-| `fs.gravitino.client.` | The configuration
key prefix for the Gravitino client config.
| (none)
| No |
1.0.0 |
-| `fs.gravitino.filesetMetadataCache.enable` | Whether to cache the
fileset, fileset schema or fileset catalog metadata in the Gravitino Virtual
File System. Note that this cache causes a side effect: if you modify the
fileset or fileset catalog metadata, the client can not see the latest changes.
| `false`
| No
| 1.0.0 |
-| `fs.gravitino.autoCreateLocation` | The configuration
key for whether to enable auto-creation of fileset location when the
server-side filesystem ops are disabled and the location does not exist.
| `true`
| No
| 1.1.0 |
-| `fs.path.config.<name>` | Defines a logical
location entry. Set `fs.path.config.<name>` to the real base URI (for example,
`hdfs://cluster1/`). Any key that starts with the same prefix (such as
`fs.path.config.<name>.config.resource`) is treated as a location-scoped
property and will be forwarded to the underlying filesystem client. | (none)
| No
| 1.1.0 |
+| Configuration item | Description
| Default value
| Required [...]
+|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------|---------
[...]
+| `fs.AbstractFileSystem.gvfs.impl` | The Gravitino
Virtual File System abstract class, set it to
`org.apache.gravitino.filesystem.hadoop.Gvfs`.
| (none)
| Yes [...]
+| `fs.gvfs.impl` | The Gravitino
Virtual File System implementation class, set it to
`org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem`.
| (none)
| Yes [...]
+| `fs.gvfs.impl.disable.cache` | Disable the
Gravitino Virtual File System cache in the Hadoop environment. If you need to
proxy multi-user operations, please set this value to `true` and create a
separate File System for each user.
| `false`
| No [...]
+| `fs.gravitino.server.uri` | The Gravitino server
URI which GVFS needs to load the fileset metadata.
| (none)
| Yes [...]
+| `fs.gravitino.client.metalake` | The metalake to
which the fileset belongs.
| (none)
| Yes [...]
+| `fs.gravitino.client.authType` | The auth type to
initialize the Gravitino client to use with the Gravitino Virtual File System.
Currently only supports `simple`, `oauth2` and `kerberos` auth types.
| `simple`
| No [...]
+| `fs.gravitino.client.oauth2.serverUri` | The auth server URI
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if y [...]
+| `fs.gravitino.client.oauth2.credential` | The auth credential
for the Gravitino client when using `oauth2` auth type in the Gravitino Virtual
File System.
| (none)
| Yes if y [...]
+| `fs.gravitino.client.oauth2.path` | The auth server path
for the Gravitino client when using `oauth2` auth type with the Gravitino
Virtual File System. Please remove the first slash `/` from the path, for
example `oauth/token`.
| (none)
| Yes if y [...]
+| `fs.gravitino.client.oauth2.scope` | The auth scope for
the Gravitino client when using `oauth2` auth type with the Gravitino Virtual
File System.
| (none)
| Yes if y [...]
+| `fs.gravitino.client.kerberos.principal` | The auth principal
for the Gravitino client when using `kerberos` auth type with the Gravitino
Virtual File System.
| (none)
| Yes if y [...]
+| `fs.gravitino.client.kerberos.keytabFilePath` | The auth keytab file
path for the Gravitino client when using `kerberos` auth type in the Gravitino
Virtual File System.
| (none)
| No [...]
+| `fs.gravitino.fileset.cache.maxCapacity` | The cache capacity
of the Gravitino Virtual File System.
| `20`
| No [...]
+| `fs.gravitino.fileset.cache.evictionMillsAfterAccess` | The value of time
that the cache expires after accessing in the Gravitino Virtual File System.
The value is in `milliseconds`.
| `3600000`
| No [...]
+| `fs.gravitino.current.location.name` | The configuration
used to select the location of the fileset. If this configuration is not set,
the value of environment variable configured by
`fs.gravitino.current.location.env.var` will be checked. If neither is set, the
value of fileset property `default-location-name` will be used as the location
name. | the value of fileset
property `default-location-name` | No [...]
+| `fs.gravitino.current.location.name.env.var` | The environment
variable name to get the current location name.
| `CURRENT_LOCATION_NAME`
| No [...]
+| `fs.gravitino.operations.class` | The operations class
to provide the FS operations for the Gravitino Virtual File System. Users can
extend `BaseGVFSOperations` to implement their own operations and configure
the class name in this conf to use custom FS operations.
|
`org.apache.gravitino.filesystem.hadoop.DefaultGVFSOperations` | No [...]
+| `fs.gravitino.hook.class` | The hook class to
inject into the <br/>Gravitino Virtual File System. Users can implement their
own `GravitinoVirtualFileSystemHook` and configure the class name in this conf
to inject custom code.
| `org.apache.gravitino.filesystem.hadoop.NoOpHook`
| No [...]
+| `fs.gravitino.client.request.header.` | The configuration
key prefix for the Gravitino client request header. You can set the request
header for the Gravitino client.
| (none)
| No [...]
+| `fs.gravitino.enableCredentialVending` | Whether to enable
credential vending for the Gravitino Virtual File System.
| `false`
| No [...]
+| `fs.gravitino.client.` | The configuration
key prefix for the Gravitino client config.
| (none)
| No [...]
+| `fs.gravitino.filesetMetadataCache.enable` | Whether to cache the
fileset, fileset schema or fileset catalog metadata in the Gravitino Virtual
File System. Note that this cache causes a side effect: if you modify the
fileset or fileset catalog metadata, the client can not see the latest changes.
| `false`
| No [...]
+| `fs.gravitino.autoCreateLocation` | The configuration
key for whether to enable auto-creation of fileset location when the
server-side filesystem ops are disabled and the location does not exist.
| `true`
| No [...]
+| `fs.path.config.<name>` | Defines a logical
location entry. Set `fs.path.config.<name>` to the real base URI (for example,
`hdfs://cluster1/`). Any key that starts with the same prefix (such as
`fs.path.config.<name>.config.resource`) is treated as a location-scoped
property and will be forwarded to the underlying filesystem client. Note:
location names must not contain `.` or `_` characters. | (none)
| No [...]
To configure the Gravitino client, use properties prefixed with
`fs.gravitino.client.`. These properties will be passed to the Gravitino client
after removing the `fs.` prefix.
@@ -410,33 +410,69 @@ to recompile the native libraries like `libhdfs` and
others, and completely repl
### Configuration
-| Configuration item | Description
| Default value
| Required | Since version |
-|---------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------|-----------------------------------|------------------|
-| `server_uri` | The Gravitino server uri, e.g.
`http://localhost:8090`.
| (none)
| Yes | 0.6.0-incubating |
-| `metalake_name` | The metalake name which the fileset
belongs to.
| (none)
| Yes | 0.6.0-incubating |
-| `cache_size` | The cache capacity of the Gravitino
Virtual File System.
| `20`
| No | 0.6.0-incubating |
-| `cache_expired_time` | The value of time that the cache expires
after accessing in the Gravitino Virtual File System. The value is in
`seconds`.
| `3600`
| No | 0.6.0-incubating |
-| `auth_type` | The auth type to initialize the Gravitino
client to use with the Gravitino Virtual File System. Currently supports
`simple` and `oauth2` auth types.
| `simple`
| No | 0.6.0-incubating |
-| `oauth2_server_uri` | The auth server URI for the Gravitino
client when using `oauth2` auth type.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `oauth2_credential` | The auth credential for the Gravitino
client when using `oauth2` auth type.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `oauth2_path` | The auth server path for the Gravitino
client when using `oauth2` auth type. Please remove the first slash `/` from
the path, for example `oauth/token`.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `oauth2_scope` | The auth scope for the Gravitino client
when using `oauth2` auth type with the Gravitino Virtual File System.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
-| `credential_expiration_ratio` | The ratio of expiration time for
credential from Gravitino. This is used in the cases where Gravitino Fileset
catalogs have enable credential vending. if the expiration time of credential
fetched from Gravitino is 1 hour, GVFS client will try to refresh the
credential in 1 * 0.9 = 0.5 hour. | 0.5
| No |
0.8.0-incubating |
-| `current_location_name` | The configuration used to select the
location of the fileset. If this configuration is not set, the value of
environment variable configured by `current_location_name_env_var` will be
checked. If neither is set, the value of fileset property
`default-location-name` will be used as the location name. | the value of
fileset property `default-location-name` | No
| 0.9.0-incubating |
-| `current_location_name_env_var` | The environment variable name to get the
current location name.
| `CURRENT_LOCATION_NAME`
| No | 0.9.0-incubating |
-| `operations_class` | The operations class to provide the FS
operations for the Gravitino Virtual File System. Users can extends
`BaseGVFSOperations` to implement their own operations and configure the class
name in this conf to use custom FS operations.
|
`gravitino.filesystem.gvfs_default_operations.DefaultGVFSOperations` | No
| 0.9.0-incubating |
-| `hook_class` | The hook class to inject into the
Gravitino Virtual File System. Users can implement their own
`GravitinoVirtualFileSystemHook` and configure the class name in this conf to
inject custom code.
|
`gravitino.filesystem.gvfs_hook.NoOpHook` | No
| 0.9.0-incubating |
-| `client_request_header_` | The configuration key prefix for the
Gravitino client request header. You can set the request header for the
Gravitino client.
| (none)
| No | 0.9.0-incubating |
-| `enable_credential_vending` | Whether to enable credential vending for
the Gravitino Virtual File System.
| `false`
| No | 0.9.0-incubating |
-| `gvfs_gravitino_client_` | The configuration key prefix for the
Gravitino client. You can set the config for the Gravitino client.
| (none)
| No | 1.0.0 |
-| `enable_fileset_metadata_cache` | Whether to cache the fileset or fileset
catalog metadata in the Gravitino Virtual File System. Note that this cache
causes a side effect: if you modify the fileset or fileset catalog metadata,
the client can not see the latest changes.
| `false`
| No | 1.0.0 |
-| `auto_create_location` | The configuration key for whether to
enable auto-creation of fileset location when the server-side filesystem ops
are disabled and the location does not exist.
| `true`
| No | 1.1.0 |
+| Configuration item | Description
|
Default value | Required
[...]
+|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------|-------------------------
[...]
+| `server_uri` | The Gravitino server uri, e.g.
`http://localhost:8090`.
| (none)
| Yes [...]
+| `metalake_name` | The metalake name which the fileset
belongs to.
| (none) | Yes
[...]
+| `cache_size` | The cache capacity of the Gravitino
Virtual File System.
| `20` | No
[...]
+| `cache_expired_time` | The value of time that the cache expires
after accessing in the Gravitino Virtual File System. The value is in
`seconds`.
| `3600`
| No [...]
+| `auth_type` | The auth type to initialize the Gravitino
client to use with the Gravitino Virtual File System. Currently supports
`simple` and `oauth2` auth types.
| `simple` | No
[...]
+| `oauth2_server_uri` | The auth server URI for the Gravitino
client when using `oauth2` auth type.
| (none) | Yes
if you use `oauth2` [...]
+| `oauth2_credential` | The auth credential for the Gravitino
client when using `oauth2` auth type.
| (none) | Yes
if you use `oauth2` [...]
+| `oauth2_path` | The auth server path for the Gravitino
client when using `oauth2` auth type. Please remove the first slash `/` from
the path, for example `oauth/token`.
| (none) | Yes
if you use `oauth2` [...]
+| `oauth2_scope` | The auth scope for the Gravitino client
when using `oauth2` auth type with the Gravitino Virtual File System.
| (none) | Yes if
you use `oauth2` [...]
+| `credential_expiration_ratio` | The ratio of expiration time for
credential from Gravitino. This is used in the cases where Gravitino Fileset
catalogs have enabled credential vending. If the expiration time of the credential
fetched from Gravitino is 1 hour, the GVFS client will try to refresh the
credential in 1 * 0.5 = 0.5 hours.
| 0.5
| No [...]
+| `current_location_name` | The configuration used to select the
location of the fileset. If this configuration is not set, the value of
environment variable configured by `current_location_name_env_var` will be
checked. If neither is set, the value of fileset property
`default-location-name` will be used as the location name.
| the value of fileset property
`default-location-name` | No [...]
+| `current_location_name_env_var` | The environment variable name to get the
current location name.
|
`CURRENT_LOCATION_NAME` | No
[...]
+| `operations_class` | The operations class to provide the FS
operations for the Gravitino Virtual File System. Users can extend
`BaseGVFSOperations` to implement their own operations and configure the class
name in this conf to use custom FS operations.
|
`gravitino.filesystem.gvfs_default_operations.DefaultGVFSOperations` | No
[...]
+| `hook_class` | The hook class to inject into the
Gravitino Virtual File System. Users can implement their own
`GravitinoVirtualFileSystemHook` and configure the class name in this conf to
inject custom code.
| `gravitino.filesystem.gvfs_hook.NoOpHook`
| No [...]
+| `client_request_header_` | The configuration key prefix for the
Gravitino client request header. You can set the request header for the
Gravitino client.
| (none)
| No [...]
+| `enable_credential_vending` | Whether to enable credential vending for
the Gravitino Virtual File System.
|
`false` | No
[...]
+| `gvfs_gravitino_client_` | The configuration key prefix for the
Gravitino client. You can set the config for the Gravitino client.
| (none) | No
[...]
+| `enable_fileset_metadata_cache` | Whether to cache the fileset or fileset
catalog metadata in the Gravitino Virtual File System. Note that this cache
causes a side effect: if you modify the fileset or fileset catalog metadata,
the client can not see the latest changes.
| `false` |
No [...]
+| `auto_create_location` | The configuration key for whether to
enable auto-creation of fileset location when the server-side filesystem ops
are disabled and the location does not exist.
| `true` |
No [...]
+| `fs_path_config_<name>` | Defines a logical location entry. Set
`fs_path_config_<name>` to the real base URI (for example, `hdfs://cluster1/`).
Any key that starts with the same prefix (such as
`fs_path_config_<name>_config.resource`) is treated as a location-scoped
property and will be forwarded to the underlying filesystem client. Note:
location names must not contain `.` or `_` characters. | (none)
| No [...]
+
To configure the Gravitino client, use properties prefixed with
`gvfs_gravitino_client_`. These properties will be passed to the Gravitino
client after removing the `gvfs_` prefix.
**Example:** Setting `gvfs_gravitino_client_request_timeout` is equivalent to
setting `gravitino_client_request_timeout` for the Gravitino client.
**Note:** Invalid configuration properties will result in exceptions. Please
see [Gravitino Python client
configurations](./how-to-use-gravitino-client.md#gravitino-python-client-configuration)
for more supported client configurations.
+:::note
+When users work with a multi-cluster fileset catalog, they can configure
separate sets of properties for the base paths
+of the different clusters. See [Manage fileset with multiple
clusters](./manage-fileset-metadata-using-gravitino.md#manage-fileset-with-multiple-clusters) for details.
+
+For example, a complex catalog structure might look like this:
+
+```text
+catalog1 -> hdfs://cluster1/catalog1
+ schema1 -> hdfs://cluster1/catalog1/schema1
+ fileset1 -> hdfs://cluster1/catalog1/schema1/fileset1
+ fileset2 -> hdfs://cluster1/catalog1/schema1/fileset2
+ schema2 -> hdfs://cluster2/tmp/schema2
+ fileset3 -> hdfs://cluster2/tmp/schema2/fsd
+ fileset4 -> hdfs://cluster3/customers
+```
+
+In this case, users can configure different client properties for each base
path:
+
+```python
+options = {
+ "server_uri": "http://localhost:8090",
+ "metalake_name": "test",
+ "fs_path_config_cluster1": "hdfs://cluster1/",
+ "fs_path_config_cluster1_config.resource":
"/etc/core-site.xml,/etc/hdfs-site.xml",
+ "fs_path_config_cluster2": "hdfs://cluster2/",
+ "fs_path_config_cluster2_config.resource":
"/etc/fs2/core-site.xml,/etc/fs2/hdfs-site.xml",
+ "fs_path_config_cluster3": "hdfs://cluster3/",
+ "fs_path_config_cluster3_config.resource":
"/etc/fs3/core-site.xml,/etc/fs3/hdfs-site.xml",
+}
+```
+
+The plain `fs_path_config_<name>` entry specifies the base path of the
filesystem. Any additional key under the same prefix
(`fs_path_config_<name>_<config_key>`) is treated as a location-scoped
configuration (for example, `config.resource` for HDFS) and is forwarded
directly to the underlying filesystem client.
+:::
+
#### Configurations for S3, GCS, OSS and Azure Blob storage fileset
Please see the cloud-storage-specific configurations [GCS GVFS Java client
configurations](./fileset-catalog-with-gcs.md#using-the-gvfs-python-client-to-access-a-fileset),