This is an automated email from the ASF dual-hosted git repository.
mchades pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/main by this push:
new aab665d9ce [#8882]feature(GVFS): add config for location auto-creation
(#8883)
aab665d9ce is described below
commit aab665d9ce6ae6c1ebf2dc47a28e89d2db292672
Author: Junda Yang <[email protected]>
AuthorDate: Mon Oct 27 18:50:40 2025 -0700
[#8882]feature(GVFS): add config for location auto-creation (#8883)
### What changes were proposed in this pull request?
Make fileset location auto-creation configurable in GVFS, for both the Java
and Python clients.
### Why are the changes needed?
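Client-side performance improvement: when auto-creation is disabled
(`fs.gravitino.autoCreateLocation` in Java, `auto_create_location` in Python
set to false), the client skips the fileset location existence check and
creation step.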
Fix: #8882
### Does this PR introduce _any_ user-facing change?
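Yes. It adds an optional client configuration (`fs.gravitino.autoCreateLocation`
for the Java client, `auto_create_location` for the Python client). The default
is `true`, which preserves the existing behavior.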
### How was this patch tested?
Unit tests added for both the Python client (test_gvfs_with_local.py) and the
Java client (TestGvfsBase.java).
---
.../gravitino/filesystem/gvfs_base_operations.py | 11 ++
.../gravitino/filesystem/gvfs_config.py | 4 +
.../tests/unittests/test_gvfs_with_local.py | 43 +++++++
.../filesystem/hadoop/BaseGVFSOperations.java | 11 ++
.../GravitinoVirtualFileSystemConfiguration.java | 9 ++
.../gravitino/filesystem/hadoop/TestGvfsBase.java | 124 +++++++++++++++++++++
docs/how-to-use-gvfs.md | 21 ++--
7 files changed, 213 insertions(+), 10 deletions(-)
diff --git a/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
b/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
index 2215224812..57cb646a98 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_base_operations.py
@@ -76,6 +76,7 @@ class BaseGVFSOperations(ABC):
ENV_CURRENT_LOCATION_NAME_ENV_VAR_DEFAULT = "CURRENT_LOCATION_NAME"
ENABLE_CREDENTIAL_VENDING_DEFAULT = False
ENABLE_FILESET_METADATA_CACHE_DEFAULT = False
+ AUTO_CREATE_LOCATION_DEFAULT = True
def __init__(
self,
@@ -130,6 +131,14 @@ class BaseGVFSOperations(ABC):
GVFSConfig.CACHE_EXPIRED_TIME,
GVFSConfig.DEFAULT_CACHE_EXPIRED_TIME
)
)
+ self._auto_create_location = (
+ self.AUTO_CREATE_LOCATION_DEFAULT
+ if options is None
+ else options.get(
+ GVFSConfig.GVFS_FILESYSTEM_AUTO_CREATE_LOCATION,
+ self.AUTO_CREATE_LOCATION_DEFAULT,
+ )
+ )
self._filesystem_cache = TTLCache(maxsize=cache_size,
ttl=cache_expired_time)
self._cache_lock = rwlock.RWLockFair()
@@ -425,6 +434,8 @@ class BaseGVFSOperations(ABC):
actual_fs: AbstractFileSystem,
fileset_path: str,
):
+ if not self._auto_create_location:
+ return
# If the server-side filesystem ops are disabled, the fileset
directory may not exist. In
# such case the operations like create, open, list files under this
directory will fail.
# So we need to check the existence of the fileset directory
beforehand.
diff --git a/clients/client-python/gravitino/filesystem/gvfs_config.py
b/clients/client-python/gravitino/filesystem/gvfs_config.py
index 6d03d41ebb..c961aee083 100644
--- a/clients/client-python/gravitino/filesystem/gvfs_config.py
+++ b/clients/client-python/gravitino/filesystem/gvfs_config.py
@@ -81,3 +81,7 @@ class GVFSConfig:
# Note that this cache causes a side effect: if you modify the fileset or
fileset catalog metadata,
# the client can not see the latest changes.
GVFS_FILESYSTEM_ENABLE_FILESET_METADATA_CACHE =
"enable_fileset_metadata_cache"
+
+ # The configuration key for whether to enable auto-creation of fileset
location when the
+ # server-side filesystem ops are disabled and the location does not exist.
The default is true.
+ GVFS_FILESYSTEM_AUTO_CREATE_LOCATION = "auto_create_location"
diff --git a/clients/client-python/tests/unittests/test_gvfs_with_local.py
b/clients/client-python/tests/unittests/test_gvfs_with_local.py
index bcc873b9d3..be8ebc447c 100644
--- a/clients/client-python/tests/unittests/test_gvfs_with_local.py
+++ b/clients/client-python/tests/unittests/test_gvfs_with_local.py
@@ -1386,3 +1386,46 @@ class TestLocalFilesystem(unittest.TestCase):
self.assertEqual(status["name"], file_virtual_path)
else:
raise GravitinoRuntimeException("Unexpected file found")
+
+ def test_auto_create_location_config(self, *mock_methods):
+ """Test that auto_create_location configuration is correctly set"""
+ # Test with auto_create_location = False
+ options_disabled = {
+ GVFSConfig.GVFS_FILESYSTEM_AUTO_CREATE_LOCATION: False,
+ }
+ fs_disabled = gvfs.GravitinoVirtualFileSystem(
+ server_uri=self._server_uri,
+ metalake_name=self._metalake_name,
+ options=options_disabled,
+ skip_instance_cache=True,
+ )
+ self.assertFalse(
+ fs_disabled._operations._auto_create_location,
+ "auto_create_location should be False when explicitly disabled",
+ )
+
+ # Test with auto_create_location = True
+ options_enabled = {
+ GVFSConfig.GVFS_FILESYSTEM_AUTO_CREATE_LOCATION: True,
+ }
+ fs_enabled = gvfs.GravitinoVirtualFileSystem(
+ server_uri=self._server_uri,
+ metalake_name=self._metalake_name,
+ options=options_enabled,
+ skip_instance_cache=True,
+ )
+ self.assertTrue(
+ fs_enabled._operations._auto_create_location,
+ "auto_create_location should be True when explicitly enabled",
+ )
+
+ # Test default behavior (should be True)
+ fs_default = gvfs.GravitinoVirtualFileSystem(
+ server_uri=self._server_uri,
+ metalake_name=self._metalake_name,
+ skip_instance_cache=True,
+ )
+ self.assertTrue(
+ fs_default._operations._auto_create_location,
+ "auto_create_location should default to True",
+ )
diff --git
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/BaseGVFSOperations.java
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/BaseGVFSOperations.java
index 9ffaaf5760..1fc0bc6b80 100644
---
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/BaseGVFSOperations.java
+++
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/BaseGVFSOperations.java
@@ -133,6 +133,8 @@ public abstract class BaseGVFSOperations implements
Closeable {
private final boolean enableCredentialVending;
+ private final boolean autoCreateLocation;
+
/**
* Constructs a new {@link BaseGVFSOperations} with the given {@link
Configuration}.
*
@@ -176,6 +178,12 @@ public abstract class BaseGVFSOperations implements
Closeable {
configuration.getBoolean(
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_ENABLE_CREDENTIAL_VENDING,
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_ENABLE_CREDENTIAL_VENDING_DEFAULT);
+
+ this.autoCreateLocation =
+ configuration.getBoolean(
+
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_AUTO_CREATE_LOCATION,
+
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_AUTO_CREATE_LOCATION_DEFAULT);
+
this.conf = configuration;
}
@@ -444,6 +452,9 @@ public abstract class BaseGVFSOperations implements
Closeable {
private void createFilesetLocationIfNeed(
NameIdentifier filesetIdent, FileSystem fs, Path filesetPath) {
+ if (!autoCreateLocation) {
+ return;
+ }
NameIdentifier catalogIdent =
NameIdentifier.of(filesetIdent.namespace().level(0),
filesetIdent.namespace().level(1));
// If the server-side filesystem ops are disabled, the fileset directory
may not exist. In such
diff --git
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
index 974ecf75c8..80d9b6341e 100644
---
a/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
+++
b/clients/filesystem-hadoop3/src/main/java/org/apache/gravitino/filesystem/hadoop/GravitinoVirtualFileSystemConfiguration.java
@@ -175,5 +175,14 @@ public class GravitinoVirtualFileSystemConfiguration {
/** The default value for whether to enable fileset and catalog cache. */
public static final boolean
FS_GRAVITINO_FILESET_METADATA_CACHE_ENABLE_DEFAULT = false;
+ /**
+ * The configuration key for whether to enable auto-creation of fileset
location when the
+ * server-side filesystem ops are disabled and the location does not exist.
The default is true.
+ */
+ public static final String FS_GRAVITINO_AUTO_CREATE_LOCATION =
"fs.gravitino.autoCreateLocation";
+
+ /** The default value for whether to enable auto-creation of fileset
location. */
+ public static final boolean FS_GRAVITINO_AUTO_CREATE_LOCATION_DEFAULT = true;
+
private GravitinoVirtualFileSystemConfiguration() {}
}
diff --git
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
index 2d36f84e9b..0956ae2b8f 100644
---
a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
+++
b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/TestGvfsBase.java
@@ -53,6 +53,7 @@ import java.lang.reflect.Field;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
@@ -65,8 +66,10 @@ import org.apache.commons.lang3.tuple.Pair;
import org.apache.gravitino.NameIdentifier;
import org.apache.gravitino.Version;
import org.apache.gravitino.dto.AuditDTO;
+import org.apache.gravitino.dto.CatalogDTO;
import org.apache.gravitino.dto.credential.CredentialDTO;
import org.apache.gravitino.dto.file.FilesetDTO;
+import org.apache.gravitino.dto.responses.CatalogResponse;
import org.apache.gravitino.dto.responses.CredentialResponse;
import org.apache.gravitino.dto.responses.ErrorResponse;
import org.apache.gravitino.dto.responses.FileLocationResponse;
@@ -1040,6 +1043,127 @@ public class TestGvfsBase extends
GravitinoMockServerBase {
Assertions.assertEquals("Read timed out",
throwable.getCause().getMessage());
}
+ @Test
+ public void testAutoCreateLocation() throws IOException,
JsonProcessingException {
+ Assumptions.assumeTrue(getClass() == TestGvfsBase.class);
+ String catalogNameWithFsOpsDisabled = "catalog_fs_ops_disabled";
+ String schemaNameLocal = "schema_auto_create";
+ String filesetName = "fileset_auto_create";
+
+ // Mock a catalog with disable-filesystem-ops=true
+ CatalogDTO catalogWithFsOpsDisabled =
+ CatalogDTO.builder()
+ .withName(catalogNameWithFsOpsDisabled)
+ .withType(CatalogDTO.Type.FILESET)
+ .withProvider(provider)
+ .withComment("test catalog")
+ .withProperties(ImmutableMap.of("disable-filesystem-ops", "true"))
+ .withAudit(
+
AuditDTO.builder().withCreator("creator").withCreateTime(Instant.now()).build())
+ .build();
+ CatalogResponse catalogResponse = new
CatalogResponse(catalogWithFsOpsDisabled);
+ buildMockResource(
+ Method.GET,
+ "/api/metalakes/" + metalakeName + "/catalogs/" +
catalogNameWithFsOpsDisabled,
+ null,
+ catalogResponse,
+ SC_OK);
+
+ Path managedFilesetPath =
+ FileSystemTestUtils.createFilesetPath(
+ catalogNameWithFsOpsDisabled, schemaNameLocal, filesetName, true);
+ Path localPath =
+ FileSystemTestUtils.createLocalDirPrefix(
+ catalogNameWithFsOpsDisabled, schemaNameLocal, filesetName);
+ String locationPath =
+ String.format(
+ "/api/metalakes/%s/catalogs/%s/schemas/%s/filesets/%s/location",
+ metalakeName, catalogNameWithFsOpsDisabled, schemaNameLocal,
filesetName);
+
+ // Mock the fileset
+ mockFilesetDTO(
+ metalakeName,
+ catalogNameWithFsOpsDisabled,
+ schemaNameLocal,
+ filesetName,
+ Fileset.Type.MANAGED,
+ ImmutableMap.of("location1", localPath.toString()),
+ ImmutableMap.of(PROPERTY_DEFAULT_LOCATION_NAME, "location1"));
+
+ // Test with autoCreateLocation = false
+ Configuration configWithoutAutoCreate = new Configuration(conf);
+ configWithoutAutoCreate.setBoolean(
+
GravitinoVirtualFileSystemConfiguration.FS_GRAVITINO_AUTO_CREATE_LOCATION,
false);
+
+ try (FileSystem gravitinoFileSystem =
+ managedFilesetPath.getFileSystem(configWithoutAutoCreate);
+ FileSystem localFileSystem = localPath.getFileSystem(conf)) {
+
+ // Setup mock responses
+ FileLocationResponse fileLocationResponse = new
FileLocationResponse(localPath.toString());
+ Map<String, String> queryParams = new HashMap<>();
+ queryParams.put("sub_path", "");
+ buildMockResource(Method.GET, locationPath, queryParams, null,
fileLocationResponse, SC_OK);
+ buildMockResourceForCredential(filesetName, localPath.toString());
+
+ // Delete local path if it exists
+ if (localFileSystem.exists(localPath)) {
+ localFileSystem.delete(localPath, true);
+ }
+
+ // Verify location does not exist
+ assertFalse(localFileSystem.exists(localPath));
+
+ // Try to list the fileset - this triggers the auto-creation check
+ // When autoCreateLocation=false and directory doesn't exist, it should
throw
+ // FileNotFoundException
+ assertThrows(
+ FileNotFoundException.class,
+ () -> gravitinoFileSystem.listStatus(managedFilesetPath),
+ "Should throw FileNotFoundException when location doesn't exist and
autoCreateLocation=false");
+
+ // Verify location was NOT auto-created when autoCreateLocation=false
+ assertFalse(
+ localFileSystem.exists(localPath),
+ "Location should NOT be auto-created when autoCreateLocation=false");
+ }
+
+ // Test with autoCreateLocation = true (default)
+ Configuration configWithAutoCreate = new Configuration(conf);
+ // Don't set the config, use default which is true
+
+ try (FileSystem gravitinoFileSystem =
managedFilesetPath.getFileSystem(configWithAutoCreate);
+ FileSystem localFileSystem = localPath.getFileSystem(conf)) {
+
+ // Setup mock responses (need to rebuild since we created a new
filesystem)
+ FileLocationResponse fileLocationResponse = new
FileLocationResponse(localPath.toString());
+ Map<String, String> queryParams = new HashMap<>();
+ queryParams.put("sub_path", "");
+ buildMockResource(Method.GET, locationPath, queryParams, null,
fileLocationResponse, SC_OK);
+ buildMockResourceForCredential(filesetName, localPath.toString());
+
+ // Delete local path if it exists
+ if (localFileSystem.exists(localPath)) {
+ localFileSystem.delete(localPath, true);
+ }
+
+ // Verify location does not exist initially
+ assertFalse(localFileSystem.exists(localPath));
+
+ // Try to list the fileset - with autoCreateLocation=true (default), it
should create the
+ // location
+ gravitinoFileSystem.listStatus(managedFilesetPath);
+
+ // Verify location WAS auto-created when autoCreateLocation=true
(default)
+ assertTrue(
+ localFileSystem.exists(localPath),
+ "Location SHOULD be auto-created when autoCreateLocation=true");
+
+ // Clean up
+ localFileSystem.delete(localPath, true);
+ }
+ }
+
@Test
public void testHookSetOperationsContext() throws IOException {
String filesetName = "testHookSetOperationsContext";
diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md
index d1759cd140..478d1ce6dc 100644
--- a/docs/how-to-use-gvfs.md
+++ b/docs/how-to-use-gvfs.md
@@ -72,6 +72,7 @@ the path mapping and convert automatically.
| `fs.gravitino.enableCredentialVending` | Whether to enable
credential vending for the Gravitino Virtual File System.
| `false`
| No |
0.9.0-incubating |
| `fs.gravitino.client.` | The configuration
key prefix for the Gravitino client config.
| (none)
| No |
1.0.0 |
| `fs.gravitino.filesetMetadataCache.enable` | Whether to cache the
fileset or fileset catalog metadata in the Gravitino Virtual File System. Note
that this cache causes a side effect: if you modify the fileset or fileset
catalog metadata, the client can not see the latest changes.
| `false`
| No |
1.0.0 |
+| `fs.gravitino.autoCreateLocation` | The configuration
key for whether to enable auto-creation of fileset location when the
server-side filesystem ops are disabled and the location does not exist.
| `true`
| No
| 1.1.0 |
To configure the Gravitino client, use properties prefixed with
`fs.gravitino.client.`. These properties will be passed to the Gravitino client
after removing the `fs.` prefix.
@@ -79,13 +80,13 @@ To configure the Gravitino client, use properties prefixed
with `fs.gravitino.cl
**Note:** Invalid configuration properties will result in exceptions. Please
see [Gravitino Java client
configurations](./how-to-use-gravitino-client.md#gravitino-java-client-configuration)
for more support client configuration.
-Apart from the above properties, to access fileset like S3, GCS, OSS and
custom fileset, extra properties are needed, please see
+Apart from the above properties, to access fileset like S3, GCS, OSS and
custom fileset, extra properties are needed, please see
[S3 GVFS Java client
configurations](./fileset-catalog-with-s3.md#using-the-gvfs-java-client-to-access-the-fileset),
[GCS GVFS Java client
configurations](./fileset-catalog-with-gcs.md#using-the-gvfs-java-client-to-access-the-fileset),
[OSS GVFS Java client
configurations](./fileset-catalog-with-oss.md#using-the-gvfs-java-client-to-access-the-fileset)
and [Azure Blob Storage GVFS Java client
configurations](./fileset-catalog-with-adls.md#using-the-gvfs-java-client-to-access-the-fileset)
for more details.
-#### Custom fileset
+#### Custom fileset
Since 0.7.0-incubating, users can define their own fileset type and configure
the corresponding
properties, for more, please refer to [Custom
Fileset](./fileset-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset).
So, if you want to access the custom fileset through GVFS, you need to
configure the corresponding properties.
@@ -107,7 +108,7 @@ You can configure these properties in two ways:
Path filesetPath = new
Path("gvfs://fileset/test_catalog/test_schema/test_fileset_1");
FileSystem fs = filesetPath.getFileSystem(conf);
```
-
+
2. Configure the properties in the `core-site.xml` file of the Hadoop
environment:
```xml
@@ -262,7 +263,7 @@ For Tensorflow to support GVFS, you need to recompile the
[tensorflow-io](https:
export HADOOP_HOME=${your_hadoop_home}
export HADOOP_CONF_DIR=${your_hadoop_conf_home}
# set the location name if you want to access a specific location
- # export CURRENT_LOCATION_NAME=${the_fileset_location_name}
+ # export CURRENT_LOCATION_NAME=${the_fileset_location_name}
export PATH=$PATH:$HADOOP_HOME/libexec/hadoop-config.sh
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
@@ -380,7 +381,7 @@ to recompile the native libraries like `libhdfs` and
others, and completely repl
|---------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------|-----------------------------------|------------------|
| `server_uri` | The Gravitino server uri, e.g.
`http://localhost:8090`.
| (none)
| Yes | 0.6.0-incubating |
| `metalake_name` | The metalake name which the fileset
belongs to.
| (none)
| Yes | 0.6.0-incubating |
-| `cache_size` | The cache capacity of the Gravitino
Virtual File System.
| `20`
| No | 0.6.0-incubating |
[...]
+| `cache_size` | The cache capacity of the Gravitino
Virtual File System.
| `20`
| No | 0.6.0-incubating |
| `cache_expired_time` | The value of time that the cache expires
after accessing in the Gravitino Virtual File System. The value is in
`seconds`.
| `3600`
| No | 0.6.0-incubating |
| `auth_type` | The auth type to initialize the Gravitino
client to use with the Gravitino Virtual File System. Currently supports
`simple` and `oauth2` auth types.
| `simple`
| No | 0.6.0-incubating |
| `oauth2_server_uri` | The auth server URI for the Gravitino
client when using `oauth2` auth type.
| (none)
| Yes if you use `oauth2` auth type | 0.7.0-incubating |
@@ -396,7 +397,7 @@ to recompile the native libraries like `libhdfs` and
others, and completely repl
| `enable_credential_vending` | Whether to enable credential vending for
the Gravitino Virtual File System.
| `false`
| No | 0.9.0-incubating |
| `gvfs_gravitino_client_` | The configuration key prefix for the
Gravitino client. You can set the config for the Gravitino client.
| (none)
| No | 1.0.0 |
| `enable_fileset_metadata_cache` | Whether to cache the fileset or fileset
catalog metadata in the Gravitino Virtual File System. Note that this cache
causes a side effect: if you modify the fileset or fileset catalog metadata,
the client can not see the latest changes.
| `false`
| No | 1.0.0 |
-
+| `auto_create_location` | The configuration key for whether to
enable auto-creation of fileset location when the server-side filesystem ops
are disabled and the location does not exist.
| `true`
| No | 1.1.0 |
To configure the Gravitino client, use properties prefixed with
`gvfs_gravitino_client_`. These properties will be passed to the Gravitino
client after removing the `gvfs_` prefix.
**Example:** Setting `gvfs_gravitino_client_request_timeout` is equivalent to
setting `gravitino_client_request_timeout` for the Gravitino client.
@@ -442,23 +443,23 @@ For fileset with multiple locations, you can specify
which location to access us
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
-
+
<property>
<name>hadoop.client.kerberos.principal</name>
<value>[email protected]</value>
</property>
-
+
<property>
<name>hadoop.client.keytab.file</name>
<value>/tmp/xxx.keytab</value>
</property>
-
+
<!-- Optional, if you want to access a specific location -->
<property>
<name>fs.gravitino.current.location.name</name>
<value>location-name</value>
</property>
-
+
# Configure Hadoop env in Linux
export HADOOP_HOME=${YOUR_HADOOP_PATH}
export HADOOP_CONF_DIR=${YOUR_HADOOP_PATH}/etc/hadoop