This is an automated email from the ASF dual-hosted git repository.
jshao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/main by this push:
new fce0bf0cef [#7793] improvement(fileset-catalog): Remove single file
check in FilesetCatalogOperations (#7794)
fce0bf0cef is described below
commit fce0bf0cef3ef976e48bd4ded83c3eb22a41e8bc
Author: Mini Yu <[email protected]>
AuthorDate: Wed Aug 13 18:20:48 2025 +0800
[#7793] improvement(fileset-catalog): Remove single file check in
FilesetCatalogOperations (#7794)
### What changes were proposed in this pull request?
Remove single file check logic in FilesetCatalogOperations.
### Why are the changes needed?
It's not needed.
Fix: #7793
### Does this PR introduce _any_ user-facing change?
N/A.
### How was this patch tested?
Existing tests.
---
.../catalog/fileset/FilesetCatalogOperations.java | 89 ++++++++++------------
.../fileset/TestFilesetCatalogOperations.java | 4 +-
.../fileset/integration/test/FilesetCatalogIT.java | 17 +++++
docs/fileset-catalog.md | 4 +-
docs/manage-fileset-metadata-using-gravitino.md | 4 +-
5 files changed, 64 insertions(+), 54 deletions(-)
diff --git
a/catalogs/catalog-fileset/src/main/java/org/apache/gravitino/catalog/fileset/FilesetCatalogOperations.java
b/catalogs/catalog-fileset/src/main/java/org/apache/gravitino/catalog/fileset/FilesetCatalogOperations.java
index d49c47c288..9f67123aa0 100644
---
a/catalogs/catalog-fileset/src/main/java/org/apache/gravitino/catalog/fileset/FilesetCatalogOperations.java
+++
b/catalogs/catalog-fileset/src/main/java/org/apache/gravitino/catalog/fileset/FilesetCatalogOperations.java
@@ -35,7 +35,6 @@ import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
-import java.io.FileNotFoundException;
import java.io.IOException;
import java.time.Instant;
import java.util.Arrays;
@@ -447,6 +446,15 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
filesetPathsBuilder.put(entry.getKey(), formalizePath);
FileSystem fs = getFileSystemWithCache(formalizePath, conf);
+
+ if (fs.exists(formalizePath) &&
fs.getFileStatus(formalizePath).isFile()) {
+ throw new IllegalArgumentException(
+ "Fileset location cannot be a file: "
+ + formalizePath
+ + ", location name: "
+ + entry.getKey());
+ }
+
if (!fs.exists(formalizePath)) {
if (!fs.mkdirs(formalizePath)) {
throw new RuntimeException(
@@ -680,24 +688,6 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
"Location name %s does not exist in fileset %s", targetLocationName,
ident);
}
- boolean isSingleFile = false;
- if (disableFSOps) {
- LOG.warn(
- "Filesystem operations are disabled in the server side, we cannot
check if the "
- + "storage location mounts to a directory or single file, we
assume it is a directory"
- + "(in most of the cases). If it happens to be a single file,
then the generated "
- + "file location may be a wrong path. Please avoid using Fileset
to manage a single"
- + " file path.");
- } else {
- isSingleFile = checkSingleFile(fileset, targetLocationName);
- }
-
- // if the storage location is a single file, it cannot have sub path to
access.
- if (isSingleFile && StringUtils.isNotBlank(processedSubPath)) {
- throw new GravitinoRuntimeException(
- "Sub path should always be blank, because the fileset only mounts a
single file.");
- }
-
// do checks for some data operations.
if (hasCallerContext()) {
Map<String, String> contextMap =
CallerContext.CallerContextHolder.get().context();
@@ -713,14 +703,8 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
FilesetDataOperation dataOperation =
FilesetDataOperation.valueOf(operation);
switch (dataOperation) {
case RENAME:
- // Fileset only mounts a single file, the storage location of the
fileset cannot be
- // renamed; Otherwise the metadata in the Gravitino server may be
inconsistent.
- if (isSingleFile) {
- throw new GravitinoRuntimeException(
- "Cannot rename the fileset: %s which only mounts to a single
file.", ident);
- }
- // if the sub path is blank, it cannot be renamed,
- // otherwise the metadata in the Gravitino server may be
inconsistent.
+ // if the sub path is blank, it cannot be renamed otherwise the
metadata in the
+ // Gravitino server may be inconsistent.
if (StringUtils.isBlank(processedSubPath)
|| (processedSubPath.startsWith(SLASH) &&
processedSubPath.length() == 1)) {
throw new GravitinoRuntimeException(
@@ -734,13 +718,12 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
}
String fileLocation;
- // 1. if the storage location is a single file, we pass the storage
location directly
- // 2. if the processed sub path is blank, we pass the storage location
directly
- if (isSingleFile || StringUtils.isBlank(processedSubPath)) {
+ // If the processed sub path is blank, we pass the storage location
directly
+ if (StringUtils.isBlank(processedSubPath)) {
fileLocation = fileset.storageLocations().get(targetLocationName);
} else {
// the processed sub path always starts with "/" if it is not blank,
- // so we can safely remove the tailing slash if storage location ends
with "/".
+ // so we can safely remove the tailing slash if the storage location
ends with "/".
String storageLocation =
removeTrailingSlash(fileset.storageLocations().get(targetLocationName));
fileLocation = String.format("%s%s", storageLocation, processedSubPath);
@@ -769,6 +752,14 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
if (schemaPath != null &&
!containsPlaceholder(schemaPath.toString())) {
try {
FileSystem fs = getFileSystemWithCache(schemaPath, conf);
+ if (fs.exists(schemaPath) &&
fs.getFileStatus(schemaPath).isFile()) {
+ throw new IllegalArgumentException(
+ "Fileset schema location cannot be a file: "
+ + schemaPath
+ + ", location name: "
+ + locationName);
+ }
+
if (!fs.exists(schemaPath)) {
if (!fs.mkdirs(schemaPath)) {
// Fail the operation when failed to create the schema path.
@@ -1057,6 +1048,24 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
}
checkPlaceholderValue(v);
+
+ if (!disableFSOps && !containsPlaceholder(v)) {
+ Path path = new Path(v);
+ FileSystem fs = getFileSystemWithCache(path, conf);
+ try {
+ if (fs.exists(path) && fs.getFileStatus(path).isFile()) {
+ throw new IllegalArgumentException(
+ "Fileset catalog location cannot be a file: "
+ + v
+ + ", location name: "
+ + locationName);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Failed to check if fileset catalog location exists: " +
v, e);
+ }
+ }
+
catalogStorageLocations.put(locationName, new
Path((ensureTrailingSlash(v))));
}
});
@@ -1321,24 +1330,6 @@ public class FilesetCatalogOperations extends
ManagedSchemaOperations
});
}
- private boolean checkSingleFile(Fileset fileset, String locationName) {
- try {
- Path locationPath = new
Path(fileset.storageLocations().get(locationName));
- FileSystem fileSystem = getFileSystemWithCache(locationPath, conf);
- return fileSystem.getFileStatus(locationPath).isFile();
- } catch (FileNotFoundException e) {
- // We should always return false here, same with the logic in
`FileSystem.isFile(Path f)`.
- return false;
- } catch (IOException e) {
- throw new GravitinoRuntimeException(
- e,
- "Exception occurs when checking whether fileset: %s "
- + "mounts a single file with location name: %s",
- fileset.name(),
- locationName);
- }
- }
-
private String buildGVFSFilePath(
String catalogName, String schemaName, String filesetName, String
subPath) {
String prefix = String.join(SLASH, "/fileset", catalogName, schemaName,
filesetName);
diff --git
a/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/TestFilesetCatalogOperations.java
b/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/TestFilesetCatalogOperations.java
index a98e54bd8e..3007f93a4f 100644
---
a/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/TestFilesetCatalogOperations.java
+++
b/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/TestFilesetCatalogOperations.java
@@ -1381,8 +1381,8 @@ public class TestFilesetCatalogOperations {
CallerContext callerContext =
CallerContext.builder().withContext(contextMap).build();
CallerContext.CallerContextHolder.set(callerContext);
- Assertions.assertThrows(
- GravitinoRuntimeException.class, () ->
ops.getFileLocation(filesetIdent, subPath));
+ // We have removed the check for single file in getFileLocation method.
+ Assertions.assertDoesNotThrow(() -> ops.getFileLocation(filesetIdent,
subPath));
} finally {
CallerContext.CallerContextHolder.remove();
}
diff --git
a/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/integration/test/FilesetCatalogIT.java
b/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/integration/test/FilesetCatalogIT.java
index 66e3ac4ff9..b1a9efb14b 100644
---
a/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/integration/test/FilesetCatalogIT.java
+++
b/catalogs/catalog-fileset/src/test/java/org/apache/gravitino/catalog/fileset/integration/test/FilesetCatalogIT.java
@@ -451,6 +451,23 @@ public class FilesetCatalogIT extends BaseIT {
createFileset(filesetName3, "comment", null, storageLocation3,
ImmutableMap.of("k1", "v1"));
assertFilesetExists(filesetName3);
Assertions.assertEquals(MANAGED, fileset3.type(), "fileset type should be
MANAGED by default");
+
+ // Test create fileset with a single file
+ String filesetName7 = "test_create_fileset_with_a_file";
+ String storageLocation7 = storageLocation(filesetName7);
+ String filePath = storageLocation7 + "/file1.txt";
+ fileSystem.createNewFile(new Path(filePath));
+ Assertions.assertTrue(fileSystem.getFileStatus(new
Path(filePath)).isFile());
+
+ Throwable throwable =
+ Assertions.assertThrows(
+ RuntimeException.class,
+ () -> {
+ createFileset(
+ filesetName7, "comment", MANAGED, filePath,
ImmutableMap.of("k1", "v1"));
+ });
+
+ Assertions.assertTrue(throwable.getMessage().contains("Fileset location
cannot be a file"));
}
@Test
diff --git a/docs/fileset-catalog.md b/docs/fileset-catalog.md
index a34b32260e..6bb194a5f4 100644
--- a/docs/fileset-catalog.md
+++ b/docs/fileset-catalog.md
@@ -29,7 +29,7 @@ Besides the [common catalog
properties](./gravitino-server-config.md#apache-grav
| Property Name | Description
| Default Value | Required | Since Version |
|--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|----------|------------------|
-| `location` | The storage location managed by
Fileset catalog. Its location name is `unknown`.
| (none) | No | 0.5.0
|
+| `location` | The storage location managed by
Fileset catalog. Its location name is `unknown`. The value should always a
directory(HDFS) or path prefix(cloud storage like S3, GCS.) and does not
support a single file.
| (none) | No |
0.5.0 |
| `location-` | The property prefix. User can use
`location-{name}={path}` to set multiple locations with different names for the
catalog.
| (none) | No |
0.9.0-incubating |
| `default-filesystem-provider` | The default filesystem provider of
this Fileset catalog if users do not specify the scheme in the URI. Candidate
values are 'builtin-local', 'builtin-hdfs', 's3', 'gcs', 'abs' and 'oss'.
Default value is `builtin-local`. For S3, if we set this value to 's3', we can
omit the prefix 's3a://' in the location. | `builtin-local` | No |
0.7.0-incubating |
| `filesystem-providers` | The file system providers to add.
Users need to set this configuration to support cloud storage or custom HCFS.
For instance, set it to `s3` or a comma separated string that contains `s3`
like `gs,s3` to support multiple kinds of fileset including `s3`.
| (none) | Yes |
0.7.0-incubating |
@@ -115,7 +115,7 @@ The Fileset catalog supports creating, updating, deleting,
and listing schema.
| Property name | Description
| Default value | Required | Since Version |
|---------------------------------------|---------------------------------------------------------------------------------------------------------------------------|---------------------------|----------|------------------|
-| `location` | The storage location managed by
schema. Its location name is `unknown`.
| (none) | No | 0.5.0 |
+| `location` | The storage location managed by
schema. Its location name is `unknown`. It's also should be a directory or path
prefix. | (none) | No | 0.5.0 |
| `location-` | The property prefix. User can use
`location-{name}={path}` to set multiple locations with different names for the
schema. | (none) | No | 0.9.0-incubating |
| `authentication.impersonation-enable` | Whether to enable impersonation for
this schema of the Fileset catalog.
| The parent(catalog) value | No | 0.6.0-incubating |
| `authentication.type` | The type of authentication for this
schema of Fileset catalog , currently we only support `kerberos`, `simple`.
| The parent(catalog) value | No | 0.6.0-incubating |
diff --git a/docs/manage-fileset-metadata-using-gravitino.md
b/docs/manage-fileset-metadata-using-gravitino.md
index 7faa351a47..79929d6e95 100644
--- a/docs/manage-fileset-metadata-using-gravitino.md
+++ b/docs/manage-fileset-metadata-using-gravitino.md
@@ -117,8 +117,10 @@ Refer to [Drop a
catalog](./manage-relational-metadata-using-gravitino.md#drop-a
in relational catalog for more details. For a fileset catalog, the drop
operation is the same.
:::note
-Currently, Gravitino doesn't support dropping a catalog with schemas and
filesets under it. You have
+- Currently, Gravitino doesn't support dropping a catalog with schemas and
filesets under it. You have
to drop all the schemas and filesets under the catalog before dropping the
catalog.
+- The value of location property in a catalog should be a directory on a file
system
+ (HDFS) or path prefix on a cloud storage (S3, GCS, etc.).** The same goes
for schema and fileset.
:::
### List all catalogs in a metalake