This is an automated email from the ASF dual-hosted git repository.
abhijain pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new 7c54b95fd3 Fix owner execute permission bit issue for path existing in
destination (#4147)
7c54b95fd3 is described below
commit 7c54b95fd34d57249a1fb351b504713ed43ccce1
Author: abhishekmjain <[email protected]>
AuthorDate: Tue Oct 28 16:20:21 2025 +0530
Fix owner execute permission bit issue for path existing in destination
(#4147)
---
.../data/management/copy/ManifestBasedDataset.java | 22 ++-
.../dataset/ManifestBasedDatasetFinderTest.java | 157 ++++++++++++++++++++-
.../manifestBasedDistcpTest/nestedDirectories.json | 20 +++
3 files changed, 192 insertions(+), 7 deletions(-)
diff --git
a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
index 8f9a78b7ce..531f506bf8 100644
---
a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
+++
b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
@@ -116,6 +116,7 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
// map of paths and permissions sorted by depth of path, so that
permissions can be set in order
Map<String, List<OwnerAndPermission>> ancestorOwnerAndPermissions = new
HashMap<>();
Map<String, List<OwnerAndPermission>>
ancestorOwnerAndPermissionsForSetPermissionStep = new HashMap<>();
+ Map<String, OwnerAndPermission>
existingDirectoryPermissionsForSetPermissionStep = new HashMap<>();
TreeMap<String, OwnerAndPermission> flattenedAncestorPermissions = new
TreeMap<>(
Comparator.comparingInt((String o) ->
o.split("/").length).thenComparing(o -> o));
try {
@@ -151,13 +152,18 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
// FileAwareInputStreamDataWriter::setOwnerExecuteBitIfDirectory
-> HadoopUtils::addExecutePermissionToOwner
// We need to revert this extra permission change in
setPermissionStep
if (srcFile.isDirectory() &&
!srcFile.getPermission().getUserAction().implies(FsAction.EXECUTE)
- &&
!ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fileToCopy).toString())
- && !targetFs.exists(fileToCopy)) {
- List<OwnerAndPermission> ancestorsOwnerAndPermissionUpdated =
new ArrayList<>();
+ &&
!ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fileToCopy).toString()))
{
OwnerAndPermission srcFileOwnerPermissionReplicatedForDest = new
OwnerAndPermission(copyableFile.getDestinationOwnerAndPermission());
-
ancestorsOwnerAndPermissionUpdated.add(srcFileOwnerPermissionReplicatedForDest);
-
copyableFile.getAncestorsOwnerAndPermission().forEach(ancestorOwnPerm ->
ancestorsOwnerAndPermissionUpdated.add(new
OwnerAndPermission(ancestorOwnPerm)));
-
ancestorOwnerAndPermissionsForSetPermissionStep.put(fileToCopy.toString(),
ancestorsOwnerAndPermissionUpdated);
+ if(!targetFs.exists(fileToCopy)) {
+ List<OwnerAndPermission> ancestorsOwnerAndPermissionUpdated =
new ArrayList<>();
+
ancestorsOwnerAndPermissionUpdated.add(srcFileOwnerPermissionReplicatedForDest);
+
copyableFile.getAncestorsOwnerAndPermission().forEach(ancestorOwnPerm ->
ancestorsOwnerAndPermissionUpdated.add(new
OwnerAndPermission(ancestorOwnPerm)));
+
ancestorOwnerAndPermissionsForSetPermissionStep.put(fileToCopy.toString(),
ancestorsOwnerAndPermissionUpdated);
+ }
+ else {
+ // If the path exists, update only current directory
permission in post publish step and not entire hierarchy.
+
existingDirectoryPermissionsForSetPermissionStep.put(fileToCopy.toString(),
srcFileOwnerPermissionReplicatedForDest);
+ }
}
// Always grab the parent since the above permission setting
should be setting the permission for a folder itself
@@ -198,6 +204,10 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
currentPath = currentPath.getParent();
}
}
+ for (Map.Entry<String, OwnerAndPermission>
existingDirectoryPermissions :
existingDirectoryPermissionsForSetPermissionStep.entrySet()) {
+ Path currentPath = new Path(existingDirectoryPermissions.getKey());
+ flattenedAncestorPermissions.put(currentPath.toString(),
existingDirectoryPermissions.getValue());
+ }
CommitStep setPermissionCommitStep = new
SetPermissionCommitStep(targetFs, flattenedAncestorPermissions,
this.properties);
copyEntities.add(new PostPublishStep(datasetURN(), Maps.newHashMap(),
setPermissionCommitStep, 1));
}
diff --git
a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
index 6acc668073..2b62ccad8f 100644
---
a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
+++
b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
@@ -159,7 +159,7 @@ public class ManifestBasedDatasetFinderTest {
CommitStep setPermissionStep = ((PostPublishStep)
fileSet.getFiles().get(3)).getStep();
Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep);
Map<String, OwnerAndPermission> ownerAndPermissionMap =
((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions();
- // Ignore /tmp as it already exists on destination
+ // Ignore /tmp as it is not part of manifest and already present in
destination
Assert.assertEquals(ownerAndPermissionMap.size(), 1);
Assert.assertTrue(ownerAndPermissionMap.containsKey("/tmp/dataset"));
}
@@ -415,6 +415,161 @@ public class ManifestBasedDatasetFinderTest {
}
}
+ /**
+ * Test correct permissions are set for existing directories.
+ */
+ @Test
+ public void testCorrectPermissionForExistingDirectoriesWithoutExecute()
throws IOException, URISyntaxException {
+ Path manifestPath = new
Path(getClass().getClassLoader().getResource("manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json").getPath());
+ Properties props = new Properties();
+ props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/");
+ props.setProperty("gobblin.copy.preserved.attributes", "rbugpvta");
+
+ try (FileSystem sourceFs = Mockito.mock(FileSystem.class);
+ FileSystem manifestReadFs = Mockito.mock(FileSystem.class);
+ FileSystem destFs = Mockito.mock(FileSystem.class)) {
+
+ setSourceAndDestFsMocks(sourceFs, destFs, manifestPath, manifestReadFs,
false);
+
+ // Key difference: directory already exists on target
+ Mockito.when(destFs.exists(new Path("/tmp/dataset"))).thenReturn(true);
+ Mockito.when(destFs.exists(new Path("/tmp"))).thenReturn(true);
+
+
Mockito.when(destFs.getFileStatus(any(Path.class))).thenReturn(localFs.getFileStatus(new
Path(tmpDir.toString())));
+
+ // Source directory has 000 permission (no execute bit)
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset", "d---------",
"owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp", "dr--r--r--", "owner2",
"group2", true);
+
+ Iterator<FileSet<CopyEntity>> fileSets = new
ManifestBasedDataset(sourceFs, manifestReadFs, manifestPath, props)
+ .getFileSetIterator(destFs, CopyConfiguration.builder(destFs,
props).build());
+
+ Assert.assertTrue(fileSets.hasNext());
+ FileSet<CopyEntity> fileSet = fileSets.next();
+ Assert.assertEquals(fileSet.getFiles().size(), 3); // 1 dir to copy + 1
pre publish step + 1 post publish step
+
+ // Verify SetPermissionCommitStep includes the directory even though it
exists on target
+ CommitStep setPermissionStep = ((PostPublishStep)
fileSet.getFiles().get(2)).getStep();
+ Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep);
+ Map<String, OwnerAndPermission> ownerAndPermissionMap =
((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions();
+
+ // Should include only /tmp/dataset and not /tmp since it is not part of
manifest for permission correction
+ Assert.assertEquals(ownerAndPermissionMap.size(), 1);
+ Assert.assertTrue(ownerAndPermissionMap.containsKey("/tmp/dataset"));
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getFsPermission(),
FsPermission.valueOf("d---------"));
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getOwner(),
"owner1");
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getGroup(),
"group1");
+
+ Assert.assertFalse(ownerAndPermissionMap.containsKey("/tmp"));
+ }
+ }
+
+ /**
+ * Test correct permissions are set in post-publish step.
+ */
+ @Test
+ public void testCorrectPermissionForMixedExistingAndNewDirectories() throws
IOException, URISyntaxException {
+ Path manifestPath = new
Path(getClass().getClassLoader().getResource("manifestBasedDistcpTest/nestedDirectories.json").getPath());
+ Properties props = new Properties();
+ props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/");
+ props.setProperty("gobblin.copy.preserved.attributes", "rbugpvta");
+
+ try (FileSystem sourceFs = Mockito.mock(FileSystem.class);
+ FileSystem manifestReadFs = Mockito.mock(FileSystem.class);
+ FileSystem destFs = Mockito.mock(FileSystem.class)) {
+
+ setSourceAndDestFsMocks(sourceFs, destFs, manifestPath, manifestReadFs,
false);
+
+ // Mix of existing and non-existing directories
+ Mockito.when(destFs.exists(new Path("/tmp"))).thenReturn(true);
+ Mockito.when(destFs.exists(new Path("/tmp/dataset"))).thenReturn(true);
+ Mockito.when(destFs.exists(new
Path("/tmp/dataset/hourly"))).thenReturn(false);
+ Mockito.when(destFs.exists(new
Path("/tmp/dataset/hourly/metadata"))).thenReturn(false);
+ Mockito.when(destFs.exists(new Path("/tmp/dataset2"))).thenReturn(true);
+ Mockito.when(destFs.exists(new
Path("/tmp/dataset2/hourly"))).thenReturn(true);
+ Mockito.when(destFs.exists(new
Path("/tmp/dataset2/hourly/metadata"))).thenReturn(true);
+ Mockito.when(destFs.exists(new
Path("/tmp/dataset2/hourly/metadata2"))).thenReturn(true);
+
+
Mockito.when(destFs.getFileStatus(any(Path.class))).thenReturn(localFs.getFileStatus(new
Path(tmpDir.toString())));
+
+ // Set up directories with various permissions, some without execute bit
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset/hourly/metadata",
"drw-rw-rw-", "owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset2/hourly/metadata",
"dr-xr--r--", "owner2", "group2", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset2/hourly/metadata2",
"dr--r--r--", "owner2", "group2", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset/hourly",
"drw-rw-rw-", "owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset2/hourly",
"dr--r--r--", "owner2", "group2", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset", "drw-r-----",
"owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset2", "dr--r--r--",
"owner2", "group2", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp", "dr--r--r--", "owner3",
"group3", true);
+
+ AclStatus aclStatusDest =
buildAclStatusWithPermissions("user::r--,group::---,other::---", "group3",
"owner3");
+
Mockito.when(destFs.getAclStatus(any(Path.class))).thenReturn(aclStatusDest);
+
+ Iterator<FileSet<CopyEntity>> fileSets = new
ManifestBasedDataset(sourceFs, manifestReadFs, manifestPath, props)
+ .getFileSetIterator(destFs, CopyConfiguration.builder(destFs,
props).build());
+
+ Assert.assertTrue(fileSets.hasNext());
+ FileSet<CopyEntity> fileSet = fileSets.next();
+
+ CommitStep setPermissionStep = ((PostPublishStep)
fileSet.getFiles().get(4)).getStep();
+ Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep);
+ Map<String, OwnerAndPermission> ownerAndPermissionMap =
((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions();
+
+ // Not part of manifest and already exists in destination
+ Assert.assertFalse(ownerAndPermissionMap.containsKey("/tmp/dataset"));
+ Assert.assertFalse(ownerAndPermissionMap.containsKey("/tmp/dataset2"));
+
+
Assert.assertFalse(ownerAndPermissionMap.containsKey("/tmp/dataset2/hourly/metadata"));
// Owner execute is set
+
+ // Path not present in target
+
Assert.assertTrue(ownerAndPermissionMap.containsKey("/tmp/dataset/hourly/metadata"));
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset/hourly/metadata").getFsPermission(),
FsPermission.valueOf("drw-rw-rw-"));
+
Assert.assertTrue(ownerAndPermissionMap.containsKey("/tmp/dataset2/hourly/metadata2"));
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset2/hourly/metadata2").getFsPermission(),
FsPermission.valueOf("dr--r--r--"));
+ }
+ }
+
+ /**
+ * Test that directories with execute permission are NOT added to
SetPermissionCommitStep
+ * when they already exist on target.
+ */
+ @Test
+ public void testNoPermissionFixForExistingDirectoriesWithExecute() throws
IOException, URISyntaxException {
+ Path manifestPath = new
Path(getClass().getClassLoader().getResource("manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json").getPath());
+ Properties props = new Properties();
+ props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/");
+ props.setProperty("gobblin.copy.preserved.attributes", "rbugpvta");
+
+ try (FileSystem sourceFs = Mockito.mock(FileSystem.class);
+ FileSystem manifestReadFs = Mockito.mock(FileSystem.class);
+ FileSystem destFs = Mockito.mock(FileSystem.class)) {
+
+ setSourceAndDestFsMocks(sourceFs, destFs, manifestPath, manifestReadFs,
false);
+
+ // Directory already exists on target
+ Mockito.when(destFs.exists(new Path("/tmp/dataset"))).thenReturn(true);
+ Mockito.when(destFs.exists(new Path("/tmp"))).thenReturn(true);
+
+
Mockito.when(destFs.getFileStatus(any(Path.class))).thenReturn(localFs.getFileStatus(new
Path(tmpDir.toString())));
+
+ // Source directory has execute permission (should NOT need permission
fix)
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset", "drwxrwxrwx",
"owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp", "dr-xr-xr-x", "owner2",
"group2", true);
+
+ Iterator<FileSet<CopyEntity>> fileSets = new
ManifestBasedDataset(sourceFs, manifestReadFs, manifestPath, props)
+ .getFileSetIterator(destFs, CopyConfiguration.builder(destFs,
props).build());
+
+ Assert.assertTrue(fileSets.hasNext());
+ FileSet<CopyEntity> fileSet = fileSets.next();
+
+ CommitStep setPermissionStep = ((PostPublishStep)
fileSet.getFiles().get(2)).getStep();
+ Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep);
+ Map<String, OwnerAndPermission> ownerAndPermissionMap =
((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions();
+
+ Assert.assertEquals(ownerAndPermissionMap.size(), 0);
+ }
+ }
+
private void setSourceAndDestFsMocks(FileSystem sourceFs, FileSystem destFs,
Path manifestPath, FileSystem manifestReadFs, boolean setFileStatusMock) throws
IOException, URISyntaxException {
URI SRC_FS_URI = new URI("source", "the.source.org", "/", null);
URI MANIFEST_READ_FS_URI = new URI("manifest-read",
"the.manifest-source.org", "/", null);
diff --git
a/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/nestedDirectories.json
b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/nestedDirectories.json
new file mode 100644
index 0000000000..beb9f5572b
--- /dev/null
+++
b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/nestedDirectories.json
@@ -0,0 +1,20 @@
+[
+ {
+ "id":"1",
+ "fileName":"/tmp/dataset/hourly/metadata",
+ "fileGroup":"/tmp/dataset",
+ "fileSizeInBytes":"1024"
+ },
+ {
+ "id":"2",
+ "fileName":"/tmp/dataset2/hourly/metadata",
+ "fileGroup":"/tmp/dataset2",
+ "fileSizeInBytes":"1028"
+ },
+ {
+ "id":"3",
+ "fileName":"/tmp/dataset2/hourly/metadata2",
+ "fileGroup":"/tmp/dataset2",
+ "fileSizeInBytes":"1028"
+ }
+]
\ No newline at end of file