This is an automated email from the ASF dual-hosted git repository. vivekrai pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/gobblin.git
The following commit(s) were added to refs/heads/master by this push: new 8d71c89cc1 [GOBBLIN-2206] Revert extra execute bit getting set in ManifestDistcp (#4115) 8d71c89cc1 is described below commit 8d71c89cc16a1b8d2d4bf7f19182fca32a3608c4 Author: Vivek Rai <43493515+blazer-...@users.noreply.github.com> AuthorDate: Mon May 5 13:11:35 2025 +0530 [GOBBLIN-2206] Revert extra execute bit getting set in ManifestDistcp (#4115) * added check to add extra set permission step * add copy constructor for ownerandpermission * remove extra null check for checkstyle failure --- .../data/management/copy/ManifestBasedDataset.java | 22 ++++++- .../dataset/ManifestBasedDatasetFinderTest.java | 72 ++++++++++++++++++++++ .../sampleManifestWithOnlyDirectory.json | 8 +++ .../util/filesystem/OwnerAndPermission.java | 16 +++++ .../util/filesystem/OwnerAndPermissionTest.java | 60 ++++++++++++++++++ 5 files changed, 177 insertions(+), 1 deletion(-) diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java index 6d7df69043..8f9a78b7ce 100644 --- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java +++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java @@ -18,6 +18,7 @@ package org.apache.gobblin.data.management.copy; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -31,6 +32,7 @@ import java.util.concurrent.TimeUnit; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsAction; import com.google.common.base.Optional; import com.google.common.cache.Cache; @@ -113,6 +115,7 @@ public class ManifestBasedDataset implements IterableCopyableDataset { List<FileStatus> toDelete = Lists.newArrayList(); // map of paths and permissions sorted by depth of path, so that permissions can be set in order Map<String, List<OwnerAndPermission>> ancestorOwnerAndPermissions = new HashMap<>(); + Map<String, List<OwnerAndPermission>> ancestorOwnerAndPermissionsForSetPermissionStep = new HashMap<>(); TreeMap<String, OwnerAndPermission> flattenedAncestorPermissions = new TreeMap<>( Comparator.comparingInt((String o) -> o.split("/").length).thenComparing(o -> o)); try { @@ -143,12 +146,29 @@ public class ManifestBasedDataset implements IterableCopyableDataset { copyableFile.setFsDatasets(srcFs, targetFs); copyEntities.add(copyableFile); + // In case of directory with 000 permission, the permission is changed to 100 due to HadoopUtils::addExecutePermissionToOwner + // getting called from CopyDataPublisher::preserveFileAttrInPublisher -> FileAwareInputStreamDataWriter::setPathPermission -> + // FileAwareInputStreamDataWriter::setOwnerExecuteBitIfDirectory -> HadoopUtils::addExecutePermissionToOwner + // We need to revert this extra permission change in setPermissionStep + if (srcFile.isDirectory() && !srcFile.getPermission().getUserAction().implies(FsAction.EXECUTE) + && !ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fileToCopy).toString()) + && !targetFs.exists(fileToCopy)) { + List<OwnerAndPermission> ancestorsOwnerAndPermissionUpdated = new ArrayList<>(); + OwnerAndPermission srcFileOwnerPermissionReplicatedForDest = new OwnerAndPermission(copyableFile.getDestinationOwnerAndPermission()); + ancestorsOwnerAndPermissionUpdated.add(srcFileOwnerPermissionReplicatedForDest); + copyableFile.getAncestorsOwnerAndPermission().forEach(ancestorOwnPerm -> ancestorsOwnerAndPermissionUpdated.add(new OwnerAndPermission(ancestorOwnPerm))); + ancestorOwnerAndPermissionsForSetPermissionStep.put(fileToCopy.toString(), ancestorsOwnerAndPermissionUpdated); + } + // Always grab the parent since the above permission setting should be setting the permission for a folder itself // {@link CopyDataPublisher#preserveFileAttrInPublisher} will be setting the permission for the empty child dir Path fromPath = fileToCopy.getParent(); // Avoid duplicate calculation for the same ancestor if (fromPath != null && !ancestorOwnerAndPermissions.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fromPath).toString()) && !targetFs.exists(fromPath)) { ancestorOwnerAndPermissions.put(fromPath.toString(), copyableFile.getAncestorsOwnerAndPermission()); + if (!ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fromPath).toString())) { + ancestorOwnerAndPermissionsForSetPermissionStep.put(fromPath.toString(), copyableFile.getAncestorsOwnerAndPermission()); + } } if (existOnTarget && srcFile.isFile()) { @@ -167,7 +187,7 @@ public class ManifestBasedDataset implements IterableCopyableDataset { copyEntities.add(new PrePublishStep(datasetURN(), Maps.newHashMap(), createDirectoryWithPermissionsCommitStep, 1)); if (this.enableSetPermissionPostPublish) { - for (Map.Entry<String, List<OwnerAndPermission>> recursiveParentPermissions : ancestorOwnerAndPermissions.entrySet()) { + for (Map.Entry<String, List<OwnerAndPermission>> recursiveParentPermissions : ancestorOwnerAndPermissionsForSetPermissionStep.entrySet()) { Path currentPath = new Path(recursiveParentPermissions.getKey()); for (OwnerAndPermission ownerAndPermission : recursiveParentPermissions.getValue()) { // Ignore folders that already exist in destination, we assume that the publisher will re-sync those permissions if needed and diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java index d81de494d0..6acc668073 100644 --- a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java +++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java @@ -36,6 +36,7 @@ import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsPermission; import org.mockito.Mockito; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import com.google.common.io.Files; @@ -343,6 +344,77 @@ public class ManifestBasedDatasetFinderTest { Assert.assertEquals(fileSet.getFiles().size(), 2); // 1 files to copy + 1 pre publish step } + @DataProvider + public Object[][] dirPermissionWithExpectedSetPermissionStepCount() { + return new Object[][] { + {"d---------", 2}, + {"drw-rw--w-", 2}, + {"d-w-r----x", 2}, + {"drwxrwxrwx", 1}, + {"dr-xr-xr-x", 1}, + {"d--x------", 1}, + }; + } + + @Test(dataProvider = "dirPermissionWithExpectedSetPermissionStepCount") + public void testSetPermissionWhenCopyingDirectoryWithOwnerExecutePermissionSetUnset(String dirPermission, int expectedCount) throws IOException, URISyntaxException { + Path manifestPath = new Path(getClass().getClassLoader().getResource("manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json").getPath()); + Properties props = new Properties(); + props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/"); + props.setProperty("gobblin.copy.preserved.attributes", "rbugpvta"); + try (FileSystem sourceFs = Mockito.mock(FileSystem.class); + FileSystem manifestReadFs = Mockito.mock(FileSystem.class); + FileSystem destFs = Mockito.mock(FileSystem.class)) { + + setSourceAndDestFsMocks(sourceFs, destFs, manifestPath, manifestReadFs, false); + + Mockito.when(destFs.exists(new Path("/tmp/dataset"))).thenReturn(false); + Mockito.when(destFs.exists(new Path("/tmp"))).thenReturn(false); + + Mockito.when(destFs.getFileStatus(any(Path.class))).thenReturn(localFs.getFileStatus(new Path(tmpDir.toString()))); + + setFsMockPathWithPermissions(sourceFs, "/tmp/dataset", dirPermission, "owner1", "group1", true); + setFsMockPathWithPermissions(sourceFs, "/tmp", "dr--r--r--", "owner2", "group2", true); + + Iterator<FileSet<CopyEntity>> fileSets = new ManifestBasedDataset(sourceFs, manifestReadFs, manifestPath, props).getFileSetIterator(destFs, + CopyConfiguration.builder(destFs, props).build()); + + Assert.assertTrue(fileSets.hasNext()); + FileSet<CopyEntity> fileSet = fileSets.next(); + // 1 dir to copy + 1 pre publish step + 1 post publish step + Assert.assertEquals(fileSet.getFiles().size(),3); + + CommitStep createDirectoryStep = ((PrePublishStep) fileSet.getFiles().get(1)).getStep(); + Assert.assertTrue(createDirectoryStep instanceof CreateDirectoryWithPermissionsCommitStep); + Map<String, List<OwnerAndPermission>> pathAndPermissions = ((CreateDirectoryWithPermissionsCommitStep) createDirectoryStep).getPathAndPermissions(); + Assert.assertEquals(pathAndPermissions.size(), 1); + + Assert.assertTrue(pathAndPermissions.containsKey("/tmp")); + Assert.assertEquals(pathAndPermissions.get("/tmp").size(), 1); + Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getFsPermission(), FsPermission.valueOf("dr--r--r--")); + Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getOwner(), "owner2"); + Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getGroup(), "group2"); + + CommitStep setPermissionStep = ((PostPublishStep) fileSet.getFiles().get(2)).getStep(); + Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep); + Map<String, OwnerAndPermission> ownerAndPermissionMap = ((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions(); + Assert.assertEquals(ownerAndPermissionMap.size(), expectedCount); + + List<String> sortedMapKeys = new ArrayList<>(ownerAndPermissionMap.keySet()); + Assert.assertEquals(sortedMapKeys.get(0), "/tmp"); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getFsPermission(), FsPermission.valueOf("dr--r--r--")); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getOwner(), "owner2"); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getGroup(), "group2"); + + if (expectedCount > 1) { + Assert.assertEquals(sortedMapKeys.get(1), "/tmp/dataset"); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getFsPermission(), FsPermission.valueOf(dirPermission)); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getOwner(), "owner1"); + Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getGroup(), "group1"); + } + } + } + private void setSourceAndDestFsMocks(FileSystem sourceFs, FileSystem destFs, Path manifestPath, FileSystem manifestReadFs, boolean setFileStatusMock) throws IOException, URISyntaxException { URI SRC_FS_URI = new URI("source", "the.source.org", "/", null); URI MANIFEST_READ_FS_URI = new URI("manifest-read", "the.manifest-source.org", "/", null); diff --git a/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json new file mode 100644 index 0000000000..5dd94ff064 --- /dev/null +++ b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json @@ -0,0 +1,8 @@ +[ + { + "id":"1", + "fileName":"/tmp/dataset", + "fileGroup":"/tmp/dataset", + "fileSizeInBytes":"1024" + } +] \ No newline at end of file diff --git a/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java b/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java index 7ae4cdc3fa..b14ab4fbe2 100644 --- a/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java +++ b/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java @@ -54,6 +54,22 @@ public class OwnerAndPermission implements Writable { this(owner, group, fsPermission, Lists.newArrayList()); } + + /** + * Copy constructor for {@link OwnerAndPermission}. + * <p> + * Creates a new instance by copying the values from another {@link OwnerAndPermission} object. + * Performs a deep copy of {@link FsPermission} and a shallow copy of the {@code aclEntries} list, + * since {@link org.apache.hadoop.fs.permission.AclEntry} is immutable. + * </p> + * + * @param other the {@code OwnerAndPermission} instance to copy from. + */ + public OwnerAndPermission(OwnerAndPermission other) { + this(other.owner, other.group, new FsPermission(other.fsPermission), + other.aclEntries == null ? Lists.newArrayList() : Lists.newArrayList(other.aclEntries)); + } + @Override public void write(DataOutput dataOutput) throws IOException { Text.writeString(dataOutput, this.owner); diff --git a/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java b/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java new file mode 100644 index 0000000000..d2276ee9d0 --- /dev/null +++ b/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gobblin.util.filesystem; + +import java.util.Collections; +import java.util.List; + +import org.apache.hadoop.fs.permission.AclEntry; +import org.apache.hadoop.fs.permission.AclEntryScope; +import org.apache.hadoop.fs.permission.AclEntryType; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** Tests for {@link OwnerAndPermission}*/ +public class OwnerAndPermissionTest { + + @Test + public void testOwnerAndPermissionCopyConstructor() { + String owner = "testOwner"; + String group = "testGroup"; + FsPermission permission = new FsPermission((short) 0755); + AclEntry aclEntry = new AclEntry.Builder() + .setPermission(FsAction.READ_WRITE) + .setName("test-acl") + .setScope(AclEntryScope.DEFAULT) + .setType(AclEntryType.GROUP) + .build(); + List<AclEntry> aclEntries = Collections.singletonList(aclEntry); + + OwnerAndPermission ownerAndPermission = new OwnerAndPermission(owner, group, permission, aclEntries); + OwnerAndPermission copyOwnerAndPermission = new OwnerAndPermission(ownerAndPermission); + + Assert.assertEquals(copyOwnerAndPermission.getOwner(), owner); + Assert.assertEquals(copyOwnerAndPermission.getGroup(), group); + Assert.assertEquals(copyOwnerAndPermission.getFsPermission(), permission); + Assert.assertEquals(copyOwnerAndPermission.getAclEntries(), aclEntries); + + Assert.assertNotSame(ownerAndPermission, copyOwnerAndPermission); + Assert.assertNotSame(ownerAndPermission.getFsPermission(), copyOwnerAndPermission.getFsPermission()); + Assert.assertNotSame(ownerAndPermission.getAclEntries(), copyOwnerAndPermission.getAclEntries()); + } +}