This is an automated email from the ASF dual-hosted git repository.
vivekrai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new 8d71c89cc1 [GOBBLIN-2206] Revert extra execute bit getting set in
ManifestDistcp (#4115)
8d71c89cc1 is described below
commit 8d71c89cc16a1b8d2d4bf7f19182fca32a3608c4
Author: Vivek Rai <[email protected]>
AuthorDate: Mon May 5 13:11:35 2025 +0530
[GOBBLIN-2206] Revert extra execute bit getting set in ManifestDistcp
(#4115)
* added check to add extra set permission step
* add copy constructor for ownerandpermission
* remove extra null check for checkstyle failure
---
.../data/management/copy/ManifestBasedDataset.java | 22 ++++++-
.../dataset/ManifestBasedDatasetFinderTest.java | 72 ++++++++++++++++++++++
.../sampleManifestWithOnlyDirectory.json | 8 +++
.../util/filesystem/OwnerAndPermission.java | 16 +++++
.../util/filesystem/OwnerAndPermissionTest.java | 60 ++++++++++++++++++
5 files changed, 177 insertions(+), 1 deletion(-)
diff --git
a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
index 6d7df69043..8f9a78b7ce 100644
---
a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
+++
b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/ManifestBasedDataset.java
@@ -18,6 +18,7 @@
package org.apache.gobblin.data.management.copy;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
@@ -31,6 +32,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
import com.google.common.base.Optional;
import com.google.common.cache.Cache;
@@ -113,6 +115,7 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
List<FileStatus> toDelete = Lists.newArrayList();
// map of paths and permissions sorted by depth of path, so that
permissions can be set in order
Map<String, List<OwnerAndPermission>> ancestorOwnerAndPermissions = new
HashMap<>();
+ Map<String, List<OwnerAndPermission>>
ancestorOwnerAndPermissionsForSetPermissionStep = new HashMap<>();
TreeMap<String, OwnerAndPermission> flattenedAncestorPermissions = new
TreeMap<>(
Comparator.comparingInt((String o) ->
o.split("/").length).thenComparing(o -> o));
try {
@@ -143,12 +146,29 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
copyableFile.setFsDatasets(srcFs, targetFs);
copyEntities.add(copyableFile);
+ // In case of directory with 000 permission, the permission is
changed to 100 due to HadoopUtils::addExecutePermissionToOwner
+ // getting called from
CopyDataPublisher::preserveFileAttrInPublisher ->
FileAwareInputStreamDataWriter::setPathPermission ->
+ // FileAwareInputStreamDataWriter::setOwnerExecuteBitIfDirectory
-> HadoopUtils::addExecutePermissionToOwner
+ // We need to revert this extra permission change in
setPermissionStep
+ if (srcFile.isDirectory() &&
!srcFile.getPermission().getUserAction().implies(FsAction.EXECUTE)
+ &&
!ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fileToCopy).toString())
+ && !targetFs.exists(fileToCopy)) {
+ List<OwnerAndPermission> ancestorsOwnerAndPermissionUpdated =
new ArrayList<>();
+ OwnerAndPermission srcFileOwnerPermissionReplicatedForDest = new
OwnerAndPermission(copyableFile.getDestinationOwnerAndPermission());
+
ancestorsOwnerAndPermissionUpdated.add(srcFileOwnerPermissionReplicatedForDest);
+
copyableFile.getAncestorsOwnerAndPermission().forEach(ancestorOwnPerm ->
ancestorsOwnerAndPermissionUpdated.add(new
OwnerAndPermission(ancestorOwnPerm)));
+
ancestorOwnerAndPermissionsForSetPermissionStep.put(fileToCopy.toString(),
ancestorsOwnerAndPermissionUpdated);
+ }
+
// Always grab the parent since the above permission setting
should be setting the permission for a folder itself
// {@link CopyDataPublisher#preserveFileAttrInPublisher} will be
setting the permission for the empty child dir
Path fromPath = fileToCopy.getParent();
// Avoid duplicate calculation for the same ancestor
if (fromPath != null &&
!ancestorOwnerAndPermissions.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fromPath).toString())
&& !targetFs.exists(fromPath)) {
ancestorOwnerAndPermissions.put(fromPath.toString(),
copyableFile.getAncestorsOwnerAndPermission());
+ if
(!ancestorOwnerAndPermissionsForSetPermissionStep.containsKey(PathUtils.getPathWithoutSchemeAndAuthority(fromPath).toString()))
{
+
ancestorOwnerAndPermissionsForSetPermissionStep.put(fromPath.toString(),
copyableFile.getAncestorsOwnerAndPermission());
+ }
}
if (existOnTarget && srcFile.isFile()) {
@@ -167,7 +187,7 @@ public class ManifestBasedDataset implements
IterableCopyableDataset {
copyEntities.add(new PrePublishStep(datasetURN(), Maps.newHashMap(),
createDirectoryWithPermissionsCommitStep, 1));
if (this.enableSetPermissionPostPublish) {
- for (Map.Entry<String, List<OwnerAndPermission>>
recursiveParentPermissions : ancestorOwnerAndPermissions.entrySet()) {
+ for (Map.Entry<String, List<OwnerAndPermission>>
recursiveParentPermissions :
ancestorOwnerAndPermissionsForSetPermissionStep.entrySet()) {
Path currentPath = new Path(recursiveParentPermissions.getKey());
for (OwnerAndPermission ownerAndPermission :
recursiveParentPermissions.getValue()) {
// Ignore folders that already exist in destination, we assume
that the publisher will re-sync those permissions if needed and
diff --git
a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
index d81de494d0..6acc668073 100644
---
a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
+++
b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/dataset/ManifestBasedDatasetFinderTest.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsPermission;
import org.mockito.Mockito;
import org.testng.Assert;
+import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import com.google.common.io.Files;
@@ -343,6 +344,77 @@ public class ManifestBasedDatasetFinderTest {
Assert.assertEquals(fileSet.getFiles().size(), 2); // 1 files to copy + 1
pre publish step
}
+ @DataProvider
+ public Object[][] dirPermissionWithExpectedSetPermissionStepCount() {
+ return new Object[][] {
+ {"d---------", 2},
+ {"drw-rw--w-", 2},
+ {"d-w-r----x", 2},
+ {"drwxrwxrwx", 1},
+ {"dr-xr-xr-x", 1},
+ {"d--x------", 1},
+ };
+ }
+
+ @Test(dataProvider = "dirPermissionWithExpectedSetPermissionStepCount")
+ public void
testSetPermissionWhenCopyingDirectoryWithOwnerExecutePermissionSetUnset(String
dirPermission, int expectedCount) throws IOException, URISyntaxException {
+ Path manifestPath = new
Path(getClass().getClassLoader().getResource("manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json").getPath());
+ Properties props = new Properties();
+ props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/");
+ props.setProperty("gobblin.copy.preserved.attributes", "rbugpvta");
+ try (FileSystem sourceFs = Mockito.mock(FileSystem.class);
+ FileSystem manifestReadFs = Mockito.mock(FileSystem.class);
+ FileSystem destFs = Mockito.mock(FileSystem.class)) {
+
+ setSourceAndDestFsMocks(sourceFs, destFs, manifestPath, manifestReadFs,
false);
+
+ Mockito.when(destFs.exists(new Path("/tmp/dataset"))).thenReturn(false);
+ Mockito.when(destFs.exists(new Path("/tmp"))).thenReturn(false);
+
+
Mockito.when(destFs.getFileStatus(any(Path.class))).thenReturn(localFs.getFileStatus(new
Path(tmpDir.toString())));
+
+ setFsMockPathWithPermissions(sourceFs, "/tmp/dataset", dirPermission,
"owner1", "group1", true);
+ setFsMockPathWithPermissions(sourceFs, "/tmp", "dr--r--r--", "owner2",
"group2", true);
+
+ Iterator<FileSet<CopyEntity>> fileSets = new
ManifestBasedDataset(sourceFs, manifestReadFs, manifestPath,
props).getFileSetIterator(destFs,
+ CopyConfiguration.builder(destFs, props).build());
+
+ Assert.assertTrue(fileSets.hasNext());
+ FileSet<CopyEntity> fileSet = fileSets.next();
+ // 1 dir to copy + 1 pre publish step + 1 post publish step
+ Assert.assertEquals(fileSet.getFiles().size(),3);
+
+ CommitStep createDirectoryStep = ((PrePublishStep)
fileSet.getFiles().get(1)).getStep();
+ Assert.assertTrue(createDirectoryStep instanceof
CreateDirectoryWithPermissionsCommitStep);
+ Map<String, List<OwnerAndPermission>> pathAndPermissions =
((CreateDirectoryWithPermissionsCommitStep)
createDirectoryStep).getPathAndPermissions();
+ Assert.assertEquals(pathAndPermissions.size(), 1);
+
+ Assert.assertTrue(pathAndPermissions.containsKey("/tmp"));
+ Assert.assertEquals(pathAndPermissions.get("/tmp").size(), 1);
+
Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getFsPermission(),
FsPermission.valueOf("dr--r--r--"));
+ Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getOwner(),
"owner2");
+ Assert.assertEquals(pathAndPermissions.get("/tmp").get(0).getGroup(),
"group2");
+
+ CommitStep setPermissionStep = ((PostPublishStep)
fileSet.getFiles().get(2)).getStep();
+ Assert.assertTrue(setPermissionStep instanceof SetPermissionCommitStep);
+ Map<String, OwnerAndPermission> ownerAndPermissionMap =
((SetPermissionCommitStep) setPermissionStep).getPathAndPermissions();
+ Assert.assertEquals(ownerAndPermissionMap.size(), expectedCount);
+
+ List<String> sortedMapKeys = new
ArrayList<>(ownerAndPermissionMap.keySet());
+ Assert.assertEquals(sortedMapKeys.get(0), "/tmp");
+ Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getFsPermission(),
FsPermission.valueOf("dr--r--r--"));
+ Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getOwner(),
"owner2");
+ Assert.assertEquals(ownerAndPermissionMap.get("/tmp").getGroup(),
"group2");
+
+ if (expectedCount > 1) {
+ Assert.assertEquals(sortedMapKeys.get(1), "/tmp/dataset");
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getFsPermission(),
FsPermission.valueOf(dirPermission));
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getOwner(),
"owner1");
+
Assert.assertEquals(ownerAndPermissionMap.get("/tmp/dataset").getGroup(),
"group1");
+ }
+ }
+ }
+
private void setSourceAndDestFsMocks(FileSystem sourceFs, FileSystem destFs,
Path manifestPath, FileSystem manifestReadFs, boolean setFileStatusMock) throws
IOException, URISyntaxException {
URI SRC_FS_URI = new URI("source", "the.source.org", "/", null);
URI MANIFEST_READ_FS_URI = new URI("manifest-read",
"the.manifest-source.org", "/", null);
diff --git
a/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json
b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json
new file mode 100644
index 0000000000..5dd94ff064
--- /dev/null
+++
b/gobblin-data-management/src/test/resources/manifestBasedDistcpTest/sampleManifestWithOnlyDirectory.json
@@ -0,0 +1,8 @@
+[
+ {
+ "id":"1",
+ "fileName":"/tmp/dataset",
+ "fileGroup":"/tmp/dataset",
+ "fileSizeInBytes":"1024"
+ }
+]
\ No newline at end of file
diff --git
a/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java
b/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java
index 7ae4cdc3fa..b14ab4fbe2 100644
---
a/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java
+++
b/gobblin-utility/src/main/java/org/apache/gobblin/util/filesystem/OwnerAndPermission.java
@@ -54,6 +54,22 @@ public class OwnerAndPermission implements Writable {
this(owner, group, fsPermission, Lists.newArrayList());
}
+
+ /**
+ * Copy constructor for {@link OwnerAndPermission}.
+ * <p>
+ * Creates a new instance by copying the values from another {@link
OwnerAndPermission} object.
+ * Performs a deep copy of {@link FsPermission} and a shallow copy of the
{@code aclEntries} list,
+ * since {@link org.apache.hadoop.fs.permission.AclEntry} is immutable.
+ * </p>
+ *
+ * @param other the {@code OwnerAndPermission} instance to copy from.
+ */
+ public OwnerAndPermission(OwnerAndPermission other) {
+ this(other.owner, other.group, new FsPermission(other.fsPermission),
+ other.aclEntries == null ? Lists.newArrayList() :
Lists.newArrayList(other.aclEntries));
+ }
+
@Override
public void write(DataOutput dataOutput) throws IOException {
Text.writeString(dataOutput, this.owner);
diff --git
a/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java
b/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java
new file mode 100644
index 0000000000..d2276ee9d0
--- /dev/null
+++
b/gobblin-utility/src/test/java/org/apache/gobblin/util/filesystem/OwnerAndPermissionTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.util.filesystem;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.hadoop.fs.permission.AclEntry;
+import org.apache.hadoop.fs.permission.AclEntryScope;
+import org.apache.hadoop.fs.permission.AclEntryType;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+
+/** Tests for {@link OwnerAndPermission}*/
+public class OwnerAndPermissionTest {
+
+ @Test
+ public void testOwnerAndPermissionCopyConstructor() {
+ String owner = "testOwner";
+ String group = "testGroup";
+ FsPermission permission = new FsPermission((short) 0755);
+ AclEntry aclEntry = new AclEntry.Builder()
+ .setPermission(FsAction.READ_WRITE)
+ .setName("test-acl")
+ .setScope(AclEntryScope.DEFAULT)
+ .setType(AclEntryType.GROUP)
+ .build();
+ List<AclEntry> aclEntries = Collections.singletonList(aclEntry);
+
+ OwnerAndPermission ownerAndPermission = new OwnerAndPermission(owner,
group, permission, aclEntries);
+ OwnerAndPermission copyOwnerAndPermission = new
OwnerAndPermission(ownerAndPermission);
+
+ Assert.assertEquals(copyOwnerAndPermission.getOwner(), owner);
+ Assert.assertEquals(copyOwnerAndPermission.getGroup(), group);
+ Assert.assertEquals(copyOwnerAndPermission.getFsPermission(), permission);
+ Assert.assertEquals(copyOwnerAndPermission.getAclEntries(), aclEntries);
+
+ Assert.assertNotSame(ownerAndPermission, copyOwnerAndPermission);
+ Assert.assertNotSame(ownerAndPermission.getFsPermission(),
copyOwnerAndPermission.getFsPermission());
+ Assert.assertNotSame(ownerAndPermission.getAclEntries(),
copyOwnerAndPermission.getAclEntries());
+ }
+}