[ https://issues.apache.org/jira/browse/GOBBLIN-1720?focusedWorklogId=814231&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-814231 ]
ASF GitHub Bot logged work on GOBBLIN-1720: ------------------------------------------- Author: ASF GitHub Bot Created on: 06/Oct/22 05:41 Start Date: 06/Oct/22 05:41 Worklog Time Spent: 10m Work Description: phet commented on code in PR #3577: URL: https://github.com/apache/gobblin/pull/3577#discussion_r988558753 ########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDatasetTest.java: ########## @@ -176,22 +176,72 @@ public void testGenerateCopyEntitiesMultiSnapshotWhenDestEmpty() throws IOExcept verifyCopyEntities(copyEntities, expectedPaths); } + @Test + public void testFsOwnershipAndPermissionPreservationWhenDestEmpty() throws IOException { + List<String> expectedPaths = Arrays.asList(METADATA_PATH, MANIFEST_LIST_PATH_0, + MANIFEST_PATH_0, MANIFEST_DATA_PATH_0A, MANIFEST_DATA_PATH_0B); + + MockFileSystemBuilder sourceBuilder = new MockFileSystemBuilder(SRC_FS_URI); + sourceBuilder.addPaths(expectedPaths); + FileSystem sourceFs = sourceBuilder.build(); + + IcebergTable icebergTable = new MockedIcebergTable(Arrays.asList(SNAPSHOT_PATHS_0)); + IcebergDataset icebergDataset = new TrickIcebergDataset(testDbName, testTblName, icebergTable, new Properties(), sourceFs); + + MockFileSystemBuilder destBuilder = new MockFileSystemBuilder(DEST_FS_URI); + FileSystem destFs = destBuilder.build(); + + CopyConfiguration copyConfiguration = CopyConfiguration.builder(destFs, copyConfigProperties) + .preserve(PreserveAttributes.fromMnemonicString("ugp")) + .copyContext(new CopyContext()) + .build(); + Collection<CopyEntity> copyEntities = icebergDataset.generateCopyEntities(destFs, copyConfiguration); + verifyFsOwnershipAndPermissionPreservation(copyEntities); + } + private void verifyCopyEntities(Collection<CopyEntity> copyEntities, List<String> expected) { List<String> actual = new ArrayList<>(); for (CopyEntity copyEntity : copyEntities) { String json = copyEntity.toString(); - String filepath = new Gson().fromJson(json, JsonObject.class) - .getAsJsonObject("object-data").getAsJsonObject("origin") - .getAsJsonObject("object-data").getAsJsonObject("path") - .getAsJsonObject("object-data").getAsJsonObject("uri") - .getAsJsonPrimitive("object-data").getAsString(); + String filepath = getFilePathAsStringFromJson(json); actual.add(filepath); } Assert.assertEquals(actual.size(), expected.size(), "Set" + actual.toString() + " vs Set" + expected.toString()); Assert.assertEqualsNoOrder(actual.toArray(), expected.toArray()); } + private void verifyFsOwnershipAndPermissionPreservation(Collection<CopyEntity> copyEntities) { Review Comment: I'm surprised you're only passing in the result object to verify, but not some expectation of what value it should hold... ...I also don't see any constants (at the instance or class level) for what perms and ownership we're expecting. ########## gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDataset.java: ########## @@ -137,13 +139,17 @@ Collection<CopyEntity> generateCopyEntities(FileSystem targetFs, CopyConfigurati // TODO: determine whether unnecessarily expensive to repeatedly re-create what should be the same FS: could it // instead be created once and reused thereafter? FileSystem actualSourceFs = getSourceFileSystemFromFileStatus(srcFileStatus, defaultHadoopConfiguration); + Path toPath = PathUtils.getRootPathParent(srcPath); - // TODO: Add preservation of ancestor ownership and permissions! - + // preserving ancestor permissions till root between src and dest Review Comment: since `toPath` is one level beneath root, is that where perms are preserved to? (that actually seems fine, so not suggesting to change anything other than inaccurate comment) ########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDatasetTest.java: ########## @@ -176,22 +176,72 @@ public void testGenerateCopyEntitiesMultiSnapshotWhenDestEmpty() throws IOExcept verifyCopyEntities(copyEntities, expectedPaths); } + @Test + public void testFsOwnershipAndPermissionPreservationWhenDestEmpty() throws IOException { + List<String> expectedPaths = Arrays.asList(METADATA_PATH, MANIFEST_LIST_PATH_0, + MANIFEST_PATH_0, MANIFEST_DATA_PATH_0A, MANIFEST_DATA_PATH_0B); + + MockFileSystemBuilder sourceBuilder = new MockFileSystemBuilder(SRC_FS_URI); + sourceBuilder.addPaths(expectedPaths); + FileSystem sourceFs = sourceBuilder.build(); + + IcebergTable icebergTable = new MockedIcebergTable(Arrays.asList(SNAPSHOT_PATHS_0)); + IcebergDataset icebergDataset = new TrickIcebergDataset(testDbName, testTblName, icebergTable, new Properties(), sourceFs); + + MockFileSystemBuilder destBuilder = new MockFileSystemBuilder(DEST_FS_URI); + FileSystem destFs = destBuilder.build(); + + CopyConfiguration copyConfiguration = CopyConfiguration.builder(destFs, copyConfigProperties) + .preserve(PreserveAttributes.fromMnemonicString("ugp")) + .copyContext(new CopyContext()) + .build(); + Collection<CopyEntity> copyEntities = icebergDataset.generateCopyEntities(destFs, copyConfiguration); + verifyFsOwnershipAndPermissionPreservation(copyEntities); + } + private void verifyCopyEntities(Collection<CopyEntity> copyEntities, List<String> expected) { List<String> actual = new ArrayList<>(); for (CopyEntity copyEntity : copyEntities) { String json = copyEntity.toString(); - String filepath = new Gson().fromJson(json, JsonObject.class) - .getAsJsonObject("object-data").getAsJsonObject("origin") - .getAsJsonObject("object-data").getAsJsonObject("path") - .getAsJsonObject("object-data").getAsJsonObject("uri") - .getAsJsonPrimitive("object-data").getAsString(); + String filepath = getFilePathAsStringFromJson(json); actual.add(filepath); } Assert.assertEquals(actual.size(), expected.size(), "Set" + actual.toString() + " vs Set" + expected.toString()); Assert.assertEqualsNoOrder(actual.toArray(), expected.toArray()); } + private void verifyFsOwnershipAndPermissionPreservation(Collection<CopyEntity> copyEntities) { + List<JsonObject> ancestorsOwnerAndPermissionsList = new ArrayList<>(); + List<JsonObject> destinationOwnerAndPermissionsList = new ArrayList<>(); + + for(CopyEntity copyEntity : copyEntities) { + String copyEntityJson = copyEntity.toString(); + + JsonArray ancestorsOwnerAndPermissions = new Gson().fromJson(copyEntityJson, JsonObject.class) + .getAsJsonObject("object-data").getAsJsonArray("ancestorsOwnerAndPermission"); + JsonObject rootAncestorOwnerAndPermissions = ancestorsOwnerAndPermissions.get(ancestorsOwnerAndPermissions.size()-1) + .getAsJsonObject().getAsJsonObject(); + ancestorsOwnerAndPermissionsList.add(rootAncestorOwnerAndPermissions); + String filePath = getFilePathAsStringFromJson(copyEntityJson); + Assert.assertEquals(ancestorsOwnerAndPermissions.size(), new Path(filePath).getParent().depth() -1); + + JsonObject destinationOwnerAndPermissions = new Gson().fromJson(copyEntityJson, JsonObject.class) + .getAsJsonObject("object-data").getAsJsonObject("destinationOwnerAndPermission"); + destinationOwnerAndPermissionsList.add(destinationOwnerAndPermissions); + } + Assert.assertEqualsNoOrder(ancestorsOwnerAndPermissionsList.toArray(), destinationOwnerAndPermissionsList.toArray()); Review Comment: both of these were initialized from the `CopyEntity` JSON... so, perhaps--maybe?--that demonstrates some level of internal coherence in the result... nonetheless it doesn't verify that the particular values of either are anything in particular (like what an observer would expect them to be). ########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDatasetTest.java: ########## @@ -176,22 +176,72 @@ public void testGenerateCopyEntitiesMultiSnapshotWhenDestEmpty() throws IOExcept verifyCopyEntities(copyEntities, expectedPaths); } + @Test + public void testFsOwnershipAndPermissionPreservationWhenDestEmpty() throws IOException { + List<String> expectedPaths = Arrays.asList(METADATA_PATH, MANIFEST_LIST_PATH_0, + MANIFEST_PATH_0, MANIFEST_DATA_PATH_0A, MANIFEST_DATA_PATH_0B); + + MockFileSystemBuilder sourceBuilder = new MockFileSystemBuilder(SRC_FS_URI); + sourceBuilder.addPaths(expectedPaths); + FileSystem sourceFs = sourceBuilder.build(); + + IcebergTable icebergTable = new MockedIcebergTable(Arrays.asList(SNAPSHOT_PATHS_0)); + IcebergDataset icebergDataset = new TrickIcebergDataset(testDbName, testTblName, icebergTable, new Properties(), sourceFs); + + MockFileSystemBuilder destBuilder = new MockFileSystemBuilder(DEST_FS_URI); + FileSystem destFs = destBuilder.build(); + + CopyConfiguration copyConfiguration = CopyConfiguration.builder(destFs, copyConfigProperties) + .preserve(PreserveAttributes.fromMnemonicString("ugp")) + .copyContext(new CopyContext()) + .build(); + Collection<CopyEntity> copyEntities = icebergDataset.generateCopyEntities(destFs, copyConfiguration); + verifyFsOwnershipAndPermissionPreservation(copyEntities); + } + private void verifyCopyEntities(Collection<CopyEntity> copyEntities, List<String> expected) { List<String> actual = new ArrayList<>(); for (CopyEntity copyEntity : copyEntities) { String json = copyEntity.toString(); - String filepath = new Gson().fromJson(json, JsonObject.class) - .getAsJsonObject("object-data").getAsJsonObject("origin") - .getAsJsonObject("object-data").getAsJsonObject("path") - .getAsJsonObject("object-data").getAsJsonObject("uri") - .getAsJsonPrimitive("object-data").getAsString(); + String filepath = getFilePathAsStringFromJson(json); actual.add(filepath); } Assert.assertEquals(actual.size(), expected.size(), "Set" + actual.toString() + " vs Set" + expected.toString()); Assert.assertEqualsNoOrder(actual.toArray(), expected.toArray()); } + private void verifyFsOwnershipAndPermissionPreservation(Collection<CopyEntity> copyEntities) { + List<JsonObject> ancestorsOwnerAndPermissionsList = new ArrayList<>(); + List<JsonObject> destinationOwnerAndPermissionsList = new ArrayList<>(); + + for(CopyEntity copyEntity : copyEntities) { + String copyEntityJson = copyEntity.toString(); + + JsonArray ancestorsOwnerAndPermissions = new Gson().fromJson(copyEntityJson, JsonObject.class) + .getAsJsonObject("object-data").getAsJsonArray("ancestorsOwnerAndPermission"); + JsonObject rootAncestorOwnerAndPermissions = ancestorsOwnerAndPermissions.get(ancestorsOwnerAndPermissions.size()-1) + .getAsJsonObject().getAsJsonObject(); Review Comment: why add only the last element and skip the others in the list? ########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDatasetTest.java: ########## @@ -176,22 +176,72 @@ public void testGenerateCopyEntitiesMultiSnapshotWhenDestEmpty() throws IOExcept verifyCopyEntities(copyEntities, expectedPaths); } + @Test + public void testFsOwnershipAndPermissionPreservationWhenDestEmpty() throws IOException { + List<String> expectedPaths = Arrays.asList(METADATA_PATH, MANIFEST_LIST_PATH_0, + MANIFEST_PATH_0, MANIFEST_DATA_PATH_0A, MANIFEST_DATA_PATH_0B); + + MockFileSystemBuilder sourceBuilder = new MockFileSystemBuilder(SRC_FS_URI); + sourceBuilder.addPaths(expectedPaths); + FileSystem sourceFs = sourceBuilder.build(); Review Comment: for robust testing, shouldn't we set particular permissions and ownership for the parent/ancestors of at least some paths? I'd hope at least two levels get differing values, to truly demonstrate that the recursive walk toward root accurately preserves what it finds (in the correct order). BTW, could always check whether there may be a test on the hive distcp side to take ideas from... ########## gobblin-utility/src/main/java/org/apache/gobblin/util/PathUtils.java: ########## @@ -80,6 +80,18 @@ public static Path getRootPath(Path path) { return getRootPath(path.getParent()); } + /** + * Returns the root path parent for the specified path. + * + * @see Path + */ + public static Path getRootPathParent(Path path) { Review Comment: naming-wise, this looks like it gets the root path's child, not its parent. also doesn't it need to guard against NPE, given the `Path.isRoot()` definition of: ``` return getParent() == null; ``` ? ########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergDatasetTest.java: ########## @@ -176,22 +176,72 @@ public void testGenerateCopyEntitiesMultiSnapshotWhenDestEmpty() throws IOExcept verifyCopyEntities(copyEntities, expectedPaths); } + @Test + public void testFsOwnershipAndPermissionPreservationWhenDestEmpty() throws IOException { + List<String> expectedPaths = Arrays.asList(METADATA_PATH, MANIFEST_LIST_PATH_0, + MANIFEST_PATH_0, MANIFEST_DATA_PATH_0A, MANIFEST_DATA_PATH_0B); + + MockFileSystemBuilder sourceBuilder = new MockFileSystemBuilder(SRC_FS_URI); + sourceBuilder.addPaths(expectedPaths); + FileSystem sourceFs = sourceBuilder.build(); + + IcebergTable icebergTable = new MockedIcebergTable(Arrays.asList(SNAPSHOT_PATHS_0)); + IcebergDataset icebergDataset = new TrickIcebergDataset(testDbName, testTblName, icebergTable, new Properties(), sourceFs); + + MockFileSystemBuilder destBuilder = new MockFileSystemBuilder(DEST_FS_URI); + FileSystem destFs = destBuilder.build(); + + CopyConfiguration copyConfiguration = CopyConfiguration.builder(destFs, copyConfigProperties) + .preserve(PreserveAttributes.fromMnemonicString("ugp")) + .copyContext(new CopyContext()) + .build(); + Collection<CopyEntity> copyEntities = icebergDataset.generateCopyEntities(destFs, copyConfiguration); + verifyFsOwnershipAndPermissionPreservation(copyEntities); + } + private void verifyCopyEntities(Collection<CopyEntity> copyEntities, List<String> expected) { List<String> actual = new ArrayList<>(); for (CopyEntity copyEntity : copyEntities) { String json = copyEntity.toString(); - String filepath = new Gson().fromJson(json, JsonObject.class) - .getAsJsonObject("object-data").getAsJsonObject("origin") - .getAsJsonObject("object-data").getAsJsonObject("path") - .getAsJsonObject("object-data").getAsJsonObject("uri") - .getAsJsonPrimitive("object-data").getAsString(); + String filepath = getFilePathAsStringFromJson(json); actual.add(filepath); } Assert.assertEquals(actual.size(), expected.size(), "Set" + actual.toString() + " vs Set" + expected.toString()); Assert.assertEqualsNoOrder(actual.toArray(), expected.toArray()); } + private void verifyFsOwnershipAndPermissionPreservation(Collection<CopyEntity> copyEntities) { + List<JsonObject> ancestorsOwnerAndPermissionsList = new ArrayList<>(); + List<JsonObject> destinationOwnerAndPermissionsList = new ArrayList<>(); + + for(CopyEntity copyEntity : copyEntities) { + String copyEntityJson = copyEntity.toString(); + + JsonArray ancestorsOwnerAndPermissions = new Gson().fromJson(copyEntityJson, JsonObject.class) + .getAsJsonObject("object-data").getAsJsonArray("ancestorsOwnerAndPermission"); + JsonObject rootAncestorOwnerAndPermissions = ancestorsOwnerAndPermissions.get(ancestorsOwnerAndPermissions.size()-1) + .getAsJsonObject().getAsJsonObject(); + ancestorsOwnerAndPermissionsList.add(rootAncestorOwnerAndPermissions); + String filePath = getFilePathAsStringFromJson(copyEntityJson); + Assert.assertEquals(ancestorsOwnerAndPermissions.size(), new Path(filePath).getParent().depth() -1); + + JsonObject destinationOwnerAndPermissions = new Gson().fromJson(copyEntityJson, JsonObject.class) + .getAsJsonObject("object-data").getAsJsonObject("destinationOwnerAndPermission"); + destinationOwnerAndPermissionsList.add(destinationOwnerAndPermissions); + } + Assert.assertEqualsNoOrder(ancestorsOwnerAndPermissionsList.toArray(), destinationOwnerAndPermissionsList.toArray()); + } + + private String getFilePathAsStringFromJson(String json) { + String filepath = new Gson().fromJson(json, JsonObject.class) Review Comment: first off, I *love* that you've raised the level of abstraction! that said, is this the right approach of a single helper method? I see that in the case of verifying permission preservation, you need to initialize a second and even third: ``` Gson().fromJson(copyEntityJson, JsonObject.class) .getAsJsonObject("object-data") ``` (which unfortunately go without any encapsulating abstraction). why not a variation on the Proxy pattern, where you initialize once, hold onto it, and provide getters/extractors? thus a class, perhaps named, `CopyEntityJsonExtractor`, with methods `extractFilePath`/`extractFileUri` and `extractAncestorsO&P` and `extractDestinationO&P`. or perhaps clearer naming: `CopyEntityDeserialization` with methods `get*` Issue Time Tracking ------------------- Worklog Id: (was: 814231) Remaining Estimate: 0h Time Spent: 10m > Preserve Ancestor Owner and Permissions for Fs between Src and Dest for > Iceberg Distcp > -------------------------------------------------------------------------------------- > > Key: GOBBLIN-1720 > URL: https://issues.apache.org/jira/browse/GOBBLIN-1720 > Project: Apache Gobblin > Issue Type: Improvement > Reporter: Meeth Gala > Priority: Major > Time Spent: 10m > Remaining Estimate: 0h > > We want to preserve the Fs ownership and permissions between src and dest > while performing an Iceberg based distcp. Currently, we are preserving all > the permissions up to root dir for Iceberg tables. -- This message was sent by Atlassian Jira (v8.20.10#820010)