phet commented on code in PR #4058: URL: https://github.com/apache/gobblin/pull/4058#discussion_r1812012723
########## gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergTableTest.java: ########## @@ -226,6 +226,90 @@ public void testNewTablePropertiesAreRegistered() throws Exception { catalog.dropTable(destTableId); } + /** Verify that getPartitionSpecificDataFiles return datafiles belonging to the partition defined by predicate */ + @Test + public void testGetPartitionSpecificDataFiles() throws IOException { + List<String> paths = Arrays.asList( + "/path/tableName/data/id=1/file1.orc", + "/path/tableName/data/file3.orc", + "/path/tableName/data/id=2/file5.orc", + "/path/tableName/data/file4.orc", + "/path/tableName/data/id=3/file2.orc" + ); + // Using the schema defined in start of this class + PartitionData partitionData = new PartitionData(icebergPartitionSpec.partitionType()); + partitionData.set(0, "1"); + Map<String, PartitionData> pathsWithPartitionData = Maps.newHashMap(); + paths.forEach(path -> pathsWithPartitionData.put(path, partitionData)); + + addPartitionDataFiles(table, createDataFiles(pathsWithPartitionData)); + + IcebergTable icebergTable = new IcebergTable(tableId, + catalog.newTableOps(tableId), + catalogUri, + catalog.loadTable(tableId)); + // Using AlwaysTrue & AlwaysFalse Predicate to avoid mocking of predicate class + Predicate<StructLike> alwaysTruePredicate = partition -> true; + Predicate<StructLike> alwaysFalsePredicate = partition -> false; + Assert.assertEquals(icebergTable.getPartitionSpecificDataFiles(alwaysTruePredicate).size(), 5); + Assert.assertEquals(icebergTable.getPartitionSpecificDataFiles(alwaysFalsePredicate).size(), 0); + } + + /** Verify that overwritePartition replace data files belonging to given partition col and value */ + @Test + public void testOverwritePartition() throws IOException { + List<String> paths = Arrays.asList( + "/path/tableName/data/id=1/file1.orc", + "/path/tableName/data/file2.orc" + ); + // Using the schema defined in start of this class + PartitionData 
partitionData = new PartitionData(icebergPartitionSpec.partitionType()); + partitionData.set(0, "1"); + Map<String, PartitionData> pathsWithPartitionData = Maps.newHashMap(); + paths.forEach(path -> pathsWithPartitionData.put(path, partitionData)); + + addPartitionDataFiles(table, createDataFiles(pathsWithPartitionData)); + + IcebergTable icebergTable = new IcebergTable(tableId, + catalog.newTableOps(tableId), + catalogUri, + catalog.loadTable(tableId)); + + verifyAnyOrder(paths, icebergTable.getCurrentSnapshotInfo().getAllDataFilePaths(), "data filepaths should match"); + + List<String> paths2 = Arrays.asList( + "/path/tableName/data/file3.orc", + "/path/tableName/data/id=2/file4.orc" + ); + // Using the schema defined in start of this class + PartitionData partitionData2 = new PartitionData(icebergPartitionSpec.partitionType()); + partitionData2.set(0, "2"); + Map<String, PartitionData> paths2WithPartitionData2 = Maps.newHashMap(); + paths2.forEach(path -> paths2WithPartitionData2.put(path, partitionData2)); + + List<DataFile> partition2DataFiles = createDataFiles(paths2WithPartitionData2); + // here, since partition data with value 2 doesn't exist yet, + // we expect it to get added to the table, w/o changing or deleting any other partitions + icebergTable.overwritePartition(partition2DataFiles, "id", "2"); + List<String> expectedPaths2 = new ArrayList<>(paths); + expectedPaths2.addAll(paths2); + verifyAnyOrder(expectedPaths2, icebergTable.getCurrentSnapshotInfo().getAllDataFilePaths(), "data filepaths should match"); + + List<String> paths3 = Arrays.asList( + "/path/tableName/data/id=2/file5.orc", + "/path/tableName/data/file6.orc" + ); + // Reusing same partition data to create data file with different paths + Map<String, PartitionData> paths3WithPartitionData = Maps.newHashMap(); + paths3.forEach(path -> paths3WithPartitionData.put(path, partitionData)); + List<DataFile> partition1NewDataFiles = createDataFiles(paths3WithPartitionData); Review Comment: NBD, 
but for a one-liner: ``` List<DataFile> partition1NewDataFiles = createDataFiles( paths3.stream().collect(Collectors.toMap(x -> x, x -> partition1Data)) ); ``` (alternative to `x -> x` for the key mapper is `Function.identity()` - your choice) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@gobblin.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org