kbendick commented on code in PR #4307:
URL: https://github.com/apache/iceberg/pull/4307#discussion_r856473751
##########
spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java:
##########
@@ -551,6 +554,139 @@ public void testManyLeafPartitions() throws InterruptedException {
    Assert.assertEquals("Rows must match", records, actualRecords);
  }
+  @Test
+  public void testHiddenPartitionPaths() throws InterruptedException {
+    Schema schema = new Schema(
+        optional(1, "c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .identity("c3")
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA");
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA");
+
+    waitUntilAfter(System.currentTimeMillis());
+
+    SparkActions actions = SparkActions.get();
+
+    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
+        .olderThan(System.currentTimeMillis())
+        .execute();
+
+    Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations()));
+  }
+
+  @Test
+  public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedException {
+    Schema schema = new Schema(
+        optional(1, "_c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("_c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("_c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA");
+
+    table.updateSpec()
+        .addField("_c1")
+        .commit();
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/_c1=1");
+
+    waitUntilAfter(System.currentTimeMillis());
+
+    SparkActions actions = SparkActions.get();
+
+    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
+        .olderThan(System.currentTimeMillis())
+        .execute();
+
+    Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations()));
+  }
+
+  @Test
+  public void testHiddenPathsStartingWithPartitionNamesAreIgnored()
+      throws InterruptedException, IOException {
+    Schema schema = new Schema(
+        optional(1, "c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .identity("c3")
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
Review Comment:
Nit: these continuation lines are over-indented.
I'm not sure how the style check passed, though. (EDIT: It didn't.)
Statements that continue onto the next line should be indented by 4 spaces, measured from the beginning of the starting line.
So here it would be:
```java
df.select("c1", "_c2", "c3")
    .write()
    .format("iceberg")
    .mode("append")
    .save(tableLocation);
```
If you run `./gradlew -DsparkVersions=3.2 clean check`, the process should complain about that.
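
For context on what the new tests exercise: Hive-style file layouts commonly treat path segments that begin with an underscore or a dot as hidden, so partition directories derived from underscore-prefixed columns, such as `_c2_trunc=AA` or `_c1=1`, match that convention even though they hold real table data. A naive orphan-file scan that skips "hidden" directories would therefore miss them. The sketch below only illustrates the naming convention; the class and method names are hypothetical, and it is not Iceberg's actual implementation.

```java
import java.util.Arrays;
import java.util.List;

// Illustrative sketch only: HiddenPathSketch and looksHidden() are
// hypothetical names, not part of Iceberg's code.
public class HiddenPathSketch {

  // A common Hive-style rule: a path segment is "hidden" when it starts
  // with '_' or '.' (e.g. _SUCCESS markers or .staging directories).
  static boolean looksHidden(String segment) {
    return segment.startsWith("_") || segment.startsWith(".");
  }

  public static void main(String[] args) {
    // Partition directories from the tests above look "hidden" by this
    // rule even though they contain real data files.
    List<String> segments = Arrays.asList("_c2_trunc=AA", "c3=AAAA", "_c1=1", "_SUCCESS");
    for (String segment : segments) {
      System.out.println(segment + " -> looksHidden=" + looksHidden(segment));
    }
  }
}
```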
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]