kbendick commented on code in PR #4307:
URL: https://github.com/apache/iceberg/pull/4307#discussion_r856473751
##########
spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/actions/TestRemoveOrphanFilesAction.java:
##########
@@ -551,6 +554,139 @@ public void testManyLeafPartitions() throws InterruptedException {
    Assert.assertEquals("Rows must match", records, actualRecords);
  }
+  @Test
+  public void testHiddenPartitionPaths() throws InterruptedException {
+    Schema schema = new Schema(
+        optional(1, "c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .identity("c3")
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA");
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/c3=AAAA");
+
+    waitUntilAfter(System.currentTimeMillis());
+
+    SparkActions actions = SparkActions.get();
+
+    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
+        .olderThan(System.currentTimeMillis())
+        .execute();
+
+    Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations()));
+  }
+
+  @Test
+  public void testHiddenPartitionPathsWithPartitionEvolution() throws InterruptedException {
+    Schema schema = new Schema(
+        optional(1, "_c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("_c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("_c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA");
+
+    table.updateSpec()
+        .addField("_c1")
+        .commit();
+
+    df.write().mode("append").parquet(tableLocation + "/data/_c2_trunc=AA/_c1=1");
+
+    waitUntilAfter(System.currentTimeMillis());
+
+    SparkActions actions = SparkActions.get();
+
+    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
+        .olderThan(System.currentTimeMillis())
+        .execute();
+
+    Assert.assertEquals("Should delete 2 files", 2, Iterables.size(result.orphanFileLocations()));
+  }
+
+  @Test
+  public void testHiddenPathsStartingWithPartitionNamesAreIgnored()
+      throws InterruptedException, IOException {
+    Schema schema = new Schema(
+        optional(1, "c1", Types.IntegerType.get()),
+        optional(2, "_c2", Types.StringType.get()),
+        optional(3, "c3", Types.StringType.get())
+    );
+    PartitionSpec spec = PartitionSpec.builderFor(schema)
+        .truncate("_c2", 2)
+        .identity("c3")
+        .build();
+    Table table = TABLES.create(schema, spec, Maps.newHashMap(), tableLocation);
+
+    StructType structType = new StructType()
+        .add("c1", DataTypes.IntegerType)
+        .add("_c2", DataTypes.StringType)
+        .add("c3", DataTypes.StringType);
+    List<Row> records = Lists.newArrayList(
+        RowFactory.create(1, "AAAAAAAAAA", "AAAA")
+    );
+    Dataset<Row> df = spark.createDataFrame(records, structType).coalesce(1);
+
+    df.select("c1", "_c2", "c3")
+            .write()
+            .format("iceberg")
+            .mode("append")
+            .save(tableLocation);
Review Comment:
Nit: these continuation lines are over-indented.
I'm not sure how the style check passed, though. (EDIT: It didn't.)
Statements that continue onto the next line should be indented by 4 spaces, measured from the beginning of the starting line.
So here it would be:
```java
df.select("c1", "_c2", "c3")
    .write()
    .format("iceberg")
    .mode("append")
    .save(tableLocation);
```
If you run `./gradlew -DsparkVersions=3.2 clean check`, the process should complain about that.
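
For context on what the new tests exercise: Hive-style file layouts commonly treat path segments that begin with an underscore or a dot as hidden, so partition directories derived from underscore-prefixed columns, such as `_c2_trunc=AA` or `_c1=1`, match that convention even though they hold real table data. A naive orphan-file scan that skips "hidden" directories would therefore miss them. The sketch below only illustrates the naming convention; the class and method names are hypothetical, and it is not Iceberg's actual implementation.

```java
import java.util.Arrays;
import java.util.List;

// Illustrative sketch only: HiddenPathSketch and looksHidden() are
// hypothetical names, not part of Iceberg's code.
public class HiddenPathSketch {

  // A common Hive-style rule: a path segment is "hidden" when it starts
  // with '_' or '.' (e.g. _SUCCESS markers or .staging directories).
  static boolean looksHidden(String segment) {
    return segment.startsWith("_") || segment.startsWith(".");
  }

  public static void main(String[] args) {
    // Partition directories from the tests above look "hidden" by this
    // rule even though they contain real data files.
    List<String> segments = Arrays.asList("_c2_trunc=AA", "c3=AAAA", "_c1=1", "_SUCCESS");
    for (String segment : segments) {
      System.out.println(segment + " -> looksHidden=" + looksHidden(segment));
    }
  }
}
```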
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]