aokolnychyi commented on a change in pull request #4243:
URL: https://github.com/apache/iceberg/pull/4243#discussion_r825024014



##########
File path: core/src/main/java/org/apache/iceberg/DataFilesTable.java
##########
@@ -47,115 +36,32 @@
 
   @Override
   public TableScan newScan() {
-    return new FilesTableScan(operations(), table(), schema());
-  }
-
-  @Override
-  public Schema schema() {
-    StructType partitionType = Partitioning.partitionType(table());
-    Schema schema = new Schema(DataFile.getType(partitionType).fields());
-    if (partitionType.fields().size() < 1) {
-      // avoid returning an empty struct, which is not always supported. instead, drop the partition field
-      return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
-    } else {
-      return schema;
-    }
+    return new DataFilesTableScan(operations(), table(), schema());
   }
 
   @Override
   MetadataTableType metadataTableType() {
     return MetadataTableType.FILES;
   }
 
-  public static class FilesTableScan extends BaseMetadataTableScan {
-    private final Schema fileSchema;
+  public static class DataFilesTableScan extends BaseFilesTableScan {
 
-    FilesTableScan(TableOperations ops, Table table, Schema fileSchema) {
-      super(ops, table, fileSchema);
-      this.fileSchema = fileSchema;
+    DataFilesTableScan(TableOperations ops, Table table, Schema fileSchema) {
+      super(ops, table, fileSchema, MetadataTableType.FILES);
     }
 
-    private FilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
-                           TableScanContext context) {
-      super(ops, table, schema, context);
-      this.fileSchema = fileSchema;
-    }
-
-    @Override
-    public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
-      throw new UnsupportedOperationException(
-          String.format("Cannot incrementally scan table of type %s", MetadataTableType.FILES.name()));
-    }
-
-    @Override
-    public TableScan appendsAfter(long fromSnapshotId) {
-      throw new UnsupportedOperationException(
-          String.format("Cannot incrementally scan table of type %s", MetadataTableType.FILES.name()));
+    DataFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema, TableScanContext context) {
+      super(ops, table, schema, fileSchema, context, MetadataTableType.FILES);
     }
 
     @Override
    protected TableScan newRefinedScan(TableOperations ops, Table table, Schema schema, TableScanContext context) {
-      return new FilesTableScan(ops, table, schema, fileSchema, context);
+      return new DataFilesTableScan(ops, table, schema, this.fileSchema(), context);

Review comment:
       nit: redundant `this`
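       e.g. just dropping the qualifier (untested sketch):
       ```java
       return new DataFilesTableScan(ops, table, schema, fileSchema(), context);
       ```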

##########
File path: core/src/test/java/org/apache/iceberg/hadoop/TestTableSerialization.java
##########
@@ -138,8 +143,12 @@ public void testSerializableMetadataTablesPlanning() throws IOException {
     table.newAppend()
         .appendFile(FILE_B)
         .commit();
+    table.newRowDelta()
+        .addDeletes(FILE_B_DELETES)
+        .commit();
 
     for (MetadataTableType type : MetadataTableType.values()) {
+

Review comment:
       nit: is this extra blank line required?

##########
File path: spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
##########
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.extensions;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.iceberg.FileContent;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.data.TestHelpers;
+import org.apache.iceberg.spark.source.SimpleRecord;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestMetadataTables extends SparkExtensionsTestBase {
+
+  public TestMetadataTables(String catalogName, String implementation, Map<String, String> config) {
+    super(catalogName, implementation, config);
+  }
+
+  @After
+  public void removeTables() {
+    sql("DROP TABLE IF EXISTS %s", tableName);
+  }
+
+  @Test
+  public void testDeleteFilesTable() throws Exception {
+    sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" 
+
+        "('format-version'='2', 'write.delete.mode'='merge-on-read')", 
tableName);
+
+    List<SimpleRecord> records = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "b"),
+        new SimpleRecord(3, "c"),
+        new SimpleRecord(4, "d")
+    );
+    spark.createDataset(records, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    sql("DELETE FROM %s WHERE id=1", tableName);
+
+    Table table = Spark3Util.loadIcebergTable(spark, tableName);
+    List<ManifestFile> expectedManifests = TestHelpers.deleteManifests(table);
+    Assert.assertEquals("Should have 1 delete file", 1, 
expectedManifests.size());

Review comment:
       nit: should this say "delete manifest"? The assertion counts manifests, not delete files.
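       e.g. a message-only fix:
       ```java
       Assert.assertEquals("Should have 1 delete manifest", 1, expectedManifests.size());
       ```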

##########
File path: core/src/main/java/org/apache/iceberg/BaseFilesTable.java
##########
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.expressions.ManifestEvaluator;
+import org.apache.iceberg.expressions.Projections;
+import org.apache.iceberg.expressions.ResidualEvaluator;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
+
+/**
+ * Base class with shared logic for files metadata tables
+ */
+abstract class BaseFilesTable extends BaseMetadataTable {
+
+  BaseFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public Schema schema() {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
+      // avoid returning an empty struct, which is not always supported. instead, drop the partition field
+      return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
+    } else {
+      return schema;
+    }
+  }
+
+  abstract static class BaseFilesTableScan extends BaseMetadataTableScan {
+    private final Schema fileSchema;
+    private final MetadataTableType type;
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema fileSchema, MetadataTableType type) {
+      super(ops, table, fileSchema);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
+                                 TableScanContext context, MetadataTableType type) {
+      super(ops, table, schema, context);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected Schema fileSchema() {
+      return fileSchema;
+    }
+
+    @Override
+    public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    public TableScan appendsAfter(long fromSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    protected CloseableIterable<FileScanTask> planFiles(
+        TableOperations ops, Snapshot snapshot, Expression rowFilter,

Review comment:
       nit: I know the formatting was off in the original implementation but 
maybe it is a good time to fix it since we are modifying this method anyway?
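       For example (just a sketch; the trailing parameters are cut off in this diff, so I am assuming they still match the pre-existing `planFiles` signature):
       ```java
       protected CloseableIterable<FileScanTask> planFiles(TableOperations ops, Snapshot snapshot, Expression rowFilter,
                                                           boolean ignoreResiduals, boolean caseSensitive, boolean colStats) {
       ```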

##########
File path: core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java
##########
@@ -462,6 +479,180 @@ public void testDataFilesTableSelection() throws IOException {
     Assert.assertEquals(expected, scan.schema().asStruct());
   }
 
+  @Test
+  public void testDeleteFilesTableScanNoFilter() {
+    Assume.assumeTrue("Only V2 Tables Support Deletes", formatVersion >= 2);
+
+    preparePartitionedTable();
+
+    Table deleteFilesTable = new DeleteFilesTable(table.ops(), table);
+    Types.StructType expected = new Schema(
+        required(102, "partition", Types.StructType.of(
+            optional(1000, "data_bucket", Types.IntegerType.get())),
+            "Partition data tuple, schema based on the partition 
spec")).asStruct();
+
+    TableScan scanNoFilter = deleteFilesTable.newScan().select("partition.data_bucket");
+    Assert.assertEquals(expected, scanNoFilter.schema().asStruct());
+    CloseableIterable<FileScanTask> tasksAndEq = scanNoFilter.planFiles();

Review comment:
       nit: looks like a copy and paste typo as I am not sure what `tasksAndEq` 
means. I did not check all tests.
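       e.g. a neutral name (renaming the later uses to match):
       ```java
       CloseableIterable<FileScanTask> tasks = scanNoFilter.planFiles();
       ```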

##########
File path: spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
##########
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.extensions;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.iceberg.FileContent;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.data.TestHelpers;
+import org.apache.iceberg.spark.source.SimpleRecord;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestMetadataTables extends SparkExtensionsTestBase {
+
+  public TestMetadataTables(String catalogName, String implementation, Map<String, String> config) {
+    super(catalogName, implementation, config);
+  }
+
+  @After
+  public void removeTables() {
+    sql("DROP TABLE IF EXISTS %s", tableName);
+  }
+
+  @Test
+  public void testDeleteFilesTable() throws Exception {
+    sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" 
+
+        "('format-version'='2', 'write.delete.mode'='merge-on-read')", 
tableName);
+
+    List<SimpleRecord> records = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "b"),
+        new SimpleRecord(3, "c"),
+        new SimpleRecord(4, "d")
+    );
+    spark.createDataset(records, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    sql("DELETE FROM %s WHERE id=1", tableName);
+
+    Table table = Spark3Util.loadIcebergTable(spark, tableName);
+    List<ManifestFile> expectedManifests = TestHelpers.deleteManifests(table);
+    Assert.assertEquals("Should have 1 delete file", 1, 
expectedManifests.size());
+
+    Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema();
+    Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema();
+
+    List<Row> actual = spark.sql("SELECT * FROM " + tableName + 
".delete_files").collectAsList();
+
+    List<Record> expected = expectedEntries(table, entriesTableSchema, 
expectedManifests, null);
+
+    Assert.assertEquals("Should be one delete file manifest entry", 1, 
expected.size());
+    Assert.assertEquals("Metadata table should return one delete file", 1, 
actual.size());
+
+    TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expected.get(0), actual.get(0));
+  }
+
+  @Test
+  public void testDeleteFilesTablePartitioned() throws Exception {
+    sql("CREATE TABLE %s (id bigint, data string) " +
+        "USING iceberg " +
+        "PARTITIONED BY (data) " +
+        "TBLPROPERTIES" +
+        "('format-version'='2', 'write.delete.mode'='merge-on-read')", 
tableName);
+
+    List<SimpleRecord> recordsA = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "a")
+    );
+    spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    List<SimpleRecord> recordsB = Lists.newArrayList(
+        new SimpleRecord(1, "b"),
+        new SimpleRecord(2, "b")
+        );
+    spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    sql("DELETE FROM %s WHERE id=1 AND data='a'", tableName);
+    sql("DELETE FROM %s WHERE id=1 AND data='b'", tableName);
+
+    Table table = Spark3Util.loadIcebergTable(spark, tableName);
+    List<ManifestFile> expectedManifests = TestHelpers.deleteManifests(table);
+    Assert.assertEquals("Should have 2 delete files", 2, 
expectedManifests.size());
+
+    List<Row> actual = spark.sql("SELECT * FROM " + tableName + ".delete_files 
" +
+        "WHERE partition.data='a'").collectAsList();
+
+    Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema();
+    Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema();
+
+    List<Record> expected = expectedEntries(table, entriesTableSchema, 
expectedManifests, "a");
+
+    Assert.assertEquals("Should be one delete file manifest entry", 1, 
expected.size());
+    Assert.assertEquals("Metadata table should return one delete file", 1, 
actual.size());
+
+    TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expected.get(0), actual.get(0));
+  }
+
+  /**
+   * Find matching manifest entries of an Iceberg table
+   * @param table iceberg table
+   * @param entriesTableSchema schema of Manifest entries
+   * @param manifestsToExplore manifests to explore of the table
+   * @param partValue partition value that manifest entries must match, or null to skip filtering
+   */
+  private List<Record> expectedEntries(Table table, Schema entriesTableSchema,
+                                       List<ManifestFile> manifestsToExplore, 
String partValue) throws IOException {
+    List<Record> expected = Lists.newArrayList();
+    for (ManifestFile manifest : manifestsToExplore) {
+      InputFile in = table.io().newInputFile(manifest.path());
+      try (CloseableIterable<Record> rows = Avro.read(in).project(
+          entriesTableSchema).build()) {

Review comment:
       nit: can fit on one line?
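       i.e. (should still be within the line-length limit):
       ```java
       try (CloseableIterable<Record> rows = Avro.read(in).project(entriesTableSchema).build()) {
       ```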




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


