aokolnychyi commented on a change in pull request #4243:
URL: https://github.com/apache/iceberg/pull/4243#discussion_r822265143



##########
File path: core/src/main/java/org/apache/iceberg/BaseFilesTable.java
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.expressions.ManifestEvaluator;
+import org.apache.iceberg.expressions.Projections;
+import org.apache.iceberg.expressions.ResidualEvaluator;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
+
+/**
+ * Base class logic for files metadata tables
+ */
+abstract class BaseFilesTable extends BaseMetadataTable {
+
+  BaseFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public Schema schema() {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
+      // avoid returning an empty struct, which is not always supported. instead, drop the partition field
+      return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
+    } else {
+      return schema;
+    }
+  }
+
+  abstract static class BaseFilesTableScan<T extends BaseFile<?>> extends BaseMetadataTableScan {
+    private final Schema fileSchema;
+    private final MetadataTableType type;
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema fileSchema, MetadataTableType type) {
+      super(ops, table, fileSchema);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected Schema fileSchema() {
+      return fileSchema;
+    }
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
+                                 TableScanContext context, MetadataTableType type) {
+      super(ops, table, schema, context);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    @Override
+    public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    public TableScan appendsAfter(long fromSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    protected CloseableIterable<FileScanTask> planFiles(
+        TableOperations ops, Snapshot snapshot, Expression rowFilter,
+        boolean ignoreResiduals, boolean caseSensitive, boolean colStats) {
+      CloseableIterable<ManifestFile> filtered = filterManifests(rowFilter, caseSensitive);
+
+      String schemaString = SchemaParser.toJson(schema());
+      String specString = PartitionSpecParser.toJson(PartitionSpec.unpartitioned());
+      Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : rowFilter;
+      ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter);
+
+      // Data tasks produce the table schema, not the projection schema and projection is done by processing engines.
+      // This data task needs to use the table schema, which may not include a partition schema to avoid having an
+      // empty struct in the schema for unpartitioned tables. Some engines, like Spark, can't handle empty structs in
+      // all cases.
+      return CloseableIterable.transform(filtered, manifest ->
+          new ManifestReadTask<T>(ops.io(), ops.current().specsById(),
+              manifest, schema(), schemaString, specString, residuals));
+    }
+
+    /**
+     * @return list of manifest files to explore for this files metadata table scan
+     */
+    protected abstract List<ManifestFile> manifests();
+
+    private CloseableIterable<ManifestFile> filterManifests(Expression rowFilter,
+                                                            boolean caseSensitive) {
+      CloseableIterable<ManifestFile> manifestIterable = CloseableIterable.withNoopClose(manifests());
+
+      // use an inclusive projection to remove the partition name prefix and filter out any non-partition expressions
+      Expression partitionFilter = Projections
+          .inclusive(
+              transformSpec(fileSchema, table().spec(), PARTITION_FIELD_PREFIX),

Review comment:
       nit: would it help to create a temp var? Then this would fit on one line.
   
   ```
   PartitionSpec spec = transformSpec(fileSchema, table().spec(), PARTITION_FIELD_PREFIX);
   Expression partitionFilter = Projections.inclusive(spec, caseSensitive).project(rowFilter);
   ```

##########
File path: spark/v3.2/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMetadataTables.java
##########
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.extensions;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.generic.GenericData;
+import org.apache.iceberg.FileContent;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.data.TestHelpers;
+import org.apache.iceberg.spark.source.SimpleRecord;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestMetadataTables extends SparkExtensionsTestBase {
+
+  public TestMetadataTables(String catalogName, String implementation, Map<String, String> config) {
+    super(catalogName, implementation, config);
+  }
+
+  @After
+  public void removeTables() {
+    sql("DROP TABLE IF EXISTS %s", tableName);
+  }
+
+  @Test
+  public void testDeleteFilesTable() throws Exception {
+    sql("CREATE TABLE %s (id bigint, data string) USING iceberg TBLPROPERTIES" 
+
+        "('format-version'='2', 'write.delete.mode'='merge-on-read')", 
tableName);
+
+    List<SimpleRecord> records = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "b"),
+        new SimpleRecord(3, "c"),
+        new SimpleRecord(4, "d")
+    );
+    spark.createDataset(records, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    sql("DELETE FROM %s WHERE id=1", tableName);
+
+    Table table = Spark3Util.loadIcebergTable(spark, tableName);
+    List<ManifestFile> expectedManifests = TestHelpers.deleteManifests(table);
+    Assert.assertEquals("Should have 1 delete file", 1, 
expectedManifests.size());
+
+    Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema();
+    Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema();
+
+    List<Row> actual = spark.sql("SELECT * FROM " + tableName + ".delete_files").collectAsList();
+
+    List<GenericData.Record> expected = expectedEntries(table, entriesTableSchema, expectedManifests, null);
+
+    Assert.assertEquals("Should be one delete file manifest entry", 1, 
expected.size());
+    Assert.assertEquals("Metadata table should return one delete file", 1, 
actual.size());
+
+    TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expected.get(0), actual.get(0));
+  }
+
+  @Test
+  public void testDeleteFilesTablePartitioned() throws Exception {
+    sql("CREATE TABLE %s (id bigint, data string) " +
+        "USING iceberg " +
+        "PARTITIONED BY (data) " +
+        "TBLPROPERTIES" +
+        "('format-version'='2', 'write.delete.mode'='merge-on-read')", 
tableName);
+
+    List<SimpleRecord> recordsA = Lists.newArrayList(
+        new SimpleRecord(1, "a"),
+        new SimpleRecord(2, "a")
+    );
+    spark.createDataset(recordsA, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    List<SimpleRecord> recordsB = Lists.newArrayList(
+        new SimpleRecord(1, "b"),
+        new SimpleRecord(2, "b")
+        );
+    spark.createDataset(recordsB, Encoders.bean(SimpleRecord.class))
+        .coalesce(1)
+        .writeTo(tableName)
+        .append();
+
+    sql("DELETE FROM %s WHERE id=1 AND data='a'", tableName);
+    sql("DELETE FROM %s WHERE id=1 AND data='b'", tableName);
+
+    Table table = Spark3Util.loadIcebergTable(spark, tableName);
+    List<ManifestFile> expectedManifests = TestHelpers.deleteManifests(table);
+    Assert.assertEquals("Should have 2 delete files", 2, 
expectedManifests.size());
+
+    List<Row> actual = spark.sql("SELECT * FROM " + tableName + ".delete_files " +
+        "WHERE partition.data='a'").collectAsList();
+
+    Schema entriesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".entries").schema();
+    Schema filesTableSchema = Spark3Util.loadIcebergTable(spark, tableName + ".delete_files").schema();
+
+    List<GenericData.Record> expected = expectedEntries(table, entriesTableSchema, expectedManifests, "a");
+
+    Assert.assertEquals("Should be one delete file manifest entry", 1, 
expected.size());
+    Assert.assertEquals("Metadata table should return one delete file", 1, 
actual.size());
+
+    TestHelpers.assertEqualsSafe(filesTableSchema.asStruct(), expected.get(0), actual.get(0));
+  }
+
+  /**
+   * Find matching manifest entries of an Iceberg table
+   * @param table iceberg table
+   * @param entriesTableSchema schema of Manifest entries
+   * @param manifestsToExplore manifests to explore of the table
+   * @param partValue partition value that manifest entries must match, or null to skip filtering
+   */
+  private List<GenericData.Record> expectedEntries(Table table, Schema entriesTableSchema,

Review comment:
       nit: can we import directly to shorten the line? Maybe `throws IOException` would fit then.
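
   For illustration, a sketch of what that could look like; the shortened signature below is an assumption pieced together from the javadoc and call sites above, not the final code:

   ```
   import org.apache.avro.generic.GenericData.Record;

   private List<Record> expectedEntries(Table table, Schema entriesTableSchema,
                                        List<ManifestFile> manifestsToExplore, String partValue) throws IOException {
   ```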

##########
File path: core/src/main/java/org/apache/iceberg/BaseFilesTable.java
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.expressions.ManifestEvaluator;
+import org.apache.iceberg.expressions.Projections;
+import org.apache.iceberg.expressions.ResidualEvaluator;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
+
+/**
+ * Base class logic for files metadata tables
+ */
+abstract class BaseFilesTable extends BaseMetadataTable {
+
+  BaseFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public Schema schema() {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
+      // avoid returning an empty struct, which is not always supported. instead, drop the partition field
+      return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
+    } else {
+      return schema;
+    }
+  }
+
+  abstract static class BaseFilesTableScan<T extends BaseFile<?>> extends BaseMetadataTableScan {
+    private final Schema fileSchema;
+    private final MetadataTableType type;
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema fileSchema, MetadataTableType type) {
+      super(ops, table, fileSchema);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected Schema fileSchema() {
+      return fileSchema;
+    }
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
+                                 TableScanContext context, MetadataTableType type) {
+      super(ops, table, schema, context);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    @Override
+    public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    public TableScan appendsAfter(long fromSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    protected CloseableIterable<FileScanTask> planFiles(
+        TableOperations ops, Snapshot snapshot, Expression rowFilter,
+        boolean ignoreResiduals, boolean caseSensitive, boolean colStats) {
+      CloseableIterable<ManifestFile> filtered = filterManifests(rowFilter, caseSensitive);
+
+      String schemaString = SchemaParser.toJson(schema());
+      String specString = PartitionSpecParser.toJson(PartitionSpec.unpartitioned());
+      Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : rowFilter;
+      ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter);
+
+      // Data tasks produce the table schema, not the projection schema and projection is done by processing engines.
+      // This data task needs to use the table schema, which may not include a partition schema to avoid having an
+      // empty struct in the schema for unpartitioned tables. Some engines, like Spark, can't handle empty structs in
+      // all cases.
+      return CloseableIterable.transform(filtered, manifest ->
+          new ManifestReadTask<T>(ops.io(), ops.current().specsById(),
+              manifest, schema(), schemaString, specString, residuals));
+    }
+
+    /**
+     * @return list of manifest files to explore for this files metadata table scan
+     */
+    protected abstract List<ManifestFile> manifests();
+
+    private CloseableIterable<ManifestFile> filterManifests(Expression rowFilter,
+                                                            boolean caseSensitive) {
+      CloseableIterable<ManifestFile> manifestIterable = CloseableIterable.withNoopClose(manifests());
+
+      // use an inclusive projection to remove the partition name prefix and filter out any non-partition expressions
+      Expression partitionFilter = Projections
+          .inclusive(
+              transformSpec(fileSchema, table().spec(), PARTITION_FIELD_PREFIX),
+              caseSensitive)
+          .project(rowFilter);
+
+      ManifestEvaluator manifestEval = ManifestEvaluator.forPartitionFilter(
+          partitionFilter, table().spec(), caseSensitive);
+
+      return CloseableIterable.filter(manifestIterable, manifestEval::eval);
+    }
+  }
+
+  static class ManifestReadTask<T extends BaseFile<?>> extends BaseFileScanTask implements DataTask {

Review comment:
       I am not sure about type params in this and the scan classes. Do we need them?
   I'd probably remove them and just cast `file` to `StructLike` in `rows`.

   This type parameter assumes we read back `BaseFile`, which is true now, but may change in the future. Also, there are places where `ManifestReadTask` is instantiated without any types and IntelliJ complains. I think it should be sufficient to get back `StructLike` and that's what we can cast to.
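
   For example, a minimal sketch of the cast-based version; `files()` below is a hypothetical stand-in for however the task reads the entries back, not an existing method:

   ```
   static class ManifestReadTask extends BaseFileScanTask implements DataTask {

     @Override
     public CloseableIterable<StructLike> rows() {
       // expose the files read back from the manifest as plain StructLike rows;
       // files() stands in for the actual read path
       return CloseableIterable.transform(files(), file -> (StructLike) file);
     }
   }
   ```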

##########
File path: core/src/main/java/org/apache/iceberg/AllDataFilesTable.java
##########
@@ -118,7 +118,8 @@ public TableScan asOfTime(long timestampMillis) {
       ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter);
 
       return CloseableIterable.transform(manifests, manifest ->
-          new DataFilesTable.ManifestReadTask(ops.io(), manifest, schema(), schemaString, specString, residuals));
+          new BaseFilesTable.ManifestReadTask(ops.io(), ops.current().specsById(), manifest, schema(),

Review comment:
       nit: what about importing `ManifestReadTask` directly to shorten the line?
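
   i.e., something along these lines (a sketch based on the hunk above):

   ```
   import org.apache.iceberg.BaseFilesTable.ManifestReadTask;

   // ...
   return CloseableIterable.transform(manifests, manifest ->
       new ManifestReadTask(ops.io(), ops.current().specsById(), manifest, schema(),
           schemaString, specString, residuals));
   ```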

##########
File path: core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java
##########
@@ -601,14 +791,14 @@ private void validateIncludesPartitionScan(CloseableIterable<FileScanTask> tasks
   private void validateFileScanTasks(CloseableIterable<FileScanTask> fileScanTasks, int partValue) {
     Assert.assertTrue("File scan tasks do not include correct file",
         StreamSupport.stream(fileScanTasks.spliterator(), false).anyMatch(t -> {
-          ManifestFile mf = ((DataFilesTable.ManifestReadTask) t).manifest();
+          ManifestFile mf = ((BaseFilesTable.ManifestReadTask) t).manifest();
           return manifestHasPartition(mf, partValue);
         }));
   }
 
   private void validateCombinedScanTasks(CloseableIterable<CombinedScanTask> tasks, int partValue) {
     StreamSupport.stream(tasks.spliterator(), false)
-        .flatMap(c -> c.files().stream().map(t -> ((DataFilesTable.ManifestReadTask) t).manifest()))
+        .flatMap(c -> c.files().stream().map(t -> ((BaseFilesTable.ManifestReadTask) t).manifest()))

Review comment:
       nit: what about a direct import?
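
   e.g., a sketch of the direct-import version:

   ```
   import org.apache.iceberg.BaseFilesTable.ManifestReadTask;

   // ...
   ManifestFile mf = ((ManifestReadTask) t).manifest();
   ```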

##########
File path: core/src/main/java/org/apache/iceberg/BaseFilesTable.java
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.expressions.ManifestEvaluator;
+import org.apache.iceberg.expressions.Projections;
+import org.apache.iceberg.expressions.ResidualEvaluator;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
+
+/**
+ * Base class logic for files metadata tables
+ */
+abstract class BaseFilesTable extends BaseMetadataTable {
+
+  BaseFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public Schema schema() {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
+      // avoid returning an empty struct, which is not always supported. instead, drop the partition field
+      return TypeUtil.selectNot(schema, Sets.newHashSet(DataFile.PARTITION_ID));
+    } else {
+      return schema;
+    }
+  }
+
+  abstract static class BaseFilesTableScan<T extends BaseFile<?>> extends BaseMetadataTableScan {
+    private final Schema fileSchema;
+    private final MetadataTableType type;
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema fileSchema, MetadataTableType type) {
+      super(ops, table, fileSchema);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected Schema fileSchema() {
+      return fileSchema;
+    }
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
+                                 TableScanContext context, MetadataTableType type) {
+      super(ops, table, schema, context);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    @Override
+    public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    public TableScan appendsAfter(long fromSnapshotId) {
+      throw new UnsupportedOperationException(
+          String.format("Cannot incrementally scan table of type %s", type.name()));
+    }
+
+    @Override
+    protected CloseableIterable<FileScanTask> planFiles(
+        TableOperations ops, Snapshot snapshot, Expression rowFilter,
+        boolean ignoreResiduals, boolean caseSensitive, boolean colStats) {
+      CloseableIterable<ManifestFile> filtered = filterManifests(rowFilter, caseSensitive);
+
+      String schemaString = SchemaParser.toJson(schema());
+      String specString = PartitionSpecParser.toJson(PartitionSpec.unpartitioned());
+      Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : rowFilter;
+      ResidualEvaluator residuals = ResidualEvaluator.unpartitioned(filter);
+
+      // Data tasks produce the table schema, not the projection schema and projection is done by processing engines.
+      // This data task needs to use the table schema, which may not include a partition schema to avoid having an
+      // empty struct in the schema for unpartitioned tables. Some engines, like Spark, can't handle empty structs in
+      // all cases.
+      return CloseableIterable.transform(filtered, manifest ->
+          new ManifestReadTask<T>(ops.io(), ops.current().specsById(),
+              manifest, schema(), schemaString, specString, residuals));
+    }
+
+    /**
+     * @return list of manifest files to explore for this files metadata table scan
+     */
+    protected abstract List<ManifestFile> manifests();
+
+    private CloseableIterable<ManifestFile> filterManifests(Expression rowFilter,

Review comment:
       nit: given the name, I'd expect manifests to be passed in as an arg.
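
   For example, a possible shape (a sketch; everything after the filter setup is elided):

   ```
   // call site in planFiles
   CloseableIterable<ManifestFile> filtered = filterManifests(manifests(), rowFilter, caseSensitive);

   private CloseableIterable<ManifestFile> filterManifests(List<ManifestFile> manifests,
                                                           Expression rowFilter, boolean caseSensitive) {
     CloseableIterable<ManifestFile> manifestIterable = CloseableIterable.withNoopClose(manifests);
     // ... inclusive projection and ManifestEvaluator filtering as before
   }
   ```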

##########
File path: core/src/main/java/org/apache/iceberg/BaseFilesTable.java
##########
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+import java.util.Map;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.expressions.ManifestEvaluator;
+import org.apache.iceberg.expressions.Projections;
+import org.apache.iceberg.expressions.ResidualEvaluator;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
+import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types.StructType;
+
+/**
+ * Base class logic for files metadata tables
+ */
+abstract class BaseFilesTable extends BaseMetadataTable {
+
+  BaseFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public Schema schema() {
+    StructType partitionType = Partitioning.partitionType(table());
+    Schema schema = new Schema(DataFile.getType(partitionType).fields());
+    if (partitionType.fields().size() < 1) {
+      // avoid returning an empty struct, which is not always supported. 
instead, drop the partition field
+      return TypeUtil.selectNot(schema, 
Sets.newHashSet(DataFile.PARTITION_ID));
+    } else {
+      return schema;
+    }
+  }
+
+  abstract static class BaseFilesTableScan<T extends BaseFile<?>> extends 
BaseMetadataTableScan {
+    private final Schema fileSchema;
+    private final MetadataTableType type;
+
+    protected BaseFilesTableScan(TableOperations ops, Table table, Schema 
fileSchema, MetadataTableType type) {
+      super(ops, table, fileSchema);
+      this.fileSchema = fileSchema;
+      this.type = type;
+    }
+
+    protected Schema fileSchema() {

Review comment:
       nit: constructors should come before any methods.
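
   i.e., a sketch of the expected ordering (bodies elided):

   ```
   protected BaseFilesTableScan(TableOperations ops, Table table, Schema fileSchema, MetadataTableType type) {
     // ...
   }

   protected BaseFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,
                                TableScanContext context, MetadataTableType type) {
     // ...
   }

   protected Schema fileSchema() {
     return fileSchema;
   }
   ```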

##########
File path: core/src/main/java/org/apache/iceberg/DeleteFilesTable.java
##########
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+
+/**
+ * A {@link Table} implementation that exposes a table's delete files as rows.
+ */
+public class DeleteFilesTable extends BaseFilesTable {
+
+  DeleteFilesTable(TableOperations ops, Table table) {
+    this(ops, table, table.name() + ".delete_files");
+  }
+
+  DeleteFilesTable(TableOperations ops, Table table, String name) {
+    super(ops, table, name);
+  }
+
+  @Override
+  public TableScan newScan() {
+    return new DeleteFilesTableScan(operations(), table(), schema());
+  }
+
+  @Override
+  MetadataTableType metadataTableType() {
+    return MetadataTableType.DELETE_FILES;
+  }
+
+  public static class DeleteFilesTableScan extends BaseFilesTableScan<GenericDeleteFile> {
+
+    DeleteFilesTableScan(TableOperations ops, Table table, Schema fileSchema) {
+      super(ops, table, fileSchema, MetadataTableType.DELETE_FILES);
+    }
+
+    private DeleteFilesTableScan(TableOperations ops, Table table, Schema schema, Schema fileSchema,

Review comment:
       nit: this constructor is not private in `DataFilesTableScan`. Either make this also package-private and fit all args on one line, or make that one private for consistency.

##########
File path: core/src/test/java/org/apache/iceberg/TestMetadataTableScans.java
##########
@@ -601,14 +791,14 @@ private void validateIncludesPartitionScan(CloseableIterable<FileScanTask> tasks
   private void validateFileScanTasks(CloseableIterable<FileScanTask> fileScanTasks, int partValue) {
     Assert.assertTrue("File scan tasks do not include correct file",
         StreamSupport.stream(fileScanTasks.spliterator(), false).anyMatch(t -> {
-          ManifestFile mf = ((DataFilesTable.ManifestReadTask) t).manifest();

Review comment:
       nit: what about a direct import?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
