szehon-ho commented on code in PR #4566:
URL: https://github.com/apache/iceberg/pull/4566#discussion_r851382921


##########
core/src/main/java/org/apache/iceberg/BaseFilesTable.java:
##########
@@ -56,72 +56,73 @@ public Schema schema() {
     }
   }
 
-  abstract static class BaseFilesTableScan extends BaseMetadataTableScan {
+  private static CloseableIterable<FileScanTask> planFiles(Table table, 
CloseableIterable<ManifestFile> manifests,
+                                                           Schema tableSchema, 
Schema projectedSchema,
+                                                           TableScanContext 
context) {
+    Expression rowFilter = context.rowFilter();
+    boolean caseSensitive = context.caseSensitive();
+    boolean ignoreResiduals = context.ignoreResiduals();
+
+    LoadingCache<Integer, ManifestEvaluator> evalCache = 
Caffeine.newBuilder().build(specId -> {

Review Comment:
   Not related to this change, but my next PR plans to use this logic as well 
to fix filtering for PartitionsTable.  Maybe at that time I will move the 
evalCache part of this further up to BaseMetadataTables.



##########
core/src/main/java/org/apache/iceberg/BaseAllMetadataTableScan.java:
##########
@@ -19,40 +19,56 @@
 
 package org.apache.iceberg;
 
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import org.apache.iceberg.events.Listeners;
+import org.apache.iceberg.events.ScanEvent;
 import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.relocated.com.google.common.base.Function;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.util.ParallelIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 abstract class BaseAllMetadataTableScan extends BaseMetadataTableScan {
+  private static final Logger LOG = 
LoggerFactory.getLogger(BaseAllMetadataTableScan.class);
 
-  BaseAllMetadataTableScan(TableOperations ops, Table table, Schema 
fileSchema) {
-    super(ops, table, fileSchema);
+  BaseAllMetadataTableScan(TableOperations ops, Table table, Schema schema, 
MetadataTableType tableType) {
+    super(ops, table, schema, tableType);
   }
 
-  BaseAllMetadataTableScan(TableOperations ops, Table table, Schema schema, 
TableScanContext context) {
-    super(ops, table, schema, context);
+  BaseAllMetadataTableScan(TableOperations ops, Table table, Schema schema, 
MetadataTableType tableType,
+                           TableScanContext context) {
+    super(ops, table, schema, tableType, context);
   }
 
-  /**
-   * Type of scan being performed, such as {@link 
MetadataTableType#ALL_DATA_FILES} when scanning
-   * a table's {@link org.apache.iceberg.AllDataFilesTable}.
-   * <p>
-   * Used for logging and error messages.
-   */
-  protected abstract String tableType();
-
   @Override
-  public TableScan appendsBetween(long fromSnapshotId, long toSnapshotId) {
-    throw new UnsupportedOperationException(
-        String.format("Cannot incrementally scan table of type %s", 
tableType()));
+  public TableScan useSnapshot(long scanSnapshotId) {
+    throw new UnsupportedOperationException("Cannot select snapshot in table: 
" + tableType());
   }
 
   @Override
-  public TableScan appendsAfter(long fromSnapshotId) {
-    throw new UnsupportedOperationException(
-        String.format("Cannot incrementally scan table of type %s", 
tableType()));
+  public TableScan asOfTime(long timestampMillis) {
+    throw new UnsupportedOperationException("Cannot select snapshot in table: 
" + tableType());
   }
 
   @Override
   public CloseableIterable<FileScanTask> planFiles() {
-    return super.planFilesAllSnapshots();
+    LOG.info("Scanning metadata table {} with filter {}.", table(), filter());
+    Listeners.notifyAll(new ScanEvent(table().name(), 0L, filter(), schema()));
+
+    return doPlanFiles();
+  }
+
+  protected CloseableIterable<ManifestFile> allManifests(Function<Snapshot, 
Iterable<ManifestFile>> toManifests) {

Review Comment:
   The name is a bit confusing, as 'allManifests' is already used to denote data + 
delete manifests.  How about 'allSnapshotManifests' or 'allSnapshots', or 
something else that denotes the fact that it explores all snapshots?



##########
core/src/main/java/org/apache/iceberg/AllEntriesTable.java:
##########
@@ -88,34 +83,16 @@ protected TableScan newRefinedScan(TableOperations ops, 
Table table, Schema sche
     }
 
     @Override
-    protected String tableType() {
-      return MetadataTableType.ALL_ENTRIES.name();
-    }
+    protected CloseableIterable<FileScanTask> doPlanFiles() {
+      CloseableIterable<ManifestFile> manifests = 
allManifests(Snapshot::allManifests);
 
-    @Override
-    protected CloseableIterable<FileScanTask> planFiles(
-        TableOperations ops, Snapshot snapshot, Expression rowFilter,
-        boolean ignoreResiduals, boolean caseSensitive, boolean colStats) {
-      CloseableIterable<ManifestFile> manifests = allManifestFiles(
-          ops.current().snapshots(), context().planExecutor());
       String schemaString = SchemaParser.toJson(schema());
       String specString = 
PartitionSpecParser.toJson(PartitionSpec.unpartitioned());
-      Expression filter = ignoreResiduals ? Expressions.alwaysTrue() : 
rowFilter;
+      Expression filter = shouldIgnoreResiduals() ? Expressions.alwaysTrue() : 
filter();

Review Comment:
   Should we use ignoreResiduals throughout instead of "shouldIgnoreResiduals"? 
 (Per our code style we try to avoid redundant words in getter/setter method 
names, and ignoreResiduals seems clear enough.)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to