[GitHub] [iceberg] aokolnychyi commented on a diff in pull request #6622: push down min/max/count to iceberg

via GitHub Fri, 03 Feb 2023 18:52:09 -0800


aokolnychyi commented on code in PR #6622:
URL: https://github.com/apache/iceberg/pull/6622#discussion_r1096469739



##########
spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java:
##########
@@ -158,6 +182,141 @@ public Filter[] pushedFilters() {
     return pushedFilters;
   }
 
+  @Override
+  public boolean pushAggregation(Aggregation aggregation) {
+    if (!pushDownAggregate(aggregation)) {
+      return false;
+    }
+
+    AggregateEvaluator aggregateEvaluator;
+    try {
+      List<Expression> aggregates =
+          Arrays.stream(aggregation.aggregateExpressions())
+              .map(agg -> SparkAggregates.convert(agg))
+              .collect(Collectors.toList());
+      aggregateEvaluator = AggregateEvaluator.create(schema, aggregates);
+    } catch (Exception e) {
+      LOG.info("Can't push down aggregates: " + e.getMessage());
+      return false;
+    }
+
+    if 
(!metricsModeSupportsAggregatePushDown(aggregateEvaluator.aggregates())) {
+      LOG.info("The MetricsMode doesn't support aggregate push down.");
+      return false;
+    }
+
+    List<ManifestFile> manifests = getSnapshot().allManifests(table.io());
+
+    for (ManifestFile manifest : manifests) {
+      try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest, 
table.io())) {
+        for (DataFile dataFile : reader) {
+          aggregateEvaluator.update(dataFile.copy());
+        }
+      } catch (IOException e) {
+        LOG.info("Can't push down aggregates: " + e.getMessage());
+        return false;
+      }
+    }
+
+    Object[] res = aggregateEvaluator.result();
+    applyDataTypeConversionIfNecessary(res);
+
+    List<Object> valuesInSparkInternalRow = java.util.Arrays.asList(res);
+    this.pushedAggregateRows = new InternalRow[1];
+    pushedAggregateRows[0] =
+        
InternalRow.fromSeq(JavaConverters.asScalaBuffer(valuesInSparkInternalRow).toSeq());
+    pushedAggregateSchema =
+        SparkSchemaUtil.convert(new 
Schema(aggregateEvaluator.resultType().fields()));
+    return true;
+  }
+
+  private boolean pushDownAggregate(Aggregation aggregation) {
+    if (!(table instanceof BaseTable)) {
+      return false;
+    }
+
+    if (!readConf.aggregatePushDown()) {
+      return false;
+    }
+
+    Snapshot snapshot = getSnapshot();
+    if (snapshot == null) {
+      return false;
+    } else {
+      Map<String, String> map = snapshot.summary();
+      // if there are row-level deletes in current snapshot, the statistics
+      // maybe changed, so disable push down aggregate.
+      if (Integer.parseInt(map.getOrDefault("total-position-deletes", "0")) > 0
+          || Integer.parseInt(map.getOrDefault("total-equality-deletes", "0")) 
> 0) {
+        LOG.info("Cannot push down aggregates when row level deletes exist.)");

Review Comment:
   I also support the idea of checking if any matching tasks have deletes and 
using that instead of relying on generic snapshot metadata.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] aokolnychyi commented on a diff in pull request #6622: push down min/max/count to iceberg

Reply via email to