RussellSpitzer commented on code in PR #6524:
URL: https://github.com/apache/iceberg/pull/6524#discussion_r1062606757
##########
spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java:
##########
@@ -106,41 +106,50 @@ public SparkScanBuilder caseSensitive(boolean isCaseSensitive) {
   @Override
   public Filter[] pushFilters(Filter[] filters) {
     List<Expression> expressions = Lists.newArrayListWithExpectedSize(filters.length);
-    List<Filter> pushed = Lists.newArrayListWithExpectedSize(filters.length);
+    List<Filter> pushableFilters = Lists.newArrayListWithExpectedSize(filters.length);
+    List<Filter> postScanFilters = Lists.newArrayListWithExpectedSize(filters.length);

     for (Filter filter : filters) {
-      Expression expr = null;
-      try {
-        expr = SparkFilters.convert(filter);
-      } catch (IllegalArgumentException e) {
-        // converting to Iceberg Expression failed, so this expression cannot be pushed down
-        LOG.info(
-            "Failed to convert filter to Iceberg expression, skipping push down for this expression: {}. {}",
-            filter,
-            e.getMessage());
-      }
+      Expression expr = safelyConvertFilter(filter);

       if (expr != null) {
-        try {
-          Binder.bind(schema.asStruct(), expr, caseSensitive);
-          expressions.add(expr);
-          pushed.add(filter);
-        } catch (ValidationException e) {
-          // binding to the table schema failed, so this expression cannot be pushed down
-          LOG.info(
-              "Failed to bind expression to table schema, skipping push down for this expression: {}. {}",
-              filter,
-              e.getMessage());
-        }
+        expressions.add(expr);
+        pushableFilters.add(filter);
+      }
+
+      if (expr == null || requiresRecordLevelFiltering(expr)) {
+        postScanFilters.add(filter);
       }
     }

     this.filterExpressions = expressions;
-    this.pushedFilters = pushed.toArray(new Filter[0]);
+    this.pushedFilters = pushableFilters.toArray(new Filter[0]);
+
+    // all unsupported filters and filters that require record-level filtering
+    // must be reported back and handled on the Spark side
+    return postScanFilters.toArray(new Filter[0]);
+  }
+
+  private Expression safelyConvertFilter(Filter filter) {
+    try {
+      Expression expr = SparkFilters.convert(filter);
+
+      if (expr != null) {
+        // try binding the expression to ensure it can be pushed down
+        Binder.bind(schema.asStruct(), expr, caseSensitive);
+        return expr;
+      }
+
+    } catch (Exception e) {
+      LOG.warn("Exception while converting {} to Iceberg: {}.", filter, e.getMessage());
+    }
+
+    return null;
+  }

-    // Spark doesn't support residuals per task, so return all filters
-    // to get Spark to handle record-level filtering
-    return filters;
+  private boolean requiresRecordLevelFiltering(Expression expr) {
+    return table.specs().values().stream()

Review Comment:
   I think another optimization may be worth doing: group all expressions based on bound column and operation. In the bad case we are considering, we would end up checking whether or not we can filter a "column = literal" for a ton of different literal values.
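   A rough sketch of that grouping idea follows. It is purely illustrative and not code from this PR: the class name, the key format, and the injected specCheck predicate (standing in for the per-expression check against table.specs()) are all hypothetical. Per the suggestion above, the decision depends only on the column and the operation, not the literal value, so the check runs once per (column, operation) key instead of once per filter.

    // Hypothetical sketch: memoize the record-level-filtering decision per
    // (column, operation) so thousands of "col = <literal>" filters trigger
    // only one spec check. Names here are stand-ins, not the PR's code.
    import java.util.HashMap;
    import java.util.Map;
    import java.util.function.Predicate;
    import org.apache.spark.sql.sources.EqualTo;
    import org.apache.spark.sql.sources.Filter;
    import org.apache.spark.sql.sources.GreaterThan;
    import org.apache.spark.sql.sources.LessThan;

    class FilterGroupingSketch {
      // cached decision per (column, operation) key
      private final Map<String, Boolean> decisionByColumnAndOp = new HashMap<>();
      // stand-in for the real per-expression check against the table's partition specs
      private final Predicate<Filter> specCheck;

      FilterGroupingSketch(Predicate<Filter> specCheck) {
        this.specCheck = specCheck;
      }

      boolean requiresRecordLevelFiltering(Filter filter) {
        String key = groupingKey(filter);
        if (key == null) {
          // filters we cannot classify by (column, op) fall back to a direct check
          return specCheck.test(filter);
        }

        return decisionByColumnAndOp.computeIfAbsent(key, ignored -> specCheck.test(filter));
      }

      private String groupingKey(Filter filter) {
        // the literal value is ignored on purpose: only column + operation matter here
        if (filter instanceof EqualTo) {
          return ((EqualTo) filter).attribute() + "#EQ";
        } else if (filter instanceof GreaterThan) {
          return ((GreaterThan) filter).attribute() + "#GT";
        } else if (filter instanceof LessThan) {
          return ((LessThan) filter).attribute() + "#LT";
        }
        return null;
      }
    }

   In pushFilters, the builder could route each filter through such a helper and reuse the cached answer for repeated "column = literal" predicates, rather than re-evaluating every partition spec for each literal.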