rdblue commented on code in PR #6775: URL: https://github.com/apache/iceberg/pull/6775#discussion_r1199483097 ########## python/pyiceberg/io/pyarrow.py: ########## @@ -721,18 +771,47 @@ def _file_to_table( fragment_scanner = ds.Scanner.from_fragment( fragment=fragment, schema=physical_schema, - filter=pyarrow_filter, + # This will push down the query to Arrow. + # But in case there are positional deletes, we have to apply them first + filter=pyarrow_filter if not positional_deletes else None, columns=[col.name for col in file_project_schema.columns], ) + if positional_deletes: + # In the case of a mask, it is a bit awkward because we first + # need to go to a table to apply the bitwise mask, and then + # the table is wrapped into a dataset to apply the expression + indices = _create_positional_deletes_indices(positional_deletes, fragment.count_rows) + + if limit: + if pyarrow_filter is not None: + # In case of the filter, we don't exactly know how many rows + # we need to fetch upfront, can be optimized in the future: + # https://github.com/apache/arrow/issues/35301 + arrow_table = fragment_scanner.take(indices) Review Comment: Is this lazy or should we also limit indices? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org
########## python/pyiceberg/io/pyarrow.py: ########## @@ -721,18 +771,47 @@ def _file_to_table( fragment_scanner = ds.Scanner.from_fragment( fragment=fragment, schema=physical_schema, - filter=pyarrow_filter, + # This will push down the query to Arrow. + # But in case there are positional deletes, we have to apply them first + filter=pyarrow_filter if not positional_deletes else None, columns=[col.name for col in file_project_schema.columns], ) + if positional_deletes: + # In the case of a mask, it is a bit awkward because we first + # need to go to a table to apply the bitwise mask, and then + # the table is warped into a dataset to apply the expression + indices = _create_positional_deletes_indices(positional_deletes, fragment.count_rows) + + if limit: + if pyarrow_filter is not None: + # In case of the filter, we don't exactly know how many rows + # we need to fetch upfront, can be optimized in the future: + # https://github.com/apache/arrow/issues/35301 + arrow_table = fragment_scanner.take(indices) Review Comment: Is this lazy or should we also limit indices? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org