This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b1618e0af6 GH-44366: [Python][Acero] RecordBatch.filter on expression 
raises error if result set is empty (#46057)
b1618e0af6 is described below

commit b1618e0af67c88cce4b00cf94c4636fdfaf1227d
Author: koenvo <[email protected]>
AuthorDate: Wed Apr 9 13:03:24 2025 +0200

    GH-44366: [Python][Acero] RecordBatch.filter on expression raises error if 
result set is empty (#46057)
    
    ### Rationale for this change
    When filtering a RecordBatch using an expression that results in zero 
matching rows, an IndexError was raised due to an attempt to access the first 
batch of an empty result. This change ensures that such cases are handled 
gracefully by returning an empty RecordBatch with the correct schema.
    
    ### What changes are included in this PR?
    Fixed a bug in _filter_table where filtering all rows from a RecordBatch 
would raise an IndexError.
    
    When the filtered result has no rows, _filter_table now constructs an 
empty RecordBatch directly from the result schema (one empty array per field).
    
    ### Are these changes tested?
    Yes. Tests for the empty-result case (both mask and expression filters) 
were added to test_filter_record_batch and test_filter_table in 
python/pyarrow/tests/test_compute.py.
    
    ### Are there any user-facing changes?
    No changes to public APIs. The fix ensures more robust handling of edge 
cases internally.
    
    - GitHub issue: https://github.com/apache/arrow/issues/44366
    * GitHub Issue: #44366
    
    Authored-by: Koen Vossen <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/acero.py              | 10 +++++++---
 python/pyarrow/tests/test_compute.py | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py
index 86bf7cbf4d..4f3fd82eac 100644
--- a/python/pyarrow/acero.py
+++ b/python/pyarrow/acero.py
@@ -22,7 +22,7 @@
 # distutils: language = c++
 # cython: language_level = 3
 
-from pyarrow.lib import Table, RecordBatch
+from pyarrow.lib import Table, RecordBatch, array
 from pyarrow.compute import Expression, field
 
 try:
@@ -362,7 +362,7 @@ def _filter_table(table, expression):
 
     Returns
     -------
-    Table
+    Table or RecordBatch
     """
     is_batch = False
     if isinstance(table, RecordBatch):
@@ -375,7 +375,11 @@ def _filter_table(table, expression):
     ])
     result = decl.to_table(use_threads=True)
     if is_batch:
-        result = result.combine_chunks().to_batches()[0]
+        if result.num_rows > 0:
+            result = result.combine_chunks().to_batches()[0]
+        else:
+            arrays = [array([], type=field.type) for field in result.schema]
+            result = RecordBatch.from_arrays(arrays, schema=result.schema)
     return result
 
 
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 453bc177d9..d186addf91 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1415,6 +1415,17 @@ def test_filter_record_batch():
     expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"])
     assert result.equals(expected)
 
+    # GH-46057: filtering all rows should return empty RecordBatch with same schema
+    mask_empty_result = batch.filter(pa.array([False] * batch.num_rows))
+    assert mask_empty_result.num_rows == 0
+    assert isinstance(mask_empty_result, pa.RecordBatch)
+    assert mask_empty_result.schema.equals(batch.schema)
+
+    expr_empty_result = batch.filter(pc.field("a'") == "zzz")
+    assert expr_empty_result.num_rows == 0
+    assert isinstance(expr_empty_result, pa.RecordBatch)
+    assert expr_empty_result.schema.equals(batch.schema)
+
 
 def test_filter_table():
     table = pa.table([pa.array(["a", None, "c", "d", "e"])], names=["a"])
@@ -1434,6 +1445,17 @@ def test_filter_table():
         result = table.filter(mask, null_selection_behavior="emit_null")
         assert result.equals(expected_null)
 
+    # GH-46057: filtering all rows should return empty table with same schema
+    mask_empty_result = table.filter(pa.array([False] * table.num_rows))
+    assert mask_empty_result.num_rows == 0
+    assert isinstance(mask_empty_result, pa.Table)
+    assert mask_empty_result.schema.equals(table.schema)
+
+    expr_empty_result = table.filter(pc.field("a") == "zzz")
+    assert expr_empty_result.num_rows == 0
+    assert isinstance(expr_empty_result, pa.Table)
+    assert expr_empty_result.schema.equals(table.schema)
+
 
 def test_filter_errors():
     arr = pa.chunked_array([["a", None], ["c", "d", "e"]])

Reply via email to