This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 36d3394c78 GH-33377: [Python] Table.drop should support passing a
single column (#33810)
36d3394c78 is described below
commit 36d3394c78a5a4a978f1944ff75962bae12c43a8
Author: Dane Pitkin <[email protected]>
AuthorDate: Fri Feb 3 07:16:31 2023 -0500
GH-33377: [Python] Table.drop should support passing a single column
(#33810)
### Rationale for this change
Provide a better user experience in pyarrow when working with `Table`.
### What changes are included in this PR?
Allow `Table.drop()` to accept a single column name as a `str` argument.
Provide a wrapper `Table.drop_column(str)`, which calls `Table.drop()`, to
match similar APIs such as `add_column()`, `append_column()`.
### Are these changes tested?
Updated the pytest for `Table.drop()` and added a pytest for
`Table.drop_column()`. Verified both test cases ran successfully locally.
### Are there any user-facing changes?
Yes, a new pyarrow API is added. The existing pyarrow API is backwards
compatible, but now also supports an additional input type (str). The doc
strings are updated, so I assume the python API reference will be auto-updated
somehow? Let me know if this is not the case.
* Closes: #33377
Authored-by: Dane Pitkin <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/table.pxi | 23 +++++++++++++++--------
python/pyarrow/tests/test_dataset.py | 2 +-
python/pyarrow/tests/test_table.py | 29 ++++++++++++++++++++++++++---
3 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 318c4d323b..c5c8924b42 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -4664,24 +4664,24 @@ cdef class Table(_PandasConvertible):
return pyarrow_wrap_table(c_table)
- def drop(self, columns):
+ def drop_columns(self, columns):
"""
Drop one or more columns and return a new table.
Parameters
----------
- columns : list of str
- List of field names referencing existing columns.
+ columns : str or list[str]
+ Field name(s) referencing existing column(s).
Raises
------
KeyError
- If any of the passed columns name are not existing.
+ If any of the passed column names do not exist.
Returns
-------
Table
- New table without the columns.
+ New table without the column(s).
Examples
--------
@@ -4693,19 +4693,22 @@ cdef class Table(_PandasConvertible):
Drop one column:
- >>> table.drop(["animals"])
+ >>> table.drop_columns("animals")
pyarrow.Table
n_legs: int64
----
n_legs: [[2,4,5,100]]
- Drop more columns:
+ Drop one or more columns:
- >>> table.drop(["n_legs", "animals"])
+ >>> table.drop_columns(["n_legs", "animals"])
pyarrow.Table
...
----
"""
+ if isinstance(columns, str):
+ columns = [columns]
+
indices = []
for col in columns:
idx = self.schema.get_field_index(col)
@@ -4722,6 +4725,10 @@ cdef class Table(_PandasConvertible):
return table
+ def drop(self, columns):
+ """Alias of Table.drop_columns, but kept for backwards
compatibility."""
+ return self.drop_columns(columns)
+
def group_by(self, keys):
"""Declare a grouping over the columns of the table.
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index 27edc8afad..f6b4c1e9b7 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3906,7 +3906,7 @@ def test_write_dataset_with_scanner(tempdir):
load_back = ds.dataset(tempdir2, format='ipc', partitioning=["b"])
load_back_table = load_back.to_table()
assert dict(load_back_table.to_pydict()
- ) == table.drop(["a"]).to_pydict()
+ ) == table.drop_columns("a").to_pydict()
@pytest.mark.parquet
diff --git a/python/pyarrow/tests/test_table.py
b/python/pyarrow/tests/test_table.py
index d710b7aac6..33d103d974 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -1050,17 +1050,40 @@ def test_table_set_column():
assert t2.equals(expected)
-def test_table_drop():
+def test_table_drop_columns():
""" drop one or more columns given labels"""
a = pa.array(range(5))
b = pa.array([-10, -5, 0, 5, 10])
c = pa.array(range(5, 10))
+ table = pa.Table.from_arrays([a, b, c], names=('a', 'b', 'c'))
+ t2 = table.drop_columns(['a', 'b'])
+ t3 = table.drop_columns('a')
+
+ exp_t2 = pa.Table.from_arrays([c], names=('c',))
+ assert exp_t2.equals(t2)
+ exp_t3 = pa.Table.from_arrays([b, c], names=('b', 'c',))
+ assert exp_t3.equals(t3)
+
+ # -- raise KeyError if column not in Table
+ with pytest.raises(KeyError, match="Column 'd' not found"):
+ table.drop_columns(['d'])
+
+
+def test_table_drop():
+ """ verify the alias of drop_columns is working"""
+ a = pa.array(range(5))
+ b = pa.array([-10, -5, 0, 5, 10])
+ c = pa.array(range(5, 10))
+
table = pa.Table.from_arrays([a, b, c], names=('a', 'b', 'c'))
t2 = table.drop(['a', 'b'])
+ t3 = table.drop('a')
- exp = pa.Table.from_arrays([c], names=('c',))
- assert exp.equals(t2)
+ exp_t2 = pa.Table.from_arrays([c], names=('c',))
+ assert exp_t2.equals(t2)
+ exp_t3 = pa.Table.from_arrays([b, c], names=('b', 'c',))
+ assert exp_t3.equals(t3)
# -- raise KeyError if column not in Table
with pytest.raises(KeyError, match="Column 'd' not found"):