This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b2eb6ac ARROW-1632: [Python] Permit categorical conversions in
Table.to_pandas on a per-column basis
b2eb6ac is described below
commit b2eb6ac90f9fd09d2bd2855b9c6121ea789ea6a0
Author: Uwe L. Korn <[email protected]>
AuthorDate: Wed Feb 28 23:34:41 2018 -0500
ARROW-1632: [Python] Permit categorical conversions in Table.to_pandas on a
per-column basis
This is failing due to the first entry of the indices being uninitialised
memory in the pandas series. On the C++ side, everything looks good. Would be
nice if someone could have a look, I'm out of ideas.
Author: Uwe L. Korn <[email protected]>
Author: Wes McKinney <[email protected]>
Closes #1620 from xhochy/ARROW-1632 and squashes the following commits:
17ca4e18 <Uwe L. Korn> Check status codes
8d1e9995 <Wes McKinney> Fix flake
27fe924e <Uwe L. Korn> Ensure underlying index array is kept alive
7ae6a46e <Uwe L. Korn> ARROW-1632: Permit categorical conversions in
Table.to_pandas on a per-column basis
---
cpp/src/arrow/python/arrow_to_pandas.cc | 59 +++++++++++++++++++++++++----
cpp/src/arrow/python/arrow_to_pandas.h | 12 ++++++
python/pyarrow/includes/common.pxd | 1 +
python/pyarrow/includes/libarrow.pxd | 10 +++--
python/pyarrow/pandas_compat.py | 9 +++--
python/pyarrow/table.pxi | 16 ++++++--
python/pyarrow/tests/test_convert_pandas.py | 18 +++++++++
7 files changed, 106 insertions(+), 19 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 125892a..aefd4d7 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -60,6 +60,7 @@ using internal::kNanosecondsInDay;
using internal::kPandasTimestampNull;
using compute::Datum;
+using compute::FunctionContext;
// ----------------------------------------------------------------------
// Utility code
@@ -986,7 +987,8 @@ class CategoricalBlock : public PandasBlock {
// Sniff the first chunk
const std::shared_ptr<Array> arr_first = data.chunk(0);
const auto& dict_arr_first = static_cast<const
DictionaryArray&>(*arr_first);
- const auto& indices_first = static_cast<const
ArrayType&>(*dict_arr_first.indices());
+ const auto indices_first =
+ std::static_pointer_cast<ArrayType>(dict_arr_first.indices());
auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) {
const T* values = arr.raw_values();
@@ -1000,8 +1002,8 @@ class CategoricalBlock : public PandasBlock {
return Status::OK();
};
- if (!needs_copy_ && data.num_chunks() == 1 && indices_first.null_count()
== 0) {
- RETURN_NOT_OK(CheckIndices(indices_first,
dict_arr_first.dictionary()->length()));
+ if (!needs_copy_ && data.num_chunks() == 1 && indices_first->null_count()
== 0) {
+ RETURN_NOT_OK(CheckIndices(*indices_first,
dict_arr_first.dictionary()->length()));
RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
} else {
if (options_.zero_copy_only) {
@@ -1011,7 +1013,7 @@ class CategoricalBlock : public PandasBlock {
<< "but only zero-copy conversions allowed.";
} else {
ss << "Needed to copy " << data.num_chunks() << " chunks with "
- << indices_first.null_count()
+ << indices_first->null_count()
<< " indices nulls, but zero_copy_only was True";
}
return Status::Invalid(ss.str());
@@ -1109,10 +1111,11 @@ class CategoricalBlock : public PandasBlock {
protected:
template <typename T>
- Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray&
indices) {
+ Status AllocateNDArrayFromIndices(int npy_type,
+ const std::shared_ptr<PrimitiveArray>&
indices) {
npy_intp block_dims[1] = {num_rows_};
- const T* in_values = GetPrimitiveValues<T>(indices);
+ const T* in_values = GetPrimitiveValues<T>(*indices);
void* data = const_cast<T*>(in_values);
PyAcquireGIL lock;
@@ -1127,6 +1130,22 @@ class CategoricalBlock : public PandasBlock {
nullptr, data,
NPY_ARRAY_CARRAY, nullptr);
RETURN_IF_PYERROR();
+ // Add a reference to the underlying Array. Otherwise the array may be
+ // deleted once we leave the block conversion.
+ auto capsule = new ArrowCapsule{{indices}};
+ PyObject* base = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow",
+ &ArrowCapsule_Destructor);
+ if (base == nullptr) {
+ delete capsule;
+ RETURN_IF_PYERROR();
+ }
+
+ if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(block_arr),
base) == -1) {
+ // Error occurred, trust that SetBaseObject set the error state
+ Py_XDECREF(base);
+ return Status::OK();
+ }
+
npy_intp placement_dims[1] = {num_columns_};
PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
RETURN_IF_PYERROR();
@@ -1771,7 +1790,33 @@ Status ConvertColumnToPandas(PandasOptions options,
const std::shared_ptr<Column
Status ConvertTableToPandas(PandasOptions options, const
std::shared_ptr<Table>& table,
int nthreads, MemoryPool* pool, PyObject** out) {
- DataFrameBlockCreator helper(options, table, pool);
+ return ConvertTableToPandas(options, std::unordered_set<std::string>(),
table, nthreads,
+ pool, out);
+}
+
+Status ConvertTableToPandas(PandasOptions options,
+ const std::unordered_set<std::string>&
categorical_columns,
+ const std::shared_ptr<Table>& table, int nthreads,
+ MemoryPool* pool, PyObject** out) {
+ std::shared_ptr<Table> current_table = table;
+ if (!categorical_columns.empty()) {
+ FunctionContext ctx;
+ for (int i = 0; i < table->num_columns(); i++) {
+ const Column& col = *table->column(i);
+ if (categorical_columns.count(col.name())) {
+ Datum out;
+ RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col.data()), &out));
+ std::shared_ptr<ChunkedArray> array = out.chunked_array();
+ auto field = std::make_shared<Field>(
+ col.name(), array->type(), col.field()->nullable(),
col.field()->metadata());
+ auto column = std::make_shared<Column>(field, array);
+ RETURN_NOT_OK(current_table->RemoveColumn(i, &current_table));
+ RETURN_NOT_OK(current_table->AddColumn(i, column, &current_table));
+ }
+ }
+ }
+
+ DataFrameBlockCreator helper(options, current_table, pool);
return helper.Convert(nthreads, out);
}
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h
b/cpp/src/arrow/python/arrow_to_pandas.h
index 1e48646..0541b0f 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -25,6 +25,7 @@
#include <memory>
#include <string>
+#include <unordered_set>
#include "arrow/util/visibility.h"
@@ -40,6 +41,7 @@ class Table;
namespace py {
struct PandasOptions {
+ /// If true, we will convert all string columns to categoricals
bool strings_to_categorical;
bool zero_copy_only;
@@ -64,6 +66,16 @@ ARROW_EXPORT
Status ConvertTableToPandas(PandasOptions options, const
std::shared_ptr<Table>& table,
int nthreads, MemoryPool* pool, PyObject** out);
+/// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+///
+/// Explicitly name columns that should be a categorical
+/// This option is only used on conversions that are applied to a table.
+ARROW_EXPORT
+Status ConvertTableToPandas(PandasOptions options,
+ const std::unordered_set<std::string>&
categorical_columns,
+ const std::shared_ptr<Table>& table, int nthreads,
+ MemoryPool* pool, PyObject** out);
+
} // namespace py
} // namespace arrow
diff --git a/python/pyarrow/includes/common.pxd
b/python/pyarrow/includes/common.pxd
index 4d799ec..5afa075 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -23,6 +23,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
+from libcpp.unordered_set cimport unordered_set
from cpython cimport PyObject
cimport cpython
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 900c3a5..b9abf2b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -882,10 +882,12 @@ cdef extern from "arrow/python/api.h" namespace
"arrow::py" nogil:
const shared_ptr[CColumn]& arr,
object py_ref, PyObject** out)
- CStatus ConvertTableToPandas(PandasOptions options,
- const shared_ptr[CTable]& table,
- int nthreads, CMemoryPool* pool,
- PyObject** out)
+ CStatus ConvertTableToPandas(
+ PandasOptions options,
+ const unordered_set[c_string]& categorical_columns,
+ const shared_ptr[CTable]& table,
+ int nthreads, CMemoryPool* pool,
+ PyObject** out)
void c_set_default_memory_pool \
" arrow::py::set_default_memory_pool"(CMemoryPool* pool)\
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 6d4bf5e..0bc47fc 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -490,7 +490,7 @@ def _make_datetimetz(tz):
def table_to_blockmanager(options, table, memory_pool, nthreads=1,
- categoricals=None):
+ categories=None):
from pyarrow.compat import DatetimeTZDtype
index_columns = []
@@ -564,7 +564,8 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
block_table.schema.get_field_index(raw_name)
)
- blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)
+ blocks = _table_to_blocks(options, block_table, nthreads, memory_pool,
+ categories)
# Construct the row index
if len(index_arrays) > 1:
@@ -651,12 +652,12 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
)
-def _table_to_blocks(options, block_table, nthreads, memory_pool):
+def _table_to_blocks(options, block_table, nthreads, memory_pool, categories):
# Part of table_to_blockmanager
# Convert an arrow table to Block from the internal pandas API
result = pa.lib.table_to_blocks(options, block_table, nthreads,
- memory_pool)
+ memory_pool, categories)
# Defined above
return [_reconstruct_block(item) for item in result]
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e14d473..178df57 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -746,17 +746,22 @@ cdef class RecordBatch:
def table_to_blocks(PandasOptions options, Table table, int nthreads,
- MemoryPool memory_pool):
+ MemoryPool memory_pool, categories):
cdef:
PyObject* result_obj
shared_ptr[CTable] c_table = table.sp_table
CMemoryPool* pool
+ unordered_set[c_string] categorical_columns
+
+ if categories is not None:
+ categorical_columns = {tobytes(cat) for cat in categories}
pool = maybe_unbox_memory_pool(memory_pool)
with nogil:
check_status(
libarrow.ConvertTableToPandas(
- options, c_table, nthreads, pool, &result_obj
+ options, categorical_columns, c_table, nthreads, pool,
+ &result_obj
)
)
@@ -1012,7 +1017,7 @@ cdef class Table:
return result
def to_pandas(self, nthreads=None, strings_to_categorical=False,
- memory_pool=None, zero_copy_only=False):
+ memory_pool=None, zero_copy_only=False, categories=None):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -1029,6 +1034,8 @@ cdef class Table:
zero_copy_only : boolean, default False
Raise an ArrowException if this function call would require copying
the underlying data
+ categories: list, default empty
+ List of columns that should be returned as pandas.Categorical
Returns
-------
@@ -1036,6 +1043,7 @@ cdef class Table:
"""
cdef:
PandasOptions options
+
options = PandasOptions(
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only)
@@ -1043,7 +1051,7 @@ cdef class Table:
if nthreads is None:
nthreads = cpu_count()
mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
- nthreads)
+ nthreads, categories)
return pd.DataFrame(mgr)
def to_pydict(self):
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 6e68dd9..986aeff 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1027,6 +1027,24 @@ class TestConvertStringLikeTypes(object):
expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result2, expected2, check_dtype=True)
+ def test_selective_categoricals(self):
+ values = ['', '', '', '', '']
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+ expected_str = pd.DataFrame({'strings': values})
+ expected_cat = pd.DataFrame({'strings': pd.Categorical(values)})
+
+ result1 = table.to_pandas(categories=['strings'])
+ tm.assert_frame_equal(result1, expected_cat, check_dtype=True)
+ result2 = table.to_pandas(categories=[])
+ tm.assert_frame_equal(result2, expected_str, check_dtype=True)
+ result3 = table.to_pandas(categories=('strings',))
+ tm.assert_frame_equal(result3, expected_cat, check_dtype=True)
+ result4 = table.to_pandas(categories=tuple())
+ tm.assert_frame_equal(result4, expected_str, check_dtype=True)
+
def test_table_str_to_categorical_without_na(self):
values = ['a', 'a', 'b', 'b', 'c']
df = pd.DataFrame({'strings': values})
--
To stop receiving notification emails like this one, please contact
[email protected].