This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b2eb6ac ARROW-1632: [Python] Permit categorical conversions in
Table.to_pandas on a per-column basis
b2eb6ac is described below
commit b2eb6ac90f9fd09d2bd2855b9c6121ea789ea6a0
Author: Uwe L. Korn <[email protected]>
AuthorDate: Wed Feb 28 23:34:41 2018 -0500
ARROW-1632: [Python] Permit categorical conversions in Table.to_pandas on a
per-column basis
This is failing due to the first entry of the indices being uninitialised
memory in the pandas series. On the C++ side, everything looks good. Would be
nice if someone could have a look, I'm out of ideas.
Author: Uwe L. Korn <[email protected]>
Author: Wes McKinney <[email protected]>
Closes #1620 from xhochy/ARROW-1632 and squashes the following commits:
17ca4e18 <Uwe L. Korn> Check status codes
8d1e9995 <Wes McKinney> Fix flake
27fe924e <Uwe L. Korn> Ensure underlying index array is kept alive
7ae6a46e <Uwe L. Korn> ARROW-1632: Permit categorical conversions in
Table.to_pandas on a per-column basis
---
cpp/src/arrow/python/arrow_to_pandas.cc | 59 +++++++++++++++++++++++++----
cpp/src/arrow/python/arrow_to_pandas.h | 12 ++++++
python/pyarrow/includes/common.pxd | 1 +
python/pyarrow/includes/libarrow.pxd | 10 +++--
python/pyarrow/pandas_compat.py | 9 +++--
python/pyarrow/table.pxi | 16 ++++++--
python/pyarrow/tests/test_convert_pandas.py | 18 +++++++++
7 files changed, 106 insertions(+), 19 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 125892a..aefd4d7 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -60,6 +60,7 @@ using internal::kNanosecondsInDay;
using internal::kPandasTimestampNull;
using compute::Datum;
+using compute::FunctionContext;
// ----------------------------------------------------------------------
// Utility code
@@ -986,7 +987,8 @@ class CategoricalBlock : public PandasBlock {
// Sniff the first chunk
const std::shared_ptr<Array> arr_first = data.chunk(0);
const auto& dict_arr_first = static_cast<const
DictionaryArray&>(*arr_first);
- const auto& indices_first = static_cast<const
ArrayType&>(*dict_arr_first.indices());
+ const auto indices_first =
+ std::static_pointer_cast<ArrayType>(dict_arr_first.indices());
auto CheckIndices = [](const ArrayType& arr, int64_t dict_length) {
const T* values = arr.raw_values();
@@ -1000,8 +1002,8 @@ class CategoricalBlock : public PandasBlock {
return Status::OK();
};
- if (!needs_copy_ && data.num_chunks() == 1 && indices_first.null_count()
== 0) {
- RETURN_NOT_OK(CheckIndices(indices_first,
dict_arr_first.dictionary()->length()));
+ if (!needs_copy_ && data.num_chunks() == 1 && indices_first->null_count()
== 0) {
+ RETURN_NOT_OK(CheckIndices(*indices_first,
dict_arr_first.dictionary()->length()));
RETURN_NOT_OK(AllocateNDArrayFromIndices<T>(npy_type, indices_first));
} else {
if (options_.zero_copy_only) {
@@ -1011,7 +1013,7 @@ class CategoricalBlock : public PandasBlock {
<< "but only zero-copy conversions allowed.";
} else {
ss << "Needed to copy " << data.num_chunks() << " chunks with "
- << indices_first.null_count()
+ << indices_first->null_count()
<< " indices nulls, but zero_copy_only was True";
}
return Status::Invalid(ss.str());
@@ -1109,10 +1111,11 @@ class CategoricalBlock : public PandasBlock {
protected:
template <typename T>
- Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray&
indices) {
+ Status AllocateNDArrayFromIndices(int npy_type,
+ const std::shared_ptr<PrimitiveArray>&
indices) {
npy_intp block_dims[1] = {num_rows_};
- const T* in_values = GetPrimitiveValues<T>(indices);
+ const T* in_values = GetPrimitiveValues<T>(*indices);
void* data = const_cast<T*>(in_values);
PyAcquireGIL lock;
@@ -1127,6 +1130,22 @@ class CategoricalBlock : public PandasBlock {
nullptr, data,
NPY_ARRAY_CARRAY, nullptr);
RETURN_IF_PYERROR();
+ // Add a reference to the underlying Array. Otherwise the array may be
+ // deleted once we leave the block conversion.
+ auto capsule = new ArrowCapsule{{indices}};
+ PyObject* base = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow",
+ &ArrowCapsule_Destructor);
+ if (base == nullptr) {
+ delete capsule;
+ RETURN_IF_PYERROR();
+ }
+
+ if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(block_arr),
base) == -1) {
+ // Error occurred, trust that SetBaseObject set the error state
+ Py_XDECREF(base);
+ return Status::OK();
+ }
+
npy_intp placement_dims[1] = {num_columns_};
PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
RETURN_IF_PYERROR();
@@ -1771,7 +1790,33 @@ Status ConvertColumnToPandas(PandasOptions options,
const std::shared_ptr<Column
Status ConvertTableToPandas(PandasOptions options, const
std::shared_ptr<Table>& table,
int nthreads, MemoryPool* pool, PyObject** out) {
- DataFrameBlockCreator helper(options, table, pool);
+ return ConvertTableToPandas(options, std::unordered_set<std::string>(),
table, nthreads,
+ pool, out);
+}
+
+Status ConvertTableToPandas(PandasOptions options,
+ const std::unordered_set<std::string>&
categorical_columns,
+ const std::shared_ptr<Table>& table, int nthreads,
+ MemoryPool* pool, PyObject** out) {
+ std::shared_ptr<Table> current_table = table;
+ if (!categorical_columns.empty()) {
+ FunctionContext ctx;
+ for (int i = 0; i < table->num_columns(); i++) {
+ const Column& col = *table->column(i);
+ if (categorical_columns.count(col.name())) {
+ Datum out;
+ RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col.data()), &out));
+ std::shared_ptr<ChunkedArray> array = out.chunked_array();
+ auto field = std::make_shared<Field>(
+ col.name(), array->type(), col.field()->nullable(),
col.field()->metadata());
+ auto column = std::make_shared<Column>(field, array);
+ RETURN_NOT_OK(current_table->RemoveColumn(i, &current_table));
+ RETURN_NOT_OK(current_table->AddColumn(i, column, &current_table));
+ }
+ }
+ }
+
+ DataFrameBlockCreator helper(options, current_table, pool);
return helper.Convert(nthreads, out);
}
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h
b/cpp/src/arrow/python/arrow_to_pandas.h
index 1e48646..0541b0f 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -25,6 +25,7 @@
#include <memory>
#include <string>
+#include <unordered_set>
#include "arrow/util/visibility.h"
@@ -40,6 +41,7 @@ class Table;
namespace py {
struct PandasOptions {
+ /// If true, we will convert all string columns to categoricals
bool strings_to_categorical;
bool zero_copy_only;
@@ -64,6 +66,16 @@ ARROW_EXPORT
Status ConvertTableToPandas(PandasOptions options, const
std::shared_ptr<Table>& table,
int nthreads, MemoryPool* pool, PyObject** out);
+/// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+///
+/// Explicitly name columns that should be a categorical
+/// This option is only used on conversions that are applied to a table.
+ARROW_EXPORT
+Status ConvertTableToPandas(PandasOptions options,
+ const std::unordered_set<std::string>&
categorical_columns,
+ const std::shared_ptr<Table>& table, int nthreads,
+ MemoryPool* pool, PyObject** out);
+
} // namespace py
} // namespace arrow
diff --git a/python/pyarrow/includes/common.pxd
b/python/pyarrow/includes/common.pxd
index 4d799ec..5afa075 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -23,6 +23,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
+from libcpp.unordered_set cimport unordered_set
from cpython cimport PyObject
cimport cpython
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 900c3a5..b9abf2b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -882,10 +882,12 @@ cdef extern from "arrow/python/api.h" namespace
"arrow::py" nogil:
const shared_ptr[CColumn]& arr,
object py_ref, PyObject** out)
- CStatus ConvertTableToPandas(PandasOptions options,
- const shared_ptr[CTable]& table,
- int nthreads, CMemoryPool* pool,
- PyObject** out)
+ CStatus ConvertTableToPandas(
+ PandasOptions options,
+ const unordered_set[c_string]& categorical_columns,
+ const shared_ptr[CTable]& table,
+ int nthreads, CMemoryPool* pool,
+ PyObject** out)
void c_set_default_memory_pool \
" arrow::py::set_default_memory_pool"(CMemoryPool* pool)\
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 6d4bf5e..0bc47fc 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -490,7 +490,7 @@ def _make_datetimetz(tz):
def table_to_blockmanager(options, table, memory_pool, nthreads=1,
- categoricals=None):
+ categories=None):
from pyarrow.compat import DatetimeTZDtype
index_columns = []
@@ -564,7 +564,8 @@ def table_to_blockmanager(options, table, memory_pool,
nthreads=1,
block_table.schema.get_field_index(raw_name)
)
- blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)
+ blocks = _table_to_blocks(options, block_table, nthreads, memory_pool,
+ categories)
# Construct the row index
if len(index_arrays) > 1:
@@ -651,12 +652,12 @@ def _reconstruct_columns_from_metadata(columns,
column_indexes):
)
-def _table_to_blocks(options, block_table, nthreads, memory_pool):
+def _table_to_blocks(options, block_table, nthreads, memory_pool, categories):
# Part of table_to_blockmanager
# Convert an arrow table to Block from the internal pandas API
result = pa.lib.table_to_blocks(options, block_table, nthreads,
- memory_pool)
+ memory_pool, categories)
# Defined above
return [_reconstruct_block(item) for item in result]
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e14d473..178df57 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -746,17 +746,22 @@ cdef class RecordBatch:
def table_to_blocks(PandasOptions options, Table table, int nthreads,
- MemoryPool memory_pool):
+ MemoryPool memory_pool, categories):
cdef:
PyObject* result_obj
shared_ptr[CTable] c_table = table.sp_table
CMemoryPool* pool
+ unordered_set[c_string] categorical_columns
+
+ if categories is not None:
+ categorical_columns = {tobytes(cat) for cat in categories}
pool = maybe_unbox_memory_pool(memory_pool)
with nogil:
check_status(
libarrow.ConvertTableToPandas(
- options, c_table, nthreads, pool, &result_obj
+ options, categorical_columns, c_table, nthreads, pool,
+ &result_obj
)
)
@@ -1012,7 +1017,7 @@ cdef class Table:
return result
def to_pandas(self, nthreads=None, strings_to_categorical=False,
- memory_pool=None, zero_copy_only=False):
+ memory_pool=None, zero_copy_only=False, categories=None):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -1029,6 +1034,8 @@ cdef class Table:
zero_copy_only : boolean, default False
Raise an ArrowException if this function call would require copying
the underlying data
+ categories: list, default empty
+ List of columns that should be returned as pandas.Categorical
Returns
-------
@@ -1036,6 +1043,7 @@ cdef class Table:
"""
cdef:
PandasOptions options
+
options = PandasOptions(
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only)
@@ -1043,7 +1051,7 @@ cdef class Table:
if nthreads is None:
nthreads = cpu_count()
mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
- nthreads)
+ nthreads, categories)
return pd.DataFrame(mgr)
def to_pydict(self):
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 6e68dd9..986aeff 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1027,6 +1027,24 @@ class TestConvertStringLikeTypes(object):
expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result2, expected2, check_dtype=True)
+ def test_selective_categoricals(self):
+ values = ['', '', '', '', '']
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+ expected_str = pd.DataFrame({'strings': values})
+ expected_cat = pd.DataFrame({'strings': pd.Categorical(values)})
+
+ result1 = table.to_pandas(categories=['strings'])
+ tm.assert_frame_equal(result1, expected_cat, check_dtype=True)
+ result2 = table.to_pandas(categories=[])
+ tm.assert_frame_equal(result2, expected_str, check_dtype=True)
+ result3 = table.to_pandas(categories=('strings',))
+ tm.assert_frame_equal(result3, expected_cat, check_dtype=True)
+ result4 = table.to_pandas(categories=tuple())
+ tm.assert_frame_equal(result4, expected_str, check_dtype=True)
+
def test_table_str_to_categorical_without_na(self):
values = ['a', 'a', 'b', 'b', 'c']
df = pd.DataFrame({'strings': values})
--
To stop receiving notification emails like this one, please contact
[email protected].