[arrow] branch main updated: GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#34730)

wjones127 Fri, 21 Apr 2023 08:18:48 -0700

This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 1a697abd08 GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#34730)
1a697abd08 is described below

commit 1a697abd08d6db6ac1496e548dcdb86a9a82c152
Author: Michael Lui <[email protected]>
AuthorDate: Fri Apr 21 11:18:32 2023 -0400

    GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#34730)
    
    ### Rationale for this change
    
    Explained in issue #34729
    
    ### What changes are included in this PR?
    
    - Add support for list of maps when converting Arrow to Pandas. There doesn't seem to be a strong reason to omit this. Previously this failed with a hard "unsupported" error because of a boolean type check (`ListTypeSupported`).
    - Refactor the Arrow Map -> Pandas conversion to support two paths: (1) list of tuples, or (2) pydicts
    - Add an option (`maps_as_pydicts`) in PandasOptions to enable path (2) above
    - Bugfix in nested pydicts -> Arrow maps.
    - Unit tests
    
    ### Are these changes tested?
    
    Unit tests are added in `test_pandas.py`
    
    ### Are there any user-facing changes?
    
    - An additional option, `maps_as_pydicts`, in PandasOptions (exposed as a `to_pandas` keyword argument)
    - Converting a list of maps to Pandas is now enabled; it was previously unsupported
    * Closes: #34729
    
    Authored-by: Mike Lui <[email protected]>
    Signed-off-by: Will Jones <[email protected]>
---
 python/pyarrow/array.pxi                           |  31 ++-
 python/pyarrow/includes/libarrow_python.pxd        |   8 +
 python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 232 ++++++++++++++----
 python/pyarrow/src/arrow/python/arrow_to_pandas.h  |  17 ++
 python/pyarrow/src/arrow/python/python_to_arrow.cc |   2 +-
 python/pyarrow/tests/test_pandas.py                | 267 +++++++++++++++++++++
 6 files changed, 502 insertions(+), 55 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 11f10dddef..542a3c19f3 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -699,6 +699,7 @@ cdef class _PandasConvertible(_Weakrefable):
             bint safe=True,
             bint split_blocks=False,
             bint self_destruct=False,
+            str maps_as_pydicts=None,
             types_mapper=None
     ):
         """
@@ -753,6 +754,19 @@ cdef class _PandasConvertible(_Weakrefable):
             Note that you may not see always memory usage improvements. For
             example, if multiple columns share an underlying allocation,
             memory can't be freed until all columns are converted.
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python 
dicts.
+            This can change the ordering of (key, value) pairs, and will
+            deduplicate multiple keys, resulting in a possible loss of data.
+
+            If 'lossy', this key deduplication results in a warning printed
+            when detected. If 'strict', this instead results in an exception
+            being raised when detected.
         types_mapper : function, default None
             A function mapping a pyarrow DataType to a pandas ExtensionDtype.
             This can be used to override the default pandas type for conversion
@@ -832,7 +846,8 @@ cdef class _PandasConvertible(_Weakrefable):
             deduplicate_objects=deduplicate_objects,
             safe=safe,
             split_blocks=split_blocks,
-            self_destruct=self_destruct
+            self_destruct=self_destruct,
+            maps_as_pydicts=maps_as_pydicts
         )
         return self._to_pandas(options, categories=categories,
                                ignore_metadata=ignore_metadata,
@@ -853,6 +868,20 @@ cdef PandasOptions _convert_pandas_options(dict options):
     result.split_blocks = options['split_blocks']
     result.self_destruct = options['self_destruct']
     result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
+
+    maps_as_pydicts = options['maps_as_pydicts']
+    if maps_as_pydicts is None:
+        result.maps_as_pydicts = MapConversionType.DEFAULT
+    elif maps_as_pydicts == "lossy":
+        result.maps_as_pydicts = MapConversionType.LOSSY
+    elif maps_as_pydicts == "strict":
+        result.maps_as_pydicts = MapConversionType.STRICT_
+    else:
+        raise ValueError(
+            "Invalid value for 'maps_as_pydicts': "
+            + "valid values are 'lossy', 'strict' or `None` (default). "
+            + f"Received '{maps_as_pydicts}'."
+        )
     return result
 
 
diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
index f29c336bd0..2052600c9f 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -45,6 +45,13 @@ cdef extern from "arrow/python/api.h" namespace 
"arrow::py::internal":
         const CMonthDayNanoIntervalScalar& scalar)
 
 
+cdef extern from "arrow/python/arrow_to_pandas.h" namespace 
"arrow::py::MapConversionType":
+    cdef enum MapConversionType "arrow::py::MapConversionType":
+        DEFAULT,
+        LOSSY,
+        STRICT_
+
+
 cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
 
@@ -186,6 +193,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" 
nogil:
         c_bool safe_cast
         c_bool split_blocks
         c_bool self_destruct
+        MapConversionType maps_as_pydicts
         c_bool decode_dictionaries
         unordered_set[c_string] categorical_columns
         unordered_set[c_string] extension_columns
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index eac657fb8f..32a97ac756 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -176,6 +176,7 @@ static inline bool ListTypeSupported(const DataType& type) {
     case Type::DATE32:
     case Type::DATE64:
     case Type::STRUCT:
+    case Type::MAP:
     case Type::TIME32:
     case Type::TIME64:
     case Type::TIMESTAMP:
@@ -807,52 +808,20 @@ Status ConvertListsLike(PandasOptions options, const 
ChunkedArray& data,
   return Status::OK();
 }
 
-Status ConvertMap(PandasOptions options, const ChunkedArray& data,
-                  PyObject** out_values) {
-  // Get columns of underlying key/item arrays
-  std::vector<std::shared_ptr<Array>> key_arrays;
-  std::vector<std::shared_ptr<Array>> item_arrays;
-  for (int c = 0; c < data.num_chunks(); ++c) {
-    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
-    key_arrays.emplace_back(map_arr.keys());
-    item_arrays.emplace_back(map_arr.items());
-  }
-
-  const auto& map_type = checked_cast<const MapType&>(*data.type());
-  auto key_type = map_type.key_type();
-  auto item_type = map_type.item_type();
-
-  // ARROW-6899: Convert dictionary-encoded children to dense instead of
-  // failing below. A more efficient conversion than this could be done later
-  if (key_type->id() == Type::DICTIONARY) {
-    auto dense_type = checked_cast<const 
DictionaryType&>(*key_type).value_type();
-    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
-    key_type = dense_type;
-  }
-  if (item_type->id() == Type::DICTIONARY) {
-    auto dense_type = checked_cast<const 
DictionaryType&>(*item_type).value_type();
-    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
-    item_type = dense_type;
-  }
+template<typename F1, typename F2, typename F3>
+Status ConvertMapHelper(
+    F1 resetRow,
+    F2 addPairToRow,
+    F3 stealRow,
+    const ChunkedArray& data,
+    PyArrayObject* py_keys,
+    PyArrayObject* py_items,
+    // needed for null checks in items
+    const std::vector<std::shared_ptr<Array>> item_arrays,
+    PyObject** out_values) {
 
-  // See notes in MakeInnerOptions.
-  options = MakeInnerOptions(std::move(options));
-  // Don't blindly convert because timestamps in lists are handled differently.
-  options.timestamp_as_object = true;
-
-  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
-  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
-  OwnedRef list_item;
   OwnedRef key_value;
   OwnedRef item_value;
-  OwnedRefNoGIL owned_numpy_keys;
-  RETURN_NOT_OK(
-      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, 
owned_numpy_keys.ref()));
-  OwnedRefNoGIL owned_numpy_items;
-  RETURN_NOT_OK(
-      ConvertChunkedArrayToPandas(options, flat_items, nullptr, 
owned_numpy_items.ref()));
-  PyArrayObject* py_keys = 
reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
-  PyArrayObject* py_items = 
reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
 
   int64_t chunk_offset = 0;
   for (int c = 0; c < data.num_chunks(); ++c) {
@@ -866,14 +835,13 @@ Status ConvertMap(PandasOptions options, const 
ChunkedArray& data,
         *out_values = Py_None;
       } else {
         int64_t entry_offset = arr.value_offset(i);
-        int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
+        int64_t num_pairs = arr.value_offset(i + 1) - entry_offset;
 
-        // Build the new list object for the row of maps
-        list_item.reset(PyList_New(num_maps));
-        RETURN_IF_PYERROR();
+        // Build the new list object for the row of Python pairs
+        RETURN_NOT_OK(resetRow(num_pairs));
 
         // Add each key/item pair in the row
-        for (int64_t j = 0; j < num_maps; ++j) {
+        for (int64_t j = 0; j < num_pairs; ++j) {
           // Get key value, key is non-nullable for a valid row
           auto ptr_key = reinterpret_cast<const char*>(
               PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
@@ -892,14 +860,12 @@ Status ConvertMap(PandasOptions options, const 
ChunkedArray& data,
             RETURN_IF_PYERROR();
           }
 
-          // Add the key/item pair to the list for the row
-          PyList_SET_ITEM(list_item.obj(), j,
-                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
-          RETURN_IF_PYERROR();
+          // Add the key/item pair to the row
+          RETURN_NOT_OK(addPairToRow(j, key_value, item_value));
         }
 
         // Pass ownership to the resulting array
-        *out_values = list_item.detach();
+        *out_values = stealRow();
       }
       ++out_values;
     }
@@ -911,6 +877,166 @@ Status ConvertMap(PandasOptions options, const 
ChunkedArray& data,
   return Status::OK();
 }
 
+// A more helpful error message around TypeErrors that may stem from 
unhashable keys
+Status CheckMapAsPydictsTypeError() {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+    // Modify the error string directly, so it is re-raised
+    // with our additional info.
+    //
+    // There are not many interesting things happening when this
+    // is hit. This is intended to only be called directly after
+    // PyDict_SetItem, where a finite set of errors could occur.
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
+    message += ". If keys are not hashable, then you must use the option "
+        "[maps_as_pydicts=None (default)]";
+
+    // resets the error
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+  }
+  return ConvertPyError();
+}
+
+Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
+                             Py_ssize_t total_dict_len, Py_ssize_t 
total_raw_len) {
+  if (total_dict_len < total_raw_len) {
+    const char* message =
+        "[maps_as_pydicts] "
+        "After conversion of Arrow maps to pydicts, "
+        "detected data loss due to duplicate keys. "
+        "Original input length is [%lld], total converted pydict length is 
[%lld].";
+    std::array<char, 256> buf;
+    std::snprintf(buf.data(), buf.size(), message, total_raw_len, 
total_dict_len);
+
+    if (error_on_duplicate_keys) {
+      return Status::UnknownError(buf.data());
+    } else {
+      ARROW_LOG(WARNING) << buf.data();
+    }
+  }
+  return Status::OK();
+}
+
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const 
DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const 
DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, 
owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, 
owned_numpy_items.ref()));
+  PyArrayObject* py_keys = 
reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = 
reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
+    // The default behavior to express an Arrow MAP as a list of [(key, 
value), ...] pairs
+    OwnedRef list_item;
+    return ConvertMapHelper(
+        [&list_item](int64_t num_pairs) {
+          list_item.reset(PyList_New(num_pairs));
+          return CheckPyError();
+        },
+        [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          PyList_SET_ITEM(list_item.obj(), idx,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          return CheckPyError();
+        },
+        [&list_item]{ return list_item.detach(); },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+  } else {
+    // Use a native pydict
+    OwnedRef dict_item;
+    Py_ssize_t total_dict_len{0};
+    Py_ssize_t total_raw_len{0};
+
+    bool error_on_duplicate_keys;
+    if (options.maps_as_pydicts == MapConversionType::LOSSY) {
+      error_on_duplicate_keys = false;
+    } else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
+      error_on_duplicate_keys = true;
+    } else {
+      auto val = 
std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
+      return Status::UnknownError(
+          "Received unknown option for maps_as_pydicts: " + std::to_string(val)
+      );
+    }
+
+    auto status = ConvertMapHelper(
+        [&dict_item, &total_raw_len](int64_t num_pairs) {
+          total_raw_len += num_pairs;
+          dict_item.reset(PyDict_New());
+          return CheckPyError();
+        },
+        [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, 
OwnedRef& item_value) {
+          auto setitem_result =
+              PyDict_SetItem(dict_item.obj(), key_value.obj(), 
item_value.obj());
+          ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
+          // returns -1 if there are internal errors around hashing/resizing
+          return setitem_result == 0 ?
+            Status::OK() :
+            Status::UnknownError("[maps_as_pydicts] "
+                "Unexpected failure inserting Arrow (key, value) pair into 
Python dict"
+            );
+        },
+        [&dict_item, &total_dict_len]{
+          total_dict_len += PyDict_Size(dict_item.obj());
+          return dict_item.detach();
+        },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+
+    ARROW_RETURN_NOT_OK(status);
+    // If there were no errors generating the pydicts,
+    // then check if we detected any data loss from duplicate keys.
+    return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, 
total_raw_len);
+  }
+}
+
 template <typename InType, typename OutType>
 inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
                                    OutType* out_values) {
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index 6570364b8d..ac422cc99c 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,12 @@ class Table;
 
 namespace py {
 
+enum class MapConversionType {
+  DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in 
Pandas
+  LOSSY, // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_, // raise a Python exception when lossiness is encountered due to 
duplicate keys
+};
+
 struct PandasOptions {
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
@@ -90,6 +96,17 @@ struct PandasOptions {
   /// conversions
   bool self_destruct = false;
 
+  /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...]
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
   // Used internally for nested arrays.
   bool decode_dictionaries = false;
 
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 2bb6a6f459..7c0ffa70c7 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -762,7 +762,7 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
       RETURN_NOT_OK(AppendSequence(value));
     } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
       RETURN_NOT_OK(AppendIterable(value));
-    } else if (PyDict_Check(value) && this->options_.type->id() == Type::MAP) {
+    } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {
       // Branch to support Python Dict with `map` DataType.
       auto items = PyDict_Items(value);
       OwnedRef item_ref(items);
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index a49552aa0a..803a41c28a 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2135,6 +2135,66 @@ class TestConvertListTypes:
                                     DeprecationWarning)
             tm.assert_series_equal(series, expected)
 
+    def test_to_list_of_maps_pandas(self):
+        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+                (Version(pd.__version__) < Version("2.0.0"))):
+            # TODO: regression in pandas with numpy 1.25dev
+            # https://github.com/pandas-dev/pandas/issues/50360
+            pytest.skip("Regression in pandas with numpy 1.25")
+        data = [
+            [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+            [[('baz', []), ('qux', None), ('quux', [None, 'e'])], [('quz', 
['f', 'g'])]]
+        ]
+        arr = pa.array(data, pa.list_(pa.map_(pa.utf8(), pa.list_(pa.utf8()))))
+        series = arr.to_pandas()
+        expected = pd.Series(data)
+
+        # pandas.testing generates a
+        # DeprecationWarning: elementwise comparison failed
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(series, expected)
+
+    def test_to_list_of_maps_pandas_sliced(self):
+        """
+        A slightly more rigorous test for chunk/slice combinations
+        """
+
+        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+                (Version(pd.__version__) < Version("2.0.0"))):
+            # TODO: regression in pandas with numpy 1.25dev
+            # https://github.com/pandas-dev/pandas/issues/50360
+            pytest.skip("Regression in pandas with numpy 1.25")
+
+        keys = pa.array(['ignore', 'foo', 'bar', 'baz',
+                         'qux', 'quux', 'ignore']).slice(1, 5)
+        items = pa.array(
+            [['ignore'], ['ignore'], ['a', 'b'], ['c', 'd'], [], None, [None, 
'e']],
+            pa.list_(pa.string()),
+        ).slice(2, 5)
+        map = pa.MapArray.from_arrays([0, 2, 4], keys, items)
+        arr = pa.ListArray.from_arrays([0, 1, 2], map)
+
+        series = arr.to_pandas()
+        expected = pd.Series([
+            [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+            [[('baz', []), ('qux', None)]],
+        ])
+
+        series_sliced = arr.slice(1, 2).to_pandas()
+        expected_sliced = pd.Series([
+            [[('baz', []), ('qux', None)]],
+        ])
+
+        # pandas.testing generates a
+        # DeprecationWarning: elementwise comparison failed
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(series, expected)
+            tm.assert_series_equal(series_sliced, expected_sliced)
+
     @pytest.mark.parametrize('t,data,expected', [
         (
             pa.int64,
@@ -4549,3 +4609,210 @@ def test_does_not_mutate_timedelta_nested():
     df = table.to_pandas()
 
     assert df["timedelta_2"][0].to_pytimedelta() == timedelta_2[0]
+
+
+def test_roundtrip_nested_map_table_with_pydicts():
+    schema = pa.schema([
+        pa.field(
+            "a",
+            pa.list_(
+                pa.map_(pa.int8(), pa.struct([pa.field("b", pa.binary())]))
+            )
+        )
+    ])
+    table = pa.table([[
+        [[(1, None)]],
+        None,
+        [
+            [(2, {"b": b"abc"})],
+            [(3, {"b": None}), (4, {"b": b"def"})],
+        ]
+    ]],
+        schema=schema,
+    )
+
+    expected_default_df = pd.DataFrame(
+        {"a": [[[(1, None)]], None, [[(2, {"b": b"abc"})],
+                                     [(3, {"b": None}), (4, {"b": b"def"})]]]}
+    )
+    expected_as_pydicts_df = pd.DataFrame(
+        {"a": [
+            [{1: None}],
+            None,
+            [{2: {"b": b"abc"}}, {3: {"b": None}, 4: {"b": b"def"}}],
+        ]}
+    )
+
+    default_df = table.to_pandas()
+    as_pydicts_df = table.to_pandas(maps_as_pydicts="strict")
+
+    tm.assert_frame_equal(default_df, expected_default_df)
+    tm.assert_frame_equal(as_pydicts_df, expected_as_pydicts_df)
+
+    table_default_roundtrip = pa.Table.from_pandas(default_df, schema=schema)
+    assert table.equals(table_default_roundtrip)
+
+    table_as_pydicts_roundtrip = pa.Table.from_pandas(as_pydicts_df, 
schema=schema)
+    assert table.equals(table_as_pydicts_roundtrip)
+
+
+def test_roundtrip_nested_map_array_with_pydicts_sliced():
+    """
+    Slightly more robust test with chunking and slicing
+    """
+    keys_1 = pa.array(['foo', 'bar'])
+    keys_2 = pa.array(['baz', 'qux', 'quux', 'quz'])
+    keys_3 = pa.array([], pa.string())
+
+    items_1 = pa.array(
+        [['a', 'b'], ['c', 'd']],
+        pa.list_(pa.string()),
+    )
+    items_2 = pa.array(
+        [[], None, [None, 'e'], ['f', 'g']],
+        pa.list_(pa.string()),
+    )
+    items_3 = pa.array(
+        [],
+        pa.list_(pa.string()),
+    )
+
+    map_chunk_1 = pa.MapArray.from_arrays([0, 2], keys_1, items_1)
+    map_chunk_2 = pa.MapArray.from_arrays([0, 3, 4], keys_2, items_2)
+    map_chunk_3 = pa.MapArray.from_arrays([0, 0], keys_3, items_3)
+    chunked_array = pa.chunked_array([
+        pa.ListArray.from_arrays([0, 1], map_chunk_1).slice(0),
+        pa.ListArray.from_arrays([0, 1], map_chunk_2.slice(1)).slice(0),
+        pa.ListArray.from_arrays([0, 0], map_chunk_3).slice(0),
+    ])
+
+    series_default = chunked_array.to_pandas()
+    expected_series_default = pd.Series([
+        [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+        [[('quz', ['f', 'g'])]],
+        [],
+    ])
+
+    series_pydicts = chunked_array.to_pandas(maps_as_pydicts="strict")
+    expected_series_pydicts = pd.Series([
+        [{'foo': ['a', 'b'], 'bar': ['c', 'd']}],
+        [{'quz': ['f', 'g']}],
+        [],
+    ])
+
+    sliced = chunked_array.slice(1, 3)
+    series_default_sliced = sliced.to_pandas()
+    expected_series_default_sliced = pd.Series([
+        [[('quz', ['f', 'g'])]],
+        [],
+    ])
+
+    series_pydicts_sliced = sliced.to_pandas(maps_as_pydicts="strict")
+    expected_series_pydicts_sliced = pd.Series([
+        [{'quz': ['f', 'g']}],
+        [],
+    ])
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                DeprecationWarning)
+        tm.assert_series_equal(series_default, expected_series_default)
+        tm.assert_series_equal(series_pydicts, expected_series_pydicts)
+        tm.assert_series_equal(series_default_sliced, 
expected_series_default_sliced)
+        tm.assert_series_equal(series_pydicts_sliced, 
expected_series_pydicts_sliced)
+
+    ty = pa.list_(pa.map_(pa.string(), pa.list_(pa.string())))
+
+    def assert_roundtrip(series: pd.Series, data) -> None:
+        array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series, 
type=ty))
+        assert data.equals(array_roundtrip)
+
+    assert_roundtrip(series_default, chunked_array)
+    assert_roundtrip(series_pydicts, chunked_array)
+    assert_roundtrip(series_default_sliced, sliced)
+    assert_roundtrip(series_pydicts_sliced, sliced)
+
+
+def test_roundtrip_map_array_with_pydicts_duplicate_keys():
+    keys = pa.array(['foo', 'bar', 'foo'])
+    items = pa.array(
+        [['a', 'b'], ['c', 'd'], ['1', '2']],
+        pa.list_(pa.string()),
+    )
+    offsets = [0, 3]
+    maps = pa.MapArray.from_arrays(offsets, keys, items)
+    ty = pa.map_(pa.string(), pa.list_(pa.string()))
+
+    # ------------------------
+    # With maps as pydicts
+    with pytest.raises(pa.lib.ArrowException):
+        # raises because of duplicate keys
+        maps.to_pandas(maps_as_pydicts="strict")
+    series_pydicts = maps.to_pandas(maps_as_pydicts="lossy")
+    # some data loss occurs for duplicate keys
+    expected_series_pydicts = pd.Series([
+        {'foo': ['1', '2'], 'bar': ['c', 'd']},
+    ])
+    # roundtrip is not possible because of data loss
+    assert not maps.equals(pa.Array.from_pandas(series_pydicts, type=ty))
+
+    # ------------------------
+    # With default assoc list of tuples
+    series_default = maps.to_pandas()
+    expected_series_default = pd.Series([
+        [('foo', ['a', 'b']), ('bar', ['c', 'd']), ('foo', ['1', '2'])],
+    ])
+    assert maps.equals(pa.Array.from_pandas(series_default, type=ty))
+
+    # custom comparison for compatibility w/ Pandas 1.0.0
+    # would otherwise run:
+    #   tm.assert_series_equal(series_pydicts, expected_series_pydicts)
+    assert len(series_pydicts) == len(expected_series_pydicts)
+    for row1, row2 in zip(series_pydicts, expected_series_pydicts):
+        assert len(row1) == len(row2)
+        for tup1, tup2 in zip(row1.items(), row2.items()):
+            assert tup1[0] == tup2[0]
+            assert np.array_equal(tup1[1], tup2[1])
+
+    # custom comparison for compatibility w/ Pandas 1.0.0
+    # would otherwise run:
+    #   tm.assert_series_equal(series_default, expected_series_default)
+    assert len(series_default) == len(expected_series_default)
+    for row1, row2 in zip(series_default, expected_series_default):
+        assert len(row1) == len(row2)
+        for tup1, tup2 in zip(row1, row2):
+            assert tup1[0] == tup2[0]
+            assert np.array_equal(tup1[1], tup2[1])
+
+
+def test_unhashable_map_keys_with_pydicts():
+    keys = pa.array(
+        [['a', 'b'], ['c', 'd'], [], ['e'], [None, 'f'], ['g', 'h']],
+        pa.list_(pa.string()),
+    )
+    items = pa.array(['foo', 'bar', 'baz', 'qux', 'quux', 'quz'])
+    offsets = [0, 2, 6]
+    maps = pa.MapArray.from_arrays(offsets, keys, items)
+
+    # ------------------------
+    # With maps as pydicts
+    with pytest.raises(TypeError):
+        maps.to_pandas(maps_as_pydicts="lossy")
+
+    # ------------------------
+    # With default assoc list of tuples
+    series = maps.to_pandas()
+    expected_series_default = pd.Series([
+        [(['a', 'b'], 'foo'), (['c', 'd'], 'bar')],
+        [([], 'baz'), (['e'], 'qux'), ([None, 'f'], 'quux'), (['g', 'h'], 
'quz')],
+    ])
+
+    # custom comparison for compatibility w/ Pandas 1.0.0
+    # would otherwise run:
+    #   tm.assert_series_equal(series, expected_series_default)
+    assert len(series) == len(expected_series_default)
+    for row1, row2 in zip(series, expected_series_default):
+        assert len(row1) == len(row2)
+        for tup1, tup2 in zip(row1, row2):
+            assert np.array_equal(tup1[0], tup2[0])
+            assert tup1[1] == tup2[1]

[arrow] branch main updated: GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#34730)

Reply via email to