This is an automated email from the ASF dual-hosted git repository.
wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1a697abd08 GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict
support (#34730)
1a697abd08 is described below
commit 1a697abd08d6db6ac1496e548dcdb86a9a82c152
Author: Michael Lui <[email protected]>
AuthorDate: Fri Apr 21 11:18:32 2023 -0400
GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#34730)
### Rationale for this change
Explained in issue #34729
### What changes are included in this PR?
- Add support for lists of maps when converting Arrow to Pandas. There
doesn't seem to be a strong reason to omit this. Previously this raised a hard
"unsupported" error because of a boolean type check (`ListTypeSupported`).
- Refactor Arrow Map -> Pandas to support two paths: (1) list of tuples, or
(2) pydicts
- Add another option in PandasOptions to enable (2), above
- Bugfix in nested pydicts -> Arrow maps.
- Unit tests
### Are these changes tested?
Unit tests are added in `test_pandas.py`
### Are there any user-facing changes?
- An additional option flag in PandasOptions
- Enable list of maps to Pandas, which was previously disabled
* Closes: #34729
Authored-by: Mike Lui <[email protected]>
Signed-off-by: Will Jones <[email protected]>
---
python/pyarrow/array.pxi | 31 ++-
python/pyarrow/includes/libarrow_python.pxd | 8 +
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 232 ++++++++++++++----
python/pyarrow/src/arrow/python/arrow_to_pandas.h | 17 ++
python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 +-
python/pyarrow/tests/test_pandas.py | 267 +++++++++++++++++++++
6 files changed, 502 insertions(+), 55 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 11f10dddef..542a3c19f3 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -699,6 +699,7 @@ cdef class _PandasConvertible(_Weakrefable):
bint safe=True,
bint split_blocks=False,
bint self_destruct=False,
+ str maps_as_pydicts=None,
types_mapper=None
):
"""
@@ -753,6 +754,19 @@ cdef class _PandasConvertible(_Weakrefable):
Note that you may not see always memory usage improvements. For
example, if multiple columns share an underlying allocation,
memory can't be freed until all columns are converted.
+ maps_as_pydicts : str, optional, default `None`
+ Valid values are `None`, 'lossy', or 'strict'.
+ The default behavior (`None`), is to convert Arrow Map arrays to
+ Python association lists (list-of-tuples) in the same order as the
+ Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+ If 'lossy' or 'strict', convert Arrow Map arrays to native Python
dicts.
+ This can change the ordering of (key, value) pairs, and will
+ deduplicate multiple keys, resulting in a possible loss of data.
+
+ If 'lossy', this key deduplication results in a warning printed
+ when detected. If 'strict', this instead results in an exception
+ being raised when detected.
types_mapper : function, default None
A function mapping a pyarrow DataType to a pandas ExtensionDtype.
This can be used to override the default pandas type for conversion
@@ -832,7 +846,8 @@ cdef class _PandasConvertible(_Weakrefable):
deduplicate_objects=deduplicate_objects,
safe=safe,
split_blocks=split_blocks,
- self_destruct=self_destruct
+ self_destruct=self_destruct,
+ maps_as_pydicts=maps_as_pydicts
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
@@ -853,6 +868,20 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
+
+ maps_as_pydicts = options['maps_as_pydicts']
+ if maps_as_pydicts is None:
+ result.maps_as_pydicts = MapConversionType.DEFAULT
+ elif maps_as_pydicts == "lossy":
+ result.maps_as_pydicts = MapConversionType.LOSSY
+ elif maps_as_pydicts == "strict":
+ result.maps_as_pydicts = MapConversionType.STRICT_
+ else:
+ raise ValueError(
+ "Invalid value for 'maps_as_pydicts': "
+ + "valid values are 'lossy', 'strict' or `None` (default). "
+ + f"Received '{maps_as_pydicts}'."
+ )
return result
diff --git a/python/pyarrow/includes/libarrow_python.pxd
b/python/pyarrow/includes/libarrow_python.pxd
index f29c336bd0..2052600c9f 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -45,6 +45,13 @@ cdef extern from "arrow/python/api.h" namespace
"arrow::py::internal":
const CMonthDayNanoIntervalScalar& scalar)
+cdef extern from "arrow/python/arrow_to_pandas.h" namespace
"arrow::py::MapConversionType":
+ cdef enum MapConversionType "arrow::py::MapConversionType":
+ DEFAULT,
+ LOSSY,
+ STRICT_
+
+
cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CDataType] GetPrimitiveType(Type type)
@@ -186,6 +193,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py"
nogil:
c_bool safe_cast
c_bool split_blocks
c_bool self_destruct
+ MapConversionType maps_as_pydicts
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index eac657fb8f..32a97ac756 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -176,6 +176,7 @@ static inline bool ListTypeSupported(const DataType& type) {
case Type::DATE32:
case Type::DATE64:
case Type::STRUCT:
+ case Type::MAP:
case Type::TIME32:
case Type::TIME64:
case Type::TIMESTAMP:
@@ -807,52 +808,20 @@ Status ConvertListsLike(PandasOptions options, const
ChunkedArray& data,
return Status::OK();
}
-Status ConvertMap(PandasOptions options, const ChunkedArray& data,
- PyObject** out_values) {
- // Get columns of underlying key/item arrays
- std::vector<std::shared_ptr<Array>> key_arrays;
- std::vector<std::shared_ptr<Array>> item_arrays;
- for (int c = 0; c < data.num_chunks(); ++c) {
- const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
- key_arrays.emplace_back(map_arr.keys());
- item_arrays.emplace_back(map_arr.items());
- }
-
- const auto& map_type = checked_cast<const MapType&>(*data.type());
- auto key_type = map_type.key_type();
- auto item_type = map_type.item_type();
-
- // ARROW-6899: Convert dictionary-encoded children to dense instead of
- // failing below. A more efficient conversion than this could be done later
- if (key_type->id() == Type::DICTIONARY) {
- auto dense_type = checked_cast<const
DictionaryType&>(*key_type).value_type();
- RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
- key_type = dense_type;
- }
- if (item_type->id() == Type::DICTIONARY) {
- auto dense_type = checked_cast<const
DictionaryType&>(*item_type).value_type();
- RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
- item_type = dense_type;
- }
+template<typename F1, typename F2, typename F3>
+Status ConvertMapHelper(
+ F1 resetRow,
+ F2 addPairToRow,
+ F3 stealRow,
+ const ChunkedArray& data,
+ PyArrayObject* py_keys,
+ PyArrayObject* py_items,
+ // needed for null checks in items
+ const std::vector<std::shared_ptr<Array>> item_arrays,
+ PyObject** out_values) {
- // See notes in MakeInnerOptions.
- options = MakeInnerOptions(std::move(options));
- // Don't blindly convert because timestamps in lists are handled differently.
- options.timestamp_as_object = true;
-
- auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
- auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
- OwnedRef list_item;
OwnedRef key_value;
OwnedRef item_value;
- OwnedRefNoGIL owned_numpy_keys;
- RETURN_NOT_OK(
- ConvertChunkedArrayToPandas(options, flat_keys, nullptr,
owned_numpy_keys.ref()));
- OwnedRefNoGIL owned_numpy_items;
- RETURN_NOT_OK(
- ConvertChunkedArrayToPandas(options, flat_items, nullptr,
owned_numpy_items.ref()));
- PyArrayObject* py_keys =
reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
- PyArrayObject* py_items =
reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); ++c) {
@@ -866,14 +835,13 @@ Status ConvertMap(PandasOptions options, const
ChunkedArray& data,
*out_values = Py_None;
} else {
int64_t entry_offset = arr.value_offset(i);
- int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
+ int64_t num_pairs = arr.value_offset(i + 1) - entry_offset;
- // Build the new list object for the row of maps
- list_item.reset(PyList_New(num_maps));
- RETURN_IF_PYERROR();
+ // Build the new list object for the row of Python pairs
+ RETURN_NOT_OK(resetRow(num_pairs));
// Add each key/item pair in the row
- for (int64_t j = 0; j < num_maps; ++j) {
+ for (int64_t j = 0; j < num_pairs; ++j) {
// Get key value, key is non-nullable for a valid row
auto ptr_key = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
@@ -892,14 +860,12 @@ Status ConvertMap(PandasOptions options, const
ChunkedArray& data,
RETURN_IF_PYERROR();
}
- // Add the key/item pair to the list for the row
- PyList_SET_ITEM(list_item.obj(), j,
- PyTuple_Pack(2, key_value.obj(), item_value.obj()));
- RETURN_IF_PYERROR();
+ // Add the key/item pair to the row
+ RETURN_NOT_OK(addPairToRow(j, key_value, item_value));
}
// Pass ownership to the resulting array
- *out_values = list_item.detach();
+ *out_values = stealRow();
}
++out_values;
}
@@ -911,6 +877,166 @@ Status ConvertMap(PandasOptions options, const
ChunkedArray& data,
return Status::OK();
}
+// Re-raise a pending Python TypeError with a more helpful message, since a
+// TypeError raised while filling a dict may stem from unhashable keys.
+//
+// Returns Status::OK() when no Python error is pending; otherwise returns the
+// pending error (with an augmented message when it is a TypeError) converted
+// through ConvertPyError().
+Status CheckMapAsPydictsTypeError() {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+    // Modify the error string directly, so it is re-raised
+    // with our additional info.
+    //
+    // There are not many interesting things happening when this
+    // is hit. This is intended to only be called directly after
+    // PyDict_SetItem, where a finite set of errors could occur.
+    //
+    // PyErr_Fetch hands back owned references. Keep them in OwnedRef so they
+    // are released on every path, including the early return below.
+    OwnedRef type;
+    OwnedRef value;
+    OwnedRef traceback;
+    PyErr_Fetch(type.ref(), value.ref(), traceback.ref());
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(value.obj(), &message));
+    message += ". If keys are not hashable, then you must use the option "
+        "[maps_as_pydicts=None (default)]";
+
+    // resets the error
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+  }
+  return ConvertPyError();
+}
+
+// Report data loss caused by duplicate map keys collapsing in Python dicts.
+//
+// When total_dict_len (total converted pydict length) is smaller than
+// total_raw_len (original input length), a message is formatted and either
+// returned as an error Status (error_on_duplicate_keys == true) or logged as
+// a warning. Returns Status::OK() in every other case.
+Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
+                             Py_ssize_t total_dict_len, Py_ssize_t total_raw_len) {
+  if (total_dict_len < total_raw_len) {
+    const char* message =
+        "[maps_as_pydicts] "
+        "After conversion of Arrow maps to pydicts, "
+        "detected data loss due to duplicate keys. "
+        "Original input length is [%lld], total converted pydict length is [%lld].";
+    std::array<char, 256> buf;
+    // Py_ssize_t is not guaranteed to be `long long`; cast explicitly so the
+    // variadic arguments match the %lld conversions.
+    std::snprintf(buf.data(), buf.size(), message,
+                  static_cast<long long>(total_raw_len),
+                  static_cast<long long>(total_dict_len));
+
+    if (error_on_duplicate_keys) {
+      return Status::UnknownError(buf.data());
+    } else {
+      ARROW_LOG(WARNING) << buf.data();
+    }
+  }
+  return Status::OK();
+}
+
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+ PyObject** out_values) {
+ // Converts the maps in `data` to Python objects (lists of (key, value) tuples, or dicts) delivered through out_values. First, get columns of underlying key/item arrays
+ std::vector<std::shared_ptr<Array>> key_arrays;
+ std::vector<std::shared_ptr<Array>> item_arrays;
+ for (int c = 0; c < data.num_chunks(); ++c) {
+ const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+ key_arrays.emplace_back(map_arr.keys());
+ item_arrays.emplace_back(map_arr.items());
+ }
+
+ const auto& map_type = checked_cast<const MapType&>(*data.type());
+ auto key_type = map_type.key_type();
+ auto item_type = map_type.item_type();
+
+ // ARROW-6899: Convert dictionary-encoded children to dense instead of
+ // failing below. A more efficient conversion than this could be done later
+ if (key_type->id() == Type::DICTIONARY) {
+ auto dense_type = checked_cast<const
DictionaryType&>(*key_type).value_type();
+ RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+ key_type = dense_type;
+ }
+ if (item_type->id() == Type::DICTIONARY) {
+ auto dense_type = checked_cast<const
DictionaryType&>(*item_type).value_type();
+ RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+ item_type = dense_type;
+ }
+
+ // See notes in MakeInnerOptions.
+ options = MakeInnerOptions(std::move(options));
+ // Don't blindly convert because timestamps in lists are handled differently.
+ options.timestamp_as_object = true;
+
+ auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+ auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+ OwnedRefNoGIL owned_numpy_keys;  // receives the converted keys; viewed as a PyArrayObject below
+ RETURN_NOT_OK(
+ ConvertChunkedArrayToPandas(options, flat_keys, nullptr,
owned_numpy_keys.ref()));
+ OwnedRefNoGIL owned_numpy_items;  // receives the converted items; viewed as a PyArrayObject below
+ RETURN_NOT_OK(
+ ConvertChunkedArrayToPandas(options, flat_items, nullptr,
owned_numpy_items.ref()));
+ PyArrayObject* py_keys =
reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+ PyArrayObject* py_items =
reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+ if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
+ // The default behavior to express an Arrow MAP as a list of [(key,
value), ...] pairs
+ OwnedRef list_item;  // the Python list currently being filled; handed off by the third callback
+ return ConvertMapHelper(
+ [&list_item](int64_t num_pairs) {
+ list_item.reset(PyList_New(num_pairs));
+ return CheckPyError();
+ },
+ [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+ PyList_SET_ITEM(list_item.obj(), idx,
+ PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+ return CheckPyError();
+ },
+ [&list_item]{ return list_item.detach(); },
+ data,
+ py_keys,
+ py_items,
+ item_arrays,
+ out_values);
+ } else {
+ // Use a native pydict; LOSSY and STRICT_ differ only in the error_on_duplicate_keys flag passed to CheckForDuplicateKeys
+ OwnedRef dict_item;
+ Py_ssize_t total_dict_len{0};  // accumulated PyDict_Size of every dict produced
+ Py_ssize_t total_raw_len{0};  // accumulated num_pairs requested for every dict
+
+ bool error_on_duplicate_keys;
+ if (options.maps_as_pydicts == MapConversionType::LOSSY) {
+ error_on_duplicate_keys = false;
+ } else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
+ error_on_duplicate_keys = true;
+ } else {
+ auto val =
std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
+ return Status::UnknownError(
+ "Received unknown option for maps_as_pydicts: " + std::to_string(val)
+ );
+ }
+
+ auto status = ConvertMapHelper(
+ [&dict_item, &total_raw_len](int64_t num_pairs) {
+ total_raw_len += num_pairs;
+ dict_item.reset(PyDict_New());
+ return CheckPyError();
+ },
+ [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value,
OwnedRef& item_value) {
+ auto setitem_result =
+ PyDict_SetItem(dict_item.obj(), key_value.obj(),
item_value.obj());
+ ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
+ // PyDict_SetItem returns -1 if there are internal errors around hashing/resizing
+ return setitem_result == 0 ?
+ Status::OK() :
+ Status::UnknownError("[maps_as_pydicts] "
+ "Unexpected failure inserting Arrow (key, value) pair into
Python dict"
+ );
+ },
+ [&dict_item, &total_dict_len]{
+ total_dict_len += PyDict_Size(dict_item.obj());
+ return dict_item.detach();
+ },
+ data,
+ py_keys,
+ py_items,
+ item_arrays,
+ out_values);
+
+ ARROW_RETURN_NOT_OK(status);
+ // If there were no errors generating the pydicts,
+ // then check if we detected any data loss from duplicate keys.
+ return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len,
total_raw_len);
+ }
+}
+
template <typename InType, typename OutType>
inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
OutType* out_values) {
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index 6570364b8d..ac422cc99c 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,12 @@ class Table;
namespace py {
+enum class MapConversionType {
+ DEFAULT, // convert arrow maps to assoc lists (list of key-value tuples) in
Pandas
+ LOSSY, // convert arrow maps to Python dicts; report warnings when lossiness is encountered due to duplicate keys
+ STRICT_, // convert arrow maps to Python dicts; raise a Python exception when lossiness is encountered due to
duplicate keys
+};
+
struct PandasOptions {
/// arrow::MemoryPool to use for memory allocations
MemoryPool* pool = default_memory_pool();
@@ -90,6 +96,17 @@ struct PandasOptions {
/// conversions
bool self_destruct = false;
+ /// \brief The default behavior (DEFAULT) is to convert Arrow Map arrays to
+ /// Python association lists (list-of-tuples) in the same order as the Arrow
+ /// Map, as in [(key1, value1), (key2, value2), ...].
+ /// If LOSSY or STRICT_, convert Arrow Map arrays to native Python dicts.
+ /// This can change the ordering of (key, value) pairs, and will deduplicate
+ /// multiple keys, resulting in a possible loss of data.
+ /// If LOSSY, this key deduplication results in a warning printed
+ /// when detected. If STRICT_, this instead results in an exception
+ /// being raised when detected.
+ MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
// Used internally for nested arrays.
bool decode_dictionaries = false;
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 2bb6a6f459..7c0ffa70c7 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -762,7 +762,7 @@ class PyListConverter : public ListConverter<T,
PyConverter, PyConverterTrait> {
RETURN_NOT_OK(AppendSequence(value));
} else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
RETURN_NOT_OK(AppendIterable(value));
- } else if (PyDict_Check(value) && this->options_.type->id() == Type::MAP) {
+ } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {
// Branch to support Python Dict with `map` DataType.
auto items = PyDict_Items(value);
OwnedRef item_ref(items);
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index a49552aa0a..803a41c28a 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2135,6 +2135,66 @@ class TestConvertListTypes:
DeprecationWarning)
tm.assert_series_equal(series, expected)
+ def test_to_list_of_maps_pandas(self):
+ if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+ (Version(pd.__version__) < Version("2.0.0"))):
+ # TODO: regression in pandas with numpy 1.25dev
+ # https://github.com/pandas-dev/pandas/issues/50360
+ pytest.skip("Regression in pandas with numpy 1.25")
+ data = [
+ [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+ [[('baz', []), ('qux', None), ('quux', [None, 'e'])], [('quz',
['f', 'g'])]]
+ ]
+ arr = pa.array(data, pa.list_(pa.map_(pa.utf8(), pa.list_(pa.utf8()))))
+ series = arr.to_pandas()
+ expected = pd.Series(data)
+
+ # pandas.testing generates a
+ # DeprecationWarning: elementwise comparison failed
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "elementwise comparison failed",
+ DeprecationWarning)
+ tm.assert_series_equal(series, expected)
+
+ def test_to_list_of_maps_pandas_sliced(self):
+ """
+ A slightly more rigorous test for chunk/slice combinations
+ """
+
+ if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+ (Version(pd.__version__) < Version("2.0.0"))):
+ # TODO: regression in pandas with numpy 1.25dev
+ # https://github.com/pandas-dev/pandas/issues/50360
+ pytest.skip("Regression in pandas with numpy 1.25")
+
+ keys = pa.array(['ignore', 'foo', 'bar', 'baz',
+ 'qux', 'quux', 'ignore']).slice(1, 5)
+ items = pa.array(
+ [['ignore'], ['ignore'], ['a', 'b'], ['c', 'd'], [], None, [None,
'e']],
+ pa.list_(pa.string()),
+ ).slice(2, 5)
+ map = pa.MapArray.from_arrays([0, 2, 4], keys, items)
+ arr = pa.ListArray.from_arrays([0, 1, 2], map)
+
+ series = arr.to_pandas()
+ expected = pd.Series([
+ [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+ [[('baz', []), ('qux', None)]],
+ ])
+
+ series_sliced = arr.slice(1, 2).to_pandas()
+ expected_sliced = pd.Series([
+ [[('baz', []), ('qux', None)]],
+ ])
+
+ # pandas.testing generates a
+ # DeprecationWarning: elementwise comparison failed
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "elementwise comparison failed",
+ DeprecationWarning)
+ tm.assert_series_equal(series, expected)
+ tm.assert_series_equal(series_sliced, expected_sliced)
+
@pytest.mark.parametrize('t,data,expected', [
(
pa.int64,
@@ -4549,3 +4609,210 @@ def test_does_not_mutate_timedelta_nested():
df = table.to_pandas()
assert df["timedelta_2"][0].to_pytimedelta() == timedelta_2[0]
+
+
+def test_roundtrip_nested_map_table_with_pydicts():
+ schema = pa.schema([
+ pa.field(
+ "a",
+ pa.list_(
+ pa.map_(pa.int8(), pa.struct([pa.field("b", pa.binary())]))
+ )
+ )
+ ])
+ table = pa.table([[
+ [[(1, None)]],
+ None,
+ [
+ [(2, {"b": b"abc"})],
+ [(3, {"b": None}), (4, {"b": b"def"})],
+ ]
+ ]],
+ schema=schema,
+ )
+
+ expected_default_df = pd.DataFrame(
+ {"a": [[[(1, None)]], None, [[(2, {"b": b"abc"})],
+ [(3, {"b": None}), (4, {"b": b"def"})]]]}
+ )
+ expected_as_pydicts_df = pd.DataFrame(
+ {"a": [
+ [{1: None}],
+ None,
+ [{2: {"b": b"abc"}}, {3: {"b": None}, 4: {"b": b"def"}}],
+ ]}
+ )
+
+ default_df = table.to_pandas()
+ as_pydicts_df = table.to_pandas(maps_as_pydicts="strict")
+
+ tm.assert_frame_equal(default_df, expected_default_df)
+ tm.assert_frame_equal(as_pydicts_df, expected_as_pydicts_df)
+
+ table_default_roundtrip = pa.Table.from_pandas(default_df, schema=schema)
+ assert table.equals(table_default_roundtrip)
+
+ table_as_pydicts_roundtrip = pa.Table.from_pandas(as_pydicts_df,
schema=schema)
+ assert table.equals(table_as_pydicts_roundtrip)
+
+
+def test_roundtrip_nested_map_array_with_pydicts_sliced():
+ """
+ Slightly more robust test with chunking and slicing
+ """
+ keys_1 = pa.array(['foo', 'bar'])
+ keys_2 = pa.array(['baz', 'qux', 'quux', 'quz'])
+ keys_3 = pa.array([], pa.string())
+
+ items_1 = pa.array(
+ [['a', 'b'], ['c', 'd']],
+ pa.list_(pa.string()),
+ )
+ items_2 = pa.array(
+ [[], None, [None, 'e'], ['f', 'g']],
+ pa.list_(pa.string()),
+ )
+ items_3 = pa.array(
+ [],
+ pa.list_(pa.string()),
+ )
+
+ map_chunk_1 = pa.MapArray.from_arrays([0, 2], keys_1, items_1)
+ map_chunk_2 = pa.MapArray.from_arrays([0, 3, 4], keys_2, items_2)
+ map_chunk_3 = pa.MapArray.from_arrays([0, 0], keys_3, items_3)
+ chunked_array = pa.chunked_array([
+ pa.ListArray.from_arrays([0, 1], map_chunk_1).slice(0),
+ pa.ListArray.from_arrays([0, 1], map_chunk_2.slice(1)).slice(0),
+ pa.ListArray.from_arrays([0, 0], map_chunk_3).slice(0),
+ ])
+
+ series_default = chunked_array.to_pandas()
+ expected_series_default = pd.Series([
+ [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+ [[('quz', ['f', 'g'])]],
+ [],
+ ])
+
+ series_pydicts = chunked_array.to_pandas(maps_as_pydicts="strict")
+ expected_series_pydicts = pd.Series([
+ [{'foo': ['a', 'b'], 'bar': ['c', 'd']}],
+ [{'quz': ['f', 'g']}],
+ [],
+ ])
+
+ sliced = chunked_array.slice(1, 3)
+ series_default_sliced = sliced.to_pandas()
+ expected_series_default_sliced = pd.Series([
+ [[('quz', ['f', 'g'])]],
+ [],
+ ])
+
+ series_pydicts_sliced = sliced.to_pandas(maps_as_pydicts="strict")
+ expected_series_pydicts_sliced = pd.Series([
+ [{'quz': ['f', 'g']}],
+ [],
+ ])
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", "elementwise comparison failed",
+ DeprecationWarning)
+ tm.assert_series_equal(series_default, expected_series_default)
+ tm.assert_series_equal(series_pydicts, expected_series_pydicts)
+ tm.assert_series_equal(series_default_sliced,
expected_series_default_sliced)
+ tm.assert_series_equal(series_pydicts_sliced,
expected_series_pydicts_sliced)
+
+ ty = pa.list_(pa.map_(pa.string(), pa.list_(pa.string())))
+
+ def assert_roundtrip(series: pd.Series, data) -> None:
+ array_roundtrip = pa.chunked_array(pa.Array.from_pandas(series,
type=ty))
+ assert data.equals(array_roundtrip)
+
+ assert_roundtrip(series_default, chunked_array)
+ assert_roundtrip(series_pydicts, chunked_array)
+ assert_roundtrip(series_default_sliced, sliced)
+ assert_roundtrip(series_pydicts_sliced, sliced)
+
+
+def test_roundtrip_map_array_with_pydicts_duplicate_keys():
+ keys = pa.array(['foo', 'bar', 'foo'])
+ items = pa.array(
+ [['a', 'b'], ['c', 'd'], ['1', '2']],
+ pa.list_(pa.string()),
+ )
+ offsets = [0, 3]
+ maps = pa.MapArray.from_arrays(offsets, keys, items)
+ ty = pa.map_(pa.string(), pa.list_(pa.string()))
+
+ # ------------------------
+ # With maps as pydicts
+ with pytest.raises(pa.lib.ArrowException):
+ # raises because of duplicate keys
+ maps.to_pandas(maps_as_pydicts="strict")
+ series_pydicts = maps.to_pandas(maps_as_pydicts="lossy")
+ # some data loss occurs for duplicate keys
+ expected_series_pydicts = pd.Series([
+ {'foo': ['1', '2'], 'bar': ['c', 'd']},
+ ])
+ # roundtrip is not possible because of data loss
+ assert not maps.equals(pa.Array.from_pandas(series_pydicts, type=ty))
+
+ # ------------------------
+ # With default assoc list of tuples
+ series_default = maps.to_pandas()
+ expected_series_default = pd.Series([
+ [('foo', ['a', 'b']), ('bar', ['c', 'd']), ('foo', ['1', '2'])],
+ ])
+ assert maps.equals(pa.Array.from_pandas(series_default, type=ty))
+
+ # custom comparison for compatibility w/ Pandas 1.0.0
+ # would otherwise run:
+ # tm.assert_series_equal(series_pydicts, expected_series_pydicts)
+ assert len(series_pydicts) == len(expected_series_pydicts)
+ for row1, row2 in zip(series_pydicts, expected_series_pydicts):
+ assert len(row1) == len(row2)
+ for tup1, tup2 in zip(row1.items(), row2.items()):
+ assert tup1[0] == tup2[0]
+ assert np.array_equal(tup1[1], tup2[1])
+
+ # custom comparison for compatibility w/ Pandas 1.0.0
+ # would otherwise run:
+ # tm.assert_series_equal(series_default, expected_series_default)
+ assert len(series_default) == len(expected_series_default)
+ for row1, row2 in zip(series_default, expected_series_default):
+ assert len(row1) == len(row2)
+ for tup1, tup2 in zip(row1, row2):
+ assert tup1[0] == tup2[0]
+ assert np.array_equal(tup1[1], tup2[1])
+
+
+def test_unhashable_map_keys_with_pydicts():
+ keys = pa.array(
+ [['a', 'b'], ['c', 'd'], [], ['e'], [None, 'f'], ['g', 'h']],
+ pa.list_(pa.string()),
+ )
+ items = pa.array(['foo', 'bar', 'baz', 'qux', 'quux', 'quz'])
+ offsets = [0, 2, 6]
+ maps = pa.MapArray.from_arrays(offsets, keys, items)
+
+ # ------------------------
+ # With maps as pydicts
+ with pytest.raises(TypeError):
+ maps.to_pandas(maps_as_pydicts="lossy")
+
+ # ------------------------
+ # With default assoc list of tuples
+ series = maps.to_pandas()
+ expected_series_default = pd.Series([
+ [(['a', 'b'], 'foo'), (['c', 'd'], 'bar')],
+ [([], 'baz'), (['e'], 'qux'), ([None, 'f'], 'quux'), (['g', 'h'],
'quz')],
+ ])
+
+ # custom comparison for compatibility w/ Pandas 1.0.0
+ # would otherwise run:
+ # tm.assert_series_equal(series, expected_series_default)
+ assert len(series) == len(expected_series_default)
+ for row1, row2 in zip(series, expected_series_default):
+ assert len(row1) == len(row2)
+ for tup1, tup2 in zip(row1, row2):
+ assert np.array_equal(tup1[0], tup2[0])
+ assert tup1[1] == tup2[1]