pydict support

via GitHub Fri, 14 Apr 2023 10:44:44 -0700


wjones127 commented on code in PR #34730:
URL: https://github.com/apache/arrow/pull/34730#discussion_r1167102035



##########
python/pyarrow/tests/test_pandas.py:
##########
@@ -4542,3 +4569,48 @@ def test_does_not_mutate_timedelta_nested():
     df = table.to_pandas()
 
     assert df["timedelta_2"][0].to_pytimedelta() == timedelta_2[0]
+
+
+def test_roundtrip_nested_map_table_with_pydicts():

Review Comment:
   This is a bit paranoid, but could you also test the `to_pandas` conversion 
on a MapArray that has been sliced (or even a chunked one where each chunk has 
been sliced)? That often brings out some bugs in these implementations.



##########
python/pyarrow/src/arrow/python/arrow_to_pandas.cc:
##########
@@ -911,6 +877,97 @@ Status ConvertMap(PandasOptions options, const 
ChunkedArray& data,
   return Status::OK();
 }
 
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const 
DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const 
DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, 
owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, 
owned_numpy_items.ref()));
+  PyArrayObject* py_keys = 
reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = 
reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  if (!options.maps_as_pydicts) {
+    // The default behavior to express an Arrow MAP as a list of [(key, 
value), ...] pairs
+    OwnedRef list_item;
+    return ConvertMapHelper(
+        [&list_item](int64_t num_pairs) {
+          list_item.reset(PyList_New(num_pairs));
+          return CheckPyError();
+        },
+        [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          PyList_SET_ITEM(list_item.obj(), idx,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          return CheckPyError();
+        },
+        [&list_item]{ return list_item.detach(); },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+  } else {
+    // Use a native pydict
+    OwnedRef dict_item;
+    return ConvertMapHelper(
+        [&dict_item]([[maybe_unused]] int64_t) {
+          dict_item.reset(PyDict_New());
+          return CheckPyError();
+        },
+        [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, 
OwnedRef& item_value) {
+          auto setitem_result =
+              PyDict_SetItem(dict_item.obj(), key_value.obj(), 
item_value.obj());
+          RETURN_IF_PYERROR();

Review Comment:
   Should this handle the `TypeError` that is raised when `key_value.obj()` 
isn't hashable? Perhaps we should test that. I think a map whose key type is a 
map or list would trigger it.



##########
python/pyarrow/src/arrow/python/arrow_to_pandas.cc:
##########
@@ -176,6 +176,7 @@ static inline bool ListTypeSupported(const DataType& type) {
     case Type::DATE32:
     case Type::DATE64:
     case Type::STRUCT:
+    case Type::MAP:

Review Comment:
   Agreed, although it's interesting that we check the value type for lists but 
not the fields of structs. I was thinking you should put `Type::MAP` down with 
`Type::LIST`, since `MapType` is a subclass of `ListType`. That said, because 
its immediate child is `Type::STRUCT`, right now it won't do anything meaningful.



##########
python/pyarrow/src/arrow/python/python_to_arrow.cc:
##########
@@ -762,7 +762,7 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
       RETURN_NOT_OK(AppendSequence(value));
     } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
       RETURN_NOT_OK(AppendIterable(value));
-    } else if (PyDict_Check(value) && this->options_.type->id() == Type::MAP) {
+    } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {

Review Comment:
   Agreed. `MakeConverter` doesn't seem to expect that options is specific to 
the type in question; it's templated over it. This change seems correct to me. 
Thanks for catching it. 👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] wjones127 commented on a diff in pull request #34730: GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support

Reply via email to