wjones127 commented on code in PR #34730:
URL: https://github.com/apache/arrow/pull/34730#discussion_r1171471068


##########
python/pyarrow/src/arrow/python/arrow_to_pandas.cc:
##########
@@ -911,6 +877,165 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
   return Status::OK();
 }
 
+// A more helpful error message around TypeErrors that may stem from unhashable keys
+Status CheckMapAsPydictsTypeError() {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+    // Modify the error string directly, so it is re-raised
+    // with our additional info.
+    //
+    // There are not many interesting things happening when this
+    // is hit. This is intended to only be called directly after
+    // PyDict_SetItem, where a finite set of errors could occur.
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
+    message += ". If keys are not hashable, then you must use the option "
+        "[maps_as_pydicts=None (default)]";
+
+    // resets the error
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+  }
+  return ConvertPyError();
+}
+
+Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
+                             Py_ssize_t total_dict_len, Py_ssize_t total_raw_len) {
+  if (total_dict_len < total_raw_len) {
+    const char* message =
+        "[maps_as_pydicts] "
+        "After conversion of Arrow maps to pydicts, "
+        "detected data loss due to duplicate keys. "
+        "Original input length is [%lld], total converted pydict length is [%lld].";
+    std::array<char, 256> buf;
+    std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len);
+
+    if (error_on_duplicate_keys) {
+      return Status::UnknownError(buf.data());
+    } else {
+      ARROW_LOG(WARNING) << buf.data();
+    }
+  }
+  return Status::OK();
+}
+
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
+    // The default behavior is to express an Arrow MAP as a list of [(key, value), ...] pairs
+    OwnedRef list_item;
+    return ConvertMapHelper(
+        [&list_item](int64_t num_pairs) {
+          list_item.reset(PyList_New(num_pairs));
+          return CheckPyError();
+        },
+        [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          PyList_SET_ITEM(list_item.obj(), idx,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          return CheckPyError();
+        },
+        [&list_item]{ return list_item.detach(); },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+  } else {
+    // Use a native pydict
+    OwnedRef dict_item;
+    Py_ssize_t total_dict_len{0};
+    Py_ssize_t total_raw_len{0};
+
+    bool error_on_duplicate_keys;
+    if (options.maps_as_pydicts == MapConversionType::LOSSY) {
+      error_on_duplicate_keys = false;
+    } else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
+      error_on_duplicate_keys = true;
+    } else {
+      auto val = std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
+      return Status::UnknownError(
+          "Received unknown option for maps_as_pydicts: " + std::to_string(val)
+      );
+    }
+
+    auto status = ConvertMapHelper(
+        [&dict_item, &total_raw_len](int64_t num_pairs) {
+          total_raw_len += num_pairs;
+          dict_item.reset(PyDict_New());
+          return CheckPyError();
+        },
+        [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          auto setitem_result =
+              PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj());
+          ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
+          // returns -1 if there are internal errors around hashing/resizing
+          return setitem_result == 0 ?
+            Status::OK() :
+            Status::UnknownError("[maps_as_pydicts] "
+                "Unexpected failure inserting Arrow (key, value) pair into Python dict"
+            );
+        },
+        [&dict_item, &total_dict_len]{
+          total_dict_len += PyDict_Size(dict_item.obj());
+          return dict_item.detach();
+        },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+
+    // If there were no errors generating the pydicts,
+    // then check if we detected any data loss from duplicate keys.
+    return !status.ok() ? status : CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len);

Review Comment:
   I think we typically use `ARROW_RETURN_NOT_OK` to return early.
   ```suggestion
       ARROW_RETURN_NOT_OK(status);
       return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len);
   ```
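   
   As a side note for readers, here is a minimal sketch of the user-facing behavior this `ConvertMap` rewrite enables; the Python keyword spelling is inferred from the `MapConversionType` enum values above, so treat it as an assumption:
   
   ```python
   import pyarrow as pa
   
   # Arrow map arrays may contain duplicate keys; Python dicts cannot.
   arr = pa.array([[('k', 1), ('k', 2)]], pa.map_(pa.utf8(), pa.int64()))
   
   # Default: each map converts to a list of (key, value) tuples.
   print(arr.to_pandas()[0])  # [('k', 1), ('k', 2)]
   
   # Assumed keyword per this PR: 'lossy' builds pydicts and warns when
   # duplicate keys collapse; 'strict' raises an error instead.
   print(arr.to_pandas(maps_as_pydicts="lossy")[0])  # {'k': 2}, plus a warning
   ```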



##########
python/pyarrow/tests/test_pandas.py:
##########
@@ -2128,6 +2128,91 @@ def test_to_list_of_structs_pandas(self):
                                     DeprecationWarning)
             tm.assert_series_equal(series, expected)
 
+    def test_to_list_of_maps_pandas(self):
+        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+                (Version(pd.__version__) < Version("2.0.0"))):
+            # TODO: regression in pandas with numpy 1.25dev
+            # https://github.com/pandas-dev/pandas/issues/50360
+            pytest.skip("Regression in pandas with numpy 1.25")
+        offsets = pa.array([0, 2, 5, 6], pa.int32())
+        keys = pa.array(['foo', 'bar', 'baz', 'qux', 'quux', 'quz'])
+        items = pa.array([['a', 'b'], ['c', 'd'], [], None, [None, 'e'], ['f', 'g']],
+                         pa.list_(pa.string()))
+        maps = pa.MapArray.from_arrays(offsets, keys, items)
+        data = pa.ListArray.from_arrays([0, 1, 3], maps)
+
+        expected = pd.Series([
+            [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+            [[('baz', []), ('qux', None), ('quux', [None, 'e'])], [('quz', ['f', 'g'])]]
+        ])
+
+        series = pd.Series(data.to_pandas())

Review Comment:
   I think this could be simplified:
   
   ```suggestion
           data = [
               [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
               [[('baz', []), ('qux', None), ('quux', [None, 'e'])], [('quz', ['f', 'g'])]]
           ]
           arr = pa.array(data, pa.list_(pa.map_(pa.utf8(), pa.list_(pa.utf8()))))
           series = arr.to_pandas()
   
           expected = pd.Series(data)
   ```
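   
   One note on the round-trip this simplification relies on (a minimal sketch, using only the default conversion): an Arrow map converts to a list of `(key, value)` tuples by default, which matches the tuple-based literal used for `data`, so comparing against `pd.Series(data)` works without reshaping.
   
   ```python
   import pyarrow as pa
   
   # Default map conversion yields a list of (key, value) tuples.
   arr = pa.array([[('k', 1)]], pa.map_(pa.utf8(), pa.int64()))
   print(arr.to_pandas()[0])  # [('k', 1)]
   ```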



##########
python/pyarrow/tests/test_pandas.py:
##########
@@ -2128,6 +2128,91 @@ def test_to_list_of_structs_pandas(self):
                                     DeprecationWarning)
             tm.assert_series_equal(series, expected)
 
+    def test_to_list_of_maps_pandas(self):
+        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+                (Version(pd.__version__) < Version("2.0.0"))):
+            # TODO: regression in pandas with numpy 1.25dev
+            # https://github.com/pandas-dev/pandas/issues/50360
+            pytest.skip("Regression in pandas with numpy 1.25")
+        offsets = pa.array([0, 2, 5, 6], pa.int32())
+        keys = pa.array(['foo', 'bar', 'baz', 'qux', 'quux', 'quz'])
+        items = pa.array([['a', 'b'], ['c', 'd'], [], None, [None, 'e'], ['f', 'g']],
+                         pa.list_(pa.string()))
+        maps = pa.MapArray.from_arrays(offsets, keys, items)
+        data = pa.ListArray.from_arrays([0, 1, 3], maps)
+
+        expected = pd.Series([
+            [[('foo', ['a', 'b']), ('bar', ['c', 'd'])]],
+            [[('baz', []), ('qux', None), ('quux', [None, 'e'])], [('quz', ['f', 'g'])]]
+        ])
+
+        series = pd.Series(data.to_pandas())
+
+        # pandas.testing generates a
+        # DeprecationWarning: elementwise comparison failed
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "elementwise comparison failed",
+                                    DeprecationWarning)
+            tm.assert_series_equal(series, expected)
+
+    def test_to_list_of_maps_pandas_sliced(self):
+        """
+        A slightly more rigorous test for chunk/slice combinations
+        """
+
+        if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
+                (Version(pd.__version__) < Version("2.0.0"))):
+            # TODO: regression in pandas with numpy 1.25dev
+            # https://github.com/pandas-dev/pandas/issues/50360
+            pytest.skip("Regression in pandas with numpy 1.25")
+
+        keys_1 = pa.array(['foo', 'bar'])
+        keys_2 = pa.array(['baz', 'qux', 'quux'])
+        keys_3 = pa.array(['quz'])
+
+        items_1 = pa.array(
+            [['a', 'b'], ['c', 'd']],
+            pa.list_(pa.string()),
+        )
+        items_2 = pa.array(
+            [[], None, [None, 'e']],
+            pa.list_(pa.string()),
+        )
+        items_3 = pa.array(
+            [['f', 'g']],
+            pa.list_(pa.string()),
+        )
+
+        map_chunk_1 = pa.MapArray.from_arrays([0, 2], keys_1, items_1)
+        map_chunk_2 = pa.MapArray.from_arrays([0, 3], keys_2, items_2)
+        map_chunk_3 = pa.MapArray.from_arrays([0, 1], keys_3, items_3)
+        data = pa.chunked_array([
+            pa.ListArray.from_arrays([0, 1], map_chunk_1),
+            pa.ListArray.from_arrays([0, 1], map_chunk_2),
+            pa.ListArray.from_arrays([0, 1], map_chunk_3),
+        ])

Review Comment:
   A couple thoughts:
   
    1. I think the chunked array overcomplicates things in the end (sorry for that distraction). What's more important is the slicing.
    2. This currently doesn't slice within any arrays; it slices at array boundaries, which doesn't trigger the code paths I was hoping for.
    3. It would also be good to slice the inner arrays.
   
   Here's my suggested test case:
   
   ```suggestion
           keys = pa.array(['ignore', 'foo', 'bar', 'baz', 'qux', 'quux', 'ignore']).slice(1, 5)
           items = pa.array(
               [['ignore'], ['ignore'], ['a', 'b'], ['c', 'd'], [], None, [None, 'e']],
               pa.list_(pa.string()),
           ).slice(2, 5)
           map = pa.MapArray.from_arrays([0, 2, 4], keys, items)
           arr = pa.ListArray.from_arrays([0, 1, 2], map)
   ```
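   
   For what it's worth, a minimal sketch of why `.slice()` exercises the intended paths: a sliced array shares the parent's buffers and records a nonzero offset, so conversion code that ignores offsets would read the leading `'ignore'` values.
   
   ```python
   import pyarrow as pa
   
   # The slice keeps the parent buffers; only the offset/length change.
   keys = pa.array(['ignore', 'foo', 'bar']).slice(1, 2)
   assert keys.offset == 1
   assert keys.to_pylist() == ['foo', 'bar']
   ```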


