This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 3eee3e4 ARROW-2101: [Python/C++] Correctly convert numpy arrays of bytes to arrow arrays of strings when user specifies arrow type of string 3eee3e4 is described below commit 3eee3e49fe1978071e411401114fc2101928485a Author: Joshua Storck <joshua.sto...@twosigma.com> AuthorDate: Mon Apr 16 12:11:25 2018 +0200 ARROW-2101: [Python/C++] Correctly convert numpy arrays of bytes to arrow arrays of strings when user specifies arrow type of string See https://issues.apache.org/jira/browse/ARROW-2101. Author: Joshua Storck <joshua.sto...@twosigma.com> Closes #1886 from joshuastorck/ARROW_2101 and squashes the following commits: 4f6409b <Joshua Storck> Removing use of std::nullptr since it doesn't work on all compilers ac0c27d <Joshua Storck> Removing C++ unit tests, moving python unit tests to test_convert_pandas.py, adding comparison to nullptr instead of 0, adhere to C++ formatting style 034791c <Joshua Storck> Delete trailing whitespace 8a5312d <Joshua Storck> Bug fix for ARROW-2101. When converting a numpy array of bytes with a dtype of object that has valid utf-8 values, the output array's type was binary, even when the caller specified the output type as string. The NumPyConverter::ConvertObjectStrings function now only converts to binary if no type was specified and bytes were encountered. Added unit tests so that the code won't regress and added an additional unit test that ensures that an exception is thrown wh [...] --- cpp/src/arrow/python/numpy_to_arrow.cc | 40 +++++++++++++++++++++-------- python/pyarrow/tests/test_convert_pandas.py | 15 +++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index e3fb71b..f5e093a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -228,12 +228,15 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask, /// can fit /// /// \param[in] offset starting offset for appending +/// \param[in] check_valid if set to true and the input array +/// contains values that cannot be converted to unicode, returns +/// a Status code containing a Python exception message /// \param[out] end_offset ending offset where we stopped appending. Will /// be length of arr if fully consumed /// \param[out] have_bytes true if we encountered any PyBytes object static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset, - StringBuilder* builder, int64_t* end_offset, - bool* have_bytes) { + bool check_valid, StringBuilder* builder, + int64_t* end_offset, bool* have_bytes) { PyObject* obj; Ndarray1DIndexer<PyObject*> objects(arr); @@ -256,8 +259,7 @@ static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64 *have_bytes = true; } bool is_full; - RETURN_NOT_OK( - internal::BuilderAppend(builder, obj, false /* check_valid */, &is_full)); + RETURN_NOT_OK(internal::BuilderAppend(builder, obj, check_valid, &is_full)); if (is_full) { break; } @@ -843,6 +845,13 @@ Status NumPyConverter::ConvertObjectStrings() { StringBuilder builder(pool_); RETURN_NOT_OK(builder.Resize(length_)); + // If the creator of this NumPyConverter specified a type, + // then we want to force the output type to be utf8. If + // the input data is PyBytes and not PyUnicode and + // not convertible to utf8, the call to AppendObjectStrings + // below will fail because we pass force_string as the + // value for check_valid. + bool force_string = type_ != nullptr && type_->Equals(utf8()); bool global_have_bytes = false; if (length_ == 0) { // Produce an empty chunk @@ -853,8 +862,10 @@ Status NumPyConverter::ConvertObjectStrings() { int64_t offset = 0; while (offset < length_) { bool chunk_have_bytes = false; - RETURN_NOT_OK( - AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes)); + // Always set check_valid to true when force_string is true + RETURN_NOT_OK(AppendObjectStrings(arr_, mask_, offset, + force_string /* check_valid */, &builder, &offset, + &chunk_have_bytes)); global_have_bytes = global_have_bytes | chunk_have_bytes; std::shared_ptr<Array> chunk; @@ -863,8 +874,13 @@ Status NumPyConverter::ConvertObjectStrings() { } } - // If we saw PyBytes, convert everything to BinaryArray - if (global_have_bytes) { + // If we saw bytes, convert it to a binary array. If + // force_string was set to true, the input data could + // have been bytes but we've checked to make sure that + // it can be converted to utf-8 in the call to + // AppendObjectStrings. In that case, we can safely leave + // it as a utf8 type. + if (!force_string && global_have_bytes) { for (size_t i = 0; i < out_arrays_.size(); ++i) { auto binary_data = out_arrays_[i]->data()->Copy(); binary_data->type = ::arrow::binary(); @@ -1392,8 +1408,12 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>( RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT)); int64_t offset = 0; - RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, value_builder, &offset, - &have_bytes)); + // If a type was specified and it was utf8, then we set + // check_valid to true. If any of the input cannot be + // converted, then we will exit early here. + bool check_valid = type_ != nullptr && type_->Equals(::arrow::utf8()); + RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, check_valid, + value_builder, &offset, &have_bytes)); if (offset < PyArray_SIZE(numpy_array)) { return Status::Invalid("Array cell value exceeded 2GB"); } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index bbb5b2d..7f9ede9 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1228,6 +1228,21 @@ class TestConvertStringLikeTypes(object): table.to_pandas(strings_to_categorical=True, zero_copy_only=True) + # Regression test for ARROW-2101 + def test_array_of_bytes_to_strings(self): + converted = pa.array(np.array([b'x'], dtype=object), pa.string()) + assert converted.type == pa.string() + + # Make sure that if an ndarray of bytes is passed to the array + # constructor and the type is string, it will fail if those bytes + # cannot be converted to utf-8 + def test_array_of_bytes_to_strings_bad_data(self): + with pytest.raises( + pa.lib.ArrowException, + message="Unknown error: 'utf-8' codec can't decode byte 0x80 " + "in position 0: invalid start byte"): + pa.array(np.array([b'\x80\x81'], dtype=object), pa.string()) + class TestConvertDecimalTypes(object): """ -- To stop receiving notification emails like this one, please contact apit...@apache.org.