This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3eee3e4  ARROW-2101: [Python/C++] Correctly convert numpy arrays of 
bytes to arrow arrays of strings when user specifies arrow type of string
3eee3e4 is described below

commit 3eee3e49fe1978071e411401114fc2101928485a
Author: Joshua Storck <joshua.sto...@twosigma.com>
AuthorDate: Mon Apr 16 12:11:25 2018 +0200

    ARROW-2101: [Python/C++] Correctly convert numpy arrays of bytes to arrow 
arrays of strings when user specifies arrow type of string
    
    See https://issues.apache.org/jira/browse/ARROW-2101.
    
    Author: Joshua Storck <joshua.sto...@twosigma.com>
    
    Closes #1886 from joshuastorck/ARROW_2101 and squashes the following 
commits:
    
    4f6409b <Joshua Storck> Removing use of std::nullptr since it doesn't work 
on all compilers
    ac0c27d <Joshua Storck> Removing C++ unit tests, moving python unit tests 
to test_convert_pandas.py, adding comparison to nullptr instead of 0, adhere to 
C++ formatting style
    034791c <Joshua Storck> Delete trailing whitespace
    8a5312d <Joshua Storck> Bug fix for ARROW-2101. When converting a numpy 
array of bytes with a dtype of object that has valid utf-8 values, the output 
array's type was binary, even when the caller specified the output type as 
string. The NumPyConverter::ConvertObjectStrings function now only converts to 
binary if no type was specified and bytes were encountered. Added unit tests so 
that the code won't regress and added an additional unit test that ensures that 
an exception is thrown wh [...]
---
 cpp/src/arrow/python/numpy_to_arrow.cc      | 40 +++++++++++++++++++++--------
 python/pyarrow/tests/test_convert_pandas.py | 15 +++++++++++
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index e3fb71b..f5e093a 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -228,12 +228,15 @@ static Status AppendObjectBinaries(PyArrayObject* arr, 
PyArrayObject* mask,
 /// can fit
 ///
 /// \param[in] offset starting offset for appending
+/// \param[in] check_valid if set to true and the input array
+/// contains values that cannot be converted to unicode, returns
+/// a Status code containing a Python exception message
 /// \param[out] end_offset ending offset where we stopped appending. Will
 /// be length of arr if fully consumed
 /// \param[out] have_bytes true if we encountered any PyBytes object
 static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, 
int64_t offset,
-                                  StringBuilder* builder, int64_t* end_offset,
-                                  bool* have_bytes) {
+                                  bool check_valid, StringBuilder* builder,
+                                  int64_t* end_offset, bool* have_bytes) {
   PyObject* obj;
 
   Ndarray1DIndexer<PyObject*> objects(arr);
@@ -256,8 +259,7 @@ static Status AppendObjectStrings(PyArrayObject* arr, 
PyArrayObject* mask, int64
       *have_bytes = true;
     }
     bool is_full;
-    RETURN_NOT_OK(
-        internal::BuilderAppend(builder, obj, false /* check_valid */, 
&is_full));
+    RETURN_NOT_OK(internal::BuilderAppend(builder, obj, check_valid, 
&is_full));
     if (is_full) {
       break;
     }
@@ -843,6 +845,13 @@ Status NumPyConverter::ConvertObjectStrings() {
   StringBuilder builder(pool_);
   RETURN_NOT_OK(builder.Resize(length_));
 
+  // If the creator of this NumPyConverter specified a type,
+  // then we want to force the output type to be utf8. If
+  // the input data is PyBytes and not PyUnicode and
+  // not convertible to utf8, the call to AppendObjectStrings
+  // below will fail because we pass force_string as the
+  // value for check_valid.
+  bool force_string = type_ != nullptr && type_->Equals(utf8());
   bool global_have_bytes = false;
   if (length_ == 0) {
     // Produce an empty chunk
@@ -853,8 +862,10 @@ Status NumPyConverter::ConvertObjectStrings() {
     int64_t offset = 0;
     while (offset < length_) {
       bool chunk_have_bytes = false;
-      RETURN_NOT_OK(
-          AppendObjectStrings(arr_, mask_, offset, &builder, &offset, 
&chunk_have_bytes));
+      // Always set check_valid to true when force_string is true
+      RETURN_NOT_OK(AppendObjectStrings(arr_, mask_, offset,
+                                        force_string /* check_valid */, 
&builder, &offset,
+                                        &chunk_have_bytes));
 
       global_have_bytes = global_have_bytes | chunk_have_bytes;
       std::shared_ptr<Array> chunk;
@@ -863,8 +874,13 @@ Status NumPyConverter::ConvertObjectStrings() {
     }
   }
 
-  // If we saw PyBytes, convert everything to BinaryArray
-  if (global_have_bytes) {
+  // If we saw bytes, convert it to a binary array. If
+  // force_string was set to true, the input data could
+  // have been bytes but we've checked to make sure that
+  // it can be converted to utf-8 in the call to
+  // AppendObjectStrings. In that case, we can safely leave
+  // it as a utf8 type.
+  if (!force_string && global_have_bytes) {
     for (size_t i = 0; i < out_arrays_.size(); ++i) {
       auto binary_data = out_arrays_[i]->data()->Copy();
       binary_data->type = ::arrow::binary();
@@ -1392,8 +1408,12 @@ inline Status 
NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
 
       int64_t offset = 0;
-      RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, 
value_builder, &offset,
-                                        &have_bytes));
+      // If a type was specified and it was utf8, then we set
+      // check_valid to true. If any of the input cannot be
+      // converted, then we will exit early here.
+      bool check_valid = type_ != nullptr && type_->Equals(::arrow::utf8());
+      RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, check_valid,
+                                        value_builder, &offset, &have_bytes));
       if (offset < PyArray_SIZE(numpy_array)) {
         return Status::Invalid("Array cell value exceeded 2GB");
       }
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index bbb5b2d..7f9ede9 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1228,6 +1228,21 @@ class TestConvertStringLikeTypes(object):
             table.to_pandas(strings_to_categorical=True,
                             zero_copy_only=True)
 
+    # Regression test for ARROW-2101
+    def test_array_of_bytes_to_strings(self):
+        converted = pa.array(np.array([b'x'], dtype=object), pa.string())
+        assert converted.type == pa.string()
+
+    # Make sure that if an ndarray of bytes is passed to the array
+    # constructor and the type is string, it will fail if those bytes
+    # cannot be converted to utf-8
+    def test_array_of_bytes_to_strings_bad_data(self):
+        with pytest.raises(
+                pa.lib.ArrowException,
+                message="Unknown error: 'utf-8' codec can't decode byte 0x80 "
+                "in position 0: invalid start byte"):
+            pa.array(np.array([b'\x80\x81'], dtype=object), pa.string())
+
 
 class TestConvertDecimalTypes(object):
     """

-- 
To stop receiving notification emails like this one, please contact
apit...@apache.org.

Reply via email to