This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ffc882  ARROW-1998: [Python] fix crash on empty Numpy arrays
0ffc882 is described below

commit 0ffc8822aaa7f62eda8ba44a44210e90bd0fe23c
Author: Antoine Pitrou <anto...@python.org>
AuthorDate: Mon Feb 12 18:00:13 2018 -0500

    ARROW-1998: [Python] fix crash on empty Numpy arrays
    
    This would happen with some object types (binary, string).
    
    Author: Antoine Pitrou <anto...@python.org>
    
    Closes #1594 from pitrou/ARROW-1998-empty-np-array-crash and squashes the following commits:
    
    3e071148 [Antoine Pitrou] ARROW-1998: [Python] fix crash on empty Numpy arrays
---
 cpp/src/arrow/python/numpy_to_arrow.cc      | 39 +++++++++++++++++++----------
 python/pyarrow/tests/test_convert_pandas.py | 25 ++++++++++++++++++
 2 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index d487d9d..3dd5a79 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -850,16 +850,23 @@ Status NumPyConverter::ConvertObjectStrings() {
   RETURN_NOT_OK(builder.Resize(length_));
 
   bool global_have_bytes = false;
-  int64_t offset = 0;
-  while (offset < length_) {
-    bool chunk_have_bytes = false;
-    RETURN_NOT_OK(
-        AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes));
-
-    global_have_bytes = global_have_bytes | chunk_have_bytes;
+  if (length_ == 0) {
+    // Produce an empty chunk
     std::shared_ptr<Array> chunk;
     RETURN_NOT_OK(builder.Finish(&chunk));
     out_arrays_.emplace_back(std::move(chunk));
+  } else {
+    int64_t offset = 0;
+    while (offset < length_) {
+      bool chunk_have_bytes = false;
+      RETURN_NOT_OK(
+          AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes));
+
+      global_have_bytes = global_have_bytes | chunk_have_bytes;
+      std::shared_ptr<Array> chunk;
+      RETURN_NOT_OK(builder.Finish(&chunk));
+      out_arrays_.emplace_back(std::move(chunk));
+    }
   }
 
   // If we saw PyBytes, convert everything to BinaryArray
@@ -954,14 +961,21 @@ Status NumPyConverter::ConvertObjectFixedWidthBytes(
   FixedSizeBinaryBuilder builder(type, pool_);
   RETURN_NOT_OK(builder.Resize(length_));
 
-  int64_t offset = 0;
-  while (offset < length_) {
-    RETURN_NOT_OK(
-        AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder, &offset));
-
+  if (length_ == 0) {
+    // Produce an empty chunk
     std::shared_ptr<Array> chunk;
     RETURN_NOT_OK(builder.Finish(&chunk));
     out_arrays_.emplace_back(std::move(chunk));
+  } else {
+    int64_t offset = 0;
+    while (offset < length_) {
+      RETURN_NOT_OK(AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder,
+                                                &offset));
+
+      std::shared_ptr<Array> chunk;
+      RETURN_NOT_OK(builder.Finish(&chunk));
+      out_arrays_.emplace_back(std::move(chunk));
+    }
   }
   return Status::OK();
 }
@@ -1567,7 +1581,6 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
   if (!PyArray_Check(ao)) {
     return Status::Invalid("Input object was not a NumPy array");
   }
-
   NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index b73522d..987ac23 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -119,6 +119,26 @@ def _check_array_from_pandas_roundtrip(np_array):
 
 class TestPandasConversion(object):
 
+    type_pairs = [
+        (np.int8, pa.int8()),
+        (np.int16, pa.int16()),
+        (np.int32, pa.int32()),
+        (np.int64, pa.int64()),
+        (np.uint8, pa.uint8()),
+        (np.uint16, pa.uint16()),
+        (np.uint32, pa.uint32()),
+        (np.uint64, pa.uint64()),
+        # (np.float16, pa.float16()),  # XXX unsupported
+        (np.float32, pa.float32()),
+        (np.float64, pa.float64()),
+        # XXX unsupported
+        # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
+        (np.object, pa.string()),
+        # (np.object, pa.binary()),  # XXX unsupported
+        (np.object, pa.binary(10)),
+        (np.object, pa.list_(pa.int64())),
+        ]
+
     def test_all_none_objects(self):
         df = pd.DataFrame({'a': [None, None, None]})
         _check_pandas_roundtrip(df)
@@ -128,6 +148,11 @@ class TestPandasConversion(object):
         df['a'] = df['a'].astype('category')
         _check_pandas_roundtrip(df)
 
+    def test_empty_arrays(self):
+        for dtype, pa_type in self.type_pairs:
+            arr = np.array([], dtype=dtype)
+            _check_array_roundtrip(arr, type=pa_type)
+
     def test_non_string_columns(self):
         df = pd.DataFrame({0: [1, 2, 3]})
         table = pa.Table.from_pandas(df)

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

Reply via email to