[ https://issues.apache.org/jira/browse/ARROW-1873?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16282857#comment-16282857 ]

ASF GitHub Bot commented on ARROW-1873:
---------------------------------------

wesm closed pull request #1404: ARROW-1873: [Python] Catch more possible 
Python/OOM errors in to_pandas conversion path
URL: https://github.com/apache/arrow/pull/1404
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc 
b/cpp/src/arrow/python/arrow_to_pandas.cc
index 1a1b71125..08ce37cda 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -240,19 +240,14 @@ class PandasBlock {
       block_arr = PyArray_SimpleNewFromDescr(1, block_dims, descr);
     }
 
-    if (block_arr == NULL) {
-      // TODO(wesm): propagating Python exception
-      return Status::OK();
-    }
+    RETURN_IF_PYERROR();
 
     PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(block_arr), 
NPY_ARRAY_OWNDATA);
 
     npy_intp placement_dims[1] = {num_columns_};
     PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
-    if (placement_arr == NULL) {
-      // TODO(wesm): propagating Python exception
-      return Status::OK();
-    }
+
+    RETURN_IF_PYERROR();
 
     block_arr_.reset(block_arr);
     placement_arr_.reset(placement_arr);
@@ -282,12 +277,19 @@ class PandasBlock {
   ARROW_DISALLOW_COPY_AND_ASSIGN(PandasBlock);
 };
 
+template <typename T>
+inline const T* GetPrimitiveValues(const Array& arr) {
+  const auto& prim_arr = static_cast<const PrimitiveArray&>(arr);
+  const T* raw_values = reinterpret_cast<const T*>(prim_arr.values()->data());
+  return raw_values + arr.offset();
+}
+
 template <typename T>
 inline void ConvertIntegerWithNulls(PandasOptions options, const ChunkedArray& 
data,
                                     double* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const T*>(arr.raw_values());
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
     // Upcast to double, set NaN as appropriate
 
     for (int i = 0; i < arr.length(); ++i) {
@@ -300,8 +302,8 @@ template <typename T>
 inline void ConvertIntegerNoNullsSameType(PandasOptions options, const 
ChunkedArray& data,
                                           T* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const T*>(arr.raw_values());
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
     memcpy(out_values, in_values, sizeof(T) * arr.length());
     out_values += arr.length();
   }
@@ -311,8 +313,8 @@ template <typename InType, typename OutType>
 inline void ConvertIntegerNoNullsCast(PandasOptions options, const 
ChunkedArray& data,
                                       OutType* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const InType*>(arr.raw_values());
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
     for (int64_t i = 0; i < arr.length(); ++i) {
       *out_values = in_values[i];
     }
@@ -323,14 +325,13 @@ static Status ConvertBooleanWithNulls(PandasOptions 
options, const ChunkedArray&
                                       PyObject** out_values) {
   PyAcquireGIL lock;
   for (int c = 0; c < data.num_chunks(); c++) {
-    const std::shared_ptr<Array> arr = data.chunk(c);
-    auto bool_arr = static_cast<BooleanArray*>(arr.get());
+    const auto& arr = static_cast<const BooleanArray&>(*data.chunk(c));
 
-    for (int64_t i = 0; i < arr->length(); ++i) {
-      if (bool_arr->IsNull(i)) {
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (arr.IsNull(i)) {
         Py_INCREF(Py_None);
         *out_values++ = Py_None;
-      } else if (bool_arr->Value(i)) {
+      } else if (arr.Value(i)) {
         // True
         Py_INCREF(Py_True);
         *out_values++ = Py_True;
@@ -347,10 +348,9 @@ static Status ConvertBooleanWithNulls(PandasOptions 
options, const ChunkedArray&
 static void ConvertBooleanNoNulls(PandasOptions options, const ChunkedArray& 
data,
                                   uint8_t* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const std::shared_ptr<Array> arr = data.chunk(c);
-    auto bool_arr = static_cast<BooleanArray*>(arr.get());
-    for (int64_t i = 0; i < arr->length(); ++i) {
-      *out_values++ = static_cast<uint8_t>(bool_arr->Value(i));
+    const auto& arr = static_cast<const BooleanArray&>(*data.chunk(c));
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ = static_cast<uint8_t>(arr.Value(i));
     }
   }
 }
@@ -361,17 +361,17 @@ inline Status ConvertBinaryLike(PandasOptions options, 
const ChunkedArray& data,
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   PyAcquireGIL lock;
   for (int c = 0; c < data.num_chunks(); c++) {
-    auto arr = static_cast<ArrayType*>(data.chunk(c).get());
+    const auto& arr = static_cast<const ArrayType&>(*data.chunk(c));
 
     const uint8_t* data_ptr;
     int32_t length;
     const bool has_nulls = data.null_count() > 0;
-    for (int64_t i = 0; i < arr->length(); ++i) {
-      if (has_nulls && arr->IsNull(i)) {
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        data_ptr = arr->GetValue(i, &length);
+        data_ptr = arr.GetValue(i, &length);
         *out_values = WrapBytes<ArrayType>::Wrap(data_ptr, length);
         if (*out_values == nullptr) {
           PyErr_Clear();
@@ -530,13 +530,25 @@ inline Status ConvertListsLike(PandasOptions options, 
const std::shared_ptr<Colu
         PyObject* start = PyLong_FromLongLong(arr->value_offset(i) + 
chunk_offset);
         PyObject* end = PyLong_FromLongLong(arr->value_offset(i + 1) + 
chunk_offset);
         PyObject* slice = PySlice_New(start, end, NULL);
+        Py_XDECREF(start);
+        Py_XDECREF(end);
+
+        if (ARROW_PREDICT_FALSE(slice == nullptr)) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
         *out_values = PyObject_GetItem(numpy_array, slice);
-        Py_DECREF(start);
-        Py_DECREF(end);
-        Py_DECREF(slice);
+
+        if (*out_values == nullptr) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
+
+        Py_XDECREF(slice);
       }
       ++out_values;
     }
+    RETURN_IF_PYERROR();
 
     chunk_offset += arr->values()->length();
   }
@@ -548,14 +560,12 @@ inline Status ConvertListsLike(PandasOptions options, 
const std::shared_ptr<Colu
 template <typename T>
 inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* 
out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const T*>(arr.raw_values());
-
-    const uint8_t* valid_bits = arr.null_bitmap_data();
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
 
     if (arr.null_count() > 0) {
       for (int64_t i = 0; i < arr.length(); ++i) {
-        *out_values++ = BitUtil::BitNotSet(valid_bits, i) ? na_value : 
in_values[i];
+        *out_values++ = arr.IsNull(i) ? na_value : in_values[i];
       }
     } else {
       memcpy(out_values, in_values, sizeof(T) * arr.length());
@@ -568,8 +578,8 @@ template <typename InType, typename OutType>
 inline void ConvertNumericNullableCast(const ChunkedArray& data, OutType 
na_value,
                                        OutType* out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const InType*>(arr.raw_values());
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
 
     for (int64_t i = 0; i < arr.length(); ++i) {
       *out_values++ = arr.IsNull(i) ? na_value : 
static_cast<OutType>(in_values[i]);
@@ -577,11 +587,11 @@ inline void ConvertNumericNullableCast(const 
ChunkedArray& data, OutType na_valu
   }
 }
 
-template <typename InType, int64_t SHIFT>
+template <typename T, int64_t SHIFT>
 inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* 
out_values) {
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = static_cast<const PrimitiveArray&>(*data.chunk(c));
-    auto in_values = reinterpret_cast<const InType*>(arr.raw_values());
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
 
     for (int64_t i = 0; i < arr.length(); ++i) {
       *out_values++ = arr.IsNull(i) ? kPandasTimestampNull
@@ -631,7 +641,7 @@ static Status ConvertDecimals(PandasOptions options, const 
ChunkedArray& data,
   PyObject* Decimal = Decimal_ref.obj();
 
   for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr(static_cast<const 
arrow::Decimal128Array&>(*data.chunk(c).get()));
+    const auto& arr = static_cast<const 
arrow::Decimal128Array&>(*data.chunk(c));
 
     for (int64_t i = 0; i < arr.length(); ++i) {
       if (arr.IsNull(i)) {
@@ -856,7 +866,7 @@ class BoolBlock : public PandasBlock {
     uint8_t* out_buffer =
         reinterpret_cast<uint8_t*>(block_data_) + rel_placement * num_rows_;
 
-    ConvertBooleanNoNulls(options_, *col->data().get(), out_buffer);
+    ConvertBooleanNoNulls(options_, *col->data(), out_buffer);
     placement_data_[rel_placement] = abs_placement;
     return Status::OK();
   }
@@ -884,7 +894,7 @@ class DatetimeBlock : public PandasBlock {
     int64_t* out_buffer =
         reinterpret_cast<int64_t*>(block_data_) + rel_placement * num_rows_;
 
-    const ChunkedArray& data = *col.get()->data();
+    const ChunkedArray& data = *col->data();
 
     if (type == Type::DATE32) {
       // Convert from days since epoch to datetime64[ns]
@@ -1089,7 +1099,7 @@ class CategoricalBlock : public PandasBlock {
   Status AllocateNDArrayFromIndices(int npy_type, const PrimitiveArray& 
indices) {
     npy_intp block_dims[1] = {num_rows_};
 
-    auto in_values = reinterpret_cast<const T*>(indices.raw_values());
+    const T* in_values = GetPrimitiveValues<T>(indices);
     void* data = const_cast<T*>(in_values);
 
     PyAcquireGIL lock;
@@ -1420,6 +1430,7 @@ class ArrowDeserializer {
     PyAcquireGIL lock;
 
     result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), 
nullptr);
+    RETURN_IF_PYERROR();
     arr_ = reinterpret_cast<PyArrayObject*>(result_);
     return Status::OK();
   }
@@ -1429,8 +1440,7 @@ class ArrowDeserializer {
                                std::shared_ptr<Array> arr) {
     typedef typename internal::arrow_traits<TYPE>::T T;
 
-    const auto& prim_arr = static_cast<const PrimitiveArray&>(*arr);
-    auto in_values = reinterpret_cast<const T*>(prim_arr.raw_values());
+    const T* in_values = GetPrimitiveValues<T>(*arr);
 
     // Zero-Copy. We can pass the data pointer directly to NumPy.
     void* data = const_cast<T*>(in_values);
@@ -1528,8 +1538,8 @@ class ArrowDeserializer {
     constexpr int64_t kShift = traits::npy_shift;
 
     for (int c = 0; c < data_.num_chunks(); c++) {
-      const auto& arr = static_cast<const PrimitiveArray&>(*data_.chunk(c));
-      auto in_values = reinterpret_cast<const c_type*>(arr.raw_values());
+      const auto& arr = *data_.chunk(c);
+      const c_type* in_values = GetPrimitiveValues<c_type>(arr);
 
       for (int64_t i = 0; i < arr.length(); ++i) {
         *out_values++ = arr.IsNull(i) ? na_value : 
static_cast<T>(in_values[i]) / kShift;


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Segmentation fault when loading total 2GB of parquet files
> -------------------------------------------------------------------
>
>                 Key: ARROW-1873
>                 URL: https://issues.apache.org/jira/browse/ARROW-1873
>             Project: Apache Arrow
>          Issue Type: Bug
>            Reporter: DB Tsai
>            Assignee: Wes McKinney
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> We are trying to load 100 parquet files, and each of them is around 20MB. 
> Before we port [ARROW-1830] into our pyarrow distribution, we use {{glob}} to 
> list all the files, and then load them as pandas dataframe through pyarrow. 
> The schema of the parquet files is like 
> {code:java}
> root
>  |-- dateint: integer (nullable = true)
>  |-- profileid: long (nullable = true)
>  |-- time: long (nullable = true)
>  |-- label: double (nullable = true)
>  |-- weight: double (nullable = true)
>  |-- features: array (nullable = true)
>  |    |-- element: double (containsNull = true)
> {code}
> If we only load couple of them, it works without any issue. However, when 
> loading 100 of them, we got segmentation fault as the following. FYI, if we 
> flatten {{features: array[double]}} into top level, the file sizes are around 
> the same, and work fine too. 
> Is there anything we can try to eliminate this issue? Thanks.
> {code}
> >>> import glob
> >>> files = glob.glob("/home/dbt/data/*")
> >>> data = pq.ParquetDataset(files).read().to_pandas()
> [New Thread 0x7fffe8f84700 (LWP 23769)]
> [New Thread 0x7fffe3b93700 (LWP 23770)]
> [New Thread 0x7fffe3392700 (LWP 23771)]
> [New Thread 0x7fffe2b91700 (LWP 23772)]
> [Thread 0x7fffe2b91700 (LWP 23772) exited]
> [Thread 0x7fffe3b93700 (LWP 23770) exited]
> Thread 4 "python" received signal SIGSEGV, Segmentation fault.
> [Switching to Thread 0x7fffe3392700 (LWP 23771)]
> 0x00007ffff270fc94 in arrow::Status 
> arrow::VisitTypeInline<arrow::py::ArrowDeserializer>(arrow::DataType const&, 
> arrow::py::ArrowDeserializer*) ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> (gdb) backtrace
> #0  0x00007ffff270fc94 in arrow::Status 
> arrow::VisitTypeInline<arrow::py::ArrowDeserializer>(arrow::DataType const&, 
> arrow::py::ArrowDeserializer*) ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #1  0x00007ffff2700b5a in 
> arrow::py::ConvertColumnToPandas(arrow::py::PandasOptions, 
> std::shared_ptr<arrow::Column> const&, _object*, _object**) ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #2  0x00007ffff2714985 in arrow::Status 
> arrow::py::ConvertListsLike<arrow::DoubleType>(arrow::py::PandasOptions, 
> std::shared_ptr<arrow::Column> const&, _object**) () from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #3  0x00007ffff2716b92 in 
> arrow::py::ObjectBlock::Write(std::shared_ptr<arrow::Column> const&, long, 
> long) ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #4  0x00007ffff270a489 in 
> arrow::py::DataFrameBlockCreator::WriteTableToBlocks(int)::{lambda(int)#1}::operator()(int)
>  const ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #5  0x00007ffff270a67c in std::thread::_Impl<std::_Bind_simple<arrow::Status 
> arrow::ParallelFor<arrow::py::DataFrameBlockCreator::WriteTableToBlocks(int)::{lambda(int)#1}&>(int,
>  int, 
> arrow::py::DataFrameBlockCreator::WriteTableToBlocks(int)::{lambda(int)#1}&)::{lambda()#1}
>  ()> >::_M_run() ()
>    from 
> /home/dbt/miniconda3/lib/python3.6/site-packages/pyarrow/../../../libarrow_python.so.0
> #6  0x00007ffff1e30c5c in std::execute_native_thread_routine_compat 
> (__p=<optimized out>)
>     at 
> /opt/conda/conda-bld/compilers_linux-64_1505664199673/work/.build/src/gcc-7.2.0/libstdc++-v3/src/c++11/thread.cc:110
> #7  0x00007ffff7bc16ba in start_thread (arg=0x7fffe3392700) at 
> pthread_create.c:333
> #8  0x00007ffff78f73dd in clone () at 
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to