[arrow] branch master updated: ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions

wesm Tue, 13 Mar 2018 13:46:26 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 33d1091  ARROW-2227: [Python] Fix off-by-one error in chunked binary 
conversions
33d1091 is described below

commit 33d1091ee6f6a835ed058a9c368f465a9a97cbe0
Author: Wes McKinney <wes.mckin...@twosigma.com>
AuthorDate: Tue Mar 13 16:46:10 2018 -0400

    ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions
    
    We were already testing the chunked behavior but we did not exercise the 
off-by-one error edge case
    
    Author: Wes McKinney <wes.mckin...@twosigma.com>
    
    Closes #1740 from wesm/ARROW-2227 and squashes the following commits:
    
    fd1ab178 <Wes McKinney> Move constexpr to builder.h, add constexpr for max 
# of list elements
    ca5ecc0f <Wes McKinney> Fix off-by-one error in chunked binary conversions
---
 cpp/src/arrow/builder.cc                    | 16 ++++++++--------
 cpp/src/arrow/builder.h                     |  7 ++++---
 cpp/src/arrow/python/numpy_to_arrow.cc      |  2 --
 python/pyarrow/tests/test_convert_pandas.py |  7 +++++--
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index ef4e7fd..aa9f3ce 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1236,7 +1236,7 @@ Status ListBuilder::Append(const int32_t* offsets, 
int64_t length,
 
 Status ListBuilder::AppendNextOffset() {
   int64_t num_values = value_builder_->length();
-  if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) {
+  if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
     std::stringstream ss;
     ss << "ListArray cannot contain more then INT32_MAX - 1 child elements,"
        << " have " << num_values;
@@ -1252,14 +1252,14 @@ Status ListBuilder::Append(bool is_valid) {
 }
 
 Status ListBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(elements, kListMaximumElements);
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
   return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status ListBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(capacity, kListMaximumElements);
   // one more then requested for offsets
   RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
@@ -1303,14 +1303,14 @@ BinaryBuilder::BinaryBuilder(const 
std::shared_ptr<DataType>& type, MemoryPool*
 BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) 
{}
 
 Status BinaryBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(elements, kListMaximumElements);
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
   return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status BinaryBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(capacity, kListMaximumElements);
   // one more then requested for offsets
   RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
@@ -1318,7 +1318,7 @@ Status BinaryBuilder::Resize(int64_t capacity) {
 
 Status BinaryBuilder::ReserveData(int64_t elements) {
   if (value_data_length() + elements > value_data_capacity()) {
-    if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+    if (value_data_length() + elements > kBinaryMemoryLimit) {
       return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for 
binary");
     }
     RETURN_NOT_OK(value_data_builder_.Reserve(elements));
@@ -1328,9 +1328,9 @@ Status BinaryBuilder::ReserveData(int64_t elements) {
 
 Status BinaryBuilder::AppendNextOffset() {
   const int64_t num_bytes = value_data_builder_.length();
-  if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
+  if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
     std::stringstream ss;
-    ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " 
bytes, have "
+    ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " 
bytes, have "
        << num_bytes;
     return Status::Invalid(ss.str());
   }
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index dabfb75..cdcee80 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -41,13 +41,16 @@ namespace arrow {
 class Array;
 class Decimal128;
 
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 
1;
+
 namespace internal {
 
 struct ArrayData;
 
 }  // namespace internal
 
-static constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
 
 /// Base class for all data array builders.
 //
@@ -702,8 +705,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
   TypedBufferBuilder<int32_t> offsets_builder_;
   TypedBufferBuilder<uint8_t> value_data_builder_;
 
-  static constexpr int64_t kMaximumCapacity = 
std::numeric_limits<int32_t>::max() - 1;
-
   Status AppendNextOffset();
   void Reset();
 };
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 4d91e53..71bf69f 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -60,8 +60,6 @@ namespace py {
 
 using internal::NumPyTypeSize;
 
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
-
 // ----------------------------------------------------------------------
 // Conversion utilities
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index dff4e10..d448de0 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1048,9 +1048,12 @@ class TestConvertStringLikeTypes(object):
 
     @pytest.mark.large_memory
     def test_bytes_exceed_2gb(self):
-        val = 'x' * (1 << 20)
+        v1 = b'x' * 100000000
+        v2 = b'x' * 147483646
+
+        # ARROW-2227, hit exactly 2GB on the nose
         df = pd.DataFrame({
-            'strings': np.array([val] * 4000, dtype=object)
+            'strings': [v1] * 20 + [v2] + ['x'] * 20
         })
         arr = pa.array(df['strings'])
         assert isinstance(arr, pa.ChunkedArray)

-- 
To stop receiving notification emails like this one, please contact
w...@apache.org.

[arrow] branch master updated: ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions

Reply via email to