This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 33d1091 ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions 33d1091 is described below commit 33d1091ee6f6a835ed058a9c368f465a9a97cbe0 Author: Wes McKinney <wes.mckin...@twosigma.com> AuthorDate: Tue Mar 13 16:46:10 2018 -0400 ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions We were already testing the chunked behavior but we did not exercise the off-by-one error edge case Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1740 from wesm/ARROW-2227 and squashes the following commits: fd1ab178 <Wes McKinney> Move constexpr to builder.h, add constexpr for max # of list elements ca5ecc0f <Wes McKinney> Fix off-by-one error in chunked binary conversions --- cpp/src/arrow/builder.cc | 16 ++++++++-------- cpp/src/arrow/builder.h | 7 ++++--- cpp/src/arrow/python/numpy_to_arrow.cc | 2 -- python/pyarrow/tests/test_convert_pandas.py | 7 +++++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ef4e7fd..aa9f3ce 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -1236,7 +1236,7 @@ Status ListBuilder::Append(const int32_t* offsets, int64_t length, Status ListBuilder::AppendNextOffset() { int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) { + if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { std::stringstream ss; ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," << " have " << num_values; @@ -1252,14 +1252,14 @@ Status ListBuilder::Append(bool is_valid) { } Status ListBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); + DCHECK_LE(elements, kListMaximumElements); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status ListBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); + DCHECK_LE(capacity, kListMaximumElements); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); @@ -1303,14 +1303,14 @@ BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} Status BinaryBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); + DCHECK_LE(elements, kListMaximumElements); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets return offsets_builder_.Resize((elements + 1) * sizeof(int32_t)); } Status BinaryBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); + DCHECK_LE(capacity, kListMaximumElements); // one more then requested for offsets RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); return ArrayBuilder::Resize(capacity); @@ -1318,7 +1318,7 @@ Status BinaryBuilder::Resize(int64_t capacity) { Status BinaryBuilder::ReserveData(int64_t elements) { if (value_data_length() + elements > value_data_capacity()) { - if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) { + if (value_data_length() + elements > kBinaryMemoryLimit) { return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary"); } RETURN_NOT_OK(value_data_builder_.Reserve(elements)); @@ -1328,9 +1328,9 @@ Status BinaryBuilder::ReserveData(int64_t elements) { Status BinaryBuilder::AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) { + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have " + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " << num_bytes; return Status::Invalid(ss.str()); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index dabfb75..cdcee80 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -41,13 +41,16 @@ namespace arrow { class Array; class Decimal128; +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1; +constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1; + namespace internal { struct ArrayData; } // namespace internal -static constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kMinBuilderCapacity = 1 << 5; /// Base class for all data array builders. // @@ -702,8 +705,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder<int32_t> offsets_builder_; TypedBufferBuilder<uint8_t> value_data_builder_; - static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1; - Status AppendNextOffset(); void Reset(); }; diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 4d91e53..71bf69f 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -60,8 +60,6 @@ namespace py { using internal::NumPyTypeSize; -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max(); - // ---------------------------------------------------------------------- // Conversion utilities diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index dff4e10..d448de0 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1048,9 +1048,12 @@ class TestConvertStringLikeTypes(object): @pytest.mark.large_memory def test_bytes_exceed_2gb(self): - val = 'x' * (1 << 20) + v1 = b'x' * 100000000 + v2 = b'x' * 147483646 + + # ARROW-2227, hit exactly 2GB on the nose df = pd.DataFrame({ - 'strings': np.array([val] * 4000, dtype=object) + 'strings': [v1] * 20 + [v2] + ['x'] * 20 }) arr = pa.array(df['strings']) assert isinstance(arr, pa.ChunkedArray) -- To stop receiving notification emails like this one, please contact w...@apache.org.