This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 66d9a30  ARROW-3514: [C++] Work around insufficient output size 
estimate on old zlibs
66d9a30 is described below

commit 66d9a30a26e1659d9e992037339515e59a6ae518
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 16 13:32:29 2018 +0200

    ARROW-3514: [C++] Work around insufficient output size estimate on old zlibs
    
    With a manylinux1 zlib (1.2.3.x), one could get the following error when 
writing a Parquet table with gzip compression:
    "zlib deflate failed, output buffer too small"
    
    Author: Antoine Pitrou <[email protected]>
    
    Closes #2771 from pitrou/ARROW-3514-zlib-compression-bug and squashes the 
following commits:
    
    5b607327 <Antoine Pitrou> ARROW-3514:  Work around insufficient output size 
estimate on old zlibs
---
 cpp/src/arrow/util/compression_zlib.cc | 10 ++++++----
 python/manylinux1/build_arrow.sh       |  1 -
 python/pyarrow/tests/test_parquet.py   | 13 +++++++++++++
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/util/compression_zlib.cc 
b/cpp/src/arrow/util/compression_zlib.cc
index cb3baff..9fe163e 100644
--- a/cpp/src/arrow/util/compression_zlib.cc
+++ b/cpp/src/arrow/util/compression_zlib.cc
@@ -438,13 +438,15 @@ class GZipCodec::GZipCodecImpl {
   }
 
   int64_t MaxCompressedLen(int64_t input_length, const uint8_t* 
ARROW_ARG_UNUSED(input)) {
-    // Most be in compression mode
+    // Must be in compression mode
     if (!compressor_initialized_) {
       Status s = InitCompressor();
       DCHECK(s.ok());
     }
-    // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase)
-    return deflateBound(&stream_, static_cast<uLong>(input_length));
+    int64_t max_len = deflateBound(&stream_, static_cast<uLong>(input_length));
+    // ARROW-3514: return a more pessimistic estimate to account for bugs
+    // in old zlib versions.
+    return max_len + 12;
   }
 
   Status Compress(int64_t input_length, const uint8_t* input, int64_t 
output_buffer_len,
@@ -460,7 +462,7 @@ class GZipCodec::GZipCodecImpl {
     int64_t ret = 0;
     if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) {
       if (ret == Z_OK) {
-        // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
+        // Will return Z_OK (and stream.msg NOT set) if stream.avail_out is too
         // small
         return Status::IOError("zlib deflate failed, output buffer too small");
       }
diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh
index 8c37952..d99f072 100755
--- a/python/manylinux1/build_arrow.sh
+++ b/python/manylinux1/build_arrow.sh
@@ -101,7 +101,6 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do
     echo "=== (${PYTHON}) Building wheel ==="
     PATH="$PATH:${CPYTHON_PATH}/bin" $PYTHON_INTERPRETER setup.py build_ext \
         --inplace \
-        --with-parquet \
         --bundle-arrow-cpp \
         --bundle-boost \
         --boost-namespace=arrow_boost
diff --git a/python/pyarrow/tests/test_parquet.py 
b/python/pyarrow/tests/test_parquet.py
index f3391ce..78677a0 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2225,3 +2225,16 @@ def 
test_parquet_writer_context_obj_with_exception(tempdir):
 
     expected = pd.concat(frames, ignore_index=True)
     tm.assert_frame_equal(result.to_pandas(), expected)
+
+
+def test_zlib_compression_bug():
+    # ARROW-3514: "zlib deflate failed, output buffer too small"
+    import pyarrow.parquet as pq
+
+    table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
+    f = io.BytesIO()
+    pq.write_table(table, f, compression='gzip')
+
+    f.seek(0)
+    roundtrip = pq.read_table(f)
+    tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())

Reply via email to