[arrow] branch master updated: ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python

wesm Tue, 12 May 2020 10:02:18 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 479c1cf  ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python
479c1cf is described below

commit 479c1cf41fcdc20ca5374b530f004f46c0e5c872
Author: Wes McKinney <[email protected]>
AuthorDate: Tue May 12 12:01:37 2020 -0500

    ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python
    
    This was the intention but I had not implemented it correctly. It's now tested to be so
    
    Closes #7150 from wesm/ARROW-8750
    
    Authored-by: Wes McKinney <[email protected]>
    Signed-off-by: Wes McKinney <[email protected]>
---
 python/pyarrow/feather.py            | 20 +++++++++++++-------
 python/pyarrow/io.pxi                | 13 ++++++++-----
 python/pyarrow/tests/test_feather.py | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index a599e15..6071b5e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -19,8 +19,8 @@
 import os
 
 from pyarrow.pandas_compat import _pandas_api  # noqa
-from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import Table, concat_tables, schema
+from pyarrow.lib import (Codec, FeatherError, Table,  # noqa
+                         concat_tables, schema)
 import pyarrow.lib as ext
 
 
@@ -112,6 +112,9 @@ def check_chunked_overflow(name, col):
                          "Feather format".format(name, str(col.type)))
 
 
+_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}
+
+
 def write_feather(df, dest, compression=None, compression_level=None,
                   chunksize=None, version=2):
     """
@@ -165,11 +168,14 @@ def write_feather(df, dest, compression=None, compression_level=None,
         if chunksize is not None:
             raise ValueError("Feather V1 files do not support chunksize "
                              "option")
-
-    supported_compression_options = (None, 'lz4', 'zstd', 'uncompressed')
-    if compression not in supported_compression_options:
-        raise ValueError('compression="{}" not supported, must be one of {}'
-                         .format(compression, supported_compression_options))
+    else:
+        if compression is None and Codec.is_available('lz4_frame'):
+            compression = 'lz4'
+        elif (compression is not None and
+              compression not in _FEATHER_SUPPORTED_CODECS):
+            raise ValueError('compression="{}" not supported, must be '
+                             'one of {}'.format(compression,
+                                                _FEATHER_SUPPORTED_CODECS))
 
     try:
         ext.write_feather(table, dest, compression=compression,
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index d89a9a3..198bfb7 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1474,8 +1474,10 @@ cdef CCompressionType _ensure_compression(str name) except *:
         return CCompressionType_BZ2
     elif uppercase == 'BROTLI':
         return CCompressionType_BROTLI
-    elif uppercase == 'LZ4':
+    elif uppercase == 'LZ4' or uppercase == 'LZ4_FRAME':
         return CCompressionType_LZ4_FRAME
+    elif uppercase == 'LZ4_RAW':
+        return CCompressionType_LZ4
     elif uppercase == 'ZSTD':
         return CCompressionType_ZSTD
     elif uppercase == 'SNAPPY':
@@ -1491,8 +1493,9 @@ cdef class Codec:
     Parameters
     ----------
     compression : str
-        Type of compression codec to initialize, valid values are: gzip, bz2,
-        brotli, lz4, zstd and snappy.
+        Type of compression codec to initialize, valid values are: 'gzip',
+        'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and
+        'snappy'.
 
     Raises
     ------
@@ -1676,7 +1679,7 @@ def compress(object buf, codec='lz4', asbytes=False, memory_pool=None):
     buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol
     codec : str, default 'lz4'
         Compression codec.
-        Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+        Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
     asbytes : bool, default False
         Return result as Python bytes object, otherwise Buffer.
     memory_pool : MemoryPool, default None
@@ -1704,7 +1707,7 @@ def decompress(object buf, decompressed_size=None, codec='lz4',
         the uncompressed buffer size.
     codec : str, default 'lz4'
         Compression codec.
-        Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+        Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
     asbytes : bool, default False
         Return result as Python bytes object, otherwise Buffer.
     memory_pool : MemoryPool, default None
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index fbfcade..06dfac9 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -637,6 +637,26 @@ def test_v2_compression_options():
         write_feather(df, buf, compression='snappy')
 
 
+def test_v2_lz4_default_compression():
+    # ARROW-8750: Make sure that the compression=None option selects lz4 if
+    # it's available
+    if not pa.Codec.is_available('lz4_frame'):
+        pytest.skip("LZ4 compression support is not built in C++")
+
+    # some highly compressible data
+    t = pa.table([np.repeat(0, 100000)], names=['f0'])
+
+    buf = io.BytesIO()
+    write_feather(t, buf)
+    default_result = buf.getvalue()
+
+    buf = io.BytesIO()
+    write_feather(t, buf, compression='uncompressed')
+    uncompressed_result = buf.getvalue()
+
+    assert len(default_result) < len(uncompressed_result)
+
+
 def test_v1_unsupported_types():
     table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])

[arrow] branch master updated: ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python

Reply via email to