This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 479c1cf ARROW-8750: [Python] Correctly default to lz4 compression for
Feather V2 in Python
479c1cf is described below
commit 479c1cf41fcdc20ca5374b530f004f46c0e5c872
Author: Wes McKinney <[email protected]>
AuthorDate: Tue May 12 12:01:37 2020 -0500
ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in
Python
This was the intention, but I had not implemented it correctly. It's now
tested to be so.
Closes #7150 from wesm/ARROW-8750
Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
python/pyarrow/feather.py | 20 +++++++++++++-------
python/pyarrow/io.pxi | 13 ++++++++-----
python/pyarrow/tests/test_feather.py | 20 ++++++++++++++++++++
3 files changed, 41 insertions(+), 12 deletions(-)
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index a599e15..6071b5e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -19,8 +19,8 @@
import os
from pyarrow.pandas_compat import _pandas_api # noqa
-from pyarrow.lib import FeatherError # noqa
-from pyarrow.lib import Table, concat_tables, schema
+from pyarrow.lib import (Codec, FeatherError, Table, # noqa
+ concat_tables, schema)
import pyarrow.lib as ext
@@ -112,6 +112,9 @@ def check_chunked_overflow(name, col):
"Feather format".format(name, str(col.type)))
+_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}
+
+
def write_feather(df, dest, compression=None, compression_level=None,
chunksize=None, version=2):
"""
@@ -165,11 +168,14 @@ def write_feather(df, dest, compression=None,
compression_level=None,
if chunksize is not None:
raise ValueError("Feather V1 files do not support chunksize "
"option")
-
- supported_compression_options = (None, 'lz4', 'zstd', 'uncompressed')
- if compression not in supported_compression_options:
- raise ValueError('compression="{}" not supported, must be one of {}'
- .format(compression, supported_compression_options))
+ else:
+ if compression is None and Codec.is_available('lz4_frame'):
+ compression = 'lz4'
+ elif (compression is not None and
+ compression not in _FEATHER_SUPPORTED_CODECS):
+ raise ValueError('compression="{}" not supported, must be '
+ 'one of {}'.format(compression,
+ _FEATHER_SUPPORTED_CODECS))
try:
ext.write_feather(table, dest, compression=compression,
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index d89a9a3..198bfb7 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1474,8 +1474,10 @@ cdef CCompressionType _ensure_compression(str name)
except *:
return CCompressionType_BZ2
elif uppercase == 'BROTLI':
return CCompressionType_BROTLI
- elif uppercase == 'LZ4':
+ elif uppercase == 'LZ4' or uppercase == 'LZ4_FRAME':
return CCompressionType_LZ4_FRAME
+ elif uppercase == 'LZ4_RAW':
+ return CCompressionType_LZ4
elif uppercase == 'ZSTD':
return CCompressionType_ZSTD
elif uppercase == 'SNAPPY':
@@ -1491,8 +1493,9 @@ cdef class Codec:
Parameters
----------
compression : str
- Type of compression codec to initialize, valid values are: gzip, bz2,
- brotli, lz4, zstd and snappy.
+ Type of compression codec to initialize, valid values are: 'gzip',
+ 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and
+ 'snappy'.
Raises
------
@@ -1676,7 +1679,7 @@ def compress(object buf, codec='lz4', asbytes=False,
memory_pool=None):
buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol
codec : str, default 'lz4'
Compression codec.
- Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+ Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
asbytes : bool, default False
Return result as Python bytes object, otherwise Buffer.
memory_pool : MemoryPool, default None
@@ -1704,7 +1707,7 @@ def decompress(object buf, decompressed_size=None,
codec='lz4',
the uncompressed buffer size.
codec : str, default 'lz4'
Compression codec.
- Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+ Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
asbytes : bool, default False
Return result as Python bytes object, otherwise Buffer.
memory_pool : MemoryPool, default None
diff --git a/python/pyarrow/tests/test_feather.py
b/python/pyarrow/tests/test_feather.py
index fbfcade..06dfac9 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -637,6 +637,26 @@ def test_v2_compression_options():
write_feather(df, buf, compression='snappy')
+def test_v2_lz4_default_compression():
+ # ARROW-8750: Make sure that the compression=None option selects lz4 if
+ # it's available
+ if not pa.Codec.is_available('lz4_frame'):
+ pytest.skip("LZ4 compression support is not built in C++")
+
+ # some highly compressible data
+ t = pa.table([np.repeat(0, 100000)], names=['f0'])
+
+ buf = io.BytesIO()
+ write_feather(t, buf)
+ default_result = buf.getvalue()
+
+ buf = io.BytesIO()
+ write_feather(t, buf, compression='uncompressed')
+ uncompressed_result = buf.getvalue()
+
+ assert len(default_result) < len(uncompressed_result)
+
+
def test_v1_unsupported_types():
table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])