jorisvandenbossche commented on a change in pull request #11724: URL: https://github.com/apache/arrow/pull/11724#discussion_r757051173
########## File path: python/pyarrow/tests/parquet/test_basic.py ########## @@ -352,6 +352,92 @@ def test_byte_stream_split(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) +@parametrize_legacy_dataset +def test_column_encoding(use_legacy_dataset): + arr_float = pa.array(list(map(float, range(100)))) + arr_int = pa.array(list(map(int, range(100)))) + mixed_table = pa.Table.from_arrays([arr_float, arr_int], + names=['a', 'b']) + + # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for + # column 'b'. + _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, + column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"}, + use_legacy_dataset=use_legacy_dataset) + + # Check "PLAIN" for all columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding="PLAIN", + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. + # This should throw an error as it is only supports FLOAT and DOUBLE. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "DELTA_BINARY_PACKED". + # This should throw an error as it is only supported for reading. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass unsupported encoding. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'a': "MADE_UP_ENCODING"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary. + # This should throw an error. 
+ with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=['b'], + column_encoding={'b': "DELTA_BINARY_PACKED"}, Review comment: Just to be sure we are not catching the wrong error, can you use an encoding that would already work? e.g. PLAIN ########## File path: python/pyarrow/_parquet.pyx ########## @@ -880,6 +880,21 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): }.get(encoding_, 'UNKNOWN') +cdef encoding_enum_from_name(str encoding_name): + enc = { + 'PLAIN': ParquetEncoding_PLAIN, + 'BIT_PACKED': ParquetEncoding_BIT_PACKED, + 'RLE': ParquetEncoding_RLE, + 'BYTE_STREAM_SPLIT': ParquetEncoding_BYTE_STREAM_SPLIT, + 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, + 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + }.get(encoding_name, None) + if enc is None: + raise ValueError(f"Unsupported column encoding: {encoding_name!r}") Review comment: I think we should maybe still check for dictionary encoding specifically, and raise a slightly different error message. Because right now, if people pass a dictionary encoding here, you get "Unsupported column encoding", while it's not actually unsupported. Only unsupported through this keyword, since it's already used by default. So the current error message might be quite confusing for users when they do that. ########## File path: python/pyarrow/tests/parquet/test_basic.py ########## @@ -352,6 +352,92 @@ def test_byte_stream_split(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) +@parametrize_legacy_dataset +def test_column_encoding(use_legacy_dataset): + arr_float = pa.array(list(map(float, range(100)))) + arr_int = pa.array(list(map(int, range(100)))) + mixed_table = pa.Table.from_arrays([arr_float, arr_int], + names=['a', 'b']) + + # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for + # column 'b'. 
+ _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, + column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"}, + use_legacy_dataset=use_legacy_dataset) + + # Check "PLAIN" for all columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding="PLAIN", + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. + # This should throw an error as it is only supports FLOAT and DOUBLE. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "DELTA_BINARY_PACKED". + # This should throw an error as it is only supported for reading. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass unsupported encoding. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'a': "MADE_UP_ENCODING"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary. + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=['b'], + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary=True (default value). + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_byte_stream_split on same column. + # This should throw an error. 
+ with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + use_byte_stream_split=['a'], + column_encoding={'a': "RLE", + 'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_byte_stream_split=True. + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + use_byte_stream_split=True, + column_encoding={'a': "RLE", + 'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding=True. + # This should throw an error. + with pytest.raises(TypeError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding=True, + use_legacy_dataset=use_legacy_dataset) + Review comment: Related to the comment above, I would also add one more test case specifying a dictionary encoding. Since that's the case where we ran into segfaults initially, it would be good to ensure this doesn't happen at the moment. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org