jorisvandenbossche commented on a change in pull request #11724: URL: https://github.com/apache/arrow/pull/11724#discussion_r757051173
########## File path: python/pyarrow/tests/parquet/test_basic.py ########## @@ -352,6 +352,92 @@ def test_byte_stream_split(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) +@parametrize_legacy_dataset +def test_column_encoding(use_legacy_dataset): + arr_float = pa.array(list(map(float, range(100)))) + arr_int = pa.array(list(map(int, range(100)))) + mixed_table = pa.Table.from_arrays([arr_float, arr_int], + names=['a', 'b']) + + # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for + # column 'b'. + _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, + column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"}, + use_legacy_dataset=use_legacy_dataset) + + # Check "PLAIN" for all columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding="PLAIN", + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. + # This should throw an error as it is only supports FLOAT and DOUBLE. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "DELTA_BINARY_PACKED". + # This should throw an error as it is only supported for reading. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass unsupported encoding. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'a': "MADE_UP_ENCODING"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary. + # This should throw an error. 
+ with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=['b'], + column_encoding={'b': "DELTA_BINARY_PACKED"}, Review comment: Just to be sure we are not catching the wrong error, can you use an encoding that would already work? e.g. PLAIN ########## File path: python/pyarrow/_parquet.pyx ########## @@ -880,6 +880,21 @@ cdef encoding_name_from_enum(ParquetEncoding encoding_): }.get(encoding_, 'UNKNOWN') +cdef encoding_enum_from_name(str encoding_name): + enc = { + 'PLAIN': ParquetEncoding_PLAIN, + 'BIT_PACKED': ParquetEncoding_BIT_PACKED, + 'RLE': ParquetEncoding_RLE, + 'BYTE_STREAM_SPLIT': ParquetEncoding_BYTE_STREAM_SPLIT, + 'DELTA_BINARY_PACKED': ParquetEncoding_DELTA_BINARY_PACKED, + 'DELTA_BYTE_ARRAY': ParquetEncoding_DELTA_BYTE_ARRAY, + }.get(encoding_name, None) + if enc is None: + raise ValueError(f"Unsupported column encoding: {encoding_name!r}") Review comment: I think we should maybe still check for dictionary encoding specifically, and raise a slightly different error message. Because right now, if people pass a dictionary encoding here, you get "Unsupported column encoding", while it's not actually unsupported. Only unsupported through this keyword, since it's already used by default. So the current error message might be quite confusing for users when they do that. ########## File path: python/pyarrow/tests/parquet/test_basic.py ########## @@ -352,6 +352,92 @@ def test_byte_stream_split(use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) +@parametrize_legacy_dataset +def test_column_encoding(use_legacy_dataset): + arr_float = pa.array(list(map(float, range(100)))) + arr_int = pa.array(list(map(int, range(100)))) + mixed_table = pa.Table.from_arrays([arr_float, arr_int], + names=['a', 'b']) + + # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for + # column 'b'. 
+ _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, + column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"}, + use_legacy_dataset=use_legacy_dataset) + + # Check "PLAIN" for all columns. + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding="PLAIN", + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. + # This should throw an error as it is only supports FLOAT and DOUBLE. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass "DELTA_BINARY_PACKED". + # This should throw an error as it is only supported for reading. + with pytest.raises(IOError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass unsupported encoding. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding={'a': "MADE_UP_ENCODING"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary. + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=['b'], + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_dictionary=True (default value). + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + column_encoding={'b': "DELTA_BINARY_PACKED"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_byte_stream_split on same column. + # This should throw an error. 
+ with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + use_byte_stream_split=['a'], + column_encoding={'a': "RLE", + 'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding and use_byte_stream_split=True. + # This should throw an error. + with pytest.raises(ValueError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + use_byte_stream_split=True, + column_encoding={'a': "RLE", + 'b': "BYTE_STREAM_SPLIT"}, + use_legacy_dataset=use_legacy_dataset) + + # Try to pass column_encoding=True. + # This should throw an error. + with pytest.raises(TypeError): + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=False, + column_encoding=True, + use_legacy_dataset=use_legacy_dataset) + Review comment: Related to the comment above, I would also add one more test case specifying a dictionary encoding. Since that's the case where we ran into segfaults initially, it would be good to ensure this doesn't happen at the moment. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org