rdblue commented on a change in pull request #4263: URL: https://github.com/apache/iceberg/pull/4263#discussion_r832801948
########## File path: python/tests/test_conversions.py ########## @@ -0,0 +1,422 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""This test module tests PrimitiveType based conversions of values to/from bytes + +Notes: + Boolean: + - Stored as 0x00 for False and non-zero byte for True + Integer: + - Stored as 4 bytes in little-endian order + - 84202 is 0...01|01001000|11101010 in binary + Long: + - Stored as 8 bytes in little-endian order + - 200L is 0...0|11001000 in binary + - 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0 + Double: + - Stored as 8 bytes in little-endian order + - floating point numbers are represented as sign * 2ˆexponent * mantissa + - 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0 + - 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64 + Date: + - Stored as days from 1970-01-01 in a 4-byte little-endian int + - 1000 is 0...0|00000011|11101000 in binary + - 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0 + Time: + - Stored as microseconds from midnight in an 8-byte little-endian long + - 10000L is 0...0|00100111|00010000 in binary + - 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0 + Timestamp: + - Stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long + - 400000L is 0...110|00011010|10000000 in binary + - 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0 + String: + - Stored as UTF-8 bytes (without length) + - 'A' -> 65, 'B' -> 66, 'C' -> 67 + UUID: + - Stored as 16-byte big-endian values + - f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7 + - 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62, + - 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124, + - 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92), + - 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52, + - 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123), + - 0xE7 -> 11100111 -> 231 (-25) + Fixed: + - Stored directly + - 'a' -> 97, 'b' -> 98 + Binary: + - Stored directly + - 'Z' -> 90 + Decimal: + - Stored as unscaled values in the form of two's-complement big-endian binary using the minimum number of bytes for the values + - 345 is 0...1|01011001 in binary + - 00000001 -> 1, 01011001 -> 89 + Float: + - Stored as 4 bytes in little-endian order + - floating point numbers are represented as sign * 2ˆexponent * mantissa + - -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary + - 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64), +""" +import uuid +from decimal import Decimal + +import pytest + +from iceberg import conversions +from iceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IntegerType, + LongType, + StringType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) + + [email protected]( + "primitive_type, partition_value_as_str, expected_result", + [ + (BooleanType(), "true", True), + (BooleanType(), "false", False), + (BooleanType(), "TRUE", True), + (BooleanType(), "FALSE", False), + (BooleanType(), None, False), + (IntegerType(), "1", 1), + (IntegerType(), "9999", 9999), + (LongType(), "123456789", 123456789), + (FloatType(), "1.1", 1.1), + (DoubleType(), "99999.9", 99999.9), + (DecimalType(5, 2), "123.45", Decimal("123.45")), + (StringType(), "foo", "foo"), + (UUIDType(), "f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), + (FixedType(3), "foo", bytearray(b"foo")), + (BinaryType(), "foo", b"foo"), + (None, None, None), + ], +) +def test_from_partition_value_to_py(primitive_type, partition_value_as_str, expected_result): + """Test converting a partition value to a python built-in""" + assert conversions.from_partition_value_to_py(primitive_type, partition_value_as_str) == expected_result + + [email protected]( + "primitive_type, b, result", + [ + (BooleanType(), b"\x00", False), + (BooleanType(), b"\x01", True), + (IntegerType(), b"\xd2\x04\x00\x00", 1234), + (LongType(), b"\xd2\x04\x00\x00\x00\x00\x00\x00", 1234), + (DoubleType(), b"\x8d\x97\x6e\x12\x83\xc0\xf3\x3f", 1.2345), + (DateType(), bytes([232, 3, 0, 0]), 1000), + (DateType(), b"\xd2\x04\x00\x00", 1234), + (TimeType(), bytes([16, 39, 0, 0, 0, 0, 0, 0]), 10000), + (TimeType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (TimestamptzType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000), + (TimestamptzType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (TimestampType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000), + (TimestampType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (StringType(), bytes([65, 66, 67]), "ABC"), + (StringType(), b"foo", "foo"), + ( + UUIDType(), + bytes([247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231]), + uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), + ), + (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), + (FixedType(3), bytes([97, 98]), bytes("ab", "utf8")), + (FixedType(3), b"foo", b"foo"), + (BinaryType(), bytearray("Z", "utf8"), bytes([90])), + (BinaryType(), b"foo", b"foo"), + (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")), + (DecimalType(7, 4), b"\x00\x12\xd6\x87", Decimal("123.4567")), + (DecimalType(7, 4), b"\xff\xed\x29\x79", Decimal("-123.4567")), + ], +) +def test_from_bytes(primitive_type, b, result): + """Test converting from bytes""" + assert conversions.from_bytes(primitive_type, b) == result + + [email protected]( + "primitive_type, b, approximate_result, approximation", + [ + (FloatType(), b"\x19\x04\x9e?", 1.2345, 5), + ], +) +def test_from_bytes_approximately(primitive_type, b, approximate_result, approximation): + """Test approximate equality when converting from bytes + + Note: This tests approximate equality because floating point numbers in python are implemented + using a double in C. Therefore a 32-bit (single precision) float is unpacked to 64-bit (double precision) + float in python which introduces some imprecision. + """ + assert conversions.from_bytes(primitive_type, b) == pytest.approx(approximate_result, approximation) Review comment: I don't think this should be an approximate result. Instead, can you send the expected result through pack/unpack to produce the same 32-bit value? ########## File path: python/tests/test_conversions.py ########## @@ -0,0 +1,422 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""This test module tests PrimitiveType based conversions of values to/from bytes + +Notes: + Boolean: + - Stored as 0x00 for False and non-zero byte for True + Integer: + - Stored as 4 bytes in little-endian order + - 84202 is 0...01|01001000|11101010 in binary + Long: + - Stored as 8 bytes in little-endian order + - 200L is 0...0|11001000 in binary + - 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0 + Double: + - Stored as 8 bytes in little-endian order + - floating point numbers are represented as sign * 2ˆexponent * mantissa + - 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0 + - 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64 + Date: + - Stored as days from 1970-01-01 in a 4-byte little-endian int + - 1000 is 0...0|00000011|11101000 in binary + - 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0 + Time: + - Stored as microseconds from midnight in an 8-byte little-endian long + - 10000L is 0...0|00100111|00010000 in binary + - 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0 + Timestamp: + - Stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long + - 400000L is 0...110|00011010|10000000 in binary + - 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0 + String: + - Stored as UTF-8 bytes (without length) + - 'A' -> 65, 'B' -> 66, 'C' -> 67 + UUID: + - Stored as 16-byte big-endian values + - f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7 + - 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62, + - 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124, + - 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92), + - 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52, + - 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123), + - 0xE7 -> 11100111 -> 231 (-25) + Fixed: + - Stored directly + - 'a' -> 97, 'b' -> 98 + Binary: + - Stored directly + - 'Z' -> 90 + Decimal: + - Stored as unscaled values in the form of two's-complement big-endian binary using the minimum number of bytes for the values + - 345 is 0...1|01011001 in binary + - 00000001 -> 1, 01011001 -> 89 + Float: + - Stored as 4 bytes in little-endian order + - floating point numbers are represented as sign * 2ˆexponent * mantissa + - -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary + - 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64), +""" +import uuid +from decimal import Decimal + +import pytest + +from iceberg import conversions +from iceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IntegerType, + LongType, + StringType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) + + [email protected]( + "primitive_type, partition_value_as_str, expected_result", + [ + (BooleanType(), "true", True), + (BooleanType(), "false", False), + (BooleanType(), "TRUE", True), + (BooleanType(), "FALSE", False), + (BooleanType(), None, False), + (IntegerType(), "1", 1), + (IntegerType(), "9999", 9999), + (LongType(), "123456789", 123456789), + (FloatType(), "1.1", 1.1), + (DoubleType(), "99999.9", 99999.9), + (DecimalType(5, 2), "123.45", Decimal("123.45")), + (StringType(), "foo", "foo"), + (UUIDType(), "f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), + (FixedType(3), "foo", bytearray(b"foo")), + (BinaryType(), "foo", b"foo"), + (None, None, None), + ], +) +def test_from_partition_value_to_py(primitive_type, partition_value_as_str, expected_result): + """Test converting a partition value to a python built-in""" + assert conversions.from_partition_value_to_py(primitive_type, partition_value_as_str) == expected_result + + [email protected]( + "primitive_type, b, result", + [ + (BooleanType(), b"\x00", False), + (BooleanType(), b"\x01", True), + (IntegerType(), b"\xd2\x04\x00\x00", 1234), + (LongType(), b"\xd2\x04\x00\x00\x00\x00\x00\x00", 1234), + (DoubleType(), b"\x8d\x97\x6e\x12\x83\xc0\xf3\x3f", 1.2345), + (DateType(), bytes([232, 3, 0, 0]), 1000), + (DateType(), b"\xd2\x04\x00\x00", 1234), + (TimeType(), bytes([16, 39, 0, 0, 0, 0, 0, 0]), 10000), + (TimeType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (TimestamptzType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000), + (TimestamptzType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (TimestampType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000), + (TimestampType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000), + (StringType(), bytes([65, 66, 67]), "ABC"), + (StringType(), b"foo", "foo"), + ( + UUIDType(), + bytes([247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231]), + uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), + ), + (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")), + (FixedType(3), bytes([97, 98]), bytes("ab", "utf8")), + (FixedType(3), b"foo", b"foo"), + (BinaryType(), bytearray("Z", "utf8"), bytes([90])), + (BinaryType(), b"foo", b"foo"), + (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")), + (DecimalType(7, 4), b"\x00\x12\xd6\x87", Decimal("123.4567")), + (DecimalType(7, 4), b"\xff\xed\x29\x79", Decimal("-123.4567")), + ], +) +def test_from_bytes(primitive_type, b, result): + """Test converting from bytes""" + assert conversions.from_bytes(primitive_type, b) == result + + [email protected]( + "primitive_type, b, approximate_result, approximation", + [ + (FloatType(), b"\x19\x04\x9e?", 1.2345, 5), + ], +) +def test_from_bytes_approximately(primitive_type, b, approximate_result, approximation): + """Test approximate equality when converting from bytes + + Note: This tests approximate equality because floating point numbers in python are implemented + using a double in C. Therefore a 32-bit (single precision) float is unpacked to 64-bit (double precision) + float in python which introduces some imprecision. + """ + assert conversions.from_bytes(primitive_type, b) == pytest.approx(approximate_result, approximation) Review comment: I don't think this should be an approximate result. Instead, can you send the expected result through pack/unpack to produce the same 32-bit cast to 64-bit value? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
