rdblue commented on a change in pull request #4263:
URL: https://github.com/apache/iceberg/pull/4263#discussion_r832789096



##########
File path: python/tests/test_conversions.py
##########
@@ -0,0 +1,422 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""This test module tests PrimitiveType based conversions of values to/from 
bytes
+
+Notes:
+    Boolean:
+        - Stored as 0x00 for False and non-zero byte for True
+    Integer:
+        - Stored as 4 bytes in little-endian order
+        - 84202 is 0...01|01001000|11101010 in binary
+    Long:
+        - Stored as 8 bytes in little-endian order
+        - 200L is 0...0|11001000 in binary
+        - 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
+    Double:
+        - Stored as 8 bytes in little-endian order
+        - floating point numbers are represented as sign * 2ˆexponent * 
mantissa
+        - 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0
+        - 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
+    Date:
+        - Stored as days from 1970-01-01 in a 4-byte little-endian int
+        - 1000 is 0...0|00000011|11101000 in binary
+        - 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
+    Time:
+        - Stored as microseconds from midnight in an 8-byte little-endian long
+        - 10000L is 0...0|00100111|00010000 in binary
+        - 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
+    Timestamp:
+        - Stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte 
little-endian long
+        - 400000L is 0...110|00011010|10000000 in binary
+        - 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 
00000000 -> 0
+    String:
+        - Stored as UTF-8 bytes (without length)
+        - 'A' -> 65, 'B' -> 66, 'C' -> 67
+    UUID:
+        - Stored as 16-byte big-endian values
+        - f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 
4B BD A4 79 3F 34 9C B7 85 E7
+        - 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E 
-> 00111110 -> 62,
+        - 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 
124,
+        - 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 
10100100 -> 164 (-92),
+        - 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 
52,
+        - 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 
-> 10000101 -> 133 (-123),
+        - 0xE7 -> 11100111 -> 231 (-25)
+    Fixed:
+        - Stored directly
+        - 'a' -> 97, 'b' -> 98
+    Binary:
+        - Stored directly
+        - 'Z' -> 90
+    Decimal:
+        - Stored as unscaled values in the form of two's-complement big-endian 
binary using the minimum number of bytes for the values
+        - 345 is 0...1|01011001 in binary
+        - 00000001 -> 1, 01011001 -> 89
+    Float:
+        - Stored as 4 bytes in little-endian order
+        - floating point numbers are represented as sign * 2ˆexponent * 
mantissa
+        - -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in 
binary
+        - 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 
192 (-64),
+"""
+import uuid
+from decimal import Decimal
+
+import pytest
+
+from iceberg import conversions
+from iceberg.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FixedType,
+    FloatType,
+    IntegerType,
+    LongType,
+    StringType,
+    TimestampType,
+    TimestamptzType,
+    TimeType,
+    UUIDType,
+)
+
+
[email protected](
+    "primitive_type, partition_value_as_str, expected_result",
+    [
+        (BooleanType(), "true", True),
+        (BooleanType(), "false", False),
+        (BooleanType(), "TRUE", True),
+        (BooleanType(), "FALSE", False),
+        (BooleanType(), None, False),
+        (IntegerType(), "1", 1),
+        (IntegerType(), "9999", 9999),
+        (LongType(), "123456789", 123456789),
+        (FloatType(), "1.1", 1.1),
+        (DoubleType(), "99999.9", 99999.9),
+        (DecimalType(5, 2), "123.45", Decimal("123.45")),
+        (StringType(), "foo", "foo"),
+        (UUIDType(), "f79c3e09-677c-4bbd-a479-3f349cb785e7", 
uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),
+        (FixedType(3), "foo", bytearray(b"foo")),
+        (BinaryType(), "foo", b"foo"),
+        (None, None, None),
+    ],
+)
+def test_from_partition_value_to_py(primitive_type, partition_value_as_str, 
expected_result):
+    """Test converting a partition value to a python built-in"""
+    assert conversions.from_partition_value_to_py(primitive_type, 
partition_value_as_str) == expected_result
+
+
[email protected](
+    "primitive_type, b, result",
+    [
+        (BooleanType(), b"\x00", False),
+        (BooleanType(), b"\x01", True),
+        (IntegerType(), b"\xd2\x04\x00\x00", 1234),
+        (LongType(), b"\xd2\x04\x00\x00\x00\x00\x00\x00", 1234),
+        (DoubleType(), b"\x8d\x97\x6e\x12\x83\xc0\xf3\x3f", 1.2345),
+        (DateType(), bytes([232, 3, 0, 0]), 1000),
+        (DateType(), b"\xd2\x04\x00\x00", 1234),
+        (TimeType(), bytes([16, 39, 0, 0, 0, 0, 0, 0]), 10000),
+        (TimeType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000),
+        (TimestamptzType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000),
+        (TimestamptzType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000),
+        (TimestampType(), bytes([128, 26, 6, 0, 0, 0, 0, 0]), 400000),
+        (TimestampType(), b"\x00\xe8vH\x17\x00\x00\x00", 100000000000),
+        (StringType(), bytes([65, 66, 67]), "ABC"),
+        (StringType(), b"foo", "foo"),
+        (
+            UUIDType(),
+            bytes([247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 
183, 133, 231]),
+            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
+        ),
+        (UUIDType(), b"\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7", 
uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),
+        (FixedType(3), bytes([97, 98]), bytes("ab", "utf8")),
+        (FixedType(3), b"foo", b"foo"),
+        (BinaryType(), bytearray("Z", "utf8"), bytes([90])),
+        (BinaryType(), b"foo", b"foo"),
+        (DecimalType(5, 2), b"\x30\x39", Decimal("123.45")),
+        (DecimalType(7, 4), b"\x00\x12\xd6\x87", Decimal("123.4567")),

Review comment:
       I don't think this is correct. The number of bytes should be 3, not 4, 
because null bytes are omitted.
   
   The calculation for min bytes is:
   ```python
   min_num_bytes = ((unscaled_value).bit_length() + 7) // 8
   ```
   
   That translates to:
   ```python
   ((1234567).bit_length() + 7) // 8
   => 3
   ```
   
   So the calculation looks correct. I'm wondering why the byte array ends up 
with the extra null byte at the start.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to