This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 427d23b7ba Python: Add more tests for the Avro writer (#8067)
427d23b7ba is described below
commit 427d23b7ba4d45355e6f7912737efa99a89098d7
Author: Fokko Driesprong <[email protected]>
AuthorDate: Mon Aug 7 21:27:19 2023 +0200
Python: Add more tests for the Avro writer (#8067)
* Python: Add more tests for the Avro writer
* Fix the tests
* WIP
* Update
* Update python/pyiceberg/utils/datetime.py
* Update python/pyiceberg/utils/datetime.py
* Update python/tests/avro/test_encoder.py
* WIP
* Cleanup
* Cleanup
* Cleanup
* Cleanup
* Fix the tests
---
python/dev/provision.py | 42 ++++++++++
python/pyiceberg/avro/decoder.py | 69 +---------------
python/pyiceberg/avro/encoder.py | 120 +++-------------------------
python/pyiceberg/avro/file.py | 2 +-
python/pyiceberg/avro/reader.py | 63 ++++++++++++---
python/pyiceberg/avro/writer.py | 29 +++----
python/pyiceberg/conversions.py | 4 +-
python/pyiceberg/expressions/literals.py | 4 +-
python/pyiceberg/utils/datetime.py | 11 ++-
python/pyiceberg/utils/decimal.py | 43 +++++++++-
python/pyiceberg/utils/schema_conversion.py | 17 ++--
python/tests/avro/test_decoder.py | 65 ---------------
python/tests/avro/test_encoder.py | 87 ++------------------
python/tests/avro/test_file.py | 94 ++++++++++++++++++++++
python/tests/avro/test_writer.py | 17 ++++
python/tests/test_integration.py | 16 ++++
python/tests/test_schema.py | 47 +++++++++++
python/tests/test_transforms.py | 4 +-
python/tests/utils/test_decimal.py | 40 ++++++++++
19 files changed, 405 insertions(+), 369 deletions(-)
diff --git a/python/dev/provision.py b/python/dev/provision.py
index 73ec34fdc1..f62687b746 100644
--- a/python/dev/provision.py
+++ b/python/dev/provision.py
@@ -185,3 +185,45 @@ all_types_dataframe = (
all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version",
"2").partitionedBy(
"intCol"
).createOrReplace()
+
+for table_name, partition in [
+ ("test_partitioned_by_identity", "ts"),
+ ("test_partitioned_by_years", "years(dt)"),
+ ("test_partitioned_by_months", "months(dt)"),
+ ("test_partitioned_by_days", "days(ts)"),
+ ("test_partitioned_by_hours", "hours(ts)"),
+ ("test_partitioned_by_truncate", "truncate(1, letter)"),
+ ("test_partitioned_by_bucket", "bucket(16, number)"),
+]:
+ spark.sql(
+ f"""
+ CREATE OR REPLACE TABLE default.{table_name} (
+ dt date,
+ ts timestamp,
+ number integer,
+ letter string
+ )
+ USING iceberg;
+ """
+ )
+
+ spark.sql(f"ALTER TABLE default.{table_name} ADD PARTITION FIELD
{partition}")
+
+ spark.sql(
+ f"""
+ INSERT INTO default.{table_name}
+ VALUES
+ (CAST('2022-03-01' AS date), CAST('2022-03-01 01:22:00' AS timestamp),
1, 'a'),
+ (CAST('2022-03-02' AS date), CAST('2022-03-02 02:22:00' AS timestamp),
2, 'b'),
+ (CAST('2022-03-03' AS date), CAST('2022-03-03 03:22:00' AS timestamp),
3, 'c'),
+ (CAST('2022-03-04' AS date), CAST('2022-03-04 04:22:00' AS timestamp),
4, 'd'),
+ (CAST('2023-03-05' AS date), CAST('2023-03-05 05:22:00' AS timestamp),
5, 'e'),
+ (CAST('2023-03-06' AS date), CAST('2023-03-06 06:22:00' AS timestamp),
6, 'f'),
+ (CAST('2023-03-07' AS date), CAST('2023-03-07 07:22:00' AS timestamp),
7, 'g'),
+ (CAST('2023-03-08' AS date), CAST('2023-03-08 08:22:00' AS timestamp),
8, 'h'),
+ (CAST('2023-03-09' AS date), CAST('2023-03-09 09:22:00' AS timestamp),
9, 'i'),
+ (CAST('2023-03-10' AS date), CAST('2023-03-10 10:22:00' AS timestamp),
10, 'j'),
+ (CAST('2023-03-11' AS date), CAST('2023-03-11 11:22:00' AS timestamp),
11, 'k'),
+ (CAST('2023-03-12' AS date), CAST('2023-03-12 12:22:00' AS timestamp),
12, 'l');
+ """
+ )
diff --git a/python/pyiceberg/avro/decoder.py b/python/pyiceberg/avro/decoder.py
index 35a1651192..0b39340b0e 100644
--- a/python/pyiceberg/avro/decoder.py
+++ b/python/pyiceberg/avro/decoder.py
@@ -14,21 +14,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import decimal
from abc import ABC, abstractmethod
-from datetime import datetime, time
from io import SEEK_CUR
from typing import Dict, List
-from uuid import UUID
from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
from pyiceberg.io import InputStream
-from pyiceberg.utils.datetime import micros_to_time, micros_to_timestamp,
micros_to_timestamptz
-from pyiceberg.utils.decimal import unscaled_to_decimal
class BinaryDecoder(ABC):
- """Read leaf values."""
+ """Decodes bytes into Python physical primitives."""
@abstractmethod
def __init__(self, input_stream: InputStream) -> None:
@@ -106,77 +101,19 @@ class BinaryDecoder(ABC):
"""
return float(STRUCT_DOUBLE.unpack(self.read(8))[0])
- def read_decimal_from_bytes(self, precision: int, scale: int) ->
decimal.Decimal:
- """Reads a value from the stream as a decimal.
-
- Decimal bytes are decoded as signed short, int or long depending on the
- size of bytes.
- """
- size = self.read_int()
- return self.read_decimal_from_fixed(precision, scale, size)
-
- def read_decimal_from_fixed(self, _: int, scale: int, size: int) ->
decimal.Decimal:
- """Reads a value from the stream as a decimal.
-
- Decimal is encoded as fixed. Fixed instances are encoded using the
- number of bytes declared in the schema.
- """
- data = self.read(size)
- unscaled_datum = int.from_bytes(data, byteorder="big", signed=True)
- return unscaled_to_decimal(unscaled_datum, scale)
-
def read_bytes(self) -> bytes:
"""Bytes are encoded as a long followed by that many bytes of data."""
num_bytes = self.read_int()
return self.read(num_bytes) if num_bytes > 0 else b""
def read_utf8(self) -> str:
- """Reads a utf-8 encoded string from the stream.
+ """Reads a utf-8 encoded string from the stream.
A string is encoded as a long followed by
that many bytes of UTF-8 encoded character data.
"""
return self.read_bytes().decode("utf-8")
- def read_uuid_from_fixed(self) -> UUID:
- """Reads a UUID as a fixed[16]."""
- return UUID(bytes=self.read(16))
-
- def read_time_millis(self) -> time:
- """Reads a milliseconds granularity time from the stream.
-
- Int is decoded as python time object which represents
- the number of milliseconds after midnight, 00:00:00.000.
- """
- millis = self.read_int()
- return micros_to_time(millis * 1000)
-
- def read_time_micros(self) -> time:
- """Reads a microseconds granularity time from the stream.
-
- Long is decoded as python time object which represents
- the number of microseconds after midnight, 00:00:00.000000.
- """
- return micros_to_time(self.read_int())
-
- def read_timestamp_micros(self) -> datetime:
- """Reads a microsecond granularity timestamp from the stream.
-
- Long is decoded as python datetime object which represents
- the number of microseconds from the unix epoch, 1 January 1970.
- """
- return micros_to_timestamp(self.read_int())
-
- def read_timestamptz_micros(self) -> datetime:
- """Reads a microsecond granularity timestamptz from the stream.
-
- Long is decoded as python datetime object which represents
- the number of microseconds from the unix epoch, 1 January 1970.
-
- Adjusted to UTC.
- """
- return micros_to_timestamptz(self.read_int())
-
def skip_boolean(self) -> None:
self.skip(1)
@@ -199,7 +136,7 @@ class BinaryDecoder(ABC):
class StreamingBinaryDecoder(BinaryDecoder):
- """Read leaf values."""
+ """Decodes bytes into Python physical primitives."""
__slots__ = "_input_stream"
_input_stream: InputStream
diff --git a/python/pyiceberg/avro/encoder.py b/python/pyiceberg/avro/encoder.py
index cf6d601233..a25e4bb66d 100644
--- a/python/pyiceberg/avro/encoder.py
+++ b/python/pyiceberg/avro/encoder.py
@@ -14,17 +14,14 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import decimal
-import struct
-from datetime import date, datetime, time
+from uuid import UUID
from pyiceberg.avro import STRUCT_DOUBLE, STRUCT_FLOAT
from pyiceberg.io import OutputStream
-from pyiceberg.utils.datetime import date_to_days, datetime_to_micros,
time_object_to_micros
class BinaryEncoder:
- """Write leaf values."""
+ """Encodes Python physical types into bytes."""
_output_stream: OutputStream
@@ -58,118 +55,21 @@ class BinaryEncoder:
"""A double is written as 8 bytes."""
self.write(STRUCT_DOUBLE.pack(f))
- def write_decimal_bytes(self, datum: decimal.Decimal) -> None:
- """
- Decimal in bytes are encoded as long.
-
- Since size of packed value in bytes for signed long is 8, 8 bytes are
written.
- """
- sign, digits, _ = datum.as_tuple()
-
- unscaled_datum = 0
- for digit in digits:
- unscaled_datum = (unscaled_datum * 10) + digit
-
- bits_req = unscaled_datum.bit_length() + 1
- if sign:
- unscaled_datum = (1 << bits_req) - unscaled_datum
-
- bytes_req = bits_req // 8
- padding_bits = ~((1 << bits_req) - 1) if sign else 0
- packed_bits = padding_bits | unscaled_datum
-
- bytes_req += 1 if (bytes_req << 3) < bits_req else 0
- self.write_int(bytes_req)
- for index in range(bytes_req - 1, -1, -1):
- bits_to_write = packed_bits >> (8 * index)
- self.write(bytearray([bits_to_write & 0xFF]))
-
- def write_decimal_fixed(self, datum: decimal.Decimal, size: int) -> None:
- """Decimal in fixed are encoded as size of fixed bytes."""
- sign, digits, _ = datum.as_tuple()
-
- unscaled_datum = 0
- for digit in digits:
- unscaled_datum = (unscaled_datum * 10) + digit
-
- bits_req = unscaled_datum.bit_length() + 1
- size_in_bits = size * 8
- offset_bits = size_in_bits - bits_req
-
- mask = 2**size_in_bits - 1
- bit = 1
- for _ in range(bits_req):
- mask ^= bit
- bit <<= 1
-
- if bits_req < 8:
- bytes_req = 1
- else:
- bytes_req = bits_req // 8
- if bits_req % 8 != 0:
- bytes_req += 1
- if sign:
- unscaled_datum = (1 << bits_req) - unscaled_datum
- unscaled_datum = mask | unscaled_datum
- for index in range(size - 1, -1, -1):
- bits_to_write = unscaled_datum >> (8 * index)
- self.write(bytearray([bits_to_write & 0xFF]))
- else:
- for _ in range(offset_bits // 8):
- self.write(b"\x00")
- for index in range(bytes_req - 1, -1, -1):
- bits_to_write = unscaled_datum >> (8 * index)
- self.write(bytearray([bits_to_write & 0xFF]))
-
def write_bytes(self, b: bytes) -> None:
"""Bytes are encoded as a long followed by that many bytes of data."""
self.write_int(len(b))
- self.write(struct.pack(f"{len(b)}s", b))
-
- def write_bytes_fixed(self, b: bytes) -> None:
- """Writes fixed number of bytes."""
- self.write(struct.pack(f"{len(b)}s", b))
+ self.write(b)
def write_utf8(self, s: str) -> None:
"""A string is encoded as a long followed by that many bytes of UTF-8
encoded character data."""
self.write_bytes(s.encode("utf-8"))
- def write_date_int(self, d: date) -> None:
- """
- Encode python date object as int.
-
- It stores the number of days from the unix epoch, 1 January 1970 (ISO
calendar).
- """
- self.write_int(date_to_days(d))
-
- def write_time_millis_int(self, dt: time) -> None:
- """
- Encode python time object as int.
-
- It stores the number of milliseconds from midnight, 00:00:00.000
- """
- self.write_int(int(time_object_to_micros(dt) / 1000))
-
- def write_time_micros_long(self, dt: time) -> None:
- """
- Encode python time object as long.
-
- It stores the number of microseconds from midnight, 00:00:00.000000
- """
- self.write_int(time_object_to_micros(dt))
-
- def write_timestamp_millis_long(self, dt: datetime) -> None:
- """
- Encode python datetime object as long.
-
- It stores the number of milliseconds from midnight of unix epoch, 1
January 1970.
- """
- self.write_int(int(datetime_to_micros(dt) / 1000))
-
- def write_timestamp_micros_long(self, dt: datetime) -> None:
- """
- Encode python datetime object as long.
+ def write_uuid(self, uuid: UUID) -> None:
+ """Write UUID as a fixed[16].
- It stores the number of microseconds from midnight of unix epoch, 1
January 1970.
+ The uuid logical type represents a randomly generated universally unique
identifier (UUID).
+ A uuid logical type annotates an Avro string. The string has to
conform with RFC-4122.
"""
- self.write_int(datetime_to_micros(dt))
+ if len(uuid.bytes) != 16:
+ raise ValueError(f"Expected UUID to have 16 bytes, got:
{len(uuid.bytes)}")
+ return self.write(uuid.bytes)
diff --git a/python/pyiceberg/avro/file.py b/python/pyiceberg/avro/file.py
index f780a8a30c..3b2f62045e 100644
--- a/python/pyiceberg/avro/file.py
+++ b/python/pyiceberg/avro/file.py
@@ -275,4 +275,4 @@ class AvroOutputFile(Generic[D]):
self.encoder.write_int(len(objects))
self.encoder.write_int(len(block_content))
self.encoder.write(block_content)
- self.encoder.write_bytes_fixed(self.sync_bytes)
+ self.encoder.write(self.sync_bytes)
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index ba9456e892..8a8f319a90 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -28,7 +28,6 @@ from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from dataclasses import field as dataclassfield
-from datetime import datetime, time
from decimal import Decimal
from typing import (
Any,
@@ -43,6 +42,7 @@ from uuid import UUID
from pyiceberg.avro.decoder import BinaryDecoder
from pyiceberg.typedef import StructProtocol
from pyiceberg.types import StructType
+from pyiceberg.utils.decimal import bytes_to_decimal, decimal_required_bytes
from pyiceberg.utils.singleton import Singleton
@@ -153,6 +153,11 @@ class DoubleReader(Reader):
class DateReader(Reader):
+ """Reads a day granularity date from the stream.
+
+ The number of days from 1 January 1970.
+ """
+
def read(self, decoder: BinaryDecoder) -> int:
return decoder.read_int()
@@ -161,24 +166,44 @@ class DateReader(Reader):
class TimeReader(Reader):
- def read(self, decoder: BinaryDecoder) -> time:
- return decoder.read_time_micros()
+ """Reads a microsecond granularity timestamp from the stream.
+
+ Long is decoded as an integer which represents
+ the number of microseconds from the unix epoch, 1 January 1970.
+ """
+
+ def read(self, decoder: BinaryDecoder) -> int:
+ return decoder.read_int()
def skip(self, decoder: BinaryDecoder) -> None:
decoder.skip_int()
class TimestampReader(Reader):
- def read(self, decoder: BinaryDecoder) -> datetime:
- return decoder.read_timestamp_micros()
+ """Reads a microsecond granularity timestamp from the stream.
+
+ Long is decoded as python integer which represents
+ the number of microseconds from the unix epoch, 1 January 1970.
+ """
+
+ def read(self, decoder: BinaryDecoder) -> int:
+ return decoder.read_int()
def skip(self, decoder: BinaryDecoder) -> None:
decoder.skip_int()
class TimestamptzReader(Reader):
- def read(self, decoder: BinaryDecoder) -> datetime:
- return decoder.read_timestamptz_micros()
+ """Reads a microsecond granularity timestamptz from the stream.
+
+ Long is decoded as python integer which represents
+ the number of microseconds from the unix epoch, 1 January 1970.
+
+ Adjusted to UTC.
+ """
+
+ def read(self, decoder: BinaryDecoder) -> int:
+ return decoder.read_int()
def skip(self, decoder: BinaryDecoder) -> None:
decoder.skip_int()
@@ -194,7 +219,7 @@ class StringReader(Reader):
class UUIDReader(Reader):
def read(self, decoder: BinaryDecoder) -> UUID:
- return decoder.read_uuid_from_fixed()
+ return UUID(bytes=decoder.read(16))
def skip(self, decoder: BinaryDecoder) -> None:
decoder.skip(16)
@@ -220,6 +245,12 @@ class FixedReader(Reader):
class BinaryReader(Reader):
+ """Read a binary value.
+
+ First reads an integer, to get the length of the binary value,
+ then reads the binary field itself.
+ """
+
def read(self, decoder: BinaryDecoder) -> bytes:
return decoder.read_bytes()
@@ -227,13 +258,25 @@ class BinaryReader(Reader):
decoder.skip_bytes()
-@dataclass(frozen=True)
+@dataclass(frozen=True, init=False)
class DecimalReader(Reader):
+ """Reads a value as a decimal.
+
+ Decimal bytes are decoded as signed short, int or long depending on the
+ size of bytes.
+ """
+
precision: int = dataclassfield()
scale: int = dataclassfield()
+ _length: int
+
+ def __init__(self, precision: int, scale: int):
+ object.__setattr__(self, "precision", precision)
+ object.__setattr__(self, "scale", scale)
+ object.__setattr__(self, "_length", decimal_required_bytes(precision))
def read(self, decoder: BinaryDecoder) -> Decimal:
- return decoder.read_decimal_from_bytes(self.precision, self.scale)
+ return bytes_to_decimal(decoder.read(self._length), self.scale)
def skip(self, decoder: BinaryDecoder) -> None:
decoder.skip_bytes()
diff --git a/python/pyiceberg/avro/writer.py b/python/pyiceberg/avro/writer.py
index 8ac44036e1..2a1dcd5804 100644
--- a/python/pyiceberg/avro/writer.py
+++ b/python/pyiceberg/avro/writer.py
@@ -25,7 +25,6 @@ from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
from dataclasses import field as dataclassfield
-from datetime import datetime, time
from typing import (
Any,
Dict,
@@ -36,6 +35,7 @@ from uuid import UUID
from pyiceberg.avro.encoder import BinaryEncoder
from pyiceberg.types import StructType
+from pyiceberg.utils.decimal import decimal_required_bytes, decimal_to_bytes
from pyiceberg.utils.singleton import Singleton
@@ -77,23 +77,23 @@ class DoubleWriter(Writer):
class DateWriter(Writer):
- def write(self, encoder: BinaryEncoder, val: Any) -> None:
- encoder.write_date_int(val)
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
class TimeWriter(Writer):
- def write(self, encoder: BinaryEncoder, val: time) -> None:
- encoder.write_time_micros_long(val)
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
class TimestampWriter(Writer):
- def write(self, encoder: BinaryEncoder, val: datetime) -> None:
- encoder.write_timestamp_micros_long(val)
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
class TimestamptzWriter(Writer):
- def write(self, encoder: BinaryEncoder, val: datetime) -> None:
- encoder.write_timestamp_micros_long(val)
+ def write(self, encoder: BinaryEncoder, val: int) -> None:
+ encoder.write_int(val)
class StringWriter(Writer):
@@ -103,12 +103,7 @@ class StringWriter(Writer):
class UUIDWriter(Writer):
def write(self, encoder: BinaryEncoder, val: UUID) -> None:
- uuid_bytes = val.bytes
-
- if len(uuid_bytes) != 16:
- raise ValueError(f"Expected UUID to be 16 bytes, got:
{len(uuid_bytes)}")
-
- encoder.write_bytes_fixed(uuid_bytes)
+ encoder.write(val.bytes)
@dataclass(frozen=True)
@@ -116,6 +111,8 @@ class FixedWriter(Writer):
_len: int = dataclassfield()
def write(self, encoder: BinaryEncoder, val: bytes) -> None:
+ if len(val) != self._len:
+ raise ValueError(f"Expected {self._len} bytes, got {len(val)}")
encoder.write(val)
def __len__(self) -> int:
@@ -140,7 +137,7 @@ class DecimalWriter(Writer):
scale: int = dataclassfield()
def write(self, encoder: BinaryEncoder, val: Any) -> None:
- return encoder.write_decimal_bytes(val)
+ return encoder.write(decimal_to_bytes(val,
byte_length=decimal_required_bytes(self.precision)))
def __repr__(self) -> str:
"""Returns string representation of this object."""
diff --git a/python/pyiceberg/conversions.py b/python/pyiceberg/conversions.py
index ebf6cfa666..8f155fce3d 100644
--- a/python/pyiceberg/conversions.py
+++ b/python/pyiceberg/conversions.py
@@ -57,7 +57,7 @@ from pyiceberg.types import (
TimeType,
UUIDType,
)
-from pyiceberg.utils.datetime import date_to_days, datetime_to_micros,
time_object_to_micros
+from pyiceberg.utils.datetime import date_to_days, datetime_to_micros,
time_to_micros
from pyiceberg.utils.decimal import decimal_to_bytes, unscaled_to_decimal
_BOOL_STRUCT = Struct("<?")
@@ -203,7 +203,7 @@ def _(_: DateType, value: Union[date, int]) -> bytes:
@to_bytes.register(TimeType)
def _(_: TimeType, value: Union[time, int]) -> bytes:
if isinstance(value, time):
- value = time_object_to_micros(value)
+ value = time_to_micros(value)
return _LONG_STRUCT.pack(value)
diff --git a/python/pyiceberg/expressions/literals.py
b/python/pyiceberg/expressions/literals.py
index 1e47e87608..f89d0c8331 100644
--- a/python/pyiceberg/expressions/literals.py
+++ b/python/pyiceberg/expressions/literals.py
@@ -50,7 +50,7 @@ from pyiceberg.types import (
from pyiceberg.utils.datetime import (
date_str_to_days,
micros_to_days,
- time_to_micros,
+ time_str_to_micros,
timestamp_to_micros,
timestamptz_to_micros,
)
@@ -558,7 +558,7 @@ class StringLiteral(Literal[str]):
@to.register(TimeType)
def _(self, type_var: TimeType) -> Literal[int]:
try:
- return TimeLiteral(time_to_micros(self.value))
+ return TimeLiteral(time_str_to_micros(self.value))
except (TypeError, ValueError) as e:
raise ValueError(f"Could not convert {self.value} into a
{type_var}") from e
diff --git a/python/pyiceberg/utils/datetime.py
b/python/pyiceberg/utils/datetime.py
index 71a62ca456..f43d0fa84b 100644
--- a/python/pyiceberg/utils/datetime.py
+++ b/python/pyiceberg/utils/datetime.py
@@ -61,15 +61,14 @@ def days_to_date(days: int) -> date:
return EPOCH_DATE + timedelta(days)
-def time_to_micros(time_str: str) -> int:
+def time_str_to_micros(time_str: str) -> int:
"""Converts an ISO-8601 formatted time to microseconds from midnight."""
- t = time.fromisoformat(time_str)
- return (((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 +
t.microsecond
+ return time_to_micros(time.fromisoformat(time_str))
-def time_object_to_micros(t: time) -> int:
- """Converts an datetime.time object to microseconds from midnight."""
- return int(t.hour * 60 * 60 * 1e6 + t.minute * 60 * 1e6 + t.second * 1e6 +
t.microsecond)
+def time_to_micros(t: time) -> int:
+ """Converts a datetime.time object to microseconds from midnight."""
+ return (((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 +
t.microsecond
def datetime_to_micros(dt: datetime) -> int:
diff --git a/python/pyiceberg/utils/decimal.py
b/python/pyiceberg/utils/decimal.py
index 32f6db6aa9..06b2b945cd 100644
--- a/python/pyiceberg/utils/decimal.py
+++ b/python/pyiceberg/utils/decimal.py
@@ -16,8 +16,9 @@
# under the License.
"""Helper methods for working with Python Decimals."""
+import math
from decimal import Decimal
-from typing import Union
+from typing import Optional, Union
def decimal_to_unscaled(value: Decimal) -> int:
@@ -64,16 +65,33 @@ def bytes_required(value: Union[int, Decimal]) -> int:
raise ValueError(f"Unsupported value: {value}")
-def decimal_to_bytes(value: Decimal) -> bytes:
+def decimal_to_bytes(value: Decimal, byte_length: Optional[int] = None) ->
bytes:
"""Returns a byte representation of a decimal.
Args:
value (Decimal): a decimal value.
+ byte_length (int): The number of bytes.
Returns:
bytes: the unscaled value of the Decimal as bytes.
"""
unscaled_value = decimal_to_unscaled(value)
- return unscaled_value.to_bytes(bytes_required(unscaled_value),
byteorder="big", signed=True)
+ if byte_length is None:
+ byte_length = bytes_required(unscaled_value)
+ return unscaled_value.to_bytes(byte_length, byteorder="big", signed=True)
+
+
+def bytes_to_decimal(value: bytes, scale: int) -> Decimal:
+ """Returns a decimal from the bytes.
+
+ Args:
+ value (bytes): the bytes to be converted into a decimal.
+ scale (int): the scale of the decimal.
+
+ Returns:
+ Decimal: the scaled decimal.
+ """
+ unscaled_datum = int.from_bytes(value, byteorder="big", signed=True)
+ return unscaled_to_decimal(unscaled_datum, scale)
def truncate_decimal(value: Decimal, width: int) -> Decimal:
@@ -88,3 +106,22 @@ def truncate_decimal(value: Decimal, width: int) -> Decimal:
unscaled_value = decimal_to_unscaled(value)
applied_value = unscaled_value - (((unscaled_value % width) + width) %
width)
return unscaled_to_decimal(applied_value,
abs(int(value.as_tuple().exponent)))
+
+
+MAX_PRECISION = tuple(math.floor(math.log10(math.fabs(math.pow(2, 8 * pos - 1)
- 1))) for pos in range(24))
+REQUIRED_LENGTH = tuple(next(pos for pos in range(24) if p <=
MAX_PRECISION[pos]) for p in range(40))
+
+
+def decimal_required_bytes(precision: int) -> int:
+ """Compute the number of bytes required to store a precision.
+
+ Args:
+ precision: The number of digits to store.
+
+ Returns:
+ The number of bytes required to store a decimal with a certain
precision.
+ """
+ if precision <= 0 or precision >= 40:
+ raise ValueError(f"Unsupported precision, outside of (0, 40):
{precision}")
+
+ return REQUIRED_LENGTH[precision]
diff --git a/python/pyiceberg/utils/schema_conversion.py
b/python/pyiceberg/utils/schema_conversion.py
index 4f46668866..0746024e51 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -64,9 +64,7 @@ PRIMITIVE_FIELD_TYPE_MAPPING: Dict[str, PrimitiveType] = {
LOGICAL_FIELD_TYPE_MAPPING: Dict[Tuple[str, str], PrimitiveType] = {
("date", "int"): DateType(),
- ("time-millis", "int"): TimeType(),
- ("timestamp-millis", "long"): TimestampType(),
- ("time-micros", "int"): TimeType(),
+ ("time-micros", "long"): TimeType(),
("timestamp-micros", "long"): TimestampType(),
("uuid", "fixed"): UUIDType(),
}
@@ -369,6 +367,11 @@ class AvroSchemaConversion:
return self._convert_logical_decimal_type(avro_logical_type)
elif logical_type == "map":
return self._convert_logical_map_type(avro_logical_type)
+ elif logical_type == "timestamp-micros":
+ if avro_logical_type.get("adjust-to-utc", False) is True:
+ return TimestamptzType()
+ else:
+ return TimestampType()
elif (logical_type, physical_type) in LOGICAL_FIELD_TYPE_MAPPING:
return LOGICAL_FIELD_TYPE_MAPPING[(logical_type, physical_type)]
else:
@@ -542,6 +545,8 @@ class
ConvertSchemaToAvro(SchemaVisitorPerPrimitiveType[AvroType]):
return {
"type": "map",
"values": value_result,
+ "key-id": self.last_map_key_field_id,
+ "value-id": self.last_map_value_field_id,
}
else:
# Creates a logical map that's a list of schema's
@@ -588,17 +593,17 @@ class
ConvertSchemaToAvro(SchemaVisitorPerPrimitiveType[AvroType]):
def visit_timestamp(self, timestamp_type: TimestampType) -> AvroType:
# Iceberg only supports micro's
- return {"type": "long", "logicalType": "timestamp-micros"}
+ return {"type": "long", "logicalType": "timestamp-micros",
"adjust-to-utc": False}
def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> AvroType:
# Iceberg only supports micro's
- return {"type": "long", "logicalType": "timestamp-micros"}
+ return {"type": "long", "logicalType": "timestamp-micros",
"adjust-to-utc": True}
def visit_string(self, string_type: StringType) -> AvroType:
return "string"
def visit_uuid(self, uuid_type: UUIDType) -> AvroType:
- return {"type": "string", "logicalType": "uuid"}
+ return {"type": "fixed", "size": 16, "logicalType": "uuid"}
def visit_binary(self, binary_type: BinaryType) -> AvroType:
return "bytes"
diff --git a/python/tests/avro/test_decoder.py
b/python/tests/avro/test_decoder.py
index b1ab97fb1d..c51f1090fc 100644
--- a/python/tests/avro/test_decoder.py
+++ b/python/tests/avro/test_decoder.py
@@ -17,12 +17,9 @@
from __future__ import annotations
import io
-from datetime import datetime, timezone
-from decimal import Decimal
from io import SEEK_SET
from types import TracebackType
from typing import Optional, Type
-from uuid import UUID
import pytest
@@ -34,15 +31,6 @@ from pyiceberg.types import DoubleType, FloatType
AVAILABLE_DECODERS = [StreamingBinaryDecoder, InMemoryBinaryDecoder]
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_decimal_from_fixed(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\x00\x00\x00\x05\x6A\x48\x1C\xFB\x2C\x7C\x50\x00")
- decoder = decoder_class(mis)
- actual = decoder.read_decimal_from_fixed(28, 15, 12)
- expected = Decimal("99892.123400000000000")
- assert actual == expected
-
-
@pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS)
def test_read_boolean_true(decoder_class: Type[BinaryDecoder]) -> None:
mis = io.BytesIO(b"\x01")
@@ -82,24 +70,6 @@ def test_skip_int(decoder_class: Type[BinaryDecoder]) ->
None:
assert decoder.tell() == 1
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_decimal(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\x18\x00\x00\x00\x05\x6A\x48\x1C\xFB\x2C\x7C\x50\x00")
- decoder = decoder_class(mis)
- actual = decoder.read_decimal_from_bytes(28, 15)
- expected = Decimal("99892.123400000000000")
- assert actual == expected
-
-
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_decimal_from_fixed_big(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\x0E\xC2\x02\xE9\x06\x16\x33\x49\x77\x67\xA8\x00")
- decoder = decoder_class(mis)
- actual = decoder.read_decimal_from_fixed(28, 15, 12)
- expected = Decimal("4567335489766.998340000000000")
- assert actual == expected
-
-
@pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS)
def test_read_negative_bytes(decoder_class: Type[BinaryDecoder]) -> None:
mis = io.BytesIO(b"")
@@ -180,41 +150,6 @@ def test_skip_double(decoder_class: Type[BinaryDecoder])
-> None:
assert decoder.tell() == 8
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_uuid_from_fixed(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\x12\x34\x56\x78" * 4)
- decoder = decoder_class(mis)
- assert decoder.read_uuid_from_fixed() ==
UUID("{12345678-1234-5678-1234-567812345678}")
-
-
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_time_millis(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\xBC\x7D")
- decoder = decoder_class(mis)
- assert decoder.read_time_millis().microsecond == 30000
-
-
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_time_micros(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\xBC\x7D")
- decoder = decoder_class(mis)
- assert decoder.read_time_micros().microsecond == 8030
-
-
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_timestamp_micros(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\xBC\x7D")
- decoder = decoder_class(mis)
- assert decoder.read_timestamp_micros() == datetime(1970, 1, 1, 0, 0, 0,
8030)
-
-
[email protected]("decoder_class", AVAILABLE_DECODERS)
-def test_read_timestamptz_micros(decoder_class: Type[BinaryDecoder]) -> None:
- mis = io.BytesIO(b"\xBC\x7D")
- decoder = decoder_class(mis)
- assert decoder.read_timestamptz_micros() == datetime(1970, 1, 1, 0, 0, 0,
8030, tzinfo=timezone.utc)
-
-
@pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS)
def test_read_bytes(decoder_class: Type[BinaryDecoder]) -> None:
mis = io.BytesIO(b"\x08\x01\x02\x03\x04")
diff --git a/python/tests/avro/test_encoder.py
b/python/tests/avro/test_encoder.py
index 4646e65e6e..5866719434 100644
--- a/python/tests/avro/test_encoder.py
+++ b/python/tests/avro/test_encoder.py
@@ -16,10 +16,9 @@
# under the License.
from __future__ import annotations
-import datetime
import io
import struct
-from decimal import Decimal
+import uuid
from pyiceberg.avro.encoder import BinaryEncoder
@@ -101,28 +100,6 @@ def test_write_double() -> None:
assert output.getbuffer() == struct.pack("<d", _input)
-def test_write_decimal_bytes() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = Decimal("3.14159265359")
-
- encoder.write_decimal_bytes(_input)
-
- assert output.getbuffer() == b"\x0a\x49\x25\x59\xf6\x4f"
-
-
-def test_write_decimal_fixed() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = Decimal("3.14159265359")
-
- encoder.write_decimal_fixed(_input, 8)
-
- assert output.getbuffer() == b"\x00\x00\x00\x49\x25\x59\xf6\x4f"
-
-
def test_write_bytes() -> None:
output = io.BytesIO()
encoder = BinaryEncoder(output)
@@ -134,17 +111,6 @@ def test_write_bytes() -> None:
assert output.getbuffer() == b"".join([b"\x06", _input])
-def test_write_bytes_fixed() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = b"\x12\x34\x56"
-
- encoder.write_bytes_fixed(_input)
-
- assert output.getbuffer() == _input
-
-
def test_write_utf8() -> None:
output = io.BytesIO()
encoder = BinaryEncoder(output)
@@ -156,52 +122,13 @@ def test_write_utf8() -> None:
assert output.getbuffer() == b"".join([b"\x7a", bin_input])
-def test_write_date_int() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = datetime.date(1970, 1, 2)
- encoder.write_date_int(_input)
-
- assert output.getbuffer() == b"\x02"
-
-
-def test_write_time_millis_int() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = datetime.time(1, 2, 3, 456000)
- encoder.write_time_millis_int(_input)
-
- assert output.getbuffer() == b"\x80\xc3\xc6\x03"
-
-
-def test_write_time_micros_long() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = datetime.time(1, 2, 3, 456000)
-
- encoder.write_time_micros_long(_input)
-
- assert output.getbuffer() == b"\x80\xb8\xfb\xde\x1b"
-
-
-def test_write_timestamp_millis_long() -> None:
- output = io.BytesIO()
- encoder = BinaryEncoder(output)
-
- _input = datetime.datetime(2023, 1, 1, 1, 2, 3)
- encoder.write_timestamp_millis_long(_input)
-
- assert output.getbuffer() == b"\xf0\xdb\xcc\xad\xad\x61"
-
-
-def test_write_timestamp_micros_long() -> None:
+def test_write_uuid() -> None:
output = io.BytesIO()
encoder = BinaryEncoder(output)
- _input = datetime.datetime(2023, 1, 1, 1, 2, 3)
- encoder.write_timestamp_micros_long(_input)
+ _input = uuid.UUID("12345678-1234-5678-1234-567812345678")
+ encoder.write_uuid(_input)
- assert output.getbuffer() == b"\x80\xe3\xad\x9f\xac\xca\xf8\x05"
+ buf = output.getbuffer()
+ assert len(buf) == 16
+ assert buf.tobytes() == b"\x124Vx\x124Vx\x124Vx\x124Vx"
diff --git a/python/tests/avro/test_file.py b/python/tests/avro/test_file.py
index 101e676c90..2738770492 100644
--- a/python/tests/avro/test_file.py
+++ b/python/tests/avro/test_file.py
@@ -15,11 +15,14 @@
# specific language governing permissions and limitations
# under the License.
import inspect
+from datetime import date, datetime, time
from enum import Enum
from tempfile import TemporaryDirectory
from typing import Any
+from uuid import UUID
import pytest
+from _decimal import Decimal
from fastavro import reader, writer
import pyiceberg.avro.file as avro
@@ -34,7 +37,24 @@ from pyiceberg.manifest import (
ManifestEntry,
ManifestEntryStatus,
)
+from pyiceberg.schema import Schema
from pyiceberg.typedef import Record
+from pyiceberg.types import (
+ BooleanType,
+ DateType,
+ DecimalType,
+ DoubleType,
+ FixedType,
+ FloatType,
+ IntegerType,
+ LongType,
+ NestedField,
+ StringType,
+ TimestampType,
+ TimestamptzType,
+ TimeType,
+ UUIDType,
+)
from pyiceberg.utils.schema_conversion import AvroSchemaConversion
@@ -193,3 +213,77 @@ def
test_write_manifest_entry_with_fastavro_read_with_iceberg() -> None:
avro_entry = next(it)
assert entry == avro_entry
+
+
[email protected]("is_required", [True, False])
+def test_all_primitive_types(is_required: bool) -> None:
+ all_primitives_schema = Schema(
+ NestedField(field_id=1, name="field_fixed", field_type=FixedType(16),
required=is_required),
+ NestedField(field_id=2, name="field_decimal",
field_type=DecimalType(6, 2), required=is_required),
+ NestedField(field_id=3, name="field_bool", field_type=BooleanType(),
required=is_required),
+ NestedField(field_id=4, name="field_int", field_type=IntegerType(),
required=True),
+ NestedField(field_id=5, name="field_long", field_type=LongType(),
required=is_required),
+ NestedField(field_id=6, name="field_float", field_type=FloatType(),
required=is_required),
+ NestedField(field_id=7, name="field_double", field_type=DoubleType(),
required=is_required),
+ NestedField(field_id=8, name="field_date", field_type=DateType(),
required=is_required),
+ NestedField(field_id=9, name="field_time", field_type=TimeType(),
required=is_required),
+ NestedField(field_id=10, name="field_timestamp",
field_type=TimestampType(), required=is_required),
+ NestedField(field_id=11, name="field_timestamptz",
field_type=TimestamptzType(), required=is_required),
+ NestedField(field_id=12, name="field_string", field_type=StringType(),
required=is_required),
+ NestedField(field_id=13, name="field_uuid", field_type=UUIDType(),
required=is_required),
+ schema_id=1,
+ )
+
+ class AllPrimitivesRecord(Record):
+ field_fixed: bytes
+ field_decimal: Decimal
+ field_bool: bool
+ field_int: int
+ field_long: int
+ field_float: float
+ field_double: float
+ field_date: date
+ field_time: time
+ field_timestamp: datetime
+ field_timestamptz: datetime
+ field_string: str
+ field_uuid: UUID
+
+ def __init__(self, *data: Any, **named_data: Any) -> None:
+ super().__init__(*data, **{"struct":
all_primitives_schema.as_struct(), **named_data})
+
+ record = AllPrimitivesRecord(
+ b"\x124Vx\x124Vx\x124Vx\x124Vx",
+ Decimal("123.45"),
+ True,
+ 123,
+ 429496729622,
+ 123.22000122070312,
+ 429496729622.314,
+ 19052,
+ 69922000000,
+ 1677629965000000,
+ 1677629965000000,
+ "this is a sentence",
+ UUID("12345678-1234-5678-1234-567812345678"),
+ )
+
+ with TemporaryDirectory() as tmpdir:
+ tmp_avro_file = tmpdir + "/all_primitives.avro"
+ # write to disk
+ with avro.AvroOutputFile[AllPrimitivesRecord](
+ PyArrowFileIO().new_output(tmp_avro_file), all_primitives_schema,
"all_primitives_schema"
+ ) as out:
+ out.write_block([record])
+
+ # read from disk
+ with avro.AvroFile[AllPrimitivesRecord](
+ PyArrowFileIO().new_input(tmp_avro_file),
+ all_primitives_schema,
+ {-1: AllPrimitivesRecord},
+ ) as avro_reader:
+ it = iter(avro_reader)
+ avro_entry = next(it)
+
+ for idx, field in enumerate(all_primitives_schema.as_struct()):
+ assert record[idx] == avro_entry[idx], f"Invalid {field}"
diff --git a/python/tests/avro/test_writer.py b/python/tests/avro/test_writer.py
index c517a0cd1c..2cdcd4482a 100644
--- a/python/tests/avro/test_writer.py
+++ b/python/tests/avro/test_writer.py
@@ -21,6 +21,7 @@ import struct
from typing import Dict, List
import pytest
+from _decimal import Decimal
from pyiceberg.avro.encoder import BinaryEncoder
from pyiceberg.avro.resolver import construct_writer
@@ -218,3 +219,19 @@ def test_write_struct_with_list() -> None:
b"\x00",
]
)
+
+
+def test_write_decimal() -> None:
+ output = io.BytesIO()
+ encoder = BinaryEncoder(output)
+
+ schema = StructType(
+ NestedField(1, "decimal", DecimalType(10, 2), required=True),
+ )
+
+ class MyStruct(Record):
+ decimal: Decimal
+
+ construct_writer(schema).write(encoder, MyStruct(Decimal("1000.12")))
+
+ assert output.getvalue() == b"\x00\x00\x01\x86\xac"
diff --git a/python/tests/test_integration.py b/python/tests/test_integration.py
index 3609920819..37ba5b9048 100644
--- a/python/tests/test_integration.py
+++ b/python/tests/test_integration.py
@@ -299,3 +299,19 @@ def
test_pyarrow_deletes_double(test_positional_mor_double_deletes: Table) -> No
# Testing the slicing of indices
arrow_table = test_positional_mor_double_deletes.scan(limit=8).to_arrow()
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10]
+
+
[email protected]
+def test_partitioned_tables(catalog: Catalog) -> None:
+ for table_name, predicate in [
+ ("test_partitioned_by_identity", "ts >= '2023-03-05T00:00:00+00:00'"),
+ ("test_partitioned_by_years", "dt >= '2023-03-05'"),
+ ("test_partitioned_by_months", "dt >= '2023-03-05'"),
+ ("test_partitioned_by_days", "ts >= '2023-03-05T00:00:00+00:00'"),
+ ("test_partitioned_by_hours", "ts >= '2023-03-05T00:00:00+00:00'"),
+ ("test_partitioned_by_truncate", "letter >= 'e'"),
+ ("test_partitioned_by_bucket", "number >= '5'"),
+ ]:
+ table = catalog.load_table(f"default.{table_name}")
+ arrow_table = table.scan(selected_fields=("number",),
row_filter=predicate).to_arrow()
+ assert set(arrow_table["number"].to_pylist()) == {5, 6, 7, 8, 9, 10,
11, 12}, f"Table {table_name}, predicate {predicate}"
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index 4a59631acd..76dddb6486 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -154,6 +154,53 @@ def test_schema_index_by_id_visitor(table_schema_nested:
Schema) -> None:
def test_schema_index_by_name_visitor(table_schema_nested: Schema) -> None:
"""Test index_by_name visitor function"""
+ table_schema_nested = schema.Schema(
+ NestedField(field_id=1, name="foo", field_type=StringType(),
required=False),
+ NestedField(field_id=2, name="bar", field_type=IntegerType(),
required=True),
+ NestedField(field_id=3, name="baz", field_type=BooleanType(),
required=False),
+ NestedField(
+ field_id=4,
+ name="qux",
+ field_type=ListType(element_id=5, element_type=StringType(),
element_required=True),
+ required=True,
+ ),
+ NestedField(
+ field_id=6,
+ name="quux",
+ field_type=MapType(
+ key_id=7,
+ key_type=StringType(),
+ value_id=8,
+ value_type=MapType(key_id=9, key_type=StringType(),
value_id=10, value_type=IntegerType(), value_required=True),
+ value_required=True,
+ ),
+ required=True,
+ ),
+ NestedField(
+ field_id=11,
+ name="location",
+ field_type=ListType(
+ element_id=12,
+ element_type=StructType(
+ NestedField(field_id=13, name="latitude",
field_type=FloatType(), required=False),
+ NestedField(field_id=14, name="longitude",
field_type=FloatType(), required=False),
+ ),
+ element_required=True,
+ ),
+ required=True,
+ ),
+ NestedField(
+ field_id=15,
+ name="person",
+ field_type=StructType(
+ NestedField(field_id=16, name="name", field_type=StringType(),
required=False),
+ NestedField(field_id=17, name="age", field_type=IntegerType(),
required=True),
+ ),
+ required=False,
+ ),
+ schema_id=1,
+ identifier_field_ids=[1],
+ )
index = schema.index_by_name(table_schema_nested)
assert index == {
"foo": 1,
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index 201dbbbd6d..5c4980d8ac 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -88,7 +88,7 @@ from pyiceberg.types import (
from pyiceberg.utils.datetime import (
date_str_to_days,
date_to_days,
- time_to_micros,
+ time_str_to_micros,
timestamp_to_micros,
timestamptz_to_micros,
)
@@ -102,7 +102,7 @@ from pyiceberg.utils.datetime import (
(34, LongType(), 2017239379),
(date_to_days(date(2017, 11, 16)), DateType(), -653330422),
(date_str_to_days("2017-11-16"), DateType(), -653330422),
- (time_to_micros("22:31:08"), TimeType(), -662762989),
+ (time_str_to_micros("22:31:08"), TimeType(), -662762989),
(
timestamp_to_micros("2017-11-16T22:31:08"),
TimestampType(),
diff --git a/python/tests/utils/test_decimal.py
b/python/tests/utils/test_decimal.py
new file mode 100644
index 0000000000..683eab93fd
--- /dev/null
+++ b/python/tests/utils/test_decimal.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+from pyiceberg.utils.decimal import decimal_required_bytes
+
+
+def test_decimal_required_bytes() -> None:
+ assert decimal_required_bytes(precision=1) == 1
+ assert decimal_required_bytes(precision=2) == 1
+ assert decimal_required_bytes(precision=3) == 2
+ assert decimal_required_bytes(precision=4) == 2
+ assert decimal_required_bytes(precision=5) == 3
+ assert decimal_required_bytes(precision=7) == 4
+ assert decimal_required_bytes(precision=8) == 4
+ assert decimal_required_bytes(precision=10) == 5
+ assert decimal_required_bytes(precision=32) == 14
+ assert decimal_required_bytes(precision=38) == 16
+
+ with pytest.raises(ValueError) as exc_info:
+ decimal_required_bytes(precision=40)
+ assert "(0, 40]" in str(exc_info.value)
+
+ with pytest.raises(ValueError) as exc_info:
+ decimal_required_bytes(precision=-1)
+ assert "(0, 40]" in str(exc_info.value)