kevinjqliu commented on code in PR #2881:
URL: https://github.com/apache/iceberg-python/pull/2881#discussion_r3306524176
##########
pyiceberg/io/pyarrow.py:
##########
@@ -1641,7 +1645,12 @@ def _task_to_record_batches(
bound_row_filter, file_schema, case_sensitive=case_sensitive,
projected_field_values=projected_missing_fields
)
bound_file_filter = bind(file_schema, translated_row_filter,
case_sensitive=case_sensitive)
- pyarrow_filter = expression_to_pyarrow(bound_file_filter,
file_schema)
+ try:
+ pyarrow_filter = expression_to_pyarrow(bound_file_filter,
file_schema)
+ except pyarrow.lib.ArrowNotImplementedError as e:
+ if "arrow.uuid" in str(e):
+ raise
NotImplementedError(UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE) from e
+ raise
Review Comment:
nit: should we re-raise here or in `scan`? I feel like `scan` might be
better since it's public.
##########
pyiceberg/io/pyarrow.py:
##########
@@ -203,6 +203,10 @@
MAP_VALUE_NAME = "value"
DOC = "doc"
UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"}
+UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE = (
Review Comment:
nit: can we inline this? I don't want someone to accidentally use this
variable 😄
##########
tests/integration/test_reads.py:
##########
@@ -820,17 +820,20 @@ def test_partitioned_tables(catalog: Catalog) -> None:
@pytest.mark.parametrize("catalog", [lf("session_catalog_hive"),
lf("session_catalog")])
def test_unpartitioned_uuid_table(catalog: Catalog) -> None:
unpartitioned_uuid =
catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
- arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col ==
'102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
- assert arrow_table_eq["uuid_col"].to_pylist() ==
[uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
-
- arrow_table_neq = unpartitioned_uuid.scan(
- row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and
uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
- ).to_arrow()
- assert arrow_table_neq["uuid_col"].to_pylist() == [
- uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
- uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
- uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
- ]
+ try:
+ arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col ==
'102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
+ assert arrow_table_eq["uuid_col"].to_pylist() ==
[uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
+
+ arrow_table_neq = unpartitioned_uuid.scan(
+ row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and
uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
+ ).to_arrow()
+ assert arrow_table_neq["uuid_col"].to_pylist() == [
+ uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
+ uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
+ uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
+ ]
+ except NotImplementedError as e:
Review Comment:
nit: try/except in a test feels weird. Is this based on the pyarrow library
version? If so, could we just branch on the version instead?
##########
tests/integration/test_reads.py:
##########
@@ -840,14 +843,11 @@ def test_unpartitioned_fixed_table(catalog: Catalog) ->
None:
arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col",
b"1234567890123456789012345")).to_arrow()
assert arrow_table_eq["fixed_col"].to_pylist() ==
[b"1234567890123456789012345"]
- arrow_table_neq = fixed_table.scan(
- row_filter=And(
- NotEqualTo("fixed_col", b"1234567890123456789012345"),
NotEqualTo("uuid_col", "c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b")
- )
- ).to_arrow()
+ arrow_table_neq = fixed_table.scan(row_filter=NotEqualTo("fixed_col",
b"1234567890123456789012345")).to_arrow()
Review Comment:
is this change intentional?
and where did the new row `asdasasdads12312312312111` come from?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]