maxdebayser commented on code in PR #7831:
URL: https://github.com/apache/iceberg/pull/7831#discussion_r1288926377
##########
python/tests/io/test_pyarrow.py:
##########
@@ -1345,3 +1374,655 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc
bar: [[1,2,3]]
baz: [[true,false,null]]"""
)
+
+
+def construct_test_table() -> Tuple[Any, Any, Union[TableMetadataV1, TableMetadataV2]]:
+ table_metadata = {
+ "format-version": 2,
+ "location": "s3://bucket/test/location",
+ "last-column-id": 7,
+ "current-schema-id": 0,
+ "schemas": [
+ {
+ "type": "struct",
+ "schema-id": 0,
+ "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"},
+ {
+ "id": 3,
+ "name": "list",
+ "required": False,
+                        "type": {"type": "list", "element-id": 5, "element": "long", "element-required": False},
+ },
+ {
+ "id": 4,
+ "name": "maps",
+ "required": False,
+ "type": {
+ "type": "map",
+ "key-id": 6,
+ "key": "long",
+ "value-id": 7,
+ "value": "long",
+ "value-required": False,
+ },
+ },
+ ],
+ },
+ ],
+ "default-spec-id": 0,
+ "partition-specs": [{"spec-id": 0, "fields": []}],
+ "properties": {},
+ }
+
+ table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+ arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+
+ _floats = [3.14, math.nan, 1.69, 100]
+
+ _list = [[1, 2, 3], [4, 5, 6], None, [7, 8, 9]]
+
+ _maps: List[Optional[Dict[int, int]]] = [
+ {1: 2, 3: 4},
+ None,
+ {5: 6},
+ {},
+ ]
+
+ table = pa.Table.from_pydict(
+ {
+ "strings": _strings,
+ "floats": _floats,
+ "list": _list,
+ "maps": _maps,
+ },
+ schema=arrow_schema,
+ )
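+    # ParquetWriter appends the file's FileMetaData to this list on close.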
+ metadata_collector: List[Any] = []
+
+ with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+ writer.write_table(table)
+
+ return f.getvalue(), metadata_collector[0], table_metadata
+
+
+def test_record_count() -> None:
+ (file_bytes, metadata, table_metadata) = construct_test_table()
+
+ datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+ assert datafile.record_count == 4
+
+
+def test_file_size() -> None:
+ (file_bytes, metadata, table_metadata) = construct_test_table()
+
+ datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+ assert datafile.file_size_in_bytes == len(file_bytes)
+
+
+def test_value_counts() -> None:
+ (file_bytes, metadata, table_metadata) = construct_test_table()
+
+ datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+ assert len(datafile.value_counts) == 5
+ assert datafile.value_counts[1] == 4
+ assert datafile.value_counts[2] == 4
+    assert datafile.value_counts[5] == 10  # 3 lists with 3 items and a None value
+ assert datafile.value_counts[6] == 5
+ assert datafile.value_counts[7] == 5
+
+
+def test_column_sizes() -> None:
+ (file_bytes, metadata, table_metadata) = construct_test_table()
+
+ datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+ assert len(datafile.column_sizes) == 5
+ # these values are an artifact of how the write_table encodes the columns
+ assert datafile.column_sizes[1] == 116
+ assert datafile.column_sizes[2] == 89
+ assert datafile.column_sizes[5] == 151
+ assert datafile.column_sizes[6] == 117
+ assert datafile.column_sizes[7] == 117
+
+
+def test_null_and_nan_counts() -> None:
+ (file_bytes, metadata, table_metadata) = construct_test_table()
+
+ datafile = DataFile()
+    fill_parquet_file_metadata(datafile, metadata, len(file_bytes), table_metadata)
+
+ assert len(datafile.null_value_counts) == 5
+ assert datafile.null_value_counts[1] == 1
+ assert datafile.null_value_counts[2] == 0
+ assert datafile.null_value_counts[5] == 1
Review Comment:
This is how pyarrow returns these values. Unless we inspect the actual table, it seems to me that the only options are either not returning this statistic or returning it as is.
```
<pyarrow._parquet.ColumnChunkMetaData object at 0x7fbd4a8a15e0>
file_offset: 543
file_path:
physical_type: INT64
num_values: 10
path_in_schema: list.list.element
is_stats_set: True
statistics:
<pyarrow._parquet.Statistics object at 0x7fbd4a8a1130>
has_min_max: True
min: 1
max: 9
null_count: 1
distinct_count: 0
num_values: 9
physical_type: INT64
logical_type: None
converted_type (legacy): NONE
compression: SNAPPY
encodings: ('RLE_DICTIONARY', 'PLAIN', 'RLE')
has_dictionary_page: True
dictionary_page_offset: 392
data_page_offset: 455
total_compressed_size: 151
total_uncompressed_size: 174
```
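For reference, here is a minimal standalone sketch of how this dump can be reproduced (the column just mirrors the `list` field from `construct_test_table`; the names and values are illustrative, not part of the PR):
```
import io

import pyarrow as pa
import pyarrow.parquet as pq

# A nullable list column shaped like the "list" field in construct_test_table.
table = pa.table(
    {"list": pa.array([[1, 2, 3], [4, 5, 6], None, [7, 8, 9]], type=pa.list_(pa.int64()))}
)

buf = io.BytesIO()
pq.write_table(table, buf)
buf.seek(0)

# Statistics are reported per leaf column, so the None *list* surfaces as a
# single null *element*: null_count == 1 and statistics.num_values == 9.
metadata = pq.ParquetFile(buf).metadata
for i in range(metadata.num_columns):
    print(metadata.row_group(0).column(i))
```
So the counts end up being whatever the writer recorded at the leaf level; as far as I can tell, `fill_parquet_file_metadata` can only pass them through or drop them.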