wgtmac commented on PR #43995:
URL: https://github.com/apache/arrow/pull/43995#issuecomment-2342463156
I'm using a Hive schema, which is why it is `array<array<int>>`. The file
can easily be produced with Spark SQL, as shown below:
```
package org.example
import org.apache.spark.sql.SparkSession
/**
 * Reproduction script that writes a nested-list column through Spark SQL
 * into a table declared `USING HUDI`.
 *
 * The session is configured with the Kryo serializer, the Hudi catalog,
 * the Hudi session extension and the Hudi Kryo registrar.
 */
object ParquetTwoLevelList {

  /**
   * Creates the table `nested_list_test` with a single `array<array<int>>`
   * column and inserts one row into it.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[1]")
      .appName("NestedListTest")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
      .config(
        "spark.sql.extensions",
        "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
      .getOrCreate()

    try {
      // Each statement is kept in a single-line literal: an ordinary Scala
      // string literal cannot contain a raw line break.
      spark.sql("CREATE TABLE nested_list_test (a array<array<int>>) USING HUDI")
      spark.sql("INSERT INTO nested_list_test VALUES ( array(array(1,2), array(3,4)) )")
    } finally {
      // Stop the session even if one of the statements fails.
      spark.stop()
    }
  }
}
```
The parquet-cli prints the following metadata:
```
File path:
/Users/gangwu/Projects/hudi-spark-generator/spark-warehouse/nested_list_test/f92ed4b5-c063-4b94-90a4-5ef997db1a6c-0_0-13-12_20240911093900996.parquet
Created by: parquet-mr version 1.12.3 (build
f8dced182c4c1fbdec6ccb3185537b5a01e6ed6b)
Properties:
hoodie_bloom_filter_type_code: DYNAMIC_V0
org.apache.hudi.bloomfilter: ***
hoodie_min_record_key: 20240911093900996_0_0
parquet.avro.schema:
{"type":"record","name":"nested_list_test_record","namespace":"hoodie.nested_list_test","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_record_key","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_partition_path","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_file_name","type":["null","string"],"doc":"","default":null},{"name":"a","type":["null",{"type":"array","items":["null",{"type":"array","items":["null","int"]}]}],"default":null}]}
writer.model.name: avro
hoodie_max_record_key: 20240911093900996_0_0
Schema:
message hoodie.nested_list_test.nested_list_test_record {
optional binary _hoodie_commit_time (STRING);
optional binary _hoodie_commit_seqno (STRING);
optional binary _hoodie_record_key (STRING);
optional binary _hoodie_partition_path (STRING);
optional binary _hoodie_file_name (STRING);
optional group a (LIST) {
repeated group array (LIST) {
repeated int32 array;
}
}
}
Row group 0: count: 1 441.00 B records start: 4 total(compressed): 441 B
total(uncompressed):349 B
--------------------------------------------------------------------------------
type encodings count avg size nulls min
/ max
_hoodie_commit_time BINARY G _ 1 68.00 B 0
"20240911093900996" / "20240911093900996"
_hoodie_commit_seqno BINARY G _ 1 72.00 B 0
"20240911093900996_0_0" / "20240911093900996_0_0"
_hoodie_record_key BINARY G _ 1 72.00 B 0
"20240911093900996_0_0" / "20240911093900996_0_0"
_hoodie_partition_path BINARY G _ 1 50.00 B 0 ""
/ ""
_hoodie_file_name BINARY G _ 1 116.00 B 0
"f92ed4b5-c063-4b94-90a4-5..." / "f92ed4b5-c063-4b94-90a4-5..."
a.array.array INT32 G _ 4 15.75 B 0 "1"
/ "4"
-------------
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]