nikita-sheremet-clearscale opened a new issue #4310:
URL: https://github.com/apache/hudi/issues/4310


   Look at the sources: 
[fast-parquet-row-count-in-spark](https://stackoverflow.com/questions/40629435/fast-parquet-row-count-in-spark)
 and 
[parquet-count-metadata-explanation](https://github.com/dennyglee/databricks/blob/master/misc/parquet-count-metadata-explanation.md)
   
   Stack Overflow and the official Spark documentation tell us that a Parquet file 
should contain the row count in its metadata, and Spark has added this by default 
since 1.6. In my opinion it is obvious that a Hudi Parquet file should have this 
field by default. No?
   
   I tried to find this "field" but had no luck. Maybe I am doing something 
wrong? Could somebody tell me how to ensure that a Hudi Parquet file has such a 
field? For now I am invoking `org.apache.parquet.tools.Main` with the arguments 
`meta D:\myparquet_file.parquet` and see no count keyword in the results.
   
   By the way, a query like `select count(*) from my_table` (Athena) with 
`org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat` runs faster 
than with `org.apache.hudi.hadoop.HoodieParquetInputFormat` for the same table and 
data.
   
   
   * Hudi version : 0.9.0
   
   * Spark version : 2.4.7-amzn-1
   
   * Hive version : 2.3.7
   
   * Hadoop version : 2.10.1
   
   * Storage (HDFS/S3/GCS..) : S3
   
   * Running on Docker? (yes/no) : no
   
   
   **Additional context**
   
   Here is the metadata from the Parquet file (the index was removed)
   ```
   extra:                  hoodie_min_record_key = 
device_uuid:00055458-67BF-4714-96CB-46ABC492645E
   extra:                  parquet.avro.schema = 
{"type":"record","name":"vendor_table_record","namespace":"hoodie.vendor_table","fields":[{"name":"_hoodie_commit_time","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_commit_seqno","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_record_key","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_partition_path","type":["null","string"],"doc":"","default":null},{"name":"_hoodie_file_name","type":["null","string"],"doc":"","default":null},{"name":"visit_id","type":["null","string"],"default":null},{"name":"device_uuid","type":["null","string"],"default":null},{"name":"publisher_app_id","type":["null","string"],"default":null},{"name":"m2m_rels","type":["null","string"],"default":null},{"name":"accuracy","type":["null","float"],"default":null},{"name":"arrival_date","type":["null",{"type":"long","logicalType":"timestamp-micros"}],"default":null},{"name":"departure_date","type":["null",{"ty
 
pe":"long","logicalType":"timestamp-micros"}],"default":null},{"name":"lat","type":["null","float"],"default":null},{"name":"lon","type":["null","float"],"default":null},{"name":"client_loc_dtime","type":["null",{"type":"long","logicalType":"timestamp-micros"}],"default":null},{"name":"device_type","type":["null","string"],"default":null},{"name":"altitude","type":["null","float"],"default":null},{"name":"vertical_accuracy","type":["null","float"],"default":null},{"name":"speed","type":["null","float"],"default":null},{"name":"course","type":["null","float"],"default":null},{"name":"battery","type":["null","float"],"default":null},{"name":"charge","type":["null","float"],"default":null},{"name":"device_version","type":["null","string"],"default":null},{"name":"device_model","type":["null","string"],"default":null},{"name":"opt_in","type":["null","float"],"default":null},{"name":"arrival_date_pst","type":["null",{"type":"long","logicalType":"timestamp-micros"}],"default":null},{"name
 
":"departure_date_pst","type":["null",{"type":"long","logicalType":"timestamp-micros"}],"default":null},{"name":"client_loc_dtime_pst","type":["null",{"type":"long","logicalType":"timestamp-micros"}],"default":null}]}
   extra:                  writer.model.name = avro
   extra:                  hoodie_max_record_key = 
device_uuid:FFFEBC13-DB07-4F82-975C-CF20B1ED6C6D
   
   file schema:            hoodie.vendor_table.vendor_table_record
   
--------------------------------------------------------------------------------
   _hoodie_commit_time:    OPTIONAL BINARY O:UTF8 R:0 D:1
   _hoodie_commit_seqno:   OPTIONAL BINARY O:UTF8 R:0 D:1
   _hoodie_record_key:     OPTIONAL BINARY O:UTF8 R:0 D:1
   _hoodie_partition_path: OPTIONAL BINARY O:UTF8 R:0 D:1
   _hoodie_file_name:      OPTIONAL BINARY O:UTF8 R:0 D:1
   visit_id:               OPTIONAL BINARY O:UTF8 R:0 D:1
   device_uuid:            OPTIONAL BINARY O:UTF8 R:0 D:1
   publisher_app_id:       OPTIONAL BINARY O:UTF8 R:0 D:1
   m2m_rels:               OPTIONAL BINARY O:UTF8 R:0 D:1
   accuracy:               OPTIONAL FLOAT R:0 D:1
   arrival_date:           OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   departure_date:         OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   lat:                    OPTIONAL FLOAT R:0 D:1
   lon:                    OPTIONAL FLOAT R:0 D:1
   client_loc_dtime:       OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   device_type:            OPTIONAL BINARY O:UTF8 R:0 D:1
   altitude:               OPTIONAL FLOAT R:0 D:1
   vertical_accuracy:      OPTIONAL FLOAT R:0 D:1
   speed:                  OPTIONAL FLOAT R:0 D:1
   course:                 OPTIONAL FLOAT R:0 D:1
   battery:                OPTIONAL FLOAT R:0 D:1
   charge:                 OPTIONAL FLOAT R:0 D:1
   device_version:         OPTIONAL BINARY O:UTF8 R:0 D:1
   device_model:           OPTIONAL BINARY O:UTF8 R:0 D:1
   opt_in:                 OPTIONAL FLOAT R:0 D:1
   arrival_date_pst:       OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   departure_date_pst:     OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   client_loc_dtime_pst:   OPTIONAL INT64 O:TIMESTAMP_MICROS R:0 D:1
   
   row group 1:            RC:3268670 TS:389659897 OFFSET:4
   
--------------------------------------------------------------------------------
   _hoodie_commit_time:     BINARY GZIP DO:0 FPO:4 SZ:6659/5671/0,85 VC:3268670 
ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   _hoodie_commit_seqno:    BINARY GZIP DO:0 FPO:6663 
SZ:11186821/113377200/10,13 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   _hoodie_record_key:      BINARY GZIP DO:0 FPO:11193484 
SZ:547432/1149548/2,10 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   _hoodie_partition_path:  BINARY GZIP DO:0 FPO:11740916 SZ:7926/6774/0,85 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   _hoodie_file_name:       BINARY GZIP DO:0 FPO:11748842 SZ:89266/84812/0,95 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   visit_id:                BINARY GZIP DO:0 FPO:11838108 
SZ:7093540/190643896/26,88 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN,PLAIN_DICTIONARY
   device_uuid:             BINARY GZIP DO:0 FPO:18931648 SZ:526152/899621/1,71 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   publisher_app_id:        BINARY GZIP DO:0 FPO:19457800 SZ:1941/1496/0,77 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   m2m_rels:                BINARY GZIP DO:0 FPO:19459741 SZ:1941/1496/0,77 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   accuracy:                FLOAT GZIP DO:0 FPO:19461682 
SZ:3982568/6577634/1,65 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   arrival_date:            INT64 GZIP DO:0 FPO:23444250 SZ:56/36/0,64 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   departure_date:          INT64 GZIP DO:0 FPO:23444306 SZ:56/36/0,64 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   lat:                     FLOAT GZIP DO:0 FPO:23444362 
SZ:6655882/13075459/1,96 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   lon:                     FLOAT GZIP DO:0 FPO:30100244 
SZ:6228217/13075459/2,10 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   client_loc_dtime:        INT64 GZIP DO:0 FPO:36328461 
SZ:7346975/7629527/1,04 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   device_type:             BINARY GZIP DO:0 FPO:43675436 SZ:20244/28874/1,43 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   altitude:                FLOAT GZIP DO:0 FPO:43695680 
SZ:8537491/13075459/1,53 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   vertical_accuracy:       FLOAT GZIP DO:0 FPO:52233171 SZ:1025/784/0,76 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   speed:                   FLOAT GZIP DO:0 FPO:52234196 
SZ:4608960/8745091/1,90 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN,PLAIN_DICTIONARY
   course:                  FLOAT GZIP DO:0 FPO:56843156 
SZ:4403773/10267860/2,33 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN,PLAIN_DICTIONARY
   battery:                 FLOAT GZIP DO:0 FPO:61246929 
SZ:2580916/2792204/1,08 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   charge:                  FLOAT GZIP DO:0 FPO:63827845 SZ:343398/430622/1,25 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   device_version:          BINARY GZIP DO:0 FPO:64171243 SZ:52447/75248/1,43 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   device_model:            BINARY GZIP DO:0 FPO:64223690 SZ:62645/84707/1,35 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   opt_in:                  FLOAT GZIP DO:0 FPO:64286335 SZ:1025/784/0,76 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   arrival_date_pst:        INT64 GZIP DO:0 FPO:64287360 SZ:56/36/0,64 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   departure_date_pst:      INT64 GZIP DO:0 FPO:64287416 SZ:56/36/0,64 
VC:3268670 ENC:RLE,BIT_PACKED,PLAIN
   client_loc_dtime_pst:    INT64 GZIP DO:0 FPO:64287472 
SZ:7346961/7629527/1,04 VC:3268670 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to