[ 
https://issues.apache.org/jira/browse/PARQUET-2078?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17406507#comment-17406507
 ] 

Nemon Lou commented on PARQUET-2078:
------------------------------------

The metadata dump from the footer shows that the dictionary page offsets are 
correct. Only RowGroup.file_offset is wrong in the file, I think.

java -jar parquet-tools-deprecated-1.12.0.jar meta ~/data/customer1/000000_0
{noformat}
file:                   file:/home/nemon/data/customer1/000000_0 
creator:                parquet-mr version 1.12.0 (build 
b7c9d0beddc1052004370eebe944e22f55a7d508) 
extra:                  writer.date.proleptic = false 
extra:                  writer.time.zone = Asia/Shanghai 
extra:                  writer.model.name = 4.0.0-SNAPSHOT 
extra:                  writer.zone.conversion.legacy = false 

file schema:            hive_schema 
--------------------------------------------------------------------------------
c_customer_sk:          OPTIONAL INT64 R:0 D:1
c_customer_id:          OPTIONAL BINARY L:STRING R:0 D:1
c_current_cdemo_sk:     OPTIONAL INT64 R:0 D:1
c_current_hdemo_sk:     OPTIONAL INT64 R:0 D:1
c_current_addr_sk:      OPTIONAL INT64 R:0 D:1
c_first_shipto_date_sk: OPTIONAL INT64 R:0 D:1
c_first_sales_date_sk:  OPTIONAL INT64 R:0 D:1
c_salutation:           OPTIONAL BINARY L:STRING R:0 D:1
c_first_name:           OPTIONAL BINARY L:STRING R:0 D:1
c_last_name:            OPTIONAL BINARY L:STRING R:0 D:1
c_preferred_cust_flag:  OPTIONAL BINARY L:STRING R:0 D:1
c_birth_day:            OPTIONAL INT32 R:0 D:1
c_birth_month:          OPTIONAL INT32 R:0 D:1
c_birth_year:           OPTIONAL INT32 R:0 D:1
c_birth_country:        OPTIONAL BINARY L:STRING R:0 D:1
c_login:                OPTIONAL BINARY L:STRING R:0 D:1
c_email_address:        OPTIONAL BINARY L:STRING R:0 D:1
c_last_review_date_sk:  OPTIONAL INT64 R:0 D:1

row group 1:            RC:1530100 TS:133961157 OFFSET:4 
--------------------------------------------------------------------------------
c_customer_sk:           INT64 UNCOMPRESSED DO:0 FPO:4 
SZ:12243643/12243643/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 6, max: 
64999980, num_nulls: 0]
c_customer_id:           BINARY UNCOMPRESSED DO:0 FPO:12243647 
SZ:30604844/30604844/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 
AAAAAAAAAAAABDAA, max: AAAAAAAAPPPPKCBA, num_nulls: 0]
c_current_cdemo_sk:      INT64 UNCOMPRESSED DO:0 FPO:42848491 
SZ:11962420/11962420/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 1, max: 
1920799, num_nulls: 53592]
c_current_hdemo_sk:      INT64 UNCOMPRESSED DO:54810911 FPO:54868535 
SZ:2611021/2611021/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 7200, num_nulls: 53561]
c_current_addr_sk:       INT64 UNCOMPRESSED DO:0 FPO:57421932 
SZ:12243645/12243645/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 9, max: 
32499985, num_nulls: 0]
c_first_shipto_date_sk:  INT64 UNCOMPRESSED DO:69665577 FPO:69694809 
SZ:2398231/2398231/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2449028, max: 2452678, num_nulls: 53457]
c_first_sales_date_sk:   INT64 UNCOMPRESSED DO:72063808 FPO:72093040 
SZ:2397633/2397633/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2448998, max: 2452648, num_nulls: 53502]
c_salutation:            BINARY UNCOMPRESSED DO:74461441 FPO:74461508 
SZ:579678/579678/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: Sir, num_nulls: 0]
c_first_name:            BINARY UNCOMPRESSED DO:75041119 FPO:75092758 
SZ:2534042/2534042/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: Zulma, num_nulls: 0]
c_last_name:             BINARY UNCOMPRESSED DO:77575161 FPO:77626525 
SZ:2541263/2541263/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: Zuniga, num_nulls: 0]
c_preferred_cust_flag:   BINARY UNCOMPRESSED DO:80116424 FPO:80116456 
SZ:388782/388782/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: Y, num_nulls: 0]
c_birth_day:             INT32 UNCOMPRESSED DO:80505206 FPO:80505351 
SZ:1076499/1076499/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 31, num_nulls: 53591]
c_birth_month:           INT32 UNCOMPRESSED DO:81581705 FPO:81581772 
SZ:891737/891737/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 12, num_nulls: 53288]
c_birth_year:            INT32 UNCOMPRESSED DO:82473442 FPO:82473740 
SZ:1445414/1445414/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1924, max: 1992, num_nulls: 53375]
c_birth_country:         BINARY UNCOMPRESSED DO:83918856 FPO:83921564 
SZ:1538795/1538795/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: ZIMBABWE, num_nulls: 0]
c_login:                 BINARY UNCOMPRESSED DO:85457651 FPO:85457674 
SZ:2872/2872/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: , num_nulls: 0]
c_email_address:         BINARY UNCOMPRESSED DO:0 FPO:85460523 
SZ:46682636/46682636/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN ST:[min: , max: 
[email protected], num_nulls: 0]
c_last_review_date_sk:   INT64 UNCOMPRESSED DO:132143159 FPO:132146109 
SZ:1818002/1818002/1.00 VC:1530100 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2452283, max: 2452648, num_nulls: 53812]

row group 2:            RC:1378576 TS:120729344 OFFSET:133961161 
--------------------------------------------------------------------------------
c_customer_sk:           INT64 UNCOMPRESSED DO:0 FPO:133961161 
SZ:11031160/11031160/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 353, 
max: 64999947, num_nulls: 0]
c_customer_id:           BINARY UNCOMPRESSED DO:0 FPO:144992321 
SZ:27574069/27574069/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 
AAAAAAAAAAAABAAA, max: AAAAAAAAPPPPPJCA, num_nulls: 0]
c_current_cdemo_sk:      INT64 UNCOMPRESSED DO:0 FPO:172566390 
SZ:10777041/10777041/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 3, max: 
1920799, num_nulls: 48426]
c_current_hdemo_sk:      INT64 UNCOMPRESSED DO:183343431 FPO:183401055 
SZ:2358286/2358286/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 7200, num_nulls: 48252]
c_current_addr_sk:       INT64 UNCOMPRESSED DO:0 FPO:185701717 
SZ:11031152/11031152/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN ST:[min: 13, max: 
32499975, num_nulls: 0]
c_first_shipto_date_sk:  INT64 UNCOMPRESSED DO:196732869 FPO:196762101 
SZ:2163621/2163621/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2449028, max: 2452678, num_nulls: 48435]
c_first_sales_date_sk:   INT64 UNCOMPRESSED DO:198896490 FPO:198925722 
SZ:2163332/2163332/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2448998, max: 2452648, num_nulls: 48508]
c_salutation:            BINARY UNCOMPRESSED DO:201059822 FPO:201059889 
SZ:522266/522266/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: Sir, num_nulls: 0]
c_first_name:            BINARY UNCOMPRESSED DO:201582088 FPO:201633695 
SZ:2287170/2287170/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: Zulma, num_nulls: 0]
c_last_name:             BINARY UNCOMPRESSED DO:203869258 FPO:203920622 
SZ:2294427/2294427/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: Zuniga, num_nulls: 0]
c_preferred_cust_flag:   BINARY UNCOMPRESSED DO:206163685 FPO:206163718 
SZ:350234/350234/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: Y, num_nulls: 0]
c_birth_day:             INT32 UNCOMPRESSED DO:206513919 FPO:206514064 
SZ:970564/970564/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 31, num_nulls: 48659]
c_birth_month:           INT32 UNCOMPRESSED DO:207484483 FPO:207484550 
SZ:803919/803919/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1, max: 12, num_nulls: 48294]
c_birth_year:            INT32 UNCOMPRESSED DO:208288402 FPO:208288700 
SZ:1303139/1303139/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
1924, max: 1992, num_nulls: 48458]
c_birth_country:         BINARY UNCOMPRESSED DO:209591541 FPO:209594249 
SZ:1386657/1386657/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
, max: ZIMBABWE, num_nulls: 0]
c_login:                 BINARY UNCOMPRESSED DO:210978198 FPO:210978221 
SZ:2576/2576/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: , 
max: , num_nulls: 0]
c_email_address:         BINARY UNCOMPRESSED DO:0 FPO:210980774 
SZ:42071765/42071765/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN ST:[min: , max: 
[email protected], num_nulls: 0]
c_last_review_date_sk:   INT64 UNCOMPRESSED DO:253052539 FPO:253055489 
SZ:1637966/1637966/1.00 VC:1378576 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 
2452283, max: 2452648, num_nulls: 48083]

{noformat}

> Failed to read parquet file after writing with the same parquet version
> -----------------------------------------------------------------------
>
>                 Key: PARQUET-2078
>                 URL: https://issues.apache.org/jira/browse/PARQUET-2078
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-mr
>    Affects Versions: 1.12.0
>            Reporter: Nemon Lou
>            Priority: Critical
>             Fix For: 1.13.0, 1.12.1
>
>         Attachments: 
> PARQUET_2078_how_to_fix_rowgroup_fileoffset_for_branch_1.12.x.patch, 
> tpcds_customer_footer.json
>
>
> Writing a Parquet file with version 1.12.0 in Apache Hive and then reading 
> that file returns the following error:
> {noformat}
> Caused by: java.lang.IllegalStateException: All of the offsets in the split 
> should be found in the file. expected: [4, 133961161] found: 
> [BlockMetaData{1530100, 133961157 [ColumnMetaData{UNCOMPRESSED 
> [c_customer_sk] optional int64 c_customer_sk  [PLAIN, RLE, BIT_PACKED], 4}, 
> ColumnMetaData{UNCOMPRESSED [c_customer_id] optional binary c_customer_id 
> (STRING)  [PLAIN, RLE, BIT_PACKED], 12243647}, ColumnMetaData{UNCOMPRESSED 
> [c_current_cdemo_sk] optional int64 c_current_cdemo_sk  [PLAIN, RLE, 
> BIT_PACKED], 42848491}, ColumnMetaData{UNCOMPRESSED [c_current_hdemo_sk] 
> optional int64 c_current_hdemo_sk  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 
> 54868535}, ColumnMetaData{UNCOMPRESSED [c_current_addr_sk] optional int64 
> c_current_addr_sk  [PLAIN, RLE, BIT_PACKED], 57421932}, 
> ColumnMetaData{UNCOMPRESSED [c_first_shipto_date_sk] optional int64 
> c_first_shipto_date_sk  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 69694809}, 
> ColumnMetaData{UNCOMPRESSED [c_first_sales_date_sk] optional int64 
> c_first_sales_date_sk  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 72093040}, 
> ColumnMetaData{UNCOMPRESSED [c_salutation] optional binary c_salutation 
> (STRING)  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 74461508}, 
> ColumnMetaData{UNCOMPRESSED [c_first_name] optional binary c_first_name 
> (STRING)  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 75092758}, 
> ColumnMetaData{UNCOMPRESSED [c_last_name] optional binary c_last_name 
> (STRING)  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 77626525}, 
> ColumnMetaData{UNCOMPRESSED [c_preferred_cust_flag] optional binary 
> c_preferred_cust_flag (STRING)  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 
> 80116456}, ColumnMetaData{UNCOMPRESSED [c_birth_day] optional int32 
> c_birth_day  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 80505351}, 
> ColumnMetaData{UNCOMPRESSED [c_birth_month] optional int32 c_birth_month  
> [RLE, PLAIN_DICTIONARY, BIT_PACKED], 81581772}, ColumnMetaData{UNCOMPRESSED 
> [c_birth_year] optional int32 c_birth_year  [RLE, PLAIN_DICTIONARY, 
> BIT_PACKED], 82473740}, ColumnMetaData{UNCOMPRESSED [c_birth_country] 
> optional binary c_birth_country (STRING)  [RLE, PLAIN_DICTIONARY, 
> BIT_PACKED], 83921564}, ColumnMetaData{UNCOMPRESSED [c_login] optional binary 
> c_login (STRING)  [RLE, PLAIN_DICTIONARY, BIT_PACKED], 85457674}, 
> ColumnMetaData{UNCOMPRESSED [c_email_address] optional binary c_email_address 
> (STRING)  [PLAIN, RLE, BIT_PACKED], 85460523}, ColumnMetaData{UNCOMPRESSED 
> [c_last_review_date_sk] optional int64 c_last_review_date_sk  [RLE, 
> PLAIN_DICTIONARY, BIT_PACKED], 132146109}]}]
>       at 
> org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:172)
>  ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
>       at 
> org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140)
>  ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
>       at 
> org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:95)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:60)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat.getRecordReader(MapredParquetInputFormat.java:89)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.ql.io.CombineHiveRecordReader.<init>(CombineHiveRecordReader.java:96)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native 
> Method) ~[?:1.8.0_292]
>       at 
> sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>  ~[?:1.8.0_292]
>       at 
> sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>  ~[?:1.8.0_292]
>       at java.lang.reflect.Constructor.newInstance(Constructor.java:423) 
> ~[?:1.8.0_292]
>       at 
> org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:254)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.<init>(HadoopShimsSecure.java:214)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getRecordReader(HadoopShimsSecure.java:342)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:716)
>  ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
>       at 
> org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:175) 
> ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
>       at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:444) 
> ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
>       at org.apache.hadoop.mapred.MapTask.run(MapTask.java:349) 
> ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
>       at 
> org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:271)
>  ~[hadoop-mapreduce-client-common-3.1.4.jar:?]
>       at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) 
> ~[?:1.8.0_292]
>       at java.util.concurrent.FutureTask.run(FutureTask.java:266) 
> ~[?:1.8.0_292]
>       at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  ~[?:1.8.0_292]
>       at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  ~[?:1.8.0_292]
>       at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_292]
> {noformat}
> Reproduction scenario:
> TPC-DS table customer; any Parquet file written by 1.12.0 that is larger than 
> 128MB (two row groups).
> {code:sql}
> create  table if not exists customer(
>       c_customer_sk bigint
> ,     c_customer_id char(16)
> ,     c_current_cdemo_sk bigint
> ,     c_current_hdemo_sk bigint
> ,     c_current_addr_sk bigint
> ,     c_first_shipto_date_sk bigint
> ,     c_first_sales_date_sk bigint
> ,     c_salutation char(10)
> ,     c_first_name char(20)
> ,     c_last_name char(30)
> ,     c_preferred_cust_flag char(1)
> ,     c_birth_day int
> ,     c_birth_month int
> ,     c_birth_year int
> ,     c_birth_country varchar(20)
> ,     c_login char(13)
> ,     c_email_address char(50)
> ,     c_last_review_date_sk bigint
> )
> stored as parquet location 'file:///home/username/data/customer';
> -- after adding the data file to the table location:
> select count(*) from customer;
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to