[ https://issues.apache.org/jira/browse/PARQUET-2078?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Nemon Lou updated PARQUET-2078:
-------------------------------
Description:
Writing a Parquet file with version 1.12.0 in Apache Hive, then reading that file back, fails with the following error:
{noformat}
Caused by: java.lang.IllegalStateException: All of the offsets in the split
should be found in the file. expected: [4, 133961161] found:
[BlockMetaData{1530100, 133961157 [ColumnMetaData{UNCOMPRESSED [c_customer_sk]
optional int64 c_customer_sk [PLAIN, RLE, BIT_PACKED], 4},
ColumnMetaData{UNCOMPRESSED [c_customer_id] optional binary c_customer_id
(STRING) [PLAIN, RLE, BIT_PACKED], 12243647}, ColumnMetaData{UNCOMPRESSED
[c_current_cdemo_sk] optional int64 c_current_cdemo_sk [PLAIN, RLE,
BIT_PACKED], 42848491}, ColumnMetaData{UNCOMPRESSED [c_current_hdemo_sk]
optional int64 c_current_hdemo_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED],
54868535}, ColumnMetaData{UNCOMPRESSED [c_current_addr_sk] optional int64
c_current_addr_sk [PLAIN, RLE, BIT_PACKED], 57421932},
ColumnMetaData{UNCOMPRESSED [c_first_shipto_date_sk] optional int64
c_first_shipto_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 69694809},
ColumnMetaData{UNCOMPRESSED [c_first_sales_date_sk] optional int64
c_first_sales_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 72093040},
ColumnMetaData{UNCOMPRESSED [c_salutation] optional binary c_salutation
(STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 74461508},
ColumnMetaData{UNCOMPRESSED [c_first_name] optional binary c_first_name
(STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED], 75092758},
ColumnMetaData{UNCOMPRESSED [c_last_name] optional binary c_last_name (STRING)
[RLE, PLAIN_DICTIONARY, BIT_PACKED], 77626525}, ColumnMetaData{UNCOMPRESSED
[c_preferred_cust_flag] optional binary c_preferred_cust_flag (STRING) [RLE,
PLAIN_DICTIONARY, BIT_PACKED], 80116456}, ColumnMetaData{UNCOMPRESSED
[c_birth_day] optional int32 c_birth_day [RLE, PLAIN_DICTIONARY, BIT_PACKED],
80505351}, ColumnMetaData{UNCOMPRESSED [c_birth_month] optional int32
c_birth_month [RLE, PLAIN_DICTIONARY, BIT_PACKED], 81581772},
ColumnMetaData{UNCOMPRESSED [c_birth_year] optional int32 c_birth_year [RLE,
PLAIN_DICTIONARY, BIT_PACKED], 82473740}, ColumnMetaData{UNCOMPRESSED
[c_birth_country] optional binary c_birth_country (STRING) [RLE,
PLAIN_DICTIONARY, BIT_PACKED], 83921564}, ColumnMetaData{UNCOMPRESSED [c_login]
optional binary c_login (STRING) [RLE, PLAIN_DICTIONARY, BIT_PACKED],
85457674}, ColumnMetaData{UNCOMPRESSED [c_email_address] optional binary
c_email_address (STRING) [PLAIN, RLE, BIT_PACKED], 85460523},
ColumnMetaData{UNCOMPRESSED [c_last_review_date_sk] optional int64
c_last_review_date_sk [RLE, PLAIN_DICTIONARY, BIT_PACKED], 132146109}]}]
    at org.apache.parquet.hadoop.ParquetRecordReader.initializeInternalReader(ParquetRecordReader.java:172) ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
    at org.apache.parquet.hadoop.ParquetRecordReader.initialize(ParquetRecordReader.java:140) ~[parquet-hadoop-bundle-1.12.0.jar:1.12.0]
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:95) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:60) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat.getRecordReader(MapredParquetInputFormat.java:89) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.CombineHiveRecordReader.<init>(CombineHiveRecordReader.java:96) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) ~[?:1.8.0_292]
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) ~[?:1.8.0_292]
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) ~[?:1.8.0_292]
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423) ~[?:1.8.0_292]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:254) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.<init>(HadoopShimsSecure.java:214) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getRecordReader(HadoopShimsSecure.java:342) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getRecordReader(CombineHiveInputFormat.java:716) ~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
    at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:175) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:444) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:349) ~[hadoop-mapreduce-client-core-3.1.4.jar:?]
    at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:271) ~[hadoop-mapreduce-client-common-3.1.4.jar:?]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_292]
    at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_292]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_292]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_292]
    at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_292]
{noformat}
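
As the message itself states, the "expected" offsets come from the input split, while the "found" row groups come from the file footer; the exception fires when the two lists disagree. As a diagnostic aid (not part of the original report), here is a minimal sketch that prints each row group's starting offset as recorded in the footer, assuming parquet-hadoop 1.12.0 on the classpath; the class name and argument handling are illustrative:

{code:java}
// Illustrative diagnostic sketch: print each row group's row count and
// starting offset from the Parquet footer. The split-level check in the
// trace above expects to find exactly these starting offsets in the file.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class DumpRowGroupOffsets {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to the Parquet file written by Hive
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        System.out.println("rowCount=" + block.getRowCount()
            + ", startingPos=" + block.getStartingPos());
      }
    }
  }
}
{code}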
> Failed to read parquet file after writing with the same parquet version
> -----------------------------------------------------------------------
>
> Key: PARQUET-2078
> URL: https://issues.apache.org/jira/browse/PARQUET-2078
> Project: Parquet
> Issue Type: Bug
> Components: parquet-mr
> Affects Versions: 1.12.0
> Reporter: Nemon Lou
> Priority: Critical
>
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)