[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15432040#comment-15432040 ] Egor Pahomov commented on SPARK-16334: -- (just reason for reopen) > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15408531#comment-15408531 ] Sameer Agarwal commented on SPARK-16334: Thanks Keith, that'll work. You can mail it to me at sam...@databricks.com. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15408520#comment-15408520 ] Keith Kraus commented on SPARK-16334: - [~sameerag] Sharing even a subset of the data would be very difficult as it has confidential information. I've gone ahead and ran parquet-dump on one part of the parquet (seems like the tool requires pointing it to a specific .parquet file rather than the directory of parts). I'd prefer to share this privately if possible, how can I send you the command output? > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15408334#comment-15408334 ] Sameer Agarwal commented on SPARK-16334: [~keith.j.kraus], [~sebastianherold] -- would it be possible for you share the subset of data that's causing this bug (privately, if you'd like)? If that's not really an option, can you please share some more information about its format (row group/ page encoding etc.) by running [parquet-dump | https://github.com/apache/parquet-mr/blob/master/parquet-tools/src/main/scripts/parquet-dump] on your data? > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15407607#comment-15407607 ] Sebastian Herold commented on SPARK-16334: -- [~sameer] We can reproduce the problem as well. {{spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")}} help us, too. We let you know, when we tried the patch. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15398130#comment-15398130 ] Keith Kraus commented on SPARK-16334: - [~sameerag] I have just built branch-2.0 which should have included your patch, but I am still experiencing this issue. The parquet file I am using was written using Spark 1.6 and has ~450 columns. Issuing {{spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")}} prevents the issue from occurring, so it's definitely the vectorized parquet reader. Let me know if I can provide any additional information to help resolve this issue. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Assignee: Sameer Agarwal >Priority: Critical > Labels: sql > Fix For: 2.0.1, 2.1.0 > > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15384736#comment-15384736 ] Egor Pahomov commented on SPARK-16334: -- [~sameerag] that particular case on which I experienced the problem - now working for me. It's hard for me to reproduce what exactly made the bug appear before - would not go into such much trouble. Thanks > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15384630#comment-15384630 ] Sameer Agarwal commented on SPARK-16334: [~epakhomov] [~lester] any luck with trying out this change? Also in order to prevent future regressions, it'd be great if you can share an illustrative (anonymized) sample of data on which you're seeing this issue so that we can make it part of the test harness. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15380330#comment-15380330 ] Sameer Agarwal commented on SPARK-16334: cc [~vivanov] too > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15380249#comment-15380249 ] Egor Pahomov commented on SPARK-16334: -- Sure, would test on Monday. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15380187#comment-15380187 ] Reynold Xin commented on SPARK-16334: - [~epakhomov] can you try the patch and see if it fixes your problem? https://github.com/apache/spark/pull/14225 > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15380170#comment-15380170 ] Apache Spark commented on SPARK-16334: -- User 'sameeragarwal' has created a pull request for this issue: https://github.com/apache/spark/pull/14225 > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15380137#comment-15380137 ] Sameer Agarwal commented on SPARK-16334: While I've not been able to reproduce this bug, looking at the stack trace, I think one of the likely causes of this is that we're resetting the dictionary while reading every page in a row batch. This is generally okay but might cause problems if a batch has a combination of dictionary-encoded and non-dictionary-encoded pages. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15372068#comment-15372068 ] Vladimir Ivanov commented on SPARK-16334: - Hi Herman, Thank you for reply! I wasn't able to reproduce this error anymore with this spark.sql.parquet.enableVectorizedReader setting set to false in Spark 2.0. As for steps to reproduce: unfortunately I have reproduced it on partitioned parquet files containing client-sensitive data (we are calling SparkSession programmatically from our app), so I can't provide data here. Let's see what I can do. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15371633#comment-15371633 ] Herman van Hovell commented on SPARK-16334: --- It would also be great if we can reproduce this. Could share an example? > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15371606#comment-15371606 ] Herman van Hovell commented on SPARK-16334: --- Could you try to disable the vectorized parquet reader. You can do this issuing the following SQL statement {{SET spark.sql.parquet.enableVectorizedReader = FALSE}}, or by issuing {{spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")}} in the REPL. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15371572#comment-15371572 ] Vladimir Ivanov commented on SPARK-16334: - I believe it relates to the following change in Spark 2.0: "For example, we have implemented a new vectorized Parquet reader that does decompression and decoding in column batches. When decoding integer columns (on disk), this new reader is roughly 9 times faster than the non-vectorized one" taken from this link: https://databricks.com/blog/2016/05/23/apache-spark-as-a-compiler-joining-a-billion-rows-per-second-on-a-laptop.html Corresponding JIRA tickets: https://issues.apache.org/jira/browse/SPARK-12854 https://issues.apache.org/jira/browse/SPARK-14008 > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15364501#comment-15364501 ] Vladimir Ivanov commented on SPARK-16334: - Can't reproduce it on 1.6.1 too. Also in our case it looks like it fails in VectorizedColumnReader in INT96 case for TimestampType, which in turn calls PlainValuesDictionary.decodeToBinary(int id). > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15363438#comment-15363438 ] Vladimir Ivanov commented on SPARK-16334: - Hi, we discovered problem with the same stacktrace in Spark 2.0. In our case it's thrown during {code}DataFrame.rdd{code} call. Moreover it somehow depends on volume of data, because it is not thrown when we change filter criteria accordingly. We used SparkSQL to write these parquet files and didn't explicitly specify {code}WriterVersion{code} option so I believe whatever version is set by default was used. > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Commented] (SPARK-16334) [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException
[ https://issues.apache.org/jira/browse/SPARK-16334?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=15358424#comment-15358424 ] Liwei Lin commented on SPARK-16334: --- hi [~epahomov], by which tool were your parquet files written, SparkSQL or ? In addition, what's the {{WriterVersion}}, {{PARQUET_1_0 ("v1")}} or {{PARQUET_2_0 ("v2")}}? > [SQL] SQL query on parquet table java.lang.ArrayIndexOutOfBoundsException > - > > Key: SPARK-16334 > URL: https://issues.apache.org/jira/browse/SPARK-16334 > Project: Spark > Issue Type: Bug >Affects Versions: 2.0.0 >Reporter: Egor Pahomov >Priority: Critical > Labels: sql > > Query: > {code} > select * from blabla where user_id = 415706251 > {code} > Error: > {code} > 16/06/30 14:07:27 WARN scheduler.TaskSetManager: Lost task 11.0 in stage 0.0 > (TID 3, hadoop6): java.lang.ArrayIndexOutOfBoundsException: 6934 > at > org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainBinaryDictionary.decodeToBinary(PlainValuesDictionary.java:119) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.decodeDictionaryIds(VectorizedColumnReader.java:273) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:170) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:230) > at > org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) > at > org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:36) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246) > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:780) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > {code} > Work on 1.6.1 -- This message was sent by Atlassian JIRA (v6.3.4#6332) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org