[ https://issues.apache.org/jira/browse/SPARK-32234?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17177708#comment-17177708 ]
Ramakrishna Prasad K S commented on SPARK-32234: ------------------------------------------------ Thanks [~saurabhc100] I am going ahead and merging these changes into my local Spark_3.0 setup. I hope there are no regressions or side effects due to these changes. Just wanted to know why this bug is still in the resolved state. Is any test still pending to be run? Thank you. > Spark sql commands are failing on select Queries for the orc tables > -------------------------------------------------------------------- > > Key: SPARK-32234 > URL: https://issues.apache.org/jira/browse/SPARK-32234 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.0.0 > Reporter: Saurabh Chawla > Assignee: Saurabh Chawla > Priority: Blocker > Fix For: 3.0.1, 3.1.0 > > Attachments: e17f6887c06d47f6a62c0140c1ad569c_000000 > > > Spark sql commands are failing on select Queries for the orc tables > Steps to reproduce > > {code:java} > val table = """CREATE TABLE `date_dim` ( > `d_date_sk` INT, > `d_date_id` STRING, > `d_date` TIMESTAMP, > `d_month_seq` INT, > `d_week_seq` INT, > `d_quarter_seq` INT, > `d_year` INT, > `d_dow` INT, > `d_moy` INT, > `d_dom` INT, > `d_qoy` INT, > `d_fy_year` INT, > `d_fy_quarter_seq` INT, > `d_fy_week_seq` INT, > `d_day_name` STRING, > `d_quarter_name` STRING, > `d_holiday` STRING, > `d_weekend` STRING, > `d_following_holiday` STRING, > `d_first_dom` INT, > `d_last_dom` INT, > `d_same_day_ly` INT, > `d_same_day_lq` INT, > `d_current_day` STRING, > `d_current_week` STRING, > `d_current_month` STRING, > `d_current_quarter` STRING, > `d_current_year` STRING) > USING orc > LOCATION '/Users/test/tpcds_scale5data/date_dim' > TBLPROPERTIES ( > 'transient_lastDdlTime' = '1574682806')""" > spark.sql(table).collect > val u = """select date_dim.d_date_id from date_dim limit 5""" > spark.sql(u).collect > {code} > > > Exception > > {code:java} > org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in > stage 2.0 failed 1 times, most recent 
failure: Lost task 0.0 in stage 2.0 > (TID 2, 192.168.0.103, executor driver): > java.lang.ArrayIndexOutOfBoundsException: 1 > at > org.apache.spark.sql.execution.datasources.orc.OrcColumnarBatchReader.initBatch(OrcColumnarBatchReader.java:156) > at > org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$7(OrcFileFormat.scala:258) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:141) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:203) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116) > at > org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:620) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:343) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:895) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:895) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:372) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:336) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) > at org.apache.spark.scheduler.Task.run(Task.scala:133) > at > 
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:445) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1489) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:448) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > {code} > > > The reason behind this initBatch is not getting the schema that is needed to > find out the column value in OrcFileFormat.scala > > {code:java} > batchReader.initBatch( > TypeDescription.fromString(resultSchemaString){code} > > Query is working if > {code:java} > val u = """select * from date_dim limit 5"""{code} > -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org