[
https://issues.apache.org/jira/browse/SPARK-27913?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17087813#comment-17087813
]
Giri Dandu commented on SPARK-27913:
------------------------------------
[~hyukjin.kwon]
With the ORC {{mergeSchema}} option it is still *NOT* working in Spark 2.4.5:
{code:java}
scala> spark.conf.getAll
res15: Map[String,String] =
Map(spark.driver.host -> 192.168.7.124, spark.sql.orc.mergeSchema -> true,
spark.driver.port -> 54231, spark.repl.class.uri ->
spark://192.168.7.124:54231/classes, spark.jars -> "",
spark.repl.class.outputDir ->
/private/var/folders/6h/nkpdlpcd0h34sq6x2fmz896wt2p_sp/T/spark-373735c9-6837-4734-bb13-e8457848a70e/repl-0852551f-cfa5-4b4a-aa2c-ac129818bbc2,
spark.app.name -> Spark shell, spark.ui.showConsoleProgress -> true,
spark.executor.id -> driver, spark.submit.deployMode -> client, spark.master ->
local[*], spark.home -> /Users/gdandu/Downloads/spark-2.4.5-bin-hadoop2.7,
spark.sql.catalogImplementation -> hive, spark.app.id -> local-1587393426045)
scala> spark.sql("drop table test_broken_orc");
res16: org.apache.spark.sql.DataFrame = []
scala> spark.sql("create external table test_broken_orc(a struct<f1:int>)
stored as orc location '/tmp/test_broken_2'");
res17: org.apache.spark.sql.DataFrame = []
scala> spark.sql("insert into table test_broken_orc select named_struct(\"f1\",
1)");
res18: org.apache.spark.sql.DataFrame = []
scala> spark.sql("select * from test_broken_orc");
res19: org.apache.spark.sql.DataFrame = [a: struct<f1: int>]
scala> res19.show
+---+
|  a|
+---+
|[1]|
+---+
scala> spark.sql("drop table test_broken_orc");
res21: org.apache.spark.sql.DataFrame = []
scala> spark.sql("create external table test_broken_orc(a struct<f1:int, f2:
int>) stored as orc location '/tmp/test_broken_2'");
res22: org.apache.spark.sql.DataFrame = []
scala> spark.sql("select * from test_broken_orc");
res23: org.apache.spark.sql.DataFrame = [a: struct<f1: int, f2: int>]
scala> res23.show
20/04/20 10:46:23 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
java.lang.ArrayIndexOutOfBoundsException: 1 at
org.apache.orc.mapred.OrcStruct.getFieldValue(OrcStruct.java:49) at
org.apache.spark.sql.execution.datasources.orc.OrcDeserializer$$anonfun$org$apache$spark$sql$execution$datasources$orc$OrcDeserializer$$newWriter$14.apply(OrcDeserializer.scala:133)
at
org.apache.spark.sql.execution.datasources.orc.OrcDeserializer$$anonfun$org$apache$spark$sql$execution$datasources$orc$OrcDeserializer$$newWriter$14.apply(OrcDeserializer.scala:123)
at
org.apache.spark.sql.execution.datasources.orc.OrcDeserializer$$anonfun$2$$anonfun$apply$1.apply(OrcDeserializer.scala:51)
at
org.apache.spark.sql.execution.datasources.orc.OrcDeserializer$$anonfun$2$$anonfun$apply$1.apply(OrcDeserializer.scala:51)
at
org.apache.spark.sql.execution.datasources.orc.OrcDeserializer.deserialize(OrcDeserializer.scala:64)
at
org.apache.spark.sql.execution.datasources.orc.OrcFileFormat$$anonfun$buildReaderWithPartitionValues$2$$anonfun$apply$7.apply(OrcFileFormat.scala:230)
at
org.apache.spark.sql.execution.datasources.orc.OrcFileFormat$$anonfun$buildReaderWithPartitionValues$2$$anonfun$apply$7.apply(OrcFileFormat.scala:230)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410) at
org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.next(FileScanRDD.scala:104)
at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source) at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) at
org.apache.spark.rdd.RDD.iterator(RDD.scala:310) at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346) at
org.apache.spark.rdd.RDD.iterator(RDD.scala:310) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.run(Task.scala:123) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360) at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414) at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/04/20 10:46:23 WARN TaskSetManager:
Lost task 0.0 in stage 5.0 (TID 5, localhost, executor driver):
java.lang.ArrayIndexOutOfBoundsException: 1 at
org.apache.orc.mapred.OrcStruct.getFieldValue(OrcStruct.java:49)
{code}
> Spark SQL's native ORC reader implements its own schema evolution
> -----------------------------------------------------------------
>
> Key: SPARK-27913
> URL: https://issues.apache.org/jira/browse/SPARK-27913
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.3.3
> Reporter: Owen O'Malley
> Priority: Major
>
> ORC's reader handles a wide range of schema evolution, but the Spark SQL
> native ORC bindings do not provide the desired schema to the ORC reader. This
> causes a regression when moving spark.sql.orc.impl from 'hive' to 'native'.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]