[
https://issues.apache.org/jira/browse/HIVE-20032?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16555512#comment-16555512
]
Rui Li commented on HIVE-20032:
-------------------------------
I ran a simple query in yarn-cluster mode with patch v8 and hit an issue:
{noformat}
2018-07-25T17:58:05,859 ERROR [6f7f3077-05bf-45cc-bf32-4c65132ccf48 main]
status.SparkJobMonitor: Spark job[-1] failed
java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/io/HiveKey
at
org.apache.hive.spark.HiveKryoRegistrator.registerClasses(HiveKryoRegistrator.java:37)
~[hive-kryo-registrator-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$6.apply(KryoSerializer.scala:136)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$6.apply(KryoSerializer.scala:136)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
~[scala-library-2.11.8.jar:?]
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
~[scala-library-2.11.8.jar:?]
at
org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:136)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.serializer.KryoSerializerInstance.borrowKryo(KryoSerializer.scala:324)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.serializer.KryoSerializerInstance.<init>(KryoSerializer.scala:309)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.serializer.KryoSerializer.newInstance(KryoSerializer.scala:218)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:288)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:127)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:88)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1481)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at org.apache.spark.rdd.HadoopRDD.<init>(HadoopRDD.scala:117)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.SparkContext$$anonfun$hadoopRDD$1.apply(SparkContext.scala:997)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.SparkContext$$anonfun$hadoopRDD$1.apply(SparkContext.scala:988)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at org.apache.spark.SparkContext.withScope(SparkContext.scala:692)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at org.apache.spark.SparkContext.hadoopRDD(SparkContext.scala:988)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.spark.api.java.JavaSparkContext.hadoopRDD(JavaSparkContext.scala:416)
~[spark-core_2.11-2.3.0.jar:2.3.0]
at
org.apache.hadoop.hive.ql.exec.spark.SparkPlanGenerator.generateMapInput(SparkPlanGenerator.java:239)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.hadoop.hive.ql.exec.spark.SparkPlanGenerator.generateParentTran(SparkPlanGenerator.java:176)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.hadoop.hive.ql.exec.spark.SparkPlanGenerator.generate(SparkPlanGenerator.java:127)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient$JobStatusJob.call(RemoteHiveSparkClient.java:361)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:400)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at
org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:365)
~[hive-exec-4.0.0-SNAPSHOT.jar:4.0.0-SNAPSHOT]
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
~[?:1.8.0_151]
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
~[?:1.8.0_151]
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
~[?:1.8.0_151]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_151]
Caused by: java.lang.ClassNotFoundException:
org.apache.hadoop.hive.ql.io.HiveKey
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
~[?:1.8.0_151]
at java.lang.ClassLoader.loadClass(ClassLoader.java:424) ~[?:1.8.0_151]
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
~[?:1.8.0_151]
{noformat}
Any ideas what caused this?
> Don't serialize hashCode for repartitionAndSortWithinPartitions
> ---------------------------------------------------------------
>
> Key: HIVE-20032
> URL: https://issues.apache.org/jira/browse/HIVE-20032
> Project: Hive
> Issue Type: Improvement
> Components: Spark
> Reporter: Sahil Takiar
> Assignee: Sahil Takiar
> Priority: Major
> Attachments: HIVE-20032.1.patch, HIVE-20032.2.patch,
> HIVE-20032.3.patch, HIVE-20032.4.patch, HIVE-20032.5.patch,
> HIVE-20032.6.patch, HIVE-20032.7.patch, HIVE-20032.8.patch, HIVE-20032.9.patch
>
>
> Follow up on HIVE-15104, if we don't enable RDD caching or groupByShuffles,
> then we don't need to serialize the hashCode when shuffling data in HoS.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)