[ https://issues.apache.org/jira/browse/SPARK-26511?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Amy Koh updated SPARK-26511: ---------------------------- Attachment: repro.zip > java.lang.ClassCastException error when loading Spark MLlib model from > parquet file > ----------------------------------------------------------------------------------- > > Key: SPARK-26511 > URL: https://issues.apache.org/jira/browse/SPARK-26511 > Project: Spark > Issue Type: Bug > Components: MLlib > Affects Versions: 2.4.0 > Reporter: Amy Koh > Priority: Major > Attachments: repro.zip > > > When I tried to load a decision tree model from a parquet file, the following > error is thrown. > {code:bash} > Py4JJavaError: An error occurred while calling > z:org.apache.spark.mllib.tree.model.DecisionTreeModel.load. : > org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in > stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 > (TID 2, localhost, executor driver): java.lang.ClassCastException: class > java.lang.Double cannot be cast to class java.lang.Integer (java.lang.Double > and java.lang.Integer are in module java.base of loader 'bootstrap') at > scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:101) at > org.apache.spark.sql.Row$class.getInt(Row.scala:223) at > org.apache.spark.sql.catalyst.expressions.GenericRow.getInt(rows.scala:165) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$SplitData$.apply(DecisionTreeModel.scala:171) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$NodeData$.apply(DecisionTreeModel.scala:195) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$$anonfun$9.apply(DecisionTreeModel.scala:247) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$$anonfun$9.apply(DecisionTreeModel.scala:247) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) at > scala.collection.Iterator$$anon$11.next(Iterator.scala:409) at > 
org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at > org.apache.spark.scheduler.Task.run(Task.scala:108) at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335) at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) Driver stacktrace: at > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486) > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) > at scala.Option.foreach(Option.scala:257) at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at > 
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) at > org.apache.spark.SparkContext.runJob(SparkContext.scala:2022) at > org.apache.spark.SparkContext.runJob(SparkContext.scala:2043) at > org.apache.spark.SparkContext.runJob(SparkContext.scala:2062) at > org.apache.spark.SparkContext.runJob(SparkContext.scala:2087) at > org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) at > org.apache.spark.rdd.RDD.collect(RDD.scala:935) at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$.constructTrees(DecisionTreeModel.scala:262) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$.load(DecisionTreeModel.scala:249) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$.load(DecisionTreeModel.scala:326) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel.load(DecisionTreeModel.scala) > at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native > Method) at > java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:566) at > py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at > py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at > py4j.Gateway.invoke(Gateway.java:280) at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at > py4j.commands.CallCommand.execute(CallCommand.java:79) at > py4j.GatewayConnection.run(GatewayConnection.java:214) at > java.base/java.lang.Thread.run(Thread.java:834) Caused by: > java.lang.ClassCastException: class java.lang.Double cannot be cast to class > java.lang.Integer 
(java.lang.Double and java.lang.Integer are in module > java.base of loader 'bootstrap') at > scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:101) at > org.apache.spark.sql.Row$class.getInt(Row.scala:223) at > org.apache.spark.sql.catalyst.expressions.GenericRow.getInt(rows.scala:165) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$SplitData$.apply(DecisionTreeModel.scala:171) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$NodeData$.apply(DecisionTreeModel.scala:195) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$$anonfun$9.apply(DecisionTreeModel.scala:247) > at > org.apache.spark.mllib.tree.model.DecisionTreeModel$SaveLoadV1_0$$anonfun$9.apply(DecisionTreeModel.scala:247) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) at > scala.collection.Iterator$$anon$11.next(Iterator.scala:409) at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) at > org.apache.spark.scheduler.Task.run(Task.scala:108) at > org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335) at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > ... 
1 more > {code} > Reproduction steps are as follows, with reproduction files attached: > {code:python} > from pyspark.mllib.tree import DecisionTree, DecisionTreeModel > from pyspark.mllib.util import MLUtils > from pyspark import SparkContext > sc = SparkContext() > model = DecisionTreeModel.load(spark, <modelFilePath>) > {code} > > -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org