[ https://issues.apache.org/jira/browse/SPARK-5532?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Joseph K. Bradley updated SPARK-5532:
-------------------------------------
    Description: 
Deterministic failure:
{code}
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
import sqlContext._

val data = sc.parallelize(Seq((1.0, Vectors.dense(1,2,3)))).toDataFrame("label", "features")
data.repartition(1).saveAsParquetFile("blah")
{code}
If you remove the repartition, then this succeeds.

Here's the stack trace:
{code}
15/02/02 12:10:53 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 4, 192.168.1.230): java.lang.ClassCastException: org.apache.spark.mllib.linalg.DenseVector cannot be cast to org.apache.spark.sql.Row
    at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:186)
    at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:177)
    at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:166)
    at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:129)
    at parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:120)
    at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:81)
    at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:37)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable.org$apache$spark$sql$parquet$InsertIntoParquetTable$$writeShard$1(ParquetTableOperations.scala:315)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
    at org.apache.spark.scheduler.Task.run(Task.scala:64)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:194)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
15/02/02 12:10:54 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 7, 192.168.1.230): java.lang.ClassCastException: org.apache.spark.mllib.linalg.DenseVector cannot be cast to org.apache.spark.sql.Row
    at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:186)
    at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:177)
    at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:166)
    at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:129)
    at parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:120)
    at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:81)
    at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:37)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable.org$apache$spark$sql$parquet$InsertIntoParquetTable$$writeShard$1(ParquetTableOperations.scala:315)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
    at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
    at org.apache.spark.scheduler.Task.run(Task.scala:64)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:194)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:684)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:684)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:684)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1366)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1327)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
{code}

  was:
Deterministic failure:
{code}
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
import sqlContext._

val data = sc.parallelize(Seq((1.0, Vectors.dense(1,2,3)))).toDataFrame("label", "features")
data.repartition(1).saveAsParquetFile("blah")
{code}
If you remove the repartition, then this succeeds.
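
A possible workaround implied by the report: set the partition count when the source RDD is created rather than repartitioning the DataFrame. A minimal sketch of that variant follows; the report only confirms that the save succeeds without any repartition, so whether presetting {{numSlices}} (a standard parameter of {{sc.parallelize}}) avoids the bug is an assumption, and everything else mirrors the reproduction above.
{code}
import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
import sqlContext._

// Set the partition count on the source RDD (numSlices = 1, an assumption)
// so the DataFrame never has to be repartitioned before the save.
val data = sc.parallelize(Seq((1.0, Vectors.dense(1,2,3))), numSlices = 1)
  .toDataFrame("label", "features")

// Per the report, saveAsParquetFile succeeds when no repartition is involved.
data.saveAsParquetFile("blah")
{code}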

> Repartitioning DataFrame causes saveAsParquetFile to fail with VectorUDT
> ------------------------------------------------------------------------
>
>                 Key: SPARK-5532
>                 URL: https://issues.apache.org/jira/browse/SPARK-5532
>             Project: Spark
>          Issue Type: Bug
>          Components: MLlib, SQL
>    Affects Versions: 1.3.0
>            Reporter: Joseph K. Bradley
>
> Deterministic failure:
> {code}
> import org.apache.spark.mllib.linalg._
> import org.apache.spark.sql.SQLContext
>
> val sqlContext = new SQLContext(sc)
> import sqlContext._
>
> val data = sc.parallelize(Seq((1.0, Vectors.dense(1,2,3)))).toDataFrame("label", "features")
> data.repartition(1).saveAsParquetFile("blah")
> {code}
> If you remove the repartition, then this succeeds.
>
> Here's the stack trace:
> {code}
> 15/02/02 12:10:53 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 4, 192.168.1.230): java.lang.ClassCastException: org.apache.spark.mllib.linalg.DenseVector cannot be cast to org.apache.spark.sql.Row
>     at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:186)
>     at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:177)
>     at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:166)
>     at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:129)
>     at parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:120)
>     at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:81)
>     at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:37)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable.org$apache$spark$sql$parquet$InsertIntoParquetTable$$writeShard$1(ParquetTableOperations.scala:315)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>     at org.apache.spark.scheduler.Task.run(Task.scala:64)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:194)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
> 15/02/02 12:10:54 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 7, 192.168.1.230): java.lang.ClassCastException: org.apache.spark.mllib.linalg.DenseVector cannot be cast to org.apache.spark.sql.Row
>     at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:186)
>     at org.apache.spark.sql.parquet.RowWriteSupport.writeValue(ParquetTableSupport.scala:177)
>     at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:166)
>     at org.apache.spark.sql.parquet.RowWriteSupport.write(ParquetTableSupport.scala:129)
>     at parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:120)
>     at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:81)
>     at parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:37)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable.org$apache$spark$sql$parquet$InsertIntoParquetTable$$writeShard$1(ParquetTableOperations.scala:315)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
>     at org.apache.spark.sql.parquet.InsertIntoParquetTable$$anonfun$saveAsHadoopFile$1.apply(ParquetTableOperations.scala:332)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>     at org.apache.spark.scheduler.Task.run(Task.scala:64)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:194)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
> Driver stacktrace:
>     at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1185)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1174)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1173)
>     at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>     at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1173)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:684)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:684)
>     at scala.Option.foreach(Option.scala:236)
>     at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:684)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1366)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1327)
>     at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> {code}
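
The ClassCastException in {{RowWriteSupport.writeValue}} suggests that after the repartition, the raw {{DenseVector}} reaches the Parquet writer instead of the {{Row}} that VectorUDT's struct-backed SQL representation should have produced. The self-contained Scala sketch below illustrates that contract with hypothetical stand-ins ({{MyDenseVector}}, {{MySqlRow}}, {{serializeUdt}}, {{writeValue}}); it is not Spark's actual code, only a model of the failure mode.
{code}
// Hypothetical stand-ins for the classes named in the stack trace.
case class MyDenseVector(values: Array[Double])   // plays mllib's DenseVector
case class MySqlRow(fields: Seq[Any])             // plays sql.Row

// Plays the UDT: converts the user object into the row form the writer expects.
def serializeUdt(v: MyDenseVector): MySqlRow = MySqlRow(Seq(v.values.toSeq))

// Plays RowWriteSupport.writeValue: casts its input to the row form, which is
// the cast that fails in the report when the conversion step is skipped.
def writeValue(value: Any): Unit = {
  val row = value.asInstanceOf[MySqlRow]
  println(s"writing struct with ${row.fields.size} field(s)")
}

val v = MyDenseVector(Array(1.0, 2.0, 3.0))
writeValue(serializeUdt(v)) // ok: the UDT conversion ran first
writeValue(v)               // ClassCastException, matching the trace above
{code}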

--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org