Hello Spark Experts,

I am facing the following issue.


1)      I am converting a org.apache.spark.sql.Row into 
org.apache.spark.mllib.linalg.Vectors using sparse notation

2)      After the parsing proceeds successfully I try to look at the result and 
I get the following error: com.esotericsoftware.kryo.KryoException: 
java.io.IOException: failed to read chunk

Has anybody experience this before? I noticed this bug 
https://issues.apache.org/jira/browse/SPARK-4105

Is it related?

This is the command that creates the RDD of Sparse vectors.

>  val parsedData = stack.map(row => LabeledPoint(row.getDouble(4), 
> sparseVectorCat(row, CategoriesIdx, InteractionIds, tupleMap, vecLength)))

parsedData: 
org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = 
MapPartitionsRDD[64] at map at DataFrame.scala:848


Here is how the data looks like after the conversion into sparse vectors

> val test2 = stack.filter("rows[1] is not null").head(10)

test2: Array[org.apache.spark.sql.Row] = Array([2014-08-17 20:19:00,2014-08-17 
20:19:00,545,Greenville-N.Bern-Washngtn,0.0,EST5EDT,Sunday,20,19,33,8,2014,Sunday|20,545,2014-08-17
 20:19:00,ArrayBuffer(37!2014-08-17 20:19:00!2014-08-17 20:19:50!2014-08-17 
20:19:50!2014-08-17 
20:19:50!545!Greenville-N.Bern-Washngtn!EST5EDT!UNKNOWN!10!CNN!Prime!M!UNKNOWN|10|CNN|Prime|M!2019!0.006461420296348449,
 37!2014-08-17 20:19:00!2014-08-17 20:19:45!2014-08-17 20:19:45!2014-08-17 
20:19:45!545!Greenville-N.Bern-Washngtn!EST5EDT!UNKNOWN!5!CNN!Prime!M!UNKNOWN|5|CNN|Prime|M!2019!0.006461420296348449)],
 [2014-08-17 23:45:00,2014-08-18 
00:45:00,625,Waco-Temple-Bryan,0.0,CST6CDT,Sunday,23,45,33,8,2014,Sunday|23,625,2014-08-18
 00:45:00,ArrayBuffer(276!2014-08-18 00:45:00!2014-08-18 00:45:14!2014-08-17 
23:45:14...

> val parsedTest2 = test2.map(row => LabeledPoint(row.getDouble(4), 
> sparseVectorCat(row, CategoriesIdx, InteractionIds, tupleMap, vecLength)))

parsedTest2: Array[org.apache.spark.mllib.regression.LabeledPoint] = 
Array((0.0,(2450,[241,1452,1480,1608,1706],[1.0,1.0,2019.0,2019.0,1.0])), 
(0.0,(2450,[6,1133,1342,2184,2314],[1.0,1.0,1050.0,1244.0,1.0])), 
(0.0,(2450,[414,1133,1310,1605,2206],[941.0,1.0,1.0,1.0,907.0])), 
(0.0,(2450,[322,761,981,1203,1957],[5203.0,1.0,1.0,1.0,5203.0])), 
(1.0,(2450,[117,322,943,1757,1957],[1.0,910.0,1.0,1.0,910.0])), 
(0.0,(2450,[645,1018,1074,1778,1974],[1.0,1507.0,1.0,1507.0,1.0])), 
(0.0,(2450,[522,542,814,1128,1749],[1.0,432.0,796.0,1.0,1.0])), 
(0.0,(2450,[166,322,413,1706,2256],[1.0,1260.0,1.0,1.0,1260.0])), 
(1.0,(2450,[1203,1295,1354,2189,2388],[1.0,1.0,1.0,3705.0,2823.0])), 
(6.0,(2450,[293,1203,1312,1627,2035],[2716.0,1.0,1.0,1.0,2716.0])))


> parsedData.first

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in 
stage 52.0 failed 4 times, most recent failure: Lost task 0.3 in stage 52.0 
(TID 4243, ip-10-0-0-174.ec2.internal): 
com.esotericsoftware.kryo.KryoException: java.io.IOException: failed to read 
chunk
Serialization trace:
values (org.apache.spark.sql.catalyst.expressions.GenericMutableRow)
            at com.esotericsoftware.kryo.io.Input.fill(Input.java:142)
            at com.esotericsoftware.kryo.io.Input.require(Input.java:169)
            at com.esotericsoftware.kryo.io.Input.readUtf8_slow(Input.java:524)
            at com.esotericsoftware.kryo.io.Input.readUtf8(Input.java:517)
            at com.esotericsoftware.kryo.io.Input.readString(Input.java:447)
            at 
com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:157)
            at 
com.esotericsoftware.kryo.serializers.DefaultSerializers$StringSerializer.read(DefaultSerializers.java:146)
            at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:732)
            at 
com.twitter.chill.TraversableSerializer.read(Traversable.scala:43)
            at 
com.twitter.chill.TraversableSerializer.read(Traversable.scala:21)
            at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:732)
            at 
com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:338)
            at 
com.esotericsoftware.kryo.serializers.DefaultArraySerializers$ObjectArraySerializer.read(DefaultArraySerializers.java:293)
            at com.esotericsoftware.kryo.Kryo.readObject(Kryo.java:651)
            at 
com.esotericsoftware.kryo.serializers.FieldSerializer$ObjectField.read(FieldSerializer.java:605)
            at 
com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:221)
            at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:732)
            at 
com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
            at 
com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
            at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:732)
            at 
org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:138)
            at 
org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:133)
            at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:71)
            at 
org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
            at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
            at 
org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
            at 
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
            at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
            at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
            at 
org.apache.spark.sql.execution.joins.HashOuterJoin.org$apache$spark$sql$execution$joins$HashOuterJoin$$buildHashTable(HashOuterJoin.scala:170)
            at 
org.apache.spark.sql.execution.joins.HashOuterJoin$$anonfun$execute$1.apply(HashOuterJoin.scala:193)
            at 
org.apache.spark.sql.execution.joins.HashOuterJoin$$anonfun$execute$1.apply(HashOuterJoin.scala:188)
            at 
org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:88)
            at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
            at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
            at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
            at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
            at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
            at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
            at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
            at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
            at 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
            at org.apache.spark.scheduler.Task.run(Task.scala:64)
            at 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
            at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
            at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
            at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: failed to read chunk
            at 
org.xerial.snappy.SnappyInputStream.hasNextChunk(SnappyInputStream.java:347)
            at 
org.xerial.snappy.SnappyInputStream.rawRead(SnappyInputStream.java:158)
            at 
org.xerial.snappy.SnappyInputStream.read(SnappyInputStream.java:142)
            at com.esotericsoftware.kryo.io.Input.fill(Input.java:140)
            ... 46 more

Driver stacktrace:
            at 
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1230)
            at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1219)
            at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1218)
            at 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
            at 
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
            at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1218)
            at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:719)
            at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:719)
            at scala.Option.foreach(Option.scala:236)
            at 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:719)
            at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1419)
            at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1380)
            at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

<<attachment: winmail.dat>>

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
For additional commands, e-mail: user-h...@spark.apache.org

Reply via email to