Parviz Deyhim created SPARK-4278:
------------------------------------

             Summary: SparkSQL job failing with java.lang.ClassCastException
                 Key: SPARK-4278
                 URL: https://issues.apache.org/jira/browse/SPARK-4278
             Project: Spark
          Issue Type: Bug
          Components: SQL
            Reporter: Parviz Deyhim


The following job fails with the java.lang.ClassCastException error. Ideally,
SparkSQL should have the ability to ignore records that don't conform to the
inferred schema.

The steps that get me to this error:

1) infer schema from a small subset of data
2) apply the schema to a larger dataset
3) do a simple join of two datasets

sample code:
{code}
val sampleJson = 
sqlContext.jsonRDD(sc.textFile(".../dt=2014-10-10/file.snappy"))
val mydata = sqlContext.jsonRDD(larger_dataset,sampleJson.schema)
mydata.registerTempTable("mytable1")

other dataset:
val x = sc.textFile(".....")
case class Dataset(a:String,state:String, b:String, z:String, c:String, 
d:String)
val xSchemaRDD = 
x.map(_.split("\t")).map(f=>Dataset(f(0),f(1),f(2),f(3),f(4),f(5)))
xSchemaRDD.registerTempTable("mytable2")

{code}


java.lang.ClassCastException: java.lang.Long cannot be cast to java.lang.Integer
        scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:106)
        org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:389)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$enforceCorrectType$1.apply(JsonRDD.scala:397)
        
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
        
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
        
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
        scala.collection.AbstractTraversable.map(Traversable.scala:105)
        org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:397)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
        scala.Option.map(Option.scala:145)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
        
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
        org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
        scala.Option.map(Option.scala:145)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
        
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
        org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
        scala.Option.map(Option.scala:145)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
        
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
        
org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
        scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
        scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
        scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
        
org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:209)
        
org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:65)
        
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
        
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
        org.apache.spark.scheduler.Task.run(Task.scala:56)
        org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:182)
        
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        java.lang.Thread.run(Thread.java:724)



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to