Parviz Deyhim created SPARK-4278:
------------------------------------
Summary: SparkSQL job failing with java.lang.ClassCastException
Key: SPARK-4278
URL: https://issues.apache.org/jira/browse/SPARK-4278
Project: Spark
Issue Type: Bug
Components: SQL
Reporter: Parviz Deyhim
The following job fails with the java.lang.ClassCastException error. Ideally
SparkSQL should have the ability to ignore records that don't conform with the
inferred schema.
The steps that gets me to this error:
1) infer schema from a small subset of data
2) apply the schema to a larger dataset
3) do a simple join of two datasets
sample code:
{code}
val sampleJson =
sqlContext.jsonRDD(sc.textFile(".../dt=2014-10-10/file.snappy"))
val mydata = sqlContext.jsonRDD(larger_dataset,sampleJson.schema)
mydata.registerTempTable("mytable1")
other dataset:
val x = sc.textFile(".....")
case class Dataset(a:String,state:String, b:String, z:String, c:String,
d:String)
val xSchemaRDD =
x.map(_.split("\t")).map(f=>Dataset(f(0),f(1),f(2),f(3),f(4),f(5)))
xSchemaRDD.registerTempTable("mytable2")
{code}
java.lang.ClassCastException: java.lang.Long cannot be cast to java.lang.Integer
scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:106)
org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:389)
org.apache.spark.sql.json.JsonRDD$$anonfun$enforceCorrectType$1.apply(JsonRDD.scala:397)
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
scala.collection.AbstractTraversable.map(Traversable.scala:105)
org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:397)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
scala.Option.map(Option.scala:145)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
scala.Option.map(Option.scala:145)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
scala.Option.map(Option.scala:145)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:209)
org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:65)
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
org.apache.spark.scheduler.Task.run(Task.scala:56)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:182)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
java.lang.Thread.run(Thread.java:724)
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]