Re: CassandraSQLContext throwing NullPointer Exception

2015-09-28 Thread Ted Yu
Experts on spark-connector-user mailing list should be able to chime in.

My reply to that list bounced since I didn't subscribe to that list.

FYI

On Mon, Sep 28, 2015 at 10:07 AM, Priya Ch 
wrote:

> Ted,
>
> I am using Spark 1.3.0 and running the code in YARN mode.
>
> Here is the code:
>
> object streaming {
> def main(args:Array[String])
> {
>   val conf = new SparkConf()
>   conf.setMaster("yarn-client")
>   conf.setAppName("SimpleApp")
>   conf.set("spark.cassandra.connection.host","")
>
> val sc = new SparkContext(conf)
> val sqlContext = new CassandraSQLContext(sc)
>
> val dstream = KafkaUtils.createStream()
> val words = dstream.flatMap(line => line.split(","))
> val wordPairs = words.map(word =>(word,1))
> val reducedStream = wordPairs.reduceByKey((a,b) => a+b)
>
> reducedStream.foreachRDD { rdd =>
>   rdd.foreach { case (key, value) =>
>     val df = getDataFrame(sqlContext, key)
>     df.save("org.apache.spark.sql.cassandra", SaveMode.Overwrite,
>       Map("c_table" -> "table1", "keyspace" -> "test"))
>   }
> }
>   }
>
> def getDataFrame(cqlContext: CassandraSQLContext, key: String): DataFrame =
>   cqlContext.sql("select word, count from wordcount where word = '" + key + "'")
> }
>
> In the above code, the method getDataFrame throws a NullPointerException
> at the cqlContext.sql line.
>
> On Mon, Sep 28, 2015 at 6:54 PM, Ted Yu  wrote:
>
>> Which Spark release are you using?
>>
>> Can you show the snippet of your code around CassandraSQLContext#sql()?
>>
>> Thanks
>>
>> On Mon, Sep 28, 2015 at 6:21 AM, Priya Ch 
>> wrote:
>>
>>> Hi All,
>>>
>>>  I am trying to use DataFrames (which contain data from Cassandra) in
>>> rdd.foreach. This throws the following exception:
>>>
>>> Is CassandraSQLContext accessible within an executor?
>>>
>>> 15/09/28 17:22:40 ERROR JobScheduler: Error running job streaming job
>>> 144344116 ms.0
>>> org.apache.spark.SparkException: Job aborted due to stage failure: Task
>>> 0 in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage
>>> 6.0 (TID 83, blrrndnbudn05.rnd.amadeus.net):
>>> java.lang.NullPointerException
>>> at org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:134)
>>> at
>>> org.apache.spark.sql.cassandra.CassandraSQLContext.cassandraSql(CassandraSQLContext.scala:74)
>>> at
>>> org.apache.spark.sql.cassandra.CassandraSQLContext.sql(CassandraSQLContext.scala:77)
>>> at
>>> com.amadeus.spark.example.SparkDemo$.withDataFrames(SparkDemo.scala:170)
>>> at
>>> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
>>> at
>>> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
>>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>>> at
>>> org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
>>> at
>>> org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
>>> at
>>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
>>> at
>>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
>>> at
>>> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>>> at org.apache.spark.scheduler.Task.run(Task.scala:64)
>>> at
>>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>>> at
>>> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>> at
>>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>> at java.lang.Thread.run(Thread.java:745)
>>>
>>> Driver stacktrace:
>>> at org.apache.spark.scheduler.DAGScheduler.org
>>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
>>> at
>>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>>> at
>>> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>>> at scala.Option.foreach(Option.scala:236)
>>> at
>>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
>>> at
>>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
>>> at
>>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
>>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

Re: CassandraSQLContext throwing NullPointer Exception

2015-09-28 Thread Priya Ch
Ted,

I am using Spark 1.3.0 and running the code in YARN mode.

Here is the code:

object streaming {
def main(args:Array[String])
{
  val conf = new SparkConf()
  conf.setMaster("yarn-client")
  conf.setAppName("SimpleApp")
  conf.set("spark.cassandra.connection.host","")

val sc = new SparkContext(conf)
val sqlContext = new CassandraSQLContext(sc)

val dstream = KafkaUtils.createStream()
val words = dstream.flatMap(line => line.split(","))
val wordPairs = words.map(word =>(word,1))
val reducedStream = wordPairs.reduceByKey((a,b) => a+b)

reducedStream.foreachRDD { rdd =>
  rdd.foreach { case (key, value) =>
    val df = getDataFrame(sqlContext, key)
    df.save("org.apache.spark.sql.cassandra", SaveMode.Overwrite,
      Map("c_table" -> "table1", "keyspace" -> "test"))
  }
}
  }

def getDataFrame(cqlContext: CassandraSQLContext, key: String): DataFrame =
  cqlContext.sql("select word, count from wordcount where word = '" + key + "'")
}

In the above code, the method getDataFrame throws a NullPointerException
at the cqlContext.sql line.
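
One way to avoid this (a minimal sketch, assuming the per-batch key set is
small enough to collect()) is to keep every cqlContext.sql call on the
driver, since foreachRDD itself runs on the driver:

reducedStream.foreachRDD { rdd =>
  // collect() brings the (word, count) pairs back to the driver, so the
  // CassandraSQLContext is never shipped inside a task closure.
  rdd.collect().foreach { case (key, _) =>
    val df = getDataFrame(sqlContext, key)
    df.save("org.apache.spark.sql.cassandra", SaveMode.Overwrite,
      Map("c_table" -> "table1", "keyspace" -> "test"))
  }
}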

On Mon, Sep 28, 2015 at 6:54 PM, Ted Yu  wrote:

> Which Spark release are you using?
>
> Can you show the snippet of your code around CassandraSQLContext#sql()?
>
> Thanks
>
> On Mon, Sep 28, 2015 at 6:21 AM, Priya Ch 
> wrote:
>
>> Hi All,
>>
>>  I am trying to use DataFrames (which contain data from Cassandra) in
>> rdd.foreach. This throws the following exception:
>>
>> Is CassandraSQLContext accessible within an executor?
>>
>> 15/09/28 17:22:40 ERROR JobScheduler: Error running job streaming job
>> 144344116 ms.0
>> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
>> in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage
>> 6.0 (TID 83, blrrndnbudn05.rnd.amadeus.net):
>> java.lang.NullPointerException
>> at org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:134)
>> at
>> org.apache.spark.sql.cassandra.CassandraSQLContext.cassandraSql(CassandraSQLContext.scala:74)
>> at
>> org.apache.spark.sql.cassandra.CassandraSQLContext.sql(CassandraSQLContext.scala:77)
>> at
>> com.amadeus.spark.example.SparkDemo$.withDataFrames(SparkDemo.scala:170)
>> at
>> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
>> at
>> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
>> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
>> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
>> at
>> org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
>> at
>> org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
>> at
>> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
>> at
>> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>> at org.apache.spark.scheduler.Task.run(Task.scala:64)
>> at
>> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>> at
>> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>> at
>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>> at java.lang.Thread.run(Thread.java:745)
>>
>> Driver stacktrace:
>> at org.apache.spark.scheduler.DAGScheduler.org
>> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
>> at
>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>> at
>> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>> at
>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>> at
>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>> at scala.Option.foreach(Option.scala:236)
>> at
>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
>> at
>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>
>>
>


Re: CassandraSQLContext throwing NullPointer Exception

2015-09-28 Thread Ted Yu
Which Spark release are you using?

Can you show the snippet of your code around CassandraSQLContext#sql()?

Thanks

On Mon, Sep 28, 2015 at 6:21 AM, Priya Ch 
wrote:

> Hi All,
>
>  I am trying to use DataFrames (which contain data from Cassandra) in
> rdd.foreach. This throws the following exception:
>
> Is CassandraSQLContext accessible within an executor?
>
> 15/09/28 17:22:40 ERROR JobScheduler: Error running job streaming job
> 144344116 ms.0
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage
> 6.0 (TID 83, blrrndnbudn05.rnd.amadeus.net):
> java.lang.NullPointerException
> at org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:134)
> at
> org.apache.spark.sql.cassandra.CassandraSQLContext.cassandraSql(CassandraSQLContext.scala:74)
> at
> org.apache.spark.sql.cassandra.CassandraSQLContext.sql(CassandraSQLContext.scala:77)
> at
> com.amadeus.spark.example.SparkDemo$.withDataFrames(SparkDemo.scala:170)
> at
> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
> at
> com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
> at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
> at
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
> at org.apache.spark.scheduler.Task.run(Task.scala:64)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org
> $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
> at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
> at scala.Option.foreach(Option.scala:236)
> at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
>


CassandraSQLContext throwing NullPointer Exception

2015-09-28 Thread Priya Ch
Hi All,

 I am trying to use DataFrames (which contain data from Cassandra) in
rdd.foreach. This throws the following exception:

Is CassandraSQLContext accessible within an executor?

15/09/28 17:22:40 ERROR JobScheduler: Error running job streaming job
144344116 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage
6.0 (TID 83, blrrndnbudn05.rnd.amadeus.net): java.lang.NullPointerException
at org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:134)
at
org.apache.spark.sql.cassandra.CassandraSQLContext.cassandraSql(CassandraSQLContext.scala:74)
at
org.apache.spark.sql.cassandra.CassandraSQLContext.sql(CassandraSQLContext.scala:77)
at
com.amadeus.spark.example.SparkDemo$.withDataFrames(SparkDemo.scala:170)
at
com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
at
com.amadeus.spark.example.SparkDemo$$anonfun$main$1$$anonfun$apply$1.apply(SparkDemo.scala:158)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:798)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
at
org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1503)
at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org
$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
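
The top frame (SQLContext.parseSql) points at the likely cause: SQLContext,
and hence CassandraSQLContext, holds transient state that only exists on the
driver, so the copy a task deserializes on an executor has those fields set
to null, which matches an NPE from any sql() call made inside rdd.foreach.
If the per-key lookup really has to run on the executors, the connector's
joinWithCassandraTable avoids the SQL context entirely. A minimal sketch,
assuming keyspace test and a wordcount table whose partition key is word:

import com.datastax.spark.connector._

reducedStream.foreachRDD { rdd =>
  // Executor-side per-key lookup with no SQLContext: join the keyed RDD
  // against the Cassandra table on its partition key.
  val matches = rdd
    .map { case (word, _) => Tuple1(word) }        // partition-key tuples
    .joinWithCassandraTable("test", "wordcount")   // runs on the executors
  matches.foreach { case (key, row) =>
    // assumes wordcount.count is a bigint column
    println(s"${key._1} -> ${row.getLong("count")}")
  }
}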