Hi guys, I'm having trouble implementing this scenario:
I have a column with a typical entry being : ['apple', 'orange', 'apple', 'pear', 'pear'] I need to use a StringIndexer to transform this to : [0, 2, 0, 1, 1] I'm attempting to do this but because of the nested operation on another RDD I get the NPE. Here's my code so far, thanks: val dfWithSchema = sqlContext.createDataFrame(eventFeaturesRDD).toDF("email", "event_name") // attempting import sqlContext.implicits._ val event_list = dfWithSchema.select("event_name").distinct val event_listDF = event_list.toDF() val eventIndexer = new StringIndexer() .setInputCol("event_name") .setOutputCol("eventIndex") .fit(event_listDF) val eventIndexed = eventIndexer.transform(event_listDF) val converter = new IndexToString() .setInputCol("eventIndex") .setOutputCol("originalCategory") val convertedEvents = converter.transform(eventIndexed) val rddX = dfWithSchema.select("event_name").rdd.map(_.getString(0).split( ",").map(_.trim replaceAll ("[\\[\\]\"]", "")).toList) //val oneRow = Converted(eventIndexer.transform(sqlContext.sparkContext.parallelize(Seq("CCB")).toDF("event_name")).select("eventIndex").first().getDouble(0)) val severalRows = rddX.map(row => { // Split array into n tools println("ROW: " + row(0).toString) println(row(0).getClass) println("PRINT: " + eventIndexer.transform(sqlContext.sparkContext.parallelize(Seq(row(0 ))).toDF("event_name")).select("eventIndex").first().getDouble(0)) (eventIndexer.transform(sqlContext.sparkContext.parallelize(Seq(row)).toDF( "event_name")).select("eventIndex").first().getDouble(0), Seq(row).toString) }) // attempting