Code: import java.text.SimpleDateFormat import java.util.Calendar import java.sql.Date import org.apache.spark.storage.StorageLevel
/**
 * Strip surrounding double quotes from the CSV field at `index`.
 * Returns "" when the row has fewer fields than expected or the index is
 * negative (the original threw on negative indices; `lift` covers both bounds).
 */
def extract(array: Array[String], index: Int): String =
  array.lift(index).fold("")(_.replaceAll("\"", ""))

// One parsed row of the guid-session extract. All fields are kept as raw
// strings exactly as they appear in the source file (no type coercion).
case class GuidSess(
  guid: String,
  sessionKey: String,
  sessionStartDate: String,
  siteId: String,
  eventCount: String,
  browser: String,
  browserVersion: String,
  operatingSystem: String,
  experimentChannel: String,
  deviceName: String)

val rowStructText = sc.textFile("/user/zeppelin/guidsess/2015/08/05/part-m-00001.gz")

// NOTE(review): a naive split(",") breaks on quoted fields that contain
// embedded commas; if the data allows that, use a real CSV parser instead.
val guidSessRDD = rowStructText
  .filter(s => s.length != 1) // drop degenerate one-character lines
  .map(_.split(","))
  .map { s =>
    GuidSess(extract(s, 0), extract(s, 1), extract(s, 2), extract(s, 3),
      extract(s, 4), extract(s, 5), extract(s, 6), extract(s, 7),
      extract(s, 8), extract(s, 9))
  }

val guidSessDF = guidSessRDD.toDF()
guidSessDF.registerTempTable("guidsess")

/*
Once the temp table is created, I wrote this query:

  select siteid,
         count(distinct guid) as total_visitor,
         count(sessionKey)    as total_visits
  from guidsess
  group by siteid

Metrics:
  Data Size: 170 MB
  Spark Version: 1.3.1
  YARN: 2.7.x

Timeline: There is 1 job, with 2 stages of 1 task each.

1st stage (mapPartitions): Task 1 started to fail. A second attempt started
for the 1st task of the first stage. The first attempt failed with
"Executor LOST"; when I go to the YARN ResourceManager and look at that
particular host, I see that it is running fine.

Attempt #1: [image: Inline image 2]
Attempt #2: Executor LOST again [image: Inline image 3]
Attempts 3 & 4: [image: Inline image 4]

2nd stage (runJob): SKIPPED [image: Inline image 5]

Any suggestions?

-- Deepak
*/