For some reason, writing data from the Spark shell to CSV with the spark-csv package takes almost an hour to dump to disk. Am I going crazy, or did I do this wrong? I tried writing to Parquet first and it's as fast as normal.

On my MacBook Pro (16 GB, 2.2 GHz Intel Core i7, 1 TB) the machine's CPUs go crazy and it sounds like it's taking off like a plane ... lol

Here is the code if anyone wants to experiment:

// ./bin/spark-shell --packages com.databricks:spark-csv_2.11:1.4.0
//
// version 2.0.0-SNAPSHOT
// Using Scala version 2.11.7 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_80)
// http://stat-computing.org/dataexpo/2009/the-data.html


def time[R](block: => R): R = {
    val t0 = System.nanoTime()
    val result = block    // call-by-name
    val t1 = System.nanoTime()
    println("Elapsed time: " + (t1 - t0) + "ns")
    result
}
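
// Just to illustrate the helper (the actual timed write is further down;
// this snippet is only a throwaway example):
// time { (1 to 1000000).sum }   // prints "Elapsed time: ...ns" and returns the sum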


val df = sqlContext.read.format("com.databricks.spark.csv")
  .option("header", "true")
  .load("/Users/employee/Downloads/2008.csv")

val df_1 = df.withColumnRenamed("Year", "oldYear")

val df_2 = df_1.withColumn("Year", df_1.col("oldYear").cast("int")).drop("oldYear")

def convertColumn(df: org.apache.spark.sql.DataFrame, name: String, newType: String) = {
  val df_1 = df.withColumnRenamed(name, "swap")
  df_1.withColumn(name, df_1.col("swap").cast(newType)).drop("swap")
}

val df_3 = convertColumn(df_2, "ArrDelay", "int")

val df_4 = convertColumn(df_3, "DepDelay", "int")   // chain from df_3 so the ArrDelay cast isn't lost
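
// Quick sanity check that the casts took (same session, nothing special):
// df_4.printSchema()   // Year, ArrDelay and DepDelay should now show as integer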


// test write to parquet is fast
df_4.select("Year", "Cancelled").write.format("parquet").save("yearAndCancelled.parquet")


val selectedData = df_4.select("Year", "Cancelled")
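
// For scale, a quick count shows how many rows the CSV writer is pushing
// (the 2008 file is on the order of 7 million rows):
// selectedData.count()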



val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))


// scala> val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))
// Elapsed time: 3488272270000ns
// howLong: Unit = ()
// (3488272270000 ns is roughly 58 minutes)

https://gist.github.com/bigsnarfdude/581b780ce85d7aaecbcb
