For some reason, writing data from the Spark shell to CSV using the `spark-csv` package takes almost an hour to dump to disk. Am I going crazy or did I do this wrong? I tried writing to parquet first and it's fast as normal.
On my MacBook Pro (16 GB, 2.2 GHz Intel Core i7, 1 TB), the machine's CPUs go crazy and it sounds like it's taking off like a plane ... lol

Here is the code if anyone wants to experiment:

```scala
// ./bin/spark-shell --packages com.databricks:spark-csv_2.11:1.4.0
//
// version 2.0.0-SNAPSHOT
// Using Scala version 2.11.7 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_80)
//
// Data: http://stat-computing.org/dataexpo/2009/the-data.html

// Simple timing helper: runs the block and prints elapsed wall-clock time in ns.
def time[R](block: => R): R = {
  val t0 = System.nanoTime()
  val result = block // call-by-name
  val t1 = System.nanoTime()
  println("Elapsed time: " + (t1 - t0) + "ns")
  result
}

// Load the 2008 airline on-time CSV with a header row.
val df = sqlContext.read.format("com.databricks.spark.csv")
  .option("header", "true")
  .load("/Users/employee/Downloads/2008.csv")

// Cast the Year column from string to int via rename, cast, drop.
val df_1 = df.withColumnRenamed("Year", "oldYear")
val df_2 = df_1.withColumn("Year", df_1.col("oldYear").cast("int")).drop("oldYear")

// Same rename-cast-drop pattern, generalized.
def convertColumn(df: org.apache.spark.sql.DataFrame, name: String, newType: String) = {
  val df_1 = df.withColumnRenamed(name, "swap")
  df_1.withColumn(name, df_1.col("swap").cast(newType)).drop("swap")
}

val df_3 = convertColumn(df_2, "ArrDelay", "int")
val df_4 = convertColumn(df_2, "DepDelay", "int")

// test: write to parquet is fast
df_4.select("Year", "Cancelled").write.format("parquet").save("yearAndCancelled.parquet")

// Write the same two columns to CSV and time it.
val selectedData = df_4.select("Year", "Cancelled")
val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))

// scala> val howLong = time(selectedData.write.format("com.databricks.spark.csv").option("header", "true").save("output.csv"))
// Elapsed time: 3488272270000ns
// howLong: Unit = ()
```

https://gist.github.com/bigsnarfdude/581b780ce85d7aaecbcb
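For reference, the elapsed time above (3,488,272,270,000 ns) is about 3,488 seconds, i.e. roughly 58 minutes, which is where the "almost an hour" comes from. If anyone wants a more direct comparison, here is a minimal sketch that wraps both writes in the same `time` helper from above; the output paths are just placeholders I made up:

```scala
// Assumes the time helper and df_4 from the snippet above are already defined.
val selectedData = df_4.select("Year", "Cancelled")

// Time the parquet write (placeholder output path).
time(selectedData.write.format("parquet").save("yearAndCancelled_timed.parquet"))

// Time the spark-csv write of the same two columns (placeholder output path).
time(selectedData.write
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .save("output_timed.csv"))
```

Both calls write the same `selectedData`, so any cost of recomputing the lineage should be roughly the same for each, and the difference in elapsed time should mostly reflect the output format.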
