Repository: spark
Updated Branches:
  refs/heads/branch-1.2 cde8a310a -> 7a245412f


[SPARK-4386] Improve performance when writing Parquet files

Convert type of RowWriteSupport.attributes to Array.

Analysis of performance for writing very wide tables shows that time is spent
predominantly in the apply method on the attributes var. The type of attributes
was previously LinearSeqOptimized, whose apply is O(N), which made the write
O(N squared).

Measurements on a 575-column table showed that this change gave a 6x
improvement in write times.

Author: Michael Davies <michael.belldav...@gmail.com>

Closes #3843 from MickDavies/SPARK-4386 and squashes the following commits:

892519d [Michael Davies] [SPARK-4386] Improve performance when writing Parquet 
files

(cherry picked from commit 7425bec320227bf8818dc2844c12d5373d166364)
Signed-off-by: Michael Armbrust <mich...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a245412
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a245412
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a245412

Branch: refs/heads/branch-1.2
Commit: 7a245412f7b1337c766981f43bcbb64890439002
Parents: cde8a31
Author: Michael Davies <michael.belldav...@gmail.com>
Authored: Tue Dec 30 13:40:51 2014 -0800
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Tue Dec 30 13:41:08 2014 -0800

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7a245412/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
index ef3687e..9049eb5 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala
@@ -130,7 +130,7 @@ private[parquet] object RowReadSupport {
 private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging {
 
   private[parquet] var writer: RecordConsumer = null
-  private[parquet] var attributes: Seq[Attribute] = null
+  private[parquet] var attributes: Array[Attribute] = null
 
   override def init(configuration: Configuration): WriteSupport.WriteContext = 
{
     val origAttributesStr: String = 
configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA)
@@ -138,7 +138,7 @@ private[parquet] class RowWriteSupport extends 
WriteSupport[Row] with Logging {
     metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr)
 
     if (attributes == null) {
-      attributes = ParquetTypesConverter.convertFromString(origAttributesStr)
+      attributes = 
ParquetTypesConverter.convertFromString(origAttributesStr).toArray
     }
 
     log.debug(s"write support initialized for requested schema $attributes")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to