Repository: spark Updated Branches: refs/heads/branch-1.2 cde8a310a -> 7a245412f
[SPARK-4386] Improve performance when writing Parquet files Convert type of RowWriteSupport.attributes to Array. Analysis of performance for writing very wide tables shows that time is spent predominantly in apply method on attributes var. Type of attributes previously was LinearSeqOptimized and apply is O(N) which made write O(N squared). Measurements on 575 column table showed this change made a 6x improvement in write times. Author: Michael Davies <michael.belldav...@gmail.com> Closes #3843 from MickDavies/SPARK-4386 and squashes the following commits: 892519d [Michael Davies] [SPARK-4386] Improve performance when writing Parquet files (cherry picked from commit 7425bec320227bf8818dc2844c12d5373d166364) Signed-off-by: Michael Armbrust <mich...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a245412 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a245412 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a245412 Branch: refs/heads/branch-1.2 Commit: 7a245412f7b1337c766981f43bcbb64890439002 Parents: cde8a31 Author: Michael Davies <michael.belldav...@gmail.com> Authored: Tue Dec 30 13:40:51 2014 -0800 Committer: Michael Armbrust <mich...@databricks.com> Committed: Tue Dec 30 13:41:08 2014 -0800 ---------------------------------------------------------------------- .../scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/7a245412/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala index ef3687e..9049eb5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala @@ -130,7 +130,7 @@ private[parquet] object RowReadSupport { private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging { private[parquet] var writer: RecordConsumer = null - private[parquet] var attributes: Seq[Attribute] = null + private[parquet] var attributes: Array[Attribute] = null override def init(configuration: Configuration): WriteSupport.WriteContext = { val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) @@ -138,7 +138,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging { metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr) if (attributes == null) { - attributes = ParquetTypesConverter.convertFromString(origAttributesStr) + attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray } log.debug(s"write support initialized for requested schema $attributes") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org