Repository: spark Updated Branches: refs/heads/master 18f209843 -> 4ad5153f5
[SPARK-6037][SQL] Avoiding duplicate Parquet schema merging `FilteringParquetRowInputFormat` manually merges Parquet schemas before computing splits. However, it is duplicate because the schemas are already merged in `ParquetRelation2`. We don't need to re-merge them at `InputFormat`. Author: Liang-Chi Hsieh <[email protected]> Closes #4786 from viirya/dup_parquet_schemas_merge and squashes the following commits: ef78a5a [Liang-Chi Hsieh] Avoiding duplicate Parquet schema merging. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ad5153f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ad5153f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ad5153f Branch: refs/heads/master Commit: 4ad5153f5449319a7e82c9013ccff4494ab58ef1 Parents: 18f2098 Author: Liang-Chi Hsieh <[email protected]> Authored: Fri Feb 27 11:06:47 2015 +0800 Committer: Cheng Lian <[email protected]> Committed: Fri Feb 27 11:06:47 2015 +0800 ---------------------------------------------------------------------- .../sql/parquet/ParquetTableOperations.scala | 23 ++++++-------------- 1 file changed, 7 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/4ad5153f/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 9061d3f..4e4f647 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -434,22 +434,13 @@ private[parquet] class FilteringParquetRowInputFormat return splits } - Option(globalMetaData.getKeyValueMetaData.get(RowReadSupport.SPARK_METADATA_KEY)).foreach { - schemas => - val mergedSchema = schemas - .map(DataType.fromJson(_).asInstanceOf[StructType]) - .reduce(_ merge _) - .json - - val mergedMetadata = globalMetaData - .getKeyValueMetaData - .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(mergedSchema))) - - globalMetaData = new GlobalMetaData( - globalMetaData.getSchema, - mergedMetadata, - globalMetaData.getCreatedBy) - } + val metadata = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) + val mergedMetadata = globalMetaData + .getKeyValueMetaData + .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(metadata))) + + globalMetaData = new GlobalMetaData(globalMetaData.getSchema, + mergedMetadata, globalMetaData.getCreatedBy) val readContext = getReadSupport(configuration).init( new InitContext(configuration, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
