spark git commit: [SPARK-4413][SQL] Parquet support through datasource API
Repository: spark Updated Branches: refs/heads/master 84d79ee9e -> 02ec058ef [SPARK-4413][SQL] Parquet support through datasource API Goals: - Support for accessing parquet using SQL but not requiring Hive (thus allowing support of parquet tables with decimal columns) - Support for folder based partitioning with automatic discovery of available partitions - Caching of file metadata See scaladoc of `ParquetRelation2` for more details. Author: Michael Armbrust <mich...@databricks.com> Closes #3269 from marmbrus/newParquet and squashes the following commits: 1dd75f1 [Michael Armbrust] Pass all paths for FileInputFormat at once. 645768b [Michael Armbrust] Review comments. abd8e2f [Michael Armbrust] Alternative implementation of parquet based on the datasources API. 938019e [Michael Armbrust] Add an experimental interface to data sources that exposes catalyst expressions. e9d2641 [Michael Armbrust] logging / formatting improvements. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02ec058e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02ec058e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02ec058e Branch: refs/heads/master Commit: 02ec058efe24348cdd3691b55942e6f0ef138732 Parents: 84d79ee Author: Michael Armbrust <mich...@databricks.com> Authored: Thu Nov 20 18:31:02 2014 -0800 Committer: Michael Armbrust <mich...@databricks.com> Committed: Thu Nov 20 18:31:02 2014 -0800 -- .../sql/parquet/ParquetTableOperations.scala | 4 +- .../apache/spark/sql/parquet/newParquet.scala | 290 +++ .../spark/sql/sources/DataSourceStrategy.scala | 43 ++- .../apache/spark/sql/sources/interfaces.scala | 22 +- .../sql/parquet/ParquetMetastoreSuite.scala | 207 - .../spark/sql/parquet/parquetSuites.scala | 253 6 files changed, 599 insertions(+), 220 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02ec058e/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala -- diff 
--git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 5d0643a..0e36852 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -361,7 +361,7 @@ private[parquet] class FilteringParquetRowInputFormat private var footers: JList[Footer] = _ - private var fileStatuses= Map.empty[Path, FileStatus] + private var fileStatuses = Map.empty[Path, FileStatus] override def createRecordReader( inputSplit: InputSplit, @@ -405,7 +405,9 @@ private[parquet] class FilteringParquetRowInputFormat } val newFooters = new mutable.HashMap[FileStatus, Footer] if (toFetch.size > 0) { + val startFetch = System.currentTimeMillis val fetched = getFooters(conf, toFetch) + logInfo(s"Fetched $toFetch footers in ${System.currentTimeMillis - startFetch} ms") for ((status, i) <- toFetch.zipWithIndex) { newFooters(status) = fetched.get(i) } http://git-wip-us.apache.org/repos/asf/spark/blob/02ec058e/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala new file mode 100644 index 0000000..bea12e6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.parquet + +import java.util.{List => JList} + +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hadoop.conf.{Configurable,
spark git commit: [SPARK-4413][SQL] Parquet support through datasource API
Repository: spark Updated Branches: refs/heads/branch-1.2 0f6a2eeaf -> 64b30be7e [SPARK-4413][SQL] Parquet support through datasource API Goals: - Support for accessing parquet using SQL but not requiring Hive (thus allowing support of parquet tables with decimal columns) - Support for folder based partitioning with automatic discovery of available partitions - Caching of file metadata See scaladoc of `ParquetRelation2` for more details. Author: Michael Armbrust <mich...@databricks.com> Closes #3269 from marmbrus/newParquet and squashes the following commits: 1dd75f1 [Michael Armbrust] Pass all paths for FileInputFormat at once. 645768b [Michael Armbrust] Review comments. abd8e2f [Michael Armbrust] Alternative implementation of parquet based on the datasources API. 938019e [Michael Armbrust] Add an experimental interface to data sources that exposes catalyst expressions. e9d2641 [Michael Armbrust] logging / formatting improvements. (cherry picked from commit 02ec058efe24348cdd3691b55942e6f0ef138732) Signed-off-by: Michael Armbrust <mich...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/64b30be7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/64b30be7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/64b30be7 Branch: refs/heads/branch-1.2 Commit: 64b30be7e4cb86059bbfeb3e2f8f47f41d015862 Parents: 0f6a2ee Author: Michael Armbrust <mich...@databricks.com> Authored: Thu Nov 20 18:31:02 2014 -0800 Committer: Michael Armbrust <mich...@databricks.com> Committed: Thu Nov 20 18:31:31 2014 -0800 -- .../sql/parquet/ParquetTableOperations.scala | 4 +- .../apache/spark/sql/parquet/newParquet.scala | 290 +++ .../spark/sql/sources/DataSourceStrategy.scala | 43 ++- .../apache/spark/sql/sources/interfaces.scala | 22 +- .../sql/parquet/ParquetMetastoreSuite.scala | 207 - .../spark/sql/parquet/parquetSuites.scala | 253 6 files changed, 599 insertions(+), 220 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/64b30be7/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 5d0643a..0e36852 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -361,7 +361,7 @@ private[parquet] class FilteringParquetRowInputFormat private var footers: JList[Footer] = _ - private var fileStatuses= Map.empty[Path, FileStatus] + private var fileStatuses = Map.empty[Path, FileStatus] override def createRecordReader( inputSplit: InputSplit, @@ -405,7 +405,9 @@ private[parquet] class FilteringParquetRowInputFormat } val newFooters = new mutable.HashMap[FileStatus, Footer] if (toFetch.size > 0) { + val startFetch = System.currentTimeMillis val fetched = getFooters(conf, toFetch) + logInfo(s"Fetched $toFetch footers in ${System.currentTimeMillis - startFetch} ms") for ((status, i) <- toFetch.zipWithIndex) { newFooters(status) = fetched.get(i) } http://git-wip-us.apache.org/repos/asf/spark/blob/64b30be7/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala new file mode 100644 index 0000000..bea12e6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.parquet + +import