Repository: spark Updated Branches: refs/heads/master fc2e18963 -> 3b59d326c
[SPARK-24576][BUILD] Upgrade Apache ORC to 1.5.2 ## What changes were proposed in this pull request? This issue aims to upgrade the Apache ORC library from 1.4.4 to 1.5.2 in order to bring the following benefits into Apache Spark. - [ORC-91](https://issues.apache.org/jira/browse/ORC-91) Support for variable length blocks in HDFS (The current space wasted on padding in ORC is known to be 5%.) - [ORC-344](https://issues.apache.org/jira/browse/ORC-344) Support for using Decimal64ColumnVector In addition to that, Apache Hive 3.1 and 3.2 will use ORC 1.5.1 ([HIVE-19669](https://issues.apache.org/jira/browse/HIVE-19669)) and 1.5.2 ([HIVE-19792](https://issues.apache.org/jira/browse/HIVE-19792)) respectively. This will improve the compatibility between Apache Spark and Apache Hive by sharing the common library. ## How was this patch tested? Passed Jenkins with all existing tests. Author: Dongjoon Hyun <dongj...@apache.org> Closes #21582 from dongjoon-hyun/SPARK-24576. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b59d326 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b59d326 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b59d326 Branch: refs/heads/master Commit: 3b59d326c77bec96e5fb856d827139e0389394ba Parents: fc2e189 Author: Dongjoon Hyun <dongj...@apache.org> Authored: Tue Jul 17 23:52:17 2018 -0700 Committer: Xiao Li <gatorsm...@gmail.com> Committed: Tue Jul 17 23:52:17 2018 -0700 ---------------------------------------------------------------------- dev/deps/spark-deps-hadoop-2.6 | 7 ++--- dev/deps/spark-deps-hadoop-2.7 | 7 ++--- dev/deps/spark-deps-hadoop-3.1 | 7 ++--- pom.xml | 2 +- sql/core/pom.xml | 28 ++++++++++++++++++++ .../datasources/orc/OrcFileFormat.scala | 15 ++++++++++- .../datasources/orc/OrcSerializer.scala | 2 +- 7 files changed, 56 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-2.6 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index f50a0aa..ff6d5c3 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar -aircompressor-0.8.jar +aircompressor-0.10.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.7.jar @@ -157,8 +157,9 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.4-nohive.jar -orc-mapreduce-1.4.4-nohive.jar +orc-core-1.5.2-nohive.jar +orc-mapreduce-1.5.2-nohive.jar +orc-shims-1.5.2.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-2.7 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 774f9dc..72a94f8 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar RoaringBitmap-0.5.11.jar ST4-4.0.4.jar activation-1.1.1.jar -aircompressor-0.8.jar +aircompressor-0.10.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.7.jar @@ -158,8 +158,9 @@ objenesis-2.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.4-nohive.jar -orc-mapreduce-1.4.4-nohive.jar +orc-core-1.5.2-nohive.jar +orc-mapreduce-1.5.2-nohive.jar +orc-shims-1.5.2.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-3.1 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index 19c05ad..3409dc4 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ 
b/dev/deps/spark-deps-hadoop-3.1 @@ -4,7 +4,7 @@ RoaringBitmap-0.5.11.jar ST4-4.0.4.jar accessors-smart-1.2.jar activation-1.1.1.jar -aircompressor-0.8.jar +aircompressor-0.10.jar antlr-2.7.7.jar antlr-runtime-3.4.jar antlr4-runtime-4.7.jar @@ -176,8 +176,9 @@ okhttp-2.7.5.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.4.4-nohive.jar -orc-mapreduce-1.4.4-nohive.jar +orc-core-1.5.2-nohive.jar +orc-mapreduce-1.5.2-nohive.jar +orc-shims-1.5.2.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 1892bbe..649221d 100644 --- a/pom.xml +++ b/pom.xml @@ -131,7 +131,7 @@ <hive.version.short>1.2.1</hive.version.short> <derby.version>10.12.1.1</derby.version> <parquet.version>1.10.0</parquet.version> - <orc.version>1.4.4</orc.version> + <orc.version>1.5.2</orc.version> <orc.classifier>nohive</orc.classifier> <hive.parquet.version>1.6.0</hive.parquet.version> <jetty.version>9.3.20.v20170531</jetty.version> http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/pom.xml ---------------------------------------------------------------------- diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 18ae314..8873b00 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -90,11 +90,39 @@ <groupId>org.apache.orc</groupId> <artifactId>orc-core</artifactId> <classifier>${orc.classifier}</classifier> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-hdfs</artifactId> + </exclusion> + <!-- + orc-core:nohive doesn't have this dependency, but we adds this to prevent + sbt from getting confused. 
+ --> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.apache.orc</groupId> <artifactId>orc-mapreduce</artifactId> <classifier>${orc.classifier}</classifier> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-hdfs</artifactId> + </exclusion> + <!-- + orc-core:nohive doesn't have this dependency, but we adds this to prevent + sbt from getting confused. + --> + <exclusion> + <groupId>org.apache.hive</groupId> + <artifactId>hive-storage-api</artifactId> + </exclusion> + </exclusions> </dependency> <dependency> <groupId>org.apache.parquet</groupId> http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 3a8c0ad..df1cebe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -59,6 +59,19 @@ private[sql] object OrcFileFormat { def checkFieldNames(names: Seq[String]): Unit = { names.foreach(checkFieldName) } + + def getQuotedSchemaString(dataType: DataType): String = dataType match { + case _: AtomicType => dataType.catalogString + case StructType(fields) => + fields.map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}") + .mkString("struct<", ",", ">") + case ArrayType(elementType, _) => + s"array<${getQuotedSchemaString(elementType)}>" + case MapType(keyType, valueType, _) => + s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>" + case _ => // UDT and others + 
dataType.catalogString + } } /** @@ -93,7 +106,7 @@ class OrcFileFormat val conf = job.getConfiguration - conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, dataSchema.catalogString) + conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, OrcFileFormat.getQuotedSchemaString(dataSchema)) conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec) http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala index 899af07..90d1268 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala @@ -223,6 +223,6 @@ class OrcSerializer(dataSchema: StructType) { * Return a Orc value object for the given Spark schema. */ private def createOrcValue(dataType: DataType) = { - OrcStruct.createValue(TypeDescription.fromString(dataType.catalogString)) + OrcStruct.createValue(TypeDescription.fromString(OrcFileFormat.getQuotedSchemaString(dataType))) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org