Repository: spark Updated Branches: refs/heads/master a433fbcee -> 1c9486c1a
[SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write ## What changes were proposed in this pull request? Before ORC 1.5.3, `orc.dictionary.key.threshold` and `hive.exec.orc.dictionary.key.size.threshold` were applied to all columns. This has been a big hurdle to enable dictionary encoding. From ORC 1.5.3, `orc.column.encoding.direct` is added to enforce direct encoding selectively in a column-wise manner. This PR aims to add that feature by upgrading ORC from 1.5.2 to 1.5.3. The following are the patches in ORC 1.5.3 and this feature is the only one related to Spark directly. ``` ORC-406: ORC: Char(n) and Varchar(n) writers truncate to n bytes & corrupts multi-byte data (gopalv) ORC-403: [C++] Add checks to avoid invalid offsets in InputStream ORC-405: Remove calcite as a dependency from the benchmarks. ORC-375: Fix libhdfs on gcc7 by adding #include <functional> two places. ORC-383: Parallel builds fails with ConcurrentModificationException ORC-382: Apache rat exclusions + add rat check to travis ORC-401: Fix incorrect quoting in specification. ORC-385: Change RecordReader to extend Closeable. ORC-384: [C++] fix memory leak when loading non-ORC files ORC-391: [c++] parseType does not accept underscore in the field name ORC-397: Allow selective disabling of dictionary encoding. Original patch was by Mithun Radhakrishnan. ORC-389: Add ability to not decode Acid metadata columns ``` ## How was this patch tested? Pass the Jenkins with newly added test cases. Closes #22622 from dongjoon-hyun/SPARK-25635. 
Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: gatorsmile <gatorsm...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1c9486c1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1c9486c1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1c9486c1 Branch: refs/heads/master Commit: 1c9486c1acceb73e5cc6f1fa684b6d992e187a9a Parents: a433fbc Author: Dongjoon Hyun <dongj...@apache.org> Authored: Fri Oct 5 16:42:06 2018 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Fri Oct 5 16:42:06 2018 -0700 ---------------------------------------------------------------------- dev/deps/spark-deps-hadoop-2.6 | 6 +- dev/deps/spark-deps-hadoop-2.7 | 6 +- dev/deps/spark-deps-hadoop-3.1 | 6 +- pom.xml | 2 +- .../datasources/orc/OrcSourceSuite.scala | 75 ++++++++++++++++++++ .../spark/sql/hive/orc/HiveOrcSourceSuite.scala | 8 +++ 6 files changed, 93 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-2.6 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 2dcab85..22e86ef 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -153,9 +153,9 @@ objenesis-2.5.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-2.7 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index d1d695c..19dd786 100644 --- 
a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -154,9 +154,9 @@ objenesis-2.5.1.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-3.1 ---------------------------------------------------------------------- diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index e9691eb..ea0f487 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -172,9 +172,9 @@ okhttp-2.7.5.jar okhttp-3.8.1.jar okio-1.13.0.jar opencsv-2.3.jar -orc-core-1.5.2-nohive.jar -orc-mapreduce-1.5.2-nohive.jar -orc-shims-1.5.2.jar +orc-core-1.5.3-nohive.jar +orc-mapreduce-1.5.3-nohive.jar +orc-shims-1.5.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index c5eea6a..79af5d6 100644 --- a/pom.xml +++ b/pom.xml @@ -131,7 +131,7 @@ <hive.version.short>1.2.1</hive.version.short> <derby.version>10.12.1.1</derby.version> <parquet.version>1.10.0</parquet.version> - <orc.version>1.5.2</orc.version> + <orc.version>1.5.3</orc.version> <orc.classifier>nohive</orc.classifier> <hive.parquet.version>1.6.0</hive.parquet.version> <jetty.version>9.3.24.v20180605</jetty.version> http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index b6bb1d7..dc81c05 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.orc.OrcConf.COMPRESS import org.apache.orc.OrcFile +import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2} import org.apache.orc.OrcProto.Stream.Kind import org.apache.orc.impl.RecordReaderImpl import org.scalatest.BeforeAndAfterAll @@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } + protected def testSelectiveDictionaryEncoding(isSelective: Boolean) { + val tableName = "orcTable" + + withTempDir { dir => + withTable(tableName) { + val sqlStatement = orcImp match { + case "native" => + s""" + |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) + |USING ORC + |OPTIONS ( + | path '${dir.toURI}', + | orc.dictionary.key.threshold '1.0', + | orc.column.encoding.direct 'uniqColumn' + |) + """.stripMargin + case "hive" => + s""" + |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) + |STORED AS ORC + |LOCATION '${dir.toURI}' + |TBLPROPERTIES ( + | orc.dictionary.key.threshold '1.0', + | hive.exec.orc.dictionary.key.size.threshold '1.0', + | orc.column.encoding.direct 'uniqColumn' + |) + """.stripMargin + case impl => + throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl") + } + + sql(sqlStatement) + sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)") + + val partFiles = dir.listFiles() + .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) + assert(partFiles.length === 1) + + val orcFilePath = new Path(partFiles.head.getAbsolutePath) + val readerOptions = 
OrcFile.readerOptions(new Configuration()) + val reader = OrcFile.createReader(orcFilePath, readerOptions) + var recordReader: RecordReaderImpl = null + try { + recordReader = reader.rows.asInstanceOf[RecordReaderImpl] + + // Check the kind + val stripe = recordReader.readStripeFooter(reader.getStripes.get(0)) + + // The encodings are divided into direct or dictionary-based categories and + // further refined as to whether they use RLE v1 or v2. RLE v1 is used by + // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more improvements. + // For more details, see https://orc.apache.org/specification/ + assert(stripe.getColumns(1).getKind === DICTIONARY_V2) + if (isSelective) { + assert(stripe.getColumns(2).getKind === DIRECT_V2) + } else { + assert(stripe.getColumns(2).getKind === DICTIONARY_V2) + } + // Floating point types are stored with DIRECT encoding in IEEE 754 floating + // point bit layout. + assert(stripe.getColumns(3).getKind === DIRECT) + } finally { + if (recordReader != null) { + recordReader.close() + } + } + } + } + } + test("create temporary orc table") { checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10)) @@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext { test("Check BloomFilter creation") { testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101 } + + test("Enforce direct encoding column-wise selectively") { + testSelectiveDictionaryEncoding(isSelective = true) + } } http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index c1ae2f6..7fefaf5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { } } } + + test("Enforce direct encoding column-wise selectively") { + Seq(true, false).foreach { convertMetastore => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { + testSelectiveDictionaryEncoding(isSelective = false) + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org