spark git commit: [SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write

lixiao Fri, 05 Oct 2018 16:43:07 -0700

Repository: spark
Updated Branches:
  refs/heads/master a433fbcee -> 1c9486c1a



[SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write

## What changes were proposed in this pull request?

Before ORC 1.5.3, `orc.dictionary.key.threshold` and 
`hive.exec.orc.dictionary.key.size.threshold` were applied to all columns. This 
has been a big hurdle to enabling dictionary encoding. From ORC 1.5.3, 
`orc.column.encoding.direct` is added to enforce direct encoding selectively in 
a column-wise manner. This PR aims to add that feature by upgrading ORC from 
1.5.2 to 1.5.3.

The following are the patches in ORC 1.5.3; this feature (ORC-397) is the only 
one directly related to Spark.
```
ORC-406: ORC: Char(n) and Varchar(n) writers truncate to n bytes & corrupts 
multi-byte data (gopalv)
ORC-403: [C++] Add checks to avoid invalid offsets in InputStream
ORC-405: Remove calcite as a dependency from the benchmarks.
ORC-375: Fix libhdfs on gcc7 by adding #include <functional> two places.
ORC-383: Parallel builds fails with ConcurrentModificationException
ORC-382: Apache rat exclusions + add rat check to travis
ORC-401: Fix incorrect quoting in specification.
ORC-385: Change RecordReader to extend Closeable.
ORC-384: [C++] fix memory leak when loading non-ORC files
ORC-391: [c++] parseType does not accept underscore in the field name
ORC-397: Allow selective disabling of dictionary encoding. Original patch was 
by Mithun Radhakrishnan.
ORC-389: Add ability to not decode Acid metadata columns
```

## How was this patch tested?

Passes Jenkins with the newly added test cases.

Closes #22622 from dongjoon-hyun/SPARK-25635.

Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: gatorsmile <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1c9486c1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1c9486c1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1c9486c1

Branch: refs/heads/master
Commit: 1c9486c1acceb73e5cc6f1fa684b6d992e187a9a
Parents: a433fbc
Author: Dongjoon Hyun <[email protected]>
Authored: Fri Oct 5 16:42:06 2018 -0700
Committer: gatorsmile <[email protected]>
Committed: Fri Oct 5 16:42:06 2018 -0700

----------------------------------------------------------------------
 dev/deps/spark-deps-hadoop-2.6                  |  6 +-
 dev/deps/spark-deps-hadoop-2.7                  |  6 +-
 dev/deps/spark-deps-hadoop-3.1                  |  6 +-
 pom.xml                                         |  2 +-
 .../datasources/orc/OrcSourceSuite.scala        | 75 ++++++++++++++++++++
 .../spark/sql/hive/orc/HiveOrcSourceSuite.scala |  8 +++
 6 files changed, 93 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 2dcab85..22e86ef 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -153,9 +153,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index d1d695c..19dd786 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -154,9 +154,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/dev/deps/spark-deps-hadoop-3.1
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index e9691eb..ea0f487 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -172,9 +172,9 @@ okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index c5eea6a..79af5d6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,7 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.10.0</parquet.version>
-    <orc.version>1.5.2</orc.version>
+    <orc.version>1.5.3</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.24.v20180605</jetty.version>

http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index b6bb1d7..dc81c05 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.orc.OrcConf.COMPRESS
 import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, 
DIRECT_V2}
 import org.apache.orc.OrcProto.Stream.Kind
 import org.apache.orc.impl.RecordReaderImpl
 import org.scalatest.BeforeAndAfterAll
@@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with 
BeforeAndAfterAll {
     }
   }
 
+  protected def testSelectiveDictionaryEncoding(isSelective: Boolean) {
+    val tableName = "orcTable"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, 
value DOUBLE)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.dictionary.key.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+            """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, 
value DOUBLE)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.dictionary.key.threshold '1.0',
+               |  hive.exec.orc.dictionary.key.size.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+            """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC 
implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 
0.0)")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && 
!f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // Check the kind
+          val stripe = recordReader.readStripeFooter(reader.getStripes.get(0))
+
+          // The encodings are divided into direct or dictionary-based 
categories and
+          // further refined as to whether they use RLE v1 or v2. RLE v1 is 
used by
+          // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more 
improvements.
+          // For more details, see https://orc.apache.org/specification/
+          assert(stripe.getColumns(1).getKind === DICTIONARY_V2)
+          if (isSelective) {
+            assert(stripe.getColumns(2).getKind === DIRECT_V2)
+          } else {
+            assert(stripe.getColumns(2).getKind === DICTIONARY_V2)
+          }
+          // Floating point types are stored with DIRECT encoding in IEEE 754 
floating
+          // point bit layout.
+          assert(stripe.getColumns(3).getKind === DIRECT)
+        } finally {
+          if (recordReader != null) {
+            recordReader.close()
+          }
+        }
+      }
+    }
+  }
+
   test("create temporary orc table") {
     checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
 
@@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext 
{
   test("Check BloomFilter creation") {
     testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    testSelectiveDictionaryEncoding(isSelective = true)
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/1c9486c1/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
index c1ae2f6..7fefaf5 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
@@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with 
TestHiveSingleton {
       }
     }
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    Seq(true, false).foreach { convertMetastore =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") 
{
+        testSelectiveDictionaryEncoding(isSelective = false)
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write

Reply via email to