This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new da7db9a  [SPARK-23749][SQL] Replace built-in Hive API (isSub/toKryo) and remove OrcProto.Type usage
da7db9a is described below
commit da7db9abf61b85860ace714d34ab17810c775e25
Author: Yuming Wang <[email protected]>
AuthorDate: Thu Mar 14 11:41:40 2019 -0700
[SPARK-23749][SQL] Replace built-in Hive API (isSub/toKryo) and remove OrcProto.Type usage
## What changes were proposed in this pull request?
To keep the changes for upgrading the built-in Hive smaller, this PR works around the three simplest API changes first.
## How was this patch tested?
manual tests
Closes #24018 from wangyum/SPARK-23749.
Lead-authored-by: Yuming Wang <[email protected]>
Co-authored-by: Yuming Wang <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../hadoop/hive/ql/io/orc/SparkOrcNewRecordReader.java | 8 +++++---
.../apache/spark/sql/hive/execution/SaveAsHiveFile.scala | 9 ++++++++-
.../org/apache/spark/sql/hive/orc/OrcFileFormat.scala | 16 +++++++++++++++-
3 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/SparkOrcNewRecordReader.java b/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/SparkOrcNewRecordReader.java
index f093637..8e9362a 100644
--- a/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/SparkOrcNewRecordReader.java
+++ b/sql/hive/src/main/java/org/apache/hadoop/hive/ql/io/orc/SparkOrcNewRecordReader.java
@@ -24,7 +24,6 @@ import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 
 import java.io.IOException;
-import java.util.List;
 
 /**
  * This is based on hive-exec-1.2.1
@@ -42,8 +41,11 @@ public class SparkOrcNewRecordReader extends
   public SparkOrcNewRecordReader(Reader file, Configuration conf,
       long offset, long length) throws IOException {
-    List<OrcProto.Type> types = file.getTypes();
-    numColumns = (types.size() == 0) ? 0 : types.get(0).getSubtypesCount();
+    if (file.getTypes().isEmpty()) {
+      numColumns = 0;
+    } else {
+      numColumns = file.getTypes().get(0).getSubtypesCount();
+    }
     value = new OrcStruct(numColumns);
     this.reader = OrcInputFormat.createReaderFromFile(file, conf, offset,
         length);
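
The rewrite above keeps the same column count but drops the only direct reference to `OrcProto.Type`, so this file no longer depends on the exact element type returned by `Reader.getTypes()`. A minimal Scala sketch of the same guard, with a hypothetical `OrcTypeList` trait standing in for the Hive `Reader` API:

    // Minimal sketch (not the Hive API): derive the top-level column count
    // without ever naming the concrete element type of the type list.
    // `OrcTypeList` and `rootSubtypeCount` are hypothetical stand-ins.
    trait OrcTypeList {
      def isEmpty: Boolean
      def rootSubtypeCount: Int  // subtype count of the root struct type
    }

    def numColumns(types: OrcTypeList): Int =
      if (types.isEmpty) 0 else types.rootSubtypeCount
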
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
index 4ddba50..73b3f20 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -227,7 +227,7 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     // SPARK-20594: This is a walk-around fix to resolve a Hive bug. Hive requires that the
     // staging directory needs to avoid being deleted when users set hive.exec.stagingdir
     // under the table directory.
-    if (FileUtils.isSubDir(new Path(stagingPathName), inputPath, fs) &&
+    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
       !stagingPathName.stripPrefix(inputPathName).stripPrefix(File.separator).startsWith(".")) {
       logDebug(s"The staging dir '$stagingPathName' should be a child directory starts " +
         "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
@@ -253,6 +253,13 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
     dir
   }
 
+  // HIVE-14259 removed FileUtils.isSubDir(). Adapted it from Hive 1.2's FileUtils.isSubDir().
+  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
+    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
+    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
+    path1.startsWith(path2)
+  }
+
   private def executionId: String = {
     val rand: Random = new Random
     val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
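
HIVE-14259 removed `FileUtils.isSubDir()`, so the hunk above inlines the Hive 1.2 logic: qualify both paths, append the path separator, and test for a string prefix. The appended separator is what keeps a sibling directory that merely shares a name prefix from being treated as a child. A simplified Scala sketch of that check, with plain strings standing in for `fs.makeQualified(Path)` results:

    // Simplified sketch of the prefix check behind the new isSubDir helper.
    // Plain strings stand in for Hadoop's fs.makeQualified(Path) output.
    def isSubDir(candidate: String, parent: String): Boolean = {
      val child = candidate.stripSuffix("/") + "/"
      val base = parent.stripSuffix("/") + "/"
      child.startsWith(base)
    }

    assert(isSubDir("/warehouse/tbl/.hive-staging_1", "/warehouse/tbl"))
    // Without the trailing separator this sibling would wrongly match.
    assert(!isSubDir("/warehouse/tbl_backup", "/warehouse/tbl"))
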
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index bfb0a95..9ac3e98 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -24,10 +24,14 @@ import java.util.Properties
 import scala.collection.JavaConverters._
 import scala.util.control.NonFatal
 
+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.io.Output
+import org.apache.commons.codec.binary.Base64
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.ql.io.orc._
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
 import org.apache.hadoop.hive.serde2.objectinspector.{SettableStructObjectInspector, StructObjectInspector}
 import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfoUtils}
 import org.apache.hadoop.io.{NullWritable, Writable}
@@ -130,7 +134,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
     if (sparkSession.sessionState.conf.orcFilterPushDown) {
       // Sets pushed predicates
       OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f =>
-        hadoopConf.set(OrcFileFormat.SARG_PUSHDOWN, f.toKryo)
+        hadoopConf.set(OrcFileFormat.SARG_PUSHDOWN, toKryo(f))
         hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
       }
     }
@@ -195,6 +199,16 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
     case _ => false
   }
+
+  // HIVE-11253 moved `toKryo` from `SearchArgument` to `storage-api` module.
+  // This is copied from Hive 1.2's SearchArgumentImpl.toKryo().
+  private def toKryo(sarg: SearchArgument): String = {
+    val kryo = new Kryo()
+    val out = new Output(4 * 1024, 10 * 1024 * 1024)
+    kryo.writeObject(out, sarg)
+    out.close()
+    Base64.encodeBase64String(out.toBytes)
+  }
 }
 
 private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration)
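
HIVE-11253 moved `toKryo()` off the `SearchArgument` interface, so the hunk above reproduces the Hive 1.2 behaviour locally: Kryo-serialize the pushed-down `SearchArgument` and Base64-encode it before storing it in the Hadoop configuration under `SARG_PUSHDOWN`. A hedged Scala sketch of the implied round trip, with a plain `Predicate` class standing in for `SearchArgument` (the decode side is only illustrative; in practice Hive's ORC reader deserializes whatever it finds in the configuration):

    import com.esotericsoftware.kryo.Kryo
    import com.esotericsoftware.kryo.io.{Input, Output}
    import org.apache.commons.codec.binary.Base64

    // `Predicate` is a hypothetical stand-in for Hive's SearchArgument; the
    // Output buffer sizes mirror the ones used in the patch.
    class Predicate(var column: String, var value: Int) {
      def this() = this("", 0)  // Kryo's default instantiation needs a no-arg ctor
    }

    def encode(p: Predicate): String = {
      val kryo = new Kryo()
      val out = new Output(4 * 1024, 10 * 1024 * 1024)
      kryo.writeObject(out, p)
      out.close()
      Base64.encodeBase64String(out.toBytes)
    }

    def decode(s: String): Predicate = {
      val kryo = new Kryo()
      kryo.readObject(new Input(Base64.decodeBase64(s)), classOf[Predicate])
    }

    val restored = decode(encode(new Predicate("id", 10)))
    assert(restored.column == "id" && restored.value == 10)
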
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]