Repository: spark
Updated Branches:
  refs/heads/master 6cbde337a -> e679bc3c1


[SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values

## What changes were proposed in this pull request?
When we create the HiveConf for the metastore client, we use a Hadoop Conf as the 
base, which may contain Hive settings from hive-site.xml 
(https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49).
 However, HiveConf's initialize function basically ignores the base Hadoop Conf 
and always uses its default values (i.e. settings with non-null default values) as 
the base 
(https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687).
 So, even if a user puts javax.jdo.option.ConnectionURL in hive-site.xml, it is not 
used and Hive will use its default, which is 
jdbc:derby:;databaseName=metastore_db;create=true.

This issue only shows up when `spark.sql.hive.metastore.jars` is not set to 
`builtin`: when it is `builtin`, hive-site.xml is on the classpath, so HiveConf 
itself loads it and re-applies the user's settings on top of its defaults. A sketch 
of the failure mode and the workaround follows.
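
To illustrate, here is a minimal sketch of the problem and the workaround. This is 
not the patch itself (the real change is in HiveClientImpl below); the object name 
ConnectionURLSketch and the jdbc:derby:memory:myDB URL are made up, and it assumes 
Hive 1.2.1 and Hadoop classes on the classpath with no hive-site.xml present as a 
classpath resource.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.session.SessionState
import scala.collection.JavaConverters._

object ConnectionURLSketch {
  def main(args: Array[String]): Unit = {
    // Pretend SharedState already copied this setting from hive-site.xml.
    val hadoopConf = new Configuration()
    hadoopConf.set("javax.jdo.option.ConnectionURL",
      "jdbc:derby:memory:myDB;create=true")

    // Buggy path: HiveConf's initialize re-applies Hive's non-null default
    // values, clobbering the URL copied from the base conf. Per the
    // description above, this prints Hive's default
    // (jdbc:derby:;databaseName=metastore_db;create=true), not myDB.
    val lossy = new HiveConf(hadoopConf, classOf[SessionState])
    println(lossy.get("javax.jdo.option.ConnectionURL"))

    // Workaround: start from Hive's defaults, then copy every entry of the
    // base conf on top, so hive-site.xml settings win over the defaults.
    val hiveConf = new HiveConf(classOf[SessionState])
    hadoopConf.iterator().asScala.foreach { entry =>
      hiveConf.set(entry.getKey, entry.getValue)
    }
    assert(hiveConf.get("javax.jdo.option.ConnectionURL").contains("myDB"))
  }
}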

## How was this patch tested?
New test in HiveSparkSubmitSuite.

Author: Yin Huai <yh...@databricks.com>

Closes #14497 from yhuai/SPARK-16901.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e679bc3c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e679bc3c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e679bc3c

Branch: refs/heads/master
Commit: e679bc3c1cd418ef0025d2ecbc547c9660cac433
Parents: 6cbde33
Author: Yin Huai <yh...@databricks.com>
Authored: Fri Aug 5 15:52:02 2016 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Fri Aug 5 15:52:02 2016 -0700

----------------------------------------------------------------------
 .../spark/sql/hive/client/HiveClientImpl.scala  | 24 +++++-
 .../spark/sql/hive/HiveSparkSubmitSuite.scala   | 80 ++++++++++++++++++++
 2 files changed, 101 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e679bc3c/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index ef69ac7..3bf4ed5 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -141,14 +141,32 @@ private[hive] class HiveClientImpl(
         // so we should keep `conf` and reuse the existing instance of `CliSessionState`.
         originalState
       } else {
-        val hiveConf = new HiveConf(hadoopConf, classOf[SessionState])
+        val hiveConf = new HiveConf(classOf[SessionState])
+        // 1: we set all confs in the hadoopConf to this hiveConf.
+        // This hadoopConf contains user settings in Hadoop's core-site.xml file
+        // and Hive's hive-site.xml file. Note, we load the hive-site.xml file manually in
+        // SharedState and put its settings in this hadoopConf instead of relying on HiveConf
+        // to load user settings. Otherwise, HiveConf's initialize method will override
+        // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars
+        // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath
+        // has hive-site.xml. So, HiveConf will use that to override its default values.
+        hadoopConf.iterator().asScala.foreach { entry =>
+          val key = entry.getKey
+          val value = entry.getValue
+          if (key.toLowerCase.contains("password")) {
+            logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx")
+          } else {
+            logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value")
+          }
+          hiveConf.set(key, value)
+        }
         // HiveConf is a Hadoop Configuration, which has a field of classLoader and
         // the initial value will be the current thread's context class loader
         // (i.e. initClassLoader at here).
         // We call initialConf.setClassLoader(initClassLoader) at here to make
         // this action explicit.
         hiveConf.setClassLoader(initClassLoader)
-        // First, we set all spark confs to this hiveConf.
+        // 2: we set all spark confs to this hiveConf.
         sparkConf.getAll.foreach { case (k, v) =>
           if (k.toLowerCase.contains("password")) {
             logDebug(s"Applying Spark config to Hive Conf: $k=xxx")
@@ -157,7 +175,7 @@ private[hive] class HiveClientImpl(
           }
           hiveConf.set(k, v)
         }
-        // Second, we set all entries in config to this hiveConf.
+        // 3: we set all entries in config to this hiveConf.
         extraConfig.foreach { case (k, v) =>
           if (k.toLowerCase.contains("password")) {
             logDebug(s"Applying extra config to HiveConf: $k=xxx")

http://git-wip-us.apache.org/repos/asf/spark/blob/e679bc3c/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index 9bca720..dd8fec0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -253,6 +253,47 @@ class HiveSparkSubmitSuite
     runSparkSubmit(args)
   }
 
+  test("SPARK-16901: set javax.jdo.option.ConnectionURL") {
+    // In this test, we set javax.jdo.option.ConnectionURL and set the metastore version to
+    // 0.13. This test will make sure that javax.jdo.option.ConnectionURL will not be
+    // overridden by Hive's default settings when we create a HiveConf object inside
+    // HiveClientImpl. Please see SPARK-16901 for more details.
+
+    val metastoreLocation = Utils.createTempDir()
+    metastoreLocation.delete()
+    val metastoreURL =
+      s"jdbc:derby:memory:;databaseName=${metastoreLocation.getAbsolutePath};create=true"
+    val hiveSiteXmlContent =
+      s"""
+         |<configuration>
+         |  <property>
+         |    <name>javax.jdo.option.ConnectionURL</name>
+         |    <value>$metastoreURL</value>
+         |  </property>
+         |</configuration>
+     """.stripMargin
+
+    // Write a hive-site.xml containing a setting of javax.jdo.option.ConnectionURL.
+    val hiveSiteDir = Utils.createTempDir()
+    val file = new File(hiveSiteDir.getCanonicalPath, "hive-site.xml")
+    val bw = new BufferedWriter(new FileWriter(file))
+    bw.write(hiveSiteXmlContent)
+    bw.close()
+
+    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+    val args = Seq(
+      "--class", SetMetastoreURLTest.getClass.getName.stripSuffix("$"),
+      "--name", "SetMetastoreURLTest",
+      "--master", "local[1]",
+      "--conf", "spark.ui.enabled=false",
+      "--conf", "spark.master.rest.enabled=false",
+      "--conf", s"spark.sql.test.expectedMetastoreURL=$metastoreURL",
+      "--conf", s"spark.driver.extraClassPath=${hiveSiteDir.getCanonicalPath}",
+      "--driver-java-options", "-Dderby.system.durability=test",
+      unusedJar.toString)
+    runSparkSubmit(args)
+  }
+
   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
   // This is copied from org.apache.spark.deploy.SparkSubmitSuite
   private def runSparkSubmit(args: Seq[String]): Unit = {
@@ -313,6 +354,45 @@ class HiveSparkSubmitSuite
   }
 }
 
+object SetMetastoreURLTest extends Logging {
+  def main(args: Array[String]): Unit = {
+    Utils.configTestLog4j("INFO")
+
+    val sparkConf = new SparkConf(loadDefaults = true)
+    val builder = SparkSession.builder()
+      .config(sparkConf)
+      .config("spark.ui.enabled", "false")
+      .config("spark.sql.hive.metastore.version", "0.13.1")
+      // The issue described in SPARK-16901 only appears when
+      // spark.sql.hive.metastore.jars is not set to builtin.
+      .config("spark.sql.hive.metastore.jars", "maven")
+      .enableHiveSupport()
+
+    val spark = builder.getOrCreate()
+    val expectedMetastoreURL =
+      spark.conf.get("spark.sql.test.expectedMetastoreURL")
+    logInfo(s"spark.sql.test.expectedMetastoreURL is $expectedMetastoreURL")
+
+    if (expectedMetastoreURL == null) {
+      throw new Exception(
+        s"spark.sql.test.expectedMetastoreURL should be set.")
+    }
+
+    // HiveSharedState is used when Hive support is enabled.
+    val actualMetastoreURL =
+      spark.sharedState.asInstanceOf[HiveSharedState]
+        .metadataHive
+        .getConf("javax.jdo.option.ConnectionURL", "this_is_a_wrong_URL")
+    logInfo(s"javax.jdo.option.ConnectionURL is $actualMetastoreURL")
+
+    if (actualMetastoreURL != expectedMetastoreURL) {
+      throw new Exception(
+        s"Expected value of javax.jdo.option.ConnectionURL is 
$expectedMetastoreURL. But, " +
+          s"the actual value is $actualMetastoreURL")
+    }
+  }
+}
+
 object SetWarehouseLocationTest extends Logging {
   def main(args: Array[String]): Unit = {
     Utils.configTestLog4j("INFO")

