Github user yhuai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/11836#discussion_r57026840
  
    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala ---
    @@ -666,6 +468,241 @@ private[hive] object HiveContext {
         defaultValue = Some(true),
         doc = "When set to true, Hive Thrift server executes SQL queries in an 
asynchronous way.")
     
    +  /**
    +   * The version of the hive client that will be used to communicate with the metastore.  Note that
    +   * this does not necessarily need to be the same version of Hive that is used internally by
    +   * Spark SQL for execution.
    +   */
    +  private def hiveMetastoreVersion(conf: SQLConf): String = {
    +    conf.getConf(HIVE_METASTORE_VERSION)
    +  }
    +
    +  /**
    +   * The location of the jars that should be used to instantiate the HiveMetastoreClient.  This
    +   * property can be one of three options:
    +   *  - a classpath in the standard format for both hive and hadoop.
    +   *  - builtin - attempt to discover the jars that were used to load Spark SQL and use those. This
    +   *              option is only valid when using the execution version of Hive.
    +   *  - maven - download the correct version of hive on demand from maven.
    +   */
    +  private def hiveMetastoreJars(conf: SQLConf): String = {
    +    conf.getConf(HIVE_METASTORE_JARS)
    +  }
    +
    +  /**
    +   * A comma separated list of class prefixes that should be loaded using the classloader that
    +   * is shared between Spark SQL and a specific version of Hive. An example of classes that should
    +   * be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
    +   * to be shared are those that interact with classes that are already shared.  For example,
    +   * custom appenders that are used by log4j.
    +   */
    +  private def hiveMetastoreSharedPrefixes(conf: SQLConf): Seq[String] = {
    +    conf.getConf(HIVE_METASTORE_SHARED_PREFIXES).filterNot(_ == "")
    +  }
    +
    +  /**
    +   * A comma separated list of class prefixes that should explicitly be reloaded for each version
    +   * of Hive that Spark SQL is communicating with.  For example, Hive UDFs that are declared in a
    +   * prefix that typically would be shared (i.e. org.apache.spark.*)
    +   */
    +  private def hiveMetastoreBarrierPrefixes(conf: SQLConf): Seq[String] = {
    +    conf.getConf(HIVE_METASTORE_BARRIER_PREFIXES).filterNot(_ == "")
    +  }
    +
    +  /**
    +   * Configurations needed to create a [[HiveClient]].
    +   */
    +  private[hive] def hiveClientConfigurations(hiveconf: HiveConf): Map[String, String] = {
    +    // Hive 0.14.0 introduces timeout operations in HiveConf, and changes default values of a bunch
    +    // of time `ConfVar`s by adding time suffixes (`s`, `ms`, `d`, etc.).  This breaks backwards-
    +    // compatibility when users try to connect to a Hive metastore of a lower version,
    +    // because these options are expected to be integral values in lower versions of Hive.
    +    //
    +    // Here we enumerate all time `ConfVar`s and convert their values to numeric strings according
    +    // to their output time units.
    +    Seq(
    +      ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_CLIENT_SOCKET_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_CLIENT_SOCKET_LIFETIME -> TimeUnit.SECONDS,
    +      ConfVars.HMSHANDLERINTERVAL -> TimeUnit.MILLISECONDS,
    +      ConfVars.METASTORE_EVENT_DB_LISTENER_TTL -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_EVENT_CLEAN_FREQ -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_EVENT_EXPIRY_DURATION -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_AGGREGATE_STATS_CACHE_TTL -> TimeUnit.SECONDS,
    +      ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_WRITER_WAIT -> TimeUnit.MILLISECONDS,
    +      ConfVars.METASTORE_AGGREGATE_STATS_CACHE_MAX_READER_WAIT -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_LOG_INCREMENTAL_PLAN_PROGRESS_INTERVAL -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_STATS_JDBC_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_STATS_RETRIES_WAIT -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_LOCK_SLEEP_BETWEEN_RETRIES -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_ZOOKEEPER_CONNECTION_BASESLEEPTIME -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_TXN_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_COMPACTOR_WORKER_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_COMPACTOR_CHECK_INTERVAL -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_COMPACTOR_CLEANER_RUN_INTERVAL -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_HTTP_MAX_IDLE_TIME -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_HTTP_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_HTTP_COOKIE_MAX_AGE -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_LOGIN_BEBACKOFF_SLOT_LENGTH -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_LOGIN_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_THRIFT_WORKER_KEEPALIVE_TIME -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_ASYNC_EXEC_SHUTDOWN_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_ASYNC_EXEC_KEEPALIVE_TIME -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_SERVER2_LONG_POLLING_TIMEOUT -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_SESSION_CHECK_INTERVAL -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_IDLE_SESSION_TIMEOUT -> TimeUnit.MILLISECONDS,
    +      ConfVars.HIVE_SERVER2_IDLE_OPERATION_TIMEOUT -> TimeUnit.MILLISECONDS,
    +      ConfVars.SERVER_READ_SOCKET_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.HIVE_LOCALIZE_RESOURCE_WAIT_INTERVAL -> TimeUnit.MILLISECONDS,
    +      ConfVars.SPARK_CLIENT_FUTURE_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.SPARK_JOB_MONITOR_TIMEOUT -> TimeUnit.SECONDS,
    +      ConfVars.SPARK_RPC_CLIENT_CONNECT_TIMEOUT -> TimeUnit.MILLISECONDS,
    +      ConfVars.SPARK_RPC_CLIENT_HANDSHAKE_TIMEOUT -> TimeUnit.MILLISECONDS
    +    ).map { case (confVar, unit) =>
    +      confVar.varname -> hiveconf.getTimeVar(confVar, unit).toString
    +    }.toMap
    +  }
    +
    +  /**
    +   * Create a [[HiveClient]] used for execution.
    +   *
    +   * Currently this must always be Hive 13 as this is the version of Hive that is packaged
    +   * with Spark SQL. This copy of the client is used for execution-related tasks like
    +   * registering temporary functions or ensuring that the ThreadLocal SessionState is
    +   * correctly populated.  This copy of Hive is *not* used for storing persistent metadata,
    +   * and only points to a dummy metastore in a temporary directory.
    +   */
    +  protected[hive] def newClientForExecution(
    +      conf: SparkConf,
    +      hadoopConf: Configuration): HiveClientImpl = {
    +    logInfo(s"Initializing execution hive, version $hiveExecutionVersion")
    +    val loader = new IsolatedClientLoader(
    +      version = IsolatedClientLoader.hiveVersion(hiveExecutionVersion),
    +      sparkConf = conf,
    +      execJars = Seq(),
    +      hadoopConf = hadoopConf,
    +      config = newTemporaryConfiguration(useInMemoryDerby = true),
    +      isolationOn = false,
    +      baseClassLoader = Utils.getContextOrSparkClassLoader)
    +    loader.createClient().asInstanceOf[HiveClientImpl]
    +  }
    +
    +  /**
    +   * Create a [[HiveClient]] used to retrieve metadata from the Hive MetaStore.
    +   *
    +   * The version of the Hive client that is used here must match the metastore that is configured
    +   * in the hive-site.xml file.
    +   */
    +  private def newClientForMetadata(conf: SparkConf, hadoopConf: Configuration): HiveClient = {
    +    val hiveConf = new HiveConf(hadoopConf, classOf[HiveConf])
    +    val configurations = hiveClientConfigurations(hiveConf)
    +    newClientForMetadata(conf, hiveConf, hadoopConf, configurations)
    +  }
    +
    +  protected[hive] def newClientForMetadata(
    +      conf: SparkConf,
    +      hiveConf: HiveConf,
    +      hadoopConf: Configuration,
    +      configurations: Map[String, String]): HiveClient = {
    +    val sqlConf = new SQLConf
    +    val hiveMetastoreVersion = HiveContext.hiveMetastoreVersion(sqlConf)
    +    val hiveMetastoreJars = HiveContext.hiveMetastoreJars(sqlConf)
    +    val hiveMetastoreSharedPrefixes = HiveContext.hiveMetastoreSharedPrefixes(sqlConf)
    +    val hiveMetastoreBarrierPrefixes = HiveContext.hiveMetastoreBarrierPrefixes(sqlConf)
    --- End diff --
    
    When we create the `sqlConf` in this method, that conf object only holds
    default values and does not have the settings of these four keys. We need
    to populate the `sqlConf` by copying SQL-related keys from the Spark conf
    into it, like what we do at
    https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala#L206-L228.
    Then we will get the correct values of these four keys.
    
    This will help us fix `HiveUDFSuite`.
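    
    For reference, a minimal sketch of that population step, assuming `conf`
    is the `SparkConf` parameter of this method and that
    `SQLConf.setConfString` is the right setter (the linked `SQLContext`
    code is the authoritative version):
    
        // Copy SQL-related settings from the SparkConf into the freshly
        // created SQLConf, so that the four metastore keys resolve to
        // their configured values instead of the SQLConf defaults.
        val sqlConf = new SQLConf
        conf.getAll.foreach { case (key, value) =>
          if (key.startsWith("spark.sql.")) sqlConf.setConfString(key, value)
        }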

