Github user viirya commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12412#discussion_r60708050
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala ---
    @@ -135,3 +138,127 @@ case class AlterTableRename(
       }
     
     }
    +
    +/**
    + * A command that loads data into a Hive table.
    + *
    + * The syntax of this command is:
    + * {{{
    + *  LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
    + *  [PARTITION (partcol1=val1, partcol2=val2 ...)]
    + * }}}
    + */
    +case class LoadData(
    +    table: TableIdentifier,
    +    path: String,
    +    isLocal: Boolean,
    +    isOverwrite: Boolean,
    +    partition: Option[ExternalCatalog.TablePartitionSpec]) extends RunnableCommand {
    +
    +  override def run(sqlContext: SQLContext): Seq[Row] = {
    +    val catalog = sqlContext.sessionState.catalog
    +    if (!catalog.tableExists(table)) {
    +      throw new AnalysisException(
    +        s"Table in LOAD DATA does not exist: '$table'")
    +    }
    +
    +    val targetTable = catalog.getTableMetadataOption(table).getOrElse {
    +      throw new AnalysisException(
    +        s"Table in LOAD DATA cannot be temporary: '$table'")
    +    }
    +
    +    if (DDLUtils.isDatasourceTable(targetTable)) {
    +      throw new AnalysisException(
    +        "LOAD DATA is not supported for datasource tables")
    +    }
    +
    +    if (targetTable.partitionColumnNames.nonEmpty) {
    +      if (partition.isEmpty || targetTable.partitionColumnNames.size != partition.get.size) {
    +        throw new AnalysisException(
    +          "LOAD DATA to partitioned table must specify a specific 
partition of " +
    +          "the table by specifying values for all of the partitioning 
columns.")
    +      }
    +
    +      partition.get.keys.foreach { colName =>
    +        if (!targetTable.partitionColumnNames.contains(colName)) {
    +          throw new AnalysisException(
    +            s"LOAD DATA to partitioned table specifies a non-existing 
partition column: '$colName'")
    +        }
    +      }
    +    } else {
    +      if (partition.nonEmpty) {
    +        throw new AnalysisException(
    +          "LOAD DATA to non-partitioned table cannot specify partition.")
    +      }
    +    }
    +
    +    val loadPath =
    +      if (isLocal) {
    +        val uri = Utils.resolveURI(path)
    +        if (!new File(uri.getPath()).exists()) {
    +          throw new AnalysisException(s"LOAD DATA with non-existing path: 
$path")
    +        }
    +        uri
    +      } else {
    +        val uri = new URI(path)
    +        if (uri.getScheme() != null && uri.getAuthority() != null) {
    +          uri
    +        } else {
    +          // Follow Hive's behavior:
    +          // If no scheme or authority is provided with a non-local inpath,
    +          // we will use the hadoop configuration "fs.default.name".
    +          val defaultFSConf = sqlContext.sparkContext.hadoopConfiguration.get("fs.default.name")
    +          val defaultFS = if (defaultFSConf == null) {
    +            new URI("")
    +          } else {
    +            new URI(defaultFSConf)
    +          }
    +
    +          val scheme = if (uri.getScheme() != null) {
    +            uri.getScheme()
    +          } else {
    +            defaultFS.getScheme()
    +          }
    +          val authority = if (uri.getAuthority() != null) {
    +            uri.getAuthority()
    +          } else {
    +            defaultFS.getAuthority()
    +          }
    +
    +          if (scheme == null) {
    +            throw new AnalysisException(
    +              "LOAD DATA with non-local path must specify URI Scheme.")
    +          }
    +
    +          // Follow Hive's behavior:
    +          // If LOCAL is not specified, and the path is relative,
    +          // then the path is interpreted relative to "/user/<username>"
    +          val uriPath = uri.getPath()
    +          val absolutePath = if (uriPath != null && uriPath.startsWith("/")) {
    +            uriPath
    +          } else {
    +            s"/user/${System.getProperty("user.name")}/$uriPath"
    --- End diff ---
    
    This follows Hive's behavior: the relative path is resolved against the HDFS
    convention of placing user home directories under /user/<user name>.
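    
    For illustration, here is a minimal standalone sketch of the resolution behavior
    described above (not the actual Spark code path; the fs.default.name value and
    user name in the comments are made-up examples, not taken from this PR):
    
        import java.net.URI
    
        // Minimal sketch. Assumes a relative, non-local inpath,
        // e.g. LOAD DATA INPATH 'data/kv1.txt' INTO TABLE t.
        def resolveAgainstUserDir(raw: String, defaultFsConf: String): URI = {
          val uri = new URI(raw)
          val defaultFS = new URI(defaultFsConf)
          // Fall back to the default filesystem's scheme/authority when missing.
          val scheme = Option(uri.getScheme).getOrElse(defaultFS.getScheme)
          val authority = Option(uri.getAuthority).getOrElse(defaultFS.getAuthority)
          // Relative paths resolve under /user/<username>, as Hive does.
          val uriPath = uri.getPath
          val absolutePath =
            if (uriPath != null && uriPath.startsWith("/")) uriPath
            else s"/user/${System.getProperty("user.name")}/$uriPath"
          new URI(scheme, authority, absolutePath, uri.getQuery, uri.getFragment)
        }
    
        // With fs.default.name = "hdfs://namenode:8020" and user "alice" (hypothetical),
        // resolveAgainstUserDir("data/kv1.txt", "hdfs://namenode:8020") resolves to
        // hdfs://namenode:8020/user/alice/data/kv1.txt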

