Github user yhuai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12271#discussion_r59492629
  
    --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveSqlParser.scala ---
    @@ -121,84 +123,115 @@ class HiveSqlAstBuilder extends SparkSqlAstBuilder {
       }
     
       /**
    -   * Create a [[CatalogStorageFormat]]. This is part of the [[CreateTableAsSelect]] command.
    +   * Create a [[CatalogStorageFormat]] for creating tables.
        */
       override def visitCreateFileFormat(
           ctx: CreateFileFormatContext): CatalogStorageFormat = withOrigin(ctx) {
    -    if (ctx.storageHandler == null) {
    -      typedVisit[CatalogStorageFormat](ctx.fileFormat)
    -    } else {
    -      visitStorageHandler(ctx.storageHandler)
    +    (ctx.fileFormat, ctx.storageHandler) match {
    +      // Expected format: INPUTFORMAT input_format OUTPUTFORMAT output_format
    +      case (c: TableFileFormatContext, null) =>
    +        visitTableFileFormat(c)
    +      // Expected format: SEQUENCEFILE | TEXTFILE | RCFILE | ORC | PARQUET | AVRO
    +      case (c: GenericFileFormatContext, null) =>
    +        visitGenericFileFormat(c)
    +      case (null, storageHandler) =>
    +        throw new ParseException("Operation not allowed: ... STORED BY 
storage_handler ...", ctx)
    +      case _ =>
    +        throw new ParseException("expected either STORED AS or STORED BY, 
not both", ctx)
         }
       }
     
       /**
    -   * Create a [[CreateTableAsSelect]] command.
    +   * Create a table, returning either a [[CreateTable]] or a [[CreateTableAsSelect]].
    +   *
    +   * This is not used to create datasource tables, which are handled through
    +   * "CREATE TABLE ... USING ...".
    +   *
    +   * Note: several features are currently not supported - temporary tables, bucketing,
    +   * skewed columns and storage handlers (STORED BY).
    +   *
    +   * Expected format:
    +   * {{{
    +   *   CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
    +   *   [(col1 data_type [COMMENT col_comment], ...)]
    +   *   [COMMENT table_comment]
    +   *   [PARTITIONED BY (col3 data_type [COMMENT col_comment], ...)]
    +   *   [CLUSTERED BY (col1, ...) [SORTED BY (col1 [ASC|DESC], ...)] INTO num_buckets BUCKETS]
    +   *   [SKEWED BY (col1, col2, ...) ON ((col_value, col_value, ...), ...) [STORED AS DIRECTORIES]]
    +   *   [ROW FORMAT row_format]
    +   *   [STORED AS file_format | STORED BY storage_handler_class [WITH SERDEPROPERTIES (...)]]
    +   *   [LOCATION path]
    +   *   [TBLPROPERTIES (property_name=property_value, ...)]
    +   *   [AS select_statement];
    +   * }}}
        */
    -  override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = {
    -    if (ctx.query == null) {
    -      HiveNativeCommand(command(ctx))
    +  override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) {
    +    val (name, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader)
    +    // TODO: implement temporary tables
    +    if (temp) {
    +      throw new ParseException(
    +        "CREATE TEMPORARY TABLE is not supported yet. " +
    +        "Please use registerTempTable as an alternative.", ctx)
    +    }
    +    if (ctx.skewSpec != null) {
    +      throw new ParseException("Operation not allowed: CREATE TABLE ... 
SKEWED BY ...", ctx)
    +    }
    +    if (ctx.bucketSpec != null) {
    +      throw new ParseException("Operation not allowed: CREATE TABLE ... 
CLUSTERED BY ...", ctx)
    +    }
    +    val tableType = if (external) {
    +      CatalogTableType.EXTERNAL_TABLE
         } else {
    -      // Get the table header.
    -      val (table, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader)
    -      val tableType = if (external) {
    -        CatalogTableType.EXTERNAL_TABLE
    -      } else {
    -        CatalogTableType.MANAGED_TABLE
    -      }
    -
    -      // Unsupported clauses.
    -      if (temp) {
    -        throw new ParseException(s"Unsupported operation: TEMPORARY 
clause.", ctx)
    -      }
    -      if (ctx.bucketSpec != null) {
    -        // TODO add this - we need cluster columns in the CatalogTable for this to work.
    -        throw new ParseException("Unsupported operation: " +
    -          "CLUSTERED BY ... [ORDERED BY ...] INTO ... BUCKETS clause.", 
ctx)
    -      }
    -      if (ctx.skewSpec != null) {
    -        throw new ParseException("Operation not allowed: " +
    -          "SKEWED BY ... ON ... [STORED AS DIRECTORIES] clause.", ctx)
    -      }
    -
    -      // Create the schema.
    -      val schema = Option(ctx.columns).toSeq.flatMap(visitCatalogColumns(_, _.toLowerCase))
    -
    -      // Get the column by which the table is partitioned.
    -      val partitionCols = Option(ctx.partitionColumns).toSeq.flatMap(visitCatalogColumns(_))
    -
    -      // Create the storage.
    -      def format(fmt: ParserRuleContext): CatalogStorageFormat = {
    -        Option(fmt).map(typedVisit[CatalogStorageFormat]).getOrElse(EmptyStorageFormat)
    -      }
    -      // Default storage.
    +      CatalogTableType.MANAGED_TABLE
    +    }
    +    val comment = Option(ctx.STRING).map(string)
    +    val partitionCols = Option(ctx.partitionColumns).toSeq.flatMap(visitCatalogColumns)
    +    val cols = Option(ctx.columns).toSeq.flatMap(visitCatalogColumns)
    +    val properties = Option(ctx.tablePropertyList).map(visitTablePropertyList).getOrElse(Map.empty)
    +    val selectQuery = Option(ctx.query).map(plan)
    +
    +    // Note: Hive requires partition columns to be distinct from the schema, so we need
    +    // to include the partition columns here explicitly
    +    val schema = cols ++ partitionCols
    +
    +    // Storage format
    +    val defaultStorage: CatalogStorageFormat = {
           val defaultStorageType = hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTFILEFORMAT)
    -      val hiveSerDe = HiveSerDe.sourceToSerDe(defaultStorageType, hiveConf).getOrElse {
    -        HiveSerDe(
    -          inputFormat = Option("org.apache.hadoop.mapred.TextInputFormat"),
    -          outputFormat = Option("org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat"))
    -      }
    -      // Defined storage.
    -      val fileStorage = format(ctx.createFileFormat)
    -      val rowStorage = format(ctx.rowFormat)
    -      val storage = CatalogStorageFormat(
    -        Option(ctx.locationSpec).map(visitLocationSpec),
    -        fileStorage.inputFormat.orElse(hiveSerDe.inputFormat),
    -        fileStorage.outputFormat.orElse(hiveSerDe.outputFormat),
    -        rowStorage.serde.orElse(hiveSerDe.serde).orElse(fileStorage.serde),
    -        rowStorage.serdeProperties ++ fileStorage.serdeProperties
    -      )
    +      val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType, hiveConf)
    +      CatalogStorageFormat(
    +        locationUri = None,
    +        inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
    +          .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")),
    +        outputFormat = defaultHiveSerde.flatMap(_.outputFormat)
    +          .orElse(Some("org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat")),
    +        serde = defaultHiveSerde.flatMap(_.serde)
    +          .orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")),
    --- End diff --
    
    I guess we should fill this in as late as possible? Probably in
    `toHiveTable`? In `HiveClientImpl.toHiveTable`, we first create an
    `org.apache.hadoop.hive.ql.metadata.Table`, which uses
    `MetadataTypedColumnsetSerDe` as its default SerDe (so
    `InsertIntoHiveTableSuite` was originally broken without this change).
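
    For illustration, a minimal sketch of what deferring the default could
    look like (the shape follows `HiveClientImpl.toHiveTable`, but the
    exact field and method names below are assumptions, not the actual
    patch):

    ```scala
    import org.apache.hadoop.hive.ql.metadata.{Table => HiveTable}

    // Sketch only: apply the SerDe default when converting a CatalogTable
    // to Hive's Table, instead of baking it into CatalogStorageFormat at
    // parse time. Field/method names approximate the real APIs.
    private def toHiveTable(table: CatalogTable): HiveTable = {
      val hiveTable = new HiveTable(table.database, table.identifier.table)
      // Hive's ql.metadata.Table defaults to MetadataTypedColumnsetSerDe,
      // so fall back to LazySimpleSerDe explicitly when no SerDe is set.
      hiveTable.setSerializationLib(
        table.storage.serde.getOrElse(
          "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
      hiveTable
    }
    ```

    That way the parser could leave `serde = None` and the default would
    only matter at the Hive boundary.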

