[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75431775 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala --- @@ -49,6 +49,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile } + private def hiveClient: HiveClient = sharedState.asInstanceOf[HiveSharedState].metadataHive --- End diff -- is it possible to not use hiveClient directly? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75431528 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -200,22 +348,73 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu * Alter a table whose name that matches the one specified in `tableDefinition`, * assuming the table exists. * - * Note: As of now, this only supports altering table properties, serde properties, - * and num buckets! + * Note: As of now, this only supports altering table properties and serde properties. */ override def alterTable(tableDefinition: CatalogTable): Unit = withClient { assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) -client.alterTable(tableDefinition) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.alterTable(tableDefinition) +} else { + val oldDef = client.getTable(db, tableDefinition.identifier.table) + // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, + // to retain the spark specific format if it is. + // Also add table meta properties to table properties, to retain the data source table format. + val newDef = tableDefinition.copy( +schema = oldDef.schema, +partitionColumnNames = oldDef.partitionColumnNames, +bucketSpec = oldDef.bucketSpec, +properties = tableMetadataToProperties(tableDefinition) ++ tableDefinition.properties) --- End diff -- if tableMetadataToProperties(tableDefinition) has new schema, `tableMetadataToProperties(tableDefinition)` is actually changing the schema? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75431369 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +163,172 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) +// We can't create index table currently. +assert(tableDefinition.tableType != INDEX) +// All tables except view must have a provider. +assert(tableDefinition.tableType == VIEW || tableDefinition.provider.isDefined) + +// For view or Hive serde tables, they are guaranteed to be Hive compatible and we save them +// to Hive metastore directly. Otherwise, we need to put table metadata to table properties to +// work around some hive metastore problems, e.g. not case-preserving, bad decimal type support. +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + // Before saving data source table metadata into Hive metastore, we should: + // 1. Put table schema, partition column names and bucket specification in table properties. + // 2. Check if this table is hive compatible + //2.1 If it's not hive compatible, set schema, partition columns and bucket spec to empty + // and save table metadata to Hive. + //2.1 If it's hive compatible, set serde information in table metadata and try to save + // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 + + val tableProperties = tableMetadataToProperties(tableDefinition) + + // converts the table metadata to Spark SQL specific format, i.e. set schema, partition column + // names and bucket specification to empty. + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + // converts the table metadata to Hive compatible format, i.e. set the serde information. + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties +.getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { +case _ if skipHiveMetadata => + val message = +s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +// our bucketing is un-compatible with hive(different hash function) +case _ if tableDefinition.bucketSpec.nonEmpty => + val message = +s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + +case (Some(serde), Some(path)) => + val message = +s"Persisting data source table $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + +case (Some(_), None) => + val message = +s"Data source table $qualifiedTableName is not file based. Persisting it into " + + s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." +
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75431206 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") --- End diff -- oh there is a chance that it is not set. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75250883 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala --- @@ -97,16 +92,17 @@ case class CreateDataSourceTableCommand( } } -CreateDataSourceTableUtils.createDataSourceTable( - sparkSession = sparkSession, - tableIdent = tableIdent, +val table = CatalogTable( + identifier = tableIdent, + tableType = if (isExternal) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty.copy(properties = optionsWithPath), schema = dataSource.schema, - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - provider = provider, - options = optionsWithPath, --- End diff -- Is it different from what we do at line https://github.com/apache/spark/pull/14155/files/96d57b665ac65750eb5c6f9757e5827ea9c14ca4#diff-945e51801b84b92da242fcb42f83f5f5R98? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75232851 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,18 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ --- End diff -- Let's make it clear that this method is used when we want to write metadata to metastore. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check
Repository: spark Updated Branches: refs/heads/branch-1.6 5c34029b8 -> 60de30faf [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check We use reflection to convert `TreeNode` to json string, and currently don't support arbitrary object. `UserDefinedGenerator` takes a function object, so we should skip json format test for it, or the tests can be flacky, e.g. `DataFrameSuite.simple explode`, this test always fail with scala 2.10(branch 1.6 builds with scala 2.10 by default), but pass with scala 2.11(master branch builds with scala 2.11 by default). N/A Author: Wenchen FanCloses #14679 from cloud-fan/json. (cherry picked from commit 928ca1c6d12b23d84f9b6205e22d2e756311f072) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60de30fa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60de30fa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60de30fa Branch: refs/heads/branch-1.6 Commit: 60de30faf29b77b9488495fbcd57f46e3d9248ab Parents: 5c34029 Author: Wenchen Fan Authored: Wed Aug 17 09:31:22 2016 -0700 Committer: Yin Huai Committed: Wed Aug 17 09:35:33 2016 -0700 -- sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60de30fa/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index b80dfa1..ebb6bad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -201,7 +201,8 @@ abstract class QueryTest extends PlanTest { case _: CoGroup[_, _, _, _] => return case _: LogicalRelation => return }.transformAllExpressions { - case a: ImperativeAggregate => return + case _: ImperativeAggregate => return + case _: UserDefinedGenerator => return } // bypass hive tests before we fix all corner cases in hive module. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check
Repository: spark Updated Branches: refs/heads/master 0b0c8b95e -> 928ca1c6d [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check ## What changes were proposed in this pull request? We use reflection to convert `TreeNode` to json string, and currently don't support arbitrary object. `UserDefinedGenerator` takes a function object, so we should skip json format test for it, or the tests can be flacky, e.g. `DataFrameSuite.simple explode`, this test always fail with scala 2.10(branch 1.6 builds with scala 2.10 by default), but pass with scala 2.11(master branch builds with scala 2.11 by default). ## How was this patch tested? N/A Author: Wenchen FanCloses #14679 from cloud-fan/json. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/928ca1c6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/928ca1c6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/928ca1c6 Branch: refs/heads/master Commit: 928ca1c6d12b23d84f9b6205e22d2e756311f072 Parents: 0b0c8b9 Author: Wenchen Fan Authored: Wed Aug 17 09:31:22 2016 -0700 Committer: Yin Huai Committed: Wed Aug 17 09:31:22 2016 -0700 -- sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/928ca1c6/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index cff9d22..484e438 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -249,9 +249,10 @@ abstract class QueryTest extends PlanTest { } p }.transformAllExpressions { - case a: ImperativeAggregate => return + case _: ImperativeAggregate => return case _: TypedAggregateExpression => return case Literal(_, _: ObjectType) => return + case _: UserDefinedGenerator => return } // bypass hive tests before we fix all corner cases in hive module. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check
Repository: spark Updated Branches: refs/heads/branch-2.0 22c7660a8 -> 394d59866 [SPARK-17102][SQL] bypass UserDefinedGenerator for json format check ## What changes were proposed in this pull request? We use reflection to convert `TreeNode` to json string, and currently don't support arbitrary object. `UserDefinedGenerator` takes a function object, so we should skip json format test for it, or the tests can be flacky, e.g. `DataFrameSuite.simple explode`, this test always fail with scala 2.10(branch 1.6 builds with scala 2.10 by default), but pass with scala 2.11(master branch builds with scala 2.11 by default). ## How was this patch tested? N/A Author: Wenchen FanCloses #14679 from cloud-fan/json. (cherry picked from commit 928ca1c6d12b23d84f9b6205e22d2e756311f072) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/394d5986 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/394d5986 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/394d5986 Branch: refs/heads/branch-2.0 Commit: 394d5986617e65852422afeb8d755e38795bbe25 Parents: 22c7660 Author: Wenchen Fan Authored: Wed Aug 17 09:31:22 2016 -0700 Committer: Yin Huai Committed: Wed Aug 17 09:31:41 2016 -0700 -- sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/394d5986/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index e8480a7..b2c051d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -242,9 +242,10 @@ abstract class QueryTest extends PlanTest { case p if p.getClass.getSimpleName == "MetastoreRelation" => return case _: MemoryPlan => return }.transformAllExpressions { - case a: ImperativeAggregate => return + case _: ImperativeAggregate => return case _: TypedAggregateExpression => return case Literal(_, _: ObjectType) => return + case _: UserDefinedGenerator => return } // bypass hive tests before we fix all corner cases in hive module. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] spark issue #14679: [SPARK-17102][SQL] bypass UserDefinedGenerator for json ...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14679 I am merging this to master, branch 2.0, and branch 1.6. So, we will not hit this issue. I have created https://issues.apache.org/jira/browse/SPARK-17109 to investigate the root cause. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14679: [SPARK-17102][SQL] bypass UserDefinedGenerator for json ...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14679 Looks like it is caused by `Seq[(DataType, Boolean, String)]` in `UserDefinedGenerator`? I think it is fine to bypass it for now. But, it will be good to address issues that causing toJson crash. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14155: [SPARK-16498][SQL] move hive hack for data source table ...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14155 I have took a close look at `HiveExternalCatalog`. My overall feeling is that the current version still not very clear and people may have a hard time to understand the code. Let me also think about it and see how to improve the code. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75064724 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,18 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = table.properties.keys.filter(_.startsWith(DATASOURCE_PREFIX)) +if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + +s"as table property keys may not start with '$DATASOURCE_PREFIX': " + +datasourceKeys.mkString("[", ", ", "]")) +} + } --- End diff -- Just realized one thing. Is it possible that we somehow create `table` based on a `CatalogTable` generated from `restoreTableMetadata`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75064560 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -200,22 +348,73 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu * Alter a table whose name that matches the one specified in `tableDefinition`, * assuming the table exists. * - * Note: As of now, this only supports altering table properties, serde properties, - * and num buckets! + * Note: As of now, this only supports altering table properties and serde properties. */ override def alterTable(tableDefinition: CatalogTable): Unit = withClient { assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) -client.alterTable(tableDefinition) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.alterTable(tableDefinition) +} else { + val oldDef = client.getTable(db, tableDefinition.identifier.table) + // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, + // to retain the spark specific format if it is. + // Also add table meta properties to table properties, to retain the data source table format. + val newDef = tableDefinition.copy( +schema = oldDef.schema, +partitionColumnNames = oldDef.partitionColumnNames, +bucketSpec = oldDef.bucketSpec, +properties = tableMetadataToProperties(tableDefinition) ++ tableDefinition.properties) + + client.alterTable(newDef) +} } override def getTable(db: String, table: String): CatalogTable = withClient { -client.getTable(db, table) +restoreTableMetadata(client.getTable(db, table)) } override def getTableOption(db: String, table: String): Option[CatalogTable] = withClient { -client.getTableOption(db, table) +client.getTableOption(db, table).map(restoreTableMetadata) + } + + /** + * Restores table metadata from the table properties if it's a datasouce table. This method is + * kind of a opposite version of [[createTable]]. + */ + private def restoreTableMetadata(table: CatalogTable): CatalogTable = { +if (table.tableType == VIEW) { + table +} else { + getProviderFromTableProperties(table).map { provider => --- End diff -- `provider` can be `hive`, right? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75064362 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -200,22 +348,73 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu * Alter a table whose name that matches the one specified in `tableDefinition`, * assuming the table exists. * - * Note: As of now, this only supports altering table properties, serde properties, - * and num buckets! + * Note: As of now, this only supports altering table properties and serde properties. */ override def alterTable(tableDefinition: CatalogTable): Unit = withClient { assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) -client.alterTable(tableDefinition) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.alterTable(tableDefinition) +} else { + val oldDef = client.getTable(db, tableDefinition.identifier.table) + // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, + // to retain the spark specific format if it is. + // Also add table meta properties to table properties, to retain the data source table format. + val newDef = tableDefinition.copy( +schema = oldDef.schema, +partitionColumnNames = oldDef.partitionColumnNames, +bucketSpec = oldDef.bucketSpec, +properties = tableMetadataToProperties(tableDefinition) ++ tableDefinition.properties) --- End diff -- If we only look at this method, it is not clear if the new `tableDefinition` changes other fields like `storage`. Also, we are using the existing `bucketSpec`. But, is it possible that we have a new `bucketSpec` in `tableDefinition`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063920 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties +.getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { +case _ if skipHiveMetadata => + val message = +s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +// our bucketing is un-compatible with hive(different hash function) +case _ if tableDefinition.bucketSpec.nonEmpty => + val message = +s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + +case (Some(serde), Some(path)) => + val message = +s"Persisting data source table $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + +case (Some(_), None) => + val message = +s"Data source table $qualifiedTableName is not file based. Persisting it into " + + s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +case _ => + val provider = tableDefinition.provider.get + val message = +s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + } + + (hiveCompatibleTable, logMessage) match { +case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatible way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { +logInfo(message) +saveTableIntoHive(table, ignoreIfExists) + } catch { +case NonFatal(e) => + val warningMessage = +s"Could not persist ${tableDefinition.identifier.quotedString} in a Hive " + + "compatible way. Persisting it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, e) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + +case (None,
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063736 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties +.getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { +case _ if skipHiveMetadata => + val message = +s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +// our bucketing is un-compatible with hive(different hash function) +case _ if tableDefinition.bucketSpec.nonEmpty => + val message = +s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + +case (Some(serde), Some(path)) => + val message = +s"Persisting data source table $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + +case (Some(_), None) => + val message = +s"Data source table $qualifiedTableName is not file based. Persisting it into " + + s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +case _ => + val provider = tableDefinition.provider.get + val message = +s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + } + + (hiveCompatibleTable, logMessage) match { +case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatible way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { +logInfo(message) +saveTableIntoHive(table, ignoreIfExists) + } catch { +case NonFatal(e) => + val warningMessage = +s"Could not persist ${tableDefinition.identifier.quotedString} in a Hive " + + "compatible way. Persisting it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, e) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + +case (None,
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063676 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties +.getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { +case _ if skipHiveMetadata => + val message = +s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +// our bucketing is un-compatible with hive(different hash function) +case _ if tableDefinition.bucketSpec.nonEmpty => + val message = +s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + +case (Some(serde), Some(path)) => + val message = +s"Persisting data source table $qualifiedTableName with a single input path " + + s"into Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + +case (Some(_), None) => + val message = +s"Data source table $qualifiedTableName is not file based. Persisting it into " + + s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + +case _ => + val provider = tableDefinition.provider.get + val message = +s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + } + + (hiveCompatibleTable, logMessage) match { +case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatible way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { +logInfo(message) +saveTableIntoHive(table, ignoreIfExists) + } catch { +case NonFatal(e) => + val warningMessage = +s"Could not persist ${tableDefinition.identifier.quotedString} in a Hive " + + "compatible way. Persisting it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, e) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + +case (None,
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063449 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") --- End diff -- I think this path will be set by the ddl command (e.g. `CreateDataSourceTableAsSelectCommand`). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063156 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { +tableDefinition.copy( + storage = tableDefinition.storage.copy( +locationUri = Some(new Path(path).toUri.toString), +inputFormat = serde.inputFormat, +outputFormat = serde.outputFormat, +serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") --- End diff -- If the create table command does not specify the location, does this `maybePath` contains the default location? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75063094 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) --- End diff -- Let's explain what will be put into this `tableProperties`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75062850 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { +tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { --- End diff -- comment --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75062846 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val tableProperties = tableMetadataToProperties(tableDefinition) + + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { --- End diff -- comment --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r75062743 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +161,147 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { --- End diff -- Let's add comment to explain what we are doing at here. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14668: [SPARK-16656][SQL][BRANCH-1.6] Try to make Create...
Github user yhuai closed the pull request at: https://github.com/apache/spark/pull/14668 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14668: [SPARK-16656][SQL][BRANCH-1.6] Try to make CreateTableAs...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14668 I have merged this PR to branch 1.6. Closing it. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-16656][SQL][BRANCH-1.6] Try to make CreateTableAsSelectSuite more stable
Repository: spark Updated Branches: refs/heads/branch-1.6 4d64c7fd1 -> 5c34029b8 [SPARK-16656][SQL][BRANCH-1.6] Try to make CreateTableAsSelectSuite more stable ## What changes were proposed in this pull request? This PR backports #14289 to branch 1.6 https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/62593/testReport/junit/org.apache.spark.sql.sources/CreateTableAsSelectSuite/create_a_table__drop_it_and_create_another_one_with_the_same_name/ shows that `create a table, drop it and create another one with the same name` failed. But other runs were good. Seems it is a flaky test. This PR tries to make this test more stable. Author: Yin Huai <yh...@databricks.com> Closes #14668 from yhuai/SPARK-16656-branch1.6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c34029b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c34029b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c34029b Branch: refs/heads/branch-1.6 Commit: 5c34029b856efdf80cd139b73bcdb9197fe43e2f Parents: 4d64c7f Author: Yin Huai <yh...@databricks.com> Authored: Tue Aug 16 13:42:58 2016 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Tue Aug 16 13:42:58 2016 -0700 -- .../sql/sources/CreateTableAsSelectSuite.scala | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c34029b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 6fc9feb..7275da1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -19,20 +19,23 @@ package org.apache.spark.sql.sources import java.io.{File, IOException} -import org.scalatest.BeforeAndAfter +import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.DDLException import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils -class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with BeforeAndAfter { +class CreateTableAsSelectSuite + extends DataSourceTest + with SharedSQLContext + with BeforeAndAfterEach { + protected override lazy val sql = caseInsensitiveContext.sql _ private var path: File = null override def beforeAll(): Unit = { super.beforeAll() -path = Utils.createTempDir() val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) caseInsensitiveContext.read.json(rdd).registerTempTable("jt") } @@ -40,13 +43,21 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSQLContext with override def afterAll(): Unit = { try { caseInsensitiveContext.dropTempTable("jt") + Utils.deleteRecursively(path) } finally { super.afterAll() } } - after { + override def beforeEach(): Unit = { +super.beforeEach() +path = Utils.createTempDir() +path.delete() + } + + override def afterEach(): Unit = { Utils.deleteRecursively(path) +super.afterEach() } test("CREATE TEMPORARY TABLE AS SELECT") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[GitHub] spark issue #14668: [SPARK-16656][SQL][BRANCH-1.6] Try to make CreateTableAs...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14668 I am merging this to branch 1.6. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14668: [SPARK-16656][SQL][BRANCH-1.6] Try to make Create...
GitHub user yhuai opened a pull request: https://github.com/apache/spark/pull/14668 [SPARK-16656][SQL][BRANCH-1.6] Try to make CreateTableAsSelectSuite more stable ## What changes were proposed in this pull request? This PR backports #14289 to branch 1.6 https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/62593/testReport/junit/org.apache.spark.sql.sources/CreateTableAsSelectSuite/create_a_table__drop_it_and_create_another_one_with_the_same_name/ shows that `create a table, drop it and create another one with the same name` failed. But other runs were good. Seems it is a flaky test. This PR tries to make this test more stable. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yhuai/spark SPARK-16656-branch1.6 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/14668.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #14668 commit 59b65b5b41a67b8bc472de6a5052994c20207dc7 Author: Yin Huai <yh...@databricks.com> Date: 2016-07-21T19:10:26Z [SPARK-16656][SQL] Try to make CreateTableAsSelectSuite more stable https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/62593/testReport/junit/org.apache.spark.sql.sources/CreateTableAsSelectSuite/create_a_table__drop_it_and_create_another_one_with_the_same_name/ shows that `create a table, drop it and create another one with the same name` failed. But other runs were good. Seems it is a flaky test. This PR tries to make this test more stable. Author: Yin Huai <yh...@databricks.com> Closes #14289 from yhuai/SPARK-16656. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14506: [SPARK-16916][SQL] serde/storage properties should not h...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14506 Thanks. Merging to master. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14580: [SPARK-16991][SQL] Fix `EliminateOuterJoin` optimizer to...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14580 Can you explain `isnotnull(coalesce(b#227, c#238)) does not filter out NULL!!!`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74852722 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala --- @@ -689,4 +689,38 @@ class HiveDDLSuite )) } } + + test("datasource table property keys are not allowed") { +import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX + +withTable("tbl") { + sql("CREATE TABLE tbl(a INT) STORED AS parquet") + + val e = intercept[AnalysisException] { +sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')") + } + assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo")) --- End diff -- Yea. I think it makes sense to consider `ALTER TABLE tbl SET TBLPROPERTIES` a hive specific command. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14506: [SPARK-16916][SQL] serde/storage properties should not h...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14506 LGTM pending jenkins. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14506: [SPARK-16916][SQL] serde/storage properties should not h...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14506 test this please --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-17065][SQL] Improve the error message when encountering an incompatible DataSourceRegister
Repository: spark Updated Branches: refs/heads/branch-2.0 8f4cacd3a -> 45036327f [SPARK-17065][SQL] Improve the error message when encountering an incompatible DataSourceRegister ## What changes were proposed in this pull request? Add an instruction to ask the user to remove or upgrade the incompatible DataSourceRegister in the error message. ## How was this patch tested? Test command: ``` build/sbt -Dscala-2.10 package SPARK_SCALA_VERSION=2.10 bin/spark-shell --packages ai.h2o:sparkling-water-core_2.10:1.6.5 scala> Seq(1).toDS().write.format("parquet").save("foo") ``` Before: ``` java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.h2o.DefaultSource could not be instantiated at java.util.ServiceLoader.fail(ServiceLoader.java:232) at java.util.ServiceLoader.access$100(ServiceLoader.java:185) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384) at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404) at java.util.ServiceLoader$1.next(ServiceLoader.java:480) ... Caused by: java.lang.NoClassDefFoundError: org/apache/spark/Logging at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(ClassLoader.java:760) at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142) at java.net.URLClassLoader.defineClass(URLClassLoader.java:467) at java.net.URLClassLoader.access$100(URLClassLoader.java:73) at java.net.URLClassLoader$1.run(URLClassLoader.java:368) at java.net.URLClassLoader$1.run(URLClassLoader.java:362) at java.security.AccessController.doPrivileged(Native Method) ... ``` After: ``` java.lang.ClassNotFoundException: Detected an incompatible DataSourceRegister. Please remove the incompatible library from classpath or upgrade it. Error: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.h2o.DefaultSource could not be instantiated at org.apache.spark.sql.execution.datasources.DataSource.lookupDataSource(DataSource.scala:178) at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:79) at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:79) at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:441) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:213) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:196) ... ``` Author: Shixiong ZhuCloses #14651 from zsxwing/SPARK-17065. (cherry picked from commit 268b71d0d792f875fcfaec5314862236754a00d6) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/45036327 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/45036327 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/45036327 Branch: refs/heads/branch-2.0 Commit: 45036327fdbdb0167b3c53245fce9dc2be67ffe9 Parents: 8f4cacd Author: Shixiong Zhu Authored: Mon Aug 15 15:55:32 2016 -0700 Committer: Yin Huai Committed: Mon Aug 15 15:55:50 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 91 +++- 1 file changed, 52 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/45036327/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index f572b93..f5727da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources -import java.util.ServiceLoader +import java.util.{ServiceConfigurationError, ServiceLoader} import scala.collection.JavaConverters._ import scala.language.{existentials, implicitConversions} @@ -123,50 +123,63 @@ case class DataSource( val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) - serviceLoader.asScala.filter(_.shortName().equalsIgnoreCase(provider)).toList match { - // the provider format did not match any given registered aliases - case Nil => -try { - Try(loader.loadClass(provider)).orElse(Try(loader.loadClass(provider2))) match { -
spark git commit: [SPARK-17065][SQL] Improve the error message when encountering an incompatible DataSourceRegister
Repository: spark Updated Branches: refs/heads/master fffb0c0d1 -> 268b71d0d [SPARK-17065][SQL] Improve the error message when encountering an incompatible DataSourceRegister ## What changes were proposed in this pull request? Add an instruction to ask the user to remove or upgrade the incompatible DataSourceRegister in the error message. ## How was this patch tested? Test command: ``` build/sbt -Dscala-2.10 package SPARK_SCALA_VERSION=2.10 bin/spark-shell --packages ai.h2o:sparkling-water-core_2.10:1.6.5 scala> Seq(1).toDS().write.format("parquet").save("foo") ``` Before: ``` java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.h2o.DefaultSource could not be instantiated at java.util.ServiceLoader.fail(ServiceLoader.java:232) at java.util.ServiceLoader.access$100(ServiceLoader.java:185) at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384) at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404) at java.util.ServiceLoader$1.next(ServiceLoader.java:480) ... Caused by: java.lang.NoClassDefFoundError: org/apache/spark/Logging at java.lang.ClassLoader.defineClass1(Native Method) at java.lang.ClassLoader.defineClass(ClassLoader.java:760) at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142) at java.net.URLClassLoader.defineClass(URLClassLoader.java:467) at java.net.URLClassLoader.access$100(URLClassLoader.java:73) at java.net.URLClassLoader$1.run(URLClassLoader.java:368) at java.net.URLClassLoader$1.run(URLClassLoader.java:362) at java.security.AccessController.doPrivileged(Native Method) ... ``` After: ``` java.lang.ClassNotFoundException: Detected an incompatible DataSourceRegister. Please remove the incompatible library from classpath or upgrade it. Error: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.h2o.DefaultSource could not be instantiated at org.apache.spark.sql.execution.datasources.DataSource.lookupDataSource(DataSource.scala:178) at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:79) at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:79) at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:441) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:213) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:196) ... ``` Author: Shixiong ZhuCloses #14651 from zsxwing/SPARK-17065. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/268b71d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/268b71d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/268b71d0 Branch: refs/heads/master Commit: 268b71d0d792f875fcfaec5314862236754a00d6 Parents: fffb0c0 Author: Shixiong Zhu Authored: Mon Aug 15 15:55:32 2016 -0700 Committer: Yin Huai Committed: Mon Aug 15 15:55:32 2016 -0700 -- .../sql/execution/datasources/DataSource.scala | 91 +++- 1 file changed, 52 insertions(+), 39 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/268b71d0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 79024fd..5ad6ae0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources -import java.util.ServiceLoader +import java.util.{ServiceConfigurationError, ServiceLoader} import scala.collection.JavaConverters._ import scala.language.{existentials, implicitConversions} @@ -124,50 +124,63 @@ case class DataSource( val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) - serviceLoader.asScala.filter(_.shortName().equalsIgnoreCase(provider)).toList match { - // the provider format did not match any given registered aliases - case Nil => -try { - Try(loader.loadClass(provider)).orElse(Try(loader.loadClass(provider2))) match { -case Success(dataSource) => - // Found the data source using fully qualified path -
[GitHub] spark issue #14651: [SPARK-17065][SQL]Improve the error message when encount...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14651 Merging to master and branch 2.0. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14651: [SPARK-17065][SQL]Improve the error message when encount...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14651 LGTM. Thanks! --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14634: [SPARK-17051][SQL] we should use hadoopConf in InsertInt...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14634 Sorry. What's the necessity to make this change? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74665317 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -207,15 +310,52 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) +verifyTableProperties(tableDefinition) --- End diff -- alterTable is my main concern. It is possible that `tableDefinition` is in the standard form. But, the actual metadata stored in the metastore is using properties to store schema. Then, the alterTable may just fail or significantly change the metadata in metastore. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74665106 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,19 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = (table.properties.keys ++ table.storage.properties.keys) + .filter(_.startsWith(DATASOURCE_PREFIX)) +if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + +s"as table/storage property keys may not start with '$DATASOURCE_PREFIX': " + +datasourceKeys.mkString("[", ", ", "]")) +} + } --- End diff -- nvm. This is for writing. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74665091 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,19 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = (table.properties.keys ++ table.storage.properties.keys) + .filter(_.startsWith(DATASOURCE_PREFIX)) --- End diff -- I think we should not include `table.storage.properties`, right? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74665033 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -363,3 +503,82 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu } } + +object HiveExternalCatalog { + val DATASOURCE_PREFIX = "spark.sql.sources." + val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" + val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" + val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "." + val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts" + val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols" + val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols" + val DATASOURCE_SCHEMA_NUMBUCKETS = DATASOURCE_SCHEMA_PREFIX + "numBuckets" + val DATASOURCE_SCHEMA_NUMBUCKETCOLS = DATASOURCE_SCHEMA_PREFIX + "numBucketCols" + val DATASOURCE_SCHEMA_PART_PREFIX = DATASOURCE_SCHEMA_PREFIX + "part." + val DATASOURCE_SCHEMA_PARTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "partCol." + val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol." + val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol." + + def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = { +metadata.properties.get(DATASOURCE_PROVIDER) + } + + def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = { +metadata.properties.filterNot { case (key, _) => key.startsWith(DATASOURCE_PREFIX) } --- End diff -- For a pre-built spark, there is no way to read the raw data, right? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74664610 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -207,15 +310,52 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) +verifyTableProperties(tableDefinition) --- End diff -- For data source tables, seems we also need to adjust the metadata? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74664134 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +162,101 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val provider = tableDefinition.provider.get + val partitionColumns = tableDefinition.partitionColumnNames + val bucketSpec = tableDefinition.bucketSpec + + val tableProperties = new scala.collection.mutable.HashMap[String, String] + tableProperties.put(DATASOURCE_PROVIDER, provider) + + // Serialized JSON schema string may be too long to be stored into a single metastore table + // property. In this case, we split the JSON string and store each part as a separate table + // property. + val threshold = 4000 --- End diff -- Should we consider this conf as an immutable one and pass the value in when we create the external catalog? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74664001 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,19 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = (table.properties.keys ++ table.storage.properties.keys) + .filter(_.startsWith(DATASOURCE_PREFIX)) +if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + +s"as table/storage property keys may not start with '$DATASOURCE_PREFIX': " + +datasourceKeys.mkString("[", ", ", "]")) +} + } --- End diff -- actually, I am wondering if we should keep it in the properties? So, it is easy for us to debug? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74663776 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala --- @@ -396,40 +393,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(e.message == "Found duplicate column(s) in bucket: a") } - test("Describe Table with Corrupted Schema") { -import testImplicits._ - -val tabName = "tab1" -withTempPath { dir => - val path = dir.getCanonicalPath - val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("col1", "col2") - df.write.format("json").save(path) - - withTable(tabName) { -sql( - s""" - |CREATE TABLE $tabName - |USING json - |OPTIONS ( - | path '$path' - |) - """.stripMargin) - -val catalog = spark.sessionState.catalog -val table = catalog.getTableMetadata(TableIdentifier(tabName)) -val newProperties = table.properties.filterKeys(key => - key != CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTS) -val newTable = table.copy(properties = newProperties) -catalog.alterTable(newTable) - -val e = intercept[AnalysisException] { - sql(s"DESC $tabName") -}.getMessage -assert(e.contains(s"Could not read schema from the metastore because it is corrupted")) - } -} - } --- End diff -- do we still have this test? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14586: [SPARK-17003] [BUILD] [BRANCH-1.6] release-build....
Github user yhuai closed the pull request at: https://github.com/apache/spark/pull/14586 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-17003][BUILD][BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.11
Repository: spark Updated Branches: refs/heads/branch-1.6 b3ecff640 -> 909231d7a [SPARK-17003][BUILD][BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.11 ## What changes were proposed in this pull request? hive-thriftserver works with Scala 2.11 (https://issues.apache.org/jira/browse/SPARK-8013). So, let's publish scala 2.11 artifacts with the flag of `-Phive-thfitserver`. I am also fixing the doc. Author: Yin Huai <yh...@databricks.com> Closes #14586 from yhuai/SPARK-16453-branch-1.6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/909231d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/909231d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/909231d7 Branch: refs/heads/branch-1.6 Commit: 909231d7aec591af2fcf0ffaf0612a8c034bcd7a Parents: b3ecff6 Author: Yin Huai <yh...@databricks.com> Authored: Fri Aug 12 10:29:05 2016 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Fri Aug 12 10:29:05 2016 -0700 -- dev/create-release/release-build.sh | 10 -- docs/building-spark.md | 2 -- python/pyspark/sql/functions.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/909231d7/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 2c3af6a..840fb20 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -80,7 +80,7 @@ NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads BASE_DIR=$(pwd) MVN="build/mvn --force" -PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" +PUBLISH_PROFILES="-Pyarn -Phive -Phive-thriftserver -Phadoop-2.2" PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" rm -rf spark @@ -187,7 +187,7 @@ if [[ "$1" == "package" ]]; then # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds # share the same Zinc server. make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & - make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & + make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dscala-2.11" "3031" & make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & @@ -256,8 +256,7 @@ if [[ "$1" == "publish-snapshot" ]]; then # Generate random point for Zinc export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ --Phive-thriftserver deploy + $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES deploy ./dev/change-scala-version.sh 2.11 $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ -DskipTests $PUBLISH_PROFILES clean deploy @@ -293,8 +292,7 @@ if [[ "$1" == "publish-release" ]]; then # Generate random point for Zinc export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ --Phive-thriftserver clean install + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES clean install ./dev/change-scala-version.sh 2.11 http://git-wip-us.apache.org/repos/asf/spark/blob/909231d7/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 5f694dc..4348b38 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -129,8 +129,6 @@ To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` prop ./dev/change-scala-version.sh 2.11 mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package -Spark does not yet support its JDBC component for Scala 2.11. - # Spark Tests in Maven Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_mave
[GitHub] spark issue #14586: [SPARK-17003] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 merging to branch 1.6. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14586: [SPARK-17003] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 I am merging this. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14546: [SPARK-16955][SQL] Using ordinals in ORDER BY and GROUP ...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14546 @dongjoon-hyun Seems this issue has been fixed as a by-product of https://github.com/apache/spark/pull/14595. How about we close this? Also, feel free to look at @clockfly's follow-up pr. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14586: [SPARK-17003] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 test this please --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14586: [SPARK-17003] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 Done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14603: [SPARK-17021][SQL] simplify the constructor parameters o...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14603 With this chnage, I think we can use encoder to serialize it. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14603: [SPARK-17021][SQL] simplify the constructor parameters o...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14603 LGTM. Merging to master. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14456: [SPARK-16831] [Python] Fixed bug in CrossValidator.avgMe...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14456 Thanks! Seems https://github.com/apache/spark/pull/12464 introduced avgMetrics to CrossValidator model. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14456: [SPARK-16831] [Python] Fixed bug in CrossValidator.avgMe...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14456 Sorry. I think this pr breaks 1.6 build. ``` ** File "/home/jenkins/workspace/NewSparkPullRequestBuilder/python/pyspark/ml/tuning.py", line 111, in __main__.CrossValidator Failed example: cvModel.avgMetrics[0] Exception raised: Traceback (most recent call last): File "/usr/lib64/python2.6/doctest.py", line 1253, in __run compileflags, 1) in test.globs File "", line 1, in cvModel.avgMetrics[0] AttributeError: 'CrossValidatorModel' object has no attribute 'avgMetrics' ** ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #13775: [SPARK-16060][SQL] Vectorized Orc reader
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/13775 for the benchmark, how about we just test the scan operation? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14586: [SPARK-16453] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 test this please --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14586: [SPARK-16453] [BUILD] [BRANCH-1.6] release-build.sh is m...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14586 test this please --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14586: [SPARK-16453] [BUILD] [BRANCH-1.6] release-build....
GitHub user yhuai opened a pull request: https://github.com/apache/spark/pull/14586 [SPARK-16453] [BUILD] [BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.10 ## What changes were proposed in this pull request? hive-thriftserver works with Scala 2.11 (https://issues.apache.org/jira/browse/SPARK-8013). So, let's publish scala 2.11 artifacts with the flag of `-Phive-thfitserver`. I am also fixing the doc. You can merge this pull request into a Git repository by running: $ git pull https://github.com/yhuai/spark SPARK-16453-branch-1.6 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/14586.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #14586 commit cdd80b8dcfb5e74a4fe8aa9dc44a840f6f4ebd81 Author: Yin Huai <yh...@databricks.com> Date: 2016-08-10T22:29:23Z [SPARK-16453] [BUILD] [BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.10 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14155: [SPARK-16498][SQL] move hive hack for data source table ...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14155 Thank you for working on this! It's great to see we are moving those hacks into `HiveExternalCatalog`. It will be very helpful if we can have two diagrams to show how we use CatalogTable before and after this refactoring. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74131369 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala --- @@ -18,21 +18,32 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import testImplicits._ protected override def beforeAll(): Unit = { super.beforeAll() -sql( - """ -|CREATE TABLE parquet_tab1 (c1 INT, c2 STRING) -|USING org.apache.spark.sql.parquet.DefaultSource - """.stripMargin) --- End diff -- How did we originally add `my_key1`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74131072 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala --- @@ -70,64 +69,16 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val cacheLoader = new CacheLoader[QualifiedTableName, LogicalPlan]() { override def load(in: QualifiedTableName): LogicalPlan = { logDebug(s"Creating new cached data source for $in") -val table = client.getTable(in.database, in.name) +val table = sparkSession.sharedState.externalCatalog.getTable(in.database, in.name) -// TODO: the following code is duplicated with FindDataSourceTable.readDataSourceTable --- End diff -- nice! --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74130831 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -363,3 +503,82 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu } } + +object HiveExternalCatalog { + val DATASOURCE_PREFIX = "spark.sql.sources." + val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" + val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" + val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "." + val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts" + val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols" + val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols" + val DATASOURCE_SCHEMA_NUMBUCKETS = DATASOURCE_SCHEMA_PREFIX + "numBuckets" + val DATASOURCE_SCHEMA_NUMBUCKETCOLS = DATASOURCE_SCHEMA_PREFIX + "numBucketCols" + val DATASOURCE_SCHEMA_PART_PREFIX = DATASOURCE_SCHEMA_PREFIX + "part." + val DATASOURCE_SCHEMA_PARTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "partCol." + val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol." + val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol." + + def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = { +metadata.properties.get(DATASOURCE_PROVIDER) + } + + def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = { +metadata.properties.filterNot { case (key, _) => key.startsWith(DATASOURCE_PREFIX) } --- End diff -- I am wondering if it is still helpful to keep those in the properties (they may be useful when we want to debug an issue). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74130452 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -144,16 +162,101 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) +verifyTableProperties(tableDefinition) + +if (tableDefinition.provider == Some("hive") || tableDefinition.tableType == VIEW) { + client.createTable(tableDefinition, ignoreIfExists) +} else { + val provider = tableDefinition.provider.get + val partitionColumns = tableDefinition.partitionColumnNames + val bucketSpec = tableDefinition.bucketSpec + + val tableProperties = new scala.collection.mutable.HashMap[String, String] + tableProperties.put(DATASOURCE_PROVIDER, provider) + + // Serialized JSON schema string may be too long to be stored into a single metastore table + // property. In this case, we split the JSON string and store each part as a separate table + // property. + val threshold = 4000 --- End diff -- Let's me see if we can still get the conf value. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74130034 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,19 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = (table.properties.keys ++ table.storage.properties.keys) + .filter(_.startsWith(DATASOURCE_PREFIX)) +if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + +s"as table/storage property keys may not start with '$DATASOURCE_PREFIX': " + +datasourceKeys.mkString("[", ", ", "]")) +} + } --- End diff -- Oh, I see. You do not want `CatalogTable`'s properties have keys starting with `DATASOURCE_PREFIX`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74129714 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala --- @@ -81,6 +86,19 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contains datasource properties, throw an exception. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { +val datasourceKeys = (table.properties.keys ++ table.storage.properties.keys) + .filter(_.startsWith(DATASOURCE_PREFIX)) +if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persistent ${table.qualifiedName} into hive metastore " + +s"as table/storage property keys may not start with '$DATASOURCE_PREFIX': " + +datasourceKeys.mkString("[", ", ", "]")) +} + } --- End diff -- How is it used? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74129515 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala --- @@ -689,4 +689,38 @@ class HiveDDLSuite )) } } + + test("datasource table property keys are not allowed") { +import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX + +withTable("tbl") { + sql("CREATE TABLE tbl(a INT) STORED AS parquet") + + val e = intercept[AnalysisException] { +sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')") + } + assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo")) --- End diff -- ah, I guess I am confused. Sorry. Who will throw the exception? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74128927 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala --- @@ -93,7 +92,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { .add("col2", "string") .add("a", "int") .add("b", "int"), - provider = Some("parquet"), + provider = Some("hive"), --- End diff -- Will we lose test coverage after changing this line? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r74127228 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -229,10 +230,8 @@ case class AlterTableSetPropertiesCommand( extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { -val ident = if (isView) "VIEW" else "TABLE" val catalog = sparkSession.sessionState.catalog DDLUtils.verifyAlterTableType(catalog, tableName, isView) -DDLUtils.verifyTableProperties(properties.keys.toSeq, s"ALTER $ident") --- End diff -- I am wondering if it is possible to not change this part in this part? We can decide if we want to remove it in its individual pr. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14500: [SPARK-16905] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r74123952 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -425,6 +430,111 @@ case class AlterTableDropPartitionCommand( } +/** + * Recover Partitions in ALTER TABLE: recover all the partition in the directory of a table and + * update the catalog. + * + * The syntax of this command is: + * {{{ + * ALTER TABLE table RECOVER PARTITIONS; + * MSCK REPAIR TABLE table; + * }}} + */ +case class AlterTableRecoverPartitionsCommand( +tableName: TableIdentifier, +cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand { + override def run(spark: SparkSession): Seq[Row] = { +val catalog = spark.sessionState.catalog +if (!catalog.tableExists(tableName)) { + throw new AnalysisException(s"Table $tableName in $cmd does not exist.") +} +val table = catalog.getTableMetadata(tableName) +if (catalog.isTemporaryTable(tableName)) { + throw new AnalysisException( +s"Operation not allowed: $cmd on temporary tables: $tableName") +} +if (DDLUtils.isDatasourceTable(table)) { + throw new AnalysisException( +s"Operation not allowed: $cmd on datasource tables: $tableName") +} +if (table.tableType != CatalogTableType.EXTERNAL) { + throw new AnalysisException( +s"Operation not allowed: $cmd only works on external tables: $tableName") +} +if (!DDLUtils.isTablePartitioned(table)) { + throw new AnalysisException( +s"Operation not allowed: $cmd only works on partitioned tables: $tableName") +} +if (table.storage.locationUri.isEmpty) { + throw new AnalysisException( +s"Operation not allowed: $cmd only works on table with location provided: $tableName") +} + +val root = new Path(table.storage.locationUri.get) +val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) +// Dummy jobconf to get to the pathFilter defined in configuration +// It's very expensive to create a JobConf(ClassUtil.findContainingJar() is slow) +val jobConf = new JobConf(spark.sparkContext.hadoopConfiguration, this.getClass) +val pathFilter = FileInputFormat.getInputPathFilter(jobConf) +val partitionSpecsAndLocs = scanPartitions( + spark, fs, pathFilter, root, Map(), table.partitionColumnNames.map(_.toLowerCase)) +val parts = partitionSpecsAndLocs.map { case (spec, location) => + // inherit table storage format (possibly except for location) + CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString))) +} +spark.sessionState.catalog.createPartitions(tableName, + parts.toArray[CatalogTablePartition], ignoreIfExists = true) --- End diff -- What will happen if we get thousands of new partitions of tens thousands of new partitions? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14500: [SPARK-16905] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14500 @liancheng Can you do a post-hoc review? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14492: [SPARK-16887] Add SPARK_DIST_CLASSPATH to LAUNCH_CLASSPA...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14492 That's a good point. Let me close this. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14492: [SPARK-16887] Add SPARK_DIST_CLASSPATH to LAUNCH_...
Github user yhuai closed the pull request at: https://github.com/apache/spark/pull/14492 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-16749][SQL] Simplify processing logic in LEAD/LAG processing.
Repository: spark Updated Branches: refs/heads/master 53d1c7877 -> df1065883 [SPARK-16749][SQL] Simplify processing logic in LEAD/LAG processing. ## What changes were proposed in this pull request? The logic for LEAD/LAG processing is more complex that it needs to be. This PR fixes that. ## How was this patch tested? Existing tests. Author: Herman van HovellCloses #14376 from hvanhovell/SPARK-16749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df106588 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df106588 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df106588 Branch: refs/heads/master Commit: df10658831f4e5f9756a5732673ad12904b5d05c Parents: 53d1c78 Author: Herman van Hovell Authored: Mon Aug 8 16:34:57 2016 -0700 Committer: Yin Huai Committed: Mon Aug 8 16:34:57 2016 -0700 -- .../apache/spark/sql/execution/WindowExec.scala | 53 +++- 1 file changed, 18 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df106588/sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala index 7149603..b60f17c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala @@ -209,7 +209,8 @@ case class WindowExec( new OffsetWindowFunctionFrame( target, ordinal, -functions, +// OFFSET frame functions are guaranteed be OffsetWindowFunctions. +functions.map(_.asInstanceOf[OffsetWindowFunction]), child.output, (expressions, schema) => newMutableProjection(expressions, schema, subexpressionEliminationEnabled), @@ -557,6 +558,9 @@ private[execution] abstract class WindowFunctionFrame { * The offset window frame calculates frames containing LEAD/LAG statements. * * @param target to write results to. + * @param ordinal the ordinal is the starting offset at which the results of the window frame get + *written into the (shared) target row. The result of the frame expression with + *index 'i' will be written to the 'ordinal' + 'i' position in the target row. * @param expressions to shift a number of rows. * @param inputSchema required for creating a projection. * @param newMutableProjection function used to create the projection. @@ -565,7 +569,7 @@ private[execution] abstract class WindowFunctionFrame { private[execution] final class OffsetWindowFunctionFrame( target: MutableRow, ordinal: Int, -expressions: Array[Expression], +expressions: Array[OffsetWindowFunction], inputSchema: Seq[Attribute], newMutableProjection: (Seq[Expression], Seq[Attribute]) => MutableProjection, offset: Int) extends WindowFunctionFrame { @@ -576,12 +580,6 @@ private[execution] final class OffsetWindowFunctionFrame( /** Index of the input row currently used for output. */ private[this] var inputIndex = 0 - /** Row used when there is no valid input. */ - private[this] val emptyRow = new GenericInternalRow(inputSchema.size) - - /** Row used to combine the offset and the current row. */ - private[this] val join = new JoinedRow - /** * Create the projection used when the offset row exists. * Please note that this project always respect null input values (like PostgreSQL). @@ -589,12 +587,8 @@ private[execution] final class OffsetWindowFunctionFrame( private[this] val projection = { // Collect the expressions and bind them. val inputAttrs = inputSchema.map(_.withNullability(true)) -val boundExpressions = Seq.fill(ordinal)(NoOp) ++ expressions.toSeq.map { - case e: OffsetWindowFunction => -val input = BindReferences.bindReference(e.input, inputAttrs) -input - case e => -BindReferences.bindReference(e, inputAttrs) +val boundExpressions = Seq.fill(ordinal)(NoOp) ++ expressions.toSeq.map { e => + BindReferences.bindReference(e.input, inputAttrs) } // Create the projection. @@ -605,23 +599,14 @@ private[execution] final class OffsetWindowFunctionFrame( private[this] val fillDefaultValue = { // Collect the expressions and bind them. val inputAttrs = inputSchema.map(_.withNullability(true)) -val numInputAttributes = inputAttrs.size -val boundExpressions = Seq.fill(ordinal)(NoOp) ++ expressions.toSeq.map { -
[GitHub] spark issue #14376: [SPARK-16749][SQL] Simplify processing logic in LEAD/LAG...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14376 Merging to master. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14531: [SPARK-16943] [SPARK-16942] [SQL] Fix multiple bugs in C...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14531 We do not support index tables at all (you can not create such a table). Let's not add the support right now. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14207: [SPARK-16552] [SQL] Store the Inferred Schemas into Exte...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14207 @gatorsmile Where is change for the following description? ``` This PR is to store the inferred schema in the external catalog when creating the table. When users intend to refresh the schema after possible changes on external files (table location), they issue REFRESH TABLE. Spark SQL will infer the schema again based on the previously specified table location and update/refresh the schema in the external catalog and metadata cache. ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14207: [SPARK-16552] [SQL] Store the Inferred Schemas in...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14207#discussion_r73941472 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala --- @@ -95,17 +95,39 @@ case class CreateDataSourceTableCommand( } // Create the relation to validate the arguments before writing the metadata to the metastore. -DataSource( - sparkSession = sparkSession, - userSpecifiedSchema = userSpecifiedSchema, - className = provider, - bucketSpec = None, - options = optionsWithPath).resolveRelation(checkPathExist = false) +val dataSource: BaseRelation = + DataSource( +sparkSession = sparkSession, +userSpecifiedSchema = userSpecifiedSchema, +className = provider, +bucketSpec = None, +options = optionsWithPath).resolveRelation(checkPathExist = false) + +val partitionColumns = if (userSpecifiedSchema.nonEmpty) { + userSpecifiedPartitionColumns +} else { + val res = dataSource match { +case r: HadoopFsRelation => r.partitionSchema.fieldNames +case _ => Array.empty[String] + } + if (userSpecifiedPartitionColumns.length > 0) { +// The table does not have a specified schema, which means that the schema will be inferred +// when we load the table. So, we are not expecting partition columns and we will discover +// partitions when we load the table. However, if there are specified partition columns, +// we simply ignore them and provide a warning message. +logWarning( + s"Specified partition columns (${userSpecifiedPartitionColumns.mkString(",")}) will be " + +s"ignored. The schema and partition columns of table $tableIdent are inferred. " + +s"Schema: ${dataSource.schema.simpleString}; " + +s"Partition columns: ${res.mkString("(", ", ", ")")}") + } + res +} CreateDataSourceTableUtils.createDataSourceTable( sparkSession = sparkSession, tableIdent = tableIdent, - userSpecifiedSchema = userSpecifiedSchema, + schema = dataSource.schema, --- End diff -- I think from the code, it is not very clear that `dataSource.schema` will be `userSpecifiedSchema`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14207: [SPARK-16552] [SQL] Store the Inferred Schemas in...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14207#discussion_r73940895 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -521,31 +521,29 @@ object DDLUtils { table.partitionColumns.nonEmpty || table.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS) } - // A persisted data source table may not store its schema in the catalog. In this case, its schema - // will be inferred at runtime when the table is referenced. - def getSchemaFromTableProperties(metadata: CatalogTable): Option[StructType] = { + // A persisted data source table always store its schema in the catalog. + def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { require(isDatasourceTable(metadata)) +val msgSchemaCorrupted = "Could not read schema from the metastore because it is corrupted." val props = metadata.properties -if (props.isDefinedAt(DATASOURCE_SCHEMA)) { +props.get(DATASOURCE_SCHEMA).map { schema => // Originally, we used spark.sql.sources.schema to store the schema of a data source table. // After SPARK-6024, we removed this flag. // Although we are not using spark.sql.sources.schema any more, we need to still support. - props.get(DATASOURCE_SCHEMA).map(DataType.fromJson(_).asInstanceOf[StructType]) -} else { - metadata.properties.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => + DataType.fromJson(schema).asInstanceOf[StructType] +} getOrElse { --- End diff -- I am not sure if `getOrElse` makes the code easier to follow. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14207: [SPARK-16552] [SQL] Store the Inferred Schemas in...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14207#discussion_r73940848 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -521,31 +521,29 @@ object DDLUtils { table.partitionColumns.nonEmpty || table.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS) } - // A persisted data source table may not store its schema in the catalog. In this case, its schema - // will be inferred at runtime when the table is referenced. - def getSchemaFromTableProperties(metadata: CatalogTable): Option[StructType] = { + // A persisted data source table always store its schema in the catalog. + def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { require(isDatasourceTable(metadata)) +val msgSchemaCorrupted = "Could not read schema from the metastore because it is corrupted." val props = metadata.properties -if (props.isDefinedAt(DATASOURCE_SCHEMA)) { +props.get(DATASOURCE_SCHEMA).map { schema => // Originally, we used spark.sql.sources.schema to store the schema of a data source table. // After SPARK-6024, we removed this flag. // Although we are not using spark.sql.sources.schema any more, we need to still support. - props.get(DATASOURCE_SCHEMA).map(DataType.fromJson(_).asInstanceOf[StructType]) -} else { - metadata.properties.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => + DataType.fromJson(schema).asInstanceOf[StructType] +} getOrElse { + props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => val parts = (0 until numParts.toInt).map { index => val part = metadata.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull if (part == null) { -throw new AnalysisException( - "Could not read schema from the metastore because it is corrupted " + -s"(missing part $index of the schema, $numParts parts are expected).") +throw new AnalysisException(msgSchemaCorrupted + + s" (missing part $index of the schema, $numParts parts are expected).") } - part } // Stick all parts back to a single schema string. DataType.fromJson(parts.mkString).asInstanceOf[StructType] - } + } getOrElse(throw new AnalysisException(msgSchemaCorrupted)) --- End diff -- ah, this `getOrElse` is too far from the `get(DATASOURCE_SCHEMA)`... Actually, I prefer the `if/else`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14207: [SPARK-16552] [SQL] Store the Inferred Schemas in...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14207#discussion_r73940468 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala --- @@ -95,17 +95,39 @@ case class CreateDataSourceTableCommand( } // Create the relation to validate the arguments before writing the metadata to the metastore. -DataSource( - sparkSession = sparkSession, - userSpecifiedSchema = userSpecifiedSchema, - className = provider, - bucketSpec = None, - options = optionsWithPath).resolveRelation(checkPathExist = false) +val dataSource: BaseRelation = + DataSource( +sparkSession = sparkSession, +userSpecifiedSchema = userSpecifiedSchema, +className = provider, +bucketSpec = None, +options = optionsWithPath).resolveRelation(checkPathExist = false) + +val partitionColumns = if (userSpecifiedSchema.nonEmpty) { + userSpecifiedPartitionColumns +} else { + val res = dataSource match { +case r: HadoopFsRelation => r.partitionSchema.fieldNames +case _ => Array.empty[String] + } + if (userSpecifiedPartitionColumns.length > 0) { +// The table does not have a specified schema, which means that the schema will be inferred +// when we load the table. So, we are not expecting partition columns and we will discover +// partitions when we load the table. However, if there are specified partition columns, +// we simply ignore them and provide a warning message. +logWarning( + s"Specified partition columns (${userSpecifiedPartitionColumns.mkString(",")}) will be " + +s"ignored. The schema and partition columns of table $tableIdent are inferred. " + +s"Schema: ${dataSource.schema.simpleString}; " + +s"Partition columns: ${res.mkString("(", ", ", ")")}") + } + res +} CreateDataSourceTableUtils.createDataSourceTable( sparkSession = sparkSession, tableIdent = tableIdent, - userSpecifiedSchema = userSpecifiedSchema, + schema = dataSource.schema, --- End diff -- seems we should still use the user-specified schema, right? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14207: [SPARK-16552] [SQL] Store the Inferred Schemas in...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14207#discussion_r73940350 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala --- @@ -95,17 +95,39 @@ case class CreateDataSourceTableCommand( } // Create the relation to validate the arguments before writing the metadata to the metastore. -DataSource( - sparkSession = sparkSession, - userSpecifiedSchema = userSpecifiedSchema, - className = provider, - bucketSpec = None, - options = optionsWithPath).resolveRelation(checkPathExist = false) +val dataSource: BaseRelation = + DataSource( +sparkSession = sparkSession, +userSpecifiedSchema = userSpecifiedSchema, +className = provider, +bucketSpec = None, +options = optionsWithPath).resolveRelation(checkPathExist = false) + +val partitionColumns = if (userSpecifiedSchema.nonEmpty) { + userSpecifiedPartitionColumns +} else { + val res = dataSource match { +case r: HadoopFsRelation => r.partitionSchema.fieldNames +case _ => Array.empty[String] + } + if (userSpecifiedPartitionColumns.length > 0) { --- End diff -- Should we throw an exception for this case? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14541: [SPARK-16953] Make requestTotalExecutors public Develope...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14541 lgtm --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14376: [SPARK-16749][SQL] Simplify processing logic in L...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14376#discussion_r73917166 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/WindowExec.scala --- @@ -565,7 +566,7 @@ private[execution] abstract class WindowFunctionFrame { private[execution] final class OffsetWindowFunctionFrame( target: MutableRow, ordinal: Int, --- End diff -- Since we are changing this file, can you add comment to explain what is ordinal (with an example)? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #14518: [SPARK-16610][SQL] Add `orc.compress` as an alias for `c...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14518 I think it is fine that `compression` takes precedence. btw, is this flag used by other data sources? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14518: [SPARK-16610][SQL] Add `orc.compress` as an alias...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14518#discussion_r73914641 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala --- @@ -31,7 +30,8 @@ private[orc] class OrcOptions( * Acceptable values are defined in [[shortOrcCompressionCodecNames]]. */ val compressionCodec: String = { -val codecName = parameters.getOrElse("compression", "snappy").toLowerCase +val codecName = parameters.getOrElse( + "compression", parameters.getOrElse("orc.compress", "snappy")).toLowerCase --- End diff -- use `OrcRelation.ORC_COMPRESSION` (since we have a val defined)? Let's add comments to explain what we are doing (we should mention that orc.compress is a ORC conf and which conf will take precedence). Also, will the following lines look better? ``` val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION) val codecName = parameters .get("compression") .orElse(orcCompressionConf) .getOrElse("snappy") ``` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14518: [SPARK-16610][SQL] Add `orc.compress` as an alias...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14518#discussion_r73914816 --- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala --- @@ -161,6 +161,29 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } + test("SPARK-16610: Respect orc.compress option when compression is unset") { +// Respect `orc.compress`. +withTempPath { file => + spark.range(0, 10).write +.option("orc.compress", "ZLIB") +.orc(file.getCanonicalPath) + val expectedCompressionKind = + OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression + assert("ZLIB" === expectedCompressionKind.name()) +} + +// `compression` overrides `orc.compress`. +withTempPath { file => + spark.range(0, 10).write +.option("compression", "ZLIB") +.option("orc.compress", "SNAPPY") +.orc(file.getCanonicalPath) + val expectedCompressionKind = + OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression + assert("ZLIB" === expectedCompressionKind.name()) +} + } --- End diff -- nice. Thank you for adding this test. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14518: [SPARK-16610][SQL] Do not ignore `orc.compress` w...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14518#discussion_r73811203 --- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala --- @@ -17,27 +17,40 @@ package org.apache.spark.sql.hive.orc +import org.apache.hadoop.conf.Configuration + /** * Options for the ORC data source. */ private[orc] class OrcOptions( -@transient private val parameters: Map[String, String]) +@transient private val parameters: Map[String, String], +@transient private val conf: Configuration) extends Serializable { import OrcOptions._ /** - * Compression codec to use. By default snappy compression. + * Compression codec to use. By default use the value specified in Hadoop configuration. + * If `orc.compress` is unset, then we use snappy. * Acceptable values are defined in [[shortOrcCompressionCodecNames]]. */ val compressionCodec: String = { -val codecName = parameters.getOrElse("compression", "snappy").toLowerCase -if (!shortOrcCompressionCodecNames.contains(codecName)) { - val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase) - throw new IllegalArgumentException(s"Codec [$codecName] " + -s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.") +val default = conf.get(OrcRelation.ORC_COMPRESSION, "SNAPPY") --- End diff -- Sorry. Maybe I did not explain clearly in the jira. The use case I mentioned was `df.write.option("orc.compress", ...)`. We do not need to look at hadoop conf. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark issue #12872: [SPARK-6339][SQL] Supports CREATE TEMPORARY VIEW tableId...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/12872 Can you be more specific on the inconsistency? Seems `ALTER VIEW view_name` is the only inconsistent command? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
spark git commit: [SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values
Repository: spark Updated Branches: refs/heads/branch-2.0 d99d90982 -> b5d65b45d [SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values ## What changes were proposed in this pull request? When we create the HiveConf for metastore client, we use a Hadoop Conf as the base, which may contain Hive settings in hive-site.xml (https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49). However, HiveConf's initialize function basically ignores the base Hadoop Conf and always its default values (i.e. settings with non-null default values) as the base (https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687). So, even a user put javax.jdo.option.ConnectionURL in hive-site.xml, it is not used and Hive will use its default, which is jdbc:derby:;databaseName=metastore_db;create=true. This issue only shows up when `spark.sql.hive.metastore.jars` is not set to builtin. ## How was this patch tested? New test in HiveSparkSubmitSuite. Author: Yin Huai <yh...@databricks.com> Closes #14497 from yhuai/SPARK-16901. (cherry picked from commit e679bc3c1cd418ef0025d2ecbc547c9660cac433) Signed-off-by: Yin Huai <yh...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5d65b45 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5d65b45 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5d65b45 Branch: refs/heads/branch-2.0 Commit: b5d65b45dfd34a7f451465ed3aac923077675166 Parents: d99d909 Author: Yin Huai <yh...@databricks.com> Authored: Fri Aug 5 15:52:02 2016 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Fri Aug 5 15:52:17 2016 -0700 -- .../spark/sql/hive/client/HiveClientImpl.scala | 24 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 80 2 files changed, 101 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5d65b45/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 6cdf3ef..1d40895 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -139,14 +139,32 @@ private[hive] class HiveClientImpl( // so we should keep `conf` and reuse the existing instance of `CliSessionState`. originalState } else { -val hiveConf = new HiveConf(hadoopConf, classOf[SessionState]) +val hiveConf = new HiveConf(classOf[SessionState]) +// 1: we set all confs in the hadoopConf to this hiveConf. +// This hadoopConf contains user settings in Hadoop's core-site.xml file +// and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in +// SharedState and put settings in this hadoopConf instead of relying on HiveConf +// to load user settings. Otherwise, HiveConf's initialize method will override +// settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars +// is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath +// has hive-site.xml. So, HiveConf will use that to override its default values. +hadoopConf.iterator().asScala.foreach { entry => + val key = entry.getKey + val value = entry.getValue + if (key.toLowerCase.contains("password")) { +logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx") + } else { +logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value") + } + hiveConf.set(key, value) +} // HiveConf is a Hadoop Configuration, which has a field of classLoader and // the initial value will be the current thread's context class loader // (i.e. initClassLoader at here). // We call initialConf.setClassLoader(initClassLoader) at here to make // this action explicit. hiveConf.setClassLoader(initClassLoader) -// First, we set all spark confs to this hiveConf. +// 2: we set all spark confs to this hiveConf. sparkConf.getAll.foreach { case (k, v) => if (k.toLowerCase.contains("password")) { logDebug(s"Applying Spark config to Hive Conf: $k=xxx") @@ -155,7 +173,7 @@ private[hive] class HiveClientImp
spark git commit: [SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values
Repository: spark Updated Branches: refs/heads/master 6cbde337a -> e679bc3c1 [SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values ## What changes were proposed in this pull request? When we create the HiveConf for metastore client, we use a Hadoop Conf as the base, which may contain Hive settings in hive-site.xml (https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49). However, HiveConf's initialize function basically ignores the base Hadoop Conf and always its default values (i.e. settings with non-null default values) as the base (https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687). So, even a user put javax.jdo.option.ConnectionURL in hive-site.xml, it is not used and Hive will use its default, which is jdbc:derby:;databaseName=metastore_db;create=true. This issue only shows up when `spark.sql.hive.metastore.jars` is not set to builtin. ## How was this patch tested? New test in HiveSparkSubmitSuite. Author: Yin Huai <yh...@databricks.com> Closes #14497 from yhuai/SPARK-16901. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e679bc3c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e679bc3c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e679bc3c Branch: refs/heads/master Commit: e679bc3c1cd418ef0025d2ecbc547c9660cac433 Parents: 6cbde33 Author: Yin Huai <yh...@databricks.com> Authored: Fri Aug 5 15:52:02 2016 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Fri Aug 5 15:52:02 2016 -0700 -- .../spark/sql/hive/client/HiveClientImpl.scala | 24 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 80 2 files changed, 101 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e679bc3c/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index ef69ac7..3bf4ed5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -141,14 +141,32 @@ private[hive] class HiveClientImpl( // so we should keep `conf` and reuse the existing instance of `CliSessionState`. originalState } else { -val hiveConf = new HiveConf(hadoopConf, classOf[SessionState]) +val hiveConf = new HiveConf(classOf[SessionState]) +// 1: we set all confs in the hadoopConf to this hiveConf. +// This hadoopConf contains user settings in Hadoop's core-site.xml file +// and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in +// SharedState and put settings in this hadoopConf instead of relying on HiveConf +// to load user settings. Otherwise, HiveConf's initialize method will override +// settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars +// is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath +// has hive-site.xml. So, HiveConf will use that to override its default values. +hadoopConf.iterator().asScala.foreach { entry => + val key = entry.getKey + val value = entry.getValue + if (key.toLowerCase.contains("password")) { +logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx") + } else { +logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value") + } + hiveConf.set(key, value) +} // HiveConf is a Hadoop Configuration, which has a field of classLoader and // the initial value will be the current thread's context class loader // (i.e. initClassLoader at here). // We call initialConf.setClassLoader(initClassLoader) at here to make // this action explicit. hiveConf.setClassLoader(initClassLoader) -// First, we set all spark confs to this hiveConf. +// 2: we set all spark confs to this hiveConf. sparkConf.getAll.foreach { case (k, v) => if (k.toLowerCase.contains("password")) { logDebug(s"Applying Spark config to Hive Conf: $k=xxx") @@ -157,7 +175,7 @@ private[hive] class HiveClientImpl( } hiveConf.set(k, v) } -// Second, we set all entries in config t
[GitHub] spark issue #14497: [SPARK-16901] Hive settings in hive-site.xml may be over...
Github user yhuai commented on the issue: https://github.com/apache/spark/pull/14497 Thanks for reviewing! I am merging this to master and branch 2.0. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14500: [SPARK-] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r73730807 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -425,6 +431,96 @@ case class AlterTableDropPartitionCommand( } +/** + * Discover Partitions in ALTER TABLE: discover all the partition in the directory of a table and + * update the catalog. + * + * The syntax of this command is: + * {{{ + * ALTER TABLE table DISCOVER PARTITIONS; + * }}} + */ +case class AlterTableRecoverPartitionsCommand( +tableName: TableIdentifier) extends RunnableCommand { + override def run(spark: SparkSession): Seq[Row] = { +val catalog = spark.sessionState.catalog +if (!catalog.tableExists(tableName)) { + throw new AnalysisException( +s"Table $tableName in ALTER TABLE RECOVER PARTITIONS does not exist.") +} +val table = catalog.getTableMetadata(tableName) +if (catalog.isTemporaryTable(tableName)) { + throw new AnalysisException( +s"Operation not allowed: ALTER TABLE RECOVER PARTITIONS on temporary tables: $tableName") +} +if (DDLUtils.isDatasourceTable(table)) { + throw new AnalysisException( +s"Operation not allowed: ALTER TABLE RECOVER PARTITIONS on datasource tables: $tableName") +} +if (table.tableType != CatalogTableType.EXTERNAL) { + throw new AnalysisException( +s"Operation not allowed: ALTER TABLE RECOVER PARTITIONS only works on external " + + s"tables: $tableName") +} +if (DDLUtils.isTablePartitioned(table)) { + throw new AnalysisException( +s"Operation not allowed: ALTER TABLE RECOVER PARTITIONS only works on partitioned " + + s"tables: $tableName") +} +if (table.storage.locationUri.isEmpty) { + throw new AnalysisException( +s"Operation not allowed: ALTER TABLE RECOVER PARTITIONS only works on tables with " + + s"location provided: $tableName") +} + +recoverPartitions(spark, table) +Seq.empty[Row] + } + + def recoverPartitions(spark: SparkSession, table: CatalogTable): Unit = { +val root = new Path(table.storage.locationUri.get) +val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) +val partitionSpecsAndLocs = scanPartitions(spark, fs, root, Map(), table.partitionSchema.size) +val parts = partitionSpecsAndLocs.map { case (spec, location) => + // inherit table storage format (possibly except for location) + CatalogTablePartition(spec, table.storage.copy(locationUri = Some(location.toUri.toString))) +} +spark.sessionState.catalog.createPartitions(tableName, + parts.toArray[CatalogTablePartition], ignoreIfExists = true) + } + + @transient private lazy val evalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) + + private def scanPartitions( + spark: SparkSession, + fs: FileSystem, + path: Path, + spec: TablePartitionSpec, + numPartitionsLeft: Int): GenSeq[(TablePartitionSpec, Path)] = { --- End diff -- Let's see if we can reuse code in `PartitionUtils`. Also, path name can be escaped. We need to handle this kind of cases (we have `unescapePathName` in `PartitionUtils`). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14500: [SPARK-] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r73730136 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala --- @@ -409,6 +409,18 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { } /** + * Create a [[RepairTableCommand]] command. + * + * For example: + * {{{ + * MSCK REPAIR TABLE tablename + * }}} + */ + override def visitRepairTable(ctx: RepairTableContext): LogicalPlan = withOrigin(ctx) { +RepairTableCommand(visitTableIdentifier(ctx.tableIdentifier)) --- End diff -- I see. How about we use a single command internally? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14500: [SPARK-] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r73729927 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala --- @@ -827,6 +827,45 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { testAddPartitions(isDatasourceTable = true) } + test("alter table: recover partitions (sequential)") { +withSQLConf("spark.rdd.parallelListingThreshold" -> "1") { + testRecoverPartitions() +} + } + + test("after table: recover partition (parallel)") { +withSQLConf("spark.rdd.parallelListingThreshold" -> "10") { + testRecoverPartitions() +} + } + + private def testRecoverPartitions() { +val catalog = spark.sessionState.catalog +// table to alter does not exist +intercept[AnalysisException] { + sql("ALTER TABLE does_not_exist RECOVER PARTITIONS") +} + +val tableIdent = TableIdentifier("tab1") +createTable(catalog, tableIdent) +val part1 = Map("a" -> "1", "b" -> "5") +createTablePartition(catalog, part1, tableIdent) +assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) + +val part2 = Map("a" -> "2", "b" -> "6") +val root = new Path(catalog.getTableMetadata(tableIdent).storage.locationUri.get) +val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) +fs.mkdirs(new Path(new Path(root, "a=1"), "b=5")) +fs.mkdirs(new Path(new Path(root, "a=2"), "b=6")) +try { + sql("ALTER TABLE tab1 RECOVER PARTITIONS") + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == +Set(part1, part2)) +} finally { + fs.delete(root, true) +} + } --- End diff -- Let's add tests to exercise the command more. Here are three examples. 1. There is an partition dir has a bad name (not in the format of key=value). 2. Say that we have two partition columns. We have some files under the first layer (e.g. _SUCCESS, parquet's metadata files, and/or regular data files). 3. Some dirs do not have the expected number of partition columns. For example, the schema specifies 3 partition columns. But, a path only has two partition columns. 4. The partition column columns encoded in the path does not match the name specified in the schema. For example, when we create the table, we specify `c1` as the first partition column. However, the dir in fs has `c2` as the first partition column. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14500: [SPARK-] SQL DDL: MSCK REPAIR TABLE
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14500#discussion_r73728488 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala --- @@ -409,6 +409,18 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { } /** + * Create a [[RepairTableCommand]] command. + * + * For example: + * {{{ + * MSCK REPAIR TABLE tablename + * }}} + */ + override def visitRepairTable(ctx: RepairTableContext): LogicalPlan = withOrigin(ctx) { +RepairTableCommand(visitTableIdentifier(ctx.tableIdentifier)) --- End diff -- Are AlterTableRecoverPartitionsCommand and RepairTableCommand the same? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #14155: [SPARK-16498][SQL] move hive hack for data source...
Github user yhuai commented on a diff in the pull request: https://github.com/apache/spark/pull/14155#discussion_r73727982 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala --- @@ -301,9 +298,6 @@ case class AlterTableSerDePropertiesCommand( "ALTER TABLE attempted to set neither serde class name nor serde properties") override def run(sparkSession: SparkSession): Seq[Row] = { -DDLUtils.verifyTableProperties( --- End diff -- yea. Let's discuss it. Can we revert these changes for now? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org