[spark] branch master updated (f7be024 -> ce1f97f)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from f7be024 [SPARK-37480][K8S][DOC] Sync Kubernetes configuration to latest in running-on-k8s.md add ce1f97f [SPARK-37326][SQL] Support TimestampNTZ in CSV data source No new revisions were added by this update. Summary of changes: docs/sql-data-sources-csv.md | 12 +- .../spark/sql/catalyst/csv/CSVInferSchema.scala| 24 +++ .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 4 + .../sql/catalyst/csv/UnivocityGenerator.scala | 2 +- .../spark/sql/catalyst/csv/UnivocityParser.scala | 4 +- .../spark/sql/catalyst/util/DateTimeUtils.scala| 32 ++- .../sql/catalyst/util/TimestampFormatter.scala | 36 +++- .../spark/sql/errors/QueryExecutionErrors.scala| 8 +- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 12 ++ .../org/apache/spark/sql/CsvFunctionsSuite.scala | 11 ++ .../sql/execution/datasources/csv/CSVSuite.scala | 216 - 11 files changed, 331 insertions(+), 30 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (a6ca481 -> 7484c1b)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from a6ca481 [SPARK-36346][SQL][FOLLOWUP] Rename `withAllOrcReaders` to `withAllNativeOrcReaders` add 7484c1b [SPARK-37468][SQL] Support ANSI intervals and TimestampNTZ for UnionEstimation No new revisions were added by this update. Summary of changes: .../logical/statsEstimation/UnionEstimation.scala | 8 +++- .../statsEstimation/UnionEstimationSuite.scala | 24 +++--- 2 files changed, 28 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37332][SQL] Allow ANSI intervals in `ALTER TABLE .. ADD COLUMNS`
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0f20678 [SPARK-37332][SQL] Allow ANSI intervals in `ALTER TABLE .. ADD COLUMNS` 0f20678 is described below commit 0f20678fc50aaf26359d9751fe96b15dc2e12540 Author: Max Gekk AuthorDate: Tue Nov 16 10:30:11 2021 +0300 [SPARK-37332][SQL] Allow ANSI intervals in `ALTER TABLE .. ADD COLUMNS` ### What changes were proposed in this pull request? In the PR, I propose to allow ANSI intervals: year-month and day-time intervals in the `ALTER TABLE .. ADD COLUMNS` command for tables in v1 and v2 In-Memory catalogs. Also added a unified test suite to migrate related tests in the future. ### Why are the changes needed? To improve user experience with Spark SQL. After the changes, users will be able to add columns with ANSI intervals instead of dropping and creating a new table. ### Does this PR introduce _any_ user-facing change? In some sense, yes. After the changes, the command doesn't output any error message. ### How was this patch tested? By running the new test suite: ``` $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *AlterTableAddColumnsSuite" $ build/sbt -Phive-2.3 "test:testOnly *HiveDDLSuite" ``` Closes #34600 from MaxGekk/add-columns-ansi-intervals. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 +-- .../plans/logical/v2AlterTableCommands.scala | 2 +- .../apache/spark/sql/catalyst/util/TypeUtils.scala | 4 +- .../command/AlterTableAddColumnsSuiteBase.scala| 53 ++ .../command/v1/AlterTableAddColumnsSuite.scala | 38 .../command/v2/AlterTableAddColumnsSuite.scala | 28 .../spark/sql/hive/execution/HiveDDLSuite.scala| 5 -- .../command/AlterTableAddColumnsSuite.scala| 46 +++ 8 files changed, 170 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 1a105ad..5bf37a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -464,12 +464,10 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { failAnalysis(s"Invalid partitioning: ${badReferences.mkString(", ")}") } -create.tableSchema.foreach(f => - TypeUtils.failWithIntervalType(f.dataType, forbidAnsiIntervals = false)) +create.tableSchema.foreach(f => TypeUtils.failWithIntervalType(f.dataType)) case write: V2WriteCommand if write.resolved => -write.query.schema.foreach(f => - TypeUtils.failWithIntervalType(f.dataType, forbidAnsiIntervals = false)) +write.query.schema.foreach(f => TypeUtils.failWithIntervalType(f.dataType)) case alter: AlterTableCommand => checkAlterTableCommand(alter) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala index 2eb828e..302a810 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala @@ -134,7 +134,7 @@ case class ReplaceColumns( table: LogicalPlan, columnsToAdd: Seq[QualifiedColType]) extends AlterTableCommand { columnsToAdd.foreach { c => -TypeUtils.failWithIntervalType(c.dataType, forbidAnsiIntervals = false) +TypeUtils.failWithIntervalType(c.dataType) } override lazy val resolved: Boolean = table.resolved && columnsToAdd.forall(_.resolved) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index 144508c..729a26b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -98,8 +98,8 @@ object TypeUtils { case _ => false } - def failWithIntervalType(dataType: DataType, forbidAnsiIntervals: Boolean = true): Unit = { -invokeOnceForInterval(dataType, forbidAnsiIntervals) { + def failWithIntervalType(dataType: DataType): Unit = { +inv
[spark] branch master updated: [SPARK-37304][SQL] Allow ANSI intervals in v2 `ALTER TABLE .. REPLACE COLUMNS`
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 71f4ee3 [SPARK-37304][SQL] Allow ANSI intervals in v2 `ALTER TABLE .. REPLACE COLUMNS` 71f4ee3 is described below commit 71f4ee38c71734128c5653b8f18a7d0bf1014b6b Author: Max Gekk AuthorDate: Fri Nov 12 17:23:36 2021 +0300 [SPARK-37304][SQL] Allow ANSI intervals in v2 `ALTER TABLE .. REPLACE COLUMNS` ### What changes were proposed in this pull request? In the PR, I propose to allow ANSI intervals: year-month and day-time intervals in the `ALTER TABLE .. REPLACE COLUMNS` command for tables in v2 catalogs (v1 catalogs don't support the command). Also added a unified test suite to migrate related tests in the future. ### Why are the changes needed? To improve user experience with Spark SQL. After the changes, users can replace columns with ANSI intervals instead of removing and adding such columns. ### Does this PR introduce _any_ user-facing change? In some sense, yes. After the changes, the command doesn't output any error message. ### How was this patch tested? By running the new test suite: ``` $ build/sbt "test:testOnly *AlterTableReplaceColumnsSuite" ``` Closes #34571 from MaxGekk/add-replace-ansi-interval-col. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../plans/logical/v2AlterTableCommands.scala | 2 +- .../AlterTableReplaceColumnsSuiteBase.scala| 54 ++ .../command/v2/AlterTableReplaceColumnsSuite.scala | 28 +++ 3 files changed, 83 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala index 302a810..2eb828e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala @@ -134,7 +134,7 @@ case class ReplaceColumns( table: LogicalPlan, columnsToAdd: Seq[QualifiedColType]) extends AlterTableCommand { columnsToAdd.foreach { c => -TypeUtils.failWithIntervalType(c.dataType) +TypeUtils.failWithIntervalType(c.dataType, forbidAnsiIntervals = false) } override lazy val resolved: Boolean = table.resolved && columnsToAdd.forall(_.resolved) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableReplaceColumnsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableReplaceColumnsSuiteBase.scala new file mode 100644 index 000..fed4076 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableReplaceColumnsSuiteBase.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import java.time.{Duration, Period} + +import org.apache.spark.sql.{QueryTest, Row} + +/** + * This base suite contains unified tests for the `ALTER TABLE .. REPLACE COLUMNS` command that + * check the V2 table catalog. The tests that cannot run for all supported catalogs are + * located in more specific test suites: + * + * - V2 table catalog tests: + * `org.apache.spark.sql.execution.command.v2.AlterTableReplaceColumnsSuite` + */ +trait AlterTableReplaceColumnsSuiteBase extends QueryTest with DDLCommandTestUtils { + override val command = "ALTER TABLE .. REPLACE COLUMNS" + + test("SPARK-37304: Replace columns by ANSI intervals") { +withNamespaceAndTable("ns", "tbl") { t => + sql(s"CREATE TABLE $t (ym INTERVAL MONTH, dt INTERVAL HOUR, data STRING) $defaultUsing") + // TODO(SPARK-37303): Uncomment the command below after REPLACE COLUMNS is fixed + // sql(s"INSERT INTO $t SELECT INTERVAL '1' MONTH, INTERVAL '2' HOUR, 'abc'"
[spark] branch master updated (9191632 -> a4b8a8d)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 9191632 [SPARK-36825][FOLLOWUP] Move the test code from `ParquetIOSuite` to `ParquetFileFormatSuite` add a4b8a8d [SPARK-37294][SQL][TESTS] Check inserting of ANSI intervals into a table partitioned by the interval columns No new revisions were added by this update. Summary of changes: .../spark/sql/connector/DataSourceV2SQLSuite.scala | 35 +- .../org/apache/spark/sql/sources/InsertSuite.scala | 34 + 2 files changed, 68 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37261][SQL] Allow adding partitions with ANSI intervals in DSv2
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2a1267a [SPARK-37261][SQL] Allow adding partitions with ANSI intervals in DSv2 2a1267a is described below commit 2a1267aeb75bf838c74d1cf274aa258be060c17b Author: Max Gekk AuthorDate: Wed Nov 10 15:21:33 2021 +0300 [SPARK-37261][SQL] Allow adding partitions with ANSI intervals in DSv2 ### What changes were proposed in this pull request? In the PR, I propose to skip checking of ANSI interval types while creating or writing to a table using V2 catalogs. As a consequence of that, users can create tables in V2 catalogs partitioned by ANSI interval columns (the legacy intervals of `CalendarIntervalType` are still prohibited). Also this PR adds a new test which checks: 1. Adding a new partition with ANSI intervals via `ALTER TABLE .. ADD PARTITION` 2. INSERT INTO a table partitioned by ANSI intervals for V1/V2 In-Memory catalogs (skips V1 Hive external catalog). ### Why are the changes needed? To allow users to save ANSI intervals as partition values using DSv2. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running the new test for V1/V2 In-Memory and V1 Hive external catalogs: ``` $ build/sbt "test:testOnly org.apache.spark.sql.execution.command.v1.AlterTableAddPartitionSuite" $ build/sbt "test:testOnly org.apache.spark.sql.execution.command.v2.AlterTableAddPartitionSuite" $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly org.apache.spark.sql.hive.execution.command.AlterTableAddPartitionSuite" $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *DataSourceV2SQLSuite" ``` Closes #34537 from MaxGekk/alter-table-ansi-interval. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 ++-- .../apache/spark/sql/catalyst/util/TypeUtils.scala | 4 +-- .../spark/sql/connector/DataSourceV2SQLSuite.scala | 16 + .../command/AlterTableAddPartitionSuiteBase.scala | 40 +- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 5bf37a2..1a105ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -464,10 +464,12 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { failAnalysis(s"Invalid partitioning: ${badReferences.mkString(", ")}") } -create.tableSchema.foreach(f => TypeUtils.failWithIntervalType(f.dataType)) +create.tableSchema.foreach(f => + TypeUtils.failWithIntervalType(f.dataType, forbidAnsiIntervals = false)) case write: V2WriteCommand if write.resolved => -write.query.schema.foreach(f => TypeUtils.failWithIntervalType(f.dataType)) +write.query.schema.foreach(f => + TypeUtils.failWithIntervalType(f.dataType, forbidAnsiIntervals = false)) case alter: AlterTableCommand => checkAlterTableCommand(alter) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index cba3a9a..144508c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -98,8 +98,8 @@ object TypeUtils { case _ => false } - def failWithIntervalType(dataType: DataType): Unit = { -invokeOnceForInterval(dataType, forbidAnsiIntervals = true) { + def failWithIntervalType(dataType: DataType, forbidAnsiIntervals: 
Boolean = true): Unit = { +invokeOnceForInterval(dataType, forbidAnsiIntervals) { throw QueryCompilationErrors.cannotUseIntervalTypeInTableSchemaError() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index f03792f..499638c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -340,13 +340,15 @@ class DataSourceV2SQLSuite } test("CTAS/RTAS: invalid schema if has interval type") { -Seq("CREATE"
[spark] branch master updated: [SPARK-24774][SQL][FOLLOWUP] Remove unused code in SchemaConverters.scala
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 59c55dd [SPARK-24774][SQL][FOLLOWUP] Remove unused code in SchemaConverters.scala 59c55dd is described below commit 59c55dd4c6f7772ef7949653679a2b76211788e8 Author: Gengliang Wang AuthorDate: Wed Nov 3 08:43:25 2021 +0300 [SPARK-24774][SQL][FOLLOWUP] Remove unused code in SchemaConverters.scala ### What changes were proposed in this pull request? As MaxGekk pointed out in https://github.com/apache/spark/pull/22037/files#r741373793, there is some unused code in SchemaConverters.scala. The UUID generator was for generating `fix` avro field names but we figure out a better solution during PR review. This PR is to remove the dead code. ### Why are the changes needed? Code clean up ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing UT. Closes #34472 from gengliangwang/SPARK-24774-followup. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk --- .../src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala | 4 1 file changed, 4 deletions(-) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala index 1c9b06b..347364c 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -18,14 +18,12 @@ package org.apache.spark.sql.avro import scala.collection.JavaConverters._ -import scala.util.Random import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} import org.apache.avro.LogicalTypes.{Date, Decimal, LocalTimestampMicros, LocalTimestampMillis, TimestampMicros, TimestampMillis} import org.apache.avro.Schema.Type._ import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.catalyst.util.RandomUUIDGenerator import org.apache.spark.sql.types._ import org.apache.spark.sql.types.Decimal.minBytesForPrecision @@ -35,8 +33,6 @@ import org.apache.spark.sql.types.Decimal.minBytesForPrecision */ @DeveloperApi object SchemaConverters { - private lazy val uuidGenerator = RandomUUIDGenerator(new Random().nextLong()) - private lazy val nullSchema = Schema.create(Schema.Type.NULL) /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37176][SQL] Sync JsonInferSchema#infer method's exception handle logic with JacksonParser#parse method
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ec6a3ae [SPARK-37176][SQL] Sync JsonInferSchema#infer method's exception handle logic with JacksonParser#parse method ec6a3ae is described below commit ec6a3ae6dff1dc9c63978ae14a1793ccd771 Author: Xianjin YE AuthorDate: Tue Nov 2 12:40:09 2021 +0300 [SPARK-37176][SQL] Sync JsonInferSchema#infer method's exception handle logic with JacksonParser#parse method ### What changes were proposed in this pull request? Change `JsonInferSchema#infer`'s exception handling logic to be aligned with `JacksonParser#parse` ### Why are the changes needed? To reduce behavior inconsistency, so users can have the same expectation for schema inference and JSON parsing when dealing with some malformed input. ### Does this PR introduce _any_ user-facing change? Yes. Before this patch, JSON schema inference could fail for some malformed input that succeeded when parsing. After this patch, they have the same exception handling logic. ### How was this patch tested? Added one new test and modified one existing test to cover the new case. Closes #34455 from advancedxy/SPARK-37176. 
Authored-by: Xianjin YE Signed-off-by: Max Gekk --- .../spark/sql/catalyst/json/JsonInferSchema.scala | 33 +++- .../test/resources/test-data/malformed_utf8.json | 3 ++ .../sql/execution/datasources/json/JsonSuite.scala | 35 ++ 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 3b17cde..3b62b16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.json +import java.io.CharConversionException +import java.nio.charset.MalformedInputException import java.util.Comparator import scala.util.control.Exception.allCatch @@ -45,6 +47,18 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { legacyFormat = FAST_DATE_FORMAT, isParsing = true) + private def handleJsonErrorsByParseMode(parseMode: ParseMode, + columnNameOfCorruptRecord: String, e: Throwable): Option[StructType] = { +parseMode match { + case PermissiveMode => +Some(StructType(Seq(StructField(columnNameOfCorruptRecord, StringType + case DropMalformedMode => +None + case FailFastMode => +throw QueryExecutionErrors.malformedRecordsDetectedInSchemaInferenceError(e) +} + } + /** * Infer the type of a collection of json records in three stages: * 1. 
Infer the type of each record @@ -68,14 +82,17 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { Some(inferField(parser)) } } catch { - case e @ (_: RuntimeException | _: JsonProcessingException) => parseMode match { -case PermissiveMode => - Some(StructType(Seq(StructField(columnNameOfCorruptRecord, StringType -case DropMalformedMode => - None -case FailFastMode => - throw QueryExecutionErrors.malformedRecordsDetectedInSchemaInferenceError(e) - } + case e @ (_: RuntimeException | _: JsonProcessingException | +_: MalformedInputException) => +handleJsonErrorsByParseMode(parseMode, columnNameOfCorruptRecord, e) + case e: CharConversionException if options.encoding.isEmpty => +val msg = + """JSON parser cannot handle a character in its input. +|Specifying encoding as an input option explicitly might help to resolve the issue. +|""".stripMargin + e.getMessage +val wrappedCharException = new CharConversionException(msg) +wrappedCharException.initCause(e) +handleJsonErrorsByParseMode(parseMode, columnNameOfCorruptRecord, wrappedCharException) } }.reduceOption(typeMerger).toIterator } diff --git a/sql/core/src/test/resources/test-data/malformed_utf8.json b/sql/core/src/test/resources/test-data/malformed_utf8.json new file mode 100644 index 000..c57eb43 --- /dev/null +++ b/sql/core/src/test/resources/test-data/malformed_utf8.json @@ -0,0 +1,3 @@ +{"a": 1} +{"a": 1} +� \ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite
[spark] branch master updated (13c372d -> d43a678)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 13c372d [SPARK-37150][SQL] Migrate DESCRIBE NAMESPACE to use V2 command by default add d43a678 [SPARK-37161][SQL] RowToColumnConverter support AnsiIntervalType No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/execution/Columnar.scala | 4 +-- .../execution/vectorized/ColumnarBatchSuite.scala | 37 ++ 2 files changed, 39 insertions(+), 2 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-37138][SQL] Support ANSI Interval types in ApproxCountDistinctForIntervals/ApproximatePercentile/Percentile
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 08123a3 [SPARK-37138][SQL] Support ANSI Interval types in ApproxCountDistinctForIntervals/ApproximatePercentile/Percentile 08123a3 is described below commit 08123a3795683238352e5bf55452de381349fdd9 Author: Angerszh AuthorDate: Sat Oct 30 20:03:20 2021 +0300 [SPARK-37138][SQL] Support ANSI Interval types in ApproxCountDistinctForIntervals/ApproximatePercentile/Percentile ### What changes were proposed in this pull request? Support Ansi Interval types in the agg expressions: - ApproxCountDistinctForIntervals - ApproximatePercentile - Percentile ### Why are the changes needed? To improve user experience with Spark SQL. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added new UT. Closes #34412 from AngersZh/SPARK-37138. 
Authored-by: Angerszh Signed-off-by: Max Gekk --- .../ApproxCountDistinctForIntervals.scala | 13 +++--- .../aggregate/ApproximatePercentile.scala | 32 -- .../expressions/aggregate/Percentile.scala | 26 +--- .../ApproxCountDistinctForIntervalsSuite.scala | 6 ++- .../expressions/aggregate/PercentileSuite.scala| 8 ++-- ...ApproxCountDistinctForIntervalsQuerySuite.scala | 28 + .../sql/ApproximatePercentileQuerySuite.scala | 22 +- .../apache/spark/sql/PercentileQuerySuite.scala| 49 ++ 8 files changed, 153 insertions(+), 31 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala index a7e9a22..f3bf251 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproxCountDistinctForIntervals.scala @@ -61,7 +61,8 @@ case class ApproxCountDistinctForIntervals( } override def inputTypes: Seq[AbstractDataType] = { -Seq(TypeCollection(NumericType, TimestampType, DateType, TimestampNTZType), ArrayType) +Seq(TypeCollection(NumericType, TimestampType, DateType, TimestampNTZType, + YearMonthIntervalType, DayTimeIntervalType), ArrayType) } // Mark as lazy so that endpointsExpression is not evaluated during tree transformation. 
@@ -79,14 +80,16 @@ case class ApproxCountDistinctForIntervals( TypeCheckFailure("The endpoints provided must be constant literals") } else { endpointsExpression.dataType match { -case ArrayType(_: NumericType | DateType | TimestampType | TimestampNTZType, _) => +case ArrayType(_: NumericType | DateType | TimestampType | TimestampNTZType | + _: AnsiIntervalType, _) => if (endpoints.length < 2) { TypeCheckFailure("The number of endpoints must be >= 2 to construct intervals") } else { TypeCheckSuccess } case _ => - TypeCheckFailure("Endpoints require (numeric or timestamp or date) type") + TypeCheckFailure("Endpoints require (numeric or timestamp or date or timestamp_ntz or " + +"interval year to month or interval day to second) type") } } } @@ -120,9 +123,9 @@ case class ApproxCountDistinctForIntervals( val doubleValue = child.dataType match { case n: NumericType => n.numeric.toDouble(value.asInstanceOf[n.InternalType]) -case _: DateType => +case _: DateType | _: YearMonthIntervalType => value.asInstanceOf[Int].toDouble -case TimestampType | TimestampNTZType => +case TimestampType | TimestampNTZType | _: DayTimeIntervalType => value.asInstanceOf[Long].toDouble } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 8cce79c..0dcb906 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -49,15 +49,16 @@ import org.apache.spark.sql.types._ * yields better accuracy, the default value is * DEFAULT_PERCENTILE_ACCURACY. */ +// scalas
[spark] branch master updated: [SPARK-36928][SQL] Handle ANSI intervals in ColumnarRow, ColumnarBatchRow and ColumnarArray
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new fd8d5ad [SPARK-36928][SQL] Handle ANSI intervals in ColumnarRow, ColumnarBatchRow and ColumnarArray fd8d5ad is described below commit fd8d5ad2140d6405357b908dce2d00a21036dedb Author: PengLei AuthorDate: Thu Oct 28 14:52:41 2021 +0300 [SPARK-36928][SQL] Handle ANSI intervals in ColumnarRow, ColumnarBatchRow and ColumnarArray ### What changes were proposed in this pull request? 1. add handle ansi interval type for `get`, `copy` method of ColumnarArray 2. add handle ansi interval type for `get`, `copy` method of ColumnarBatchRow 3. add handle ansi interval type for `get`, `copy` method of ColumnarRow ### Why are the changes needed? [SPARK-36928](https://issues.apache.org/jira/browse/SPARK-36928) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Add test case Closes #34421 from Peng-Lei/SPARK-36928. 
Authored-by: PengLei Signed-off-by: Max Gekk --- .../apache/spark/sql/vectorized/ColumnarArray.java | 6 +- .../spark/sql/vectorized/ColumnarBatchRow.java | 8 +-- .../apache/spark/sql/vectorized/ColumnarRow.java | 8 +-- .../execution/vectorized/ColumnVectorSuite.scala | 69 ++ .../execution/vectorized/ColumnarBatchSuite.scala | 32 ++ 5 files changed, 113 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index 147dd24..2fb6b3f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -57,9 +57,11 @@ public final class ColumnarArray extends ArrayData { return UnsafeArrayData.fromPrimitiveArray(toByteArray()); } else if (dt instanceof ShortType) { return UnsafeArrayData.fromPrimitiveArray(toShortArray()); -} else if (dt instanceof IntegerType || dt instanceof DateType) { +} else if (dt instanceof IntegerType || dt instanceof DateType +|| dt instanceof YearMonthIntervalType) { return UnsafeArrayData.fromPrimitiveArray(toIntArray()); -} else if (dt instanceof LongType || dt instanceof TimestampType) { +} else if (dt instanceof LongType || dt instanceof TimestampType +|| dt instanceof DayTimeIntervalType) { return UnsafeArrayData.fromPrimitiveArray(toLongArray()); } else if (dt instanceof FloatType) { return UnsafeArrayData.fromPrimitiveArray(toFloatArray()); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchRow.java index c6b7287e7..8c32d5c 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchRow.java @@ -52,9 +52,9 @@ public final class ColumnarBatchRow extends 
InternalRow { row.setByte(i, getByte(i)); } else if (dt instanceof ShortType) { row.setShort(i, getShort(i)); -} else if (dt instanceof IntegerType) { +} else if (dt instanceof IntegerType || dt instanceof YearMonthIntervalType) { row.setInt(i, getInt(i)); -} else if (dt instanceof LongType) { +} else if (dt instanceof LongType || dt instanceof DayTimeIntervalType) { row.setLong(i, getLong(i)); } else if (dt instanceof FloatType) { row.setFloat(i, getFloat(i)); @@ -151,9 +151,9 @@ public final class ColumnarBatchRow extends InternalRow { return getByte(ordinal); } else if (dataType instanceof ShortType) { return getShort(ordinal); -} else if (dataType instanceof IntegerType) { +} else if (dataType instanceof IntegerType || dataType instanceof YearMonthIntervalType) { return getInt(ordinal); -} else if (dataType instanceof LongType) { +} else if (dataType instanceof LongType || dataType instanceof DayTimeIntervalType) { return getLong(ordinal); } else if (dataType instanceof FloatType) { return getFloat(ordinal); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 4b9d3c5..da4b242 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -61,9 +61,9 @@ public final class ColumnarRow extends InternalRow { row.setByte(i, getByte(i
[spark] branch master updated (c29bb02 -> 21fa3ce)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from c29bb02 [SPARK-36965][PYTHON] Extend python test runner by logging out the temp output files add 21fa3ce [SPARK-35925][SQL] Support DayTimeIntervalType in width-bucket function No new revisions were added by this update. Summary of changes: .../sql/catalyst/expressions/mathExpressions.scala | 12 +--- .../catalyst/expressions/MathExpressionsSuite.scala | 20 .../src/test/resources/sql-tests/inputs/interval.sql | 2 ++ .../sql-tests/results/ansi/interval.sql.out | 18 +- .../resources/sql-tests/results/interval.sql.out | 18 +- 5 files changed, 65 insertions(+), 5 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-35926][SQL] Add support YearMonthIntervalType for width_bucket
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9d061e3 [SPARK-35926][SQL] Add support YearMonthIntervalType for width_bucket 9d061e3 is described below commit 9d061e3939a021c602c070fc13cef951a8f94c82 Author: PengLei AuthorDate: Fri Oct 15 17:15:50 2021 +0300 [SPARK-35926][SQL] Add support YearMonthIntervalType for width_bucket ### What changes were proposed in this pull request? Support width_bucket(YearMonthIntervalType, YearMonthIntervalType, YearMonthIntervalType, Long), it return long result eg: ``` width_bucket(input_value, min_value, max_value, bucket_nums) width_bucket(INTERVAL '1' YEAR, INTERVAL '0' YEAR, INTERVAL '10' YEAR, 10) It will divide the range between the max_value and min_value into 10 buckets. [ INTERVAL '0' YEAR, INTERVAL '1' YEAR), [ INTERVAL '1' YEAR, INTERVAL '2' YEAR).. [INTERVAL '9' YEAR, INTERVAL '10' YEAR) Then, calculates which bucket the given input_value locate. ``` The function `width_bucket` is introduced from [SPARK-21117](https://issues.apache.org/jira/browse/SPARK-21117) ### Why are the changes needed? [35926](https://issues.apache.org/jira/browse/SPARK-35926) 1. The `WIDTH_BUCKET` function assigns values to buckets (individual segments) in an equiwidth histogram. The ANSI SQL Standard Syntax is like follow: `WIDTH_BUCKET( expression, min, max, buckets)`. [Reference](https://www.oreilly.com/library/view/sql-in-a/9780596155322/re91.html). 2. `WIDTH_BUCKET` just support `Double` at now, Of course, we can cast `Int` to `Double` to use it. But we could not cast `YearMonthIntervalType` to `Double`. 3. I think it has a use scenario. eg: Histogram of employee years of service, the `years of service` is a column of `YearMonthIntervalType` dataType. ### Does this PR introduce _any_ user-facing change? Yes. 
The user can use `width_bucket` with YearMonthIntervalType. ### How was this patch tested? Add ut test Closes #33132 from Peng-Lei/SPARK-35926. Authored-by: PengLei Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/mathExpressions.scala | 33 ++ .../expressions/MathExpressionsSuite.scala | 15 ++ .../test/resources/sql-tests/inputs/interval.sql | 2 ++ .../sql-tests/results/ansi/interval.sql.out| 18 +++- .../resources/sql-tests/results/interval.sql.out | 18 +++- .../org/apache/spark/sql/MathFunctionsSuite.scala | 17 +++ 6 files changed, 96 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index c14fa72..6c34ed6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.util.NumberConverter +import org.apache.spark.sql.catalyst.util.{NumberConverter, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -1613,6 +1613,10 @@ object WidthBucket { 5 > SELECT _FUNC_(-0.9, 5.2, 0.5, 2); 3 + > SELECT _FUNC_(INTERVAL '0' YEAR, INTERVAL '0' YEAR, INTERVAL '10' YEAR, 10); + 1 + > SELECT _FUNC_(INTERVAL '1' YEAR, INTERVAL '0' YEAR, INTERVAL '10' YEAR, 10); + 2 """, since = "3.1.0", group = "math_funcs") @@ -1623,16 +1627,35 @@ case class WidthBucket( numBucket: Expression) extends QuaternaryExpression with ImplicitCastInputTypes with NullIntolerant { - 
override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType, DoubleType, LongType) + override def inputTypes: Seq[AbstractDataType] = Seq( +TypeCollection(DoubleType, YearMonthIntervalType), +TypeCollection(DoubleType, YearMonthIntervalType), +TypeCollection(DoubleType, YearMonthIntervalType), +LongType) + + override def checkInputDataTypes(): TypeCheckResult = { +super.checkInputDataTypes() match { + case TypeCheckSuccess => +(value.dataType, minValue.dataType, maxValue.dataType) match { + case (_: YearMonthIntervalType, _: YearMonthIntervalType, _: YearMonthIntervalType) => +TypeCheckS
[spark] branch branch-3.0 updated (74fddec -> 86bf5d3)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git. from 74fddec [SPARK-36717][CORE] Incorrect order of variable initialization may lead incorrect behavior add 86bf5d3 [SPARK-36993][SQL][3.0] Fix json_tuple throw NPE if fields exist no foldable null value No new revisions were added by this update. Summary of changes: .../sql/catalyst/expressions/jsonExpressions.scala | 7 +-- .../test/resources/sql-tests/inputs/json-functions.sql | 4 .../resources/sql-tests/results/json-functions.sql.out | 18 +- 3 files changed, 26 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36982] Migrate SHOW NAMESPACES to use V2 command by default
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 5ac76d9 [SPARK-36982] Migrate SHOW NAMESPACES to use V2 command by default 5ac76d9 is described below commit 5ac76d9cb45d58eeb4253d50e90060a68c3e87cb Author: Terry Kim AuthorDate: Wed Oct 13 22:12:56 2021 +0300 [SPARK-36982] Migrate SHOW NAMESPACES to use V2 command by default ### What changes were proposed in this pull request? This PR proposes to use V2 commands as default as outlined in [SPARK-36588](https://issues.apache.org/jira/browse/SPARK-36588), and this PR migrates `SHOW NAMESPACES` to use v2 command by default. (Technically speaking, there is no v1 command for `SHOW NAMESPACES/DATABASES`, but this PR removes an extra check in `ResolveSessionCatalog` to handle session catalog.) ### Why are the changes needed? It's been a while since we introduced the v2 commands, and it seems reasonable to use v2 commands by default even for the session catalog, with a legacy config to fall back to the v1 commands. ### Does this PR introduce _any_ user-facing change? No, the user can use v1 command by setting `spark.sql.legacy.useV1Command` to `true`. ### How was this patch tested? Added unit tests. Closes #34255 from imback82/migrate_show_namespaces. 
Authored-by: Terry Kim Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/KeepLegacyOutputs.scala | 5 +++- .../org/apache/spark/sql/internal/SQLConf.scala| 2 +- .../catalyst/analysis/ResolveSessionCatalog.scala | 16 +- .../execution/command/DDLCommandTestUtils.scala| 7 +++-- .../command/ShowNamespacesSuiteBase.scala | 6 .../execution/command/TestsV1AndV2Commands.scala} | 35 -- .../execution/command/v1/CommandSuiteBase.scala| 3 +- .../execution/command/v1/ShowNamespacesSuite.scala | 8 ++--- .../sql/execution/command/v1/ShowTablesSuite.scala | 27 ++--- .../execution/command/v2/CommandSuiteBase.scala| 3 +- .../hive/execution/command/CommandSuiteBase.scala | 3 +- .../execution/command/ShowNamespacesSuite.scala| 2 ++ .../hive/execution/command/ShowTablesSuite.scala | 2 +- 13 files changed, 49 insertions(+), 70 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/KeepLegacyOutputs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/KeepLegacyOutputs.scala index baee2bd..3af5544 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/KeepLegacyOutputs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/KeepLegacyOutputs.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ShowTables} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ShowNamespaces, ShowTables} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND import org.apache.spark.sql.internal.SQLConf @@ -36,6 +36,9 @@ object KeepLegacyOutputs extends Rule[LogicalPlan] { assert(s.output.length == 3) val newOutput = s.output.head.withName("database") +: s.output.tail s.copy(output = newOutput) +case s: ShowNamespaces => + assert(s.output.length == 1) + s.copy(output = Seq(s.output.head.withName("databaseName"))) } } } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1dc4a8e..f0e7731 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3364,7 +3364,7 @@ object SQLConf { buildConf("spark.sql.legacy.keepCommandOutputSchema") .internal() .doc("When true, Spark will keep the output schema of commands such as SHOW DATABASES " + -"unchanged, for v1 catalog and/or table.") +"unchanged.") .version("3.0.2") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 2252225..c77be4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -121,14 +121,6 @@ class ResolveSessionCatalog(val catalogManager: CatalogManag
[spark] branch master updated (7aedce4 -> 1aa3611)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7aedce4 [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value add 1aa3611 [SPARK-36949][SQL] Disallow Hive provider tables with ANSI intervals No new revisions were added by this update. Summary of changes: .../spark/sql/errors/QueryExecutionErrors.scala| 4 .../apache/spark/sql/execution/command/ddl.scala | 12 ++ .../spark/sql/execution/command/tables.scala | 2 +- .../spark/sql/execution/datasources/rules.scala| 4 ++-- .../org/apache/spark/sql/hive/HiveStrategies.scala | 2 +- .../hive/execution/InsertIntoHiveDirCommand.scala | 2 +- .../spark/sql/hive/execution/HiveDDLSuite.scala| 26 ++ 7 files changed, 43 insertions(+), 9 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new fe2f646 [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value fe2f646 is described below commit fe2f646a7ee2d9c456d9aa1a6916cc478b73dd12 Author: ulysses-you AuthorDate: Wed Oct 13 19:36:16 2021 +0300 [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value ### What changes were proposed in this pull request? Wrap `expr.eval(input)` with Option in `json_tuple`. ### Why are the changes needed? If json_tuple exists no foldable null field, Spark would throw NPE during eval field.toString. e.g. the query will fail with: ```SQL SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a')) FROM ( SELECT rand() AS c1 ); ``` ``` Caused by: java.lang.NullPointerException at org.apache.spark.sql.catalyst.expressions.JsonTuple.$anonfun$parseRow$2(jsonExpressions.scala:435) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at scala.collection.TraversableLike.map(TraversableLike.scala:286) at scala.collection.TraversableLike.map$(TraversableLike.scala:279) at scala.collection.AbstractTraversable.map(Traversable.scala:108) at org.apache.spark.sql.catalyst.expressions.JsonTuple.parseRow(jsonExpressions.scala:435) at org.apache.spark.sql.catalyst.expressions.JsonTuple.$anonfun$eval$6(jsonExpressions.scala:413) ``` ### Does this PR introduce _any_ user-facing change? yes, bug fix. ### How was this patch tested? add test in `json-functions.sql`. Closes #34268 from ulysses-you/SPARK-36993. 
Authored-by: ulysses-you Signed-off-by: Max Gekk (cherry picked from commit 7aedce44b73d9b0c56863f970257abf52ce551ce) Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/jsonExpressions.scala | 7 +-- .../test/resources/sql-tests/inputs/json-functions.sql | 4 .../resources/sql-tests/results/json-functions.sql.out | 18 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index a363615..5abac01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -426,12 +426,15 @@ case class JsonTuple(children: Seq[Expression]) foldableFieldNames.map(_.orNull) } else if (constantFields == 0) { // none are foldable so all field names need to be evaluated from the input row - fieldExpressions.map(_.eval(input).asInstanceOf[UTF8String].toString) + fieldExpressions.map { expr => + Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull + } } else { // if there is a mix of constant and non-constant expressions // prefer the cached copy when available foldableFieldNames.zip(fieldExpressions).map { -case (null, expr) => expr.eval(input).asInstanceOf[UTF8String].toString +case (null, expr) => + Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull case (fieldName, _) => fieldName.orNull } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index f6fa441..245a6a6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -25,6 +25,10 @@ select from_json(); SELECT json_tuple('{"a" : 1, "b" : 2}', CAST(NULL AS STRING), 'b', 
CAST(NULL AS STRING), 'a'); CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'); SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable; +-- json_tuple exists no foldable null field +SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a')) FROM ( SELECT rand() AS c1 ); +SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a'), if(c2 < 1, null, 'a')) FROM ( SELECT 0 AS c1, rand() AS c2 ); + -- Clean up DROP VIEW IF EXISTS jsonTable; diff --git
[spark] branch branch-3.2 updated: [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new e60b63c [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value e60b63c is described below commit e60b63cff9ea36fb5864134afa231cf7bfbd0090 Author: ulysses-you AuthorDate: Wed Oct 13 19:36:16 2021 +0300 [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value ### What changes were proposed in this pull request? Wrap `expr.eval(input)` with Option in `json_tuple`. ### Why are the changes needed? If json_tuple exists no foldable null field, Spark would throw NPE during eval field.toString. e.g. the query will fail with: ```SQL SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a')) FROM ( SELECT rand() AS c1 ); ``` ``` Caused by: java.lang.NullPointerException at org.apache.spark.sql.catalyst.expressions.JsonTuple.$anonfun$parseRow$2(jsonExpressions.scala:435) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at scala.collection.TraversableLike.map(TraversableLike.scala:286) at scala.collection.TraversableLike.map$(TraversableLike.scala:279) at scala.collection.AbstractTraversable.map(Traversable.scala:108) at org.apache.spark.sql.catalyst.expressions.JsonTuple.parseRow(jsonExpressions.scala:435) at org.apache.spark.sql.catalyst.expressions.JsonTuple.$anonfun$eval$6(jsonExpressions.scala:413) ``` ### Does this PR introduce _any_ user-facing change? yes, bug fix. ### How was this patch tested? add test in `json-functions.sql`. Closes #34268 from ulysses-you/SPARK-36993. 
Authored-by: ulysses-you Signed-off-by: Max Gekk (cherry picked from commit 7aedce44b73d9b0c56863f970257abf52ce551ce) Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/jsonExpressions.scala | 7 +-- .../test/resources/sql-tests/inputs/json-functions.sql | 4 .../resources/sql-tests/results/json-functions.sql.out | 18 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 49ded89..263075f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -432,12 +432,15 @@ case class JsonTuple(children: Seq[Expression]) foldableFieldNames.map(_.orNull) } else if (constantFields == 0) { // none are foldable so all field names need to be evaluated from the input row - fieldExpressions.map(_.eval(input).asInstanceOf[UTF8String].toString) + fieldExpressions.map { expr => + Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull + } } else { // if there is a mix of constant and non-constant expressions // prefer the cached copy when available foldableFieldNames.zip(fieldExpressions).map { -case (null, expr) => expr.eval(input).asInstanceOf[UTF8String].toString +case (null, expr) => + Option(expr.eval(input)).map(_.asInstanceOf[UTF8String].toString).orNull case (fieldName, _) => fieldName.orNull } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index f6fa441..245a6a6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -25,6 +25,10 @@ select from_json(); SELECT json_tuple('{"a" : 1, "b" : 2}', CAST(NULL AS STRING), 'b', 
CAST(NULL AS STRING), 'a'); CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a'); SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable; +-- json_tuple exists no foldable null field +SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a')) FROM ( SELECT rand() AS c1 ); +SELECT json_tuple('{"a":"1"}', if(c1 < 1, null, 'a'), if(c2 < 1, null, 'a')) FROM ( SELECT 0 AS c1, rand() AS c2 ); + -- Clean up DROP VIEW IF EXISTS jsonTable; diff --git
[spark] branch master updated (f070d18 -> 7aedce4)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from f070d18 [SPARK-36871][SQL] Migrate CreateViewStatement to v2 command add 7aedce4 [SPARK-36993][SQL] Fix json_tuple throw NPE if fields exist no foldable null value No new revisions were added by this update. Summary of changes: .../sql/catalyst/expressions/jsonExpressions.scala | 7 +-- .../test/resources/sql-tests/inputs/json-functions.sql | 4 .../resources/sql-tests/results/json-functions.sql.out | 18 +- 3 files changed, 26 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36921][SQL] Support ANSI intervals by `DIV`
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1ccc4dc [SPARK-36921][SQL] Support ANSI intervals by `DIV` 1ccc4dc is described below commit 1ccc4dc5a82cecadc8ccc2a1f8aefa6845efd4be Author: PengLei AuthorDate: Wed Oct 13 15:11:42 2021 +0300 [SPARK-36921][SQL] Support ANSI intervals by `DIV` ### What changes were proposed in this pull request? 1. support div(YearMonthIntervalType, YearMonthIntervalType), return long result 2. support div(DayTimeIntervalType, DayTimeIntervalType), return long result 3. if input is NULL or input2 is 0, then return null ### Why are the changes needed? Extended the div function to support ANSI intervals. The operation should produce quotient of division. [SPARK-36921](https://issues.apache.org/jira/browse/SPARK-36921) ### Does this PR introduce _any_ user-facing change? Yes, user can use YearMonthIntervalType and DayTimeIntervalType as input for div function. ### How was this patch tested? Add ut testcase Closes #34257 from Peng-Lei/SPARK-36921. 
Authored-by: PengLei Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/arithmetic.scala | 9 ++- .../expressions/ArithmeticExpressionSuite.scala| 81 ++ .../test/resources/sql-tests/inputs/interval.sql | 5 ++ .../sql-tests/results/ansi/interval.sql.out| 43 +++- .../resources/sql-tests/results/interval.sql.out | 43 +++- 5 files changed, 178 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index b77d558..06e8982 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -568,6 +568,8 @@ case class Divide( Examples: > SELECT 3 _FUNC_ 2; 1 + > SELECT INTERVAL '1-1' YEAR TO MONTH _FUNC_ INTERVAL '-1' MONTH; + -13 """, since = "3.0.0", group = "math_funcs") @@ -584,7 +586,8 @@ case class IntegralDivide( case _ => false } - override def inputType: AbstractDataType = TypeCollection(LongType, DecimalType) + override def inputType: AbstractDataType = TypeCollection( +LongType, DecimalType, YearMonthIntervalType, DayTimeIntervalType) override def dataType: DataType = LongType @@ -599,6 +602,10 @@ case class IntegralDivide( i.integral.asInstanceOf[Integral[Any]] case d: DecimalType => d.asIntegral.asInstanceOf[Integral[Any]] + case _: YearMonthIntervalType => +IntegerType.integral.asInstanceOf[Integral[Any]] + case _: DayTimeIntervalType => +LongType.integral.asInstanceOf[Integral[Any]] } (x, y) => { val res = integral.quot(x, y) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index af1bc72..31d7a4b 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -699,4 +699,85 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkConsistencyBetweenInterpretedAndCodegen((e: Expression) => Abs(e, false), tpe) } } + + test("SPARK-36921: Support YearMonthIntervalType by div") { +checkEvaluation(IntegralDivide(Literal(Period.ZERO), Literal(Period.ZERO)), null) +checkEvaluation(IntegralDivide(Literal(Period.ofYears(1)), + Literal(Period.ZERO)), null) +checkEvaluation(IntegralDivide(Period.ofMonths(Int.MinValue), + Literal(Period.ZERO)), null) +checkEvaluation(IntegralDivide(Period.ofMonths(Int.MaxValue), + Literal(Period.ZERO)), null) + +checkEvaluation(IntegralDivide(Literal.create(null, YearMonthIntervalType()), + Literal.create(null, YearMonthIntervalType())), null) +checkEvaluation(IntegralDivide(Literal.create(null, YearMonthIntervalType()), + Literal(Period.ofYears(1))), null) +checkEvaluation(IntegralDivide(Literal(Period.ofYears(1)), + Literal.create(null, YearMonthIntervalType())), null) + +checkEvaluation(IntegralDivide(Period.ofMonths(Int.MaxValue), + Period.ofMonths(Int.MaxValue)), 1L) +checkEvaluation(IntegralDivide(Period.ofMonths(Int.MaxValue), + Period.ofMonths(Int.MinValue)),
[spark] branch master updated (dc1db95 -> 1af7072)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from dc1db95 [SPARK-36867][SQL] Fix error message with GROUP BY alias add 1af7072 [SPARK-36970][SQL] Manually disabled format `B` of `date_format` function to make Java 17 compatible with Java 8 No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala| 6 +- .../org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala | 2 +- .../resources/sql-tests/results/datetime-formatting-invalid.sql.out | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36960][SQL] Pushdown filters with ANSI interval values to ORC
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ebfc6bb [SPARK-36960][SQL] Pushdown filters with ANSI interval values to ORC ebfc6bb is described below commit ebfc6bbe0e9200f87ebb52fb71d009b2d71b956d Author: Kousuke Saruta AuthorDate: Sat Oct 9 16:55:59 2021 +0300 [SPARK-36960][SQL] Pushdown filters with ANSI interval values to ORC ### What changes were proposed in this pull request? This PR proposes to push down filters with ANSI intervals to ORC. ### Why are the changes needed? After SPARK-36931 (#34184), V1 and V2 ORC datasources support ANSI intervals. So it's great to be able to push down filters with ANSI interval values for the better performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New tests. Closes #34224 from sarutak/orc-ansi-interval-pushdown. 
Lead-authored-by: Kousuke Saruta Co-authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../apache/spark/sql/catalyst/dsl/package.scala| 4 +- .../sql/execution/datasources/orc/OrcFilters.scala | 10 ++- .../execution/datasources/orc/OrcFilterSuite.scala | 97 ++ 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 4a97a8d..979c280 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst import java.sql.{Date, Timestamp} -import java.time.{Instant, LocalDate} +import java.time.{Duration, Instant, LocalDate, Period} import scala.language.implicitConversions @@ -167,6 +167,8 @@ package object dsl { implicit def timestampToLiteral(t: Timestamp): Literal = Literal(t) implicit def instantToLiteral(i: Instant): Literal = Literal(i) implicit def binaryToLiteral(a: Array[Byte]): Literal = Literal(a) +implicit def periodToLiteral(p: Period): Literal = Literal(p) +implicit def durationToLiteral(d: Duration): Literal = Literal(d) implicit def symbolToUnresolvedAttribute(s: Symbol): analysis.UnresolvedAttribute = analysis.UnresolvedAttribute(s.name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index 5abfa4c..8e02fc3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.orc -import java.time.{Instant, LocalDate} +import java.time.{Duration, Instant, LocalDate, Period} import 
org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp} +import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter @@ -140,7 +141,8 @@ private[sql] object OrcFilters extends OrcFiltersBase { */ def getPredicateLeafType(dataType: DataType): PredicateLeaf.Type = dataType match { case BooleanType => PredicateLeaf.Type.BOOLEAN -case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG +case ByteType | ShortType | IntegerType | LongType | + _: AnsiIntervalType => PredicateLeaf.Type.LONG case FloatType | DoubleType => PredicateLeaf.Type.FLOAT case StringType => PredicateLeaf.Type.STRING case DateType => PredicateLeaf.Type.DATE @@ -166,6 +168,10 @@ private[sql] object OrcFilters extends OrcFiltersBase { toJavaDate(localDateToDays(value.asInstanceOf[LocalDate])) case _: TimestampType if value.isInstanceOf[Instant] => toJavaTimestamp(instantToMicros(value.asInstanceOf[Instant])) +case _: YearMonthIntervalType => + IntervalUtils.periodToMonths(value.asInstanceOf[Period]).longValue() +case _: DayTimeIntervalType => + IntervalUtils.durationToMicros(value.asInstanceOf[Duration]) case _ => value } diff --git a/sql/core/src/test/scala/org/apache/spark
[spark] branch master updated: [SPARK-36931][SQL] Support reading and writing ANSI intervals from/to ORC datasources
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f853afd [SPARK-36931][SQL] Support reading and writing ANSI intervals from/to ORC datasources f853afd is described below commit f853afdc035273f772dc47f5476be6cf205d0941 Author: Kousuke Saruta AuthorDate: Fri Oct 8 10:49:11 2021 +0300 [SPARK-36931][SQL] Support reading and writing ANSI intervals from/to ORC datasources ### What changes were proposed in this pull request? This PR aims to support reading and writing ANSI intervals from/to ORC datasources. Year-month and day-time intervals are mapped to ORC's `int` and `bigint` respectively. To preserve the Catalyst types, this change adds a `spark.sql.catalyst.type` attribute for each ORC type's information. The value of the attribute is the value returned by `YearMonthIntervalType.typeName` or `DayTimeIntervalType.typeName`. ### Why are the changes needed? For better usability. There should be no reason to prohibit reading/writing ANSI intervals from/to ORC datasources. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New tests. Closes #34184 from sarutak/ansi-interval-orc-source. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../sql/execution/datasources/DataSource.scala | 3 +- .../datasources/orc/OrcDeserializer.scala | 10 ++-- .../execution/datasources/orc/OrcFileFormat.scala | 8 ++- .../datasources/orc/OrcOutputWriter.scala | 1 + .../execution/datasources/orc/OrcSerializer.scala | 4 +- .../sql/execution/datasources/orc/OrcUtils.scala | 62 +- .../execution/datasources/v2/orc/OrcTable.scala| 2 - .../datasources/CommonFileDataSourceSuite.scala| 2 +- .../execution/datasources/orc/OrcSourceSuite.scala | 49 - 9 files changed, 123 insertions(+), 18 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 32913c6..9936126 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -581,7 +581,8 @@ case class DataSource( // TODO: Remove the Set below once all the built-in datasources support ANSI interval types private val writeAllowedSources: Set[Class[_]] = -Set(classOf[ParquetFileFormat], classOf[CSVFileFormat], classOf[JsonFileFormat]) +Set(classOf[ParquetFileFormat], classOf[CSVFileFormat], + classOf[JsonFileFormat], classOf[OrcFileFormat]) private def disallowWritingIntervals( dataTypes: Seq[DataType], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala index fa8977f..1476083 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala @@ -86,10 +86,10 @@ class OrcDeserializer( case ShortType => (ordinal, value) => updater.setShort(ordinal, 
value.asInstanceOf[ShortWritable].get) - case IntegerType => (ordinal, value) => + case IntegerType | _: YearMonthIntervalType => (ordinal, value) => updater.setInt(ordinal, value.asInstanceOf[IntWritable].get) - case LongType => (ordinal, value) => + case LongType | _: DayTimeIntervalType => (ordinal, value) => updater.setLong(ordinal, value.asInstanceOf[LongWritable].get) case FloatType => (ordinal, value) => @@ -197,8 +197,10 @@ class OrcDeserializer( case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length)) case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length)) case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length)) -case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) -case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) +case IntegerType | _: YearMonthIntervalType => + UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) +case LongType | _: DayTimeIntervalType => + UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length)) case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length)) case _ =&
[spark] branch master updated: [SPARK-36948][SQL][TESTS] Check CREATE TABLE with ANSI intervals using Hive external catalog and Parquet
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 72fa176 [SPARK-36948][SQL][TESTS] Check CREATE TABLE with ANSI intervals using Hive external catalog and Parquet 72fa176 is described below commit 72fa176cae40a8a25ba72e1e5ba3a49928efebe1 Author: Max Gekk AuthorDate: Fri Oct 8 10:47:31 2021 +0300 [SPARK-36948][SQL][TESTS] Check CREATE TABLE with ANSI intervals using Hive external catalog and Parquet ### What changes were proposed in this pull request? In the PR, I propose new test to check: 1. CREATE TABLE with ANSI interval columns 2. INSERT INTO the table ANSI interval values 3. SELECT the table with ANSI interval columns Since Hive Metastore/Parquet Serde doesn't support interval types natively, Spark fallbacks to its specific format while saving the schema to Hive external catalog, and outputs the warning: ``` 20:10:52.797 WARN org.apache.spark.sql.hive.test.TestHiveExternalCatalog: Could not persist `default`.`tbl_with_ansi_intervals` in a Hive compatible way. Persisting it into Hive metastore in Spark SQL specific format. org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.IllegalArgumentException: Error: type expected at the position 0 of 'interval year to month:interval day to second' but 'interval year to month' is found. at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:869) ``` ### Why are the changes needed? To improve test coverage. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new test: ``` $ ./build/sbt -Phive-2.3 "test:testOnly *HiveParquetSuite" ``` Closes #34215 from MaxGekk/create-table-ansi-intervals-2. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../org/apache/spark/sql/hive/HiveParquetSuite.scala | 20 1 file changed, 20 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala index 8940ab4..ae7ca38 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.hive +import java.time.{Duration, Period} +import java.time.temporal.ChronoUnit + import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.execution.datasources.parquet.ParquetTest import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -123,4 +126,21 @@ class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton assert(msg.contains("cannot resolve 'c3' given input columns")) } } + + test("SPARK-36948: Create a table with ANSI intervals using Hive external catalog") { +val tbl = "tbl_with_ansi_intervals" +withTable(tbl) { + sql(s"CREATE TABLE $tbl (ym INTERVAL YEAR TO MONTH, dt INTERVAL DAY TO SECOND) USING PARQUET") + sql( +s"""INSERT INTO $tbl VALUES ( + | INTERVAL '1-1' YEAR TO MONTH, + | INTERVAL '1 02:03:04.123456' DAY TO SECOND)""".stripMargin) + checkAnswer( +sql(s"SELECT * FROM $tbl"), +Row( + Period.ofYears(1).plusMonths(1), + Duration.ofDays(1).plusHours(2).plusMinutes(3).plusSeconds(4) +.plus(123456, ChronoUnit.MICROS))) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (218da86 -> 4a143f02)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 218da86 [SPARK-36742][PYTHON] Fix ps.to_datetime with plurals of keys like years, months, days add 4a143f02 [SPARK-36941][SQL][TESTS] Check saving/loading of ANSI intervals to Hive Parquet table No new revisions were added by this update. Summary of changes: .../apache/spark/sql/hive/HiveParquetSourceSuite.scala| 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (2953d4f -> 090c9bf)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 2953d4f [SPARK-36751][PYTHON][DOCS][FOLLOW-UP] Fix unexpected section title for Examples in docstring add 090c9bf [SPARK-36937][SQL][TESTS] Change OrcSourceSuite to test both V1 and V2 sources No new revisions were added by this update. Summary of changes: .../sql/execution/datasources/orc/OrcSourceSuite.scala | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36920][SQL][FOLLOWUP] Fix input types of `ABS()`: numeric and ANSI intervals
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3ac0382 [SPARK-36920][SQL][FOLLOWUP] Fix input types of `ABS()`: numeric and ANSI intervals 3ac0382 is described below commit 3ac0382759d33a46cf5d590c6f3915be7fa75351 Author: Max Gekk AuthorDate: Tue Oct 5 15:37:08 2021 +0300 [SPARK-36920][SQL][FOLLOWUP] Fix input types of `ABS()`: numeric and ANSI intervals ### What changes were proposed in this pull request? Change allowed input types of `Abs()` from: ``` NumericType + CalendarIntervalType + YearMonthIntervalType + DayTimeIntervalType ``` to ``` NumericType + YearMonthIntervalType + DayTimeIntervalType ``` ### Why are the changes needed? The changes make the error message more clear. Before changes: ```sql spark-sql> set spark.sql.legacy.interval.enabled=true; spark.sql.legacy.interval.enabled true spark-sql> select abs(interval -10 days -20 minutes); 21/10/05 09:11:30 ERROR SparkSQLDriver: Failed in [select abs(interval -10 days -20 minutes)] java.lang.ClassCastException: org.apache.spark.sql.types.CalendarIntervalType$ cannot be cast to org.apache.spark.sql.types.NumericType at org.apache.spark.sql.catalyst.util.TypeUtils$.getNumeric(TypeUtils.scala:77) at org.apache.spark.sql.catalyst.expressions.Abs.numeric$lzycompute(arithmetic.scala:172) at org.apache.spark.sql.catalyst.expressions.Abs.numeric(arithmetic.scala:169) ``` After: ```sql spark.sql.legacy.interval.enabled true spark-sql> select abs(interval -10 days -20 minutes); Error in query: cannot resolve 'abs(INTERVAL '-10 days -20 minutes')' due to data type mismatch: argument 1 requires (numeric or interval day to second or interval year to month) type, however, 'INTERVAL '-10 days -20 minutes'' is of interval type.; line 1 pos 7; 'Project [unresolvedalias(abs(-10 days -20 minutes, false), None)] +- OneRowRelation 
``` ### Does this PR introduce _any_ user-facing change? No, because the original changes of https://github.com/apache/spark/pull/34169 haven't released yet. ### How was this patch tested? Manually checked in the command line, see examples above. Closes #34183 from MaxGekk/fix-abs-input-types. Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../spark/sql/catalyst/expressions/arithmetic.scala| 2 +- .../org/apache/spark/sql/types/AbstractDataType.scala | 12 .../analysis/ExpressionTypeCheckingSuite.scala | 4 ++-- .../resources/sql-tests/results/ansi/literals.sql.out | 18 +- .../resources/sql-tests/results/ansi/timestamp.sql.out | 4 ++-- .../test/resources/sql-tests/results/literals.sql.out | 18 +- .../results/timestampNTZ/timestamp-ansi.sql.out| 4 ++-- .../typeCoercion/native/windowFrameCoercion.sql.out| 6 +++--- 8 files changed, 36 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 9fd1f35..b77d558 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -162,7 +162,7 @@ case class Abs(child: Expression, failOnError: Boolean = SQLConf.get.ansiEnabled def this(child: Expression) = this(child, SQLConf.get.ansiEnabled) - override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.NumericAndInterval) + override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection.NumericAndAnsiInterval) override def dataType: DataType = child.dataType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala index f7e754e..1d6d6e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/AbstractDataType.scala @@ -80,15 +80,19 @@ private[sql] class TypeCollection(private val types: Seq[AbstractDataType]) private[sql] object TypeCollection { /** - * Types that include numeric types and interval type. They are only used in unary_minus, - * unary_positive, add and subtract operations. + * Types that include numeric types and ANSI interval types. */ - val NumericAndInterval = TypeCollection( + val NumericAndAnsiInterval = TypeCollection( NumericType, -CalendarIntervalType, DayTimeIntervalType, YearMonthIntervalType) + /** + * Types that include numeric
[spark] branch master updated: [SPARK-36830][SQL] Support reading and writing ANSI intervals from/to JSON datasources
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7c15580 [SPARK-36830][SQL] Support reading and writing ANSI intervals from/to JSON datasources 7c15580 is described below commit 7c155806ed6e1a2488d4ec2fa8da620318fea8dd Author: Kousuke Saruta AuthorDate: Thu Sep 30 20:22:06 2021 +0300 [SPARK-36830][SQL] Support reading and writing ANSI intervals from/to JSON datasources ### What changes were proposed in this pull request? This PR aims to support reading and writing ANSI intervals from/to JSON datasources. With this change, interval data is written in a literal form like `{"col":"INTERVAL '1-2' YEAR TO MONTH"}`. For the reading part, we need to specify the schema explicitly like: ``` val readDF = spark.read.schema("col INTERVAL YEAR TO MONTH").json(...) ``` ### Why are the changes needed? For better usability. There should be no reason to prohibit reading/writing ANSI intervals from/to JSON datasources. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. It covers both V1 and V2 sources. Closes #34155 from sarutak/ansi-interval-json-source. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../sql/execution/datasources/DataSource.scala | 2 +- .../execution/datasources/json/JsonFileFormat.scala | 2 -- .../execution/datasources/v2/json/JsonTable.scala | 2 -- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 19 --- .../datasources/CommonFileDataSourceSuite.scala | 2 +- .../sql/execution/datasources/json/JsonSuite.scala | 21 - 6 files changed, 22 insertions(+), 26 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index be9a912..32913c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -581,7 +581,7 @@ case class DataSource( // TODO: Remove the Set below once all the built-in datasources support ANSI interval types private val writeAllowedSources: Set[Class[_]] = -Set(classOf[ParquetFileFormat], classOf[CSVFileFormat]) +Set(classOf[ParquetFileFormat], classOf[CSVFileFormat], classOf[JsonFileFormat]) private def disallowWritingIntervals( dataTypes: Seq[DataType], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 8357a41..9c6c77a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -134,8 +134,6 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { -case _: AnsiIntervalType => false - case _: AtomicType => true case st: StructType => 
st.forall { f => supportDataType(f.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala index 244dd0f..5216800 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala @@ -55,8 +55,6 @@ case class JsonTable( } override def supportsDataType(dataType: DataType): Boolean = dataType match { -case _: AnsiIntervalType => false - case _: AtomicType => true case st: StructType => st.forall { f => supportsDataType(f.dataType) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 22e3e33..7b2c0bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1445,32 +1445,13 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark val ymDF = sql("select interval 3 years -3 month") checkAnswer(ymDF, Row(Period.of(2, 9, 0))) -withTempPath(f => { - val e = intercept[AnalysisException] { -ymD
[spark] branch master updated: [SPARK-36831][SQL] Support reading and writing ANSI intervals from/to CSV datasources
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a8734e3 [SPARK-36831][SQL] Support reading and writing ANSI intervals from/to CSV datasources a8734e3 is described below commit a8734e3f1695a3c436f65bbb1d54d1d02b0df33f Author: Kousuke Saruta AuthorDate: Wed Sep 29 21:22:34 2021 +0300 [SPARK-36831][SQL] Support reading and writing ANSI intervals from/to CSV datasources ### What changes were proposed in this pull request? This PR aims to support reading and writing ANSI intervals from/to CSV datasources. With this change, interval data is written in a literal form like `INTERVAL '1-2' YEAR TO MONTH`. For the reading part, we need to specify the schema explicitly like: ``` val readDF = spark.read.schema("col INTERVAL YEAR TO MONTH").csv(...) ``` ### Why are the changes needed? For better usability. There should be no reason to prohibit reading/writing ANSI intervals from/to CSV datasources. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. It covers both V1 and V2 sources. Closes #34142 from sarutak/ansi-interval-csv-source. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../sql/execution/datasources/DataSource.scala | 9 +++-- .../execution/datasources/csv/CSVFileFormat.scala | 2 -- .../sql/execution/datasources/v2/csv/CSVTable.scala | 4 +--- .../datasources/CommonFileDataSourceSuite.scala | 2 +- .../sql/execution/datasources/csv/CSVSuite.scala| 21 - 5 files changed, 29 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 0707af4..be9a912 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -579,11 +579,16 @@ case class DataSource( checkEmptyGlobPath, checkFilesExist, enableGlobbing = globPaths) } + // TODO: Remove the Set below once all the built-in datasources support ANSI interval types + private val writeAllowedSources: Set[Class[_]] = +Set(classOf[ParquetFileFormat], classOf[CSVFileFormat]) + private def disallowWritingIntervals( dataTypes: Seq[DataType], forbidAnsiIntervals: Boolean): Unit = { -val isParquet = providingClass == classOf[ParquetFileFormat] -dataTypes.foreach(TypeUtils.invokeOnceForInterval(_, forbidAnsiIntervals || !isParquet) { +val isWriteAllowedSource = writeAllowedSources(providingClass) +dataTypes.foreach( + TypeUtils.invokeOnceForInterval(_, forbidAnsiIntervals || !isWriteAllowedSource) { throw QueryCompilationErrors.cannotSaveIntervalIntoExternalStorageError() }) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 8add63c..d40ad9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -148,8 +148,6 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[CSVFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { -case _: AnsiIntervalType => false - case _: AtomicType => true case udt: UserDefinedType[_] => supportDataType(udt.sqlType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala index 02601b3..839cd01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.connector.write.{LogicalWriteInfo, Write, WriteBuild import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.csv.CSVDataSource import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.types.{AnsiIntervalType, AtomicType, DataType, StructType, UserDefinedType} +import org.apache.spark.sql.types.{AtomicType, DataType, StructType, UserDefinedType} import org.apache.spark.sql.util.CaseInsensitiveStringMap case class CSVTable( @@
[spark] branch master updated: [SPARK-36825][SQL] Read/write dataframes with ANSI intervals from/to parquet files
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c77c0d4 [SPARK-36825][SQL] Read/write dataframes with ANSI intervals from/to parquet files c77c0d4 is described below commit c77c0d41e13ba85b9a6cf713cefebbb7170f53c2 Author: Max Gekk AuthorDate: Fri Sep 24 09:55:11 2021 +0300 [SPARK-36825][SQL] Read/write dataframes with ANSI intervals from/to parquet files ### What changes were proposed in this pull request? Allow saving and loading of ANSI intervals - `YearMonthIntervalType` and `DayTimeIntervalType` to/from the Parquet datasource. After the changes, Spark saves ANSI intervals as primitive physical Parquet types: - year-month intervals as `INT32` - day-time intervals as `INT64` w/o any modifications. To load the values as intervals back, Spark puts the info about interval types to the extra key `org.apache.spark.sql.parquet.row.metadata`: ``` $ java -jar parquet-tools-1.12.0.jar meta ./part-...-c000.snappy.parquet creator: parquet-mr version 1.12.1 (build 2a5c06c58fa987f85aa22170be14d927d5ff6e7d) extra: org.apache.spark.version = 3.3.0 extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[...,{"name":"i","type":"interval year to month","nullable":false,"metadata":{}}]} file schema: spark_schema ... i: REQUIRED INT32 R:0 D:0 ``` **Note:** The given PR focus on support of ANSI intervals in the Parquet datasource via write or read as a column in `Dataset`. ### Why are the changes needed? To improve user experience with Spark SQL. At the moment, users can make ANSI intervals "inside" Spark or parallelize Java collections of `Period`/`Duration` objects but cannot save the intervals to any built-in datasources. After the changes, users can save datasets/dataframes with year-month/day-time intervals to load them back later by Apache Spark. 
For example: ```scala scala> sql("select date'today' - date'2021-01-01' as diff").write.parquet("/Users/maximgekk/tmp/parquet_interval") scala> val readback = spark.read.parquet("/Users/maximgekk/tmp/parquet_interval") readback: org.apache.spark.sql.DataFrame = [diff: interval day] scala> readback.printSchema root |-- diff: interval day (nullable = true) scala> readback.show +--+ | diff| +--+ |INTERVAL '264' DAY| +--+ ``` ### Does this PR introduce _any_ user-facing change? In some sense, yes. Before the changes, users get an error while saving of ANSI intervals as dataframe columns to parquet files but the operation should complete successfully after the changes. ### How was this patch tested? 1. By running the existing test suites: ``` $ build/sbt "test:testOnly *ParquetFileFormatV2Suite" $ build/sbt "test:testOnly *FileBasedDataSourceSuite" $ build/sbt "sql/test:testOnly *JsonV2Suite" ``` 2. Added new tests: ``` $ build/sbt "sql/test:testOnly *ParquetIOSuite" $ build/sbt "sql/test:testOnly *ParquetSchemaSuite" ``` Closes #34057 from MaxGekk/ansi-interval-save-parquet. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../apache/spark/sql/catalyst/util/TypeUtils.scala | 7 ++--- .../parquet/ParquetVectorUpdaterFactory.java | 8 +++--- .../sql/execution/datasources/DataSource.scala | 13 ++--- .../datasources/parquet/ParquetFileFormat.scala| 2 -- .../datasources/parquet/ParquetRowConverter.scala | 3 ++- .../parquet/ParquetSchemaConverter.scala | 4 +-- .../datasources/parquet/ParquetWriteSupport.scala | 4 +-- .../spark/sql/FileBasedDataSourceSuite.scala | 4 ++- .../datasources/CommonFileDataSourceSuite.scala| 31 -- .../datasources/parquet/ParquetIOSuite.scala | 20 ++ .../datasources/parquet/ParquetSchemaSuite.scala | 18 + 11 files changed, 82 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index f4c0f3b..1a8de4c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -110,14 +110,15 @@ object Typ
[spark] branch branch-3.2 updated: [SPARK-36807][SQL] Merge ANSI interval types to a tightest common type
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 7fa88b2 [SPARK-36807][SQL] Merge ANSI interval types to a tightest common type 7fa88b2 is described below commit 7fa88b28a56a5be4fa78dbb690c9c72e8d856b56 Author: Max Gekk AuthorDate: Tue Sep 21 10:20:16 2021 +0300 [SPARK-36807][SQL] Merge ANSI interval types to a tightest common type ### What changes were proposed in this pull request? In the PR, I propose to modify `StructType` to support merging of ANSI interval types with different fields. ### Why are the changes needed? This will allow merging of schemas from different datasource files. ### Does this PR introduce _any_ user-facing change? No, the ANSI interval types haven't released yet. ### How was this patch tested? Added new test to `StructTypeSuite`. Closes #34049 from MaxGekk/merge-ansi-interval-types. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit d2340f8e1c342354e1a67d468b35e86e3496ccf9) Signed-off-by: Max Gekk --- .../org/apache/spark/sql/types/StructType.scala| 6 ++ .../apache/spark/sql/types/StructTypeSuite.scala | 23 ++ 2 files changed, 29 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 83ee191..c9862cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -653,6 +653,12 @@ object StructType extends AbstractDataType { case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_]) if leftUdt.userClass == rightUdt.userClass => leftUdt + case (YearMonthIntervalType(lstart, lend), YearMonthIntervalType(rstart, rend)) => +YearMonthIntervalType(Math.min(lstart, rstart).toByte, Math.max(lend, rend).toByte) + + case (DayTimeIntervalType(lstart, lend), DayTimeIntervalType(rstart, rend)) => +DayTimeIntervalType(Math.min(lstart, rstart).toByte, Math.max(lend, rend).toByte) + case (leftType, rightType) if leftType == rightType => leftType diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala index 8db3831..8cc04c7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/StructTypeSuite.scala @@ -25,7 +25,9 @@ import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DayTimeIntervalType => DT} import org.apache.spark.sql.types.{YearMonthIntervalType => YM} +import org.apache.spark.sql.types.DayTimeIntervalType._ import org.apache.spark.sql.types.StructType.fromDDL +import org.apache.spark.sql.types.YearMonthIntervalType._ 
class StructTypeSuite extends SparkFunSuite with SQLHelper { @@ -382,4 +384,25 @@ class StructTypeSuite extends SparkFunSuite with SQLHelper { assert(e.getMessage.contains( "Field name a2.element.C.name is invalid: a2.element.c is not a struct")) } + + test("SPARK-36807: Merge ANSI interval types to a tightest common type") { +Seq( + (YM(YEAR), YM(YEAR)) -> YM(YEAR), + (YM(YEAR), YM(MONTH)) -> YM(YEAR, MONTH), + (YM(MONTH), YM(MONTH)) -> YM(MONTH), + (YM(YEAR, MONTH), YM(YEAR)) -> YM(YEAR, MONTH), + (YM(YEAR, MONTH), YM(YEAR, MONTH)) -> YM(YEAR, MONTH), + (DT(DAY), DT(DAY)) -> DT(DAY), + (DT(SECOND), DT(SECOND)) -> DT(SECOND), + (DT(DAY), DT(SECOND)) -> DT(DAY, SECOND), + (DT(HOUR, SECOND), DT(DAY, MINUTE)) -> DT(DAY, SECOND), + (DT(HOUR, MINUTE), DT(DAY, SECOND)) -> DT(DAY, SECOND) +).foreach { case ((i1, i2), expected) => + val st1 = new StructType().add("interval", i1) + val st2 = new StructType().add("interval", i2) + val expectedStruct = new StructType().add("interval", expected) + assert(st1.merge(st2) === expectedStruct) + assert(st2.merge(st1) === expectedStruct) +} + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (cc182fe -> d2340f8)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from cc182fe [SPARK-36785][PYTHON] Fix DataFrame.isin when DataFrame has NaN value add d2340f8 [SPARK-36807][SQL] Merge ANSI interval types to a tightest common type No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/types/StructType.scala| 6 ++ .../apache/spark/sql/types/StructTypeSuite.scala | 23 ++ 2 files changed, 29 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36778][SQL] Support ILIKE API on Scala(dataframe)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1312a87 [SPARK-36778][SQL] Support ILIKE API on Scala(dataframe) 1312a87 is described below commit 1312a873659a90502ebc611ae678fc6e6d938b6e Author: Leona Yoda AuthorDate: Fri Sep 17 14:37:10 2021 +0300 [SPARK-36778][SQL] Support ILIKE API on Scala(dataframe) ### What changes were proposed in this pull request? Support ILIKE (case insensitive LIKE) API on Scala. ### Why are the changes needed? ILIKE statement on SQL interface is supported by SPARK-36674. This PR will support Scala(dataframe) API for it. ### Does this PR introduce _any_ user-facing change? Yes. Users can call `ilike` from dataframe. ### How was this patch tested? unit tests. Closes #34027 from yoda-mon/scala-ilike. Authored-by: Leona Yoda Signed-off-by: Max Gekk --- .../main/scala/org/apache/spark/sql/Column.scala | 8 ++ .../apache/spark/sql/ColumnExpressionSuite.scala | 31 ++ 2 files changed, 39 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index c450271..554f6a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -847,6 +847,14 @@ class Column(val expr: Expression) extends Logging { def rlike(literal: String): Column = withExpr { RLike(expr, lit(literal).expr) } /** + * SQL ILIKE expression (case insensitive LIKE). + * + * @group expr_ops + * @since 3.3.0 + */ + def ilike(literal: String): Column = withExpr { new ILike(expr, lit(literal).expr) } + + /** * An expression that gets an item at position `ordinal` out of an array, * or gets a value by key `key` in a `MapType`. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index b0cd613..d954dc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -2904,4 +2904,35 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-36778: add ilike API for scala") { +// scalastyle:off +// non ascii characters are not allowed in the code, so we disable the scalastyle here. +// null handling +val nullDf = Seq("a", null).toDF("src") +checkAnswer(nullDf.filter($"src".ilike("A")), Row("a")) +checkAnswer(nullDf.filter($"src".ilike(null)), spark.emptyDataFrame) +// simple pattern +val simpleDf = Seq("a", "A", "abdef", "a_%b", "addb", "abC", "a\nb").toDF("src") +checkAnswer(simpleDf.filter($"src".ilike("a")), Seq("a", "A").toDF()) +checkAnswer(simpleDf.filter($"src".ilike("A")), Seq("a", "A").toDF()) +checkAnswer(simpleDf.filter($"src".ilike("b")), spark.emptyDataFrame) +checkAnswer(simpleDf.filter($"src".ilike("aBdef")), Seq("abdef").toDF()) +checkAnswer(simpleDf.filter($"src".ilike("a\\__b")), Seq("a_%b").toDF()) +checkAnswer(simpleDf.filter($"src".ilike("A_%b")), Seq("a_%b", "addb", "a\nb").toDF()) +checkAnswer(simpleDf.filter($"src".ilike("a%")), simpleDf) +checkAnswer(simpleDf.filter($"src".ilike("a_b")), Seq("a\nb").toDF()) +// double-escaping backslash +val dEscDf = Seq("""\__""", """\\__""").toDF("src") +checkAnswer(dEscDf.filter($"src".ilike("""\\\__""")), Seq("""\__""").toDF()) +checkAnswer(dEscDf.filter($"src".ilike("""%\\%\%""")), spark.emptyDataFrame) +// unicode +val uncDf = Seq("a\u20ACA", "A€a", "a€AA", "a\u20ACaz", "ЀЁЂѺΏỀ").toDF("src") +checkAnswer(uncDf.filter($"src".ilike("_\u20AC_")), Seq("a\u20ACA", "A€a").toDF()) +checkAnswer(uncDf.filter($"src".ilike("_€_")), Seq("a\u20ACA", "A€a").toDF()) 
+checkAnswer(uncDf.filter($"src".ilike("_\u20AC_a")), Seq("a€AA").toDF()) +checkAnswer(uncDf.filter($"src".ilike("_€_Z")), Seq("a\u20ACaz").toDF()) +checkAnswer(uncDf.filter($"src".ilike("ѐёђѻώề")), Seq("ЀЁЂѺΏỀ").toDF()) +// scalastyle:on + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36675][SQL] Support ScriptTransformation for timestamp_ntz
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 0ab0cb1 [SPARK-36675][SQL] Support ScriptTransformation for timestamp_ntz 0ab0cb1 is described below commit 0ab0cb108d64c95c0d46075c9c30d735d74a3b0d Author: Kousuke Saruta AuthorDate: Mon Sep 6 20:58:07 2021 +0200 [SPARK-36675][SQL] Support ScriptTransformation for timestamp_ntz ### What changes were proposed in this pull request? This PR aims to support `ScriptTransformation` for `timestamp_ntz`. In the current master, it doesn't work. ``` spark.sql("SELECT transform(col1) USING 'cat' AS (col1 timestamp_ntz) FROM VALUES timestamp_ntz'2021-09-06 20:19:13' t").show(false) 21/09/06 22:03:55 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0) org.apache.spark.SparkException: SparkScriptTransformation without serde does not support TimestampNTZType$ as output data type at org.apache.spark.sql.errors.QueryExecutionErrors$.outputDataTypeUnsupportedByNodeWithoutSerdeError(QueryExecutionErrors.scala:1740) at org.apache.spark.sql.execution.BaseScriptTransformationExec.$anonfun$outputFieldWriters$1(BaseScriptTransformationExec.scala:245) at scala.collection.immutable.List.map(List.scala:293) at org.apache.spark.sql.execution.BaseScriptTransformationExec.org$apache$spark$sql$execution$BaseScriptTransformationExec$$outputFieldWriters(BaseScriptTransformationExec.scala:194) at org.apache.spark.sql.execution.BaseScriptTransformationExec.org$apache$spark$sql$execution$BaseScriptTransformationExec$$outputFieldWriters$(BaseScriptTransformationExec.scala:194) at org.apache.spark.sql.execution.SparkScriptTransformationExec.org$apache$spark$sql$execution$BaseScriptTransformationExec$$outputFieldWriters$lzycompute(SparkScriptTransformationExec.scala:38) at 
org.apache.spark.sql.execution.SparkScriptTransformationExec.org$apache$spark$sql$execution$BaseScriptTransformationExec$$outputFieldWriters(SparkScriptTransformationExec.scala:38) at org.apache.spark.sql.execution.BaseScriptTransformationExec$$anon$1.$anonfun$processRowWithoutSerde$1(BaseScriptTransformationExec.scala:121) at org.apache.spark.sql.execution.BaseScriptTransformationExec$$anon$1.next(BaseScriptTransformationExec.scala:162) at org.apache.spark.sql.execution.BaseScriptTransformationExec$$anon$1.next(BaseScriptTransformationExec.scala:113) ``` ### Why are the changes needed? For better usability. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Closes #33920 from sarutak/script-transformation-timestamp-ntz. Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../spark/sql/execution/BaseScriptTransformationExec.scala | 3 +++ .../sql/execution/BaseScriptTransformationSuite.scala | 14 ++ 2 files changed, 17 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala index fc3a124..6040050 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala @@ -220,6 +220,9 @@ trait BaseScriptTransformationExec extends UnaryExecNode { UTF8String.fromString(data), DateTimeUtils.getZoneId(conf.sessionLocalTimeZone)) .map(DateTimeUtils.toJavaTimestamp).orNull, converter) + case TimestampNTZType => +wrapperConvertException(data => DateTimeUtils.stringToTimestampWithoutTimeZone( + UTF8String.fromString(data)).map(DateTimeUtils.microsToLocalDateTime).orNull, converter) case CalendarIntervalType => wrapperConvertException( data => IntervalUtils.stringToInterval(UTF8String.fromString(data)), converter) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala index 9d8fcda..488a0fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala @@ -654,6 +654,20 @@ abstract class BaseScriptTransformationSuite extends SparkPlanTest with SQLTestU df.select($"ym", $"dt").collect()) } } + + test("SPARK-36675: TRANSFORM should support timestamp_ntz (no serde)") { +val df = spark.sql("SELECT timestamp_ntz'2021-09-06 20:19:13' co
[spark] branch master updated: [SPARK-36626][PYTHON][FOLLOW-UP] Use datetime.tzinfo instead of datetime.tzname()
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c6f3a13 [SPARK-36626][PYTHON][FOLLOW-UP] Use datetime.tzinfo instead of datetime.tzname() c6f3a13 is described below commit c6f3a13087a954d56ef671ecb82c8031a2f45d52 Author: Hyukjin Kwon AuthorDate: Mon Sep 6 17:16:52 2021 +0200 [SPARK-36626][PYTHON][FOLLOW-UP] Use datetime.tzinfo instead of datetime.tzname() ### What changes were proposed in this pull request? This PR is a small followup of https://github.com/apache/spark/pull/33876 which proposes to use `datetime.tzinfo` instead of `datetime.tzname` to see if timezome information is provided or not. This way is consistent with other places such as: https://github.com/apache/spark/blob/9c5bcac61ee56fbb271e890cc33f9a983612c5b0/python/pyspark/sql/types.py#L182 https://github.com/apache/spark/blob/9c5bcac61ee56fbb271e890cc33f9a983612c5b0/python/pyspark/sql/types.py#L1662 ### Why are the changes needed? In some cases, `datetime.tzname` can raise an exception (https://docs.python.org/3/library/datetime.html#datetime.datetime.tzname): > ... raises an exception if the latter doesn’t return None or a string object, I was able to reproduce this in Jenkins with setting `spark.sql.timestampType` to `TIMESTAMP_NTZ` by default: ``` == ERROR: test_time_with_timezone (pyspark.sql.tests.test_serde.SerdeTests) -- Traceback (most recent call last): File "/home/jenkins/workspace/SparkPullRequestBuilder/python/pyspark/sql/tests/test_serde.py", line 92, in test_time_with_timezone ... File "/usr/lib/pypy3/lib-python/3/datetime.py", line 979, in tzname raise NotImplementedError("tzinfo subclass must override tzname()") NotImplementedError: tzinfo subclass must override tzname() ``` ### Does this PR introduce _any_ user-facing change? No to end users because it has not be released. 
This is rather a safeguard to prevent potential breakage. ### How was this patch tested? Manually tested. Closes #33918 from HyukjinKwon/SPARK-36626-followup. Authored-by: Hyukjin Kwon Signed-off-by: Max Gekk --- python/pyspark/sql/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 6cb8aec..e8b7411 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1045,7 +1045,7 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False): if dataType is DecimalType: # the precision and scale of `obj` may be different from row to row. return DecimalType(38, 18) -if dataType is TimestampType and prefer_timestamp_ntz and obj.tzname() is None: +if dataType is TimestampType and prefer_timestamp_ntz and obj.tzinfo is None: return TimestampNTZType() elif dataType is not None: return dataType() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36418][SPARK-36536][SQL][DOCS][FOLLOWUP] Update the SQL migration guide about using `CAST` in datetime parsing
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9f595c4 [SPARK-36418][SPARK-36536][SQL][DOCS][FOLLOWUP] Update the SQL migration guide about using `CAST` in datetime parsing 9f595c4 is described below commit 9f595c4ce34728f5d8f943eadea8d85a548b2d41 Author: Max Gekk AuthorDate: Mon Aug 23 13:07:37 2021 +0300 [SPARK-36418][SPARK-36536][SQL][DOCS][FOLLOWUP] Update the SQL migration guide about using `CAST` in datetime parsing ### What changes were proposed in this pull request? In the PR, I propose the update the SQL migration guide about the changes introduced by the PRs https://github.com/apache/spark/pull/33709 and https://github.com/apache/spark/pull/33769. https://user-images.githubusercontent.com/1580697/130419710-640f20b3-6a38-4eb1-a6d6-2e069dc5665c.png;> ### Why are the changes needed? To inform users about the upcoming changes in parsing datetime strings. This should help users to migrate on the new release. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By generating the doc, and checking by eyes: ``` $ SKIP_API=1 SKIP_RDOC=1 SKIP_PYTHONDOC=1 SKIP_SCALADOC=1 bundle exec jekyll build ``` Closes #33809 from MaxGekk/datetime-cast-migr-guide. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- docs/sql-migration-guide.md | 20 1 file changed, 20 insertions(+) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 7ad384f..47e7921 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -26,6 +26,26 @@ license: | - Since Spark 3.3, Spark turns a non-nullable schema into nullable for API `DataFrameReader.schema(schema: StructType).json(jsonDataset: Dataset[String])` and `DataFrameReader.schema(schema: StructType).csv(csvDataset: Dataset[String])` when the schema is specified by the user and contains non-nullable fields. + - Since Spark 3.3, when the date or timestamp pattern is not specified, Spark converts an input string to a date/timestamp using the `CAST` expression approach. The changes affect CSV/JSON datasources and parsing of partition values. In Spark 3.2 or earlier, when the date or timestamp pattern is not set, Spark uses the default patterns: `-MM-dd` for dates and `-MM-dd HH:mm:ss` for timestamps. After the changes, Spark still recognizes the pattern together with + +Date patterns: + * `[+-]*` + * `[+-]*-[m]m` + * `[+-]*-[m]m-[d]d` + * `[+-]*-[m]m-[d]d ` + * `[+-]*-[m]m-[d]d *` + * `[+-]*-[m]m-[d]dT*` + +Timestamp patterns: + * `[+-]*` + * `[+-]*-[m]m` + * `[+-]*-[m]m-[d]d` + * `[+-]*-[m]m-[d]d ` + * `[+-]*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[+-]*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` + ## Upgrading from Spark SQL 3.1 to 3.2 - Since Spark 3.2, ADD FILE/JAR/ARCHIVE commands require each path to be enclosed by `"` or `'` if the path contains whitespaces. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (013f2b7 -> 1235bd2)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 013f2b7 [SPARK-36512][UI][TESTS] Fix UISeleniumSuite in sql/hive-thriftserver add 1235bd2 [SPARK-36536][SQL] Use CAST for datetime in CSV/JSON by default No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/csv/CSVInferSchema.scala| 2 +- .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 12 +-- .../sql/catalyst/csv/UnivocityGenerator.scala | 6 ++-- .../spark/sql/catalyst/csv/UnivocityParser.scala | 6 ++-- .../spark/sql/catalyst/json/JSONOptions.scala | 12 +-- .../spark/sql/catalyst/json/JacksonGenerator.scala | 6 ++-- .../spark/sql/catalyst/json/JacksonParser.scala| 6 ++-- .../spark/sql/catalyst/json/JsonInferSchema.scala | 2 +- .../spark/sql/catalyst/util/DateFormatter.scala| 8 + .../sql/catalyst/util/TimestampFormatter.scala | 24 ++ .../sql/catalyst/csv/UnivocityParserSuite.scala| 4 +-- .../sql/execution/datasources/csv/CSVSuite.scala | 37 +- .../sql/execution/datasources/json/JsonSuite.scala | 37 -- 13 files changed, 139 insertions(+), 23 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36524][SQL] Common class for ANSI interval types
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 07c6976 [SPARK-36524][SQL] Common class for ANSI interval types 07c6976 is described below commit 07c6976f79e418be8aed9bed8e7b396231a27c25 Author: Max Gekk AuthorDate: Tue Aug 17 12:27:56 2021 +0300 [SPARK-36524][SQL] Common class for ANSI interval types ### What changes were proposed in this pull request? Add new type `AnsiIntervalType` to `AbstractDataType.scala`, and extend it by `YearMonthIntervalType` and by `DayTimeIntervalType` ### Why are the changes needed? To improve code maintenance. The change will allow to replace checking of both `YearMonthIntervalType` and `DayTimeIntervalType` by a check of `AnsiIntervalType`, for instance: ```scala case _: YearMonthIntervalType | _: DayTimeIntervalType => false ``` by ```scala case _: AnsiIntervalType => false ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By existing test suites. Closes #33753 from MaxGekk/ansi-interval-type-trait. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit 82a31508afffd089048e28276c75b5deb1ada47f) Signed-off-by: Max Gekk --- .../avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala | 2 +- .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 8 .../org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala | 2 +- .../org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala| 4 ++-- .../spark/sql/catalyst/expressions/collectionOperations.scala | 2 +- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- .../main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala | 4 ++-- .../main/scala/org/apache/spark/sql/types/AbstractDataType.scala | 5 + .../scala/org/apache/spark/sql/types/DayTimeIntervalType.scala| 2 +- .../scala/org/apache/spark/sql/types/YearMonthIntervalType.scala | 2 +- .../spark/sql/execution/datasources/csv/CSVFileFormat.scala | 2 +- .../spark/sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../spark/sql/execution/datasources/orc/OrcFileFormat.scala | 2 +- .../sql/execution/datasources/parquet/ParquetFileFormat.scala | 2 +- .../apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala | 4 ++-- .../spark/sql/execution/datasources/v2/json/JsonTable.scala | 2 +- .../apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala | 2 +- .../spark/sql/execution/datasources/v2/parquet/ParquetTable.scala | 2 +- .../sql/hive/thriftserver/SparkExecuteStatementOperation.scala| 2 +- .../spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala| 5 ++--- .../main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala | 2 +- 22 files changed, 33 insertions(+), 29 deletions(-) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala index 68b393e..5b8afe8 100644 --- 
a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala @@ -71,7 +71,7 @@ private[sql] object AvroUtils extends Logging { } def supportsDataType(dataType: DataType): Boolean = dataType match { -case _: DayTimeIntervalType | _: YearMonthIntervalType => false +case _: AnsiIntervalType => false case _: AtomicType => true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 468986d..2f0a709 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -377,9 +377,9 @@ class Analyzer(override val catalogManager: CatalogManager) TimestampAddYMInterval(r, l) case (CalendarIntervalType, CalendarIntervalType) | (_: DayTimeIntervalType, _: DayTimeIntervalType) => a - case (_: NullType, _: DayTimeIntervalType | _: YearMonthIntervalType) => + case (_: NullType, _: AnsiIntervalType) => a.copy(left = Cast(a.left, a.right.dataType)) - case (_: DayTimeIntervalType | _: YearMonthIntervalType, _: NullType) => + case (_: AnsiIntervalType, _: NullType) => a.copy(right = Cast(a.right, a.left.dataType)) case (DateType, CalendarIntervalType) => DateAddInterval(l, r, ansiEnabled = f)
[spark] branch master updated (ea13c5a -> 82a3150)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from ea13c5a [SPARK-36052][K8S][FOLLOWUP] Update config version to 3.2.0 add 82a3150 [SPARK-36524][SQL] Common class for ANSI interval types No new revisions were added by this update. Summary of changes: .../avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala | 2 +- .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 8 .../org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala | 2 +- .../org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala| 2 +- .../org/apache/spark/sql/catalyst/expressions/arithmetic.scala| 4 ++-- .../spark/sql/catalyst/expressions/collectionOperations.scala | 2 +- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 2 +- .../main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala | 4 ++-- .../main/scala/org/apache/spark/sql/types/AbstractDataType.scala | 5 + .../scala/org/apache/spark/sql/types/DayTimeIntervalType.scala| 2 +- .../scala/org/apache/spark/sql/types/YearMonthIntervalType.scala | 2 +- .../spark/sql/execution/datasources/csv/CSVFileFormat.scala | 2 +- .../spark/sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../spark/sql/execution/datasources/orc/OrcFileFormat.scala | 2 +- .../sql/execution/datasources/parquet/ParquetFileFormat.scala | 2 +- .../apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala | 4 ++-- .../spark/sql/execution/datasources/v2/json/JsonTable.scala | 2 +- .../apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala | 2 +- .../spark/sql/execution/datasources/v2/parquet/ParquetTable.scala | 2 +- .../sql/hive/thriftserver/SparkExecuteStatementOperation.scala| 2 +- .../spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala| 5 ++--- .../main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala | 2 +- 22 files changed, 33 insertions(+), 29 deletions(-) - To unsubscribe, 
e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36521][SQL] Disallow comparison between Interval and String
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 41e5144 [SPARK-36521][SQL] Disallow comparison between Interval and String 41e5144 is described below commit 41e5144b53d21d4c67e35250594ee418bdfba136 Author: Gengliang Wang AuthorDate: Mon Aug 16 22:41:14 2021 +0300 [SPARK-36521][SQL] Disallow comparison between Interval and String ### What changes were proposed in this pull request? Disallow comparison between Interval and String in the default type coercion rules. ### Why are the changes needed? If a binary comparison contains interval type and string type, we can't decide which interval type the string should be promoted as. There are many possible interval types, such as year interval, month interval, day interval, hour interval, etc. ### Does this PR introduce _any_ user-facing change? No, the new interval type is not released yet. ### How was this patch tested? Existing UT Closes #33750 from gengliangwang/disallowCom. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 26d6b952dcf7d387930701396de9cef679df7432) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 16 +++- .../test/resources/sql-tests/inputs/interval.sql | 6 ++ .../sql-tests/results/ansi/interval.sql.out| 56 +- .../resources/sql-tests/results/interval.sql.out | 86 ++ 4 files changed, 148 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 23654af..863bdc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -862,6 +862,18 @@ object TypeCoercion extends TypeCoercionBase { case _ => None } + // Return whether a string literal can be promoted as the give data type in a binary comparison. + private def canPromoteAsInBinaryComparison(dt: DataType) = dt match { +// If a binary comparison contains interval type and string type, we can't decide which +// interval type the string should be promoted as. There are many possible interval +// types, such as year interval, month interval, day interval, hour interval, etc. +case _: YearMonthIntervalType | _: DayTimeIntervalType => false +// There is no need to add `Cast` for comparison between strings. +case _: StringType => false +case _: AtomicType => true +case _ => false + } + /** * This function determines the target type of a comparison operator when one operand * is a String and the other is not. 
It also handles when one op is a Date and the @@ -891,8 +903,8 @@ object TypeCoercion extends TypeCoercionBase { case (n: DecimalType, s: StringType) => Some(DoubleType) case (s: StringType, n: DecimalType) => Some(DoubleType) -case (l: StringType, r: AtomicType) if r != StringType => Some(r) -case (l: AtomicType, r: StringType) if l != StringType => Some(l) +case (l: StringType, r: AtomicType) if canPromoteAsInBinaryComparison(r) => Some(r) +case (l: AtomicType, r: StringType) if canPromoteAsInBinaryComparison(l) => Some(l) case (l, r) => None } diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 618cf16..279c5441 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -341,9 +341,15 @@ SELECT INTERVAL 1 MONTH > INTERVAL 20 DAYS; SELECT INTERVAL '1' DAY < '1'; SELECT INTERVAL '1' DAY = '1'; SELECT INTERVAL '1' DAY > '1'; +SELECT '1' < INTERVAL '1' DAY; +SELECT '1' = INTERVAL '1' DAY; +SELECT '1' > INTERVAL '1' DAY; SELECT INTERVAL '1' YEAR < '1'; SELECT INTERVAL '1' YEAR = '1'; SELECT INTERVAL '1' YEAR > '1'; +SELECT '1' < INTERVAL '1' YEAR; +SELECT '1' = INTERVAL '1' YEAR; +SELECT '1' > INTERVAL '1' YEAR; SELECT array(INTERVAL '1' YEAR, INTERVAL '1' MONTH); SELECT array(INTERVAL '1' DAY, INTERVAL '01:01' HOUR TO MINUTE); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index e0bf076..1aa0920 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- N
[spark] branch master updated: [SPARK-36521][SQL] Disallow comparison between Interval and String
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 26d6b95 [SPARK-36521][SQL] Disallow comparison between Interval and String 26d6b95 is described below commit 26d6b952dcf7d387930701396de9cef679df7432 Author: Gengliang Wang AuthorDate: Mon Aug 16 22:41:14 2021 +0300 [SPARK-36521][SQL] Disallow comparison between Interval and String ### What changes were proposed in this pull request? Disallow comparison between Interval and String in the default type coercion rules. ### Why are the changes needed? If a binary comparison contains interval type and string type, we can't decide which interval type the string should be promoted as. There are many possible interval types, such as year interval, month interval, day interval, hour interval, etc. ### Does this PR introduce _any_ user-facing change? No, the new interval type is not released yet. ### How was this patch tested? Existing UT Closes #33750 from gengliangwang/disallowCom. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk --- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 16 +++- .../test/resources/sql-tests/inputs/interval.sql | 6 ++ .../sql-tests/results/ansi/interval.sql.out| 56 +- .../resources/sql-tests/results/interval.sql.out | 86 ++ 4 files changed, 148 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 23654af..863bdc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -862,6 +862,18 @@ object TypeCoercion extends TypeCoercionBase { case _ => None } + // Return whether a string literal can be promoted as the give data type in a binary comparison. + private def canPromoteAsInBinaryComparison(dt: DataType) = dt match { +// If a binary comparison contains interval type and string type, we can't decide which +// interval type the string should be promoted as. There are many possible interval +// types, such as year interval, month interval, day interval, hour interval, etc. +case _: YearMonthIntervalType | _: DayTimeIntervalType => false +// There is no need to add `Cast` for comparison between strings. +case _: StringType => false +case _: AtomicType => true +case _ => false + } + /** * This function determines the target type of a comparison operator when one operand * is a String and the other is not. 
It also handles when one op is a Date and the @@ -891,8 +903,8 @@ object TypeCoercion extends TypeCoercionBase { case (n: DecimalType, s: StringType) => Some(DoubleType) case (s: StringType, n: DecimalType) => Some(DoubleType) -case (l: StringType, r: AtomicType) if r != StringType => Some(r) -case (l: AtomicType, r: StringType) if l != StringType => Some(l) +case (l: StringType, r: AtomicType) if canPromoteAsInBinaryComparison(r) => Some(r) +case (l: AtomicType, r: StringType) if canPromoteAsInBinaryComparison(l) => Some(l) case (l, r) => None } diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 618cf16..279c5441 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -341,9 +341,15 @@ SELECT INTERVAL 1 MONTH > INTERVAL 20 DAYS; SELECT INTERVAL '1' DAY < '1'; SELECT INTERVAL '1' DAY = '1'; SELECT INTERVAL '1' DAY > '1'; +SELECT '1' < INTERVAL '1' DAY; +SELECT '1' = INTERVAL '1' DAY; +SELECT '1' > INTERVAL '1' DAY; SELECT INTERVAL '1' YEAR < '1'; SELECT INTERVAL '1' YEAR = '1'; SELECT INTERVAL '1' YEAR > '1'; +SELECT '1' < INTERVAL '1' YEAR; +SELECT '1' = INTERVAL '1' YEAR; +SELECT '1' > INTERVAL '1' YEAR; SELECT array(INTERVAL '1' YEAR, INTERVAL '1' MONTH); SELECT array(INTERVAL '1' DAY, INTERVAL '01:01' HOUR TO MINUTE); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index e0bf076..1aa0920 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 251 +-- Number of queries: 257 -- !query @@ -2328,6 +2328,33 @@ cannot resolve '(I
[spark] branch master updated (8b8d91c -> 9b9db5a)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 8b8d91c [SPARK-36465][SS] Dynamic gap duration in session window add 9b9db5a [SPARK-36491][SQL] Make from_json/to_json to handle timestamp_ntz type properly No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/json/JacksonGenerator.scala | 12 .../org/apache/spark/sql/catalyst/json/JacksonParser.scala | 12 .../sql-tests/results/timestampNTZ/timestamp-ansi.sql.out | 5 ++--- .../sql-tests/results/timestampNTZ/timestamp.sql.out | 5 ++--- .../scala/org/apache/spark/sql/JsonFunctionsSuite.scala| 14 +- 5 files changed, 41 insertions(+), 7 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (7fd3454 -> 41436b2)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7fd3454 [SPARK-36490][SQL] Make from_csv/to_csv to handle timestamp_ntz type properly add 41436b2 [SPARK-36507][DOCS] Remove/Replace missing links to AMP Camp materials from index.md No new revisions were added by this update. Summary of changes: docs/index.md | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (eb6be7f -> 7fd3454)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from eb6be7f [SPARK-36499][SQL][TESTS] Test Interval multiply / divide null add 7fd3454 [SPARK-36490][SQL] Make from_csv/to_csv to handle timestamp_ntz type properly No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/csv/UnivocityGenerator.scala | 12 +++- .../apache/spark/sql/catalyst/csv/UnivocityParser.scala | 11 +++ .../results/timestampNTZ/timestamp-ansi.sql.out | 5 ++--- .../sql-tests/results/timestampNTZ/timestamp.sql.out| 5 ++--- .../scala/org/apache/spark/sql/CsvFunctionsSuite.scala | 17 - 5 files changed, 42 insertions(+), 8 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36499][SQL][TESTS] Test Interval multiply / divide null
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new eaf92be [SPARK-36499][SQL][TESTS] Test Interval multiply / divide null eaf92be is described below commit eaf92bea99d1a6f4661718c484d167c7a08a37f6 Author: Gengliang Wang AuthorDate: Fri Aug 13 11:05:57 2021 +0300 [SPARK-36499][SQL][TESTS] Test Interval multiply / divide null ### What changes were proposed in this pull request? Test the following valid operations: ``` year-month interval * null null * year-month interval year-month interval / null ``` and invalid operations: ``` null / interval int / interval ``` ### Why are the changes needed? Improve test coverage ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass CI Closes #33729 from gengliangwang/addTest. Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit eb6be7f1ee076aeaa312f7a3ff0c88db516b793b) Signed-off-by: Max Gekk --- .../test/resources/sql-tests/inputs/interval.sql | 11 .../sql-tests/results/ansi/interval.sql.out| 71 +- .../resources/sql-tests/results/interval.sql.out | 71 +- 3 files changed, 151 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index 80df45d..be13a25 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -16,6 +16,16 @@ select interval '2 seconds' / 0; select interval '2 seconds' / null; select interval '2 seconds' * null; select null * interval '2 seconds'; +select interval '2' year / 0; +select interval '2' year / null; +select interval '2' year * null; +select null * interval '2' year; + +-- invalid: divide by interval +select 2 / interval '2' year; +select 2 / interval '2' hour; 
+select null / interval '2' year; +select null / interval '2' hour; -- interval with a positive/negative sign select -interval '-1 month 1 day -1 second'; @@ -170,6 +180,7 @@ select null - interval '2' year, null + interval '2' hour, null - interval '2' hour; + -- invalid: malformed interval string select interval '2' year + '3-3'; select interval '2' year - '4'; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index b048105..4e784b0 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 223 +-- Number of queries: 231 -- !query @@ -118,6 +118,75 @@ NULL -- !query +select interval '2' year / 0 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +/ by zero + + +-- !query +select interval '2' year / null +-- !query schema +struct<(INTERVAL '2' YEAR / NULL):interval year to month> +-- !query output +NULL + + +-- !query +select interval '2' year * null +-- !query schema +struct<(INTERVAL '2' YEAR * NULL):interval year to month> +-- !query output +NULL + + +-- !query +select null * interval '2' year +-- !query schema +struct<(INTERVAL '2' YEAR * NULL):interval year to month> +-- !query output +NULL + + +-- !query +select 2 / interval '2' year +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(2 / INTERVAL '2' YEAR)' due to data type mismatch: differing types in '(2 / INTERVAL '2' YEAR)' (int and interval year).; line 1 pos 7 + + +-- !query +select 2 / interval '2' hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(2 / INTERVAL '02' HOUR)' due to data type mismatch: differing types in '(2 / INTERVAL '02' HOUR)' (int and interval hour).; line 1 pos 7 + + 
+-- !query +select null / interval '2' year +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(NULL / INTERVAL '2' YEAR)' due to data type mismatch: differing types in '(NULL / INTERVAL '2' YEAR)' (void and interval year).; line 1 pos 7 + + +-- !query +select null / interval '2' hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(NULL / INTERVAL '02' HOUR)' due to data type mismatch: differing types in '(NULL / INTERVAL '02' HOUR)' (void and interval hour).; line 1 pos 7 + + +-- !query select -interval '-1 month 1 day -1 second' -- !query schema struct<> diff
[spark] branch master updated (7d82336 -> eb6be7f)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7d82336 [SPARK-36428][SQL] the seconds parameter of make_timestamp should accept integer type add eb6be7f [SPARK-36499][SQL][TESTS] Test Interval multiply / divide null No new revisions were added by this update. Summary of changes: .../test/resources/sql-tests/inputs/interval.sql | 11 .../sql-tests/results/ansi/interval.sql.out| 71 +- .../resources/sql-tests/results/interval.sql.out | 71 +- 3 files changed, 151 insertions(+), 2 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36468][SQL][DOCS] Update docs about ANSI interval literals
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 8dbcbeb [SPARK-36468][SQL][DOCS] Update docs about ANSI interval literals 8dbcbeb is described below commit 8dbcbebc3604f01b865215f2d02eb823cad582ec Author: Max Gekk AuthorDate: Wed Aug 11 13:38:39 2021 +0300 [SPARK-36468][SQL][DOCS] Update docs about ANSI interval literals ### What changes were proposed in this pull request? In the PR, I propose to update the doc page https://spark.apache.org/docs/latest/sql-ref-literals.html#interval-literal, and describe formats of ANSI interval literals. https://user-images.githubusercontent.com/1580697/128988454-7a6ac435-409b-4961-9b79-ebecfb141d5e.png;> https://user-images.githubusercontent.com/1580697/128912018-a4ea3ee5-f252-49c7-a90e-5beaf7ac868f.png;> ### Why are the changes needed? To improve UX with Spark SQL, and inform users about recently added ANSI interval literals. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manually checked the generated docs: ``` $ SKIP_API=1 SKIP_RDOC=1 SKIP_PYTHONDOC=1 SKIP_SCALADOC=1 bundle exec jekyll build ``` Closes #33693 from MaxGekk/doc-ansi-interval-literals. Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit bbf988bd73b00d18dd1d443f225b3915a2c4433f) Signed-off-by: Max Gekk --- docs/sql-ref-literals.md | 91 +--- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/docs/sql-ref-literals.md b/docs/sql-ref-literals.md index b23c63a..355ac69 100644 --- a/docs/sql-ref-literals.md +++ b/docs/sql-ref-literals.md @@ -439,13 +439,78 @@ SELECT TIMESTAMP '1997-01' AS col; An interval literal is used to specify a fixed period of time. 
+ ANSI style + +The ANSI SQL standard defines interval literals in the form: +```sql +INTERVAL [ ] +``` +where `` can be a single field or in the field-to-field form: +```sql + ::= TO | +``` +The field name is case-insensitive, and can be one of `YEAR`, `MONTH`, `DAY`, `HOUR`, `MINUTE` and `SECOND`. + +An interval literal can have either year-month or day-time interval type. The interval sub-type defines format of ``: +```sql + ::= [ ] { | } + ::= [ ] | + ::= | + ::= [ [ [ ] ] ] + ::= [ [ ] ] + | [ ] + | +``` + +Supported year-month interval literals and theirs formats: + +|``|Interval string pattern|An instance of the literal| +|-|---|| +|YEAR|`[+|-]'[+|-]y'`|`INTERVAL -'2021' YEAR`| +|YEAR TO MONTH|`[+|-]'[+|-]y-m'`|`INTERVAL '-2021-07' YEAR TO MONTH`| +|MONTH|`[+|-]'[+|-]m'`|`interval '10' month`| + +Formats of supported day-time interval literals: + +|``|Interval string pattern|An instance of the literal| +|-||---| +|DAY|`[+|-]'[+|-]d'`|`INTERVAL -'100' DAY`| +|DAY TO HOUR|`[+|-]'[+|-]d h'`|`INTERVAL '-100 10' DAY TO HOUR`| +|DAY TO MINUTE|`[+|-]'[+|-]d h:m'`|`INTERVAL '100 10:30' DAY TO MINUTE`| +|DAY TO SECOND|`[+|-]'[+|-]d h:m:s.n'`|`INTERVAL '100 10:30:40.99' DAY TO SECOND`| +|HOUR|`[+|-]'[+|-]h'`|`INTERVAL '123' HOUR`| +|HOUR TO MINUTE|`[+|-]'[+|-]h:m'`|`INTERVAL -'-123:10' HOUR TO MINUTE`| +|HOUR TO SECOND|`[+|-]'[+|-]h:m:s.n'`|`INTERVAL '123:10:59' HOUR TO SECOND`| +|MINUTE|`[+|-]'[+|-]m'`|`interval '1000' minute`| +|MINUTE TO SECOND|`[+|-]'[+|-]m:s.n'`|`INTERVAL '1000:01.001' MINUTE TO SECOND`| +|SECOND|`[+|-]'[+|-]s.n'`|`INTERVAL '1000.01' SECOND`| + +# Examples + +```sql +SELECT INTERVAL '2-3' YEAR TO MONTH AS col; +++ +|col | +++ +|INTERVAL '2-3' YEAR TO MONTH| +++ + +SELECT INTERVAL -'20 15:40:32.9989' DAY TO SECOND AS col; +++ +|col | +++ +|INTERVAL '-20 15:40:32.998999' DAY TO SECOND| +++ +``` + + Multi-units style + ```sql INTERVAL interval_value interval_unit [ interval_value interval_unit ... 
] | INTERVAL 'interval_value interval_unit [ interval_value interval_unit ... ]' | -INTERVAL interval_string_value interval_unit TO interval_unit ``` - Parameters +# Parameters * **interval_value** @@ -453,10 +518,6 @@ INTERVAL interval_string_value interval_unit TO interval_unit [ + | - ] number_value | '[ + | - ] number_value' -* **interval_string_value** - - year-month/day-time interval string. - * **interval_unit** **Syntax:** @@ -464,7 +525,9 @@ INTERVAL interval_string_value interval_unit TO interval_unit YEAR[S] | MONTH[S] | WEEK[S] | DAY[S] |
[spark] branch master updated (f7c85b8 -> bbf988b)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from f7c85b8 [SPARK-36456][CORE][SQL][SS] Clean up compilation warnings related to `method closeQuietly in class IOUtils is deprecated` add bbf988b [SPARK-36468][SQL][DOCS] Update docs about ANSI interval literals No new revisions were added by this update. Summary of changes: docs/sql-ref-literals.md | 91 +--- 1 file changed, 70 insertions(+), 21 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36431][SQL] Support TypeCoercion of ANSI intervals with different fields
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new fb6f379 [SPARK-36431][SQL] Support TypeCoercion of ANSI intervals with different fields fb6f379 is described below commit fb6f3792afacead3299e86cc94f3f1460b3a4ba1 Author: Angerszh AuthorDate: Tue Aug 10 14:22:31 2021 +0300 [SPARK-36431][SQL] Support TypeCoercion of ANSI intervals with different fields ### What changes were proposed in this pull request? Support TypeCoercion of ANSI intervals with different fields ### Why are the changes needed? Support TypeCoercion of ANSI intervals with different fields ### Does this PR introduce _any_ user-facing change? After this pr user can - use comparison function with different fields of DayTimeIntervalType/YearMonthIntervalType such as `INTERVAL '1' YEAR` > `INTERVAL '11' MONTH` - support different field of ansi interval type in collection function such as `array(INTERVAL '1' YEAR, INTERVAL '11' MONTH)` - support different field of ansi interval type in `coalesce` etc.. ### How was this patch tested? Added UT Closes #33661 from AngersZh/SPARK-SPARK-36431. 
Authored-by: Angerszh Signed-off-by: Max Gekk (cherry picked from commit 89d8a4eacfd09f67ad31bf1cbf7d4b88de3b1e24) Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/AnsiTypeCoercion.scala | 5 ++ .../spark/sql/catalyst/analysis/TypeCoercion.scala | 5 ++ .../sql/catalyst/analysis/TypeCoercionSuite.scala | 47 +++ .../test/resources/sql-tests/inputs/interval.sql | 13 +++ .../sql-tests/results/ansi/interval.sql.out| 98 +- .../resources/sql-tests/results/interval.sql.out | 98 +- 6 files changed, 258 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala index 457dc10..f03296f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AnsiTypeCoercion.scala @@ -120,6 +120,11 @@ object AnsiTypeCoercion extends TypeCoercionBase { case (_: TimestampType, _: DateType) | (_: DateType, _: TimestampType) => Some(TimestampType) +case (t1: DayTimeIntervalType, t2: DayTimeIntervalType) => + Some(DayTimeIntervalType(t1.startField.min(t2.startField), t1.endField.max(t2.endField))) +case (t1: YearMonthIntervalType, t2: YearMonthIntervalType) => + Some(YearMonthIntervalType(t1.startField.min(t2.startField), t1.endField.max(t2.endField))) + case (t1, t2) => findTypeForComplex(t1, t2, findTightestCommonType) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 42c10e8..db6f499 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -867,6 +867,11 @@ object TypeCoercion extends TypeCoercionBase { case (_: TimestampType, _: 
DateType) | (_: DateType, _: TimestampType) => Some(TimestampType) + case (t1: DayTimeIntervalType, t2: DayTimeIntervalType) => +Some(DayTimeIntervalType(t1.startField.min(t2.startField), t1.endField.max(t2.endField))) + case (t1: YearMonthIntervalType, t2: YearMonthIntervalType) => +Some(YearMonthIntervalType(t1.startField.min(t2.startField), t1.endField.max(t2.endField))) + case (_: TimestampNTZType, _: DateType) | (_: DateType, _: TimestampNTZType) => Some(TimestampNTZType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 602daf8..6a7d7ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import java.sql.Timestamp +import java.time.{Duration, Period} import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.catalyst.analysis.TypeCoercion._ @@ -1604,6 +1605,52 @@ class TypeCoercionSuite extends AnalysisTest { ruleTest(TypeCoercion.IntegralDivision, IntegralDivide(2, 1L), IntegralDivide(Cast(2, LongType), 1L)) } + + test("SPARK-364
[spark] branch master updated (7f56b73c -> 89d8a4e)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 7f56b73c [SPARK-36466][SQL] Table in unloaded catalog referenced by view should load correctly add 89d8a4e [SPARK-36431][SQL] Support TypeCoercion of ANSI intervals with different fields No new revisions were added by this update. Summary of changes: .../sql/catalyst/analysis/AnsiTypeCoercion.scala | 5 ++ .../spark/sql/catalyst/analysis/TypeCoercion.scala | 5 ++ .../sql/catalyst/analysis/TypeCoercionSuite.scala | 47 +++ .../test/resources/sql-tests/inputs/interval.sql | 13 +++ .../sql-tests/results/ansi/interval.sql.out| 98 +- .../resources/sql-tests/results/interval.sql.out | 98 +- 6 files changed, 258 insertions(+), 8 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new bd33408 [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources bd33408 is described below commit bd33408b4b5aefc5b83ab1355bb0c1faacad190c Author: Max Gekk AuthorDate: Tue Aug 3 20:30:20 2021 +0300 [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources ### What changes were proposed in this pull request? In the PR, I propose to ban `YearMonthIntervalType` and `DayTimeIntervalType` at the analysis phase while creating a table using a built-in filed-based datasource or writing a dataset to such datasource. In particular, add the following case: ```scala case _: DayTimeIntervalType | _: YearMonthIntervalType => false ``` to all methods that override either: - V2 `FileTable.supportsDataType()` - V1 `FileFormat.supportDataType()` ### Why are the changes needed? To improve user experience with Spark SQL, and output a proper error message at the analysis phase. ### Does this PR introduce _any_ user-facing change? Yes but ANSI interval types haven't released yet. So, for users this is new behavior. ### How was this patch tested? By running the affected test suites: ``` $ build/sbt -Phive-2.3 "test:testOnly *HiveOrcSourceSuite" ``` Closes #33580 from MaxGekk/interval-ban-in-ds. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit 67cbc932638179925ebbeb76d6d6e6f25a3cb2e2) Signed-off-by: Max Gekk --- .../org/apache/spark/sql/avro/AvroUtils.scala | 2 ++ .../execution/datasources/csv/CSVFileFormat.scala | 2 ++ .../datasources/json/JsonFileFormat.scala | 2 ++ .../execution/datasources/orc/OrcFileFormat.scala | 2 ++ .../datasources/parquet/ParquetFileFormat.scala| 2 ++ .../execution/datasources/v2/csv/CSVTable.scala| 4 +++- .../execution/datasources/v2/json/JsonTable.scala | 2 ++ .../execution/datasources/v2/orc/OrcTable.scala| 2 ++ .../datasources/v2/parquet/ParquetTable.scala | 2 ++ .../datasources/CommonFileDataSourceSuite.scala| 25 +- .../apache/spark/sql/hive/orc/OrcFileFormat.scala | 2 ++ 11 files changed, 45 insertions(+), 2 deletions(-) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala index f09af74..68b393e 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala @@ -71,6 +71,8 @@ private[sql] object AvroUtils extends Logging { } def supportsDataType(dataType: DataType): Boolean = dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case st: StructType => st.forall { f => supportsDataType(f.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index d40ad9d..c3a8a95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -148,6 +148,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = 
other.isInstanceOf[CSVFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case udt: UserDefinedType[_] => supportDataType(udt.sqlType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 9c6c77a..7ffeba4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -134,6 +134,8 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case st: StructType => st.forall { f => supportDataType(f.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/executi
[spark] branch master updated: [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 67cbc93 [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources 67cbc93 is described below commit 67cbc932638179925ebbeb76d6d6e6f25a3cb2e2 Author: Max Gekk AuthorDate: Tue Aug 3 20:30:20 2021 +0300 [SPARK-36349][SQL] Disallow ANSI intervals in file-based datasources ### What changes were proposed in this pull request? In the PR, I propose to ban `YearMonthIntervalType` and `DayTimeIntervalType` at the analysis phase while creating a table using a built-in filed-based datasource or writing a dataset to such datasource. In particular, add the following case: ```scala case _: DayTimeIntervalType | _: YearMonthIntervalType => false ``` to all methods that override either: - V2 `FileTable.supportsDataType()` - V1 `FileFormat.supportDataType()` ### Why are the changes needed? To improve user experience with Spark SQL, and output a proper error message at the analysis phase. ### Does this PR introduce _any_ user-facing change? Yes but ANSI interval types haven't released yet. So, for users this is new behavior. ### How was this patch tested? By running the affected test suites: ``` $ build/sbt -Phive-2.3 "test:testOnly *HiveOrcSourceSuite" ``` Closes #33580 from MaxGekk/interval-ban-in-ds. 
Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../org/apache/spark/sql/avro/AvroUtils.scala | 2 ++ .../execution/datasources/csv/CSVFileFormat.scala | 2 ++ .../datasources/json/JsonFileFormat.scala | 2 ++ .../execution/datasources/orc/OrcFileFormat.scala | 2 ++ .../datasources/parquet/ParquetFileFormat.scala| 2 ++ .../execution/datasources/v2/csv/CSVTable.scala| 4 +++- .../execution/datasources/v2/json/JsonTable.scala | 2 ++ .../execution/datasources/v2/orc/OrcTable.scala| 2 ++ .../datasources/v2/parquet/ParquetTable.scala | 2 ++ .../datasources/CommonFileDataSourceSuite.scala| 25 +- .../apache/spark/sql/hive/orc/OrcFileFormat.scala | 2 ++ 11 files changed, 45 insertions(+), 2 deletions(-) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala index 149d0b6..328927f 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala @@ -71,6 +71,8 @@ private[sql] object AvroUtils extends Logging { } def supportsDataType(dataType: DataType): Boolean = dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case st: StructType => st.forall { f => supportsDataType(f.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index d40ad9d..c3a8a95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -148,6 +148,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[CSVFileFormat] override def supportDataType(dataType: DataType): Boolean = 
dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case udt: UserDefinedType[_] => supportDataType(udt.sqlType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 9c6c77a..7ffeba4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -134,6 +134,8 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister { override def equals(other: Any): Boolean = other.isInstanceOf[JsonFileFormat] override def supportDataType(dataType: DataType): Boolean = dataType match { +case _: DayTimeIntervalType | _: YearMonthIntervalType => false + case _: AtomicType => true case st: StructType => st.forall { f => supportDataType(f.dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/
[spark] branch branch-3.2 updated (8d817dc -> cc75618)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git. from 8d817dc [SPARK-36315][SQL] Only skip AQEShuffleReadRule in the final stage if it breaks the distribution requirement add cc75618 [SPARK-35815][SQL][FOLLOWUP] Add test considering the case spark.sql.legacy.interval.enabled is true No new revisions were added by this update. Summary of changes: .../sql/streaming/EventTimeWatermarkSuite.scala| 105 +++-- 1 file changed, 56 insertions(+), 49 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (dd80457 -> 92cdb17)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from dd80457 [SPARK-36315][SQL] Only skip AQEShuffleReadRule in the final stage if it breaks the distribution requirement add 92cdb17 [SPARK-35815][SQL][FOLLOWUP] Add test considering the case spark.sql.legacy.interval.enabled is true No new revisions were added by this update. Summary of changes: .../sql/streaming/EventTimeWatermarkSuite.scala| 105 +++-- 1 file changed, 56 insertions(+), 49 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36380][SQL] Simplify the logical plan names for ALTER TABLE ... COLUMN
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 7c58684 [SPARK-36380][SQL] Simplify the logical plan names for ALTER TABLE ... COLUMN 7c58684 is described below commit 7c586842d71064169aa77baf666a8566d9ed785e Author: Wenchen Fan AuthorDate: Tue Aug 3 10:43:00 2021 +0300 [SPARK-36380][SQL] Simplify the logical plan names for ALTER TABLE ... COLUMN ### What changes were proposed in this pull request? This a followup of the recent work such as https://github.com/apache/spark/pull/33200 For `ALTER TABLE` commands, the logical plans do not have the common `AlterTable` prefix in the name and just use names like `SetTableLocation`. This PR proposes to follow the same naming rule in `ALTER TABE ... COLUMN` commands. This PR also moves these AlterTable commands to a individual file and give them a base trait. ### Why are the changes needed? name simplification ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? existing test Closes #33609 from cloud-fan/dsv2. 
Authored-by: Wenchen Fan Signed-off-by: Max Gekk (cherry picked from commit 7cb9c1c2415a0984515e4d4733f816673e4ae3c8) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/analysis/Analyzer.scala | 12 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 12 +- .../plans/logical/v2AlterTableCommands.scala | 230 + .../sql/catalyst/plans/logical/v2Commands.scala| 193 + .../spark/sql/catalyst/parser/DDLParserSuite.scala | 54 ++--- .../catalyst/analysis/ResolveSessionCatalog.scala | 10 +- .../datasources/v2/DataSourceV2Strategy.scala | 26 +-- .../connector/V2CommandsCaseSensitivitySuite.scala | 26 +-- .../execution/command/PlanResolutionSuite.scala| 12 +- 10 files changed, 303 insertions(+), 284 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index a9c085a..75fad11a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -269,7 +269,7 @@ class Analyzer(override val catalogManager: CatalogManager) ResolveRelations :: ResolveTables :: ResolvePartitionSpec :: - ResolveAlterTableColumnCommands :: + ResolveAlterTableCommands :: AddMetadataColumns :: DeduplicateRelations :: ResolveReferences :: @@ -3607,15 +3607,15 @@ class Analyzer(override val catalogManager: CatalogManager) * Rule to mostly resolve, normalize and rewrite column names based on case sensitivity * for alter table column commands. 
*/ - object ResolveAlterTableColumnCommands extends Rule[LogicalPlan] { + object ResolveAlterTableCommands extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { - case a: AlterTableColumnCommand if a.table.resolved && hasUnresolvedFieldName(a) => + case a: AlterTableCommand if a.table.resolved && hasUnresolvedFieldName(a) => val table = a.table.asInstanceOf[ResolvedTable] a.transformExpressions { case u: UnresolvedFieldName => resolveFieldNames(table, u.name, u) } - case a @ AlterTableAddColumns(r: ResolvedTable, cols) if !a.resolved => + case a @ AddColumns(r: ResolvedTable, cols) if !a.resolved => // 'colsToAdd' keeps track of new columns being added. It stores a mapping from a // normalized parent name of fields to field names that belong to the parent. // For example, if we add columns "a.b.c", "a.b.d", and "a.c", 'colsToAdd' will become @@ -3668,7 +3668,7 @@ class Analyzer(override val catalogManager: CatalogManager) resolved.copyTagsFrom(a) resolved - case a @ AlterTableAlterColumn( + case a @ AlterColumn( table: ResolvedTable, ResolvedFieldName(path, field), dataType, _, _, position) => val newDataType = dataType.flatMap { dt => // Hive style syntax provides the column type, even if it may not have changed. @@ -3705,7 +3705,7 @@ class Analyzer(override val catalogManager: CatalogManager) }.getOrElse(throw QueryCompilationErrors.missingFieldError(fieldName, table, context.origin)) } -private def hasUnresolvedFieldName(a: AlterTableColumnCommand): Boolean = { +private def hasUnresolvedFieldName(a: AlterTableComma
[spark] branch master updated (8ca11fe -> 7cb9c1c)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 8ca11fe [SPARK-36192][PYTHON] Better error messages for DataTypeOps against lists add 7cb9c1c [SPARK-36380][SQL] Simplify the logical plan names for ALTER TABLE ... COLUMN No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/analysis/Analyzer.scala | 12 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 12 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 12 +- .../plans/logical/v2AlterTableCommands.scala | 230 + .../sql/catalyst/plans/logical/v2Commands.scala| 193 + .../spark/sql/catalyst/parser/DDLParserSuite.scala | 54 ++--- .../catalyst/analysis/ResolveSessionCatalog.scala | 10 +- .../datasources/v2/DataSourceV2Strategy.scala | 26 +-- .../connector/V2CommandsCaseSensitivitySuite.scala | 26 +-- .../execution/command/PlanResolutionSuite.scala| 12 +- 10 files changed, 303 insertions(+), 284 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2AlterTableCommands.scala - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new d247a6cd [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow d247a6cd is described below commit d247a6cd1a2240fcfb45f288cfb6cc6236368c5e Author: Kousuke Saruta AuthorDate: Thu Jul 29 08:51:51 2021 +0300 [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow ### What changes were proposed in this pull request? This PR proposes to support ANSI interval literals for `TimeWindow`. ### Why are the changes needed? Watermark also supports ANSI interval literals so it's great to support for `TimeWindow`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Closes #33551 from sarutak/window-interval. Authored-by: Kousuke Saruta Signed-off-by: Max Gekk (cherry picked from commit db18866742a4641e7119f637024bc89a3f048634) Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/TimeWindow.scala | 5 +- .../spark/sql/catalyst/util/IntervalUtils.scala| 23 +++- .../spark/sql/errors/QueryCompilationErrors.scala | 4 +- .../sql/catalyst/expressions/TimeWindowSuite.scala | 62 +++--- .../main/scala/org/apache/spark/sql/Dataset.scala | 20 +-- 5 files changed, 82 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index 8475c1f..2f08fd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_DAY import org.apache.spark.sql.catalyst.util.IntervalUtils import 
org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String case class TimeWindow( timeColumn: Expression, @@ -110,12 +109,12 @@ object TimeWindow { * precision. */ def getIntervalInMicroSeconds(interval: String): Long = { -val cal = IntervalUtils.stringToInterval(UTF8String.fromString(interval)) +val cal = IntervalUtils.fromIntervalString(interval) if (cal.months != 0) { throw new IllegalArgumentException( s"Intervals greater than a month is not supported ($interval).") } -cal.days * MICROS_PER_DAY + cal.microseconds +Math.addExact(Math.multiplyExact(cal.days, MICROS_PER_DAY), cal.microseconds) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index dc6c02e..62a2657 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -25,11 +25,14 @@ import java.util.concurrent.TimeUnit import scala.collection.mutable import scala.util.control.NonFatal +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToMicros import org.apache.spark.sql.catalyst.util.IntervalStringStyles.{ANSI_STYLE, HIVE_STYLE, IntervalStyle} -import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ import org.apache.spark.sql.types.{DayTimeIntervalType => DT, Decimal, YearMonthIntervalType => YM} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -433,6 +436,24 @@ object IntervalUtils { } } + /** 
+ * Parse all kinds of interval literals including unit-to-unit form and unit list form + */ + def fromIntervalString(input: String): CalendarInterval = try { +if (input.toLowerCase(Locale.ROOT).trim.startsWith("interval")) { + CatalystSqlParser.parseExpression(input) match { +case Literal(months: Int, _: YearMonthIntervalType) => new CalendarInterval(months, 0, 0) +case Literal(micros: Long, _: DayTimeIntervalType) => new CalendarInterval(0, 0, micros) +case Literal(cal: CalendarInterval, CalendarIntervalType) => cal + } +} else { + stringToInterval(UTF8String.fromString(input)) +} + } catch { +case NonFatal(e) => + throw QueryComp
[spark] branch master updated: [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new db18866 [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow db18866 is described below commit db18866742a4641e7119f637024bc89a3f048634 Author: Kousuke Saruta AuthorDate: Thu Jul 29 08:51:51 2021 +0300 [SPARK-36323][SQL] Support ANSI interval literals for TimeWindow ### What changes were proposed in this pull request? This PR proposes to support ANSI interval literals for `TimeWindow`. ### Why are the changes needed? Watermark also supports ANSI interval literals so it's great to support for `TimeWindow`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Closes #33551 from sarutak/window-interval. Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/TimeWindow.scala | 5 +- .../spark/sql/catalyst/util/IntervalUtils.scala| 23 +++- .../spark/sql/errors/QueryCompilationErrors.scala | 4 +- .../sql/catalyst/expressions/TimeWindowSuite.scala | 62 +++--- .../main/scala/org/apache/spark/sql/Dataset.scala | 20 +-- 5 files changed, 82 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala index 8475c1f..2f08fd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TimeWindow.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_DAY import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.types._ -import 
org.apache.spark.unsafe.types.UTF8String case class TimeWindow( timeColumn: Expression, @@ -110,12 +109,12 @@ object TimeWindow { * precision. */ def getIntervalInMicroSeconds(interval: String): Long = { -val cal = IntervalUtils.stringToInterval(UTF8String.fromString(interval)) +val cal = IntervalUtils.fromIntervalString(interval) if (cal.months != 0) { throw new IllegalArgumentException( s"Intervals greater than a month is not supported ($interval).") } -cal.days * MICROS_PER_DAY + cal.microseconds +Math.addExact(Math.multiplyExact(cal.days, MICROS_PER_DAY), cal.microseconds) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index dc6c02e..62a2657 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -25,11 +25,14 @@ import java.util.concurrent.TimeUnit import scala.collection.mutable import scala.util.control.NonFatal +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToMicros import org.apache.spark.sql.catalyst.util.IntervalStringStyles.{ANSI_STYLE, HIVE_STYLE, IntervalStyle} -import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ import org.apache.spark.sql.types.{DayTimeIntervalType => DT, Decimal, YearMonthIntervalType => YM} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -433,6 +436,24 @@ object IntervalUtils { } } + /** + * Parse all kinds of interval literals including unit-to-unit form and unit list form + */ + 
def fromIntervalString(input: String): CalendarInterval = try { +if (input.toLowerCase(Locale.ROOT).trim.startsWith("interval")) { + CatalystSqlParser.parseExpression(input) match { +case Literal(months: Int, _: YearMonthIntervalType) => new CalendarInterval(months, 0, 0) +case Literal(micros: Long, _: DayTimeIntervalType) => new CalendarInterval(0, 0, micros) +case Literal(cal: CalendarInterval, CalendarIntervalType) => cal + } +} else { + stringToInterval(UTF8String.fromString(input)) +} + } catch { +case NonFatal(e) => + throw QueryCompilationErrors.cannotParseIntervalError(input, e) + } + private val dayTimePatternLegacy = &qu
[spark] branch branch-3.2 updated (dcd37f9 -> 3d86128)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git. from dcd37f9 Revert "[SPARK-36136][SQL][TESTS] Refactor PruneFileSourcePartitionsSuite etc to a different package" add 3d86128 [SPARK-34619][SQL][DOCS][3.2] Describe ANSI interval types at the Data types page of the SQL reference No new revisions were added by this update. Summary of changes: docs/sql-ref-datatypes.md | 38 ++ 1 file changed, 38 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated (3ee9a0d -> c5697d0)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git. from 3ee9a0d [SPARK-35815][SQL] Allow delayThreshold for watermark to be represented as ANSI interval literals add c5697d0 [SPARK-36257][SQL][3.2] Updated the version of TimestampNTZ related changes as 3.3.0 No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 10 +- .../scala/org/apache/spark/sql/types/TimestampNTZType.scala| 4 ++-- sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-35815][SQL] Allow delayThreshold for watermark to be represented as ANSI interval literals
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 3ee9a0d [SPARK-35815][SQL] Allow delayThreshold for watermark to be represented as ANSI interval literals 3ee9a0d is described below commit 3ee9a0db3a3eb8e88bbab28207c91bd4637b313a Author: Kousuke Saruta AuthorDate: Thu Jul 22 17:36:22 2021 +0300 [SPARK-35815][SQL] Allow delayThreshold for watermark to be represented as ANSI interval literals ### What changes were proposed in this pull request? This PR extends the way to represent `delayThreshold` with ANSI interval literals for watermark. ### Why are the changes needed? A `delayThreshold` is semantically an interval value so it should be represented as ANSI interval literals as well as the conventional `1 second` form. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New tests. Closes #33456 from sarutak/delayThreshold-interval. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk (cherry picked from commit 07fa38e2c1082c2b69b3bf9489cee4dfe4db2c26) Signed-off-by: Max Gekk --- .../spark/sql/errors/QueryCompilationErrors.scala | 2 +- .../main/scala/org/apache/spark/sql/Dataset.scala | 23 ++--- .../sql/streaming/EventTimeWatermarkSuite.scala| 54 ++ 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 7a33d52a..f576036 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -2265,7 +2265,7 @@ private[spark] object QueryCompilationErrors { s"""Cannot resolve column name "$colName" among (${fieldsStr})${extraMsg}""") } - def cannotParseTimeDelayError(delayThreshold: String, e: IllegalArgumentException): Throwable = { + def cannotParseTimeDelayError(delayThreshold: String, e: Throwable): Throwable = { new AnalysisException(s"Unable to parse time delay '$delayThreshold'", cause = Some(e)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 0fd10c1..3abc060 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import java.io.{ByteArrayOutputStream, CharArrayWriter, DataOutputStream} +import java.util.Locale import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashSet} @@ -42,7 +43,7 @@ import org.apache.spark.sql.catalyst.encoders._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JSONOptions} import org.apache.spark.sql.catalyst.optimizer.CombineUnions -import 
org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils} +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException, ParserUtils} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} @@ -63,7 +64,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.unsafe.array.ByteArrayMethods -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.util.Utils private[sql] object Dataset { @@ -739,13 +740,21 @@ class Dataset[T] private[sql]( // We only accept an existing column name, not a derived column here as a watermark that is // defined on a derived column cannot referenced elsewhere in the plan. def withWatermark(eventTime: String, delayThreshold: String): Dataset[T] = withTypedPlan { -val parsedDelay = - try { +val parsedDelay = try { + if (delayThreshold.toLowerCase(Locale.ROOT).trim.startsWith("interval")) { +CatalystSqlParser.parseExpression(delayThreshold) match { + case Literal(months: Int, _: YearMonthIntervalType) => +new CalendarInterval(months, 0, 0) + case Literal(micros: Long, _: DayTimeIntervalType) => +new CalendarInterval(0, 0, micros) +} + } else { IntervalUtils.stringToInterval(UTF8String.fromString(delayThreshold)) - } cat
[spark] branch master updated (3a1db2d -> 07fa38e)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 3a1db2d [SPARK-36209][PYTHON][DOCS] Fix link to pyspark Dataframe documentation add 07fa38e [SPARK-35815][SQL] Allow delayThreshold for watermark to be represented as ANSI interval literals No new revisions were added by this update. Summary of changes: .../spark/sql/errors/QueryCompilationErrors.scala | 2 +- .../main/scala/org/apache/spark/sql/Dataset.scala | 23 ++--- .../sql/streaming/EventTimeWatermarkSuite.scala| 54 ++ 3 files changed, 71 insertions(+), 8 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36208][SQL][3.2] SparkScriptTransformation should support ANSI interval types
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 468165a [SPARK-36208][SQL][3.2] SparkScriptTransformation should support ANSI interval types 468165a is described below commit 468165ae52aa788e3fa59f385225b90c616bfa0f Author: Kousuke Saruta AuthorDate: Wed Jul 21 20:54:18 2021 +0300 [SPARK-36208][SQL][3.2] SparkScriptTransformation should support ANSI interval types ### What changes were proposed in this pull request? This PR changes `BaseScriptTransformationExec` for `SparkScriptTransformationExec` to support ANSI interval types. ### Why are the changes needed? `SparkScriptTransformationExec` supports `CalendarIntervalType` so it's better to support ANSI interval types as well. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New test. Authored-by: Kousuke Saruta Signed-off-by: Max Gekk (cherry picked from commit f56c7b71ff27e6f5379f3699c2dcb5f79a0ae791) Signed-off-by: Max Gekk Closes #33463 from MaxGekk/sarutak_script-transformation-interval-3.2. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk --- .../execution/BaseScriptTransformationExec.scala| 8 .../execution/BaseScriptTransformationSuite.scala | 21 + 2 files changed, 29 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala index 7835981..e249cd6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala @@ -223,6 +223,14 @@ trait BaseScriptTransformationExec extends UnaryExecNode { case CalendarIntervalType => wrapperConvertException( data => IntervalUtils.stringToInterval(UTF8String.fromString(data)), converter) + case YearMonthIntervalType(start, end) => wrapperConvertException( +data => IntervalUtils.monthsToPeriod( + IntervalUtils.castStringToYMInterval(UTF8String.fromString(data), start, end)), +converter) + case DayTimeIntervalType(start, end) => wrapperConvertException( +data => IntervalUtils.microsToDuration( + IntervalUtils.castStringToDTInterval(UTF8String.fromString(data), start, end)), +converter) case _: ArrayType | _: MapType | _: StructType => val complexTypeFactory = JsonToStructs(attr.dataType, ioschema.outputSerdeProps.toMap, Literal(null), Some(conf.sessionLocalTimeZone)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala index c845dd81..9d8fcda 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala @@ -633,6 +633,27 @@ abstract class BaseScriptTransformationSuite extends SparkPlanTest with SQLTestU } } } + + test("SPARK-36208: TRANSFORM should support 
ANSI interval (no serde)") { +assume(TestUtils.testCommandAvailable("python")) +withTempView("v") { + val df = Seq( +(Period.of(1, 2, 0), Duration.ofDays(1).plusHours(2).plusMinutes(3).plusSeconds(4)) + ).toDF("ym", "dt") + + checkAnswer( +df, +(child: SparkPlan) => createScriptTransformationExec( + script = "cat", + output = Seq( +AttributeReference("ym", YearMonthIntervalType())(), +AttributeReference("dt", DayTimeIntervalType())()), + child = child, + ioschema = defaultIOSchema +), +df.select($"ym", $"dt").collect()) +} + } } case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (94aece4 -> f56c7b7)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 94aece4 [SPARK-36020][SQL][FOLLOWUP] RemoveRedundantProjects should retain the LOGICAL_PLAN_TAG tag add f56c7b7 [SPARK-36208][SQL] SparkScriptTransformation should support ANSI interval types No new revisions were added by this update. Summary of changes: .../execution/BaseScriptTransformationExec.scala| 8 .../execution/BaseScriptTransformationSuite.scala | 21 + 2 files changed, 29 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36222][SQL] Step by days in the Sequence expression for dates
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 9a7c59c [SPARK-36222][SQL] Step by days in the Sequence expression for dates 9a7c59c is described below commit 9a7c59c99ce69411485acf382dfc9be053927b59 Author: gengjiaan AuthorDate: Tue Jul 20 19:16:56 2021 +0300 [SPARK-36222][SQL] Step by days in the Sequence expression for dates ### What changes were proposed in this pull request? The current implementation of the `Sequence` expression does not support stepping by days for dates. ``` spark-sql> select sequence(date'2021-07-01', date'2021-07-10', interval '3' day); Error in query: cannot resolve 'sequence(DATE '2021-07-01', DATE '2021-07-10', INTERVAL '3' DAY)' due to data type mismatch: sequence uses the wrong parameter type. The parameter type must conform to: 1. The start and stop expressions must resolve to the same type. 2. If start and stop expressions resolve to the 'date' or 'timestamp' type then the step expression must resolve to the 'interval' or 'interval year to month' or 'interval day to second' type, otherwise to the same type as the start and stop expressions. ; line 1 pos 7; 'Project [unresolvedalias(sequence(2021-07-01, 2021-07-10, Some(INTERVAL '3' DAY), Some(Europe/Moscow)), None)] +- OneRowRelation ``` ### Why are the changes needed? A `DayTimeInterval` with day granularity should be usable as a step for dates. ### Does this PR introduce _any_ user-facing change? 'Yes'. The Sequence expression will support stepping by a `DayTimeInterval` with day granularity for dates. ### How was this patch tested? New tests. Closes #33439 from beliefer/SPARK-36222. 
Authored-by: gengjiaan Signed-off-by: Max Gekk (cherry picked from commit c0d84e6cf1046b7944796038414ef21fe9c7e3b5) Signed-off-by: Max Gekk --- .../expressions/collectionOperations.scala | 17 -- .../expressions/CollectionExpressionsSuite.scala | 61 -- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 730b8d0..2c3312a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -2574,7 +2574,8 @@ case class Sequence( DayTimeIntervalType.acceptsType(stepType) case DateType => stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepType) || - YearMonthIntervalType.acceptsType(stepType) + YearMonthIntervalType.acceptsType(stepType) || + DayTimeIntervalType.acceptsType(stepType) case _: IntegralType => stepOpt.isEmpty || stepType.sameType(startType) case _ => false @@ -2626,8 +2627,10 @@ case class Sequence( case DateType => if (stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepOpt.get.dataType)) { new TemporalSequenceImpl[Int](IntegerType, start.dataType, MICROS_PER_DAY, _.toInt, zoneId) - } else { + } else if (YearMonthIntervalType.acceptsType(stepOpt.get.dataType)) { new PeriodSequenceImpl[Int](IntegerType, start.dataType, MICROS_PER_DAY, _.toInt, zoneId) + } else { +new DurationSequenceImpl[Int](IntegerType, start.dataType, MICROS_PER_DAY, _.toInt, zoneId) } } @@ -2807,15 +2810,19 @@ object Sequence { val intervalType: DataType = DayTimeIntervalType() def splitStep(input: Any): (Int, Int, Long) = { - (0, 0, input.asInstanceOf[Long]) + val duration = input.asInstanceOf[Long] + val days = IntervalUtils.getDays(duration) + val micros = duration - days * MICROS_PER_DAY + (0, days, micros) } def 
stepSplitCode( stepMonths: String, stepDays: String, stepMicros: String, step: String): String = { s""" |final int $stepMonths = 0; - |final int $stepDays = 0; - |final long $stepMicros = $step; + |final int $stepDays = + | (int) org.apache.spark.sql.catalyst.util.IntervalUtils.getDays($step); + |final long $stepMicros = $step - $stepDays * ${MICROS_PER_DAY}L; """.stripMargin } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index bfecbf5..caa5e96 100644 --- a/sql/catalyst/src/test/sc
[spark] branch master updated (bf680bf -> c0d84e6)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from bf680bf [SPARK-36210][SQL] Preserve column insertion order in Dataset.withColumns add c0d84e6 [SPARK-36222][SQL] Step by days in the Sequence expression for dates No new revisions were added by this update. Summary of changes: .../expressions/collectionOperations.scala | 17 -- .../expressions/CollectionExpressionsSuite.scala | 61 -- 2 files changed, 68 insertions(+), 10 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36090][SQL] Support TimestampNTZType in expression Sequence
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 85f70a1 [SPARK-36090][SQL] Support TimestampNTZType in expression Sequence 85f70a1 is described below commit 85f70a1181b1b11417c197cee411e0ec9ced2373 Author: gengjiaan AuthorDate: Sun Jul 18 20:46:23 2021 +0300 [SPARK-36090][SQL] Support TimestampNTZType in expression Sequence ### What changes were proposed in this pull request? The current implement of `Sequence` accept `TimestampType`, `DateType` and `IntegralType`. This PR will let `Sequence` accepts `TimestampNTZType`. ### Why are the changes needed? We can generate sequence for timestamp without time zone. ### Does this PR introduce _any_ user-facing change? 'Yes'. This PR will let `Sequence` accepts `TimestampNTZType`. ### How was this patch tested? New tests. Closes #33360 from beliefer/SPARK-36090. 
Lead-authored-by: gengjiaan Co-authored-by: Jiaan Geng Signed-off-by: Max Gekk (cherry picked from commit 42275bb20d6849ee9df488d9ec1fa402f394ac89) Signed-off-by: Max Gekk --- .../expressions/collectionOperations.scala | 48 +--- .../spark/sql/catalyst/util/DateTimeUtils.scala| 21 +++- .../expressions/CollectionExpressionsSuite.scala | 122 - 3 files changed, 172 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 2883d8d..730b8d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -2568,7 +2568,7 @@ case class Sequence( val typesCorrect = startType.sameType(stop.dataType) && (startType match { - case TimestampType => + case TimestampType | TimestampNTZType => stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepType) || YearMonthIntervalType.acceptsType(stepType) || DayTimeIntervalType.acceptsType(stepType) @@ -2614,20 +2614,20 @@ case class Sequence( val ct = ClassTag[T](iType.tag.mirror.runtimeClass(iType.tag.tpe)) new IntegralSequenceImpl(iType)(ct, iType.integral) -case TimestampType => +case TimestampType | TimestampNTZType => if (stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepOpt.get.dataType)) { -new TemporalSequenceImpl[Long](LongType, 1, identity, zoneId) +new TemporalSequenceImpl[Long](LongType, start.dataType, 1, identity, zoneId) } else if (YearMonthIntervalType.acceptsType(stepOpt.get.dataType)) { -new PeriodSequenceImpl[Long](LongType, 1, identity, zoneId) +new PeriodSequenceImpl[Long](LongType, start.dataType, 1, identity, zoneId) } else { -new DurationSequenceImpl[Long](LongType, 1, identity, zoneId) +new DurationSequenceImpl[Long](LongType, start.dataType, 1, identity, zoneId) } case 
DateType => if (stepOpt.isEmpty || CalendarIntervalType.acceptsType(stepOpt.get.dataType)) { -new TemporalSequenceImpl[Int](IntegerType, MICROS_PER_DAY, _.toInt, zoneId) +new TemporalSequenceImpl[Int](IntegerType, start.dataType, MICROS_PER_DAY, _.toInt, zoneId) } else { -new PeriodSequenceImpl[Int](IntegerType, MICROS_PER_DAY, _.toInt, zoneId) +new PeriodSequenceImpl[Int](IntegerType, start.dataType, MICROS_PER_DAY, _.toInt, zoneId) } } @@ -2769,8 +2769,9 @@ object Sequence { } private class PeriodSequenceImpl[T: ClassTag] - (dt: IntegralType, scale: Long, fromLong: Long => T, zoneId: ZoneId) - (implicit num: Integral[T]) extends InternalSequenceBase(dt, scale, fromLong, zoneId) { + (dt: IntegralType, outerDataType: DataType, scale: Long, fromLong: Long => T, zoneId: ZoneId) + (implicit num: Integral[T]) +extends InternalSequenceBase(dt, outerDataType, scale, fromLong, zoneId) { override val defaultStep: DefaultStep = new DefaultStep( (dt.ordering.lteq _).asInstanceOf[LessThanOrEqualFn], @@ -2794,8 +2795,9 @@ object Sequence { } private class DurationSequenceImpl[T: ClassTag] - (dt: IntegralType, scale: Long, fromLong: Long => T, zoneId: ZoneId) - (implicit num: Integral[T]) extends InternalSequenceBase(dt, scale, fromLong, zoneId) { + (dt: IntegralType, outerDataType: DataType, scale: Long, fromLong: Long => T, zoneId: ZoneId) + (implicit num: Integral[T]) +extends InternalSequenceBase(dt, outerDataType, scale, from
[spark] branch master updated (d7df7a8 -> 42275bb)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from d7df7a8 [SPARK-36195][BUILD] Set MaxMetaspaceSize JVM option to 2g add 42275bb [SPARK-36090][SQL] Support TimestampNTZType in expression Sequence No new revisions were added by this update. Summary of changes: .../expressions/collectionOperations.scala | 48 +--- .../spark/sql/catalyst/util/DateTimeUtils.scala| 21 +++- .../expressions/CollectionExpressionsSuite.scala | 122 - 3 files changed, 172 insertions(+), 19 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36170][SQL] Change quoted interval literal (interval constructor) to be converted to ANSI interval types
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new f7ed6fc [SPARK-36170][SQL] Change quoted interval literal (interval constructor) to be converted to ANSI interval types f7ed6fc is described below commit f7ed6fc6c608835ba33daff09edc02bb32fe6e97 Author: Kousuke Saruta AuthorDate: Sat Jul 17 12:23:37 2021 +0300 [SPARK-36170][SQL] Change quoted interval literal (interval constructor) to be converted to ANSI interval types ### What changes were proposed in this pull request? This PR changes the behavior of the quoted interval literals like `SELECT INTERVAL '1 year 2 month'` to be converted to ANSI interval types. ### Why are the changes needed? The unit-to-unit interval literals and the unit list interval literals are converted to ANSI interval types but quoted interval literals are still converted to CalendarIntervalType. ``` -- Unit list interval literals spark-sql> select interval 1 year 2 month; 1-2 -- Quoted interval literals spark-sql> select interval '1 year 2 month'; 1 years 2 months ``` ### Does this PR introduce _any_ user-facing change? Yes but the following sentence in `sql-migration-guide.md` seems to cover this change. ``` - In Spark 3.2, the unit list interval literals can not mix year-month fields (YEAR and MONTH) and day-time fields (WEEK, DAY, ..., MICROSECOND). For example, `INTERVAL 1 day 1 hour` is invalid in Spark 3.2. In Spark 3.1 and earlier, there is no such limitation and the literal returns value of `CalendarIntervalType`. To restore the behavior before Spark 3.2, you can set `spark.sql.legacy.interval.enabled` to `true`. ``` ### How was this patch tested? Modified existing tests and add new tests. Closes #33380 from sarutak/fix-interval-constructor. 
Authored-by: Kousuke Saruta Signed-off-by: Max Gekk (cherry picked from commit 71ea25d4f567c7ef598aa87b6e4a091a3119fead) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/parser/AstBuilder.scala | 77 -- .../spark/sql/errors/QueryParsingErrors.scala | 2 +- .../spark/sql/catalyst/parser/DDLParserSuite.scala | 11 +- .../catalyst/parser/ExpressionParserSuite.scala| 42 +--- .../test/resources/sql-tests/inputs/interval.sql | 7 +- .../sql-tests/results/ansi/interval.sql.out| 90 ++--- .../sql-tests/results/ansi/literals.sql.out| 4 +- .../resources/sql-tests/results/interval.sql.out | 112 + .../resources/sql-tests/results/literals.sql.out | 4 +- .../sql-tests/results/misc-functions.sql.out | 4 +- .../org/apache/spark/sql/DateFunctionsSuite.scala | 70 +++-- 11 files changed, 284 insertions(+), 139 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 2624b5c..d213549 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2165,7 +2165,15 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg ex.setStackTrace(e.getStackTrace) throw ex } - Literal(interval, CalendarIntervalType) + if (!conf.legacyIntervalEnabled) { +val units = value + .split("\\s") + .map(_.toLowerCase(Locale.ROOT).stripSuffix("s")) + .filter(s => s != "interval" && s.matches("[a-z]+")) +constructMultiUnitsIntervalLiteral(ctx, interval, units) + } else { +Literal(interval, CalendarIntervalType) + } case "X" => val padding = if (value.length % 2 != 0) "0" else "" Literal(DatatypeConverter.parseHexBinary(padding + value)) @@ -2373,6 +2381,44 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** + * Construct an [[Literal]] from [[CalendarInterval]] and + * units represented as 
a [[Seq]] of [[String]]. + */ + private def constructMultiUnitsIntervalLiteral( + ctx: ParserRuleContext, + calendarInterval: CalendarInterval, + units: Seq[String]): Literal = { +val yearMonthFields = Set.empty[Byte] +val dayTimeFields = Set.empty[Byte] +for (unit <- units) { + if (YearMonthIntervalType.stringToField.contains(unit)) { +yearMonthFields += YearMonthIntervalType.stringToField(unit) + } else if (DayTimeIntervalTyp
[spark] branch master updated (8009f0d -> 71ea25d)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 8009f0d [SPARK-35785][SS][FOLLOWUP] Remove ignored test from RocksDBSuite add 71ea25d [SPARK-36170][SQL] Change quoted interval literal (interval constructor) to be converted to ANSI interval types No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/parser/AstBuilder.scala | 77 -- .../spark/sql/errors/QueryParsingErrors.scala | 2 +- .../spark/sql/catalyst/parser/DDLParserSuite.scala | 11 +- .../catalyst/parser/ExpressionParserSuite.scala| 42 +--- .../test/resources/sql-tests/inputs/interval.sql | 7 +- .../sql-tests/results/ansi/interval.sql.out| 90 ++--- .../sql-tests/results/ansi/literals.sql.out| 4 +- .../resources/sql-tests/results/interval.sql.out | 112 + .../resources/sql-tests/results/literals.sql.out | 4 +- .../sql-tests/results/misc-functions.sql.out | 4 +- .../org/apache/spark/sql/DateFunctionsSuite.scala | 70 +++-- 11 files changed, 284 insertions(+), 139 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36034][SQL] Rebase datetime in pushed down filters to parquet
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 57849a5 [SPARK-36034][SQL] Rebase datetime in pushed down filters to parquet 57849a5 is described below commit 57849a54da39c337fc7209404815e8980e83e03b Author: Max Gekk AuthorDate: Thu Jul 15 22:21:57 2021 +0300 [SPARK-36034][SQL] Rebase datetime in pushed down filters to parquet ### What changes were proposed in this pull request? In the PR, I propose to propagate either the SQL config `spark.sql.parquet.datetimeRebaseModeInRead` or/and Parquet option `datetimeRebaseMode` to `ParquetFilters`. The `ParquetFilters` class uses the settings in conversions of dates/timestamps instances from datasource filters to values pushed via `FilterApi` to the `parquet-column` lib. Before the changes, date/timestamp values expressed as days/microseconds/milliseconds are interpreted as offsets in Proleptic Gregorian calendar, and pushed to the parquet library as is. That works fine if timestamp/dates values in parquet files were saved in the `CORRECTED` mode but in the `LEGACY` mode, filter's values could not match to actual values. After the changes, timestamp/dates values of filters pushed down to parquet libs such as `FilterApi.eq(col1, -719162)` are rebased according the rebase settings. For the example, if the rebase mode is `CORRECTED`, **-719162** is pushed down as is but if the current rebase mode is `LEGACY`, the number of days is rebased to **-719164**. For more context, the PR description https://github.com/apache/spark/pull/28067 shows the diffs between two calendars. ### Why are the changes needed? 
The changes fix the bug portrayed by the following example from SPARK-36034: ```scala In [27]: spark.conf.set("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "LEGACY") >>> spark.sql("SELECT DATE '0001-01-01' AS date").write.mode("overwrite").parquet("date_written_by_spark3_legacy") >>> spark.read.parquet("date_written_by_spark3_legacy").where("date = '0001-01-01'").show() ++ |date| ++ ++ ``` The result must have the date value `0001-01-01`. ### Does this PR introduce _any_ user-facing change? In some sense, yes. Query results can be different in some cases. For the example above: ```scala scala> spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY") scala> spark.sql("SELECT DATE '0001-01-01' AS date").write.mode("overwrite").parquet("date_written_by_spark3_legacy") scala> spark.read.parquet("date_written_by_spark3_legacy").where("date = '0001-01-01'").show(false) +--+ |date | +--+ |0001-01-01| +--+ ``` ### How was this patch tested? By running the modified test suite `ParquetFilterSuite`: ``` $ build/sbt "test:testOnly *ParquetV1FilterSuite" $ build/sbt "test:testOnly *ParquetV2FilterSuite" ``` Closes #33347 from MaxGekk/fix-parquet-ts-filter-pushdown. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit b09b7f7cc024d3054debd7bdb51caec3b53764d7) Signed-off-by: Max Gekk --- .../datasources/parquet/ParquetFileFormat.scala| 17 ++- .../datasources/parquet/ParquetFilters.scala | 29 +++- .../v2/parquet/ParquetPartitionReaderFactory.scala | 17 ++- .../v2/parquet/ParquetScanBuilder.scala| 14 +- .../datasources/parquet/ParquetFilterSuite.scala | 164 +++-- 5 files changed, 145 insertions(+), 96 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 48e2e6e..ee229a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -266,11 +266,21 @@ class ParquetFileFormat lazy val footerFileMetaData = ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData + val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( +footerFileMetaData.getKeyValueMetaData.get, +datetimeRebaseModeInRead) // Try to push down filters when filter push-down is enabled. val pushed = if (enableParquetFilterPushDown) { val parquetSchema = footerFileMetaData.getSchema -val parquetFilters = new ParquetFilters(parquetS
[spark] branch master updated (c8a3c22 -> b09b7f7)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from c8a3c22 [SPARK-36164][INFRA] run-test.py should not fail when APACHE_SPARK_REF is not defined add b09b7f7 [SPARK-36034][SQL] Rebase datetime in pushed down filters to parquet No new revisions were added by this update. Summary of changes: .../datasources/parquet/ParquetFileFormat.scala| 17 ++- .../datasources/parquet/ParquetFilters.scala | 29 +++- .../v2/parquet/ParquetPartitionReaderFactory.scala | 17 ++- .../v2/parquet/ParquetScanBuilder.scala| 14 +- .../datasources/parquet/ParquetFilterSuite.scala | 164 +++-- 5 files changed, 145 insertions(+), 96 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (0062c03 -> 2db7ed7)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 0062c03 [SPARK-32792][SQL][FOLLOWUP] Fix Parquet filter pushdown NOT IN predicate add 2db7ed7 [SPARK-36158][PYTHON][DOCS] Improving pyspark sql/functions months_between documentation No new revisions were added by this update. Summary of changes: python/pyspark/sql/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-32792][SQL][FOLLOWUP] Fix Parquet filter pushdown NOT IN predicate
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 1ed72e2 [SPARK-32792][SQL][FOLLOWUP] Fix Parquet filter pushdown NOT IN predicate 1ed72e2 is described below commit 1ed72e2e8e25991aee4cd8c50d627c5477a3 Author: Yuming Wang AuthorDate: Thu Jul 15 18:51:53 2021 +0300 [SPARK-32792][SQL][FOLLOWUP] Fix Parquet filter pushdown NOT IN predicate ### What changes were proposed in this pull request? This pr fix Parquet filter pushdown `NOT` `IN` predicate if its values exceeds `spark.sql.parquet.pushdown.inFilterThreshold`. For example: `Not(In(a, Array(2, 3, 7))`. We can not push down `not(and(gteq(a, 2), lteq(a, 7)))`. ### Why are the changes needed? Fix bug. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #33365 from wangyum/SPARK-32792-3. 
Authored-by: Yuming Wang Signed-off-by: Max Gekk (cherry picked from commit 0062c03c1584b0ab1443fa24f288e611bfbb5115) Signed-off-by: Max Gekk --- .../sql/execution/datasources/parquet/ParquetFilters.scala | 4 +++- .../execution/datasources/parquet/ParquetFilterSuite.scala | 14 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 71205f9..df7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -710,7 +710,7 @@ class ParquetFilters( values.distinct.flatMap { v => makeEq.lift(fieldType).map(_(fieldNames, v)) }.reduceLeftOption(FilterApi.or) -} else { +} else if (canPartialPushDownConjuncts) { val primitiveType = schema.getColumnDescription(fieldNames).getPrimitiveType val statistics: ParquetStatistics[_] = ParquetStatistics.createStats(primitiveType) if (values.contains(null)) { @@ -721,6 +721,8 @@ class ParquetFilters( } else { makeInPredicate.lift(fieldType).map(_(fieldNames, values, statistics)) } +} else { + None } case sources.StringStartsWith(name, prefix) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index bdcdbf3..230d547 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -1563,6 +1563,20 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared ) { parquetFilters.createFilter(sources.In("a", Array(2, 3, 7, null, 6))) } + + 
assertResult( +Some(FilterApi.not(or( + FilterApi.eq(intColumn("a"), 2: Integer), + FilterApi.eq(intColumn("a"), 3: Integer + ) { +parquetFilters.createFilter(sources.Not(sources.In("a", Array(2, 3 + } + + assertResult( +None + ) { +parquetFilters.createFilter(sources.Not(sources.In("a", Array(2, 3, 7 + } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (a71dd6a -> 0062c03)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from a71dd6a [SPARK-36146][PYTHON][INFRA][TESTS] Upgrade Python version from 3.6 to 3.9 in GitHub Actions' linter/docs add 0062c03 [SPARK-32792][SQL][FOLLOWUP] Fix Parquet filter pushdown NOT IN predicate No new revisions were added by this update. Summary of changes: .../sql/execution/datasources/parquet/ParquetFilters.scala | 4 +++- .../execution/datasources/parquet/ParquetFilterSuite.scala | 14 ++ 2 files changed, 17 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 249b567 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc 249b567 is described below commit 249b567684f977a1af676c4cec14641ae6c75269 Author: Dominik Gehl AuthorDate: Thu Jul 15 16:51:11 2021 +0300 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc ### What changes were proposed in this pull request? Added missing documentation of week and quarter as valid formats to pyspark sql/functions trunc ### Why are the changes needed? Pyspark documentation and scala documentation didn't mentioned the same supported formats ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Only documentation change Closes #33359 from dominikgehl/feature/SPARK-36154. Authored-by: Dominik Gehl Signed-off-by: Max Gekk (cherry picked from commit 802f632a28e46538053c056d1ce43374f80454ae) Signed-off-by: Max Gekk --- R/pkg/R/functions.R | 9 ++--- python/pyspark/sql/functions.py | 11 --- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 28e4ef8..87a75c3 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -47,10 +47,13 @@ NULL #' \item \code{to_date} and \code{to_timestamp}: it is the string to use to parse #'Column \code{x} to DateType or TimestampType. #' \item \code{trunc}: it is the string to use to specify the truncation method. -#'For example, "year", "", "yy" for truncate by year, or "month", "mon", -#'"mm" for truncate by month. 
+#''year', '', 'yy' to truncate by year, +#'or 'month', 'mon', 'mm' to truncate by month +#'Other options are: 'week', 'quarter' #' \item \code{date_trunc}: it is similar with \code{trunc}'s but additionally -#'supports "day", "dd", "second", "minute", "hour", "week" and "quarter". +#'supports +#''day', 'dd' to truncate by day, +#''microsecond', 'millisecond', 'second', 'minute' and 'hour' #' } #' @param ... additional argument(s). #' @name column_datetime_functions diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 75a31b7..88b7c4d 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1941,7 +1941,9 @@ def trunc(date, format): -- date : :class:`~pyspark.sql.Column` or str format : str -'year', '', 'yy' or 'month', 'mon', 'mm' +'year', '', 'yy' to truncate by year, +or 'month', 'mon', 'mm' to truncate by month +Other options are: 'week', 'quarter' Examples @@ -1964,8 +1966,11 @@ def date_trunc(format, timestamp): Parameters -- format : str -'year', '', 'yy', 'month', 'mon', 'mm', -'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter' +'year', '', 'yy' to truncate by year, +'month', 'mon', 'mm' to truncate by month, +'day', 'dd' to truncate by day, +Other options are: +'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' timestamp : :class:`~pyspark.sql.Column` or str Examples - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 3a09024 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc 3a09024 is described below commit 3a0902463680abeadb46d5bd6b99cfc72e836c8d Author: Dominik Gehl AuthorDate: Thu Jul 15 16:51:11 2021 +0300 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc ### What changes were proposed in this pull request? Added missing documentation of week and quarter as valid formats to pyspark sql/functions trunc ### Why are the changes needed? Pyspark documentation and scala documentation didn't mentioned the same supported formats ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Only documentation change Closes #33359 from dominikgehl/feature/SPARK-36154. Authored-by: Dominik Gehl Signed-off-by: Max Gekk (cherry picked from commit 802f632a28e46538053c056d1ce43374f80454ae) Signed-off-by: Max Gekk --- R/pkg/R/functions.R | 9 ++--- python/pyspark/sql/functions.py | 11 --- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 656206f..2adff08 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -47,10 +47,13 @@ NULL #' \item \code{to_date} and \code{to_timestamp}: it is the string to use to parse #'Column \code{x} to DateType or TimestampType. #' \item \code{trunc}: it is the string to use to specify the truncation method. -#'For example, "year", "", "yy" for truncate by year, or "month", "mon", -#'"mm" for truncate by month. 
+#''year', '', 'yy' to truncate by year, +#'or 'month', 'mon', 'mm' to truncate by month +#'Other options are: 'week', 'quarter' #' \item \code{date_trunc}: it is similar with \code{trunc}'s but additionally -#'supports "day", "dd", "second", "minute", "hour", "week" and "quarter". +#'supports +#''day', 'dd' to truncate by day, +#''microsecond', 'millisecond', 'second', 'minute' and 'hour' #' } #' @param ... additional argument(s). #' @name column_datetime_functions diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 6079949..d4f527d 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2042,7 +2042,9 @@ def trunc(date, format): -- date : :class:`~pyspark.sql.Column` or str format : str -'year', '', 'yy' or 'month', 'mon', 'mm' +'year', '', 'yy' to truncate by year, +or 'month', 'mon', 'mm' to truncate by month +Other options are: 'week', 'quarter' Examples @@ -2065,8 +2067,11 @@ def date_trunc(format, timestamp): Parameters -- format : str -'year', '', 'yy', 'month', 'mon', 'mm', -'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter' +'year', '', 'yy' to truncate by year, +'month', 'mon', 'mm' to truncate by month, +'day', 'dd' to truncate by day, +Other options are: +'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' timestamp : :class:`~pyspark.sql.Column` or str Examples - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 802f632 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc 802f632 is described below commit 802f632a28e46538053c056d1ce43374f80454ae Author: Dominik Gehl AuthorDate: Thu Jul 15 16:51:11 2021 +0300 [SPARK-36154][DOCS] Documenting week and quarter as valid formats in pyspark sql/functions trunc ### What changes were proposed in this pull request? Added missing documentation of week and quarter as valid formats to pyspark sql/functions trunc ### Why are the changes needed? Pyspark documentation and scala documentation didn't mentioned the same supported formats ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Only documentation change Closes #33359 from dominikgehl/feature/SPARK-36154. Authored-by: Dominik Gehl Signed-off-by: Max Gekk --- R/pkg/R/functions.R | 9 ++--- python/pyspark/sql/functions.py | 11 --- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 892a8dd..40642f1 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -47,10 +47,13 @@ NULL #' \item \code{to_date} and \code{to_timestamp}: it is the string to use to parse #'Column \code{x} to DateType or TimestampType. #' \item \code{trunc}: it is the string to use to specify the truncation method. -#'For example, "year", "", "yy" for truncate by year, or "month", "mon", -#'"mm" for truncate by month. +#''year', '', 'yy' to truncate by year, +#'or 'month', 'mon', 'mm' to truncate by month +#'Other options are: 'week', 'quarter' #' \item \code{date_trunc}: it is similar with \code{trunc}'s but additionally -#'supports "day", "dd", "second", "minute", "hour", "week" and "quarter". 
+#'supports +#''day', 'dd' to truncate by day, +#''microsecond', 'millisecond', 'second', 'minute' and 'hour' #' } #' @param ... additional argument(s). #' @name column_datetime_functions diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index bd32217..e5b2a00 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2045,7 +2045,9 @@ def trunc(date, format): -- date : :class:`~pyspark.sql.Column` or str format : str -'year', '', 'yy' or 'month', 'mon', 'mm' +'year', '', 'yy' to truncate by year, +or 'month', 'mon', 'mm' to truncate by month +Other options are: 'week', 'quarter' Examples @@ -2068,8 +2070,11 @@ def date_trunc(format, timestamp): Parameters -- format : str -'year', '', 'yy', 'month', 'mon', 'mm', -'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter' +'year', '', 'yy' to truncate by year, +'month', 'mon', 'mm' to truncate by month, +'day', 'dd' to truncate by day, +Other options are: +'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' timestamp : :class:`~pyspark.sql.Column` or str Examples - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (b5ee6d0 -> 4dfd266)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from b5ee6d0 [SPARK-36149][PYTHON] Clarify documentation for dayofweek and weekofyear add 4dfd266 [SPARK-36148][SQL] Fix input data types check for regexp_replace No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 4 .../src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala | 7 +++ 2 files changed, 11 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (564d3de -> b5ee6d0)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 564d3de [SPARK-36037][TESTS][FOLLOWUP] Avoid wrong test results on daylight saving time add b5ee6d0 [SPARK-36149][PYTHON] Clarify documentation for dayofweek and weekofyear No new revisions were added by this update. Summary of changes: R/pkg/R/functions.R | 3 +++ python/pyspark/sql/functions.py | 3 +++ 2 files changed, 6 insertions(+) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (5acfecb -> 564d3de)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 5acfecb [SPARK-36150][INFRA][TESTS] Disable MiMa for Scala 2.13 artifacts add 564d3de [SPARK-36037][TESTS][FOLLOWUP] Avoid wrong test results on daylight saving time No new revisions were added by this update. Summary of changes: .../apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36037][TESTS][FOLLOWUP] Avoid wrong test results on daylight saving time
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 75ff69a [SPARK-36037][TESTS][FOLLOWUP] Avoid wrong test results on daylight saving time 75ff69a is described below commit 75ff69a9940776f0337a080b721d45d9f571c386 Author: Gengliang Wang AuthorDate: Thu Jul 15 11:40:51 2021 +0300 [SPARK-36037][TESTS][FOLLOWUP] Avoid wrong test results on daylight saving time ### What changes were proposed in this pull request? Only use the zone ids that has no daylight saving for testing `localtimestamp` ### Why are the changes needed? https://github.com/apache/spark/pull/33346#discussion_r670135296 MaxGekk suggests that we should avoid wrong results if possible. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test Closes #33354 from gengliangwang/FIxDST. Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 564d3de7c62fa89c6db1e08e809400b339a704cd) Signed-off-by: Max Gekk --- .../apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index dbfe0af..eecaa93 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -97,7 +97,8 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("datetime function localtimestamp") { -outstandingTimezonesIds.foreach { zid => +// Verify with multiple outstanding time zones which has no daylight saving time. 
+Seq("UTC", "Africa/Dakar", "Asia/Hong_Kong").foreach { zid => val zoneId = DateTimeUtils.getZoneId(zid) val ct = LocalTimestamp(Some(zid)).eval(EmptyRow).asInstanceOf[Long] val t1 = DateTimeUtils.localDateTimeToMicros(LocalDateTime.now(zoneId)) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (38196412 -> 1e86345)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 38196412 [SPARK-35639][SQL][FOLLOWUP] Make hasCoalescedPartition return true if something was actually coalesced add 1e86345 [SPARK-36069][SQL] Add field info to `from_json`'s exception in the FAILFAST mode No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/json/JacksonParser.scala| 10 - .../spark/sql/errors/QueryExecutionErrors.scala| 17 ++- .../org/apache/spark/sql/JsonFunctionsSuite.scala | 25 ++ 3 files changed, 42 insertions(+), 10 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36120][SQL] Support TimestampNTZ type in cache table
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 3ace01b [SPARK-36120][SQL] Support TimestampNTZ type in cache table 3ace01b is described below commit 3ace01b25bf1c69901a07c11acbae6fa996fc06c Author: Gengliang Wang AuthorDate: Tue Jul 13 17:23:48 2021 +0300 [SPARK-36120][SQL] Support TimestampNTZ type in cache table ### What changes were proposed in this pull request? Support TimestampNTZ type column in SQL command Cache table ### Why are the changes needed? Cache table should support the new timestamp type. ### Does this PR introduce _any_ user-facing change? Yes, the TimemstampNTZ type column can used in `CACHE TABLE` ### How was this patch tested? Unit test Closes #33322 from gengliangwang/cacheTable. Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 067432705fbec970bd713adf37d2aa17c6bcf5a0) Signed-off-by: Max Gekk --- .../spark/sql/execution/columnar/ColumnBuilder.scala | 3 ++- .../apache/spark/sql/execution/columnar/ColumnType.scala | 2 +- .../sql/execution/columnar/GenerateColumnAccessor.scala | 2 +- .../scala/org/apache/spark/sql/CachedTableSuite.scala | 15 ++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala index e9251e8..9ddc665 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala @@ -175,7 +175,8 @@ private[columnar] object ColumnBuilder { case ByteType => new ByteColumnBuilder case ShortType => new ShortColumnBuilder case IntegerType | DateType | _: YearMonthIntervalType => new IntColumnBuilder - case LongType | 
TimestampType | _: DayTimeIntervalType => new LongColumnBuilder + case LongType | TimestampType | TimestampNTZType | _: DayTimeIntervalType => +new LongColumnBuilder case FloatType => new FloatColumnBuilder case DoubleType => new DoubleColumnBuilder case StringType => new StringColumnBuilder diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala index 8e99368..419dcc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala @@ -818,7 +818,7 @@ private[columnar] object ColumnType { case ByteType => BYTE case ShortType => SHORT case IntegerType | DateType | _: YearMonthIntervalType => INT - case LongType | TimestampType | _: DayTimeIntervalType => LONG + case LongType | TimestampType | TimestampNTZType | _: DayTimeIntervalType => LONG case FloatType => FLOAT case DoubleType => DOUBLE case StringType => STRING diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index 190c2c3..6e666d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -81,7 +81,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera case ByteType => classOf[ByteColumnAccessor].getName case ShortType => classOf[ShortColumnAccessor].getName case IntegerType | DateType | _: YearMonthIntervalType => classOf[IntColumnAccessor].getName -case LongType | TimestampType | _: DayTimeIntervalType => +case LongType | TimestampType | TimestampNTZType | _: DayTimeIntervalType => classOf[LongColumnAccessor].getName case FloatType => 
classOf[FloatColumnAccessor].getName case DoubleType => classOf[DoubleColumnAccessor].getName diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 1915044..db717d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.io.{File, FilenameFilter} import java.nio.file.{Files, Paths} -import java.time.{Duration,
[spark] branch master updated (583173b -> 0674327)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 583173b [SPARK-36033][SQL][TEST] Validate partitioning requirements in TPCDS tests add 0674327 [SPARK-36120][SQL] Support TimestampNTZ type in cache table No new revisions were added by this update. Summary of changes: .../spark/sql/execution/columnar/ColumnBuilder.scala | 3 ++- .../apache/spark/sql/execution/columnar/ColumnType.scala | 2 +- .../sql/execution/columnar/GenerateColumnAccessor.scala | 2 +- .../scala/org/apache/spark/sql/CachedTableSuite.scala | 15 ++- 4 files changed, 18 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-35735][SQL][FOLLOWUP] Remove unused method `IntervalUtils.checkIntervalStringDataType()`
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new b469294 [SPARK-35735][SQL][FOLLOWUP] Remove unused method `IntervalUtils.checkIntervalStringDataType()` b469294 is described below commit b4692949f802a01bcdd7364235ce9e7ed31ff14d Author: Max Gekk AuthorDate: Tue Jul 13 15:11:21 2021 +0300 [SPARK-35735][SQL][FOLLOWUP] Remove unused method `IntervalUtils.checkIntervalStringDataType()` ### What changes were proposed in this pull request? Remove the private method `checkIntervalStringDataType()` from `IntervalUtils` since it hasn't been used anymore after https://github.com/apache/spark/pull/33242. ### Why are the changes needed? To improve code maintenance. ### Does this PR introduce _any_ user-facing change? No. The method is private, and it existing in code base for short time. ### How was this patch tested? By existing GAs/tests. Closes #33321 from MaxGekk/SPARK-35735-remove-unused-method. 
Authored-by: Max Gekk Signed-off-by: Max Gekk (cherry picked from commit 1ba3982d16f98601583520794b30fa6ad6d85cf0) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/util/IntervalUtils.scala | 21 + 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index e026266..dc6c02e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToMicros import org.apache.spark.sql.catalyst.util.IntervalStringStyles.{ANSI_STYLE, HIVE_STYLE, IntervalStyle} import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DataType, DayTimeIntervalType => DT, Decimal, YearMonthIntervalType => YM} +import org.apache.spark.sql.types.{DayTimeIntervalType => DT, Decimal, YearMonthIntervalType => YM} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} // The style of textual representation of intervals @@ -124,25 +124,6 @@ object IntervalUtils { s"${fallBackNotice.map(s => s", $s").getOrElse("")}") } - private def checkIntervalStringDataType( - input: UTF8String, - targetStartField: Byte, - targetEndField: Byte, - inputIntervalType: DataType, - fallBackNotice: Option[String] = None): Unit = { -val (intervalStr, typeName, inputStartField, inputEndField) = inputIntervalType match { - case DT(startField, endField) => -("day-time", DT(targetStartField, targetEndField).typeName, startField, endField) - case YM(startField, endField) => -("year-month", YM(targetStartField, targetEndField).typeName, startField, endField) -} -if (targetStartField != inputStartField || targetEndField != inputEndField) { - 
throwIllegalIntervalFormatException( -input, targetStartField, targetEndField, intervalStr, typeName, fallBackNotice) -} - } - - val supportedFormat = Map( (YM.YEAR, YM.MONTH) -> Seq("[+|-]y-m", "INTERVAL [+|-]'[+|-]y-m' YEAR TO MONTH"), (YM.YEAR, YM.YEAR) -> Seq("[+|-]y", "INTERVAL [+|-]'[+|-]y' YEAR"), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated (03e48c8 -> 1ba3982)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 03e48c8 [SPARK-35334][K8S] Make Spark more resilient to intermittent K8s flakiness add 1ba3982 [SPARK-35735][SQL][FOLLOWUP] Remove unused method `IntervalUtils.checkIntervalStringDataType()` No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/util/IntervalUtils.scala | 21 + 1 file changed, 1 insertion(+), 20 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new fba3e90 [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz fba3e90 is described below commit fba3e90863e584361b2f61828a500c96d89c35de Author: Gengliang Wang AuthorDate: Mon Jul 12 22:44:26 2021 +0300 [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz ### What changes were proposed in this pull request? Support new functions make_timestamp_ntz and make_timestamp_ltz Syntax: * `make_timestamp_ntz(year, month, day, hour, min, sec)`: Create local date-time from year, month, day, hour, min, sec fields * `make_timestamp_ltz(year, month, day, hour, min, sec[, timezone])`: Create current timestamp with local time zone from year, month, day, hour, min, sec and timezone fields ### Why are the changes needed? As the result of `make_timestamp` become consistent with the SQL configuration `spark.sql.timestmapType`, we need these two new functions to construct timestamp literals. They align to the functions [`make_timestamp` and `make_timestamptz`](https://www.postgresql.org/docs/9.4/functions-datetime.html) in PostgreSQL ### Does this PR introduce _any_ user-facing change? Yes, two new datetime functions: make_timestamp_ntz and make_timestamp_ltz. ### How was this patch tested? End-to-end tests. Closes #33299 from gengliangwang/make_timestamp_ntz_ltz. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 92bf83ed0aba2c399debb1db5fb88bad3961ab06) Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/FunctionRegistry.scala | 2 + .../catalyst/expressions/datetimeExpressions.scala | 122 + .../sql-functions/sql-expression-schema.md | 6 +- .../test/resources/sql-tests/inputs/datetime.sql | 12 ++ .../sql-tests/results/ansi/datetime.sql.out| 61 ++- .../sql-tests/results/datetime-legacy.sql.out | 59 +- .../resources/sql-tests/results/datetime.sql.out | 59 +- .../results/timestampNTZ/datetime.sql.out | 59 +- 8 files changed, 374 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index fcc0220..4fd871d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -552,6 +552,8 @@ object FunctionRegistry { expression[TimeWindow]("window"), expression[MakeDate]("make_date"), expression[MakeTimestamp]("make_timestamp"), +expression[MakeTimestampNTZ]("make_timestamp_ntz", true), +expression[MakeTimestampLTZ]("make_timestamp_ltz", true), expression[MakeInterval]("make_interval"), expression[MakeDTInterval]("make_dt_interval"), expression[MakeYMInterval]("make_ym_interval"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 0ebeacb..2840b18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -2272,6 +2272,128 @@ case class MakeDate( // scalastyle:off line.size.limit 
@ExpressionDescription( + usage = "_FUNC_(year, month, day, hour, min, sec) - Create local date-time from year, month, day, hour, min, sec fields. ", + arguments = """ +Arguments: + * year - the year to represent, from 1 to + * month - the month-of-year to represent, from 1 (January) to 12 (December) + * day - the day-of-month to represent, from 1 to 31 + * hour - the hour-of-day to represent, from 0 to 23 + * min - the minute-of-hour to represent, from 0 to 59 + * sec - the second-of-minute and its micro-fraction to represent, from + 0 to 60. If the sec argument equals to 60, the seconds field is set + to 0 and 1 minute is added to the final timestamp. + """, + examples = """ +Examples: + > SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887); + 2014-12-28 06:30:45.887 + > SELECT _FUNC_(2019, 6, 30, 23, 59, 60); + 2019-07-01 00:00:00 + > SELECT _FUNC_(null, 7, 22, 15, 30, 0);
[spark] branch master updated: [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 92bf83e [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz 92bf83e is described below commit 92bf83ed0aba2c399debb1db5fb88bad3961ab06 Author: Gengliang Wang AuthorDate: Mon Jul 12 22:44:26 2021 +0300 [SPARK-36046][SQL] Support new functions make_timestamp_ntz and make_timestamp_ltz ### What changes were proposed in this pull request? Support new functions make_timestamp_ntz and make_timestamp_ltz Syntax: * `make_timestamp_ntz(year, month, day, hour, min, sec)`: Create local date-time from year, month, day, hour, min, sec fields * `make_timestamp_ltz(year, month, day, hour, min, sec[, timezone])`: Create current timestamp with local time zone from year, month, day, hour, min, sec and timezone fields ### Why are the changes needed? As the result of `make_timestamp` become consistent with the SQL configuration `spark.sql.timestmapType`, we need these two new functions to construct timestamp literals. They align to the functions [`make_timestamp` and `make_timestamptz`](https://www.postgresql.org/docs/9.4/functions-datetime.html) in PostgreSQL ### Does this PR introduce _any_ user-facing change? Yes, two new datetime functions: make_timestamp_ntz and make_timestamp_ltz. ### How was this patch tested? End-to-end tests. Closes #33299 from gengliangwang/make_timestamp_ntz_ltz. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk --- .../sql/catalyst/analysis/FunctionRegistry.scala | 2 + .../catalyst/expressions/datetimeExpressions.scala | 122 + .../sql-functions/sql-expression-schema.md | 6 +- .../test/resources/sql-tests/inputs/datetime.sql | 12 ++ .../sql-tests/results/ansi/datetime.sql.out| 61 ++- .../sql-tests/results/datetime-legacy.sql.out | 59 +- .../resources/sql-tests/results/datetime.sql.out | 59 +- .../results/timestampNTZ/datetime.sql.out | 59 +- 8 files changed, 374 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index fcc0220..4fd871d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -552,6 +552,8 @@ object FunctionRegistry { expression[TimeWindow]("window"), expression[MakeDate]("make_date"), expression[MakeTimestamp]("make_timestamp"), +expression[MakeTimestampNTZ]("make_timestamp_ntz", true), +expression[MakeTimestampLTZ]("make_timestamp_ltz", true), expression[MakeInterval]("make_interval"), expression[MakeDTInterval]("make_dt_interval"), expression[MakeYMInterval]("make_ym_interval"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 0ebeacb..2840b18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -2272,6 +2272,128 @@ case class MakeDate( // scalastyle:off line.size.limit @ExpressionDescription( + usage = "_FUNC_(year, month, day, hour, min, sec) - Create local date-time 
from year, month, day, hour, min, sec fields. ", + arguments = """ +Arguments: + * year - the year to represent, from 1 to + * month - the month-of-year to represent, from 1 (January) to 12 (December) + * day - the day-of-month to represent, from 1 to 31 + * hour - the hour-of-day to represent, from 0 to 23 + * min - the minute-of-hour to represent, from 0 to 59 + * sec - the second-of-minute and its micro-fraction to represent, from + 0 to 60. If the sec argument equals to 60, the seconds field is set + to 0 and 1 minute is added to the final timestamp. + """, + examples = """ +Examples: + > SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887); + 2014-12-28 06:30:45.887 + > SELECT _FUNC_(2019, 6, 30, 23, 59, 60); + 2019-07-01 00:00:00 + > SELECT _FUNC_(null, 7, 22, 15, 30, 0); + NULL + """, + group = "datetime_funcs", + since = "3.2.0") +// sca
[spark] branch branch-3.2 updated: [SPARK-36072][SQL] TO_TIMESTAMP: return different results based on the default timestamp type
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 4e9e2f3 [SPARK-36072][SQL] TO_TIMESTAMP: return different results based on the default timestamp type 4e9e2f3 is described below commit 4e9e2f32e84cd1d317e25b85eb18f2c662bc641f Author: Gengliang Wang AuthorDate: Mon Jul 12 10:12:30 2021 +0300 [SPARK-36072][SQL] TO_TIMESTAMP: return different results based on the default timestamp type ### What changes were proposed in this pull request? The SQL function TO_TIMESTAMP should return different results based on the default timestamp type: * when "spark.sql.timestampType" is TIMESTAMP_NTZ, return TimestampNTZType literal * when "spark.sql.timestampType" is TIMESTAMP_LTZ, return TimestampType literal This PR also refactor the class GetTimestamp and GetTimestampNTZ to reduce duplicated code. ### Why are the changes needed? As "spark.sql.timestampType" sets the default timestamp type, the to_timestamp function should behave consistently with it. ### Does this PR introduce _any_ user-facing change? Yes, when the value of "spark.sql.timestampType" is TIMESTAMP_NTZ, the result type of `TO_TIMESTAMP` is of TIMESTAMP_NTZ type. ### How was this patch tested? Unit test Closes #33280 from gengliangwang/to_timestamp. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 32720dd3e18ea43c6d88125a52356f40f808b300) Signed-off-by: Max Gekk --- .../catalyst/expressions/datetimeExpressions.scala | 54 +- .../expressions/DateExpressionsSuite.scala | 23 +++--- .../results/timestampNTZ/datetime.sql.out | 84 +++--- 3 files changed, 76 insertions(+), 85 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index f0ed89e..0ebeacb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1008,17 +1008,17 @@ case class UnixTimestamp( copy(timeExp = newLeft, format = newRight) } -case class GetTimestampNTZ( +/** + * Gets a timestamp from a string or a date. + */ +case class GetTimestamp( left: Expression, right: Expression, +override val dataType: DataType, timeZoneId: Option[String] = None, failOnError: Boolean = SQLConf.get.ansiEnabled) extends ToTimestamp { - override val forTimestampNTZ: Boolean = true - - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) - - override def dataType: DataType = TimestampNTZType + override val forTimestampNTZ: Boolean = dataType == TimestampNTZType override protected def downScaleFactor: Long = 1 @@ -1064,7 +1064,7 @@ case class ParseToTimestampNTZ( child: Expression) extends RuntimeReplaceable { def this(left: Expression, format: Expression) = { -this(left, Option(format), GetTimestampNTZ(left, format)) +this(left, Option(format), GetTimestamp(left, format, TimestampNTZType)) } def this(left: Expression) = this(left, None, Cast(left, TimestampNTZType)) @@ -1886,7 +1886,7 @@ case class ParseToDate(left: Expression, format: Option[Expression], child: Expr extends RuntimeReplaceable { def 
this(left: Expression, format: Expression) = { -this(left, Option(format), Cast(GetTimestamp(left, format), DateType)) +this(left, Option(format), Cast(GetTimestamp(left, format, TimestampType), DateType)) } def this(left: Expression) = { @@ -1911,7 +1911,8 @@ case class ParseToDate(left: Expression, format: Option[Expression], child: Expr usage = """ _FUNC_(timestamp_str[, fmt]) - Parses the `timestamp_str` expression with the `fmt` expression to a timestamp. Returns null with invalid input. By default, it follows casting rules to - a timestamp if the `fmt` is omitted. + a timestamp if the `fmt` is omitted. The result data type is consistent with the value of + configuration `spark.sql.timestampType`. """, arguments = """ Arguments: @@ -1929,20 +1930,24 @@ case class ParseToDate(left: Expression, format: Option[Expression], child: Expr group = "datetime_funcs", since = "2.2.0") // scalastyle:on line.size.limit -case class ParseToTimestamp(left: Expression, format: Option[Expression], child: Expression) - extends RuntimeReplaceable { +case class ParseToTimestamp( +left: Expression, +format: Opt
[spark] branch master updated (8738682 -> 32720dd)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 8738682 [SPARK-36044][SQL] Suport TimestampNTZ in functions unix_timestamp/to_unix_timestamp add 32720dd [SPARK-36072][SQL] TO_TIMESTAMP: return different results based on the default timestamp type No new revisions were added by this update. Summary of changes: .../catalyst/expressions/datetimeExpressions.scala | 54 +- .../expressions/DateExpressionsSuite.scala | 23 +++--- .../results/timestampNTZ/datetime.sql.out | 84 +++--- 3 files changed, 76 insertions(+), 85 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36044][SQL] Support TimestampNTZ in functions unix_timestamp/to_unix_timestamp
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 5816482 [SPARK-36044][SQL] Suport TimestampNTZ in functions unix_timestamp/to_unix_timestamp 5816482 is described below commit 58164828683db50fb0e1698b679ffc0a773da847 Author: gengjiaan AuthorDate: Mon Jul 12 09:55:43 2021 +0300 [SPARK-36044][SQL] Suport TimestampNTZ in functions unix_timestamp/to_unix_timestamp ### What changes were proposed in this pull request? The functions `unix_timestamp`/`to_unix_timestamp` should be able to accept input of `TimestampNTZType`. ### Why are the changes needed? The functions `unix_timestamp`/`to_unix_timestamp` should be able to accept input of `TimestampNTZType`. ### Does this PR introduce _any_ user-facing change? 'Yes'. ### How was this patch tested? New tests. Closes #33278 from beliefer/SPARK-36044. 
Authored-by: gengjiaan Signed-off-by: Max Gekk (cherry picked from commit 8738682f6a36436da0e9fc332d58b2f41309e2c2) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 8 +--- .../spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 9 + .../test/scala/org/apache/spark/sql/DateFunctionsSuite.scala | 11 ++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 979eeba..f0ed89e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1091,8 +1091,10 @@ abstract class ToTimestamp override protected def formatString: Expression = right override protected def isParsing = true + override def forTimestampNTZ: Boolean = left.dataType == TimestampNTZType + override def inputTypes: Seq[AbstractDataType] = -Seq(TypeCollection(StringType, DateType, TimestampType), StringType) +Seq(TypeCollection(StringType, DateType, TimestampType, TimestampNTZType), StringType) override def dataType: DataType = LongType override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true @@ -1112,7 +1114,7 @@ abstract class ToTimestamp left.dataType match { case DateType => daysToMicros(t.asInstanceOf[Int], zoneId) / downScaleFactor -case TimestampType => +case TimestampType | TimestampNTZType => t.asInstanceOf[Long] / downScaleFactor case StringType => val fmt = right.eval(input) @@ -1192,7 +1194,7 @@ abstract class ToTimestamp |} |""".stripMargin) } - case TimestampType => + case TimestampType | TimestampNTZType => val eval1 = left.genCode(ctx) ev.copy(code = code""" ${eval1.code} diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 5f071c3..02d6d95 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -916,6 +916,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal(new Timestamp(100)), Literal("-MM-dd HH:mm:ss"), timeZoneId), 1000L) checkEvaluation( + UnixTimestamp( + Literal(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(100))), +Literal("-MM-dd HH:mm:ss"), timeZoneId), + 1000L) +checkEvaluation( UnixTimestamp(Literal(date1), Literal("-MM-dd HH:mm:ss"), timeZoneId), MICROSECONDS.toSeconds( DateTimeUtils.daysToMicros(DateTimeUtils.fromJavaDate(date1), tz.toZoneId))) @@ -981,6 +986,10 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(ToUnixTimestamp( Literal(new Timestamp(100)), Literal(fmt1)), 1000L) +checkEvaluation(ToUnixTimestamp( + Literal(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(100))), + Literal(fmt1)), + 1000L) checkEvaluation( ToUnixTimestamp(Literal(date1), Literal(fmt
[spark] branch master updated (badb039 -> 8738682)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from badb039 [SPARK-36003][PYTHON] Implement unary operator `invert` of integral ps.Series/Index add 8738682 [SPARK-36044][SQL] Support TimestampNTZ in functions unix_timestamp/to_unix_timestamp No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/expressions/datetimeExpressions.scala | 8 +--- .../spark/sql/catalyst/expressions/DateExpressionsSuite.scala | 9 + .../test/scala/org/apache/spark/sql/DateFunctionsSuite.scala | 11 ++- 3 files changed, 24 insertions(+), 4 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36083][SQL] make_timestamp: return different result based on the default timestamp type
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 09e5bbd [SPARK-36083][SQL] make_timestamp: return different result based on the default timestamp type 09e5bbd is described below commit 09e5bbdfbe53ccb4efd85f5774b7bee9e731a14f Author: Gengliang Wang AuthorDate: Sun Jul 11 20:47:49 2021 +0300 [SPARK-36083][SQL] make_timestamp: return different result based on the default timestamp type ### What changes were proposed in this pull request? The SQL function MAKE_TIMESTAMP should return different results based on the default timestamp type: * when "spark.sql.timestampType" is TIMESTAMP_NTZ, return TimestampNTZType literal * when "spark.sql.timestampType" is TIMESTAMP_LTZ, return TimestampType literal ### Why are the changes needed? As "spark.sql.timestampType" sets the default timestamp type, the make_timestamp function should behave consistently with it. ### Does this PR introduce _any_ user-facing change? Yes, when the value of "spark.sql.timestampType" is TIMESTAMP_NTZ, the result type of `MAKE_TIMESTAMP` is of TIMESTAMP_NTZ type. ### How was this patch tested? Unit test Closes #33290 from gengliangwang/mkTS. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 17ddcc9e8273a098b63984b950bfa6cd36b58b99) Signed-off-by: Max Gekk --- .../catalyst/expressions/datetimeExpressions.scala | 30 -- .../expressions/DateExpressionsSuite.scala | 120 +++-- .../test/resources/sql-tests/inputs/datetime.sql | 4 + .../sql-tests/results/ansi/datetime.sql.out| 19 +++- .../sql-tests/results/datetime-legacy.sql.out | 18 +++- .../resources/sql-tests/results/datetime.sql.out | 18 +++- .../results/timestampNTZ/datetime.sql.out | 18 +++- 7 files changed, 161 insertions(+), 66 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index be527ce..979eeba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -2286,7 +2286,8 @@ case class MakeDate( // scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(year, month, day, hour, min, sec[, timezone]) - Create timestamp from year, month, day, hour, min, sec and timezone fields.", + usage = "_FUNC_(year, month, day, hour, min, sec[, timezone]) - Create timestamp from year, month, day, hour, min, sec and timezone fields. 
" + +"The result data type is consistent with the value of configuration `spark.sql.timestampType`", arguments = """ Arguments: * year - the year to represent, from 1 to @@ -2324,7 +2325,8 @@ case class MakeTimestamp( sec: Expression, timezone: Option[Expression] = None, timeZoneId: Option[String] = None, -failOnError: Boolean = SQLConf.get.ansiEnabled) +failOnError: Boolean = SQLConf.get.ansiEnabled, +override val dataType: DataType = SQLConf.get.timestampType) extends SeptenaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes with NullIntolerant { @@ -2335,7 +2337,8 @@ case class MakeTimestamp( hour: Expression, min: Expression, sec: Expression) = { -this(year, month, day, hour, min, sec, None, None, SQLConf.get.ansiEnabled) +this(year, month, day, hour, min, sec, None, None, SQLConf.get.ansiEnabled, + SQLConf.get.timestampType) } def this( @@ -2346,7 +2349,8 @@ case class MakeTimestamp( min: Expression, sec: Expression, timezone: Expression) = { -this(year, month, day, hour, min, sec, Some(timezone), None, SQLConf.get.ansiEnabled) +this(year, month, day, hour, min, sec, Some(timezone), None, SQLConf.get.ansiEnabled, + SQLConf.get.timestampType) } override def children: Seq[Expression] = Seq(year, month, day, hour, min, sec) ++ timezone @@ -2355,7 +2359,6 @@ case class MakeTimestamp( override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, DecimalType(8, 6)) ++ timezone.map(_ => StringType) - override def dataType: DataType = TimestampType override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = @@ -2388,7 +2391,11 @@ case class MakeTimestam
[spark] branch master updated (cfcd094 -> 17ddcc9)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from cfcd094 [SPARK-36036][CORE] Fix cleanup of DownloadFile resources add 17ddcc9 [SPARK-36083][SQL] make_timestamp: return different result based on the default timestamp type No new revisions were added by this update. Summary of changes: .../catalyst/expressions/datetimeExpressions.scala | 30 -- .../expressions/DateExpressionsSuite.scala | 120 +++-- .../test/resources/sql-tests/inputs/datetime.sql | 4 + .../sql-tests/results/ansi/datetime.sql.out| 19 +++- .../sql-tests/results/datetime-legacy.sql.out | 18 +++- .../resources/sql-tests/results/datetime.sql.out | 18 +++- .../results/timestampNTZ/datetime.sql.out | 18 +++- 7 files changed, 161 insertions(+), 66 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36049][SQL] Remove IntervalUnit
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 2f54d9e [SPARK-36049][SQL] Remove IntervalUnit 2f54d9e is described below commit 2f54d9eed6dce5e9fc853ef4dceb1abb2b338a34 Author: Angerszh AuthorDate: Thu Jul 8 23:02:21 2021 +0300 [SPARK-36049][SQL] Remove IntervalUnit ### What changes were proposed in this pull request? Remove IntervalUnit ### Why are the changes needed? Clean code ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Not need Closes #33265 from AngersZh/SPARK-36049. Lead-authored-by: Angerszh Co-authored-by: Maxim Gekk Signed-off-by: Max Gekk (cherry picked from commit fef7e1703c342165000f89b01112a8a28a936436) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/util/IntervalUtils.scala| 120 - .../catalyst/parser/ExpressionParserSuite.scala| 33 ++ 2 files changed, 58 insertions(+), 95 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index 7579a28..e026266 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -41,22 +41,6 @@ object IntervalStringStyles extends Enumeration { object IntervalUtils { - object IntervalUnit extends Enumeration { -type IntervalUnit = Value - -val NANOSECOND = Value(0, "nanosecond") -val MICROSECOND = Value(1, "microsecond") -val MILLISECOND = Value(2, "millisecond") -val SECOND = Value(3, "second") -val MINUTE = Value(4, "minute") -val HOUR = Value(5, "hour") -val DAY = Value(6, "day") -val WEEK = Value(7, "week") -val MONTH = Value(8, "month") -val YEAR = Value(9, "year") - } - import IntervalUnit._ - private val 
MAX_DAY = Long.MaxValue / MICROS_PER_DAY private val MAX_HOUR = Long.MaxValue / MICROS_PER_HOUR private val MAX_MINUTE = Long.MaxValue / MICROS_PER_MINUTE @@ -97,7 +81,7 @@ object IntervalUtils { def getSeconds(interval: CalendarInterval): Decimal = getSeconds(interval.microseconds) private def toLongWithRange( - fieldName: IntervalUnit, + fieldName: UTF8String, s: String, minValue: Long, maxValue: Long): Long = { @@ -250,10 +234,11 @@ object IntervalUtils { } } - private def toYMInterval(yearStr: String, monthStr: String, sign: Int): Int = { + private def toYMInterval(year: String, month: String, sign: Int): Int = { safeToInterval("year-month") { - val years = toLongWithRange(YEAR, yearStr, 0, Integer.MAX_VALUE / MONTHS_PER_YEAR) - val totalMonths = sign * (years * MONTHS_PER_YEAR + toLongWithRange(MONTH, monthStr, 0, 11)) + val years = toLongWithRange(yearStr, year, 0, Integer.MAX_VALUE / MONTHS_PER_YEAR) + val totalMonths = +sign * (years * MONTHS_PER_YEAR + toLongWithRange(monthStr, month, 0, 11)) Math.toIntExact(totalMonths) } } @@ -402,45 +387,33 @@ object IntervalUtils { } } - def toDTInterval( - dayStr: String, - hourStr: String, - minuteStr: String, - secondStr: String, - sign: Int): Long = { + def toDTInterval(day: String, hour: String, minute: String, second: String, sign: Int): Long = { var micros = 0L -val days = toLongWithRange(DAY, dayStr, 0, MAX_DAY).toInt +val days = toLongWithRange(dayStr, day, 0, MAX_DAY).toInt micros = Math.addExact(micros, sign * days * MICROS_PER_DAY) -val hours = toLongWithRange(HOUR, hourStr, 0, 23) +val hours = toLongWithRange(hourStr, hour, 0, 23) micros = Math.addExact(micros, sign * hours * MICROS_PER_HOUR) -val minutes = toLongWithRange(MINUTE, minuteStr, 0, 59) +val minutes = toLongWithRange(minuteStr, minute, 0, 59) micros = Math.addExact(micros, sign * minutes * MICROS_PER_MINUTE) -micros = Math.addExact(micros, sign * parseSecondNano(secondStr)) +micros = Math.addExact(micros, sign * parseSecondNano(second)) micros 
} - def toDTInterval( - hourStr: String, - minuteStr: String, - secondStr: String, - sign: Int): Long = { + def toDTInterval(hour: String, minute: String, second: String, sign: Int): Long = { var micros = 0L -val hours = toLongWithRange(HOUR, hourStr, 0, MAX_HOUR) +val hours = toLongWithRange(hourStr, hour, 0, MAX_HOUR) micros = Math.addExact(micros, sign * hours * MICROS_PER_HOUR) -val minutes = toLongWithRange(MINU
[spark] branch master updated (382b66e -> fef7e170)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 382b66e [SPARK-36054][SQL] Support group by TimestampNTZ type column add fef7e170 [SPARK-36049][SQL] Remove IntervalUnit No new revisions were added by this update. Summary of changes: .../spark/sql/catalyst/util/IntervalUtils.scala| 120 - .../catalyst/parser/ExpressionParserSuite.scala| 33 ++ 2 files changed, 58 insertions(+), 95 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36054][SQL] Support group by TimestampNTZ type column
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new ae62c9d [SPARK-36054][SQL] Support group by TimestampNTZ type column ae62c9d is described below commit ae62c9d7726e2b05897f7e807bb9cdcc2748e3fa Author: Gengliang Wang AuthorDate: Thu Jul 8 22:33:25 2021 +0300 [SPARK-36054][SQL] Support group by TimestampNTZ type column ### What changes were proposed in this pull request? Support group by TimestampNTZ type column ### Why are the changes needed? It's a basic SQL operation. ### Does this PR introduce _any_ user-facing change? No, the new timestmap type is not released yet. ### How was this patch tested? Unit test Closes #33268 from gengliangwang/agg. Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit 382b66e26725e4667607303ebb9803b05e8076bc) Signed-off-by: Max Gekk --- .../org/apache/spark/sql/catalyst/expressions/hash.scala| 2 +- .../spark/sql/execution/vectorized/OffHeapColumnVector.java | 3 ++- .../spark/sql/execution/vectorized/OnHeapColumnVector.java | 3 ++- .../spark/sql/execution/aggregate/HashMapGenerator.scala| 2 +- .../org/apache/spark/sql/DataFrameAggregateSuite.scala | 13 - 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index d730586..3785262 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -490,7 +490,7 @@ abstract class HashExpression[E] extends Expression { case BooleanType => genHashBoolean(input, result) case ByteType | ShortType | IntegerType | DateType => genHashInt(input, result) case LongType => genHashLong(input, 
result) -case TimestampType => genHashTimestamp(input, result) +case TimestampType | TimestampNTZType => genHashTimestamp(input, result) case FloatType => genHashFloat(input, result) case DoubleType => genHashDouble(input, result) case d: DecimalType => genHashDecimal(ctx, d, input, result) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 7da5a28..b4b6903 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -553,7 +553,8 @@ public final class OffHeapColumnVector extends WritableColumnVector { type instanceof DateType || DecimalType.is32BitDecimalType(type)) { this.data = Platform.reallocateMemory(data, oldCapacity * 4L, newCapacity * 4L); } else if (type instanceof LongType || type instanceof DoubleType || -DecimalType.is64BitDecimalType(type) || type instanceof TimestampType) { +DecimalType.is64BitDecimalType(type) || type instanceof TimestampType || +type instanceof TimestampNTZType) { this.data = Platform.reallocateMemory(data, oldCapacity * 8L, newCapacity * 8L); } else if (childColumns != null) { // Nothing to store. 
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 5942c5f..3fb96d8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -547,7 +547,8 @@ public final class OnHeapColumnVector extends WritableColumnVector { if (intData != null) System.arraycopy(intData, 0, newData, 0, capacity); intData = newData; } -} else if (type instanceof LongType || type instanceof TimestampType || +} else if (type instanceof LongType || +type instanceof TimestampType ||type instanceof TimestampNTZType || DecimalType.is64BitDecimalType(type) || type instanceof DayTimeIntervalType) { if (longData == null || longData.length < newCapacity) { long[] newData = new long[newCapacity]; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala index b3f5e34..713e7db 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala +++ b/
[spark] branch master updated (819c482 -> 382b66e)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/spark.git. from 819c482 [SPARK-35340][PYTHON] Standardize TypeError messages for unsupported basic operations add 382b66e [SPARK-36054][SQL] Support group by TimestampNTZ type column No new revisions were added by this update. Summary of changes: .../org/apache/spark/sql/catalyst/expressions/hash.scala| 2 +- .../spark/sql/execution/vectorized/OffHeapColumnVector.java | 3 ++- .../spark/sql/execution/vectorized/OnHeapColumnVector.java | 3 ++- .../spark/sql/execution/aggregate/HashMapGenerator.scala| 2 +- .../org/apache/spark/sql/DataFrameAggregateSuite.scala | 13 - 5 files changed, 18 insertions(+), 5 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-36055][SQL] Assign pretty SQL string to TimestampNTZ literals
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 9103c1f [SPARK-36055][SQL] Assign pretty SQL string to TimestampNTZ literals 9103c1f is described below commit 9103c1fe2332a60424077ca9ecffb24afa440c55 Author: Gengliang Wang AuthorDate: Thu Jul 8 21:42:50 2021 +0300 [SPARK-36055][SQL] Assign pretty SQL string to TimestampNTZ literals ### What changes were proposed in this pull request? Currently the TimestampNTZ literals shows only long value instead of timestamp string in its SQL string and toString result. Before changes (with default timestamp type as TIMESTAMP_NTZ) ``` – !query select timestamp '2019-01-01\t' – !query schema struct<15463008:timestamp_ntz> ``` After changes: ``` – !query select timestamp '2019-01-01\t' – !query schema struct ``` ### Why are the changes needed? Make the schema of TimestampNTZ literals readable. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test Closes #33269 from gengliangwang/ntzLiteralString. 
Authored-by: Gengliang Wang Signed-off-by: Max Gekk (cherry picked from commit ee945e99cc1d3979a2c24077a9ae786ce50bbe81) Signed-off-by: Max Gekk --- .../spark/sql/catalyst/expressions/literals.scala | 6 +- .../catalyst/expressions/LiteralExpressionSuite.scala | 9 + .../sql-tests/results/timestampNTZ/datetime.sql.out| 18 +- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a2270eb..ee40909 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -28,7 +28,7 @@ import java.lang.{Short => JavaShort} import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} -import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period} +import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period, ZoneOffset} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter @@ -352,6 +352,8 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { DateFormatter().format(value.asInstanceOf[Int]) case TimestampType => TimestampFormatter.getFractionFormatter(timeZoneId).format(value.asInstanceOf[Long]) +case TimestampNTZType => + TimestampFormatter.getFractionFormatter(ZoneOffset.UTC).format(value.asInstanceOf[Long]) case DayTimeIntervalType(startField, endField) => toDayTimeIntervalString(value.asInstanceOf[Long], ANSI_STYLE, startField, endField) case YearMonthIntervalType(startField, endField) => @@ -473,6 +475,8 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { s"DATE '$toString'" case (v: Long, TimestampType) => s"TIMESTAMP '$toString'" +case (v: Long, TimestampNTZType) => + s"TIMESTAMP_NTZ '$toString'" case (i: 
CalendarInterval, CalendarIntervalType) => s"INTERVAL '${i.toString}'" case (v: Array[Byte], BinaryType) => s"X'${DatatypeConverter.printHexBinary(v)}'" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index 50b7263..4081e13 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -362,6 +362,15 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-36055: TimestampNTZ toString") { +assert(Literal.default(TimestampNTZType).toString === "1970-01-01 00:00:00") +withTimeZones(sessionTimeZone = "GMT+01:00", systemTimeZone = "GMT-08:00") { + val timestamp = LocalDateTime.of(2021, 2, 3, 16, 50, 3, 45600) + val literalStr = Literal.create(timestamp).toString + assert(literalStr === "2021-02-03 16:50:03.456") +} + } + test("SPARK-35664: construct literals from java.time.LocalDateTime") { Seq( LocalDateTime.of(1, 1, 1, 0, 0, 0, 0), diff