Repository: spark Updated Branches: refs/heads/master c5daccb1d -> 94145786a
[SPARK-25908][SQL][FOLLOW-UP] Add back unionAll ## What changes were proposed in this pull request? This PR is to add back `unionAll`, which is widely used. The name is also consistent with our ANSI SQL. We also have the corresponding `intersectAll` and `exceptAll`, which were introduced in Spark 2.4. ## How was this patch tested? Added a test case in DataFrameSuite Closes #23131 from gatorsmile/addBackUnionAll. Authored-by: gatorsmile <[email protected]> Signed-off-by: gatorsmile <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/94145786 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/94145786 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/94145786 Branch: refs/heads/master Commit: 94145786a5b91a7f0bca44f27599a61c72f3a18f Parents: c5daccb Author: gatorsmile <[email protected]> Authored: Sun Nov 25 15:53:07 2018 -0800 Committer: gatorsmile <[email protected]> Committed: Sun Nov 25 15:53:07 2018 -0800 ---------------------------------------------------------------------- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 14 ++++++++++++++ R/pkg/R/generics.R | 3 +++ R/pkg/tests/fulltests/test_sparkSQL.R | 1 + docs/sparkr.md | 2 +- docs/sql-migration-guide-upgrade.md | 2 ++ python/pyspark/sql/dataframe.py | 11 +++++++++++ .../src/main/scala/org/apache/spark/sql/Dataset.scala | 14 ++++++++++++++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 6 ++++++ 9 files changed, 53 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/NAMESPACE ---------------------------------------------------------------------- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index de56061..cdeafdd 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -169,6 +169,7 @@ exportMethods("arrange", "toJSON", "transform", "union", + "unionAll", "unionByName", "unique", "unpersist", 
http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/R/DataFrame.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 52e7657..ad9cd84 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2732,6 +2732,20 @@ setMethod("union", dataFrame(unioned) }) +#' Return a new SparkDataFrame containing the union of rows +#' +#' This is an alias for `union`. +#' +#' @rdname union +#' @name unionAll +#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method +#' @note unionAll since 1.4.0 +setMethod("unionAll", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { + union(x, y) + }) + #' Return a new SparkDataFrame containing the union of rows, matched by column names #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index cbed276..b2ca6e6 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -631,6 +631,9 @@ setGeneric("toRDD", function(x) { standardGeneric("toRDD") }) #' @rdname union setGeneric("union", function(x, y) { standardGeneric("union") }) +#' @rdname union +setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") }) + #' @rdname unionByName setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") }) http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/tests/fulltests/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index f355a51..77a29c9 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2458,6 +2458,7 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF 
expect_equal(count(unioned), 6) expect_equal(first(unioned)$name, "Michael") expect_equal(count(arrange(suppressWarnings(union(df, df2)), df$age)), 6) + expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6) df1 <- select(df2, "age", "name") unioned1 <- arrange(unionByName(df1, df), df1$age) http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/docs/sparkr.md ---------------------------------------------------------------------- diff --git a/docs/sparkr.md b/docs/sparkr.md index acd0e77..5972435 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -718,4 +718,4 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma ## Upgrading to SparkR 3.0.0 - The deprecated methods `sparkR.init`, `sparkRSQL.init`, `sparkRHive.init` have been removed. Use `sparkR.session` instead. - - The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, `registerTempTable`, `createExternalTable`, `dropTempTable`, `unionAll` have been removed. Use `read.parquet`, `write.parquet`, `read.json`, `createOrReplaceTempView`, `createTable`, `dropTempView`, `union` instead. + - The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, `registerTempTable`, `createExternalTable`, and `dropTempTable` have been removed. Use `read.parquet`, `write.parquet`, `read.json`, `createOrReplaceTempView`, `createTable`, and `dropTempView` instead. http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/docs/sql-migration-guide-upgrade.md ---------------------------------------------------------------------- diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md index 397ca59..68cb8f5 100644 --- a/docs/sql-migration-guide-upgrade.md +++ b/docs/sql-migration-guide-upgrade.md @@ -9,6 +9,8 @@ displayTitle: Spark SQL Upgrading Guide ## Upgrading From Spark SQL 2.4 to 3.0 + - Since Spark 3.0, the Dataset and DataFrame API `unionAll` is not deprecated any more. It is an alias for `union`. 
+ - In PySpark, when creating a `SparkSession` with `SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, the builder was trying to update the `SparkConf` of the existing `SparkContext` with configurations specified to the builder, but the `SparkContext` is shared by all `SparkSession`s, so we should not update them. Since 3.0, the builder comes to not update the configurations. This is the same behavior as Java/Scala API in 2.3 and above. If you want to update them, you need to update them prior to creating a `SparkSession`. - In Spark version 2.4 and earlier, the parser of JSON data source treats empty strings as null for some data types such as `IntegerType`. For `FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. Since Spark 3.0, we disallow empty strings and will throw exceptions for data types except for `StringType` and `BinaryType`. http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/python/pyspark/sql/dataframe.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 4abbeac..ca15b36 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1448,6 +1448,17 @@ class DataFrame(object): """ return DataFrame(self._jdf.union(other._jdf), self.sql_ctx) + @since(1.3) + def unionAll(self, other): + """ Return a new :class:`DataFrame` containing union of rows in this and another frame. + + This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union + (that does deduplication of elements), use this function followed by :func:`distinct`. + + Also as standard in SQL, this function resolves columns by position (not by name). + """ + return self.union(other) + @since(2.3) def unionByName(self, other): """ Returns a new :class:`DataFrame` containing union of rows in this and another frame. 
http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index e757921..f361bde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1854,6 +1854,20 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing union of rows in this Dataset and another Dataset. + * This is an alias for `union`. + * + * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does + * deduplication of elements), use this function followed by a [[distinct]]. + * + * Also as standard in SQL, this function resolves columns by position (not by name). + * + * @group typedrel + * @since 2.0.0 + */ + def unionAll(other: Dataset[T]): Dataset[T] = union(other) + + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. * * This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set * union (that does deduplication of elements), use this function followed by a [[distinct]]. 
http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 0ee2627..7a0767a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -97,6 +97,12 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { unionDF.agg(avg('key), max('key), min('key), sum('key)), Row(50.5, 100, 1, 25250) :: Nil ) + + // unionAll is an alias of union + val unionAllDF = testData.unionAll(testData).unionAll(testData) + .unionAll(testData).unionAll(testData) + + checkAnswer(unionDF, unionAllDF) } test("union should union DataFrames with UDTs (SPARK-13410)") { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
