spark git commit: [SPARK-25908][SQL][FOLLOW-UP] Add back unionAll

lixiao Sun, 25 Nov 2018 15:53:41 -0800

Repository: spark
Updated Branches:
  refs/heads/master c5daccb1d -> 94145786a



[SPARK-25908][SQL][FOLLOW-UP] Add back unionAll

## What changes were proposed in this pull request?
This PR adds back `unionAll`, which is widely used. The name is consistent 
with ANSI SQL (`UNION ALL`) and matches the corresponding `intersectAll` and 
`exceptAll` methods, which were introduced in Spark 2.4.

## How was this patch tested?
Added a test case in DataFrameSuite

Closes #23131 from gatorsmile/addBackUnionAll.

Authored-by: gatorsmile <[email protected]>
Signed-off-by: gatorsmile <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/94145786
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/94145786
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/94145786

Branch: refs/heads/master
Commit: 94145786a5b91a7f0bca44f27599a61c72f3a18f
Parents: c5daccb
Author: gatorsmile <[email protected]>
Authored: Sun Nov 25 15:53:07 2018 -0800
Committer: gatorsmile <[email protected]>
Committed: Sun Nov 25 15:53:07 2018 -0800

----------------------------------------------------------------------
 R/pkg/NAMESPACE                                       |  1 +
 R/pkg/R/DataFrame.R                                   | 14 ++++++++++++++
 R/pkg/R/generics.R                                    |  3 +++
 R/pkg/tests/fulltests/test_sparkSQL.R                 |  1 +
 docs/sparkr.md                                        |  2 +-
 docs/sql-migration-guide-upgrade.md                   |  2 ++
 python/pyspark/sql/dataframe.py                       | 11 +++++++++++
 .../src/main/scala/org/apache/spark/sql/Dataset.scala | 14 ++++++++++++++
 .../scala/org/apache/spark/sql/DataFrameSuite.scala   |  6 ++++++
 9 files changed, 53 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index de56061..cdeafdd 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -169,6 +169,7 @@ exportMethods("arrange",
               "toJSON",
               "transform",
               "union",
+              "unionAll",
               "unionByName",
               "unique",
               "unpersist",

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 52e7657..ad9cd84 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2732,6 +2732,20 @@ setMethod("union",
             dataFrame(unioned)
           })
 
+#' Return a new SparkDataFrame containing the union of rows
+#'
+#' This is an alias for `union`.
+#'
+#' @rdname union
+#' @name unionAll
+#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method
+#' @note unionAll since 1.4.0
+setMethod("unionAll",
+          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+          function(x, y) {
+            union(x, y)
+          })
+
 #' Return a new SparkDataFrame containing the union of rows, matched by column 
names
 #'
 #' Return a new SparkDataFrame containing the union of rows in this 
SparkDataFrame

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index cbed276..b2ca6e6 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -631,6 +631,9 @@ setGeneric("toRDD", function(x) { standardGeneric("toRDD") 
})
 #' @rdname union
 setGeneric("union", function(x, y) { standardGeneric("union") })
 
+#' @rdname union
+setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
+
 #' @rdname unionByName
 setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/R/pkg/tests/fulltests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R 
b/R/pkg/tests/fulltests/test_sparkSQL.R
index f355a51..77a29c9 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2458,6 +2458,7 @@ test_that("union(), unionByName(), rbind(), except(), and 
intersect() on a DataF
   expect_equal(count(unioned), 6)
   expect_equal(first(unioned)$name, "Michael")
   expect_equal(count(arrange(suppressWarnings(union(df, df2)), df$age)), 6)
+  expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
 
   df1 <- select(df2, "age", "name")
   unioned1 <- arrange(unionByName(df1, df), df1$age)

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/docs/sparkr.md
----------------------------------------------------------------------
diff --git a/docs/sparkr.md b/docs/sparkr.md
index acd0e77..5972435 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -718,4 +718,4 @@ You can inspect the search path in R with 
[`search()`](https://stat.ethz.ch/R-ma
 ## Upgrading to SparkR 3.0.0
 
  - The deprecated methods `sparkR.init`, `sparkRSQL.init`, `sparkRHive.init` 
have been removed. Use `sparkR.session` instead.
- - The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, 
`registerTempTable`, `createExternalTable`, `dropTempTable`, `unionAll` have 
been removed. Use `read.parquet`, `write.parquet`, `read.json`, 
`createOrReplaceTempView`, `createTable`, `dropTempView`, `union` instead.
+ - The deprecated methods `parquetFile`, `saveAsParquetFile`, `jsonFile`, 
`registerTempTable`, `createExternalTable`, and `dropTempTable` have been 
removed. Use `read.parquet`, `write.parquet`, `read.json`, 
`createOrReplaceTempView`, `createTable`, and `dropTempView` instead.

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/docs/sql-migration-guide-upgrade.md
----------------------------------------------------------------------
diff --git a/docs/sql-migration-guide-upgrade.md 
b/docs/sql-migration-guide-upgrade.md
index 397ca59..68cb8f5 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -9,6 +9,8 @@ displayTitle: Spark SQL Upgrading Guide
 
 ## Upgrading From Spark SQL 2.4 to 3.0
 
+  - Since Spark 3.0, the Dataset and DataFrame API `unionAll` is no longer 
deprecated. It is an alias for `union`.
+
   - In PySpark, when creating a `SparkSession` with 
`SparkSession.builder.getOrCreate()`, if there is an existing `SparkContext`, 
the builder was trying to update the `SparkConf` of the existing `SparkContext` 
with configurations specified to the builder, but the `SparkContext` is shared 
by all `SparkSession`s, so we should not update them. Since 3.0, the builder 
comes to not update the configurations. This is the same behavior as Java/Scala 
API in 2.3 and above. If you want to update them, you need to update them prior 
to creating a `SparkSession`.
 
   - In Spark version 2.4 and earlier, the parser of JSON data source treats 
empty strings as null for some data types such as `IntegerType`. For 
`FloatType` and `DoubleType`, it fails on empty strings and throws exceptions. 
Since Spark 3.0, we disallow empty strings and will throw exceptions for data 
types except for `StringType` and `BinaryType`.

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 4abbeac..ca15b36 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1448,6 +1448,17 @@ class DataFrame(object):
         """
         return DataFrame(self._jdf.union(other._jdf), self.sql_ctx)
 
+    @since(1.3)
+    def unionAll(self, other):
+        """ Return a new :class:`DataFrame` containing union of rows in this 
and another frame.
+
+        This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
+        (that does deduplication of elements), use this function followed by 
:func:`distinct`.
+
+        Also as standard in SQL, this function resolves columns by position 
(not by name).
+        """
+        return self.union(other)
+
     @since(2.3)
     def unionByName(self, other):
         """ Returns a new :class:`DataFrame` containing union of rows in this 
and another frame.

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index e757921..f361bde 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1854,6 +1854,20 @@ class Dataset[T] private[sql](
 
   /**
    * Returns a new Dataset containing union of rows in this Dataset and 
another Dataset.
+   * This is an alias for `union`.
+   *
+   * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union 
(that does
+   * deduplication of elements), use this function followed by a [[distinct]].
+   *
+   * Also as standard in SQL, this function resolves columns by position (not 
by name).
+   *
+   * @group typedrel
+   * @since 2.0.0
+   */
+  def unionAll(other: Dataset[T]): Dataset[T] = union(other)
+
+  /**
+   * Returns a new Dataset containing union of rows in this Dataset and 
another Dataset.
    *
    * This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To 
do a SQL-style set
    * union (that does deduplication of elements), use this function followed 
by a [[distinct]].

http://git-wip-us.apache.org/repos/asf/spark/blob/94145786/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 0ee2627..7a0767a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -97,6 +97,12 @@ class DataFrameSuite extends QueryTest with SharedSQLContext 
{
       unionDF.agg(avg('key), max('key), min('key), sum('key)),
       Row(50.5, 100, 1, 25250) :: Nil
     )
+
+    // unionAll is an alias of union
+    val unionAllDF = testData.unionAll(testData).unionAll(testData)
+      .unionAll(testData).unionAll(testData)
+
+    checkAnswer(unionDF, unionAllDF)
   }
 
   test("union should union DataFrames with UDTs (SPARK-13410)") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-25908][SQL][FOLLOW-UP] Add back unionAll

Reply via email to