This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 264ce7b  [SPARK-35573][R][TESTS] Make SparkR tests pass with R 4.1+
264ce7b is described below

commit 264ce7b9062bb37be721fff0f2518fc5644cfe28
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Tue Jun 1 10:35:52 2021 +0900

    [SPARK-35573][R][TESTS] Make SparkR tests pass with R 4.1+
    
    This PR proposes to support R 4.1.0+ in SparkR. Currently, the tests fail 
as shown below:
    
    ```
    ══ Failed 
══════════════════════════════════════════════════════════════════════
    ── 1. Failure (test_sparkSQL_arrow.R:71:3): createDataFrame/collect Arrow 
optimi
    collect(createDataFrame(rdf)) not equal to `expected`.
    Component “g”: 'tzone' attributes are inconsistent ('UTC' and '')
    
    ── 2. Failure (test_sparkSQL_arrow.R:143:3): dapply() Arrow optimization - 
type
    collect(ret) not equal to `rdf`.
    Component “b”: 'tzone' attributes are inconsistent ('UTC' and '')
    
    ── 3. Failure (test_sparkSQL_arrow.R:229:3): gapply() Arrow optimization - 
type
    collect(ret) not equal to `rdf`.
    Component “b”: 'tzone' attributes are inconsistent ('UTC' and '')
    
    ── 4. Error (test_sparkSQL.R:1454:3): column functions 
─────────────────────────
    Error: (converted from warning) cannot xtfrm data frames
    Backtrace:
      1. base::sort(collect(distinct(select(df, input_file_name())))) 
test_sparkSQL.R:1454:2
      2. base::sort.default(collect(distinct(select(df, input_file_name()))))
      5. base::order(x, na.last = na.last, decreasing = decreasing)
      6. base::lapply(z, function(x) if (is.object(x)) as.vector(xtfrm(x)) else 
x)
      7. base:::FUN(X[[i]], ...)
     10. base::xtfrm.data.frame(x)
    
    ── 5. Failure (test_utils.R:67:3): cleanClosure on R functions 
─────────────────
    `actual` not equal to `g`.
    names for current but not for target
    Length mismatch: comparison on first 0 components
    
    ── 6. Failure (test_utils.R:80:3): cleanClosure on R functions 
─────────────────
    `actual` not equal to `g`.
    names for current but not for target
    Length mismatch: comparison on first 0 components
    ```
    
    It fixes three as below:
    
    - Avoid a sort on DataFrame which isn't legitimate: 
https://github.com/apache/spark/pull/32709#discussion_r642458108
    - Treat the empty timezone and local timezone as equivalent in SparkR: 
https://github.com/apache/spark/pull/32709#discussion_r642464454
    - Disable `check.environment` in the cleaned closure comparison (enabled by 
default from R 4.1+, 
https://cran.r-project.org/doc/manuals/r-release/NEWS.html), and keep the test 
as is https://github.com/apache/spark/pull/32709#discussion_r642510089
    
    Higher R versions include bug fixes and improvements. More importantly, R 
users tend to use the latest R versions.
    
    Yes, SparkR will work together with R 4.1.0+
    
    ```bash
    ./R/run-tests.sh
    ```
    
    ```
    sparkSQL_arrow:
    SparkSQL Arrow optimization: .................
    
    ...
    
    sparkSQL:
    SparkSQL functions: 
........................................................................................................................................................................................................
    
........................................................................................................................................................................................................
    
........................................................................................................................................................................................................
    
........................................................................................................................................................................................................
    
........................................................................................................................................................................................................
    
........................................................................................................................................................................................................
    
    ...
    
    utils:
    functions in utils.R: ..............................................
    ```
    
    Closes #32709 from HyukjinKwon/SPARK-35573.
    
    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
    (cherry picked from commit 1ba1b70cfe24f94b882ebc2dcc6f18d8638596a2)
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 R/pkg/tests/fulltests/test_sparkSQL.R       |  2 +-
 R/pkg/tests/fulltests/test_sparkSQL_arrow.R |  6 +++---
 R/pkg/tests/fulltests/test_utils.R          | 14 ++++++++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R 
b/R/pkg/tests/fulltests/test_sparkSQL.R
index 2326897..628d871 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1448,7 +1448,7 @@ test_that("column functions", {
   expect_equal(collect(df2)[[3, 2]], TRUE)
 
   # Test that input_file_name()
-  actual_names <- sort(collect(distinct(select(df, input_file_name()))))
+  actual_names <- collect(distinct(select(df, input_file_name())))
   expect_equal(length(actual_names), 1)
   expect_equal(basename(actual_names[1, 1]), basename(jsonPath))
 
diff --git a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R 
b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
index 0674348..a25a1bb 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -68,7 +68,7 @@ test_that("createDataFrame/collect Arrow optimization - type 
specification", {
     callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", 
arrowEnabled)
   })
 
-  expect_equal(collect(createDataFrame(rdf)), expected)
+  expect_true(all(collect(createDataFrame(rdf)) == expected))
 })
 
 test_that("dapply() Arrow optimization", {
@@ -140,7 +140,7 @@ test_that("dapply() Arrow optimization - type specification 
(date and timestamp)
                               b = as.POSIXct("1990-02-24 12:34:56"))))
   df <- createDataFrame(rdf)
   ret <- dapply(df, function(rdf) { rdf }, schema(df))
-  expect_equal(collect(ret), rdf)
+  expect_true(all(collect(ret) == rdf))
 })
 
 test_that("gapply() Arrow optimization", {
@@ -226,7 +226,7 @@ test_that("gapply() Arrow optimization - type specification 
(date and timestamp)
   ret <- gapply(df,
                 "a",
                 function(key, grouped) { grouped }, schema(df))
-  expect_equal(collect(ret), rdf)
+  expect_true(all(collect(ret) == rdf))
 })
 
 test_that("Arrow optimization - unsupported types", {
diff --git a/R/pkg/tests/fulltests/test_utils.R 
b/R/pkg/tests/fulltests/test_utils.R
index 6c83a13..35f9c9e 100644
--- a/R/pkg/tests/fulltests/test_utils.R
+++ b/R/pkg/tests/fulltests/test_utils.R
@@ -64,7 +64,12 @@ test_that("cleanClosure on R functions", {
   actual <- get("y", envir = env, inherits = FALSE)
   expect_equal(actual, y)
   actual <- get("g", envir = env, inherits = FALSE)
-  expect_equal(actual, g)
+  if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, 
"0")) {
+    # 4.1+ checks environment in the function
+    expect_true(all.equal(actual, g, check.environment = FALSE))
+  } else {
+    expect_equal(actual, g)
+  }
 
   # Test for nested enclosures and package variables.
   env2 <- new.env()
@@ -77,7 +82,12 @@ test_that("cleanClosure on R functions", {
   actual <- get("y", envir = env, inherits = FALSE)
   expect_equal(actual, y)
   actual <- get("g", envir = env, inherits = FALSE)
-  expect_equal(actual, g)
+  if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, 
"0")) {
+    # 4.1+ checks environment in the function
+    expect_true(all.equal(actual, g, check.environment = FALSE))
+  } else {
+    expect_equal(actual, g)
+  }
 
   base <- c(1, 2, 3)
   l <- list(field = matrix(1))

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to