This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 341c6c5 ARROW-9179: [R] Replace usage of iris dataset in tests
341c6c5 is described below
commit 341c6c546fe1a1b9a01f93e2d3e9d89fa17b6793
Author: Neal Richardson <[email protected]>
AuthorDate: Fri Jun 19 16:35:22 2020 -0700
ARROW-9179: [R] Replace usage of iris dataset in tests
FYI @romainfrancois @wesm
It's not a great dataset anyway for our tests because there's basically no
variation in data type or missingness. Switching some tests to use a more
fully-featured data frame revealed some other issues, in fact.
Closes #7499 from nealrichardson/de-iris
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/csv.R | 4 +--
r/R/feather.R | 4 +--
r/R/record-batch-reader.R | 4 +--
r/R/record-batch-writer.R | 4 +--
r/man/RecordBatchReader.Rd | 4 +--
r/man/RecordBatchWriter.Rd | 4 +--
r/man/read_delim_arrow.Rd | 4 +--
r/man/read_feather.Rd | 4 +--
r/tests/testthat/helper-data.R | 25 +++++++++++++++
r/tests/testthat/test-RecordBatch.R | 4 +--
r/tests/testthat/test-Table.R | 4 +--
r/tests/testthat/test-csv.R | 63 +++++++++++++++++--------------------
r/tests/testthat/test-dplyr.R | 18 ++++-------
r/tests/testthat/test-type.R | 2 +-
14 files changed, 80 insertions(+), 68 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 29557e3..e145a90 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -77,11 +77,11 @@
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
-#' write.csv(iris, file = tf)
+#' write.csv(mtcars, file = tf)
#' df <- read_csv_arrow(tf)
#' dim(df)
#' # Can select columns
-#' df <- read_csv_arrow(tf, col_select = starts_with("Sepal"))
+#' df <- read_csv_arrow(tf, col_select = starts_with("d"))
#' }
read_delim_arrow <- function(file,
delim = ",",
diff --git a/r/R/feather.R b/r/R/feather.R
index 7ee49b9..9b8dc8c 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -135,11 +135,11 @@ write_feather <- function(x,
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
-#' write_feather(iris, tf)
+#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
-#' df <- read_feather(tf, col_select = starts_with("Sepal"))
+#' df <- read_feather(tf, col_select = starts_with("d"))
#' }
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
if (!inherits(file, "InputStream")) {
diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R
index d2f4f19..85ce839 100644
--- a/r/R/record-batch-reader.R
+++ b/r/R/record-batch-reader.R
@@ -61,7 +61,7 @@
#' tf <- tempfile()
#' on.exit(unlink(tf))
#'
-#' batch <- record_batch(iris)
+#' batch <- record_batch(chickwts)
#'
#' # This opens a connection to the file in Arrow
#' file_obj <- FileOutputStream$create(tf)
@@ -87,7 +87,7 @@
#' # Call as.data.frame to turn that Table into an R data.frame
#' df <- as.data.frame(tab)
#' # This should be the same data we sent
-#' all.equal(df, iris, check.attributes = FALSE)
+#' all.equal(df, chickwts, check.attributes = FALSE)
#' # Unlike the Writers, we don't have to close RecordBatchReaders,
#' # but we do still need to close the file connection
#' read_file_obj$close()
diff --git a/r/R/record-batch-writer.R b/r/R/record-batch-writer.R
index f4ab664..cd71bdb 100644
--- a/r/R/record-batch-writer.R
+++ b/r/R/record-batch-writer.R
@@ -60,7 +60,7 @@
#' tf <- tempfile()
#' on.exit(unlink(tf))
#'
-#' batch <- record_batch(iris)
+#' batch <- record_batch(chickwts)
#'
#' # This opens a connection to the file in Arrow
#' file_obj <- FileOutputStream$create(tf)
@@ -86,7 +86,7 @@
#' # Call as.data.frame to turn that Table into an R data.frame
#' df <- as.data.frame(tab)
#' # This should be the same data we sent
-#' all.equal(df, iris, check.attributes = FALSE)
+#' all.equal(df, chickwts, check.attributes = FALSE)
#' # Unlike the Writers, we don't have to close RecordBatchReaders,
#' # but we do still need to close the file connection
#' read_file_obj$close()
diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd
index bbadfd5..6b204b0 100644
--- a/r/man/RecordBatchReader.Rd
+++ b/r/man/RecordBatchReader.Rd
@@ -48,7 +48,7 @@ are in the file.
tf <- tempfile()
on.exit(unlink(tf))
-batch <- record_batch(iris)
+batch <- record_batch(chickwts)
# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
@@ -74,7 +74,7 @@ tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
-all.equal(df, iris, check.attributes = FALSE)
+all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd
index f7f3c77..fd7b886 100644
--- a/r/man/RecordBatchWriter.Rd
+++ b/r/man/RecordBatchWriter.Rd
@@ -46,7 +46,7 @@ to be closed separately.
tf <- tempfile()
on.exit(unlink(tf))
-batch <- record_batch(iris)
+batch <- record_batch(chickwts)
# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
@@ -72,7 +72,7 @@ tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
-all.equal(df, iris, check.attributes = FALSE)
+all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index a01d722..124abdc 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -136,10 +136,10 @@ use \link{CsvTableReader} directly for lower-level access.
\donttest{
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, file = tf)
+ write.csv(mtcars, file = tf)
df <- read_csv_arrow(tf)
dim(df)
# Can select columns
- df <- read_csv_arrow(tf, col_select = starts_with("Sepal"))
+ df <- read_csv_arrow(tf, col_select = starts_with("d"))
}
}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 6f722a4..f507edb 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -37,11 +37,11 @@ and the version 2 specification, which is the Apache Arrow
IPC file format.
\donttest{
tf <- tempfile()
on.exit(unlink(tf))
-write_feather(iris, tf)
+write_feather(mtcars, tf)
df <- read_feather(tf)
dim(df)
# Can select columns
-df <- read_feather(tf, col_select = starts_with("Sepal"))
+df <- read_feather(tf, col_select = starts_with("d"))
}
}
\seealso{
diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R
new file mode 100644
index 0000000..e826cd4
--- /dev/null
+++ b/r/tests/testthat/helper-data.R
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+ int = c(1:3, NA_integer_, 5:10),
+ dbl = c(1:8, NA, 10) + .1,
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ false = logical(10),
+ chr = letters[c(1:5, NA, 7:10)],
+ fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index 8c265b0..c87893a 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -102,10 +102,10 @@ test_that("RecordBatch", {
})
test_that("RecordBatch S3 methods", {
- tab <- RecordBatch$create(iris)
+ tab <- RecordBatch$create(example_data)
for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names",
"as.list")) {
fun <- get(f)
- expect_identical(fun(tab), fun(iris), info = f)
+ expect_identical(fun(tab), fun(example_data), info = f)
}
})
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index d8d4688..4daf0c8 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -66,10 +66,10 @@ test_that("Table cast (ARROW-3741)", {
})
test_that("Table S3 methods", {
- tab <- Table$create(iris)
+ tab <- Table$create(example_data)
for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names",
"as.list")) {
fun <- get(f)
- expect_identical(fun(tab), fun(iris), info = f)
+ expect_identical(fun(tab), fun(example_data), info = f)
}
})
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 81a4bf8..2d85437 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -17,14 +17,16 @@
context("CsvTableReader")
+# Not all types round trip via CSV 100% identical by default
+tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+
test_that("Can read csv file", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
+ write.csv(tbl, tf, row.names = FALSE)
- iris$Species <- as.character(iris$Species)
- tab0 <- Table$create(!!!iris)
+ tab0 <- Table$create(tbl)
tab1 <- read_csv_arrow(tf, as_data_frame = FALSE)
expect_equal(tab0, tab1)
tab2 <- read_csv_arrow(mmap_open(tf), as_data_frame = FALSE)
@@ -37,24 +39,20 @@ test_that("read_csv_arrow(as_data_frame=TRUE)", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
- iris$Species <- as.character(iris$Species)
-
+ write.csv(tbl, tf, row.names = FALSE)
tab1 <- read_csv_arrow(tf, as_data_frame = TRUE)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
test_that("read_delim_arrow parsing options: delim", {
tf <- tempfile()
on.exit(unlink(tf))
- write.table(iris, tf, sep = "\t", row.names = FALSE)
+ write.table(tbl, tf, sep = "\t", row.names = FALSE)
tab1 <- read_tsv_arrow(tf)
tab2 <- read_delim_arrow(tf, delim = "\t")
expect_equivalent(tab1, tab2)
-
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
test_that("read_delim_arrow parsing options: quote", {
@@ -79,26 +77,25 @@ test_that("read_csv_arrow parsing options: col_names", {
on.exit(unlink(tf))
# Writing the CSV without the header
- write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
+ write.table(tbl, tf, sep = ",", row.names = FALSE, col.names = FALSE)
# Reading with col_names = FALSE autogenerates names
no_names <- read_csv_arrow(tf, col_names = FALSE)
- expect_equal(no_names$f0, iris$Sepal.Length)
+ expect_equal(no_names$f0, tbl[[1]])
- tab1 <- read_csv_arrow(tf, col_names = names(iris))
+ tab1 <- read_csv_arrow(tf, col_names = names(tbl))
- expect_identical(names(tab1), names(iris))
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_identical(names(tab1), names(tbl))
+ expect_equivalent(tbl, tab1)
# This errors (correctly) because I haven't given enough names
# but the error message is "Invalid: Empty CSV file", which is not accurate
expect_error(
- read_csv_arrow(tf, col_names = names(iris)[1])
+ read_csv_arrow(tf, col_names = names(tbl)[1])
)
# Same here
expect_error(
- read_csv_arrow(tf, col_names = c(names(iris), names(iris)))
+ read_csv_arrow(tf, col_names = c(names(tbl), names(tbl)))
)
})
@@ -108,25 +105,24 @@ test_that("read_csv_arrow parsing options: skip", {
# Adding two garbage lines to start the csv
cat("asdf\nqwer\n", file = tf)
- suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE))
+ suppressWarnings(write.table(tbl, tf, sep = ",", row.names = FALSE, append = TRUE))
tab1 <- read_csv_arrow(tf, skip = 2)
- expect_identical(names(tab1), names(iris))
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_identical(names(tab1), names(tbl))
+ expect_equivalent(tbl, tab1)
})
test_that("read_csv_arrow parsing options: skip_empty_rows", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
+ write.csv(tbl, tf, row.names = FALSE)
cat("\n\n", file = tf, append = TRUE)
tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE)
- expect_equal(nrow(tab1), nrow(iris) + 2)
+ expect_equal(nrow(tab1), nrow(tbl) + 2)
expect_true(is.na(tail(tab1, 1)[[1]]))
})
@@ -160,13 +156,13 @@ test_that("read_csv_arrow() respects col_select", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+ write.csv(tbl, tf, row.names = FALSE, quote = FALSE)
- tab <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_data_frame = FALSE)
- expect_equal(tab, Table$create(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+ tab <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = FALSE)
+ expect_equal(tab, Table$create(example_data[, c("dbl", "lgl")]))
- tib <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_data_frame = TRUE)
- expect_equal(tib, tibble::tibble(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+ tib <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = TRUE)
+ expect_equal(tib, example_data[, c("dbl", "lgl")])
})
test_that("read_csv_arrow() can detect compression from file name", {
@@ -174,10 +170,7 @@ test_that("read_csv_arrow() can detect compression from file name", {
tf <- tempfile(fileext = ".csv.gz")
on.exit(unlink(tf))
- write.csv(iris, gzfile(tf), row.names = FALSE, quote = FALSE)
-
- iris$Species <- as.character(iris$Species)
-
+ write.csv(tbl, gzfile(tf), row.names = FALSE, quote = FALSE)
tab1 <- read_csv_arrow(tf)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
index 3ae915e..7b4afda 100644
--- a/r/tests/testthat/test-dplyr.R
+++ b/r/tests/testthat/test-dplyr.R
@@ -32,7 +32,7 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star
expr,
rlang::new_data_mask(rlang::env(input = record_batch(tbl)))
)
- expect_equal(via_batch, expected, ...)
+ expect_equivalent(via_batch, expected, ...)
} else {
skip(skip_record_batch)
}
@@ -42,7 +42,7 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star
expr,
rlang::new_data_mask(rlang::env(input = Table$create(tbl)))
)
- expect_equal(via_table, expected, ...)
+ expect_equivalent(via_table, expected, ...)
} else {
skip(skip_table)
}
@@ -76,14 +76,7 @@ expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its star
)
}
-tbl <- tibble::tibble(
- int = 1:10,
- dbl = as.numeric(1:10),
- lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
- false = logical(10),
- chr = letters[1:10],
- fct = factor(letters[1:10])
-)
+tbl <- example_data
test_that("basic select/filter/collect", {
batch <- record_batch(tbl)
@@ -94,7 +87,7 @@ test_that("basic select/filter/collect", {
expect_is(b2, "arrow_dplyr_query")
t2 <- collect(b2)
- expect_equal(t2, tbl[tbl$int > 5, c("int", "chr")])
+ expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")])
# Test that the original object is not affected
expect_identical(collect(batch), tbl)
})
@@ -176,9 +169,10 @@ test_that("filter environment scope", {
# 'could not find function "isEqualTo"'
expect_dplyr_error(filter(batch, isEqualTo(int, 4)))
- isEqualTo <- function(x, y) x == y
+ isEqualTo <- function(x, y) x == y & !is.na(x)
expect_dplyr_equal(
input %>%
+ select(-fct) %>% # factor levels aren't identical
filter(isEqualTo(int, 4)) %>%
collect(),
tbl
diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R
index aa7627b..8f2cc7c 100644
--- a/r/tests/testthat/test-type.R
+++ b/r/tests/testthat/test-type.R
@@ -34,7 +34,7 @@ test_that("type() infers from R type", {
expect_type_equal(type(raw()), int8())
expect_type_equal(type(""), utf8())
expect_type_equal(
- type(iris$Species),
+ type(example_data$fct),
dictionary(int8(), utf8(), FALSE)
)
expect_type_equal(