This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 341c6c5 ARROW-9179: [R] Replace usage of iris dataset in tests
341c6c5 is described below
commit 341c6c546fe1a1b9a01f93e2d3e9d89fa17b6793
Author: Neal Richardson <[email protected]>
AuthorDate: Fri Jun 19 16:35:22 2020 -0700
ARROW-9179: [R] Replace usage of iris dataset in tests
FYI @romainfrancois @wesm
It's not a great dataset anyway for our tests because there's basically no
variation in data type or missingness. Switching some tests to use a more
fully-featured data frame revealed some other issues, in fact.
Closes #7499 from nealrichardson/de-iris
Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/csv.R | 4 +--
r/R/feather.R | 4 +--
r/R/record-batch-reader.R | 4 +--
r/R/record-batch-writer.R | 4 +--
r/man/RecordBatchReader.Rd | 4 +--
r/man/RecordBatchWriter.Rd | 4 +--
r/man/read_delim_arrow.Rd | 4 +--
r/man/read_feather.Rd | 4 +--
r/tests/testthat/helper-data.R | 25 +++++++++++++++
r/tests/testthat/test-RecordBatch.R | 4 +--
r/tests/testthat/test-Table.R | 4 +--
r/tests/testthat/test-csv.R | 63 +++++++++++++++++--------------------
r/tests/testthat/test-dplyr.R | 18 ++++-------
r/tests/testthat/test-type.R | 2 +-
14 files changed, 80 insertions(+), 68 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index 29557e3..e145a90 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -77,11 +77,11 @@
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
-#' write.csv(iris, file = tf)
+#' write.csv(mtcars, file = tf)
#' df <- read_csv_arrow(tf)
#' dim(df)
#' # Can select columns
-#' df <- read_csv_arrow(tf, col_select = starts_with("Sepal"))
+#' df <- read_csv_arrow(tf, col_select = starts_with("d"))
#' }
read_delim_arrow <- function(file,
delim = ",",
diff --git a/r/R/feather.R b/r/R/feather.R
index 7ee49b9..9b8dc8c 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -135,11 +135,11 @@ write_feather <- function(x,
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
-#' write_feather(iris, tf)
+#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
-#' df <- read_feather(tf, col_select = starts_with("Sepal"))
+#' df <- read_feather(tf, col_select = starts_with("d"))
#' }
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
if (!inherits(file, "InputStream")) {
diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R
index d2f4f19..85ce839 100644
--- a/r/R/record-batch-reader.R
+++ b/r/R/record-batch-reader.R
@@ -61,7 +61,7 @@
#' tf <- tempfile()
#' on.exit(unlink(tf))
#'
-#' batch <- record_batch(iris)
+#' batch <- record_batch(chickwts)
#'
#' # This opens a connection to the file in Arrow
#' file_obj <- FileOutputStream$create(tf)
@@ -87,7 +87,7 @@
#' # Call as.data.frame to turn that Table into an R data.frame
#' df <- as.data.frame(tab)
#' # This should be the same data we sent
-#' all.equal(df, iris, check.attributes = FALSE)
+#' all.equal(df, chickwts, check.attributes = FALSE)
#' # Unlike the Writers, we don't have to close RecordBatchReaders,
#' # but we do still need to close the file connection
#' read_file_obj$close()
diff --git a/r/R/record-batch-writer.R b/r/R/record-batch-writer.R
index f4ab664..cd71bdb 100644
--- a/r/R/record-batch-writer.R
+++ b/r/R/record-batch-writer.R
@@ -60,7 +60,7 @@
#' tf <- tempfile()
#' on.exit(unlink(tf))
#'
-#' batch <- record_batch(iris)
+#' batch <- record_batch(chickwts)
#'
#' # This opens a connection to the file in Arrow
#' file_obj <- FileOutputStream$create(tf)
@@ -86,7 +86,7 @@
#' # Call as.data.frame to turn that Table into an R data.frame
#' df <- as.data.frame(tab)
#' # This should be the same data we sent
-#' all.equal(df, iris, check.attributes = FALSE)
+#' all.equal(df, chickwts, check.attributes = FALSE)
#' # Unlike the Writers, we don't have to close RecordBatchReaders,
#' # but we do still need to close the file connection
#' read_file_obj$close()
diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd
index bbadfd5..6b204b0 100644
--- a/r/man/RecordBatchReader.Rd
+++ b/r/man/RecordBatchReader.Rd
@@ -48,7 +48,7 @@ are in the file.
tf <- tempfile()
on.exit(unlink(tf))
-batch <- record_batch(iris)
+batch <- record_batch(chickwts)
# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
@@ -74,7 +74,7 @@ tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
-all.equal(df, iris, check.attributes = FALSE)
+all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd
index f7f3c77..fd7b886 100644
--- a/r/man/RecordBatchWriter.Rd
+++ b/r/man/RecordBatchWriter.Rd
@@ -46,7 +46,7 @@ to be closed separately.
tf <- tempfile()
on.exit(unlink(tf))
-batch <- record_batch(iris)
+batch <- record_batch(chickwts)
# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
@@ -72,7 +72,7 @@ tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
-all.equal(df, iris, check.attributes = FALSE)
+all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index a01d722..124abdc 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -136,10 +136,10 @@ use \link{CsvTableReader} directly for lower-level access.
\donttest{
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, file = tf)
+ write.csv(mtcars, file = tf)
df <- read_csv_arrow(tf)
dim(df)
# Can select columns
- df <- read_csv_arrow(tf, col_select = starts_with("Sepal"))
+ df <- read_csv_arrow(tf, col_select = starts_with("d"))
}
}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 6f722a4..f507edb 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -37,11 +37,11 @@ and the version 2 specification, which is the Apache Arrow
IPC file format.
\donttest{
tf <- tempfile()
on.exit(unlink(tf))
-write_feather(iris, tf)
+write_feather(mtcars, tf)
df <- read_feather(tf)
dim(df)
# Can select columns
-df <- read_feather(tf, col_select = starts_with("Sepal"))
+df <- read_feather(tf, col_select = starts_with("d"))
}
}
\seealso{
diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R
new file mode 100644
index 0000000..e826cd4
--- /dev/null
+++ b/r/tests/testthat/helper-data.R
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+ int = c(1:3, NA_integer_, 5:10),
+ dbl = c(1:8, NA, 10) + .1,
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ false = logical(10),
+ chr = letters[c(1:5, NA, 7:10)],
+ fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index 8c265b0..c87893a 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -102,10 +102,10 @@ test_that("RecordBatch", {
})
test_that("RecordBatch S3 methods", {
- tab <- RecordBatch$create(iris)
+ tab <- RecordBatch$create(example_data)
for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names",
"as.list")) {
fun <- get(f)
- expect_identical(fun(tab), fun(iris), info = f)
+ expect_identical(fun(tab), fun(example_data), info = f)
}
})
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index d8d4688..4daf0c8 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -66,10 +66,10 @@ test_that("Table cast (ARROW-3741)", {
})
test_that("Table S3 methods", {
- tab <- Table$create(iris)
+ tab <- Table$create(example_data)
for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names",
"as.list")) {
fun <- get(f)
- expect_identical(fun(tab), fun(iris), info = f)
+ expect_identical(fun(tab), fun(example_data), info = f)
}
})
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 81a4bf8..2d85437 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -17,14 +17,16 @@
context("CsvTableReader")
+# Not all types round trip via CSV 100% identical by default
+tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+
test_that("Can read csv file", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
+ write.csv(tbl, tf, row.names = FALSE)
- iris$Species <- as.character(iris$Species)
- tab0 <- Table$create(!!!iris)
+ tab0 <- Table$create(tbl)
tab1 <- read_csv_arrow(tf, as_data_frame = FALSE)
expect_equal(tab0, tab1)
tab2 <- read_csv_arrow(mmap_open(tf), as_data_frame = FALSE)
@@ -37,24 +39,20 @@ test_that("read_csv_arrow(as_data_frame=TRUE)", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
- iris$Species <- as.character(iris$Species)
-
+ write.csv(tbl, tf, row.names = FALSE)
tab1 <- read_csv_arrow(tf, as_data_frame = TRUE)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
test_that("read_delim_arrow parsing options: delim", {
tf <- tempfile()
on.exit(unlink(tf))
- write.table(iris, tf, sep = "\t", row.names = FALSE)
+ write.table(tbl, tf, sep = "\t", row.names = FALSE)
tab1 <- read_tsv_arrow(tf)
tab2 <- read_delim_arrow(tf, delim = "\t")
expect_equivalent(tab1, tab2)
-
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
test_that("read_delim_arrow parsing options: quote", {
@@ -79,26 +77,25 @@ test_that("read_csv_arrow parsing options: col_names", {
on.exit(unlink(tf))
# Writing the CSV without the header
- write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
+ write.table(tbl, tf, sep = ",", row.names = FALSE, col.names = FALSE)
# Reading with col_names = FALSE autogenerates names
no_names <- read_csv_arrow(tf, col_names = FALSE)
- expect_equal(no_names$f0, iris$Sepal.Length)
+ expect_equal(no_names$f0, tbl[[1]])
- tab1 <- read_csv_arrow(tf, col_names = names(iris))
+ tab1 <- read_csv_arrow(tf, col_names = names(tbl))
- expect_identical(names(tab1), names(iris))
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_identical(names(tab1), names(tbl))
+ expect_equivalent(tbl, tab1)
# This errors (correctly) because I haven't given enough names
# but the error message is "Invalid: Empty CSV file", which is not accurate
expect_error(
- read_csv_arrow(tf, col_names = names(iris)[1])
+ read_csv_arrow(tf, col_names = names(tbl)[1])
)
# Same here
expect_error(
- read_csv_arrow(tf, col_names = c(names(iris), names(iris)))
+ read_csv_arrow(tf, col_names = c(names(tbl), names(tbl)))
)
})
@@ -108,25 +105,24 @@ test_that("read_csv_arrow parsing options: skip", {
# Adding two garbage lines to start the csv
cat("asdf\nqwer\n", file = tf)
- suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE))
+ suppressWarnings(write.table(tbl, tf, sep = ",", row.names = FALSE, append = TRUE))
tab1 <- read_csv_arrow(tf, skip = 2)
- expect_identical(names(tab1), names(iris))
- iris$Species <- as.character(iris$Species)
- expect_equivalent(iris, tab1)
+ expect_identical(names(tab1), names(tbl))
+ expect_equivalent(tbl, tab1)
})
test_that("read_csv_arrow parsing options: skip_empty_rows", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE)
+ write.csv(tbl, tf, row.names = FALSE)
cat("\n\n", file = tf, append = TRUE)
tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE)
- expect_equal(nrow(tab1), nrow(iris) + 2)
+ expect_equal(nrow(tab1), nrow(tbl) + 2)
expect_true(is.na(tail(tab1, 1)[[1]]))
})
@@ -160,13 +156,13 @@ test_that("read_csv_arrow() respects col_select", {
tf <- tempfile()
on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+ write.csv(tbl, tf, row.names = FALSE, quote = FALSE)
- tab <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_data_frame = FALSE)
- expect_equal(tab, Table$create(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+ tab <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = FALSE)
+ expect_equal(tab, Table$create(example_data[, c("dbl", "lgl")]))
- tib <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_data_frame = TRUE)
- expect_equal(tib, tibble::tibble(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+ tib <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = TRUE)
+ expect_equal(tib, example_data[, c("dbl", "lgl")])
})
test_that("read_csv_arrow() can detect compression from file name", {
@@ -174,10 +170,7 @@ test_that("read_csv_arrow() can detect compression from file name", {
tf <- tempfile(fileext = ".csv.gz")
on.exit(unlink(tf))
- write.csv(iris, gzfile(tf), row.names = FALSE, quote = FALSE)
-
- iris$Species <- as.character(iris$Species)
-
+ write.csv(tbl, gzfile(tf), row.names = FALSE, quote = FALSE)
tab1 <- read_csv_arrow(tf)
- expect_equivalent(iris, tab1)
+ expect_equivalent(tbl, tab1)
})
diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
index 3ae915e..7b4afda 100644
--- a/r/tests/testthat/test-dplyr.R
+++ b/r/tests/testthat/test-dplyr.R
@@ -32,7 +32,7 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star
expr,
rlang::new_data_mask(rlang::env(input = record_batch(tbl)))
)
- expect_equal(via_batch, expected, ...)
+ expect_equivalent(via_batch, expected, ...)
} else {
skip(skip_record_batch)
}
@@ -42,7 +42,7 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star
expr,
rlang::new_data_mask(rlang::env(input = Table$create(tbl)))
)
- expect_equal(via_table, expected, ...)
+ expect_equivalent(via_table, expected, ...)
} else {
skip(skip_table)
}
@@ -76,14 +76,7 @@ expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its star
)
}
-tbl <- tibble::tibble(
- int = 1:10,
- dbl = as.numeric(1:10),
- lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
- false = logical(10),
- chr = letters[1:10],
- fct = factor(letters[1:10])
-)
+tbl <- example_data
test_that("basic select/filter/collect", {
batch <- record_batch(tbl)
@@ -94,7 +87,7 @@ test_that("basic select/filter/collect", {
expect_is(b2, "arrow_dplyr_query")
t2 <- collect(b2)
- expect_equal(t2, tbl[tbl$int > 5, c("int", "chr")])
+ expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")])
# Test that the original object is not affected
expect_identical(collect(batch), tbl)
})
@@ -176,9 +169,10 @@ test_that("filter environment scope", {
# 'could not find function "isEqualTo"'
expect_dplyr_error(filter(batch, isEqualTo(int, 4)))
- isEqualTo <- function(x, y) x == y
+ isEqualTo <- function(x, y) x == y & !is.na(x)
expect_dplyr_equal(
input %>%
+ select(-fct) %>% # factor levels aren't identical
filter(isEqualTo(int, 4)) %>%
collect(),
tbl
diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R
index aa7627b..8f2cc7c 100644
--- a/r/tests/testthat/test-type.R
+++ b/r/tests/testthat/test-type.R
@@ -34,7 +34,7 @@ test_that("type() infers from R type", {
expect_type_equal(type(raw()), int8())
expect_type_equal(type(""), utf8())
expect_type_equal(
- type(iris$Species),
+ type(example_data$fct),
dictionary(int8(), utf8(), FALSE)
)
expect_type_equal(