This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5a5f521461 ARROW-16803: [R][CI] Fix caching for R mingw build (#13379)
5a5f521461 is described below
commit 5a5f521461d8a7eeaac20058970e358c258b537b
Author: Jacob Wujciak-Jens <[email protected]>
AuthorDate: Thu Jun 16 15:45:01 2022 +0200
ARROW-16803: [R][CI] Fix caching for R mingw build (#13379)
The primary issue here was that a matrix build used the same cache keys on
all sub-builds which caused all but one cache-step to fail.
During my work on this, I noticed several `actions/cache` related issues:
- `actions/cache` is not updating the cache on primary key hit
https://github.com/actions/cache/issues/342
- I think this is one of the reasons for bad ccache hit-rate, I have
implemented a workaround that seems to work but has the downside of creating a
new cache.
- actions/cache also uses the path of the cached files when matching
existing caches https://github.com/actions/cache/issues/815
- This prevents re-use of R package dependencies (on windows only?). I
switched to `r-libs/actions/setup-r-dependecies` which is more concise and
***much*** faster (~ 2min vs. ~15+min) even without caching.
I have also increased the ccache maxsize to prevent/reduce the amount of
(slow) cleanup operations and increase cache hits and performance. This is a
change that will affect other workflows through `setup_ccache.sh`, which is why
I have marked this as a draft for now.
I also removed ccache from the windows R builds as we do not use it when
building the R package (we could enable that in the future)
Authored-by: Jacob Wujciak-Jens <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
.github/workflows/r.yml | 121 ++++++++++---------------------------------
ci/scripts/ccache_setup.sh | 2 +-
ci/scripts/r_test.sh | 2 +-
docker-compose.yml | 2 +-
r/R/arrow-tabular.R | 2 +-
r/R/dataset-format.R | 4 +-
r/R/dplyr-datetime-helpers.R | 2 +-
r/R/dplyr-mutate.R | 2 +-
8 files changed, 34 insertions(+), 103 deletions(-)
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 8de703b71a..48d9672c74 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -72,8 +72,12 @@ jobs:
uses: actions/cache@v2
with:
path: .docker
- key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{
hashFiles('cpp/**') }}
- restore-keys: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
+ # As this key is identical on both matrix builds only one will be
able to successfully cache,
+ # this is fine as there are no differences in the build
+ key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{
hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-${{ github.run_id }}
+ restore-keys: |
+ ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{
hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-
+ ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
- name: Check pkgdown reference sections
run: ci/scripts/r_pkgdown_check.sh
- name: Setup Python
@@ -126,12 +130,6 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 0
- - name: Cache Docker Volumes
- uses: actions/cache@v2
- with:
- path: .docker
- key: ${{ matrix.config.image }}-r-${{ hashFiles('cpp/**') }}
- restore-keys: ${{ matrix.config.image }}-r-
- name: Setup Python
uses: actions/setup-python@v1
with:
@@ -197,8 +195,10 @@ jobs:
uses: actions/cache@v2
with:
path: ccache
- key: r-${{ matrix.config.rtools }}-ccache-mingw-${{
hashFiles('cpp/**') }}
- restore-keys: r-${{ matrix.config.rtools }}-ccache-mingw-
+ key: r-${{ matrix.config.rtools }}-ccache-mingw-${{
matrix.config.arch }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-${{
github.run_id }}
+ restore-keys: |
+ r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch
}}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}-
+ r-${{ matrix.config.rtools }}-ccache-mingw-${{ matrix.config.arch
}}-
# We use the makepkg-mingw setup that is included in rtools40 even when
# we use the rtools35 compilers, so we always install R 4.0/Rtools40
- uses: r-lib/actions/setup-r@v2
@@ -257,28 +257,16 @@ jobs:
fetch-depth: 0
- run: mkdir r/windows
- name: Download artifacts
- if: ${{ matrix.config.rtools == 35 }}
- uses: actions/download-artifact@v2
- with:
- name: libarrow-rtools35-mingw32.zip
- path: r/windows
- - name: Download artifacts
- if: ${{ matrix.config.rtools == 35 }}
+ if: ${{ matrix.config.rtools != 42 }}
uses: actions/download-artifact@v2
with:
- name: libarrow-rtools35-mingw64.zip
+ name: libarrow-rtools${{ matrix.config.rtools }}-mingw32.zip
path: r/windows
- name: Download artifacts
- if: ${{ matrix.config.rtools == 40 }}
+ if: ${{ matrix.config.rtools !=42 }}
uses: actions/download-artifact@v2
with:
- name: libarrow-rtools40-mingw32.zip
- path: r/windows
- - name: Download artifacts
- if: ${{ matrix.config.rtools == 40 }}
- uses: actions/download-artifact@v2
- with:
- name: libarrow-rtools40-mingw64.zip
+ name: libarrow-rtools${{ matrix.config.rtools }}-mingw64.zip
path: r/windows
- name: Download artifacts
if: ${{ matrix.config.rtools == 42 }}
@@ -292,75 +280,23 @@ jobs:
cd r/windows
ls *.zip | xargs -n 1 unzip -uo
rm -rf *.zip
- - name: Setup ccache
- shell: bash
- run: |
- ci/scripts/ccache_setup.sh
- echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >>
$GITHUB_ENV
- # We must enable actions/cache before r-lib/actions/setup-r to ensure
- # using system tar instead of tar provided by Rtools.
- # We can use tar provided by Rtools when we drop support for Rtools 3.5.
- # Because Rtools 4.0 or later has zstd. actions/cache requires zstd
- # when tar is GNU tar.
- - name: Cache ccache
- uses: actions/cache@v2
- with:
- path: ccache
- key: r-${{ matrix.config.rtools }}-ccache-mingw-${{
hashFiles('cpp/**') }}
- restore-keys: r-${{ matrix.config.rtools }}-ccache-mingw-
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.rversion }}
rtools-version: ${{ matrix.config.rtools }}
+ # RSPM keeps install times short for 3.6
+ use-public-rspm: true
Ncpus: 2
- - name: Make R tests verbose
- # If you get a segfault/mysterious test Execution halted,
- # make this `true` to see where it dies.
- if: false
- shell: cmd
- run: |
- cd r/tests
- sed -i.bak -E -e 's/"arrow"/"arrow", reporter = "location"/'
testthat.R
- rm -f testthat.R.bak
- - name: Install cpp11 (on R 3.6)
- # Since we force installation of binary packages below, dependency
versions
- # are frozen for old versions of R. We need newer cpp11 than is
available as
- # "binary" (though it doesn't matter because the cpp11 R package is just
a
- # vehicle for the header-only C++ code.
- if: ${{ matrix.config.rtools == 35 }}
- shell: Rscript {0}
- run: install.packages("cpp11", type = "source")
- - name: Prune dependencies (on R 3.6)
- if: ${{ matrix.config.rtools == 35 }}
- shell: Rscript {0}
- run: |
- # To prevent the build from timing out, let's prune some optional
deps (and their possible version requirements)
- setwd("r")
- # create a backup to use later
- file.copy("DESCRIPTION", "DESCRIPTION.bak")
- d <- read.dcf("DESCRIPTION")
- to_prune <- c("duckdb", "DBI", "dbplyr", "decor", "knitr",
"rmarkdown", "pkgload", "reticulate")
- pattern <- paste0("\\n?", to_prune, "( \\([^,]*\\))?,?", collapse
= "|")
- d[,"Suggests"] <- gsub(pattern, "", d[,"Suggests"])
- write.dcf(d, "DESCRIPTION")
- - name: R package cache
- uses: actions/cache@v2
+ - uses: r-lib/actions/setup-r-dependencies@v2
+ env:
+ GITHUB_PAT: "${{ github.token }}"
with:
- path: |
- ${{ env.R_LIBS_USER }}/*
- key: r-${{ matrix.config.rtools }}-R-LIBS-${{
hashFiles('r/DESCRIPTION') }}
- restore-keys: r-${{ matrix.config.rtools }}-R-LIBS-
- - name: Install R package dependencies
- shell: Rscript {0}
- run: |
- # options(pkgType="win.binary") # processx doesn't have a binary for
UCRT yet
- install.packages(c("remotes", "rcmdcheck"))
- remotes::install_deps("r", dependencies = TRUE)
- - name: Restore DESCRIPTION for 3.6
- if: ${{ matrix.config.rtools == 35 }}
- run: |
- rm r/DESCRIPTION
- mv r/DESCRIPTION.bak r/DESCRIPTION
+ # For some arcane reason caching does not work on the windows runners
+ # most likely due to https://github.com/actions/cache/issues/815
+ cache: false
+ working-directory: 'r'
+ extra-packages: |
+ any::rcmdcheck
- name: Check
shell: Rscript {0}
run: |
@@ -391,7 +327,7 @@ jobs:
shell: Rscript {0}
working-directory: r
run: |
- remotes::install_github("jonkeane/lintr@arrow-branch")
+ pak::pak("lintr")
lintr::expect_lint_free()
- name: Dump install logs
shell: cmd
@@ -401,8 +337,3 @@ jobs:
shell: bash
run: find r/check -name 'testthat.Rout*' -exec cat '{}' \; || true
if: always()
- # We can remove this when we drop support for Rtools 3.5.
- - name: Ensure using system tar in actions/cache
- run: |
- Write-Output "${Env:windir}\System32" | `
- Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
diff --git a/ci/scripts/ccache_setup.sh b/ci/scripts/ccache_setup.sh
index f77fbb3736..6afcdda7d0 100755
--- a/ci/scripts/ccache_setup.sh
+++ b/ci/scripts/ccache_setup.sh
@@ -23,4 +23,4 @@ echo "ARROW_USE_CCACHE=ON" >> $GITHUB_ENV
echo "CCACHE_COMPILERCHECK=content" >> $GITHUB_ENV
echo "CCACHE_COMPRESS=1" >> $GITHUB_ENV
echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV
-echo "CCACHE_MAXSIZE=500M" >> $GITHUB_ENV
+echo "CCACHE_MAXSIZE=1G" >> $GITHUB_ENV
diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh
index 354655e1ad..8429187d88 100755
--- a/ci/scripts/r_test.sh
+++ b/ci/scripts/r_test.sh
@@ -33,7 +33,7 @@ printenv
# In the other CI checks the files are synced but ignored.
make sync-cpp
-if [ "$ARROW_R_FORCE_TESTS" = "true"]; then
+if [ "$ARROW_R_FORCE_TESTS" = "true" ]; then
export ARROW_R_DEV=TRUE
export NOT_CRAN=true
export ARROW_LARGE_MEMORY_TESTS=TRUE
diff --git a/docker-compose.yml b/docker-compose.yml
index 7a8ca192f9..6ae62a5ae4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -59,7 +59,7 @@ x-ccache: &ccache
CCACHE_COMPILERCHECK: content
CCACHE_COMPRESS: 1
CCACHE_COMPRESSLEVEL: 6
- CCACHE_MAXSIZE: 500M
+ CCACHE_MAXSIZE: 1G
CCACHE_DIR: /ccache
# CPU/memory limit presets to pass to Docker.
diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index 58a604ba61..ae68cc2118 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -161,7 +161,7 @@ as.data.frame.ArrowTabular <- function(x, row.names = NULL,
optional = FALSE, ..
#' @export
`[[<-.ArrowTabular` <- function(x, i, value) {
- if (!is.character(i) & !is.numeric(i)) {
+ if (!is.character(i) && !is.numeric(i)) {
stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
}
assert_that(length(i) == 1, !is.na(i))
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index acc1a41b02..948abf2829 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -132,7 +132,7 @@ CsvFileFormat$create <- function(...,
column_names <- read_options$column_names
schema_names <- names(schema)
- if (!is.null(schema) & !identical(schema_names, column_names)) {
+ if (!is.null(schema) && !identical(schema_names, column_names)) {
missing_from_schema <- setdiff(column_names, schema_names)
missing_from_colnames <- setdiff(schema_names, column_names)
message_colnames <- NULL
@@ -153,7 +153,7 @@ CsvFileFormat$create <- function(...,
)
}
- if (length(missing_from_schema) == 0 & length(missing_from_colnames) == 0)
{
+ if (length(missing_from_schema) == 0 && length(missing_from_colnames) ==
0) {
message_order <- "`column_names` and `schema` field names match but are
not in the same order"
}
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 607104d7ce..bc1a13075d 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -16,7 +16,7 @@
# under the License.
check_time_locale <- function(locale = Sys.getlocale("LC_TIME")) {
- if (tolower(Sys.info()[["sysname"]]) == "windows" & locale != "C") {
+ if (tolower(Sys.info()[["sysname"]]) == "windows" && locale != "C") {
# MingW C++ std::locale only supports "C" and "POSIX"
stop(paste0(
"On Windows, time locales other than 'C' are not supported in Arrow. ",
diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R
index 07802f8c83..653c1e6f25 100644
--- a/r/R/dplyr-mutate.R
+++ b/r/R/dplyr-mutate.R
@@ -117,7 +117,7 @@ transmute.arrow_dplyr_query <- function(.data, ...) {
dots <- check_transmute_args(...)
has_null <- map_lgl(dots, quo_is_null)
.data <- dplyr::mutate(.data, !!!dots, .keep = "none")
- if (is_empty(dots) | any(has_null)) {
+ if (is_empty(dots) || any(has_null)) {
return(.data)
}