[
https://issues.apache.org/jira/browse/ARROW-17002?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Adam Black updated ARROW-17002:
-------------------------------
Description:
I think that dplyr queries on FileSystemDataset objects will create locks that
persist unnecessarily. This issue only seems to occur on Windows. I'm using
Windows 10. Calling the garbage collector after the dplyr query seems to
release the lock.
{code:r}
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# I can delete an arrow dataset that has been opened
write_dataset(iris, "iris")
ds <- open_dataset("iris")
file.exists("iris")
#> [1] TRUE
print(unlink("iris", recursive = T))
#> [1] 0
file.exists("iris")
#> [1] FALSE
# However if I run a dplyr query on the data before deleting it the file is
# locked.
write_dataset(iris, "iris")
ds <- open_dataset("iris")
file.exists("iris")
#> [1] TRUE
# I think this adds a lock that is not removed
ds %>% count() %>% collect()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 150
print(unlink("iris", recursive = T))
#> [1] 1
file.exists("iris")
#> [1] TRUE
print(unlink("iris", recursive = T, force = T))
#> [1] 1
file.exists("iris")
#> [1] TRUE
file.remove("iris/part-0.parquet")
#> Warning in file.remove("iris/part-0.parquet"): cannot remove file 'iris/
#> part-0.parquet', reason 'Permission denied'
#> [1] FALSE
# running gc() will clean up the lock and allow the file to be deleted
gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 1178845 63 2349652 125.5 1656436 88.5
#> Vcells 2093715 16 8388608 64.0 3170844 24.2
print(unlink("iris", recursive = T))
#> [1] 0
file.exists("iris")
#> [1] FALSE
sessioninfo::session_info()
#> - Session info ---------------------------------------------------------------
#> setting value
#> version R version 4.0.5 (2021-03-31)
#> os Windows 10 x64
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.1252
#> ctype English_United States.1252
#> tz America/New_York
#> date 2022-07-07
#>
#> - Packages -------------------------------------------------------------------
#> package * version date lib source
#> arrow * 8.0.0 2022-05-09 [1] CRAN (R 4.0.5)
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.5)
#> backports 1.4.0 2021-11-23 [1] CRAN (R 4.0.5)
#> bit 4.0.4 2020-08-04 [1] CRAN (R 4.0.5)
#> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.0.5)
#> cli 3.0.1 2021-07-17 [1] CRAN (R 4.0.5)
#> crayon 1.5.1 2022-03-26 [1] CRAN (R 4.0.5)
#> DBI 1.1.2 2021-12-20 [1] CRAN (R 4.0.5)
#> digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.5)
#> dplyr * 1.0.8 2022-02-08 [1] CRAN (R 4.0.5)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.0.5)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.5)
#> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.0.5)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.0.5)
#> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.5)
#> generics 0.1.2 2022-01-31 [1] CRAN (R 4.0.5)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.5)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.0.5)
#> htmltools 0.5.2 2021-08-25 [1] CRAN (R 4.0.5)
#> knitr 1.36 2021-09-29 [1] CRAN (R 4.0.5)
#> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.0.5)
#> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.5)
#> pillar 1.7.0 2022-02-01 [1] CRAN (R 4.0.5)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.5)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.0.5)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.0.5)
#> reprex 2.0.1 2021-08-05 [1] CRAN (R 4.0.5)
#> rlang 1.0.2 2022-03-04 [1] CRAN (R 4.0.5)
#> rmarkdown 2.10 2021-08-06 [1] CRAN (R 4.0.5)
#> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.5)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.5)
#> stringi 1.7.5 2021-10-04 [1] CRAN (R 4.0.5)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.5)
#> styler 1.5.1 2021-07-13 [1] CRAN (R 4.0.5)
#> tibble 3.1.2 2021-05-16 [1] CRAN (R 4.0.5)
#> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.0.5)
#> tzdb 0.2.0 2021-10-27 [1] CRAN (R 4.0.5)
#> utf8 1.2.1 2021-03-12 [1] CRAN (R 4.0.5)
#> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.0.5)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.0.5)
#> xfun 0.25 2021-08-06 [1] CRAN (R 4.0.5)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.5)
#>
#> [1] C:/Users/adam.DESKTOP-D3KQQA1/Documents/R/win-library/4.0
#> [2] C:/Program Files/R/R-4.0.5/library
{code}
was:
I think that dplyr queries on FileSystemDataset objects will create locks that
persist unnecessarily. This issue only seems to occur on Windows. I'm using
Windows 10. Calling the garbage collector after the dplyr query seems to
release the lock.
``` r
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# I can delete an arrow dataset that has been opened
write_dataset(iris, "iris")
ds <- open_dataset("iris")
file.exists("iris")
#> [1] TRUE
print(unlink("iris", recursive = T))
#> [1] 0
file.exists("iris")
#> [1] FALSE
# However if I run a dplyr query on the data before deleting it the file is
# locked.
write_dataset(iris, "iris")
ds <- open_dataset("iris")
file.exists("iris")
#> [1] TRUE
# I think this adds a lock that is not automatically removed (on Windows)
ds %>% count() %>% collect()
#> # A tibble: 1 x 1
#> n
#> <int>
#> 1 150
print(unlink("iris", recursive = T))
#> [1] 1
file.exists("iris")
#> [1] TRUE
print(unlink("iris", recursive = T, force = T))
#> [1] 1
file.exists("iris")
#> [1] TRUE
file.remove("iris/part-0.parquet")
#> Warning in file.remove("iris/part-0.parquet"): cannot remove file 'iris/
#> part-0.parquet', reason 'Permission denied'
#> [1] FALSE
# running gc() will clean up the lock and allow the file to be deleted
gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 1179433 63 2354975 125.8 1664192 88.9
#> Vcells 2095138 16 8388608 64.0 3175226 24.3
print(unlink("iris", recursive = T))
#> [1] 0
file.exists("iris")
#> [1] FALSE
```
<sup>Created on 2022-07-07 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
<details style="margin-bottom:10px;">
<summary>
Session info
</summary>
``` r
sessioninfo::session_info()
#> - Session info ---------------------------------------------------------------
#> setting value
#> version R version 4.0.5 (2021-03-31)
#> os Windows 10 x64
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.1252
#> ctype English_United States.1252
#> tz America/New_York
#> date 2022-07-07
#>
#> - Packages -------------------------------------------------------------------
#> package * version date lib source
#> arrow * 8.0.0 2022-05-09 [1] CRAN (R 4.0.5)
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.5)
#> backports 1.4.0 2021-11-23 [1] CRAN (R 4.0.5)
#> bit 4.0.4 2020-08-04 [1] CRAN (R 4.0.5)
#> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.0.5)
#> cli 3.0.1 2021-07-17 [1] CRAN (R 4.0.5)
#> crayon 1.5.1 2022-03-26 [1] CRAN (R 4.0.5)
#> DBI 1.1.2 2021-12-20 [1] CRAN (R 4.0.5)
#> digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.5)
#> dplyr * 1.0.8 2022-02-08 [1] CRAN (R 4.0.5)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.0.5)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.5)
#> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.0.5)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.0.5)
#> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.5)
#> generics 0.1.2 2022-01-31 [1] CRAN (R 4.0.5)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.5)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.0.5)
#> htmltools 0.5.2 2021-08-25 [1] CRAN (R 4.0.5)
#> knitr 1.36 2021-09-29 [1] CRAN (R 4.0.5)
#> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.0.5)
#> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.5)
#> pillar 1.7.0 2022-02-01 [1] CRAN (R 4.0.5)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.5)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.0.5)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.0.5)
#> reprex 2.0.1 2021-08-05 [1] CRAN (R 4.0.5)
#> rlang 1.0.2 2022-03-04 [1] CRAN (R 4.0.5)
#> rmarkdown 2.10 2021-08-06 [1] CRAN (R 4.0.5)
#> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.5)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.5)
#> stringi 1.7.5 2021-10-04 [1] CRAN (R 4.0.5)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.5)
#> styler 1.5.1 2021-07-13 [1] CRAN (R 4.0.5)
#> tibble 3.1.2 2021-05-16 [1] CRAN (R 4.0.5)
#> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.0.5)
#> tzdb 0.2.0 2021-10-27 [1] CRAN (R 4.0.5)
#> utf8 1.2.1 2021-03-12 [1] CRAN (R 4.0.5)
#> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.0.5)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.0.5)
#> xfun 0.25 2021-08-06 [1] CRAN (R 4.0.5)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.5)
#>
#> [1] C:/Users/adam.DESKTOP-D3KQQA1/Documents/R/win-library/4.0
#> [2] C:/Program Files/R/R-4.0.5/library
```
</details>
> R dplyr queries create locks on FileSystemDataset files
> -------------------------------------------------------
>
> Key: ARROW-17002
> URL: https://issues.apache.org/jira/browse/ARROW-17002
> Project: Apache Arrow
> Issue Type: Bug
> Components: R
> Affects Versions: 8.0.0
> Reporter: Adam Black
> Priority: Minor
>
> I think that dplyr queries on FileSystemDataset objects will create locks
> that persist unnecessarily. This issue only seems to occur on Windows. I'm
> using Windows 10. Calling the garbage collector after the dplyr query seems
> to release the lock.
> {code:r}
> library(arrow)
> #>
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #>
> #> timestamp
> library(dplyr)
> #>
> #> Attaching package: 'dplyr'
> #> The following objects are masked from 'package:stats':
> #>
> #> filter, lag
> #> The following objects are masked from 'package:base':
> #>
> #> intersect, setdiff, setequal, union
> # I can delete an arrow dataset that has been opened
> write_dataset(iris, "iris")
> ds <- open_dataset("iris")
> file.exists("iris")
> #> [1] TRUE
> print(unlink("iris", recursive = T))
> #> [1] 0
> file.exists("iris")
> #> [1] FALSE
> # However if I run a dplyr query on the data before deleting it the file is
> # locked.
> write_dataset(iris, "iris")
> ds <- open_dataset("iris")
> file.exists("iris")
> #> [1] TRUE
> # I think this adds a lock that is not removed
> ds %>% count() %>% collect()
> #> # A tibble: 1 x 1
> #> n
> #> <int>
> #> 1 150
> print(unlink("iris", recursive = T))
> #> [1] 1
> file.exists("iris")
> #> [1] TRUE
> print(unlink("iris", recursive = T, force = T))
> #> [1] 1
> file.exists("iris")
> #> [1] TRUE
> file.remove("iris/part-0.parquet")
> #> Warning in file.remove("iris/part-0.parquet"): cannot remove file 'iris/
> #> part-0.parquet', reason 'Permission denied'
> #> [1] FALSE
> # running gc() will clean up the lock and allow the file to be deleted
> gc()
> #> used (Mb) gc trigger (Mb) max used (Mb)
> #> Ncells 1178845 63 2349652 125.5 1656436 88.5
> #> Vcells 2093715 16 8388608 64.0 3170844 24.2
> print(unlink("iris", recursive = T))
> #> [1] 0
> file.exists("iris")
> #> [1] FALSE
> sessioninfo::session_info()
> #> - Session info ---------------------------------------------------------------
> #> setting value
> #> version R version 4.0.5 (2021-03-31)
> #> os Windows 10 x64
> #> system x86_64, mingw32
> #> ui RTerm
> #> language (EN)
> #> collate English_United States.1252
> #> ctype English_United States.1252
> #> tz America/New_York
> #> date 2022-07-07
> #>
> #> - Packages -------------------------------------------------------------------
> #> package * version date lib source
> #> arrow * 8.0.0 2022-05-09 [1] CRAN (R 4.0.5)
> #> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.5)
> #> backports 1.4.0 2021-11-23 [1] CRAN (R 4.0.5)
> #> bit 4.0.4 2020-08-04 [1] CRAN (R 4.0.5)
> #> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.0.5)
> #> cli 3.0.1 2021-07-17 [1] CRAN (R 4.0.5)
> #> crayon 1.5.1 2022-03-26 [1] CRAN (R 4.0.5)
> #> DBI 1.1.2 2021-12-20 [1] CRAN (R 4.0.5)
> #> digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.5)
> #> dplyr * 1.0.8 2022-02-08 [1] CRAN (R 4.0.5)
> #> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.0.5)
> #> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.5)
> #> fansi 0.5.0 2021-05-25 [1] CRAN (R 4.0.5)
> #> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.0.5)
> #> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.5)
> #> generics 0.1.2 2022-01-31 [1] CRAN (R 4.0.5)
> #> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.5)
> #> highr 0.9 2021-04-16 [1] CRAN (R 4.0.5)
> #> htmltools 0.5.2 2021-08-25 [1] CRAN (R 4.0.5)
> #> knitr 1.36 2021-09-29 [1] CRAN (R 4.0.5)
> #> lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.0.5)
> #> magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.5)
> #> pillar 1.7.0 2022-02-01 [1] CRAN (R 4.0.5)
> #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.5)
> #> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.0.5)
> #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.0.5)
> #> reprex 2.0.1 2021-08-05 [1] CRAN (R 4.0.5)
> #> rlang 1.0.2 2022-03-04 [1] CRAN (R 4.0.5)
> #> rmarkdown 2.10 2021-08-06 [1] CRAN (R 4.0.5)
> #> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.5)
> #> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.5)
> #> stringi 1.7.5 2021-10-04 [1] CRAN (R 4.0.5)
> #> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.5)
> #> styler 1.5.1 2021-07-13 [1] CRAN (R 4.0.5)
> #> tibble 3.1.2 2021-05-16 [1] CRAN (R 4.0.5)
> #> tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.0.5)
> #> tzdb 0.2.0 2021-10-27 [1] CRAN (R 4.0.5)
> #> utf8 1.2.1 2021-03-12 [1] CRAN (R 4.0.5)
> #> vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.0.5)
> #> withr 2.5.0 2022-03-03 [1] CRAN (R 4.0.5)
> #> xfun 0.25 2021-08-06 [1] CRAN (R 4.0.5)
> #> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.5)
> #>
> #> [1] C:/Users/adam.DESKTOP-D3KQQA1/Documents/R/win-library/4.0
> #> [2] C:/Program Files/R/R-4.0.5/library
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)