kevinpemonon commented on issue #36161:
URL: https://github.com/apache/arrow/issues/36161#issuecomment-1628619698
Hello,
After trying on Linux (Debian), I also see memory that is not released after
calling gc() following operations that use the arrow library, judging by the
rsession process shown in the output of the top command.
Below are the different captures:
1. The script used, with its output:
```
> gc(verbose = TRUE)
Garbage collection 5 = 3+0+2 (level 2) ...
26.3 Mbytes of cons cells used (46%)
6.1 Mbytes of vectors used (9%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 490588 26.3 1068414 57.1 666531 35.6
Vcells 792200 6.1 8388608 64.0 1820111 13.9
>
> # basic memory
> # memory.size(max=F)
> library(pryr)
> mem_used()
45 MB
>
> library(arrow, warn.conflicts = FALSE)
Some features are not enabled in this build of Arrow. Run `arrow_info()` for
more information.
>
> # Memory after loading the arrow library with memory.size
> # memory.size(max=F)
> mem_used()
57.7 MB
>
> # bytes_allocated after loading the arrow library
> # default_memory_pool()$bytes_allocated
>
> # max_memory after loading the arrow library
> # default_memory_pool()$max_memory
>
> library(dplyr)
Attachement du package : ‘dplyr’
L'objet suivant est masqué depuis ‘package:pryr’:
where
Les objets suivants sont masqués depuis ‘package:stats’:
filter, lag
Les objets suivants sont masqués depuis ‘package:base’:
intersect, setdiff, setequal, union
>
> # Memory after loading the dplyr library with memory.size
> # memory.size(max=F)
> mem_used()
62.8 MB
>
> # bytes_allocated after loading the dplyr library
> # default_memory_pool()$bytes_allocated
>
> # max_memory after loading the dplyr library
> # default_memory_pool()$max_memory
>
> df <- data.frame(
+ col1 = rnorm(1000000),
+ col2 = rnorm(1000000),
+ col3 = runif(1000000),
+ col4 = sample(1:999, size = 1000000, replace = T),
+ col5 = sample(c("GroupA", "GroupB"), size = 1000000, replace = T),
+ col6 = sample(c("TypeA", "TypeB"), size = 1000000, replace = T)
+ )
>
> # Memory after df object creation
> # memory.size(max=F)
> mem_used()
107 MB
>
> # bytes_allocated after df object creation
> # default_memory_pool()$bytes_allocated
>
> # max_memory after df object creation
> # default_memory_pool()$max_memory
>
> arrow::write_dataset(
+ df,
+ "~/test",
+ format = "parquet"
+ )
>
> # Memory after writing to disk
> # memory.size(max=F)
> mem_used()
119 MB
>
> # bytes_allocated after writing to disk
> # default_memory_pool()$bytes_allocated
>
> # max_memory after writing to disk
> # default_memory_pool()$max_memory
>
> rm(df)
>
> # Memory after deletion df
> # memory.size(max=F)
> mem_used()
74.6 MB
>
> # bytes_allocated after deletion df
> # default_memory_pool()$bytes_allocated
>
> # max_memory after deletion df
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 19 = 10+0+9 (level 2) ...
57.9 Mbytes of cons cells used (63%)
13.3 Mbytes of vectors used (17%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1083548 57.9 1707185 91.2 1707185 91.2
Vcells 1739307 13.3 10076106 76.9 8500943 64.9
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
74.6 MB
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 21 = 10+0+11 (level 2) ...
57.9 Mbytes of cons cells used (63%)
13.3 Mbytes of vectors used (17%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1083548 57.9 1707185 91.2 1707185 91.2
Vcells 1739307 13.3 10076106 76.9 8500943 64.9
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
74.6 MB
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
>
> ds <- arrow::open_dataset("~/test")
>
> # Memory after ds creation
> # memory.size(max=F)
> mem_used()
79.2 MB
>
> # bytes_allocated after ds creation
> # default_memory_pool()$bytes_allocated
>
> # max_memory after ds creation
> # default_memory_pool()$max_memory
>
> req <-
+ ds %>%
+ collect()
>
> # Memory after req creation
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after req creation
> # default_memory_pool()$bytes_allocated
>
> # max_memory after req creation
> # default_memory_pool()$max_memory
>
> rm(req)
>
> # Memory after deletion req
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after deletion req
> # default_memory_pool()$bytes_allocated
>
> # max_memory after deletion req
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 26 = 10+0+16 (level 2) ...
62.5 Mbytes of cons cells used (56%)
14.5 Mbytes of vectors used (19%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1168741 62.5 2088622 111.6 1707185 91.2
Vcells 1891012 14.5 10076106 76.9 8500943 64.9
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 28 = 10+0+18 (level 2) ...
62.5 Mbytes of cons cells used (56%)
14.5 Mbytes of vectors used (19%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1168741 62.5 2088622 111.6 1707185 91.2
Vcells 1891012 14.5 10076106 76.9 8500943 64.9
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> rm(ds)
>
> # Memory after deletion ds
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after deletion ds
> # default_memory_pool()$bytes_allocated
>
> # max_memory after deletion ds
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 31 = 10+0+21 (level 2) ...
62.5 Mbytes of cons cells used (56%)
14.5 Mbytes of vectors used (19%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1168648 62.5 2088622 111.6 1707185 91.2
Vcells 1890971 14.5 10076106 76.9 8500943 64.9
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
>
> gc(verbose = TRUE)
Garbage collection 33 = 10+0+23 (level 2) ...
62.5 Mbytes of cons cells used (56%)
14.5 Mbytes of vectors used (19%)
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1168648 62.5 2088622 111.6 1707185 91.2
Vcells 1890971 14.5 10076106 76.9 8500943 64.9
>
> # Memory after gc(verbose = TRUE)
> # memory.size(max=F)
> mem_used()
80.6 MB
>
> # bytes_allocated after gc(verbose = TRUE)
> # default_memory_pool()$bytes_allocated
>
> # max_memory after gc(verbose = TRUE)
> # default_memory_pool()$max_memory
```
2. top (before running the script) => %MEM of rsession = 1.8

3. top (after running the script with rm() and gc()) => %MEM of rsession = 5.2

4. free -h (before running the script) => mem used: 2.2Gi

5. free -h (after running the script with rm() and gc()) => mem used: 2.4Gi

Do you think this problem could be linked to the arrow library or the R
language?
Best regards,
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]