thisisnic commented on issue #34519:
URL: https://github.com/apache/arrow/issues/34519#issuecomment-1469802891

   A little more investigation of the specific circumstances in which the 
problem (NA values in the replaced column) does and does not occur:
   
   ``` r
   library(arrow)
   library(dplyr)
   
   # no problem when replacing column with self when there is just 1 column
   df <- tibble::tibble(x = 1:10) 
   tf <- tempfile()
   dir.create(tf)
   write_dataset(df, tf)
   
   open_dataset(tf) %>%
     mutate(x = as.numeric(x)) %>%
     collect()
   #> # A tibble: 10 × 1
   #>        x
   #>    <dbl>
   #>  1     1
   #>  2     2
   #>  3     3
   #>  4     4
   #>  5     5
   #>  6     6
   #>  7     7
   #>  8     8
   #>  9     9
   #> 10    10
   
   # replacing column with self yields NA values when there are 2 columns
   df <- tibble::tibble(x = 1:10, y = 1:10) 
   tf <- tempfile()
   dir.create(tf)
   
   write_dataset(df, tf)
   
   open_dataset(tf) %>%
     mutate(x = as.numeric(x)) %>%
     collect()
   #> # A tibble: 10 × 2
   #>        x     y
   #>    <dbl> <int>
   #>  1    NA     1
   #>  2    NA     2
   #>  3    NA     3
   #>  4    NA     4
   #>  5    NA     5
   #>  6    NA     6
   #>  7    NA     7
   #>  8    NA     8
   #>  9    NA     9
   #> 10    NA    10
   
   # works fine if we're creating a brand new column
   open_dataset(tf) %>%
     mutate(z = as.numeric(x)) %>%
     collect()
   #> # A tibble: 10 × 3
   #>        x     y     z
   #>    <int> <int> <dbl>
   #>  1     1     1     1
   #>  2     2     2     2
   #>  3     3     3     3
   #>  4     4     4     4
   #>  5     5     5     5
   #>  6     6     6     6
   #>  7     7     7     7
   #>  8     8     8     8
   #>  9     9     9     9
   #> 10    10    10    10
   
   # works fine if we're replacing a different column
   open_dataset(tf) %>%
     mutate(y = as.numeric(x)) %>%
     collect()
   #> # A tibble: 10 × 2
   #>        x     y
   #>    <int> <dbl>
   #>  1     1     1
   #>  2     2     2
   #>  3     3     3
   #>  4     4     4
   #>  5     5     5
   #>  6     6     6
   #>  7     7     7
   #>  8     8     8
   #>  9     9     9
   #> 10    10    10
   
   # works fine with in-memory datasets when replacing existing columns
   InMemoryDataset$create(df) %>%
     mutate(x = as.numeric(x)) %>%
     collect()
   #> # A tibble: 10 × 2
   #>        x     y
   #>    <dbl> <int>
   #>  1     1     1
   #>  2     2     2
   #>  3     3     3
   #>  4     4     4
   #>  5     5     5
   #>  6     6     6
   #>  7     7     7
   #>  8     8     8
   #>  9     9     9
   #> 10    10    10
   ```
   
   Given that this works with 11.0.0.3 but not with the dev version of the R 
package, and there are very few R code changes since 11.0.0.3, I'm inclined to 
think that this could be something happening at the C++ level.  I'll try to 
narrow it down to the PR that introduced this change.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to