TPDeramus commented on issue #43747:
URL: https://github.com/apache/arrow/issues/43747#issuecomment-2297113011

   Update:
   
   It appears this one didn't run either:
   ```
   FilteredOutput <- LargeDataset |> 
   select(ID, BeginDate, EndDate, class, identity_string, database) |> 
   distinct() |> 
   mutate(Foundate = as.Date("1700-01-01"), Logflag = case_when((((class == "A" 
& arrow_is_in(identity_string,options = list(value_set = Array$create(HardA), 
skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) | 
(class == "B" & arrow_is_in(identity_string, options = list(value_set = 
Array$create(HardB),skip_nulls = TRUE))) | (class == "C" & 
arrow_is_in(identity_string, options = list(value_set = 
Array$create(HardC),skip_nulls = TRUE)))) & database == "secondary") ~ 1, 
database == "primary" ~ 2, .default = 0)) |> filter(Logflag > 0) |> 
mutate(Logdate = if_else(Logflag == 2 ~ (as.numeric(BeginDate)+30), 
as.numeric(EndDate)))
   Error: Expression if_else(Logflag == 2 ~ (as.numeric(BeginDate) + 30), 
as.numeric(EndDate)) not supported in Arrow
   Call collect() first to pull data into R.
   ```
   
   In this case, I deliberately cast the dates to numeric values inside the 
`arrow` query, planning to convert them back to dates later, outside of 
`arrow`, once I had flagged which rows need to be filtered. However, I ran into 
further issues when I tried to trim down the size of the dataset. Specifically, 
every time I tried to either re-cast the date-to-numeric columns or perform 
further operations on them, the query failed to run:
   
   ```
   #Works
   Filtertab <- LargeDataset |> 
   select(ID, BeginDate, EndDate, class, identity_string, database) |> 
   distinct() |> 
   mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), 
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = 
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & 
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( 
identity_string, options = list(value_set = Array$create(HardB),skip_nulls = 
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set 
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, 
database == "secondary" ~ (BeginDate+30), .default = NA))
   
   
   #Fails
   Filtertab <- LargeDataset |> 
   select(ID, BeginDate, EndDate, class, identity_string, database) |> 
   distinct() |> 
   mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), 
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = 
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & 
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( 
identity_string, options = list(value_set = Array$create(HardB),skip_nulls = 
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set 
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, 
database == "secondary" ~ (BeginDate+30), .default = NA)) |>
   mutate(BeginDate = as.date(BeginDate), EndDate = as.date(EndDate), Logdate = 
as.date(Logdate))
   ℹ In index: 2.
   ℹ With name: BeginDate.
   Caused by error:
   ! NotImplemented: Unsupported cast from date32[day] to double using function 
cast_double
   Run `rlang::last_trace()` to see where the error occurred.
   
   #Fails
   Filtertab <- LargeDataset |> 
   select(ID, BeginDate, EndDate, class, identity_string, database) |> 
   distinct() |> 
   mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate), 
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options = 
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & 
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( 
identity_string, options = list(value_set = Array$create(HardB),skip_nulls = 
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set 
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate, 
database == "secondary" ~ (BeginDate+30), .default = NA)) |>
   filter((BeginDate <= EndDate) & (EndDate <= Logdate)) |>
   group_by(ID,BeginDate) |>
   summarize(Logdate = max(Logdate))
   ℹ In index: 2.
   ℹ With name: BeginDate.
   Caused by error:
   ! NotImplemented: Unsupported cast from date32[day] to double using function 
cast_double
   Run `rlang::last_trace()` to see where the error occurred.
        
   #Fails
   Filtertab <- LargeDataset |> 
   select(ID, BeginDate, EndDate, class, identity_string, database) |> 
   distinct() |> 
   mutate(DDt = as.numeric(BeginDate), BDate = as.numeric(EndDate), Logdate = 
case_when((((class == "A" & arrow_is_in(identity_string,options = 
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" & 
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in( 
identity_string, options = list(value_set = Array$create(HardB),skip_nulls = 
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set 
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ BDate, 
database == "secondary" ~ (DDt+30), .default = NA)) |>
   filter((DDt <= BDate) & (BDate <= Logdate)) |>
   group_by(ID,BeginDate) |>
   summarize(Logdate = max(Logdate))
   Error in `map()`:
   ℹ In index: 7.
   ℹ With name: DDt.
   Caused by error:
   ! NotImplemented: Unsupported cast from date32[day] to double using function 
cast_double
   Run `rlang::last_trace()` to see where the error occurred.
   ```
   
   Is producing a date-typed output column a problem in `arrow` queries in 
general?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to