TPDeramus commented on issue #43747:
URL: https://github.com/apache/arrow/issues/43747#issuecomment-2297113011
Update:
It appears this one didn't run either:
```
FilteredOutput <- LargeDataset |>
select(ID, BeginDate, EndDate, class, identity_string, database) |>
distinct() |>
mutate(Foundate = as.Date("1700-01-01"), Logflag = case_when((((class == "A"
& arrow_is_in(identity_string,options = list(value_set = Array$create(HardA),
skip_nulls = TRUE))) | (class == "A" & str_starts(identity_string, WildA)) |
(class == "B" & arrow_is_in(identity_string, options = list(value_set =
Array$create(HardB),skip_nulls = TRUE))) | (class == "C" &
arrow_is_in(identity_string, options = list(value_set =
Array$create(HardC),skip_nulls = TRUE)))) & database == "secondary") ~ 1,
database == "primary" ~ 2, .default = 0)) |> filter(Logflag > 0) |>
mutate(Logdate = if_else(Logflag == 2 ~ (as.numeric(BeginDate)+30),
as.numeric(EndDate)))
Error: Expression if_else(Logflag == 2 ~ (as.numeric(BeginDate) + 30),
as.numeric(EndDate)) not supported in Arrow
Call collect() first to pull data into R.
```
In this case, I deliberately cast the dates to numeric values, intending to
convert them back to dates outside of `arrow` once I had flagged what needs to
be filtered. However, I ran into further issues when I tried to trim down the
size of the dataset: every time I tried to either re-cast the date-to-numeric
transformed data or pull it to do operations on it, the pipeline failed to
run:
```
#Works
Filtertab <- LargeDataset |>
select(ID, BeginDate, EndDate, class, identity_string, database) |>
distinct() |>
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate),
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options =
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" &
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(
identity_string, options = list(value_set = Array$create(HardB),skip_nulls =
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate,
database == "secondary" ~ (BeginDate+30), .default = NA))
#Fails
Filtertab <- LargeDataset |>
select(ID, BeginDate, EndDate, class, identity_string, database) |>
distinct() |>
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate),
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options =
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" &
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(
identity_string, options = list(value_set = Array$create(HardB),skip_nulls =
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate,
database == "secondary" ~ (BeginDate+30), .default = NA)) |>
mutate(BeginDate = as.date(BeginDate), EndDate = as.date(EndDate), Logdate =
as.date(Logdate))
ℹ In index: 2.
ℹ With name: BeginDate.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function
cast_double
Run `rlang::last_trace()` to see where the error occurred.
#Fails
Filtertab <- LargeDataset |>
select(ID, BeginDate, EndDate, class, identity_string, database) |>
distinct() |>
mutate(BeginDate = as.numeric(BeginDate), EndDate = as.numeric(EndDate),
Logdate = case_when((((class == "A" & arrow_is_in(identity_string,options =
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" &
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(
identity_string, options = list(value_set = Array$create(HardB),skip_nulls =
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ EndDate,
database == "secondary" ~ (BeginDate+30), .default = NA)) |>
filter((BeginDate <= EndDate) & (EndDate <= Logdate)) |>
group_by(ID,BeginDate) |>
summarize(Logdate = max(Logdate))
ℹ In index: 2.
ℹ With name: BeginDate.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function
cast_double
Run `rlang::last_trace()` to see where the error occurred.
#Fails
Filtertab <- LargeDataset |>
select(ID, BeginDate, EndDate, class, identity_string, database) |>
distinct() |>
mutate(DDt = as.numeric(BeginDate), BDate = as.numeric(EndDate), Logdate =
case_when((((class == "A" & arrow_is_in(identity_string,options =
list(value_set = Array$create(HardA), skip_nulls = TRUE))) | (class == "A" &
str_starts(identity_string, WildA)) | (class == "B" & arrow_is_in(
identity_string, options = list(value_set = Array$create(HardB),skip_nulls =
TRUE))) | (class == "C" & arrow_is_in(identity_string, options = list(value_set
= Array$create(HardC),skip_nulls = TRUE)))) & database == "primary") ~ BDate,
database == "secondary" ~ (DDt+30), .default = NA)) |>
filter((DDt <= BDate) & (BDate <= Logdate)) |>
group_by(ID,BeginDate) |>
summarize(Logdate = max(Logdate))
Error in `map()`:
ℹ In index: 7.
ℹ With name: DDt.
Caused by error:
! NotImplemented: Unsupported cast from date32[day] to double using function
cast_double
Run `rlang::last_trace()` to see where the error occurred.
```
Is assigning an output as a date an issue in general?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]