[
https://issues.apache.org/jira/browse/ARROW-15599?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17488436#comment-17488436
]
Nicola Crane edited comment on ARROW-15599 at 2/8/22, 2:00 PM:
---------------------------------------------------------------
Thanks for reporting this!
Here's a reprex with more verbose output.
{code:r}
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
tf <- tempfile()
write.csv(data.frame(x = '2018-10-07 19:04:05.005'), tf, row.names = FALSE)
# successfully read in file
read_csv_arrow(tf, as_data_frame = TRUE)
#> # A tibble: 1 × 1
#> x
#> <dttm>
#> 1 2018-10-07 20:04:05
# the unit here is seconds - doesn't work
read_csv_arrow(
tf,
col_names = "x",
col_types = "T",
skip = 1
)
#> Error in `handle_csv_read_error()`:
#> ! Invalid: In CSV column #0: CSV conversion error to timestamp[s]: invalid
value '2018-10-07 19:04:05.005'
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:550 decoder_.Decode(data,
size, quoted, &value)
#> /home/nic2/arrow/cpp/src/arrow/csv/parser.h:123 status
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:554
parser.VisitColumn(col_index, visit)
# the unit here is ms - doesn't work
read_csv_arrow(
tf,
col_names = "x",
col_types = "t",
skip = 1
)
#> Error in `handle_csv_read_error()`:
#> ! Invalid: In CSV column #0: CSV conversion error to time32[ms]: invalid
value '2018-10-07 19:04:05.005'
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:550 decoder_.Decode(data,
size, quoted, &value)
#> /home/nic2/arrow/cpp/src/arrow/csv/parser.h:123 status
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:554
parser.VisitColumn(col_index, visit)
# the unit here is inferred as ns - does work!
read_csv_arrow(
tf,
col_names = "x",
col_types = "?",
skip = 1,
as_data_frame = FALSE
)
#> Table
#> 1 rows x 1 columns
#> $x <timestamp[ns]>
{code}
It looks like what's happening here is that the {{col_types}} compact
representations are mapped to a timestamp with units in seconds ("T") or a
time32 with units in milliseconds ("t"), but the data is actually read with
nanosecond precision when the column type is inferred.
You could get round this for now by specifying a schema for the names and
column types instead of the {{readr}} shortcodes:
{code:r}
read_csv_arrow(
tf,
schema = schema(x = timestamp(unit = "us")),
skip = 1
)
{code}
That said, this is something we should either fix or document.
was (Author: thisisnic):
Here's a reprex with more verbose output.
{code:r}
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
tf <- tempfile()
write.csv(data.frame(x = '2018-10-07 19:04:05.005'), tf, row.names = FALSE)
# successfully read in file
read_csv_arrow(tf, as_data_frame = FALSE)
#> Table
#> 1 rows x 1 columns
#> $x <timestamp[ns]>
# successfully read in with col_names and col_types specified
read_csv_arrow(
tf,
col_names = "x",
col_types = "?",
skip = 1,
as_data_frame = FALSE
)
#> Table
#> 1 rows x 1 columns
#> $x <timestamp[ns]>
read_csv_arrow(
tf,
col_names = "x",
col_types = "T",
skip = 1,
as_data_frame = FALSE
)
#> Error: Invalid: In CSV column #0: CSV conversion error to timestamp[s]:
invalid value '2018-10-07 19:04:05.005'
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:550 decoder_.Decode(data,
size, quoted, &value)
#> /home/nic2/arrow/cpp/src/arrow/csv/parser.h:123 status
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:554
parser.VisitColumn(col_index, visit)
read_csv_arrow(
tf,
col_names = "x",
col_types = "T",
as_data_frame = FALSE,
skip = 1,
timestamp_parsers = "%Y-%m-%d %H:%M:%S"
)
#> Error: Invalid: In CSV column #0: CSV conversion error to timestamp[s]:
invalid value '2018-10-07 19:04:05.005'
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:550 decoder_.Decode(data,
size, quoted, &value)
#> /home/nic2/arrow/cpp/src/arrow/csv/parser.h:123 status
#> /home/nic2/arrow/cpp/src/arrow/csv/converter.cc:554
parser.VisitColumn(col_index, visit)
{code}
> [R] can't explicitly convert a column as a sub-seconds timestamp from CSV (or
> other delimited) file
> ---------------------------------------------------------------------------------------------------
>
> Key: ARROW-15599
> URL: https://issues.apache.org/jira/browse/ARROW-15599
> Project: Apache Arrow
> Issue Type: Bug
> Affects Versions: 6.0.1
> Environment: R version 4.1.2 (2021-11-01)
> Platform: x86_64-pc-linux-gnu (64-bit)
> Running under: Ubuntu 20.04.3 LTS
> Reporter: SHIMA Tatsuya
> Priority: Major
>
> I tried to read the csv column type as timestamp, but I could only get it to
> work well when `col_types` was not specified.
> I'm sorry if I missed something and this is the expected behavior. (It would
> be great if you could add an example with `col_types` in the documentation.)
> {code:r}
> library(arrow)
> #>
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #>
> #> timestamp
> t_string <- tibble::tibble(
> x = "2018-10-07 19:04:05.005"
> )
> write_csv_arrow(t_string, "tmp.csv")
> read_csv_arrow(
> "tmp.csv",
> as_data_frame = FALSE
> )
> #> Table
> #> 1 rows x 1 columns
> #> $x <timestamp[ns]>
> read_csv_arrow(
> "tmp.csv",
> col_names = "x",
> col_types = "?",
> skip = 1,
> as_data_frame = FALSE
> )
> #> Table
> #> 1 rows x 1 columns
> #> $x <timestamp[ns]>
> read_csv_arrow(
> "tmp.csv",
> col_names = "x",
> col_types = "T",
> skip = 1,
> as_data_frame = FALSE
> )
> #> Error: Invalid: In CSV column #0: CSV conversion error to timestamp[s]:
> invalid value '2018-10-07 19:04:05.005'
> read_csv_arrow(
> "tmp.csv",
> col_names = "x",
> col_types = "T",
> as_data_frame = FALSE,
> skip = 1,
> timestamp_parsers = "%Y-%m-%d %H:%M:%S"
> )
> #> Error: Invalid: In CSV column #0: CSV conversion error to timestamp[s]:
> invalid value '2018-10-07 19:04:05.005'
> {code}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)