This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c20a04a0bf GH-35806: [R] Improve error message for null type inference with sparse CSV data
c20a04a0bf is described below
commit c20a04a0bf3c4e437b7f6370342b2ba8314a4257
Author: Nic Crane <[email protected]>
AuthorDate: Thu Feb 19 09:21:40 2026 +0000
GH-35806: [R] Improve error message for null type inference with sparse CSV data

When a CSV column contains only missing values in the first block of data,
Arrow infers the column's type as null. If a non-null value appears later, the
conversion fails with an unhelpful error suggesting `skip = 1`.

This change adds a specific check for "conversion error to null" and
provides a more helpful message explaining the cause (type inference
from sparse data) and the solution (specifying the column types explicitly).
Co-Authored-By: Claude Opus 4.5 <[email protected]>
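
[Editor's note, not part of the commit: a minimal sketch of the remedy the new
message points users to, i.e. supplying column types up front so inference never
runs on the sparse block. The file name "sparse.csv" and the x/y schema are made
up for illustration; `open_dataset()`, `schema()`, and the `skip = 1` argument
(needed so the header row is not read as data once a schema is supplied) are
existing arrow APIs.]

    library(arrow)
    library(dplyr)

    # Hypothetical sparse file: column y is empty throughout the first block,
    # so type inference alone would yield a null column. Supplying an explicit
    # schema sidesteps inference; skip = 1 prevents the header row from being
    # read as data once the schema replaces it.
    ds <- open_dataset(
      "sparse.csv",
      format = "csv",
      schema = schema(x = int64(), y = string()),
      skip = 1
    )
    collect(ds)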
---
r/R/util.R | 15 +++++++++++++++
r/tests/testthat/test-dataset-csv.R | 18 ++++++++++++++++++
2 files changed, 33 insertions(+)
diff --git a/r/R/util.R b/r/R/util.R
index c63e1ee545..acbd39e203 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) {
 }
 
 handle_csv_read_error <- function(msg, call, schema) {
+  # Handle null type inference issue with sparse data
+  if (grepl("conversion error to null", msg)) {
+    msg <- c(
+      msg,
+      i = paste(
+        "Column type was inferred as null because the first block of data",
+        "(default 1MB, set via `block_size` in read options) contained only",
+        "missing values. Try specifying the column types explicitly using the",
+        "`col_types` or `schema` argument."
+      )
+    )
+    abort(msg, call = call)
+  }
+
+  # Handle schema + header row issue
   if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
     msg <- c(
       msg,
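
[Editor's note on the message format used above: the character vector handed to
`rlang::abort()` renders named elements as bullets, with `i =` producing an info
bullet beneath the main error line, and `call = call` re-raises the error against
the caller passed into the helper. A standalone sketch with illustrative text
only, not the exact output of this code path:]

    library(rlang)

    # Unnamed element = header line; "i" = info bullet; "x" = cross bullet.
    # handle_csv_read_error() does the same thing: it appends an "i" bullet
    # to the original Arrow message and calls abort(msg, call = call).
    abort(c(
      "Invalid: conversion error to null in column 2",
      i = "Column type was inferred as null; specify `col_types` or `schema`."
    ))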
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 749d1672ac..145a376da9 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+
+test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  # Create a CSV where the second column has NAs in the first rows
+  # but a value later - this causes Arrow to infer null type
+  writeLines(c("x,y", paste0(1:100, ",")), tf)
+  write("101,foo", tf, append = TRUE)
+
+  # Use small block_size to force type inference from only the first rows
+  expect_error(
+    open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |>
+      collect(),
+    "inferred as null"
+  )
+})
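
[Editor's note, not part of the patch: the file the test constructs can also be
read successfully the other way the new message hints at, by making `block_size`
large enough that inference sees the non-missing value in the last row. This
sketch assumes `tf` from the test above and dplyr attached for `collect()`.]

    # The file is only a few hundred bytes, so any block size that covers it
    # (including the 1MB default) lets inference see "foo" and read column y
    # as a string instead of null.
    open_dataset(
      tf,
      format = "csv",
      read_options = csv_read_options(block_size = 1024L * 1024L)
    ) |>
      collect()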