This is an automated email from the ASF dual-hosted git repository.
jonkeane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3fb84ff2c6 GH-48629: [R] Add tests for duplicate column names and
incompatible types in joins (#48630)
3fb84ff2c6 is described below
commit 3fb84ff2c66deea205b45cad697ab724ea98ad5d
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Sat Jan 10 23:41:35 2026 +0900
GH-48629: [R] Add tests for duplicate column names and incompatible types
in joins (#48630)
### Rationale for this change
Two TODOs in `test-dplyr-join.R` requested test coverage for edge cases
in join operations:
1. Testing duplicate column names in joins (with suffixes)
2. Testing type casting behavior when joining on columns with
incompatible types
### What changes are included in this PR?
1. Duplicated column names test
- Default suffixes (`.x` and `.y`) when no explicit suffix is provided
- Custom suffixes (`_left` and `_right`)
- Merged the existing "suffix" test into this test block
2. Join key cast failure test
- Verifies that joining on key columns with incompatible types (int32 vs
double) raises an error instead of implicitly casting the keys
### Are these changes tested?
Yes, this PR consists entirely of the new tests.
### Are there any user-facing changes?
No, test-only changes.
* GitHub Issue: #48629
---
r/tests/testthat/test-dplyr-join.R | 111 +++++++++++++++++++++++++++++--------
1 file changed, 89 insertions(+), 22 deletions(-)
diff --git a/r/tests/testthat/test-dplyr-join.R
b/r/tests/testthat/test-dplyr-join.R
index 51ca528a64..ce7ed2e63f 100644
--- a/r/tests/testthat/test-dplyr-join.R
+++ b/r/tests/testthat/test-dplyr-join.R
@@ -188,8 +188,95 @@ test_that("Error handling for unsupported expressions in
join_by", {
)
})
-# TODO: test duplicate col names
-# TODO: casting: int and float columns?
+test_that("joins with duplicate column names", {
+ # When column names are duplicated (not in by), suffixes are added
+ left_dup <- tibble::tibble(
+ key = 1:5,
+ shared = 1:5,
+ shared_float = c(1.1, 2.2, 3.3, 4.4, 5.5),
+ left_unique = letters[1:5]
+ )
+ right_dup <- tibble::tibble(
+ key = 1:5,
+ shared = 6:10,
+ shared_float = c(6.1, 7.2, 8.3, 9.4, 10.5),
+ right_unique = LETTERS[1:5]
+ )
+
+ # Test with default suffixes (.x and .y)
+ compare_dplyr_binding(
+ .input |>
+ left_join(right_dup, by = "key") |>
+ collect(),
+ left_dup
+ )
+
+ compare_dplyr_binding(
+ .input |>
+ inner_join(right_dup, by = "key") |>
+ collect(),
+ left_dup
+ )
+
+ # Test with custom suffixes
+ compare_dplyr_binding(
+ .input |>
+ left_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
+ collect(),
+ left_dup
+ )
+
+ compare_dplyr_binding(
+ .input |>
+ inner_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
+ collect(),
+ left_dup
+ )
+
+ # Test that column names are correctly suffixed
+ # Verify exact column names match expected pattern using the same fixture
+ result <- arrow_table(left_dup) |>
+ inner_join(
+ arrow_table(right_dup),
+ by = "key",
+ suffix = c("_left", "_right")
+ ) |>
+ collect()
+ res_col_names <- names(result)
+ # Column order: join key first, then left table columns (with suffixes),
+ # then right table columns (with suffixes)
+ expected_col_names <- c(
+ "key",
+ "shared_left",
+ "shared_float_left",
+ "left_unique",
+ "shared_right",
+ "shared_float_right",
+ "right_unique"
+ )
+ expect_equal(expected_col_names, res_col_names)
+})
+
+test_that("joins with incompatible types for join keys", {
+ # Test that joining on columns with incompatible types (int vs float) fails
+ # Arrow requires join keys to have compatible types - type casting is not
+ # automatically performed for join keys
+ left_int <- Table$create(
+ x = c(1L, 2L),
+ shared = c(10L, 20L)
+ )
+ right_float <- Table$create(
+ x = c(1.0, 2.0),
+ shared = c(10.1, 20.2)
+ )
+
+ expect_error(
+ left_int |>
+ left_join(right_float, by = "x") |>
+ collect(),
+ "Incompatible data types for corresponding join field keys"
+ )
+})
test_that("right_join", {
compare_dplyr_binding(
@@ -317,26 +404,6 @@ test_that("arrow dplyr query correctly filters then
joins", {
)
})
-test_that("suffix", {
- left_suf <- Table$create(
- key = c(1, 2),
- left_unique = c(2.1, 3.1),
- shared = c(10.1, 10.3)
- )
-
- right_suf <- Table$create(
- key = c(1, 2, 3, 10, 20),
- right_unique = c(1.1, 1.2, 3.1, 4.1, 4.3),
- shared = c(20.1, 30, 40, 50, 60)
- )
-
- join_op <- inner_join(left_suf, right_suf, by = "key", suffix = c("_left",
"_right"))
- output <- collect(join_op)
- res_col_names <- names(output)
- expected_col_names <- c("key", "left_unique", "shared_left", "right_unique",
"shared_right")
- expect_equal(expected_col_names, res_col_names)
-})
-
test_that("suffix and implicit schema", {
left_suf <- Table$create(
key = c(1, 2),