This is an automated email from the ASF dual-hosted git repository.

jonkeane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 3fb84ff2c6 GH-48629: [R] Add tests for duplicate column names and 
incompatible types in joins (#48630)
3fb84ff2c6 is described below

commit 3fb84ff2c66deea205b45cad697ab724ea98ad5d
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Sat Jan 10 23:41:35 2026 +0900

    GH-48629: [R] Add tests for duplicate column names and incompatible types 
in joins (#48630)
    
    ### Rationale for this change
    
    Two TODOs in `test-dplyr-join.R` requested test coverage for edge cases
    in join operations:
    1. Testing duplicate column names in joins (with suffixes)
    2. Testing type casting behavior when joining on columns with
    incompatible types
    
    ### What changes are included in this PR?
    
    1. Duplicated column names test
       - Default suffixes (`.x` and `.y`) when no explicit suffix is provided
       - Custom suffixes (`_left` and `_right`)
       - Merged the existing "suffix" test into this test block
    
    2. Join key cast failure test
       - Tests that joining on key columns with incompatible types (int32 vs
         double) raises an error
    
    ### Are these changes tested?
    
    Yes, the corresponding tests were added.
    
    ### Are there any user-facing changes?
    
    No, test-only changes.
    
    * GitHub Issue: #48629
---
 r/tests/testthat/test-dplyr-join.R | 111 +++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 22 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-join.R 
b/r/tests/testthat/test-dplyr-join.R
index 51ca528a64..ce7ed2e63f 100644
--- a/r/tests/testthat/test-dplyr-join.R
+++ b/r/tests/testthat/test-dplyr-join.R
@@ -188,8 +188,95 @@ test_that("Error handling for unsupported expressions in 
join_by", {
   )
 })
 
-# TODO: test duplicate col names
-# TODO: casting: int and float columns?
+test_that("joins with duplicate column names", {
+  # When column names are duplicated (not in by), suffixes are added
+  left_dup <- tibble::tibble(
+    key = 1:5,
+    shared = 1:5,
+    shared_float = c(1.1, 2.2, 3.3, 4.4, 5.5),
+    left_unique = letters[1:5]
+  )
+  right_dup <- tibble::tibble(
+    key = 1:5,
+    shared = 6:10,
+    shared_float = c(6.1, 7.2, 8.3, 9.4, 10.5),
+    right_unique = LETTERS[1:5]
+  )
+
+  # Test with default suffixes (.x and .y)
+  compare_dplyr_binding(
+    .input |>
+      left_join(right_dup, by = "key") |>
+      collect(),
+    left_dup
+  )
+
+  compare_dplyr_binding(
+    .input |>
+      inner_join(right_dup, by = "key") |>
+      collect(),
+    left_dup
+  )
+
+  # Test with custom suffixes
+  compare_dplyr_binding(
+    .input |>
+      left_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
+      collect(),
+    left_dup
+  )
+
+  compare_dplyr_binding(
+    .input |>
+      inner_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
+      collect(),
+    left_dup
+  )
+
+  # Test that column names are correctly suffixed
+  # Verify exact column names match expected pattern using the same fixture
+  result <- arrow_table(left_dup) |>
+    inner_join(
+      arrow_table(right_dup),
+      by = "key",
+      suffix = c("_left", "_right")
+    ) |>
+    collect()
+  res_col_names <- names(result)
+  # Column order: join key first, then left table columns (with suffixes),
+  # then right table columns (with suffixes)
+  expected_col_names <- c(
+    "key",
+    "shared_left",
+    "shared_float_left",
+    "left_unique",
+    "shared_right",
+    "shared_float_right",
+    "right_unique"
+  )
+  expect_equal(expected_col_names, res_col_names)
+})
+
+test_that("joins with incompatible types for join keys", {
+  # Test that joining on columns with incompatible types (int vs float) fails
+  # Arrow requires join keys to have compatible types - type casting is not
+  # automatically performed for join keys
+  left_int <- Table$create(
+    x = c(1L, 2L),
+    shared = c(10L, 20L)
+  )
+  right_float <- Table$create(
+    x = c(1.0, 2.0),
+    shared = c(10.1, 20.2)
+  )
+
+  expect_error(
+    left_int |>
+      left_join(right_float, by = "x") |>
+      collect(),
+    "Incompatible data types for corresponding join field keys"
+  )
+})
 
 test_that("right_join", {
   compare_dplyr_binding(
@@ -317,26 +404,6 @@ test_that("arrow dplyr query correctly filters then 
joins", {
   )
 })
 
-test_that("suffix", {
-  left_suf <- Table$create(
-    key = c(1, 2),
-    left_unique = c(2.1, 3.1),
-    shared = c(10.1, 10.3)
-  )
-
-  right_suf <- Table$create(
-    key = c(1, 2, 3, 10, 20),
-    right_unique = c(1.1, 1.2, 3.1, 4.1, 4.3),
-    shared = c(20.1, 30, 40, 50, 60)
-  )
-
-  join_op <- inner_join(left_suf, right_suf, by = "key", suffix = c("_left", 
"_right"))
-  output <- collect(join_op)
-  res_col_names <- names(output)
-  expected_col_names <- c("key", "left_unique", "shared_left", "right_unique", 
"shared_right")
-  expect_equal(expected_col_names, res_col_names)
-})
-
 test_that("suffix and implicit schema", {
   left_suf <- Table$create(
     key = c(1, 2),

Reply via email to