Re: [PR] feat: `DataFrame` supports unnesting multiple columns [arrow-datafusion]

via GitHub Wed, 17 Apr 2024 17:18:20 -0700


jayzhan211 commented on code in PR #10118:
URL: 
https://github.com/apache/arrow-datafusion/pull/10118#discussion_r1569710494



##########
datafusion/core/tests/dataframe/mod.rs:
##########
@@ -1437,6 +1438,91 @@ async fn unnest_analyze_metrics() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn unnest_multiple_columns() -> Result<()> {
+    let df = table_with_mixed_lists().await?;
+    // Default behavior is to preserve nulls.
+    let results = df
+        .clone()
+        .unnest_columns(&["list", "large_list", "fixed_list"])?
+        .collect()
+        .await?;
+    let expected = [
+        "+------+------------+------------+--------+",
+        "| list | large_list | fixed_list | string |",
+        "+------+------------+------------+--------+",
+        "|      |            |            | d      |",
+        "|      |            | 3          | c      |",
+        "|      |            | 4          | c      |",
+        "|      | 2.2        | 1          | b      |",
+        "|      | 3.3        | 2          | b      |",
+        "|      | 4.4        |            | b      |",
+        "| 1    |            |            | a      |",
+        "| 2    | 1.1        |            | a      |",
+        "| 3    |            |            | a      |",
+        "+------+------------+------------+--------+",
+    ];
+    assert_batches_sorted_eq!(expected, &results);

Review Comment:
   I think the result is deterministic, so we can check it without sorting.
   
   Then the expected result can be listed in a more straightforward order:
   ```
   [
       "+------+------------+------------+--------+",
       "| list | large_list | fixed_list | string |",
       "+------+------------+------------+--------+",
       "| 1    |            |            | a      |",
       "| 2    | 1.1        |            | a      |",
       "| 3    |            |            | a      |",
       "|      | 2.2        | 1          | b      |",
       "|      | 3.3        | 2          | b      |",
       "|      | 4.4        |            | b      |",
       "|      |            | 3          | c      |",
       "|      |            | 4          | c      |",
       "|      |            |            | d      |",
       "+------+------------+------------+--------+",
   ]
   ```



##########
datafusion/core/tests/dataframe/mod.rs:
##########
@@ -1437,6 +1438,91 @@ async fn unnest_analyze_metrics() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn unnest_multiple_columns() -> Result<()> {
+    let df = table_with_mixed_lists().await?;
+    // Default behavior is to preserve nulls.
+    let results = df
+        .clone()
+        .unnest_columns(&["list", "large_list", "fixed_list"])?
+        .collect()
+        .await?;
+    let expected = [
+        "+------+------------+------------+--------+",
+        "| list | large_list | fixed_list | string |",
+        "+------+------------+------------+--------+",
+        "|      |            |            | d      |",
+        "|      |            | 3          | c      |",
+        "|      |            | 4          | c      |",
+        "|      | 2.2        | 1          | b      |",
+        "|      | 3.3        | 2          | b      |",
+        "|      | 4.4        |            | b      |",
+        "| 1    |            |            | a      |",
+        "| 2    | 1.1        |            | a      |",
+        "| 3    |            |            | a      |",
+        "+------+------------+------------+--------+",
+    ];
+    assert_batches_sorted_eq!(expected, &results);
+
+    // Test with `preserve_nulls = false`
+    let results = df
+        .clone()
+        .unnest_columns_with_options(
+            &["list", "large_list", "fixed_list"],
+            UnnestOptions::new().with_preserve_nulls(false),
+        )?
+        .collect()
+        .await?;
+    let expected = [
+        "+------+------------+------------+--------+",
+        "| list | large_list | fixed_list | string |",
+        "+------+------------+------------+--------+",
+        "|      |            | 3          | c      |",
+        "|      |            | 4          | c      |",
+        "|      | 2.2        | 1          | b      |",
+        "|      | 3.3        | 2          | b      |",
+        "|      | 4.4        |            | b      |",
+        "| 1    |            |            | a      |",
+        "| 2    | 1.1        |            | a      |",
+        "| 3    |            |            | a      |",
+        "+------+------------+------------+--------+",
+    ];
+    assert_batches_sorted_eq!(expected, &results);

Review Comment:
   Same here: this result should also be deterministic, so we can check it without sorting.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat: `DataFrame` supports unnesting multiple columns [arrow-datafusion]

Reply via email to