Re: [PR] tests: adds tests associated with #9237 [arrow-datafusion]

via GitHub Thu, 22 Feb 2024 10:15:05 -0800


tshauck commented on code in PR #9316:
URL: https://github.com/apache/arrow-datafusion/pull/9316#discussion_r1499700380



##########
datafusion/core/tests/dataframe/mod.rs:
##########
@@ -1895,3 +1899,155 @@ async fn 
test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]
+async fn write_partitioned_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+
+    let ctx = SessionContext::new();
+
+    // Create an in memory table with schema C1 and C2, both strings
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Utf8, false),
+        Field::new("c2", DataType::Utf8, false),
+    ]));
+
+    let record_batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(vec!["abc", "def"])),
+            Arc::new(StringArray::from(vec!["123", "456"])),
+        ],
+    )?;
+
+    let mem_table = Arc::new(MemTable::try_new(schema, 
vec![vec![record_batch]])?);
+
+    // Register the table in the context
+    ctx.register_table("test", mem_table)?;
+
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = format!("file://{out_dir}");
+
+    // Write the results to parquet with partitioning
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    let df_write_options =
+        
DataFrameWriteOptions::new().with_partition_by(vec![String::from("c2")]);
+
+    df.write_parquet(&out_dir_url, df_write_options, None)
+        .await?;
+
+    // Explicitly read the parquet file at c2=123 to verify the physical files 
are partitioned
+    let partitioned_file = format!("{out_dir}/c2=123", out_dir = out_dir);
+    let filted_df = ctx
+        .read_parquet(&partitioned_file, ParquetReadOptions::default())
+        .await?;
+
+    // Check that the c2 column is gone and that c1 is abc.
+    let results = filted_df.collect().await?;
+    let expected = ["+-----+", "| c1  |", "+-----+", "| abc |", "+-----+"];
+
+    assert_batches_eq!(expected, &results);
+
+    // Read the entire set of parquet files
+    let df = ctx
+        .read_parquet(
+            &out_dir_url,
+            ParquetReadOptions::default()
+                .table_partition_cols(vec![(String::from("c2"), 
DataType::Utf8)]),
+        )
+        .await?;
+
+    // Check that the df has the entire set of data
+    let results = df.collect().await?;
+    let expected = [
+        "+-----+-----+",
+        "| c1  | c2  |",
+        "+-----+-----+",
+        "| abc | 123 |",
+        "| def | 456 |",
+        "+-----+-----+",
+    ];
+
+    assert_batches_eq!(expected, &results);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn write_parquet_results() -> Result<()> {

Review Comment:
   This is the moved test.



##########
datafusion/core/tests/dataframe/mod.rs:
##########
@@ -1895,3 +1899,155 @@ async fn 
test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     Ok(())
 }
+
+#[tokio::test]

Review Comment:
   This is the new test.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] tests: adds tests associated with #9237 [arrow-datafusion]

Reply via email to