alamb commented on a change in pull request #1024:
URL: https://github.com/apache/arrow-datafusion/pull/1024#discussion_r712904314



##########
File path: datafusion/src/execution/context.rs
##########
@@ -3753,6 +3753,67 @@ mod tests {
         assert_eq!(Weak::strong_count(&catalog_weak), 0);
     }
 
+    #[tokio::test]
+    async fn schema_merge_ignores_metadata() {
+        // Create two parquet files in same table with same schema but different metadata
+        let tmp_dir = TempDir::new().unwrap();
+        let table_dir = tmp_dir.path().join("parquet_test");
+        let table_path = Path::new(&table_dir);
+
+        let mut non_empty_metadata: HashMap<String, String> = HashMap::new();
+        non_empty_metadata.insert("testing".to_string(), "metadata".to_string());
+
+        let fields = vec![
+            Field::new("id", DataType::Int32, true),
+            Field::new("name", DataType::Utf8, true),
+        ];
+        let schemas = vec![
+            Arc::new(Schema::new_with_metadata(
+                fields.clone(),
+                non_empty_metadata.clone(),
+            )),
+            Arc::new(Schema::new(fields.clone())),
+        ];
+
+        match fs::create_dir(table_path) {
+            Ok(()) => {
+                for i in 0..2 {
+                    let filename = format!("part-{}.parquet", i);
+                    let path = table_path.join(&filename);
+                    let file = fs::File::create(path).unwrap();
+                    let mut writer = ArrowWriter::try_new(
+                        file.try_clone().unwrap(),
+                        schemas[i].clone(),
+                        None,
+                    )
+                    .unwrap();
+
+                    // create mock record batch
+                    let ids = Arc::new(Int32Array::from(vec![i as i32]));
+                    let names = Arc::new(StringArray::from(vec!["test"]));
+                    let rec_batch =
+                        RecordBatch::try_new(schemas[i].clone(), vec![ids, names])
+                            .unwrap();
+
+                    writer.write(&rec_batch).unwrap();
+                    writer.close().unwrap();
+                }
+            }
+            _ => {}
+        }
+
+        // Read the parquet files into a dataframe to confirm results
+        let mut ctx = ExecutionContext::new();
+        let df = ctx
+            .read_parquet(table_dir.to_str().unwrap().to_string())
+            .unwrap();
+        let result = df.collect().await.unwrap();
+
+        // I expected this to be true if files are read sequentially
+        // assert_eq!(result[0].schema().metadata(), &non_empty_metadata);

Review comment:
       I think the order in which the files are read by parquet is the order in which listing the directory returns the filenames, which is non-deterministic on most filesystems. Thus I would not expect this check to always be true.
   
   ```suggestion
   ```
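   If a deterministic assertion is still wanted here, one option (a sketch only, not something this PR does) is to check a property that does not depend on file order, such as the total row count across the collected batches:

   ```rust
   // Sketch: an order-independent check. `result` is the Vec<RecordBatch>
   // collected above; two files with one row each should always yield two
   // rows in total, whichever order the directory listing returns them in.
   let total_rows: usize = result.iter().map(|batch| batch.num_rows()).sum();
   assert_eq!(total_rows, 2);
   ```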

##########
File path: datafusion/src/execution/context.rs
##########
@@ -3753,6 +3753,67 @@ mod tests {
         assert_eq!(Weak::strong_count(&catalog_weak), 0);
     }
 
+    #[tokio::test]
+    async fn schema_merge_ignores_metadata() {
+        // Create two parquet files in same table with same schema but different metadata
+        let tmp_dir = TempDir::new().unwrap();
+        let table_dir = tmp_dir.path().join("parquet_test");
+        let table_path = Path::new(&table_dir);
+
+        let mut non_empty_metadata: HashMap<String, String> = HashMap::new();
+        non_empty_metadata.insert("testing".to_string(), "metadata".to_string());
+
+        let fields = vec![
+            Field::new("id", DataType::Int32, true),
+            Field::new("name", DataType::Utf8, true),
+        ];
+        let schemas = vec![
+            Arc::new(Schema::new_with_metadata(
+                fields.clone(),
+                non_empty_metadata.clone(),
+            )),
+            Arc::new(Schema::new(fields.clone())),
+        ];
+
+        match fs::create_dir(table_path) {
+            Ok(()) => {
+                for i in 0..2 {
+                    let filename = format!("part-{}.parquet", i);
+                    let path = table_path.join(&filename);
+                    let file = fs::File::create(path).unwrap();
+                    let mut writer = ArrowWriter::try_new(
+                        file.try_clone().unwrap(),
+                        schemas[i].clone(),
+                        None,
+                    )
+                    .unwrap();
+
+                    // create mock record batch
+                    let ids = Arc::new(Int32Array::from(vec![i as i32]));
+                    let names = Arc::new(StringArray::from(vec!["test"]));
+                    let rec_batch =
+                        RecordBatch::try_new(schemas[i].clone(), vec![ids, names])
+                            .unwrap();
+
+                    writer.write(&rec_batch).unwrap();
+                    writer.close().unwrap();
+                }
+            }
+            _ => {}
+        }
+
+        // Read the parquet files into a dataframe to confirm results

Review comment:
   ```suggestion
        // Read the parquet files into a dataframe to confirm results
        // (no errors)
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

