alamb commented on code in PR #7276:
URL: https://github.com/apache/arrow-rs/pull/7276#discussion_r2953973293


##########
arrow-json/src/reader/mod.rs:
##########
@@ -281,6 +284,18 @@ impl ReaderBuilder {
         }
     }
 
+    /// Sets whether the decoder should produce NULL instead of returning an 
error if it encounters
+    /// a type conflict on a nullable column (effectively treating it as a 
non-existent column).
+    ///
+    /// NOTE: The inferred NULL on type conflict will still produce errors for 
non-nullable columns,
+    /// the same as any other NULL or missing value.

Review Comment:
   Can we please define what a "type conflict" means more specifically?  
Perhaps with an example
   
   I think it means something like:
   
   the JSON decoder encounters a value that can not be parsed into the 
specified column type. 
   
   For example, if the type is declared to be a nullable [`DataType::Int32`] 
but the reader encounters a string value `"foo"`:
   - If `with_ignore_type_conflicts` is set to false (the default), the reader 
will return an error. 
   - If `with_ignore_type_conflicts` is set to true, the reader will fill in 
`NULL` value for that array element 



##########
arrow-json/src/reader/mod.rs:
##########
@@ -2808,4 +2830,354 @@ mod tests {
             "Json error: whilst decoding field 'a': failed to parse \"a\" as 
Int32".to_owned()
         );
     }
+
+    #[test]
+    fn test_type_conflict_nulls() {
+        let schema = Schema::new(vec![
+            Field::new("null", DataType::Null, true),
+            Field::new("bool", DataType::Boolean, true),
+            Field::new("primitive", DataType::Int32, true),
+            Field::new("numeric", DataType::Decimal128(10, 3), true),
+            Field::new("string", DataType::Utf8, true),
+            Field::new("string_view", DataType::Utf8View, true),
+            Field::new(
+                "timestamp",
+                DataType::Timestamp(TimeUnit::Second, None),
+                true,
+            ),
+            Field::new(
+                "array",
+                DataType::List(Arc::new(Field::new("item", DataType::Int32, 
true))),
+                true,
+            ),
+            Field::new(
+                "map",
+                DataType::Map(
+                    Arc::new(Field::new(
+                        "entries",
+                        DataType::Struct(Fields::from(vec![
+                            Field::new("keys", DataType::Utf8, false),
+                            Field::new("values", DataType::Utf8, true),
+                        ])),
+                        false, // not nullable
+                    )),
+                    false, // not sorted
+                ),
+                true, // nullable
+            ),
+            Field::new(
+                "struct",
+                DataType::Struct(Fields::from(vec![Field::new("a", 
DataType::Int32, true)])),
+                true,
+            ),
+        ]);
+
+        // A compatible value for each schema field above, in schema order
+        let json_values = vec![
+            json!(null),
+            json!(true),
+            json!(42),
+            json!(1.234),
+            json!("hi"),
+            json!("ho"),
+            json!("1970-01-01T00:00:00+02:00"),
+            json!([1, "ho", 3]),
+            json!({"k": "value"}),
+            json!({"a": 1}),
+        ];
+
+        // Create a set of JSON rows that rotates each value past every field
+        let json: Vec<_> = (0..json_values.len())
+            .map(|i| {
+                let pairs = json_values[i..]
+                    .iter()
+                    .chain(json_values[..i].iter())
+                    .zip(&schema.fields)
+                    .map(|(v, f)| (f.name().to_string(), v.clone()))
+                    .collect();
+                serde_json::Value::Object(pairs)
+            })
+            .collect();
+        let mut decoder = ReaderBuilder::new(Arc::new(schema))
+            .with_ignore_type_conflicts(true)
+            .with_coerce_primitive(true)
+            .build_decoder()
+            .unwrap();
+        decoder.serialize(&json).unwrap();
+        let batch = decoder.flush().unwrap().unwrap();
+        assert_eq!(batch.num_rows(), 10);
+        assert_eq!(batch.num_columns(), 10);
+
+        // NOTE: NullArray doesn't materialize any values (they're all NULL by 
definition)
+        let _ = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<NullArray>()
+            .unwrap();
+
+        assert!(batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap()
+            .iter()
+            .eq([
+                Some(true),
+                None,
+                None,
+                None,
+                None,
+                None,
+                None,
+                None,
+                None,
+                None
+            ]));
+
+        assert!(batch.column(2).as_primitive::<Int32Type>().iter().eq([
+            Some(42),
+            Some(1),
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None
+        ]));
+
+        assert!(batch.column(3).as_primitive::<Decimal128Type>().iter().eq([
+            Some(1234),
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            Some(42000)
+        ]));
+
+        assert!(batch
+            .column(4)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap()
+            .iter()
+            .eq([
+                Some("hi"),
+                Some("ho"),
+                Some("1970-01-01T00:00:00+02:00"),
+                None,
+                None,
+                None,
+                None,
+                Some("true"),
+                Some("42"),
+                Some("1.234"),
+            ]));
+
+        assert!(batch
+            .column(5)
+            .as_any()
+            .downcast_ref::<StringViewArray>()
+            .unwrap()
+            .iter()
+            .eq([
+                Some("ho"),
+                Some("1970-01-01T00:00:00+02:00"),
+                None,
+                None,
+                None,
+                None,
+                Some("true"),
+                Some("42"),
+                Some("1.234"),
+                Some("hi"),
+            ]));
+
+        assert!(batch
+            .column(6)
+            .as_primitive::<TimestampSecondType>()
+            .iter()
+            .eq([
+                Some(-7200),
+                None,
+                None,
+                None,
+                None,
+                None,
+                Some(42),
+                None,
+                None,
+                None,
+            ]));
+
+        let arrays = batch
+            .column(7)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        assert_eq!(
+            arrays.nulls(),
+            Some(&NullBuffer::from(
+                &[true, false, false, false, false, false, false, false, 
false, false][..]
+            ))
+        );
+        assert_eq!(arrays.offsets()[1], 3);
+        let array_values = arrays
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert!(array_values.iter().eq([Some(1), None, Some(3)]));
+
+        let maps = 
batch.column(8).as_any().downcast_ref::<MapArray>().unwrap();
+        assert_eq!(
+            maps.nulls(),
+            Some(&NullBuffer::from(
+                // Both map and struct can parse
+                &[true, true, false, false, false, false, false, false, false, 
false][..]
+            ))
+        );
+        let map_keys = 
maps.keys().as_any().downcast_ref::<StringArray>().unwrap();
+        assert!(map_keys.iter().eq([Some("k"), Some("a")]));
+        let map_values = maps
+            .values()
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert!(map_values.iter().eq([Some("value"), Some("1")]));
+
+        let structs = batch
+            .column(9)
+            .as_any()
+            .downcast_ref::<StructArray>()
+            .unwrap();
+        assert_eq!(
+            structs.nulls(),
+            Some(&NullBuffer::from(
+                // Both map and struct can parse
+                &[true, false, false, false, false, false, false, false, 
false, true][..]
+            ))
+        );
+        let struct_fields = structs
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert!(struct_fields.slice(0, 2).iter().eq([Some(1), None]));
+    }
+
+    #[test]
+    fn test_type_conflict_non_nullable() {
+        let fields = [
+            Field::new("bool", DataType::Boolean, false),

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to