from JSON [arrow-go]

via GitHub Fri, 05 Jun 2026 11:47:04 -0700


Copilot commented on code in PR #833:
URL: https://github.com/apache/arrow-go/pull/833#discussion_r3364700027



##########
arrow/array/struct.go:
##########
@@ -207,9 +207,14 @@ func (a *Struct) GetOneForMarshal(i int) interface{} {
        }
 
        tmp := make(map[string]interface{})
-       fieldList := a.data.dtype.(*arrow.StructType).Fields()
+       dtype := a.data.dtype.(*arrow.StructType)
+       fieldList := dtype.Fields()
        for j, d := range a.fields {
-               tmp[fieldList[j].Name] = d.GetOneForMarshal(i)
+               if dtype.Field(j).Nullable && a.IsNull(i) {
+                       tmp[fieldList[j].Name] = nil
+               } else {
+                       tmp[fieldList[j].Name] = d.GetOneForMarshal(i)
+               }
        }

Review Comment:
   In Struct.GetOneForMarshal, the per-field nullability check is ineffective 
because it re-checks a.IsNull(i) inside the loop after already returning early 
when the struct is null. As written, the `if dtype.Field(j).Nullable && 
a.IsNull(i)` branch can never be taken, so the new nullable-field handling 
doesn’t actually run.



##########
arrow/array/struct.go:
##########
@@ -467,19 +472,27 @@ func (b *StructBuilder) UnmarshalOne(dec *json.Decoder) 
error {
                        if keylist[key] {
                                return fmt.Errorf("key %s is specified twice", 
key)
                        }
-
                        keylist[key] = true
 
-                       idx, ok := b.dtype.(*arrow.StructType).FieldIdx(key)
+                       var next json.RawMessage
+                       if err := dec.Decode(&next); err != nil {
+                               return err
+                       }
+
+                       dtype := b.dtype.(*arrow.StructType)
+
+                       idx, ok := dtype.FieldIdx(key)
                        if !ok {
-                               var extra interface{}
-                               if err := dec.Decode(&extra); err != nil {
-                                       return err
-                               }
                                continue
                        }
 
-                       if err := b.fields[idx].UnmarshalOne(dec); err != nil {
+                       if bytes.Equal(next, []byte("null")) && 
!dtype.Field(idx).Nullable {
+                               return fmt.Errorf("field '%s' is non-nullable 
but got null", dtype.Field(idx).Name)
+                       }
+
+                       valDec := json.NewDecoder(bytes.NewReader(next))
+                       valDec.UseNumber()
+                       if err := b.fields[idx].UnmarshalOne(valDec); err != 
nil {
                                return err

Review Comment:
   On this error path (non-nullable field receiving `null`), 
StructBuilder.UnmarshalOne returns after the struct row has already been 
appended earlier in the method. That leaves the builder length advanced (and 
potentially partially-appended child fields), which can corrupt the builder for 
subsequent reads after an error.



##########
arrow/array/record_test.go:
##########
@@ -527,9 +541,27 @@ func TestRecordBuilder(t *testing.T) {
        if got, want := rec.ColumnName(0), schema.Field(0).Name; got != want {
                t.Fatalf("invalid column name: got=%q, want=%q", got, want)
        }
-       if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]} 
{[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []}]`; got != want {
-               t.Fatalf("invalid column name: got=%q, want=%q", got, want)
+
+       if got, want := rec.Column(0).String(), `[(null) 2 3 4 5 6]`; got != 
want {
+               t.Fatalf("invalid column values: got=%q, want=%q", got, want)
+       }
+       if got, want := rec.Column(1).String(), `[1.1 2.2 3.3 4.4 5.5 6.6]`; 
got != want {
+               t.Fatalf("invalid column values: got=%q, want=%q", got, want)
        }
+       if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]} 
{[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []} {["4"] ["d"]}]`; got != 
want {
+               t.Fatalf("invalid column values: got=%q, want=%q", got, want)
+       }
+
+       // roundtripping from JSON with array.FromJSON should work
+       arr := array.RecordToStructArray(rec)
+       defer arr.Release()
+       jsonStr, err := json.Marshal(arr)
+       assert.NoError(t, err)
+
+       roundtripped, _, err := array.FromJSON(mem, arr.DataType(), 
bytes.NewReader(jsonStr))
+       defer roundtripped.Release()
+       assert.NoError(t, err)
+       assert.Truef(t, array.Equal(arr, roundtripped), "JSON round trip 
returns different array: got=%q, want=%d", arr, roundtripped)

Review Comment:
   roundtripped.Release() is deferred before checking `err`. Since 
array.FromJSON returns a nil array on error, this can panic and hide the real 
failure. Also, the assert.Truef format string uses `%d` for an array value, 
which will produce malformed output when the assertion fails.



##########
arrow/array/struct_test.go:
##########
@@ -472,52 +474,48 @@ func TestStructArrayUnmarshalJSONMissingFields(t 
*testing.T) {
                name      string
                jsonInput string
                want      string
-               panic     bool
+               panicErr  error
        }{
                {
                        name:      "missing required field",
                        jsonInput: `[{"f2": 3, "f3": {"f3_1": "test"}}]`,
-                       panic:     true,
+                       panicErr:  errors.New("arrow/array: index out of 
range"),
                        want:      "",
                },
                {
                        name:      "missing optional fields",
                        jsonInput: `[{"f2": 3, "f3": {"f3_3": "test"}}]`,
-                       panic:     false,
+                       panicErr:  nil,
                        want:      `{[(null)] [3] {[(null)] [(null)] 
["test"]}}`,
                },
+               {
+                       name:      "explicit null in required field",
+                       jsonInput: `[{"f2": 3, "f3": {"f3_3": null}}]`,
+                       panicErr:  errors.New("field 'f3_3' is non-nullable but 
got null"),
+                       want:      "",
+               },
        }
 
        for _, tc := range tests {
                t.Run(
                        tc.name, func(t *testing.T) {
-
-                               var val bool
-
                                sb := array.NewStructBuilder(pool, dtype)
                                defer sb.Release()
 
-                               if tc.panic {
-                                       defer func() {
-                                               e := recover()
-                                               if e == nil {
-                                                       t.Fatalf("this should 
have panicked, but did not; slice value %v", val)
-                                               }
-                                               if got, want := e.(string), 
"arrow/array: index out of range"; got != want {
-                                                       t.Fatalf("invalid 
error. got=%q, want=%q", got, want)
-                                               }
-                                       }()
-                               } else {
-                                       defer func() {
-                                               if e := recover(); e != nil {
-                                                       t.Fatalf("unexpected 
panic: %v", e)
-                                               }
-                                       }()
-                               }
+                               defer func() {
+                                       e := recover()
+                                       if e == nil && tc.panicErr != nil {
+                                               t.Fatalf("did not panic, 
expected panic: %v", tc.panicErr)
+                                       } else if e != nil && tc.panicErr == 
nil {
+                                               t.Fatalf("unexpected panic: 
%v", e)
+                                       } else if e != nil && tc.panicErr != 
nil && fmt.Errorf("%s", e).Error() != tc.panicErr.Error() {
+                                               t.Fatalf("invalid error. 
got=%v, want=%v", e, tc.panicErr.Error())
+                                       }
+                               }()
 
                                err := sb.UnmarshalJSON([]byte(tc.jsonInput))
                                if err != nil {
-                                       t.Fatal(err)
+                                       panic(err)
                                }

Review Comment:
   This test currently converts any returned error from UnmarshalJSON into a 
panic (`panic(err)`), which makes it impossible to assert the intended behavior 
(e.g., the new non-nullable-null path returns an error, not a panic). 
Additionally, `fmt.Errorf("%s", e)` is incorrect when `e` is an error value and 
will produce `%!s(...)`-style strings.



##########
arrow/array/record_test.go:
##########
@@ -511,14 +516,23 @@ func TestRecordBuilder(t *testing.T) {
                }
        }
 
+       err := b.UnmarshalJSON([]byte(`{"f1-i32": null, "f2-f64-notnull": null, 
"map": null}`))
+       assert.Contains(t, err.Error(), "field 'f2-f64-notnull' is non-nullable 
but got null")
+
+       err = b.UnmarshalJSON([]byte(`{"f1-i32": null, "map": null}`))
+       assert.Contains(t, err.Error(), "field 'f2-f64-notnull' is required but 
no value was given")
+
+       err = b.UnmarshalJSON([]byte(`{"f1-i32": 6, "f2-f64-notnull": 6.6, 
"map": [{"key": "4": "value": "d"}]}`))
+       assert.NoError(t, err)

Review Comment:
   This JSON literal used for the successful UnmarshalJSON case is invalid 
(`{"key": "4": "value": "d"}` is missing a comma). As written, this test will 
fail before it can exercise the new non-nullable validation behavior.



##########
arrow/array/record.go:
##########
@@ -434,43 +433,69 @@ func (b *RecordBuilder) UnmarshalOne(dec *json.Decoder) 
error {
                return fmt.Errorf("record should start with '{', not %s", t)
        }
 
-       keylist := make(map[string]bool)
+       // consume one row checking for duplicates and nulls
+       keylist := make(map[string]json.RawMessage)
        for dec.More() {
                keyTok, err := dec.Token()
                if err != nil {
                        return err
                }
 
                key := keyTok.(string)
-               if keylist[key] {
+               if _, ok := keylist[key]; ok {
                        return fmt.Errorf("key %s shows up twice in row to be 
decoded", key)
                }
-               keylist[key] = true
+
+               var val json.RawMessage
+               if err := dec.Decode(&val); err != nil {
+                       return err
+               }
 
                indices := b.schema.FieldIndices(key)
                if len(indices) == 0 {
-                       var extra interface{}
-                       if err := dec.Decode(&extra); err != nil {
-                               return err
-                       }
                        continue
                }
 
-               if err := b.fields[indices[0]].UnmarshalOne(dec); err != nil {
-                       return err
+               idx := indices[0]
+
+               if bytes.Equal(val, []byte("null")) && 
!b.schema.Field(idx).Nullable {
+                       return fmt.Errorf("field '%s' is non-nullable but got 
null", key)
                }
+
+               keylist[key] = val
        }
 
        // consume the closing '}'
        if _, err := dec.Token(); err != nil {
                return err
        }
 
+       // check that all non-nullable fields were specified
+       for i := 0; i < b.schema.NumFields(); i++ {
+               f := b.schema.Field(i)
+               if _, ok := keylist[f.Name]; !ok && !f.Nullable {
+                       return fmt.Errorf("field '%s' is required but no value 
was given", f.Name)
+               }
+       }
+
+       // at this point we know there are no integrity errors, append values 
to field builders
+       for key, val := range keylist {
+               valDec := json.NewDecoder(bytes.NewReader(val))
+               valDec.UseNumber()
+
+               indices := b.schema.FieldIndices(key)
+               if err := b.fields[indices[0]].UnmarshalOne(valDec); err != nil 
{
+                       return err
+               }
+       }

Review Comment:
   Even with the validate-then-append split, UnmarshalOne can still leave the 
RecordBuilder in a partially-appended state when a field’s UnmarshalOne returns 
an error (e.g., type mismatch) after some other fields have already been 
appended. Because the loop appends per key and returns immediately on error, 
the final “append missing nulls” pass won’t run, and column lengths can diverge.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] fix(arrow/array): Parity with Arrow C++ when reading/writing `null` to/from JSON [arrow-go]

Reply via email to