Copilot commented on code in PR #833:
URL: https://github.com/apache/arrow-go/pull/833#discussion_r3364700027
##########
arrow/array/struct.go:
##########
@@ -207,9 +207,14 @@ func (a *Struct) GetOneForMarshal(i int) interface{} {
}
tmp := make(map[string]interface{})
- fieldList := a.data.dtype.(*arrow.StructType).Fields()
+ dtype := a.data.dtype.(*arrow.StructType)
+ fieldList := dtype.Fields()
for j, d := range a.fields {
- tmp[fieldList[j].Name] = d.GetOneForMarshal(i)
+ if dtype.Field(j).Nullable && a.IsNull(i) {
+ tmp[fieldList[j].Name] = nil
+ } else {
+ tmp[fieldList[j].Name] = d.GetOneForMarshal(i)
+ }
}
Review Comment:
In Struct.GetOneForMarshal, the per-field nullability check is ineffective
because it re-checks a.IsNull(i) inside the loop after already returning early
when the struct is null. As written, the `if dtype.Field(j).Nullable &&
a.IsNull(i)` branch can never be taken, so the new nullable-field handling
doesn’t actually run.
##########
arrow/array/struct.go:
##########
@@ -467,19 +472,27 @@ func (b *StructBuilder) UnmarshalOne(dec *json.Decoder)
error {
if keylist[key] {
return fmt.Errorf("key %s is specified twice",
key)
}
-
keylist[key] = true
- idx, ok := b.dtype.(*arrow.StructType).FieldIdx(key)
+ var next json.RawMessage
+ if err := dec.Decode(&next); err != nil {
+ return err
+ }
+
+ dtype := b.dtype.(*arrow.StructType)
+
+ idx, ok := dtype.FieldIdx(key)
if !ok {
- var extra interface{}
- if err := dec.Decode(&extra); err != nil {
- return err
- }
continue
}
- if err := b.fields[idx].UnmarshalOne(dec); err != nil {
+ if bytes.Equal(next, []byte("null")) &&
!dtype.Field(idx).Nullable {
+ return fmt.Errorf("field '%s' is non-nullable
but got null", dtype.Field(idx).Name)
+ }
+
+ valDec := json.NewDecoder(bytes.NewReader(next))
+ valDec.UseNumber()
+ if err := b.fields[idx].UnmarshalOne(valDec); err !=
nil {
return err
Review Comment:
On this error path (non-nullable field receiving `null`),
StructBuilder.UnmarshalOne returns after the struct row has already been
appended earlier in the method. That leaves the builder length advanced (and
potentially partially-appended child fields), which can corrupt the builder for
subsequent reads after an error.
##########
arrow/array/record_test.go:
##########
@@ -527,9 +541,27 @@ func TestRecordBuilder(t *testing.T) {
if got, want := rec.ColumnName(0), schema.Field(0).Name; got != want {
t.Fatalf("invalid column name: got=%q, want=%q", got, want)
}
- if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]}
{[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []}]`; got != want {
- t.Fatalf("invalid column name: got=%q, want=%q", got, want)
+
+ if got, want := rec.Column(0).String(), `[(null) 2 3 4 5 6]`; got !=
want {
+ t.Fatalf("invalid column values: got=%q, want=%q", got, want)
+ }
+ if got, want := rec.Column(1).String(), `[1.1 2.2 3.3 4.4 5.5 6.6]`;
got != want {
+ t.Fatalf("invalid column values: got=%q, want=%q", got, want)
}
+ if got, want := rec.Column(2).String(), `[{["0" "2" "3"] ["a" "b" "c"]}
{[] []} {[] []} {["3" "2" "3"] ["a" "b" "c"]} {[] []} {["4"] ["d"]}]`; got !=
want {
+ t.Fatalf("invalid column values: got=%q, want=%q", got, want)
+ }
+
+ // roundtripping from JSON with array.FromJSON should work
+ arr := array.RecordToStructArray(rec)
+ defer arr.Release()
+ jsonStr, err := json.Marshal(arr)
+ assert.NoError(t, err)
+
+ roundtripped, _, err := array.FromJSON(mem, arr.DataType(),
bytes.NewReader(jsonStr))
+ defer roundtripped.Release()
+ assert.NoError(t, err)
+ assert.Truef(t, array.Equal(arr, roundtripped), "JSON round trip
returns different array: got=%q, want=%d", arr, roundtripped)
Review Comment:
roundtripped.Release() is deferred before checking `err`. Since
array.FromJSON returns a nil array on error, this can panic and hide the real
failure. Also, the assert.Truef format string uses `%d` for an array value,
which will produce malformed output when the assertion fails.
##########
arrow/array/struct_test.go:
##########
@@ -472,52 +474,48 @@ func TestStructArrayUnmarshalJSONMissingFields(t
*testing.T) {
name string
jsonInput string
want string
- panic bool
+ panicErr error
}{
{
name: "missing required field",
jsonInput: `[{"f2": 3, "f3": {"f3_1": "test"}}]`,
- panic: true,
+ panicErr: errors.New("arrow/array: index out of
range"),
want: "",
},
{
name: "missing optional fields",
jsonInput: `[{"f2": 3, "f3": {"f3_3": "test"}}]`,
- panic: false,
+ panicErr: nil,
want: `{[(null)] [3] {[(null)] [(null)]
["test"]}}`,
},
+ {
+ name: "explicit null in required field",
+ jsonInput: `[{"f2": 3, "f3": {"f3_3": null}}]`,
+ panicErr: errors.New("field 'f3_3' is non-nullable but
got null"),
+ want: "",
+ },
}
for _, tc := range tests {
t.Run(
tc.name, func(t *testing.T) {
-
- var val bool
-
sb := array.NewStructBuilder(pool, dtype)
defer sb.Release()
- if tc.panic {
- defer func() {
- e := recover()
- if e == nil {
- t.Fatalf("this should
have panicked, but did not; slice value %v", val)
- }
- if got, want := e.(string),
"arrow/array: index out of range"; got != want {
- t.Fatalf("invalid
error. got=%q, want=%q", got, want)
- }
- }()
- } else {
- defer func() {
- if e := recover(); e != nil {
- t.Fatalf("unexpected
panic: %v", e)
- }
- }()
- }
+ defer func() {
+ e := recover()
+ if e == nil && tc.panicErr != nil {
+ t.Fatalf("did not panic,
expected panic: %v", tc.panicErr)
+ } else if e != nil && tc.panicErr ==
nil {
+ t.Fatalf("unexpected panic:
%v", e)
+ } else if e != nil && tc.panicErr !=
nil && fmt.Errorf("%s", e).Error() != tc.panicErr.Error() {
+ t.Fatalf("invalid error.
got=%v, want=%v", e, tc.panicErr.Error())
+ }
+ }()
err := sb.UnmarshalJSON([]byte(tc.jsonInput))
if err != nil {
- t.Fatal(err)
+ panic(err)
}
Review Comment:
This test currently converts any returned error from UnmarshalJSON into a
panic (`panic(err)`), which makes it impossible to assert the intended behavior
(e.g., the new non-nullable-null path returns an error, not a panic).
Additionally, `fmt.Errorf("%s", e)` is incorrect when `e` is an error value and
will produce `%!s(...)`-style strings.
##########
arrow/array/record_test.go:
##########
@@ -511,14 +516,23 @@ func TestRecordBuilder(t *testing.T) {
}
}
+ err := b.UnmarshalJSON([]byte(`{"f1-i32": null, "f2-f64-notnull": null,
"map": null}`))
+ assert.Contains(t, err.Error(), "field 'f2-f64-notnull' is non-nullable
but got null")
+
+ err = b.UnmarshalJSON([]byte(`{"f1-i32": null, "map": null}`))
+ assert.Contains(t, err.Error(), "field 'f2-f64-notnull' is required but
no value was given")
+
+ err = b.UnmarshalJSON([]byte(`{"f1-i32": 6, "f2-f64-notnull": 6.6,
"map": [{"key": "4": "value": "d"}]}`))
+ assert.NoError(t, err)
Review Comment:
This JSON literal used for the successful UnmarshalJSON case is invalid
(`{"key": "4": "value": "d"}` is missing a comma). As written, this test will
fail before it can exercise the new non-nullable validation behavior.
##########
arrow/array/record.go:
##########
@@ -434,43 +433,69 @@ func (b *RecordBuilder) UnmarshalOne(dec *json.Decoder)
error {
return fmt.Errorf("record should start with '{', not %s", t)
}
- keylist := make(map[string]bool)
+ // consume one row checking for duplicates and nulls
+ keylist := make(map[string]json.RawMessage)
for dec.More() {
keyTok, err := dec.Token()
if err != nil {
return err
}
key := keyTok.(string)
- if keylist[key] {
+ if _, ok := keylist[key]; ok {
return fmt.Errorf("key %s shows up twice in row to be
decoded", key)
}
- keylist[key] = true
+
+ var val json.RawMessage
+ if err := dec.Decode(&val); err != nil {
+ return err
+ }
indices := b.schema.FieldIndices(key)
if len(indices) == 0 {
- var extra interface{}
- if err := dec.Decode(&extra); err != nil {
- return err
- }
continue
}
- if err := b.fields[indices[0]].UnmarshalOne(dec); err != nil {
- return err
+ idx := indices[0]
+
+ if bytes.Equal(val, []byte("null")) &&
!b.schema.Field(idx).Nullable {
+ return fmt.Errorf("field '%s' is non-nullable but got
null", key)
}
+
+ keylist[key] = val
}
// consume the closing '}'
if _, err := dec.Token(); err != nil {
return err
}
+ // check that all non-nullable fields were specified
+ for i := 0; i < b.schema.NumFields(); i++ {
+ f := b.schema.Field(i)
+ if _, ok := keylist[f.Name]; !ok && !f.Nullable {
+ return fmt.Errorf("field '%s' is required but no value
was given", f.Name)
+ }
+ }
+
+ // at this point we know there are no integrity errors, append values
to field builders
+ for key, val := range keylist {
+ valDec := json.NewDecoder(bytes.NewReader(val))
+ valDec.UseNumber()
+
+ indices := b.schema.FieldIndices(key)
+ if err := b.fields[indices[0]].UnmarshalOne(valDec); err != nil
{
+ return err
+ }
+ }
Review Comment:
Even with the validate-then-append split, UnmarshalOne can still leave the
RecordBuilder in a partially-appended state when a field’s UnmarshalOne returns
an error (e.g., type mismatch) after some other fields have already been
appended. Because the loop appends per key and returns immediately on error,
the final “append missing nulls” pass won’t run, and column lengths can diverge.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]