This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2e6c13d499 GH-34790: [Go]: Add array.Edits.UnifiedDiff (#34827)
2e6c13d499 is described below
commit 2e6c13d49956bd852ef7a70d94d3957599775e7c
Author: Herman Schaaf <[email protected]>
AuthorDate: Mon Apr 10 16:21:17 2023 +0100
GH-34790: [Go]: Add array.Edits.UnifiedDiff (#34827)
This adds a `UnifiedDiff(base, target arrow.Array)` method to the
`array.Edits` type. It returns a string diff in Unified Diff format. This makes
use of the `array.Edits` type returned by the `array.Diff()` function added in
https://github.com/apache/arrow/pull/34806
- Part of https://github.com/apache/arrow/issues/34790
Authored-by: Herman Schaaf <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/arrow/array/diff.go | 64 +++++++
go/arrow/array/diff_test.go | 414 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 478 insertions(+)
diff --git a/go/arrow/array/diff.go b/go/arrow/array/diff.go
index 0371214cff..32030173b5 100644
--- a/go/arrow/array/diff.go
+++ b/go/arrow/array/diff.go
@@ -18,6 +18,7 @@ package array
import (
"fmt"
+ "strings"
"github.com/apache/arrow/go/v12/arrow"
)
@@ -47,6 +48,69 @@ type Edit struct {
// ]
type Edits []Edit
+// String returns a simple string representation of the edit script.
+func (e Edits) String() string {
+ return fmt.Sprintf("%v", []Edit(e))
+}
+
+// UnifiedDiff returns a string representation of the diff of base and target in Unified Diff format.
+func (e Edits) UnifiedDiff(base, target arrow.Array) string {
+ var s strings.Builder
+ baseIndex := int64(0)
+ targetIndex := int64(0)
+ wrotePosition := false
+ for i := 0; i < len(e); i++ {
+ if i > 0 {
+ if !wrotePosition {
+ s.WriteString(fmt.Sprintf("@@ -%d, +%d @@\n",
baseIndex, targetIndex))
+ wrotePosition = true
+ }
+ if e[i].Insert {
+ s.WriteString(fmt.Sprintf("+%v\n",
stringAt(target, targetIndex)))
+ targetIndex++
+ } else {
+ s.WriteString(fmt.Sprintf("-%v\n",
stringAt(base, baseIndex)))
+ baseIndex++
+ }
+ }
+ for j := int64(0); j < e[i].RunLength; j++ {
+ baseIndex++
+ targetIndex++
+ wrotePosition = false
+ }
+ }
+ return s.String()
+}
+
+func stringAt(arr arrow.Array, i int64) string {
+ if arr.IsNull(int(i)) {
+ return "null"
+ }
+ dt := arr.DataType()
+ switch {
+ case arrow.TypeEqual(dt, arrow.PrimitiveTypes.Float32):
+ return fmt.Sprintf("%f", arr.(*Float32).Value(int(i)))
+ case arrow.TypeEqual(dt, arrow.PrimitiveTypes.Float64):
+ return fmt.Sprintf("%f", arr.(*Float64).Value(int(i)))
+ case arrow.TypeEqual(dt, arrow.PrimitiveTypes.Date32):
+ return arr.(*Date32).Value(int(i)).FormattedString()
+ case arrow.TypeEqual(dt, arrow.PrimitiveTypes.Date64):
+ return arr.(*Date64).Value(int(i)).FormattedString()
+ case arrow.TypeEqual(dt, arrow.FixedWidthTypes.Timestamp_s):
+ return
arr.(*Timestamp).Value(int(i)).ToTime(arrow.Second).String()
+ case arrow.TypeEqual(dt, arrow.FixedWidthTypes.Timestamp_ms):
+ return
arr.(*Timestamp).Value(int(i)).ToTime(arrow.Millisecond).String()
+ case arrow.TypeEqual(dt, arrow.FixedWidthTypes.Timestamp_us):
+ return
arr.(*Timestamp).Value(int(i)).ToTime(arrow.Microsecond).String()
+ case arrow.TypeEqual(dt, arrow.FixedWidthTypes.Timestamp_ns):
+ return
arr.(*Timestamp).Value(int(i)).ToTime(arrow.Nanosecond).String()
+ }
+ s := NewSlice(arr, i, i+1)
+ defer s.Release()
+ st, _ := s.MarshalJSON()
+ return strings.Trim(string(st[1:len(st)-1]), "\n")
+}
+
// Diff compares two arrays, returning an edit script which expresses the difference
// between them. The edit script can be applied to the base array to produce the target.
// 'base' is a baseline for comparison.
diff --git a/go/arrow/array/diff_test.go b/go/arrow/array/diff_test.go
index 0bb2838320..e2c7820d5f 100644
--- a/go/arrow/array/diff_test.go
+++ b/go/arrow/array/diff_test.go
@@ -26,6 +26,7 @@ import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
+ "github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
)
@@ -462,3 +463,416 @@ func validateEditScript(t *testing.T, edits array.Edits,
base, target arrow.Arra
t.Fatalf("edit script (%v) when applied to base %v does not
produce target %v", edits, base, target)
}
}
+
+type diffStringTestCase struct {
+ dataType arrow.DataType
+
+ name string
+ baseJSON string
+ targetJSON string
+ want string
+}
+
+func (s *diffStringTestCase) check(t *testing.T) {
+ mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(t, 0)
+
+ base, _, err := array.FromJSON(mem, s.dataType,
strings.NewReader(s.baseJSON))
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer base.Release()
+
+ target, _, err := array.FromJSON(mem, s.dataType,
strings.NewReader(s.targetJSON))
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer target.Release()
+
+ edits, err := array.Diff(base, target)
+ if err != nil {
+ t.Fatalf("got unexpected error %v", err)
+ }
+ got := edits.UnifiedDiff(base, target)
+ if got != s.want {
+ t.Errorf("got:\n%v\n, want:\n%v", got, s.want)
+ }
+}
+
+func TestEdits_UnifiedDiff(t *testing.T) {
+ msPerDay := 24 * 60 * 60 * 1000
+ cases := []diffStringTestCase{
+ {
+ name: "no changes",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["give", "me", "a", "break"]`,
+ targetJSON: `["give", "me", "a", "break"]`,
+ want: ``,
+ },
+ {
+ name: "insert one",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["give", "a", "break"]`,
+ targetJSON: `["give", "me", "a", "break"]`,
+ want: `@@ -1, +1 @@
++"me"
+`,
+ },
+ {
+ name: "delete one",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["give", "me", "a", "break"]`,
+ targetJSON: `["give", "a", "break"]`,
+ want: `@@ -1, +1 @@
+-"me"
+`,
+ },
+ {
+ name: "change one",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["give", "a", "break"]`,
+ targetJSON: `["gimme", "a", "break"]`,
+ want: `@@ -0, +0 @@
+-"give"
++"gimme"
+`,
+ },
+ {
+ name: "null out one",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["give", "a", "break"]`,
+ targetJSON: `["give", "a", null]`,
+ want: `@@ -2, +2 @@
+-"break"
++null
+`,
+ },
+ {
+ name: "strings with escaped chars",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["newline:\\n", "quote:'",
"backslash:\\\\"]`,
+ targetJSON: `["newline:\\n", "tab:\\t", "quote:\\\"",
"backslash:\\\\"]`,
+ want: `@@ -1, +1 @@
+-"quote:'"
++"tab:\\t"
++"quote:\\\""
+`,
+ },
+ {
+ name: "date32",
+ dataType: arrow.PrimitiveTypes.Date32,
+ baseJSON: `[0, 1, 2, 31, 4]`,
+ targetJSON: `[0, 1, 31, 2, 4]`,
+ want: `@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+`,
+ },
+ {
+ name: "date64",
+ dataType: arrow.PrimitiveTypes.Date64,
+ baseJSON: fmt.Sprintf(`[%d, %d, %d, %d, %d]`,
0*msPerDay, 1*msPerDay, 2*msPerDay, 31*msPerDay, 4*msPerDay),
+ targetJSON: fmt.Sprintf(`[%d, %d, %d, %d, %d]`,
0*msPerDay, 1*msPerDay, 31*msPerDay, 2*msPerDay, 4*msPerDay),
+ want: `@@ -2, +2 @@
+-1970-01-03
+@@ -4, +3 @@
++1970-01-03
+`,
+ },
+ {
+ name: "timestamp_s",
+ dataType: arrow.FixedWidthTypes.Timestamp_s,
+ baseJSON: fmt.Sprintf(`[0, 1, %d, 2, 4]`,
678+(5+60*(4+60*(3+24*int64(1))))),
+ targetJSON: fmt.Sprintf(`[0, 1, 2, %d, 4]`,
678+(5+60*(4+60*(3+24*int64(1))))),
+ want: `@@ -2, +2 @@
+-1970-01-02 03:15:23 +0000 UTC
+@@ -4, +3 @@
++1970-01-02 03:15:23 +0000 UTC
+`,
+ },
+ {
+ name: "timestamp_ms",
+ dataType: arrow.FixedWidthTypes.Timestamp_ms,
+ baseJSON: fmt.Sprintf(`[0, 1, %d, 2, 4]`,
678+1000*(5+60*(4+60*(3+24*int64(1))))),
+ targetJSON: fmt.Sprintf(`[0, 1, 2, %d, 4]`,
678+1000*(5+60*(4+60*(3+24*int64(1))))),
+ want: `@@ -2, +2 @@
+-1970-01-02 03:04:05.678 +0000 UTC
+@@ -4, +3 @@
++1970-01-02 03:04:05.678 +0000 UTC
+`,
+ },
+ {
+ name: "timestamp_us",
+ dataType: arrow.FixedWidthTypes.Timestamp_us,
+ baseJSON: fmt.Sprintf(`[0, 1, %d, 2, 4]`,
678+1000000*(5+60*(4+60*(3+24*int64(1))))),
+ targetJSON: fmt.Sprintf(`[0, 1, 2, %d, 4]`,
678+1000000*(5+60*(4+60*(3+24*int64(1))))),
+ want: `@@ -2, +2 @@
+-1970-01-02 03:04:05.000678 +0000 UTC
+@@ -4, +3 @@
++1970-01-02 03:04:05.000678 +0000 UTC
+`,
+ },
+ {
+ name: "timestamp_ns",
+ dataType: arrow.FixedWidthTypes.Timestamp_ns,
+ baseJSON: fmt.Sprintf(`[0, 1, %d, 2, 4]`,
678+1000000000*(5+60*(4+60*(3+24*int64(1))))),
+ targetJSON: fmt.Sprintf(`[0, 1, 2, %d, 4]`,
678+1000000000*(5+60*(4+60*(3+24*int64(1))))),
+ want: `@@ -2, +2 @@
+-1970-01-02 03:04:05.000000678 +0000 UTC
+@@ -4, +3 @@
++1970-01-02 03:04:05.000000678 +0000 UTC
+`,
+ },
+ {
+ name: "lists",
+ dataType: arrow.ListOf(arrow.PrimitiveTypes.Int32),
+ baseJSON: `[[2, 3, 1], [], [13], []]`,
+ targetJSON: `[[2, 3, 1], [5, 9], [], [13]]`,
+ want: `@@ -1, +1 @@
++[5,9]
+@@ -3, +4 @@
+-[]
+`,
+ },
+ {
+ name: "maps",
+ dataType: arrow.MapOf(arrow.BinaryTypes.String,
arrow.PrimitiveTypes.Int32),
+ baseJSON: `[
+ [{"key": "foo", "value": 2}, {"key": "bar", "value":
3}, {"key": "baz", "value": 1}],
+ [{"key": "quux", "value": 13}]
+ []
+ ]`,
+ targetJSON: `[
+ [{"key": "foo", "value": 2}, {"key": "bar", "value":
3}, {"key": "baz", "value": 1}],
+ [{"key": "ytho", "value": 11}],
+ [{"key": "quux", "value": 13}]
+ []
+ ]`,
+ want: `@@ -1, +1 @@
++[{"key":"ytho","value":11}]
+`,
+ },
+ {
+ name: "structs",
+ dataType: arrow.StructOf(
+ []arrow.Field{
+ {Name: "foo", Type:
arrow.BinaryTypes.String, Nullable: true},
+ {Name: "bar", Type:
arrow.PrimitiveTypes.Int32, Nullable: true},
+ }...,
+ ),
+ baseJSON: `[{"foo": "!", "bar": 3}, {}, {"bar": 13}]`,
+ targetJSON: `[{"foo": null, "bar": 2}, {}, {"bar":
13}]`,
+ want: `@@ -0, +0 @@
+-{"bar":3,"foo":"!"}
++{"bar":2,"foo":null}
+`,
+ },
+ {
+ name: "unions",
+ dataType: arrow.UnionOf(arrow.SparseMode,
+ []arrow.Field{
+ {Name: "foo", Type:
arrow.BinaryTypes.String},
+ {Name: "bar", Type:
arrow.PrimitiveTypes.Int32},
+ },
+ []arrow.UnionTypeCode{2, 5},
+ ),
+ baseJSON: `[[2, "!"], [5, 3], [5, 13]]`,
+ targetJSON: `[[2, "!"], [2, "3"], [5, 13]]`,
+ want: `@@ -1, +1 @@
+-[5,3]
++[2,"3"]
+`,
+ },
+ {
+ name: "string",
+ dataType: arrow.BinaryTypes.String,
+ baseJSON: `["h", "l", "l", "o", "o"]`,
+ targetJSON: `["h", "e", "l", "l", "o", "0"]`,
+ want: `@@ -1, +1 @@
++"e"
+@@ -4, +5 @@
+-"o"
++"0"
+`,
+ },
+ {
+ name: "int8",
+ dataType: arrow.PrimitiveTypes.Int8,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "int16",
+ dataType: arrow.PrimitiveTypes.Int16,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "int32",
+ dataType: arrow.PrimitiveTypes.Int32,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "int64",
+ dataType: arrow.PrimitiveTypes.Int64,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "uint8",
+ dataType: arrow.PrimitiveTypes.Uint8,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "uint16",
+ dataType: arrow.PrimitiveTypes.Uint16,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "uint32",
+ dataType: arrow.PrimitiveTypes.Uint32,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "uint64",
+ dataType: arrow.PrimitiveTypes.Uint64,
+ baseJSON: `[0, 1, 2, 3, 5, 8, 11, 13, 17]`,
+ targetJSON: `[2, 3, 5, 7, 11, 13, 17, 19]`,
+ want: `@@ -0, +0 @@
+-0
+-1
+@@ -5, +3 @@
+-8
++7
+@@ -9, +7 @@
++19
+`,
+ },
+ {
+ name: "float32",
+ dataType: arrow.PrimitiveTypes.Float32,
+ baseJSON: `[0.1, 0.3, -0.5]`,
+ targetJSON: `[0.1, -0.5, 0.3]`,
+ want: `@@ -1, +1 @@
+-0.300000
+@@ -3, +2 @@
++0.300000
+`,
+ },
+ {
+ name: "float64",
+ dataType: arrow.PrimitiveTypes.Float64,
+ baseJSON: `[0.1, 0.3, -0.5]`,
+ targetJSON: `[0.1, -0.5, 0.3]`,
+ want: `@@ -1, +1 @@
+-0.300000
+@@ -3, +2 @@
++0.300000
+`,
+ },
+ {
+ name: "equal nulls",
+ dataType: arrow.PrimitiveTypes.Int32,
+ baseJSON: `[null, null]`,
+ targetJSON: `[null, null]`,
+ want: ``,
+ },
+ {
+ name: "nulls",
+ dataType: arrow.PrimitiveTypes.Int32,
+ baseJSON: `[1, null, null, null]`,
+ targetJSON: `[null, 1, null, 2]`,
+ want: `@@ -0, +0 @@
+-1
+@@ -2, +1 @@
+-null
++1
+@@ -4, +3 @@
++2
+`,
+ },
+ {
+ name: "extensions",
+ dataType: types.NewUUIDType(),
+ baseJSON: `["00000000-0000-0000-0000-000000000000",
"00000000-0000-0000-0000-000000000001"]`,
+ targetJSON: `["00000000-0000-0000-0000-000000000001",
"00000000-0000-0000-0000-000000000002"]`,
+ want: `@@ -0, +0 @@
+-"00000000-0000-0000-0000-000000000000"
+@@ -2, +1 @@
++"00000000-0000-0000-0000-000000000002"
+`,
+ },
+ }
+
+ for _, tc := range cases {
+ t.Run(tc.name, tc.check)
+ }
+}