This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 7ef4d1e feat: Add arrayApproxEqualString to handle null characters in
string. (#291)
7ef4d1e is described below
commit 7ef4d1e0e7ca610d802153ec3cc7b279f3199ac8
Author: Saurabh Singh <[email protected]>
AuthorDate: Fri Feb 21 23:39:28 2025 +0530
feat: Add arrayApproxEqualString to handle null characters in string. (#291)
Fixes: #56
### Rationale for this change
Many databases, including PostgreSQL, do not accept null characters
(\x00) in strings. This PR introduces `arrayApproxEqualString` and
`arrayApproxEqualLargeString`, functions that enable approximate
equality comparison for string arrays while ignoring trailing null
characters. This helps improve consistency with database behavior and
aligns with the existing `arrayApproxEqualFloat16`.
### What changes are included in this PR?
- Added `arrayApproxEqualString` (along with the `arrayApproxEqualLargeString` and `arrayApproxEqualStringView` variants):
  - Compares two string arrays for approximate equality.
  - Ignores trailing null characters (\x00).
- Added unit tests in `compare_test.go` to verify correctness.
### Are these changes tested?
Yes, unit tests have been added to validate:
- Identical strings return true.
- Strings differing only in trailing null characters return true.
- Completely different strings return false.
### Are there any user-facing changes?
No
Signed-off-by: Saurabh Kumar Singh <[email protected]>
---
arrow/array/compare.go | 42 +++++++++++++++--
arrow/array/compare_test.go | 111 ++++++++++++++++++++++++++++++++++++++++++++
arrow/array/util.go | 4 ++
3 files changed, 154 insertions(+), 3 deletions(-)
diff --git a/arrow/array/compare.go b/arrow/array/compare.go
index ad3a50b..e412feb 100644
--- a/arrow/array/compare.go
+++ b/arrow/array/compare.go
@@ -487,19 +487,19 @@ func arrayApproxEqual(left, right arrow.Array, opt
equalOption) bool {
return arrayEqualBinary(l, r)
case *String:
r := right.(*String)
- return arrayEqualString(l, r)
+ return arrayApproxEqualString(l, r)
case *LargeBinary:
r := right.(*LargeBinary)
return arrayEqualLargeBinary(l, r)
case *LargeString:
r := right.(*LargeString)
- return arrayEqualLargeString(l, r)
+ return arrayApproxEqualLargeString(l, r)
case *BinaryView:
r := right.(*BinaryView)
return arrayEqualBinaryView(l, r)
case *StringView:
r := right.(*StringView)
- return arrayEqualStringView(l, r)
+ return arrayApproxEqualStringView(l, r)
case *Int8:
r := right.(*Int8)
return arrayEqualInt8(l, r)
@@ -644,6 +644,42 @@ func validityBitmapEqual(left, right arrow.Array) bool {
return true
}
+func arrayApproxEqualString(left, right *String) bool {
+ for i := 0; i < left.Len(); i++ {
+ if left.IsNull(i) {
+ continue
+ }
+ if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) {
+ return false
+ }
+ }
+ return true
+}
+
+func arrayApproxEqualLargeString(left, right *LargeString) bool {
+ for i := 0; i < left.Len(); i++ {
+ if left.IsNull(i) {
+ continue
+ }
+ if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) {
+ return false
+ }
+ }
+ return true
+}
+
+func arrayApproxEqualStringView(left, right *StringView) bool {
+ for i := 0; i < left.Len(); i++ {
+ if left.IsNull(i) {
+ continue
+ }
+ if stripNulls(left.Value(i)) != stripNulls(right.Value(i)) {
+ return false
+ }
+ }
+ return true
+}
+
func arrayApproxEqualFloat16(left, right *Float16, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
diff --git a/arrow/array/compare_test.go b/arrow/array/compare_test.go
index 3059ed3..5c569f2 100644
--- a/arrow/array/compare_test.go
+++ b/arrow/array/compare_test.go
@@ -111,6 +111,94 @@ func TestArrayApproxEqual(t *testing.T) {
}
}
+func TestArrayApproxEqualStrings(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ a1 interface{}
+ a2 interface{}
+ want bool
+ }{
+ {
+ name: "string",
+ a1: []string{"a", "b", "c", "d", "e", "f"},
+ a2: []string{"a", "b", "c", "d", "e", "f"},
+ want: true,
+ },
+ {
+ name: "string",
+ a1: []string{"a", "b\x00"},
+ a2: []string{"a", "b"},
+ want: true,
+ },
+ {
+ name: "string",
+ a1: []string{"a", "b\x00"},
+ a2: []string{"a\x00", "b"},
+ want: true,
+ },
+ {
+ name: "equal large strings",
+ a1: []string{"a", "b", "c", "d", "e", "f"},
+ a2: []string{"a", "b", "c", "d", "e", "f"},
+ want: true,
+ },
+ {
+ name: "equal large strings with nulls",
+ a1: []string{"a", "b\x00"},
+ a2: []string{"a", "b"},
+ want: true,
+ },
+ {
+ name: "equal large strings with nulls in both",
+ a1: []string{"Apache", "Arrow\x00"},
+ a2: []string{"Apache\x00", "Arrow"},
+ want: true,
+ },
+ {
+ name: "equal string views",
+ a1: []string{"a", "b", "c", "d", "e", "f"},
+ a2: []string{"a", "b", "c", "d", "e", "f"},
+ want: true,
+ },
+ {
+ name: "equal string views with nulls",
+ a1: []string{"Apache", "Arrow\x00"},
+ a2: []string{"Apache", "Arrow"},
+ want: true,
+ },
+ {
+ name: "equal string views with nulls in both",
+ a1: []string{"Apache", "Arrow\x00"},
+ a2: []string{"Apache\x00", "Arrow"},
+ want: true,
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ mem :=
memory.NewCheckedAllocator(memory.NewGoAllocator())
+ defer mem.AssertSize(t, 0)
+
+ var a1, a2 arrow.Array
+ switch tc.name {
+ case "equal large strings", "equal large strings with
nulls", "equal large strings with nulls in both":
+ a1 = arrayOfLargeString(mem, tc.a1.([]string),
nil)
+ a2 = arrayOfLargeString(mem, tc.a2.([]string),
nil)
+ case "equal string views", "equal string views with
nulls", "equal string views with nulls in both":
+ a1 = arrayOfStringView(mem, tc.a1.([]string),
nil)
+ a2 = arrayOfStringView(mem, tc.a2.([]string),
nil)
+ default:
+ a1 = arrayOf(mem, tc.a1, nil)
+ a2 = arrayOf(mem, tc.a2, nil)
+ }
+ defer a1.Release()
+ defer a2.Release()
+
+ if got, want := array.ApproxEqual(a1, a2), tc.want; got
!= want {
+ t.Fatalf("invalid comparison: got=%v,
want=%v\na1: %v\na2: %v\n", got, want, a1, a2)
+ }
+ })
+ }
+}
+
func TestArrayApproxEqualFloats(t *testing.T) {
f16sFrom := func(vs []float64) []float16.Num {
o := make([]float16.Num, len(vs))
@@ -445,11 +533,34 @@ func arrayOf(mem memory.Allocator, a interface{}, valids
[]bool) arrow.Array {
bldr.AppendValues(a, valids)
return bldr.NewFloat64Array()
+ case []string:
+ bldr := array.NewStringBuilder(mem)
+ defer bldr.Release()
+
+ bldr.AppendValues(a, valids)
+ return bldr.NewStringArray()
+
default:
panic(fmt.Errorf("arrdata: invalid data slice type %T", a))
}
}
+func arrayOfLargeString(mem memory.Allocator, a []string, valids []bool)
arrow.Array {
+ bldr := array.NewLargeStringBuilder(mem)
+ defer bldr.Release()
+
+ bldr.AppendValues(a, valids)
+ return bldr.NewLargeStringArray()
+}
+
+func arrayOfStringView(mem memory.Allocator, a []string, valids []bool)
arrow.Array {
+ bldr := array.NewStringViewBuilder(mem)
+ defer bldr.Release()
+
+ bldr.AppendValues(a, valids)
+ return bldr.NewStringViewArray()
+}
+
func TestArrayEqualBaseArray(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
diff --git a/arrow/array/util.go b/arrow/array/util.go
index 53a1fdd..c8316ab 100644
--- a/arrow/array/util.go
+++ b/arrow/array/util.go
@@ -521,3 +521,7 @@ func MakeArrayOfNull(mem memory.Allocator, dt
arrow.DataType, length int) arrow.
defer data.Release()
return MakeFromData(data)
}
+
+func stripNulls(s string) string {
+ return strings.TrimRight(s, "\x00")
+}