felipecrv commented on code in PR #39019:
URL: https://github.com/apache/arrow/pull/39019#discussion_r1425719213
##########
go/arrow/array/list.go:
##########
@@ -1411,217 +1410,56 @@ func (b *baseListViewBuilder) UnmarshalJSON(data
[]byte) error {
return b.Unmarshal(dec)
}
-// Pre-conditions:
-//
-// input.DataType() is ListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func minListViewOffset32(input arrow.ArrayData) int32 {
- var bitmap []byte
- if input.Buffers()[0] != nil {
- bitmap = input.Buffers()[0].Bytes()
- }
- offsets :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
- sizes :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():]
-
- isNull := func(i int) bool {
- return bitmap != nil && bitutil.BitIsNotSet(bitmap,
input.Offset()+i)
- }
-
- // It's very likely that the first non-null non-empty list-view starts
at
- // offset 0 of the child array.
- i := 0
- for i < input.Len() && (isNull(i) || sizes[i] == 0) {
- i += 1
- }
- if i >= input.Len() {
- return 0
- }
- minOffset := offsets[i]
- if minOffset == 0 {
- // early exit: offset 0 found already
- return 0
- }
-
- // Slow path: scan the buffers entirely.
- i += 1
- for ; i < input.Len(); i += 1 {
- if isNull(i) {
- continue
- }
- offset := offsets[i]
- if offset < minOffset && sizes[i] > 0 {
- minOffset = offset
- }
- }
- return minOffset
-}
-
-// Find the maximum offset+size in a LIST_VIEW array.
+// Find the minimum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array.
//
// Pre-conditions:
//
-// input.DataType() is ListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func maxListViewOffset32(input arrow.ArrayData) int {
+// input.DataType() is ListViewType if Offset=int32 or LargeListViewType
if Offset=int64
+// input.Len() > 0
Review Comment:
Why remove the `NullN() != Len()` pre-condition?
##########
go/arrow/array/list.go:
##########
@@ -1411,217 +1410,56 @@ func (b *baseListViewBuilder) UnmarshalJSON(data
[]byte) error {
return b.Unmarshal(dec)
}
-// Pre-conditions:
-//
-// input.DataType() is ListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func minListViewOffset32(input arrow.ArrayData) int32 {
- var bitmap []byte
- if input.Buffers()[0] != nil {
- bitmap = input.Buffers()[0].Bytes()
- }
- offsets :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
- sizes :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():]
-
- isNull := func(i int) bool {
- return bitmap != nil && bitutil.BitIsNotSet(bitmap,
input.Offset()+i)
- }
-
- // It's very likely that the first non-null non-empty list-view starts
at
- // offset 0 of the child array.
- i := 0
- for i < input.Len() && (isNull(i) || sizes[i] == 0) {
- i += 1
- }
- if i >= input.Len() {
- return 0
- }
- minOffset := offsets[i]
- if minOffset == 0 {
- // early exit: offset 0 found already
- return 0
- }
-
- // Slow path: scan the buffers entirely.
- i += 1
- for ; i < input.Len(); i += 1 {
- if isNull(i) {
- continue
- }
- offset := offsets[i]
- if offset < minOffset && sizes[i] > 0 {
- minOffset = offset
- }
- }
- return minOffset
-}
-
-// Find the maximum offset+size in a LIST_VIEW array.
+// Find the minimum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array.
//
// Pre-conditions:
//
-// input.DataType() is ListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func maxListViewOffset32(input arrow.ArrayData) int {
+// input.DataType() is ListViewType if Offset=int32 or LargeListViewType
if Offset=int64
+// input.Len() > 0
+func minListViewOffset[Offset int32 | int64](input arrow.ArrayData) Offset {
inputOffset := input.Offset()
- var bitmap []byte
- if input.Buffers()[0] != nil {
- bitmap = input.Buffers()[0].Bytes()
- }
- offsets :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:]
- sizes :=
arrow.Int32Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:]
-
- isNull := func(i int) bool {
- return bitmap != nil && bitutil.BitIsNotSet(bitmap,
inputOffset+i)
- }
-
- i := input.Len() - 1 // safe because input.Len() > 0
- for i != 0 && (isNull(i) || sizes[i] == 0) {
- i -= 1
- }
- offset := offsets[i]
- size := sizes[i]
- if i == 0 {
- if isNull(i) || sizes[i] == 0 {
- return 0
- } else {
- return int(offset + size)
- }
- }
+ offsets :=
arrow.GetData[Offset](input.Buffers()[1].Bytes())[inputOffset:]
- values := input.Children()[0]
- maxEnd := int(offsets[i] + sizes[i])
- if maxEnd == values.Len() {
- // Early-exit: maximum possible view-end found already.
- return maxEnd
- }
-
- // Slow path: scan the buffers entirely.
- for ; i >= 0; i -= 1 {
- offset := offsets[i]
- size := sizes[i]
- if size > 0 && !isNull(i) {
- if int(offset+size) > maxEnd {
- maxEnd = int(offset + size)
- if maxEnd == values.Len() {
- return maxEnd
- }
- }
- }
- }
- return maxEnd
-}
-
-// Pre-conditions:
-//
-// input.DataType() is LargeListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func minLargeListViewOffset64(input arrow.ArrayData) int64 {
- var bitmap []byte
- if input.Buffers()[0] != nil {
- bitmap = input.Buffers()[0].Bytes()
- }
- offsets :=
arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[input.Offset():]
- sizes :=
arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[input.Offset():]
-
- isNull := func(i int) bool {
- return bitmap != nil && bitutil.BitIsNotSet(bitmap,
input.Offset()+i)
- }
-
- // It's very likely that the first non-null non-empty list-view starts
at
- // offset 0 of the child array.
i := 0
- for i < input.Len() && (isNull(i) || sizes[i] == 0) {
- i += 1
- }
- if i >= input.Len() {
- return 0
- }
- minOffset := offsets[i]
- if minOffset == 0 {
- // early exit: offset 0 found already
- return 0
- }
+ minOffset := offsets[i] // safe because input.Len() > 0
- // Slow path: scan the buffers entirely.
- i += 1
- for ; i < input.Len(); i += 1 {
- if isNull(i) {
- continue
+ for i += 1; i < input.Len(); i += 1 {
+ if minOffset == 0 {
+ // Fast path: the minimum offset is frequently 0 (the
start of the child array),
+ // and frequently a view which has this offset will be
near the start of the array.
+ return 0
}
- offset := offsets[i]
- if offset < minOffset && sizes[i] > 0 {
+ if offset := offsets[i]; offset < minOffset {
minOffset = offset
}
}
return minOffset
}
-// Find the maximum offset+size in a LARGE_LIST_VIEW array.
+// Find the maximum offset+size in a LIST_VIEW/LARGE_LIST_VIEW array.
//
// Pre-conditions:
//
-// input.DataType() is LargeListViewType
-// input.Len() > 0 && input.NullN() != input.Len()
-func maxLargeListViewOffset64(input arrow.ArrayData) int64 {
+// input.DataType() is ListViewType if Offset=int32 or LargeListViewType
if Offset=int64
+// input.Len() > 0
+func maxListViewEnd[Offset int32 | int64](input arrow.ArrayData) Offset {
inputOffset := input.Offset()
- var bitmap []byte
- if input.Buffers()[0] != nil {
- bitmap = input.Buffers()[0].Bytes()
- }
- offsets :=
arrow.Int64Traits.CastFromBytes(input.Buffers()[1].Bytes())[inputOffset:]
- sizes :=
arrow.Int64Traits.CastFromBytes(input.Buffers()[2].Bytes())[inputOffset:]
+ offsets :=
arrow.GetData[Offset](input.Buffers()[1].Bytes())[inputOffset:]
+ sizes := arrow.GetData[Offset](input.Buffers()[2].Bytes())[inputOffset:]
- isNull := func(i int) bool {
- return bitmap != nil && bitutil.BitIsNotSet(bitmap,
inputOffset+i)
- }
+ maxLegalOffset := Offset(input.Children()[0].Len())
- // It's very likely that the first non-null non-empty list-view starts
at
- // offset zero, so we check that first and potentially early-return a 0.
- i := input.Len() - 1 // safe because input.Len() > 0
- for i != 0 && (isNull(i) || sizes[i] == 0) {
- i -= 1
- }
- offset := offsets[i]
- size := sizes[i]
- if i == 0 {
- if isNull(i) || sizes[i] == 0 {
- return 0
- } else {
- return offset + size
- }
- }
+ i := input.Len() - 1
+ maxEnd := offsets[i] + sizes[i] // safe because input.Len() > 0
- if offset > math.MaxInt64-size {
- // Early-exit: 64-bit overflow detected. This is not possible
on a
- // valid list-view, but we return the maximum possible value to
- // avoid undefined behavior.
- return math.MaxInt64
- }
- values := input.Children()[0]
- maxEnd := offsets[i] + sizes[i]
- if maxEnd == int64(values.Len()) {
- // Early-exit: maximum possible view-end found already.
- return maxEnd
- }
-
- // Slow path: scan the buffers entirely.
- for ; i >= 0; i -= 1 {
- offset := offsets[i]
- size := sizes[i]
- if size > 0 && !isNull(i) {
- if offset+size > maxEnd {
- if offset > math.MaxInt64-size {
- // 64-bit overflow detected. This is
not possible on a valid list-view,
- // but we saturate maxEnd to the
maximum possible value to avoid
- // undefined behavior.
- return math.MaxInt64
- }
- maxEnd = offset + size
- if maxEnd == int64(values.Len()) {
- return maxEnd
- }
- }
+ for i -= 1; i >= 0; i -= 1 {
+ if maxEnd == maxLegalOffset {
+ // Fast path: the maximum offset+size is frequently
exactly the end of the child array,
+ // and frequently a view which has this offset+size
will be near the end of the array.
+ return maxEnd
+ }
+ if end := offsets[i] + sizes[i]; end > maxEnd {
+ maxEnd = end
Review Comment:
You can take a look at my final version of this function in the C++
implementation:
https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/list_util.cc#L91
##########
dev/archery/archery/integration/datagen.py:
##########
@@ -927,6 +927,85 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
pass
+class ListViewField(Field):
+
+ def __init__(self, name, value_field, *, nullable=True,
+ metadata=None):
+ super().__init__(name, nullable=nullable,
+ metadata=metadata)
+ self.value_field = value_field
+
+ @property
+ def column_class(self):
+ return ListViewColumn
+
+ def _get_type(self):
+ return OrderedDict([
+ ('name', 'listview')
+ ])
+
+ def _get_children(self):
+ return [self.value_field.get_json()]
+
+ def generate_column(self, size, name=None):
+ MAX_LIST_SIZE = 4
+
+ is_valid = self._make_is_valid(size)
+ offsets = []
+ sizes = np.random.randint(0, MAX_LIST_SIZE + 1, size=size)
+ offset = 0
+ for s in sizes:
+ offsets.append(offset)
+ offset += int(s)
Review Comment:
Offsets should probably be out of order to break code that assumes list-view
offsets are sorted like in lists.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]