Copilot commented on code in PR #820:
URL: https://github.com/apache/arrow-nanoarrow/pull/820#discussion_r2467164013
##########
python/src/nanoarrow/iterator.py:
##########
@@ -297,6 +297,65 @@ def _fixed_size_list_iter(self, offset, length):
for _ in range(length):
yield list(islice(child_iter, fixed_size))
+ def _sparse_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a
child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = len(list(item_type_id_iter))
+ child = self._children[child_index_by_type_id[item_type_id]]
+ yield from child._iter_chunk(i, type_id_run_length)
+
+ i += type_id_run_length
+
+ def _dense_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+ offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a
child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = 0
Review Comment:
Line 337 initializes `type_id_run_length` to 0 but line 338 immediately
overwrites it. Remove the redundant initialization on line 337.
```suggestion
```
##########
python/src/nanoarrow/iterator.py:
##########
@@ -297,6 +297,65 @@ def _fixed_size_list_iter(self, offset, length):
for _ in range(length):
yield list(islice(child_iter, fixed_size))
+ def _sparse_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a
child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = len(list(item_type_id_iter))
+ child = self._children[child_index_by_type_id[item_type_id]]
+ yield from child._iter_chunk(i, type_id_run_length)
+
+ i += type_id_run_length
+
+ def _dense_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+ offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a
child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = 0
+ type_id_run_length = len(list(item_type_id_iter))
+ child_offsets = offsets[i : (i + type_id_run_length)]
+ child_offset0 = child_offsets[0]
+
+ # This only works if there are no missing elements (i.e., for
sequences
+ # of an identical type_id, the elements must be sequential and
increasing).
+ # The spec specifies the sequential/increasing nature of these
offsets but
+ # we check to be sure.
+ if (child_offsets[-1] - child_offset0) != (type_id_run_length - 1):
+ raise ValueError(
+ f"Child offsets for type_id {item_type_id} are not
sequential:"
+ " {list(child_offsets)} / {type_id_run_length}"
Review Comment:
The error message uses f-string syntax but the variables `child_offsets` and
`type_id_run_length` are not interpolated (they're inside regular string
literals). The template expressions should be outside the quotes or the entire
message should be a proper f-string.
```suggestion
f"Child offsets for type_id {item_type_id} are not
sequential: {list(child_offsets)} / {type_id_run_length}"
```
##########
python/src/nanoarrow/iterator.py:
##########
@@ -297,6 +297,65 @@ def _fixed_size_list_iter(self, offset, length):
for _ in range(length):
yield list(islice(child_iter, fixed_size))
+ def _sparse_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
Review Comment:
The dictionary comprehension uses `zip(type_codes, range(len(type_codes)))`
which can be simplified to `enumerate(type_codes)` for better readability.
```suggestion
member_id: i for i, member_id in enumerate(type_codes)
```
##########
python/src/nanoarrow/iterator.py:
##########
@@ -297,6 +297,65 @@ def _fixed_size_list_iter(self, offset, length):
for _ in range(length):
yield list(islice(child_iter, fixed_size))
+ def _sparse_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a
child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = len(list(item_type_id_iter))
+ child = self._children[child_index_by_type_id[item_type_id]]
+ yield from child._iter_chunk(i, type_id_run_length)
+
+ i += type_id_run_length
+
+ def _dense_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in zip(type_codes,
range(len(type_codes)))
Review Comment:
The dictionary comprehension uses `zip(type_codes, range(len(type_codes)))`
which can be simplified to `enumerate(type_codes)` for better readability.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]