This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b413ac4f2b GH-32613: [C++] Simplify IPC writer for dense unions 
(#33822)
b413ac4f2b is described below

commit b413ac4f2b6911af5e8241803277caccc43aa3c4
Author: rtadepalli <[email protected]>
AuthorDate: Thu Feb 2 01:25:41 2023 -0500

    GH-32613: [C++] Simplify IPC writer for dense unions (#33822)
    
    JIRA: https://issues.apache.org/jira/browse/ARROW-17339
    Closes: https://github.com/apache/arrow/issues/32613
    
    ### Rationale for this change
    Dense union offsets are always non-strictly monotonic (i.e. monotonically 
non-decreasing) for any given child, as mandated by the spec. The C++ 
implementation still assumes that the offsets may be in any order. This can be 
improved.
    
    ### What changes are included in this PR?
    
    Just a change to eliminate looping over the size of a `DenseUnionArray` 
twice.
    
    ### Are these changes tested?
    
    I am not functionally changing anything. All changes respect the spec, and 
behavior should be 1:1 with the existing implementation. I believe existing 
tests should suffice.
    
    ### Are there any user-facing changes?
    
    There are no user facing changes for this.
    
    * Closes: #32613
    
    Lead-authored-by: Ramasai Tadepalli <[email protected]>
    Co-authored-by: Ramasai <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/ipc/writer.cc | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 585b86fd84..dfd390349c 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -473,23 +473,19 @@ class RecordBatchSerializer {
       int32_t* shifted_offsets =
           reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
 
-      // Offsets may not be ascending, so we need to find out the start offset
-      // for each child
-      for (int64_t i = 0; i < length; ++i) {
-        const uint8_t code = type_codes[i];
+      // Offsets are guaranteed to be increasing according to the spec, so
+      // the first offset we find for a child is the initial offset and
+      // will become the 0th offset for this child.
+      for (int64_t code_idx = 0; code_idx < length; ++code_idx) {
+        const uint8_t code = type_codes[code_idx];
         if (child_offsets[code] == -1) {
-          child_offsets[code] = unshifted_offsets[i];
+          child_offsets[code] = unshifted_offsets[code_idx];
+          shifted_offsets[code_idx] = 0;
         } else {
-          child_offsets[code] = std::min(child_offsets[code], 
unshifted_offsets[i]);
+          shifted_offsets[code_idx] = unshifted_offsets[code_idx] - 
child_offsets[code];
         }
-      }
-
-      // Now compute shifted offsets by subtracting child offset
-      for (int64_t i = 0; i < length; ++i) {
-        const int8_t code = type_codes[i];
-        shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code];
-        // Update the child length to account for observed value
-        child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] 
+ 1);
+        child_lengths[code] =
+            std::max(child_lengths[code], shifted_offsets[code_idx] + 1);
       }
 
       value_offsets = std::move(shifted_offsets_buffer);

Reply via email to