felipecrv commented on code in PR #507:
URL: https://github.com/apache/arrow-nanoarrow/pull/507#discussion_r1626817305


##########
src/nanoarrow/array.c:
##########
@@ -1163,6 +1214,40 @@ static int ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
     }
   }
 
+  if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) {
+    struct ArrowArrayView* run_ends_view = array_view->children[0];
+    switch (run_ends_view->storage_type) {
+      case NANOARROW_TYPE_INT16:
+      case NANOARROW_TYPE_INT32:
+      case NANOARROW_TYPE_INT64:
+        break;
+      default:
+        ArrowErrorSet(
+            error,
+            "Run-end encoded array only supports INT16, INT32 or INT64 
run-ends "
+            "but found run-ends type %s",
+            ArrowTypeString(run_ends_view->storage_type));
+        return EINVAL;
+    }
+    int64_t prev_run_end = 0;
+    for (int64_t i = 0; i < run_ends_view->length; i++) {
+      int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
+      if (run_end < 0 || run_end < prev_run_end) {
+        return EINVAL;
+      }
+      prev_run_end = run_end;
+    }
+    if (prev_run_end != array_view->length) {
+      ArrowErrorSet(error,
+                    "The last run end value of a run-end encoded array must be 
equal to "
+                    "the logical length"

Review Comment:
   Nope. :-) And this is what make run-end encoded arrays so mind-bending.
   
   Think of the children run-ends and values arrays as forming a virtual big 
array starting from zero and ending at the last run-end. Then the parent 
offset/length pair are just a projection onto this big array.
   
   The offset on the parent can land on any of the runs. And that offset plus 
the length can land on any of the runs and the run doesn't necessarily end when 
the parent's offset+length says.
   
   For example, in this REE
   
   ```
   offset: 1
   length: 3
     run_ends: [2, 5, 6]
     values:   [A, B, C]
   ```
   
   the uncompressed "big array" is `[A A B B B C]` but the entire REE is `[A B 
B]`.
   



##########
src/nanoarrow/array.c:
##########
@@ -995,6 +1021,31 @@ static int ArrowArrayViewValidateDefault(struct 
ArrowArrayView* array_view,
         }
       }
       break;
+
+    case NANOARROW_TYPE_RUN_END_ENCODED: {
+      struct ArrowArrayView* run_ends_view;
+      run_ends_view = array_view->children[0];
+      if (run_ends_view->null_count != 0) {
+        ArrowErrorSet(error, "Run Ends cannot be null but found null_count 
value %ld",
+                      (long)run_ends_view->null_count);
+        return EINVAL;
+      }
+      for (int64_t i = 0, prev_run_end = 0; i < run_ends_view->length; i++) {
+        int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
+        if (run_end < 0 || run_end < prev_run_end) {
+          return EINVAL;
+        }
+        prev_run_end = run_end;
+        if (i == run_ends_view->length - 1 && run_end != array_view->length) {

Review Comment:
   This "violation" is not really a violation. The parent array can change it's 
length without mutating the immutable run ends children.
   
   Also keep in mind that if the parent has an offset, that last run-end would 
be affected and not match the logical length of the parent.



##########
src/nanoarrow/array.c:
##########
@@ -846,6 +856,22 @@ static int ArrowArrayViewValidateMinimal(struct 
ArrowArrayView* array_view,
         return EINVAL;
       }
       break;
+
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      if (array_view->children[0]->null_count != 0) {
+        ArrowErrorSet(error, "Run Ends cannot be null but found null_count 
value %ld",
+                      (long)array_view->children[0]->null_count);
+        return EINVAL;
+      }
+      if (array_view->children[0]->length < array_view->children[1]->length) {
+        ArrowErrorSet(
+            error,
+            "Expected the 2 children of run-end encoded array to have equal 
length "
+            "but found mismatched lengths: run_ends->length=%ld, 
values->length=%ld",
+            (long)array_view->children[0]->length, 
(long)array_view->children[1]->length);
+        return EINVAL;
+      }

Review Comment:
   It's OK to have less run-ends than values. It's the opposite that's wrong 
because you always start the indexing of values by checking the run-ends first:
   
   1. find run-end index according to logical index
   2. use that index with the values array
   
   https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/ree_util.cc#L199



##########
src/nanoarrow/array.c:
##########
@@ -995,6 +1021,31 @@ static int ArrowArrayViewValidateDefault(struct 
ArrowArrayView* array_view,
         }
       }
       break;
+
+    case NANOARROW_TYPE_RUN_END_ENCODED: {
+      struct ArrowArrayView* run_ends_view;
+      run_ends_view = array_view->children[0];
+      if (run_ends_view->null_count != 0) {
+        ArrowErrorSet(error, "Run Ends cannot be null but found null_count 
value %ld",
+                      (long)run_ends_view->null_count);
+        return EINVAL;
+      }
+      for (int64_t i = 0, prev_run_end = 0; i < run_ends_view->length; i++) {
+        int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
+        if (run_end < 0 || run_end < prev_run_end) {
+          return EINVAL;
+        }
+        prev_run_end = run_end;
+        if (i == run_ends_view->length - 1 && run_end != array_view->length) {
+          ArrowErrorSet(error,
+                        "Run End value %ld at index %ld exceeds the logical "
+                        "length of the run-end encoded array %ld",
+                        (long)run_end, (long)i, (long)array_view->length);
+          return EINVAL;
+        }
+      }

Review Comment:
   This is how I wrote this loop with a single branch inside the loop:
   
   ```cpp
         if (run_ends[0] < 1) {
           return Status::Invalid(
               "All run ends must be greater than 0 but the first run end is ", 
run_ends[0]);
         }
         int64_t last_run_end = run_ends[0];
         for (int64_t index = 1; index < run_ends_length; index++) {
           const int64_t run_end = run_ends[index];
           if (run_end <= last_run_end) {
             return Status::Invalid(
                 "Every run end must be strictly greater than the previous run 
end, "
                 "but run_ends[",
                 index, "] is ", run_end, " and run_ends[", index - 1, "] is ",
                 last_run_end);
           }
           last_run_end = run_end;
         }
   ```
   
   
https://github.com/apache/arrow/blob/e4baf6be2167eb6ccbda90275304336f49998eac/cpp/src/arrow/array/validate.cc#L753



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to