paleolimbot commented on code in PR #507:
URL: https://github.com/apache/arrow-nanoarrow/pull/507#discussion_r1627645298
##########
src/nanoarrow/array.c:
##########
@@ -811,6 +812,15 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
(long)array_view->n_children);
return EINVAL;
}
+ break;
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array_view->n_children != 2) {
+ ArrowErrorSet(error, "Expected 2 child of %s array but found %ld child
arrays",
Review Comment:
```suggestion
ArrowErrorSet(error, "Expected 2 children of %s array but found %ld
child arrays",
```
##########
src/nanoarrow/array.c:
##########
@@ -846,6 +856,75 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
return EINVAL;
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array_view->n_children != 2) {
+ ArrowErrorSet(error,
+ "Expected 2 children for run-end encoded array but found
%ld",
+ (long)array_view->n_children);
+ return EINVAL;
+ }
+ struct ArrowArrayView* run_ends_view;
+ struct ArrowArrayView* values_view;
+ run_ends_view = array_view->children[0];
+ values_view = array_view->children[1];
Review Comment:
```suggestion
struct ArrowArrayView* run_ends_view = array_view->children[0];
struct ArrowArrayView* values_view = values_view =
array_view->children[1];
```
##########
src/nanoarrow/array.c:
##########
@@ -846,6 +856,75 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
return EINVAL;
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array_view->n_children != 2) {
+ ArrowErrorSet(error,
+ "Expected 2 children for run-end encoded array but found
%ld",
+ (long)array_view->n_children);
+ return EINVAL;
+ }
+ struct ArrowArrayView* run_ends_view;
+ struct ArrowArrayView* values_view;
+ run_ends_view = array_view->children[0];
+ values_view = array_view->children[1];
+ if (run_ends_view == NULL) {
+ ArrowErrorSet(error, "Run ends array is null pointer");
+ return EINVAL;
+ }
+ if (values_view == NULL) {
+ ArrowErrorSet(error, "Values array is null pointer");
+ return EINVAL;
+ }
+ uint64_t max_length;
+ switch (run_ends_view->storage_type) {
+ case NANOARROW_TYPE_INT16:
+ max_length = INT16_MAX;
+ break;
+ case NANOARROW_TYPE_INT32:
+ max_length = INT32_MAX;
+ break;
+ case NANOARROW_TYPE_INT64:
+ max_length = INT64_MAX;
+ break;
+ default:
+ ArrowErrorSet(
+ error,
+ "Run-end encoded array only supports INT16, INT32 or INT64
run-ends "
+ "but found run-ends type %s",
+ ArrowTypeString(run_ends_view->storage_type));
+ return EINVAL;
+ }
+ if ((uint64_t)array_view->offset + (uint64_t)array_view->length >
max_length) {
Review Comment:
```suggestion
if ((array_view->offset + array_view->length) > max_length) {
```
##########
src/nanoarrow/array.c:
##########
@@ -846,6 +856,75 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
return EINVAL;
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array_view->n_children != 2) {
+ ArrowErrorSet(error,
+ "Expected 2 children for run-end encoded array but found
%ld",
+ (long)array_view->n_children);
+ return EINVAL;
+ }
+ struct ArrowArrayView* run_ends_view;
+ struct ArrowArrayView* values_view;
+ run_ends_view = array_view->children[0];
+ values_view = array_view->children[1];
+ if (run_ends_view == NULL) {
+ ArrowErrorSet(error, "Run ends array is null pointer");
+ return EINVAL;
+ }
+ if (values_view == NULL) {
+ ArrowErrorSet(error, "Values array is null pointer");
+ return EINVAL;
+ }
Review Comment:
I am not sure that we check this anywhere else? (Getting a NULL here would
be very difficult to achieve and would be a programming error)
##########
src/nanoarrow/array.c:
##########
@@ -846,6 +856,75 @@ static int ArrowArrayViewValidateMinimal(struct
ArrowArrayView* array_view,
return EINVAL;
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array_view->n_children != 2) {
+ ArrowErrorSet(error,
+ "Expected 2 children for run-end encoded array but found
%ld",
+ (long)array_view->n_children);
+ return EINVAL;
+ }
+ struct ArrowArrayView* run_ends_view;
+ struct ArrowArrayView* values_view;
+ run_ends_view = array_view->children[0];
+ values_view = array_view->children[1];
+ if (run_ends_view == NULL) {
+ ArrowErrorSet(error, "Run ends array is null pointer");
+ return EINVAL;
+ }
+ if (values_view == NULL) {
+ ArrowErrorSet(error, "Values array is null pointer");
+ return EINVAL;
+ }
+ uint64_t max_length;
Review Comment:
```suggestion
int64_t max_length;
```
(As a convention we use signed integers for length-like things in nanoarrow,
for better or worse 🙂 )
##########
src/nanoarrow/nanoarrow.h:
##########
@@ -372,6 +374,15 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct
ArrowSchema* schema, enum ArrowT
int32_t decimal_precision,
int32_t decimal_scale);
+/// \brief Set the format field of a run-end encoded schema
+///
+/// Returns EINVAL for scale <= 0 or for run_end_type that is not
+/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64.
+/// Schema must have been initialized using ArrowSchemaInit() or
ArrowSchemaDeepCopy().
+ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
+ enum ArrowType run_end_type,
+ enum ArrowType value_type);
Review Comment:
I might leave this as `struct ArrowSchema* schema, enum ArrowType
run_end_type` and require that the caller do
`ArrowSchemaSetType(schema.children[1], ...)`. The `run_end_type` can always be
represented by a simple `ArrowType` value but the `value_type` could be a
nested or parameterized type for which a simple enum value won't be able to
construct it.
##########
src/nanoarrow/array_inline.h:
##########
@@ -661,6 +661,44 @@ static inline ArrowErrorCode
ArrowArrayFinishElement(struct ArrowArray* array) {
}
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array->children[0]->length != array->children[1]->length) {
+ return EINVAL;
+ }
+ if (array->children[0]->null_count != 0) {
+ return EINVAL;
+ }
+ array->length = 0;
+ struct ArrowBuffer* data_buffer;
+ data_buffer = ArrowArrayBuffer(array->children[0], 1);
+ for (int64_t i = 0; i < array->children[0]->length; i++) {
+ int64_t run_end = -1;
+ switch (((struct
ArrowArrayPrivateData*)array->children[0]->private_data)
+ ->storage_type) {
+ case NANOARROW_TYPE_INT16:
+ run_end = ((int16_t*)data_buffer->data)[i];
+ break;
+ case NANOARROW_TYPE_INT32:
+ run_end = ((int32_t*)data_buffer->data)[i];
+ break;
+ case NANOARROW_TYPE_INT64:
+ run_end = ((int64_t*)data_buffer->data)[i];
+ break;
+ default:
+ break;
+ }
+ if (run_end < 0) {
+ return EINVAL;
+ }
+ array->length = run_end;
+ }
+
+ if (private_data->bitmap.buffer.data != NULL) {
+
NANOARROW_RETURN_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(array), 1,
1));
+ }
Review Comment:
I am not sure that the outer array here is allowed to have a validity buffer?
##########
src/nanoarrow/array_test.cc:
##########
@@ -1440,6 +1441,86 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}
+TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+ struct ArrowError error;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32,
+ NANOARROW_TYPE_FLOAT),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);
+
+ // Make sure number of children is checked at finish
+ array.n_children = 0;
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Expected 2 child of run_end_encoded array but found 0 child
arrays");
+ array.n_children = 2;
+
+ // Make sure final child size is checked at finish
+ array.children[0]->length = array.children[0]->length - 1;
+ EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL,
&error),
+ EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Last run end is 6 but it should match 7 (offset: 0, length:
7)");
+
+ array.children[0]->length = array.children[0]->length + 1;
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+ // run-end encoded array with offsets
+ struct ArrowArrayView array_view;
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, NULL),
NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, NULL), NANOARROW_OK);
+ array_view.length = 4;
+ array_view.offset = 1;
+ EXPECT_EQ(ArrowArrayViewValidate(&array_view,
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+ NANOARROW_OK);
Review Comment:
It may be helpful to add a comment to each of these cases to explain what
they are testing? For this one in particular (non-zero offset), I wonder if
`length = 6` and `offset = 1` might be a better choice (to check that the last
run end -- parent array math is correct).
##########
src/nanoarrow/array_test.cc:
##########
@@ -1440,6 +1441,86 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}
+TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+ struct ArrowError error;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32,
+ NANOARROW_TYPE_FLOAT),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);
+
+ // Make sure number of children is checked at finish
+ array.n_children = 0;
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Expected 2 child of run_end_encoded array but found 0 child
arrays");
+ array.n_children = 2;
+
+ // Make sure final child size is checked at finish
+ array.children[0]->length = array.children[0]->length - 1;
+ EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL,
&error),
+ EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Last run end is 6 but it should match 7 (offset: 0, length:
7)");
Review Comment:
Maybe `>= 7`? (i.e., the last run end must be greater than or equal to the
length of the parent array?)
##########
src/nanoarrow/array.c:
##########
@@ -995,6 +1074,40 @@ static int ArrowArrayViewValidateDefault(struct
ArrowArrayView* array_view,
}
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED: {
+ struct ArrowArrayView* run_ends_view;
+ run_ends_view = array_view->children[0];
+ int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0);
+ if (last_run_end < 1) {
+ ArrowErrorSet(error,
+ "All run ends must be greater than 0 but the first run
end is %ld",
+ (long)last_run_end);
+ return EINVAL;
+ }
+ for (int64_t i = 1; i < run_ends_view->length; i++) {
+ const int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
+ if (run_end <= last_run_end) {
+ ArrowErrorSet(
+ error,
+ "Every run end must be strictly greater than the previous run
end, "
+ "but run_ends[%ld] is %ld and run_ends[%ld] is %ld",
+ (long)i, (long)run_end, (long)i - 1, (long)last_run_end);
+ return EINVAL;
+ }
+ last_run_end = run_end;
+ }
Review Comment:
This type of validation (`O(1)` along the length of something) I believe is
a better fit for `ArrowArrayViewValidateFull()`.
##########
src/nanoarrow/array_test.cc:
##########
@@ -1440,6 +1441,86 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}
+TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+ struct ArrowError error;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32,
+ NANOARROW_TYPE_FLOAT),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
Review Comment:
It may help future readers to add a JSON-ish (or other human-readable)
representation of the array you are building here (maybe `[1.0] * 2 + [null] *
1 + [2.0] * 1` ?)
##########
src/nanoarrow/array.c:
##########
@@ -995,6 +1074,40 @@ static int ArrowArrayViewValidateDefault(struct
ArrowArrayView* array_view,
}
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED: {
+ struct ArrowArrayView* run_ends_view;
+ run_ends_view = array_view->children[0];
Review Comment:
```suggestion
struct ArrowArrayView* run_ends_view = array_view->children[0];
```
##########
src/nanoarrow/array_test.cc:
##########
@@ -1440,6 +1441,86 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}
+TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+ struct ArrowError error;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32,
+ NANOARROW_TYPE_FLOAT),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);
+
+ // Make sure number of children is checked at finish
+ array.n_children = 0;
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Expected 2 child of run_end_encoded array but found 0 child
arrays");
+ array.n_children = 2;
+
+ // Make sure final child size is checked at finish
+ array.children[0]->length = array.children[0]->length - 1;
+ EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL,
&error),
+ EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Last run end is 6 but it should match 7 (offset: 0, length:
7)");
+
+ array.children[0]->length = array.children[0]->length + 1;
+ EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+ // run-end encoded array with offsets
+ struct ArrowArrayView array_view;
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, NULL),
NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, NULL), NANOARROW_OK);
+ array_view.length = 4;
+ array_view.offset = 1;
+ EXPECT_EQ(ArrowArrayViewValidate(&array_view,
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+ NANOARROW_OK);
+
+ array_view.length = 10;
+ array_view.offset = 1;
+ EXPECT_EQ(ArrowArrayViewValidate(&array_view,
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+ EINVAL);
+ EXPECT_STREQ(ArrowErrorMessage(&error),
+ "Last run end is 7 but it should match 11 (offset: 1, length:
10)");
Review Comment:
Similarly, I wonder if `length = 7`, `offset = 1` would be a better check
(only off by one)
##########
src/nanoarrow/array_inline.h:
##########
@@ -661,6 +661,44 @@ static inline ArrowErrorCode
ArrowArrayFinishElement(struct ArrowArray* array) {
}
}
break;
+
+ case NANOARROW_TYPE_RUN_END_ENCODED:
+ if (array->children[0]->length != array->children[1]->length) {
+ return EINVAL;
+ }
+ if (array->children[0]->null_count != 0) {
+ return EINVAL;
+ }
+ array->length = 0;
+ struct ArrowBuffer* data_buffer;
+ data_buffer = ArrowArrayBuffer(array->children[0], 1);
+ for (int64_t i = 0; i < array->children[0]->length; i++) {
Review Comment:
I am not sure that you need a loop here? (i.e., is it possible to just
extract the last run end?)
I wonder if this would be a better fit for `ArrowArrayFinishBuilding()`? Or
maybe we need `FinishAppending()`? I am not sure I would have thought of
`FinishElement` to do these checks/updates since they really only need to
happen once per array.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]