lidavidm commented on code in PR #82:
URL: https://github.com/apache/arrow-nanoarrow/pull/82#discussion_r1041480876
##########
src/nanoarrow/nanoarrow.h:
##########
@@ -785,6 +785,9 @@ ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
/// \brief Append a null value to an array
static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array,
int64_t n);
+/// \brief Append an empty, non-null value to an array
+static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array,
int64_t n);
Review Comment:
Do we want to add an explicit unit test for this?
##########
src/nanoarrow/array_inline.h:
##########
@@ -107,29 +107,69 @@ static inline ArrowErrorCode _ArrowArrayAppendBits(struct
ArrowArray* array,
return NANOARROW_OK;
}
-static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array,
int64_t n) {
+static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray*
array,
+ int64_t n, uint8_t
is_valid) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
if (n == 0) {
return NANOARROW_OK;
}
- if (private_data->storage_type == NANOARROW_TYPE_NA) {
- array->null_count += n;
- array->length += n;
- return NANOARROW_OK;
+ // Some type-specific handling
+ switch (private_data->storage_type) {
+ case NANOARROW_TYPE_NA:
+ array->null_count += n * !is_valid;
+ array->length += n;
+ return NANOARROW_OK;
+ case NANOARROW_TYPE_DENSE_UNION:
+ // Add one null to the first child and append n references to that child
Review Comment:
How would this interact with children declared non-nullable?
##########
src/nanoarrow/array_inline.h:
##########
@@ -440,6 +473,51 @@ static inline ArrowErrorCode
ArrowArrayFinishElement(struct ArrowArray* array) {
return NANOARROW_OK;
}
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray*
array,
+ int32_t type_id) {
+ struct ArrowArrayPrivateData* private_data =
+ (struct ArrowArrayPrivateData*)array->private_data;
+
+ // Currently this assumes type_ids == child index
+ int64_t child_index = type_id;
+ if (child_index < 0 || child_index >= array->n_children) {
+ return EINVAL;
+ }
+
+ switch (private_data->storage_type) {
+ case NANOARROW_TYPE_DENSE_UNION:
+ // Apppend the target child length to the union offsets buffer
+ _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0,
INT32_MAX);
+ NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(
+ ArrowArrayBuffer(array, 1),
(int32_t)array->children[child_index]->length - 1));
+ break;
+ case NANOARROW_TYPE_SPARSE_UNION:
+ // Append one null any non-target column that isn't already the right
length
+ // or abort if appending a null will result in a column with invalid
length
+ for (int64_t i = 0; i < array->n_children; i++) {
+ if (i == child_index || array->children[i]->length == (array->length +
1)) {
+ continue;
+ }
+
+ if (array->children[i]->length != array->length) {
+ return EINVAL;
+ }
+
+ NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1));
Review Comment:
Hmm, why not just append a null here?
##########
src/nanoarrow/array_inline.h:
##########
@@ -107,29 +107,69 @@ static inline ArrowErrorCode _ArrowArrayAppendBits(struct
ArrowArray* array,
return NANOARROW_OK;
}
-static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array,
int64_t n) {
+static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray*
array,
+ int64_t n, uint8_t
is_valid) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
if (n == 0) {
return NANOARROW_OK;
}
- if (private_data->storage_type == NANOARROW_TYPE_NA) {
- array->null_count += n;
- array->length += n;
- return NANOARROW_OK;
+ // Some type-specific handling
+ switch (private_data->storage_type) {
+ case NANOARROW_TYPE_NA:
+ array->null_count += n * !is_valid;
+ array->length += n;
+ return NANOARROW_OK;
+ case NANOARROW_TYPE_DENSE_UNION:
+ // Add one null to the first child and append n references to that child
+ // (Currently assumes type_id == child_index)
Review Comment:
Yeah, so what Arrow does is keep a 128-element lookup table in the type
object that maps type ID to child index. That might be hard here...
##########
src/nanoarrow/nanoarrow.h:
##########
@@ -835,6 +838,15 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct
ArrowArray* array,
/// length of the child array(s) did not match the expected length.
static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array);
+/// \brief Finish a union array element
+///
+/// Appends an element to the union type ids buffer and increments
array->length.
+/// For sparse unions, up to one element is added to non type-id children.
Returns
+/// EINVAL if the underlying storage type is not a union, if type_id is not
valid,
+/// or if child sizes after appending are inconsistent.
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray*
array,
+ int32_t type_id);
Review Comment:
So, I would say there's no 'standard type ID', there's just the convention
of making type ID == child index. Since we take type ID here, there's nothing
more to do (in terms of this signature).
##########
src/nanoarrow/nanoarrow.h:
##########
@@ -835,6 +838,15 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct
ArrowArray* array,
/// length of the child array(s) did not match the expected length.
static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array);
+/// \brief Finish a union array element
+///
+/// Appends an element to the union type ids buffer and increments
array->length.
+/// For sparse unions, up to one element is added to non type-id children.
Returns
+/// EINVAL if the underlying storage type is not a union, if type_id is not
valid,
+/// or if child sizes after appending are inconsistent.
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray*
array,
+ int32_t type_id);
Review Comment:
int8_t type_id?
##########
src/nanoarrow/array_test.cc:
##########
@@ -1070,6 +1070,97 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
}
+TEST(ArrayTest, ArrayTestAppendToDenseUnionArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_DENSE_UNION, 2),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetName(schema.children[0], "integers"), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetName(schema.children[1], "strings"), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one
twenty four")),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+ auto arrow_array = ImportArray(&array, &schema);
+ ARROW_EXPECT_OK(arrow_array);
+
+ auto child_builder_int = std::make_shared<Int64Builder>();
+ auto child_builder_string = std::make_shared<StringBuilder>();
+ std::vector<std::shared_ptr<ArrayBuilder>> children = {child_builder_int,
+ child_builder_string};
+ auto builder = DenseUnionBuilder(default_memory_pool(), children,
+ arrow_array.ValueUnsafe()->type());
+ ARROW_EXPECT_OK(builder.Append(0));
+ ARROW_EXPECT_OK(child_builder_int->Append(123));
+ ARROW_EXPECT_OK(builder.AppendNulls(2));
+ ARROW_EXPECT_OK(builder.Append(1));
+ ARROW_EXPECT_OK(child_builder_string->Append("one twenty four"));
+
+ auto expected_array = builder.Finish();
+ ARROW_EXPECT_OK(expected_array);
+
+ EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
+}
+
+TEST(ArrayTest, ArrayTestAppendToSparseUnionArray) {
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_SPARSE_UNION, 2),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetName(schema.children[0], "integers"), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING),
NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetName(schema.children[1], "strings"), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one
twenty four")),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+ auto arrow_array = ImportArray(&array, &schema);
+ ARROW_EXPECT_OK(arrow_array);
+
+ auto child_builder_int = std::make_shared<Int64Builder>();
+ auto child_builder_string = std::make_shared<StringBuilder>();
+ std::vector<std::shared_ptr<ArrayBuilder>> children = {child_builder_int,
+ child_builder_string};
+ auto builder = SparseUnionBuilder(default_memory_pool(), children,
+ arrow_array.ValueUnsafe()->type());
+ // Arrow's SparseUnionBuilder requires explicit empty value appends?
Review Comment:
Odd, but I guess that's fine. Might be worth filing an issue. (Though
changing it would be a break...)
##########
src/nanoarrow/array_inline.h:
##########
@@ -107,29 +107,69 @@ static inline ArrowErrorCode _ArrowArrayAppendBits(struct
ArrowArray* array,
return NANOARROW_OK;
}
-static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array,
int64_t n) {
+static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray*
array,
+ int64_t n, uint8_t
is_valid) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
if (n == 0) {
return NANOARROW_OK;
}
- if (private_data->storage_type == NANOARROW_TYPE_NA) {
- array->null_count += n;
- array->length += n;
- return NANOARROW_OK;
+ // Some type-specific handling
+ switch (private_data->storage_type) {
+ case NANOARROW_TYPE_NA:
+ array->null_count += n * !is_valid;
Review Comment:
Hmm, for NA, null count always has to equal length right?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]