This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c4d20a08 feat: Add support for run-end encoded array (#507)
c4d20a08 is described below

commit c4d20a08514eaa0520e0c5d3f0d51469011a1a4a
Author: Cocoa <[email protected]>
AuthorDate: Fri Jun 7 20:57:23 2024 +0100

    feat: Add support for run-end encoded array (#507)
    
    Hi, this PR tries to add support for run-end encoded arrays based on the
    Arrow spec here:
    https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout.
---
 src/nanoarrow/array.c           | 108 ++++++++++++++++++++++++++++
 src/nanoarrow/array_inline.h    |   1 +
 src/nanoarrow/array_test.cc     | 153 ++++++++++++++++++++++++++++++++++++++++
 src/nanoarrow/nanoarrow.h       |  13 ++++
 src/nanoarrow/nanoarrow_types.h |   5 +-
 src/nanoarrow/schema.c          |  41 +++++++++++
 src/nanoarrow/schema_test.cc    |  61 ++++++++++++++++
 src/nanoarrow/utils.c           |   1 +
 8 files changed, 382 insertions(+), 1 deletion(-)

diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 4fb7b7b1..82236490 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -69,6 +69,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct 
ArrowArray* array,
   switch (storage_type) {
     case NANOARROW_TYPE_UNINITIALIZED:
     case NANOARROW_TYPE_NA:
+    case NANOARROW_TYPE_RUN_END_ENCODED:
       array->n_buffers = 0;
       break;
 
@@ -811,6 +812,15 @@ static int ArrowArrayViewValidateMinimal(struct 
ArrowArrayView* array_view,
                       (long)array_view->n_children);
         return EINVAL;
       }
+      break;
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      if (array_view->n_children != 2) {
+        ArrowErrorSet(
+            error, "Expected 2 children for %s array but found %ld child 
arrays",
+            ArrowTypeString(array_view->storage_type), 
(long)array_view->n_children);
+        return EINVAL;
+      }
+      break;
     default:
       break;
   }
@@ -846,6 +856,68 @@ static int ArrowArrayViewValidateMinimal(struct 
ArrowArrayView* array_view,
         return EINVAL;
       }
       break;
+
+    case NANOARROW_TYPE_RUN_END_ENCODED: {
+      if (array_view->n_children != 2) {
+        ArrowErrorSet(error,
+                      "Expected 2 children for run-end encoded array but found 
%ld",
+                      (long)array_view->n_children);
+        return EINVAL;
+      }
+      struct ArrowArrayView* run_ends_view = array_view->children[0];
+      struct ArrowArrayView* values_view = array_view->children[1];
+      int64_t max_length;
+      switch (run_ends_view->storage_type) {
+        case NANOARROW_TYPE_INT16:
+          max_length = INT16_MAX;
+          break;
+        case NANOARROW_TYPE_INT32:
+          max_length = INT32_MAX;
+          break;
+        case NANOARROW_TYPE_INT64:
+          max_length = INT64_MAX;
+          break;
+        default:
+          ArrowErrorSet(
+              error,
+              "Run-end encoded array only supports INT16, INT32 or INT64 
run-ends "
+              "but found run-ends type %s",
+              ArrowTypeString(run_ends_view->storage_type));
+          return EINVAL;
+      }
+      // uint64_t is used here to avoid overflow when adding the offset and 
length
+      if ((uint64_t)array_view->offset + (uint64_t)array_view->length >
+          (uint64_t)max_length) {
+        ArrowErrorSet(
+            error,
+            "Offset + length of a run-end encoded array must fit in a value"
+            " of the run end type %s, but offset + length is %lu while the "
+            "allowed maximum is %lu",
+            ArrowTypeString(run_ends_view->storage_type),
+            (unsigned long)array_view->offset + (unsigned 
long)array_view->length,
+            (unsigned long)max_length);
+        return EINVAL;
+      }
+      if (run_ends_view->length > values_view->length) {
+        ArrowErrorSet(
+            error, "Length of run_ends is greater than the length of values: 
%ld > %ld",
+            (long)run_ends_view->length, (long)values_view->length);
+        return EINVAL;
+      }
+      if (run_ends_view->length == 0 && values_view->length != 0) {
+        ArrowErrorSet(error,
+                      "Run-end encoded array has zero length %ld, but values 
array has "
+                      "non-zero length",
+                      (long)values_view->length);
+        return EINVAL;
+      }
+      if (run_ends_view->null_count != 0) {
+        ArrowErrorSet(error, "Null count must be 0 for run ends array, but is 
%ld",
+                      (long)run_ends_view->null_count);
+        return EINVAL;
+      }
+      break;
+    }
     default:
       break;
   }
@@ -995,6 +1067,18 @@ static int ArrowArrayViewValidateDefault(struct 
ArrowArrayView* array_view,
         }
       }
       break;
+
+    case NANOARROW_TYPE_RUN_END_ENCODED: {
+      struct ArrowArrayView* run_ends_view = array_view->children[0];
+      int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0);
+      if (last_run_end < 1) {
+        ArrowErrorSet(error,
+                      "All run ends must be greater than 0 but the first run 
end is %ld",
+                      (long)last_run_end);
+        return EINVAL;
+      }
+      break;
+    }
     default:
       break;
   }
@@ -1163,6 +1247,30 @@ static int ArrowArrayViewValidateFull(struct 
ArrowArrayView* array_view,
     }
   }
 
+  if (array_view->storage_type == NANOARROW_TYPE_RUN_END_ENCODED) {
+    struct ArrowArrayView* run_ends_view = array_view->children[0];
+    int64_t last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 0);
+    for (int64_t i = 1; i < run_ends_view->length; i++) {
+      const int64_t run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, i);
+      if (run_end <= last_run_end) {
+        ArrowErrorSet(error,
+                      "Every run end must be strictly greater than the 
previous run end, "
+                      "but run_ends[%ld] is %ld and run_ends[%ld] is %ld",
+                      (long)i, (long)run_end, (long)i - 1, (long)last_run_end);
+        return EINVAL;
+      }
+      last_run_end = run_end;
+    }
+    last_run_end = ArrowArrayViewGetIntUnsafe(run_ends_view, 
run_ends_view->length - 1);
+    if (last_run_end < (array_view->offset + array_view->length)) {
+      ArrowErrorSet(error,
+                    "Last run end is %ld but it should >= %ld (offset: %ld, 
length: %ld)",
+                    (long)last_run_end, (long)(array_view->offset + 
array_view->length),
+                    (long)array_view->offset, (long)array_view->length);
+      return EINVAL;
+    }
+  }
+
   // Recurse for children
   for (int64_t i = 0; i < array_view->n_children; i++) {
     
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], 
error));
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index 7ee3d943..5c734b46 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -661,6 +661,7 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct 
ArrowArray* array) {
         }
       }
       break;
+      return NANOARROW_OK;
     default:
       return EINVAL;
   }
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index ff9ffcd2..4cc1bd08 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -26,6 +26,7 @@
 #include <arrow/array/builder_decimal.h>
 #include <arrow/array/builder_nested.h>
 #include <arrow/array/builder_primitive.h>
+#include <arrow/array/builder_run_end.h>
 #include <arrow/array/builder_time.h>
 #include <arrow/array/builder_union.h>
 #include <arrow/c/bridge.h>
@@ -1440,6 +1441,158 @@ TEST(ArrayTest, ArrayTestAppendToStructArray) {
   EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
 }
 
+TEST(ArrayTest, ArrayTestAppendToRunEndEncodedArray) {
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+  struct ArrowError error;
+
+  // in this test case we construct a run-end encoded array with logical 
length = 7
+  // and the values are float32s
+  //
+  // the virtual big array:
+  //   type: Float32
+  //   [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
+  //
+  // run-end encoded array:
+  //   run_ends<INT32>: [4, 6, 7]
+  //   values<FLOAT>: [1.0, null, 2.0]
+
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_FLOAT), 
NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 4), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 6), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 7), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 1.0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(array.children[1], 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendDouble(array.children[1], 2.0), NANOARROW_OK);
+  array.length = 7;
+
+  // Make sure number of children is checked at finish
+  array.n_children = 0;
+  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
+  EXPECT_STREQ(ArrowErrorMessage(&error),
+               "Expected 2 children for run_end_encoded array but found 0 
child arrays");
+  array.n_children = 2;
+
+  {
+    array.offset = INT32_MAX;
+    EXPECT_EQ(ArrowArrayFinishBuilding(&array, 
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+              EINVAL);
+    EXPECT_STREQ(
+        ArrowErrorMessage(&error),
+        "Offset + length of a run-end encoded array must fit in a value of the 
"
+        "run end type int32, but offset + length is 2147483654 while the 
allowed "
+        "maximum is 2147483647");
+
+    ((struct 
ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
+        NANOARROW_TYPE_INT16;
+    array.offset = INT16_MAX;
+    EXPECT_EQ(ArrowArrayFinishBuilding(&array, 
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+              EINVAL);
+    EXPECT_STREQ(
+        ArrowErrorMessage(&error),
+        "Offset + length of a run-end encoded array must fit in a value of the 
run end "
+        "type int16, but offset + length is 32774 while the allowed maximum is 
32767");
+
+    ((struct 
ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
+        NANOARROW_TYPE_INT64;
+    array.offset = INT64_MAX;
+    EXPECT_EQ(ArrowArrayFinishBuilding(&array, 
NANOARROW_VALIDATION_LEVEL_FULL, &error),
+              EINVAL);
+    EXPECT_STREQ(ArrowErrorMessage(&error),
+                 "Offset + length of a run-end encoded array must fit in a 
value of the "
+                 "run end type int64, but offset + length is 
9223372036854775814 while "
+                 "the allowed "
+                 "maximum is 9223372036854775807");
+  }
+  ((struct 
ArrowArrayPrivateData*)(array.children[0]->private_data))->storage_type =
+      NANOARROW_TYPE_INT32;
+  array.offset = 0;
+
+  // Make sure final child size is checked at finish
+  array.children[0]->length += 1;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            EINVAL);
+  EXPECT_STREQ(ArrowErrorMessage(&error),
+               "Length of run_ends is greater than the length of values: 4 > 
3");
+  array.children[0]->length -= 1;
+
+  array.children[0]->length = 0;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            EINVAL);
+  EXPECT_STREQ(
+      ArrowErrorMessage(&error),
+      "Run-end encoded array has zero length 3, but values array has non-zero 
length");
+  array.children[0]->length = 3;
+
+  array.children[0]->null_count = 1;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            EINVAL);
+  EXPECT_STREQ(ArrowErrorMessage(&error),
+               "Null count must be 0 for run ends array, but is 1");
+  array.children[0]->null_count = 0;
+
+  // it can be a projection of the virtual big array
+  //  [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
+  //        ^                          ^
+  //        |- offset = 1              |- length = 6
+  array.length = 6;
+  array.offset = 1;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            NANOARROW_OK);
+
+  // checks for off-by-one errors
+  //  this one makes the logical length larger than the last run end
+  //  [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
+  //        ^                               ^
+  //        |- offset = 1                   |- length = 7 (out of bound)
+  array.length = 7;
+  array.offset = 1;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            EINVAL);
+  EXPECT_STREQ(ArrowErrorMessage(&error),
+               "Last run end is 7 but it should >= 8 (offset: 1, length: 7)");
+
+  //  [1.0, 1.0, 1.0, 1.0, null, null, 2.0]
+  //   ^                                    ^
+  //   |- offset = 0                        |- length = 8 (out of bound)
+  array.length = 8;
+  array.offset = 0;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            EINVAL);
+  EXPECT_STREQ(ArrowErrorMessage(&error),
+               "Last run end is 7 but it should >= 8 (offset: 0, length: 8)");
+
+  array.length = 7;
+  array.offset = 0;
+  EXPECT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_FULL, 
&error),
+            NANOARROW_OK);
+
+  auto arrow_array = ImportArray(&array, &schema);
+  ARROW_EXPECT_OK(arrow_array);
+
+  auto run_ends_builder = std::make_shared<Int32Builder>();
+  auto values_builder = std::make_shared<FloatBuilder>();
+  auto builder =
+      RunEndEncodedBuilder(default_memory_pool(), run_ends_builder, 
values_builder,
+                           run_end_encoded(int32(), float32()));
+  ARROW_EXPECT_OK(run_ends_builder->Append(4));
+  ARROW_EXPECT_OK(run_ends_builder->Append(6));
+  ARROW_EXPECT_OK(run_ends_builder->Append(7));
+  ARROW_EXPECT_OK(values_builder->Append(1.0));
+  ARROW_EXPECT_OK(values_builder->AppendNull());
+  ARROW_EXPECT_OK(values_builder->Append(2.0));
+  auto expected_array = builder.Finish();
+  ARROW_EXPECT_OK(expected_array);
+
+  EXPECT_STREQ(arrow_array.ValueUnsafe()->ToString().c_str(),
+               expected_array.ValueUnsafe()->ToString().c_str());
+}
+
 TEST(ArrayTest, ArrayTestUnionUtils) {
   // Check length calculation with nullptr
   EXPECT_EQ(_ArrowParseUnionTypeIds("", nullptr), 0);
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 4831a40a..84d59850 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -60,6 +60,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeFixedSize)
 #define ArrowSchemaSetTypeDecimal \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDecimal)
+#define ArrowSchemaSetTypeRunEndEncoded \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeRunEndEncoded)
 #define ArrowSchemaSetTypeDateTime \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaSetTypeDateTime)
 #define ArrowSchemaSetTypeUnion \
@@ -372,6 +374,17 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct 
ArrowSchema* schema, enum ArrowT
                                          int32_t decimal_precision,
                                          int32_t decimal_scale);
 
+/// \brief Set the format field of a run-end encoded schema
+///
+/// Returns EINVAL for run_end_type that is not
+/// NANOARROW_TYPE_INT16, NANOARROW_TYPE_INT32 or NANOARROW_TYPE_INT64.
+/// Schema must have been initialized using ArrowSchemaInit() or 
ArrowSchemaDeepCopy().
+/// The caller must call `ArrowSchemaSetTypeXXX(schema->children[1])` to
+/// set the value type. Note that when building arrays using the 
`ArrowArrayAppendXXX()`
+/// functions, the run-end encoded array's logical length must be updated 
manually.
+ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
+                                               enum ArrowType run_end_type);
+
 /// \brief Set the format field of a time, timestamp, or duration schema
 ///
 /// Returns EINVAL for type that is not
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index d814a056..03c5836e 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -450,7 +450,8 @@ enum ArrowType {
   NANOARROW_TYPE_LARGE_STRING,
   NANOARROW_TYPE_LARGE_BINARY,
   NANOARROW_TYPE_LARGE_LIST,
-  NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
+  NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO,
+  NANOARROW_TYPE_RUN_END_ENCODED
 };
 
 /// \brief Get a string value of an enum ArrowType value
@@ -537,6 +538,8 @@ static inline const char* ArrowTypeString(enum ArrowType 
type) {
       return "large_list";
     case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
       return "interval_month_day_nano";
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      return "run_end_encoded";
     default:
       return NULL;
   }
diff --git a/src/nanoarrow/schema.c b/src/nanoarrow/schema.c
index 7451136c..aa8725b2 100644
--- a/src/nanoarrow/schema.c
+++ b/src/nanoarrow/schema.c
@@ -124,6 +124,8 @@ static const char* ArrowSchemaFormatTemplate(enum ArrowType 
type) {
       return "+s";
     case NANOARROW_TYPE_MAP:
       return "+m";
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      return "+r";
 
     default:
       return NULL;
@@ -155,6 +157,13 @@ static int ArrowSchemaInitChildrenIfNeeded(struct 
ArrowSchema* schema,
       NANOARROW_RETURN_NOT_OK(
           ArrowSchemaSetName(schema->children[0]->children[1], "value"));
       break;
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 2));
+      ArrowSchemaInit(schema->children[0]);
+      NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], 
"run_ends"));
+      schema->children[0]->flags &= ~ARROW_FLAG_NULLABLE;
+      ArrowSchemaInit(schema->children[1]);
+      NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[1], 
"values"));
     default:
       break;
   }
@@ -277,6 +286,28 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct 
ArrowSchema* schema, enum ArrowT
   return ArrowSchemaSetFormat(schema, buffer);
 }
 
+ArrowErrorCode ArrowSchemaSetTypeRunEndEncoded(struct ArrowSchema* schema,
+                                               enum ArrowType run_end_type) {
+  switch (run_end_type) {
+    case NANOARROW_TYPE_INT16:
+    case NANOARROW_TYPE_INT32:
+    case NANOARROW_TYPE_INT64:
+      break;
+    default:
+      return EINVAL;
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(
+      schema, ArrowSchemaFormatTemplate(NANOARROW_TYPE_RUN_END_ENCODED)));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowSchemaInitChildrenIfNeeded(schema, NANOARROW_TYPE_RUN_END_ENCODED));
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema->children[0], 
run_end_type));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowSchemaSetType(schema->children[1], NANOARROW_TYPE_UNINITIALIZED));
+
+  return NANOARROW_OK;
+}
+
 static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) {
   switch (time_unit) {
     case NANOARROW_TIME_UNIT_SECOND:
@@ -750,6 +781,13 @@ static ArrowErrorCode ArrowSchemaViewParse(struct 
ArrowSchemaView* schema_view,
           *format_end_out = format + 2;
           return NANOARROW_OK;
 
+        // run end encoded has no buffer at all
+        case 'r':
+          schema_view->storage_type = NANOARROW_TYPE_RUN_END_ENCODED;
+          schema_view->type = NANOARROW_TYPE_RUN_END_ENCODED;
+          *format_end_out = format + 2;
+          return NANOARROW_OK;
+
         // just validity buffer
         case 'w':
           if (format[2] != ':' || format[3] == '\0') {
@@ -1124,6 +1162,9 @@ static ArrowErrorCode ArrowSchemaViewValidate(struct 
ArrowSchemaView* schema_vie
     case NANOARROW_TYPE_FIXED_SIZE_LIST:
       return ArrowSchemaViewValidateNChildren(schema_view, 1, error);
 
+    case NANOARROW_TYPE_RUN_END_ENCODED:
+      return ArrowSchemaViewValidateNChildren(schema_view, 2, error);
+
     case NANOARROW_TYPE_STRUCT:
       return ArrowSchemaViewValidateNChildren(schema_view, -1, error);
 
diff --git a/src/nanoarrow/schema_test.cc b/src/nanoarrow/schema_test.cc
index 6e521eef..da8c6a1e 100644
--- a/src/nanoarrow/schema_test.cc
+++ b/src/nanoarrow/schema_test.cc
@@ -219,6 +219,47 @@ TEST(SchemaTest, SchemaInitDecimal) {
   EXPECT_TRUE(arrow_type.ValueUnsafe()->Equals(decimal256(3, 4)));
 }
 
+TEST(SchemaTest, SchemaInitRunEndEncoded) {
+  struct ArrowSchema schema;
+
+  // run-ends type has to be one of INT16, INT32, INT64
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_DOUBLE), 
EINVAL);
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_UINT16), 
EINVAL);
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT16), 
NANOARROW_OK);
+  EXPECT_STREQ(schema.format, "+r");
+
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_FLOAT), 
NANOARROW_OK);
+
+  auto arrow_type = ImportType(&schema);
+  ARROW_EXPECT_OK(arrow_type);
+  EXPECT_TRUE(arrow_type.ValueUnsafe()->Equals(run_end_encoded(int16(), 
float32())));
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT32), 
NANOARROW_OK);
+  EXPECT_STREQ(schema.format, "+r");
+
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_FLOAT), 
NANOARROW_OK);
+
+  arrow_type = ImportType(&schema);
+  ARROW_EXPECT_OK(arrow_type);
+  EXPECT_TRUE(arrow_type.ValueUnsafe()->Equals(run_end_encoded(int32(), 
float32())));
+
+  ArrowSchemaInit(&schema);
+  EXPECT_EQ(ArrowSchemaSetTypeRunEndEncoded(&schema, NANOARROW_TYPE_INT64), 
NANOARROW_OK);
+  EXPECT_STREQ(schema.format, "+r");
+
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_FLOAT), 
NANOARROW_OK);
+
+  arrow_type = ImportType(&schema);
+  ARROW_EXPECT_OK(arrow_type);
+  EXPECT_TRUE(arrow_type.ValueUnsafe()->Equals(run_end_encoded(int64(), 
float32())));
+}
+
 TEST(SchemaTest, SchemaInitDateTime) {
   struct ArrowSchema schema;
 
@@ -501,6 +542,26 @@ TEST(SchemaTest, SchemaCopyDictType) {
   ArrowSchemaRelease(&schema_copy);
 }
 
+TEST(SchemaTest, SchemaCopyRunEndEncodedType) {
+  struct ArrowSchema schema;
+  auto struct_type = run_end_encoded(int32(), float32());
+  ARROW_EXPECT_OK(ExportType(*struct_type, &schema));
+
+  struct ArrowSchema schema_copy;
+  ASSERT_EQ(ArrowSchemaDeepCopy(&schema, &schema_copy), NANOARROW_OK);
+
+  ASSERT_NE(schema_copy.release, nullptr);
+  EXPECT_STREQ(schema_copy.format, "+r");
+  EXPECT_EQ(schema_copy.n_children, 2);
+  EXPECT_STREQ(schema_copy.children[0]->format, "i");
+  EXPECT_STREQ(schema_copy.children[0]->name, "run_ends");
+  EXPECT_STREQ(schema_copy.children[1]->format, "f");
+  EXPECT_STREQ(schema_copy.children[1]->name, "values");
+
+  ArrowSchemaRelease(&schema);
+  ArrowSchemaRelease(&schema_copy);
+}
+
 TEST(SchemaTest, SchemaCopyFlags) {
   struct ArrowSchema schema;
   ARROW_EXPECT_OK(ExportType(*int32(), &schema));
diff --git a/src/nanoarrow/utils.c b/src/nanoarrow/utils.c
index c9b4ebd6..4aba25ae 100644
--- a/src/nanoarrow/utils.c
+++ b/src/nanoarrow/utils.c
@@ -66,6 +66,7 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum 
ArrowType storage_type) {
   switch (storage_type) {
     case NANOARROW_TYPE_UNINITIALIZED:
     case NANOARROW_TYPE_NA:
+    case NANOARROW_TYPE_RUN_END_ENCODED:
       layout->buffer_type[0] = NANOARROW_BUFFER_TYPE_NONE;
       layout->buffer_data_type[0] = NANOARROW_TYPE_UNINITIALIZED;
       layout->buffer_type[1] = NANOARROW_BUFFER_TYPE_NONE;

Reply via email to