This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 917e8e7d Update dist/ for commit 
b5d2742e2d0aee71c2ca5a277169e53c335f6c43
917e8e7d is described below

commit 917e8e7dbe17efb5ba4a1e859181f6fb48aa2fa5
Author: GitHub Actions <[email protected]>
AuthorDate: Tue Apr 16 01:26:17 2024 +0000

    Update dist/ for commit b5d2742e2d0aee71c2ca5a277169e53c335f6c43
---
 dist/nanoarrow.c           |  68 +++++++++++++-----
 dist/nanoarrow_ipc.c       |  30 ++++++++
 dist/nanoarrow_testing.hpp | 174 ++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 232 insertions(+), 40 deletions(-)

diff --git a/dist/nanoarrow.c b/dist/nanoarrow.c
index d7925587..9677a0e5 100644
--- a/dist/nanoarrow.c
+++ b/dist/nanoarrow.c
@@ -430,6 +430,13 @@ ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const 
struct ArrowDecimal* decim
   // The most significant segment should have no leading zeroes
   int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu",
                          (unsigned long)segments[num_segments - 1]);
+
+  // Ensure that an encoding error from snprintf() does not result
+  // in an out-of-bounds access.
+  if (n_chars < 0) {
+    return ERANGE;
+  }
+
   buffer->size_bytes += n_chars;
 
   // Subsequent output needs to be left-padded with zeroes such that each 
segment
@@ -678,6 +685,10 @@ ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct 
ArrowSchema* schema,
       return EINVAL;
   }
 
+  if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
+    return ERANGE;
+  }
+
   buffer[n_chars] = '\0';
   NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer));
 
@@ -710,6 +721,10 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct 
ArrowSchema* schema, enum ArrowT
       return EINVAL;
   }
 
+  if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
+    return ERANGE;
+  }
+
   buffer[n_chars] = '\0';
   return ArrowSchemaSetFormat(schema, buffer);
 }
@@ -786,7 +801,7 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct 
ArrowSchema* schema, enum Arrow
       return EINVAL;
   }
 
-  if (((size_t)n_chars) >= sizeof(buffer)) {
+  if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
     return ERANGE;
   }
 
@@ -823,6 +838,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* 
schema, enum ArrowTyp
       return EINVAL;
   }
 
+  // Ensure that an encoding error from snprintf() does not result
+  // in an out-of-bounds access.
+  if (n_chars < 0) {
+    return ERANGE;
+  }
+
   if (n_children > 0) {
     n_chars = snprintf(format_cursor, format_out_size, "0");
     format_cursor += n_chars;
@@ -835,6 +856,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* 
schema, enum ArrowTyp
     }
   }
 
+  // Ensure that an encoding error from snprintf() does not result
+  // in an out-of-bounds access.
+  if (n_chars < 0) {
+    return ERANGE;
+  }
+
   NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out));
 
   NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children));
@@ -1701,6 +1728,12 @@ static int64_t ArrowSchemaTypeToStringInternal(struct 
ArrowSchemaView* schema_vi
 // among multiple sprintf calls.
 static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last,
                                          int64_t* n_remaining, int64_t* 
n_chars) {
+  // In the unlikely snprintf() returning a negative value (encoding error),
+  // ensure the result won't cause an out-of-bounds access.
+  if (n_chars_last < 0) {
+    n_chars = 0;
+  }
+
   *n_chars += n_chars_last;
   *n_remaining -= n_chars_last;
 
@@ -1796,7 +1829,12 @@ int64_t ArrowSchemaToString(const struct ArrowSchema* 
schema, char* out, int64_t
     n_chars += snprintf(out, n, ">");
   }
 
-  return n_chars;
+  // Ensure that we always return a positive result
+  if (n_chars > 0) {
+    return n_chars;
+  } else {
+    return 0;
+  }
 }
 
 ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader,
@@ -2423,19 +2461,16 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct 
ArrowArray* array) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  // The only buffer finalizing this currently does is make sure the data
-  // buffer for (Large)String|Binary is never NULL
-  switch (private_data->storage_type) {
-    case NANOARROW_TYPE_BINARY:
-    case NANOARROW_TYPE_STRING:
-    case NANOARROW_TYPE_LARGE_BINARY:
-    case NANOARROW_TYPE_LARGE_STRING:
-      if (ArrowArrayBuffer(array, 2)->data == NULL) {
-        NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array, 
2), 0));
-      }
-      break;
-    default:
-      break;
+  for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+    if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY 
||
+        private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
+      continue;
+    }
+
+    struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i);
+    if (buffer->data == NULL) {
+      NANOARROW_RETURN_NOT_OK((ArrowBufferReserve(buffer, 1)));
+    }
   }
 
   for (int64_t i = 0; i < array->n_children; i++) {
@@ -2471,7 +2506,8 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct 
ArrowArray* array,
                                         struct ArrowError* error) {
   // Even if the data buffer is size zero, the pointer value needed to be 
non-null
   // in some implementations (at least one version of Arrow C++ at the time 
this
-  // was added). Only do this fix if we can assume CPU data access.
+  // was added and C# as later discovered). Only do this fix if we can assume
+  // CPU data access.
   if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) {
     NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), 
error);
   }
diff --git a/dist/nanoarrow_ipc.c b/dist/nanoarrow_ipc.c
index 38d2b586..c125b99b 100644
--- a/dist/nanoarrow_ipc.c
+++ b/dist/nanoarrow_ipc.c
@@ -20884,6 +20884,11 @@ static int ArrowIpcDecoderSetTypeFixedSizeList(struct 
ArrowSchema* schema,
 
   char fixed_size_str[128];
   int n_chars = snprintf(fixed_size_str, 128, "+w:%d", fixed_size);
+  if (n_chars < 0) {
+    ArrowErrorSet(error, "snprintf() encoding error");
+    return ERANGE;
+  }
+
   fixed_size_str[n_chars] = '\0';
   return ArrowIpcDecoderSetTypeSimpleNested(schema, fixed_size_str, error);
 }
@@ -20942,6 +20947,11 @@ static int ArrowIpcDecoderSetTypeUnion(struct 
ArrowSchema* schema,
       return EINVAL;
   }
 
+  if (n_chars < 0) {
+    ArrowErrorSet(error, "snprintf() encoding error");
+    return ERANGE;
+  }
+
   if (ns(Union_typeIds_is_present(type))) {
     flatbuffers_int32_vec_t type_ids = ns(Union_typeIds(type));
     int64_t n_type_ids = flatbuffers_int32_vec_len(type_ids);
@@ -20960,11 +20970,21 @@ static int ArrowIpcDecoderSetTypeUnion(struct 
ArrowSchema* schema,
       format_cursor += n_chars;
       format_out_size -= n_chars;
 
+      if (n_chars < 0) {
+        ArrowErrorSet(error, "snprintf() encoding error");
+        return ERANGE;
+      }
+
       for (int64_t i = 1; i < n_type_ids; i++) {
         n_chars = snprintf(format_cursor, format_out_size, ",%d",
                            (int)flatbuffers_int32_vec_at(type_ids, i));
         format_cursor += n_chars;
         format_out_size -= n_chars;
+
+        if (n_chars < 0) {
+          ArrowErrorSet(error, "snprintf() encoding error");
+          return ERANGE;
+        }
       }
     }
   } else if (n_children > 0) {
@@ -20972,10 +20992,20 @@ static int ArrowIpcDecoderSetTypeUnion(struct 
ArrowSchema* schema,
     format_cursor += n_chars;
     format_out_size -= n_chars;
 
+    if (n_chars < 0) {
+      ArrowErrorSet(error, "snprintf() encoding error");
+      return ERANGE;
+    }
+
     for (int64_t i = 1; i < n_children; i++) {
       n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i);
       format_cursor += n_chars;
       format_out_size -= n_chars;
+
+      if (n_chars < 0) {
+        ArrowErrorSet(error, "snprintf() encoding error");
+        return ERANGE;
+      }
     }
   }
 
diff --git a/dist/nanoarrow_testing.hpp b/dist/nanoarrow_testing.hpp
index a7602b70..f7d2da4f 100644
--- a/dist/nanoarrow_testing.hpp
+++ b/dist/nanoarrow_testing.hpp
@@ -138,7 +138,7 @@ class DictionaryContext {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
-  TestingJSONWriter() : float_precision_(-1) {}
+  TestingJSONWriter() : float_precision_(-1), include_metadata_(true) {}
 
   /// \brief Set the floating point precision of the writer
   ///
@@ -146,7 +146,12 @@ class TestingJSONWriter {
   /// to encode the value in the output. When writing files specifically for
   /// integration tests, floating point values should be rounded to 3 decimal 
places to
   /// avoid serialization issues.
-  void set_float_precision(int precision) { float_precision_ = precision; }
+  void set_float_precision(int value) { float_precision_ = value; }
+
+  /// \brief Set whether metadata should be included in the output of a schema 
or field
+  ///
+  /// Use false to skip writing schema/field metadata in the output.
+  void set_include_metadata(bool value) { include_metadata_ = value; }
 
   void ResetDictionaries() { dictionaries_.clear(); }
 
@@ -227,7 +232,7 @@ class TestingJSONWriter {
     }
 
     // Write metadata
-    if (schema->metadata != nullptr) {
+    if (ShouldWriteMetadata(schema->metadata)) {
       out << R"(, "metadata": )";
       NANOARROW_RETURN_NOT_OK(WriteMetadata(out, schema->metadata));
     }
@@ -293,7 +298,7 @@ class TestingJSONWriter {
     }
 
     // Write metadata
-    if (field->metadata != nullptr) {
+    if (ShouldWriteMetadata(field->metadata)) {
       out << R"(, "metadata": )";
       NANOARROW_RETURN_NOT_OK(WriteMetadata(out, field->metadata));
     }
@@ -494,8 +499,13 @@ class TestingJSONWriter {
 
  private:
   int float_precision_;
+  bool include_metadata_;
   internal::DictionaryContext dictionaries_;
 
+  bool ShouldWriteMetadata(const char* metadata) {
+    return metadata != nullptr && include_metadata_;
+  }
+
   ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t 
dictionary_id) {
     const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
     out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )"
@@ -2023,8 +2033,7 @@ class TestingJSONReader {
       buffer_view->data.as_uint8 = buffer->data;
       buffer_view->size_bytes = buffer->size_bytes;
 
-      // If this is a validity buffer with a big enough size, set the 
array_view's
-      // null_count
+      // If this is a validity buffer, set the null_count
       if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY 
&&
           _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
         array_view->null_count =
@@ -2033,6 +2042,11 @@ class TestingJSONReader {
       }
     }
 
+    // The null type doesn't have any buffers but we can set the null_count
+    if (array_view->storage_type == NANOARROW_TYPE_NA) {
+      array_view->null_count = array_view->length;
+    }
+
     // If there is a dictionary associated with schema, parse its value into 
dictionary
     if (schema->dictionary != nullptr) {
       NANOARROW_RETURN_NOT_OK(Check(
@@ -2051,9 +2065,10 @@ class TestingJSONReader {
         ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, 
error), error,
         error_prefix + "failed to validate: "));
 
-    // Flush length and buffer pointers to the Array
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-        ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, 
nullptr), error);
+    // Flush length and buffer pointers to the Array. This also ensures that 
buffers
+    // are not NULL (matters for some versions of some implementations).
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinishBuildingDefault(array, 
nullptr),
+                                       error);
     array->length = array_view->length;
     array->null_count = array_view->null_count;
 
@@ -2178,6 +2193,11 @@ class TestingJSONReader {
     NANOARROW_RETURN_NOT_OK(
         Check(value.is_array(), error, "bitmap buffer must be array"));
 
+    // Reserving with the exact length ensures that the last bits are always 
zeroed.
+    // This was an assumption made by the C# implementation at the time this 
was
+    // implemented.
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapReserve(bitmap, 
value.size()), error);
+
     for (const auto& item : value) {
       // Some example files write bitmaps as [true, false, true] but the 
documentation
       // says [1, 0, 1]. Accept both for simplicity.
@@ -2505,6 +2525,35 @@ class TestingJSONComparison {
   };
 
  public:
+  TestingJSONComparison() : compare_batch_flags_(true), 
compare_metadata_order_(true) {
+    // We do our own metadata comparison
+    writer_actual_.set_include_metadata(false);
+    writer_expected_.set_include_metadata(false);
+  }
+
+  /// \brief Compare top-level RecordBatch flags (e.g., nullability)
+  ///
+  /// Some Arrow implementations export batches as nullable, and some export 
them as
+  /// non-nullable. Use false to consider these two types of batches as 
equivalent.
+  void set_compare_batch_flags(bool value) { compare_batch_flags_ = value; }
+
+  /// \brief Compare metadata order
+  ///
+  /// Some Arrow implementations store metadata using structures (e.g., hash 
map) that
+  /// reorder metadata items. Use false to consider metadata whose keys/values 
have
+  /// been reordered as equivalent.
+  void set_compare_metadata_order(bool value) { compare_metadata_order_ = 
value; }
+
+  /// \brief Set float precision
+  ///
+  /// The Arrow Integration Testing JSON document states that values should be 
compared
+  /// to 3 decimal places to avoid floating point serialization issues. Use -1 
to specify
+  /// that all decimal places should be used (the default).
+  void set_compare_float_precision(int value) {
+    writer_actual_.set_float_precision(value);
+    writer_expected_.set_float_precision(value);
+  }
+
   /// \brief Returns the number of differences found by the previous call
   int64_t num_differences() const { return differences_.size(); }
 
@@ -2619,7 +2668,7 @@ class TestingJSONComparison {
     // (Purposefully ignore the name field at the top level)
 
     // Compare flags
-    if (actual->flags != expected->flags) {
+    if (compare_batch_flags_ && actual->flags != expected->flags) {
       differences_.push_back({path,
                               std::string(".flags: ") + 
std::to_string(actual->flags),
                               std::string(".flags: ") + 
std::to_string(expected->flags)});
@@ -2639,20 +2688,8 @@ class TestingJSONComparison {
     }
 
     // Compare metadata
-    std::stringstream ss;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, 
actual->metadata),
-                                       error);
-    std::string actual_metadata = ss.str();
-
-    ss.str("");
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-        writer_expected_.WriteMetadata(ss, expected->metadata), error);
-    std::string expected_metadata = ss.str();
-
-    if (actual_metadata != expected_metadata) {
-      differences_.push_back({path, std::string(".metadata: ") + 
actual_metadata,
-                              std::string(".metadata: ") + expected_metadata});
-    }
+    NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, 
expected->metadata, error,
+                                            path + std::string(".metadata")));
 
     return NANOARROW_OK;
   }
@@ -2722,6 +2759,10 @@ class TestingJSONComparison {
   nanoarrow::UniqueArrayView actual_;
   nanoarrow::UniqueArrayView expected_;
 
+  // Comparison options
+  bool compare_batch_flags_;
+  bool compare_metadata_order_;
+
   ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected,
                               ArrowError* error, const std::string& path = "") 
{
     // Preprocess both fields such that map types have canonical names
@@ -2753,6 +2794,91 @@ class TestingJSONComparison {
       differences_.push_back({path, actual_json, expected_json});
     }
 
+    NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, 
expected->metadata, error,
+                                            path + std::string(".metadata")));
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode CompareMetadata(const char* actual, const char* expected,
+                                 ArrowError* error, const std::string& path = 
"") {
+    std::stringstream ss;
+
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, 
actual), error);
+    std::string actual_json = ss.str();
+
+    ss.str("");
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss, 
expected),
+                                       error);
+    std::string expected_json = ss.str();
+
+    bool metadata_equal = actual_json == expected_json;
+
+    // If there is a difference in the rendered JSON but we aren't being 
strict about
+    // order, check again using the KeyValue comparison.
+    if (!metadata_equal && !compare_metadata_order_) {
+      NANOARROW_RETURN_NOT_OK(
+          MetadataEqualKeyValue(actual, expected, &metadata_equal, error));
+    }
+
+    // If we still have an inequality, add a difference.
+    if (!metadata_equal) {
+      differences_.push_back({path, actual_json, expected_json});
+    }
+
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode MetadataEqualKeyValue(const char* actual, const char* 
expected,
+                                       bool* out, ArrowError* error) {
+    std::unordered_map<std::string, std::string> actual_map, expected_map;
+    NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error));
+    NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error));
+
+    if (actual_map.size() != expected_map.size()) {
+      *out = false;
+      return NANOARROW_OK;
+    }
+
+    for (const auto& item : expected_map) {
+      const auto& actual_item = actual_map.find(item.first);
+      if (actual_item == actual_map.end()) {
+        *out = false;
+        return NANOARROW_OK;
+      }
+
+      if (actual_item->second != item.second) {
+        *out = false;
+        return NANOARROW_OK;
+      }
+    }
+
+    *out = true;
+    return NANOARROW_OK;
+  }
+
+  ArrowErrorCode MetadataToMap(const char* metadata,
+                               std::unordered_map<std::string, std::string>* 
out,
+                               ArrowError* error) {
+    ArrowMetadataReader reader;
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader, 
metadata), error);
+
+    ArrowStringView key, value;
+    size_t metadata_num_keys = 0;
+    while (reader.remaining_keys > 0) {
+      NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader, 
&key, &value),
+                                         error);
+      out->insert({std::string(key.data, key.size_bytes),
+                   std::string(value.data, value.size_bytes)});
+      metadata_num_keys++;
+    }
+
+    if (metadata_num_keys != out->size()) {
+      ArrowErrorSet(error,
+                    "Comparison of metadata containing duplicate keys without "
+                    "considering order is not implemented");
+      return ENOTSUP;
+    }
+
     return NANOARROW_OK;
   }
 

Reply via email to