This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 917e8e7d Update dist/ for commit
b5d2742e2d0aee71c2ca5a277169e53c335f6c43
917e8e7d is described below
commit 917e8e7dbe17efb5ba4a1e859181f6fb48aa2fa5
Author: GitHub Actions <[email protected]>
AuthorDate: Tue Apr 16 01:26:17 2024 +0000
Update dist/ for commit b5d2742e2d0aee71c2ca5a277169e53c335f6c43
---
dist/nanoarrow.c | 68 +++++++++++++-----
dist/nanoarrow_ipc.c | 30 ++++++++
dist/nanoarrow_testing.hpp | 174 ++++++++++++++++++++++++++++++++++++++-------
3 files changed, 232 insertions(+), 40 deletions(-)
diff --git a/dist/nanoarrow.c b/dist/nanoarrow.c
index d7925587..9677a0e5 100644
--- a/dist/nanoarrow.c
+++ b/dist/nanoarrow.c
@@ -430,6 +430,13 @@ ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const
struct ArrowDecimal* decim
// The most significant segment should have no leading zeroes
int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu",
(unsigned long)segments[num_segments - 1]);
+
+ // Ensure that an encoding error from snprintf() does not result
+ // in an out-of-bounds access.
+ if (n_chars < 0) {
+ return ERANGE;
+ }
+
buffer->size_bytes += n_chars;
// Subsequent output needs to be left-padded with zeroes such that each
segment
@@ -678,6 +685,10 @@ ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct
ArrowSchema* schema,
return EINVAL;
}
+ if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
+ return ERANGE;
+ }
+
buffer[n_chars] = '\0';
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, buffer));
@@ -710,6 +721,10 @@ ArrowErrorCode ArrowSchemaSetTypeDecimal(struct
ArrowSchema* schema, enum ArrowT
return EINVAL;
}
+ if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
+ return ERANGE;
+ }
+
buffer[n_chars] = '\0';
return ArrowSchemaSetFormat(schema, buffer);
}
@@ -786,7 +801,7 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct
ArrowSchema* schema, enum Arrow
return EINVAL;
}
- if (((size_t)n_chars) >= sizeof(buffer)) {
+ if (((size_t)n_chars) >= sizeof(buffer) || n_chars < 0) {
return ERANGE;
}
@@ -823,6 +838,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema*
schema, enum ArrowTyp
return EINVAL;
}
+ // Ensure that an encoding error from snprintf() does not result
+ // in an out-of-bounds access.
+ if (n_chars < 0) {
+ return ERANGE;
+ }
+
if (n_children > 0) {
n_chars = snprintf(format_cursor, format_out_size, "0");
format_cursor += n_chars;
@@ -835,6 +856,12 @@ ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema*
schema, enum ArrowTyp
}
}
+ // Ensure that an encoding error from snprintf() does not result
+ // in an out-of-bounds access.
+ if (n_chars < 0) {
+ return ERANGE;
+ }
+
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out));
NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children));
@@ -1701,6 +1728,12 @@ static int64_t ArrowSchemaTypeToStringInternal(struct
ArrowSchemaView* schema_vi
// among multiple sprintf calls.
static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last,
int64_t* n_remaining, int64_t*
n_chars) {
+  // In the unlikely event of snprintf() returning a negative value (encoding error),
+ // ensure the result won't cause an out-of-bounds access.
+ if (n_chars_last < 0) {
+ n_chars = 0;
+ }
+
*n_chars += n_chars_last;
*n_remaining -= n_chars_last;
@@ -1796,7 +1829,12 @@ int64_t ArrowSchemaToString(const struct ArrowSchema*
schema, char* out, int64_t
n_chars += snprintf(out, n, ">");
}
- return n_chars;
+  // Ensure that we always return a non-negative result
+ if (n_chars > 0) {
+ return n_chars;
+ } else {
+ return 0;
+ }
}
ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader,
@@ -2423,19 +2461,16 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct
ArrowArray* array) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- // The only buffer finalizing this currently does is make sure the data
- // buffer for (Large)String|Binary is never NULL
- switch (private_data->storage_type) {
- case NANOARROW_TYPE_BINARY:
- case NANOARROW_TYPE_STRING:
- case NANOARROW_TYPE_LARGE_BINARY:
- case NANOARROW_TYPE_LARGE_STRING:
- if (ArrowArrayBuffer(array, 2)->data == NULL) {
- NANOARROW_RETURN_NOT_OK(ArrowBufferAppendUInt8(ArrowArrayBuffer(array,
2), 0));
- }
- break;
- default:
- break;
+ for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
+ if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY
||
+ private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
+ continue;
+ }
+
+ struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i);
+ if (buffer->data == NULL) {
+ NANOARROW_RETURN_NOT_OK((ArrowBufferReserve(buffer, 1)));
+ }
}
for (int64_t i = 0; i < array->n_children; i++) {
@@ -2471,7 +2506,8 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct
ArrowArray* array,
struct ArrowError* error) {
// Even if the data buffer is size zero, the pointer value needed to be
non-null
// in some implementations (at least one version of Arrow C++ at the time
this
- // was added). Only do this fix if we can assume CPU data access.
+ // was added and C# as later discovered). Only do this fix if we can assume
+ // CPU data access.
if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) {
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array),
error);
}
diff --git a/dist/nanoarrow_ipc.c b/dist/nanoarrow_ipc.c
index 38d2b586..c125b99b 100644
--- a/dist/nanoarrow_ipc.c
+++ b/dist/nanoarrow_ipc.c
@@ -20884,6 +20884,11 @@ static int ArrowIpcDecoderSetTypeFixedSizeList(struct
ArrowSchema* schema,
char fixed_size_str[128];
int n_chars = snprintf(fixed_size_str, 128, "+w:%d", fixed_size);
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
+
fixed_size_str[n_chars] = '\0';
return ArrowIpcDecoderSetTypeSimpleNested(schema, fixed_size_str, error);
}
@@ -20942,6 +20947,11 @@ static int ArrowIpcDecoderSetTypeUnion(struct
ArrowSchema* schema,
return EINVAL;
}
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
+
if (ns(Union_typeIds_is_present(type))) {
flatbuffers_int32_vec_t type_ids = ns(Union_typeIds(type));
int64_t n_type_ids = flatbuffers_int32_vec_len(type_ids);
@@ -20960,11 +20970,21 @@ static int ArrowIpcDecoderSetTypeUnion(struct
ArrowSchema* schema,
format_cursor += n_chars;
format_out_size -= n_chars;
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
+
for (int64_t i = 1; i < n_type_ids; i++) {
n_chars = snprintf(format_cursor, format_out_size, ",%d",
(int)flatbuffers_int32_vec_at(type_ids, i));
format_cursor += n_chars;
format_out_size -= n_chars;
+
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
}
}
} else if (n_children > 0) {
@@ -20972,10 +20992,20 @@ static int ArrowIpcDecoderSetTypeUnion(struct
ArrowSchema* schema,
format_cursor += n_chars;
format_out_size -= n_chars;
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
+
for (int64_t i = 1; i < n_children; i++) {
n_chars = snprintf(format_cursor, format_out_size, ",%d", (int)i);
format_cursor += n_chars;
format_out_size -= n_chars;
+
+ if (n_chars < 0) {
+ ArrowErrorSet(error, "snprintf() encoding error");
+ return ERANGE;
+ }
}
}
diff --git a/dist/nanoarrow_testing.hpp b/dist/nanoarrow_testing.hpp
index a7602b70..f7d2da4f 100644
--- a/dist/nanoarrow_testing.hpp
+++ b/dist/nanoarrow_testing.hpp
@@ -138,7 +138,7 @@ class DictionaryContext {
/// \brief Writer for the Arrow integration testing JSON format
class TestingJSONWriter {
public:
- TestingJSONWriter() : float_precision_(-1) {}
+ TestingJSONWriter() : float_precision_(-1), include_metadata_(true) {}
/// \brief Set the floating point precision of the writer
///
@@ -146,7 +146,12 @@ class TestingJSONWriter {
/// to encode the value in the output. When writing files specifically for
/// integration tests, floating point values should be rounded to 3 decimal
places to
/// avoid serialization issues.
- void set_float_precision(int precision) { float_precision_ = precision; }
+ void set_float_precision(int value) { float_precision_ = value; }
+
+ /// \brief Set whether metadata should be included in the output of a schema
or field
+ ///
+ /// Use false to skip writing schema/field metadata in the output.
+ void set_include_metadata(bool value) { include_metadata_ = value; }
void ResetDictionaries() { dictionaries_.clear(); }
@@ -227,7 +232,7 @@ class TestingJSONWriter {
}
// Write metadata
- if (schema->metadata != nullptr) {
+ if (ShouldWriteMetadata(schema->metadata)) {
out << R"(, "metadata": )";
NANOARROW_RETURN_NOT_OK(WriteMetadata(out, schema->metadata));
}
@@ -293,7 +298,7 @@ class TestingJSONWriter {
}
// Write metadata
- if (field->metadata != nullptr) {
+ if (ShouldWriteMetadata(field->metadata)) {
out << R"(, "metadata": )";
NANOARROW_RETURN_NOT_OK(WriteMetadata(out, field->metadata));
}
@@ -494,8 +499,13 @@ class TestingJSONWriter {
private:
int float_precision_;
+ bool include_metadata_;
internal::DictionaryContext dictionaries_;
+ bool ShouldWriteMetadata(const char* metadata) {
+ return metadata != nullptr && include_metadata_;
+ }
+
ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t
dictionary_id) {
const internal::Dictionary& dict = dictionaries_.Get(dictionary_id);
out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )"
@@ -2023,8 +2033,7 @@ class TestingJSONReader {
buffer_view->data.as_uint8 = buffer->data;
buffer_view->size_bytes = buffer->size_bytes;
- // If this is a validity buffer with a big enough size, set the
array_view's
- // null_count
+ // If this is a validity buffer, set the null_count
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY
&&
_ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
array_view->null_count =
@@ -2033,6 +2042,11 @@ class TestingJSONReader {
}
}
+ // The null type doesn't have any buffers but we can set the null_count
+ if (array_view->storage_type == NANOARROW_TYPE_NA) {
+ array_view->null_count = array_view->length;
+ }
+
// If there is a dictionary associated with schema, parse its value into
dictionary
if (schema->dictionary != nullptr) {
NANOARROW_RETURN_NOT_OK(Check(
@@ -2051,9 +2065,10 @@ class TestingJSONReader {
ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL,
error), error,
error_prefix + "failed to validate: "));
- // Flush length and buffer pointers to the Array
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(
- ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE,
nullptr), error);
+ // Flush length and buffer pointers to the Array. This also ensures that
buffers
+ // are not NULL (matters for some versions of some implementations).
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinishBuildingDefault(array,
nullptr),
+ error);
array->length = array_view->length;
array->null_count = array_view->null_count;
@@ -2178,6 +2193,11 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(
Check(value.is_array(), error, "bitmap buffer must be array"));
+ // Reserving with the exact length ensures that the last bits are always
zeroed.
+ // This was an assumption made by the C# implementation at the time this
was
+ // implemented.
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapReserve(bitmap,
value.size()), error);
+
for (const auto& item : value) {
// Some example files write bitmaps as [true, false, true] but the
documentation
// says [1, 0, 1]. Accept both for simplicity.
@@ -2505,6 +2525,35 @@ class TestingJSONComparison {
};
public:
+ TestingJSONComparison() : compare_batch_flags_(true),
compare_metadata_order_(true) {
+ // We do our own metadata comparison
+ writer_actual_.set_include_metadata(false);
+ writer_expected_.set_include_metadata(false);
+ }
+
+ /// \brief Compare top-level RecordBatch flags (e.g., nullability)
+ ///
+ /// Some Arrow implementations export batches as nullable, and some export
them as
+ /// non-nullable. Use false to consider these two types of batches as
equivalent.
+ void set_compare_batch_flags(bool value) { compare_batch_flags_ = value; }
+
+ /// \brief Compare metadata order
+ ///
+ /// Some Arrow implementations store metadata using structures (e.g., hash
map) that
+ /// reorder metadata items. Use false to consider metadata whose keys/values
have
+ /// been reordered as equivalent.
+ void set_compare_metadata_order(bool value) { compare_metadata_order_ =
value; }
+
+ /// \brief Set float precision
+ ///
+ /// The Arrow Integration Testing JSON document states that values should be
compared
+ /// to 3 decimal places to avoid floating point serialization issues. Use -1
to specify
+ /// that all decimal places should be used (the default).
+ void set_compare_float_precision(int value) {
+ writer_actual_.set_float_precision(value);
+ writer_expected_.set_float_precision(value);
+ }
+
/// \brief Returns the number of differences found by the previous call
int64_t num_differences() const { return differences_.size(); }
@@ -2619,7 +2668,7 @@ class TestingJSONComparison {
// (Purposefully ignore the name field at the top level)
// Compare flags
- if (actual->flags != expected->flags) {
+ if (compare_batch_flags_ && actual->flags != expected->flags) {
differences_.push_back({path,
std::string(".flags: ") +
std::to_string(actual->flags),
std::string(".flags: ") +
std::to_string(expected->flags)});
@@ -2639,20 +2688,8 @@ class TestingJSONComparison {
}
// Compare metadata
- std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss,
actual->metadata),
- error);
- std::string actual_metadata = ss.str();
-
- ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(
- writer_expected_.WriteMetadata(ss, expected->metadata), error);
- std::string expected_metadata = ss.str();
-
- if (actual_metadata != expected_metadata) {
- differences_.push_back({path, std::string(".metadata: ") +
actual_metadata,
- std::string(".metadata: ") + expected_metadata});
- }
+ NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata,
expected->metadata, error,
+ path + std::string(".metadata")));
return NANOARROW_OK;
}
@@ -2722,6 +2759,10 @@ class TestingJSONComparison {
nanoarrow::UniqueArrayView actual_;
nanoarrow::UniqueArrayView expected_;
+ // Comparison options
+ bool compare_batch_flags_;
+ bool compare_metadata_order_;
+
ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected,
ArrowError* error, const std::string& path = "")
{
// Preprocess both fields such that map types have canonical names
@@ -2753,6 +2794,91 @@ class TestingJSONComparison {
differences_.push_back({path, actual_json, expected_json});
}
+ NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata,
expected->metadata, error,
+ path + std::string(".metadata")));
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode CompareMetadata(const char* actual, const char* expected,
+ ArrowError* error, const std::string& path =
"") {
+ std::stringstream ss;
+
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss,
actual), error);
+ std::string actual_json = ss.str();
+
+ ss.str("");
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss,
expected),
+ error);
+ std::string expected_json = ss.str();
+
+ bool metadata_equal = actual_json == expected_json;
+
+ // If there is a difference in the rendered JSON but we aren't being
strict about
+ // order, check again using the KeyValue comparison.
+ if (!metadata_equal && !compare_metadata_order_) {
+ NANOARROW_RETURN_NOT_OK(
+ MetadataEqualKeyValue(actual, expected, &metadata_equal, error));
+ }
+
+ // If we still have an inequality, add a difference.
+ if (!metadata_equal) {
+ differences_.push_back({path, actual_json, expected_json});
+ }
+
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode MetadataEqualKeyValue(const char* actual, const char*
expected,
+ bool* out, ArrowError* error) {
+ std::unordered_map<std::string, std::string> actual_map, expected_map;
+ NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error));
+ NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error));
+
+ if (actual_map.size() != expected_map.size()) {
+ *out = false;
+ return NANOARROW_OK;
+ }
+
+ for (const auto& item : expected_map) {
+ const auto& actual_item = actual_map.find(item.first);
+ if (actual_item == actual_map.end()) {
+ *out = false;
+ return NANOARROW_OK;
+ }
+
+ if (actual_item->second != item.second) {
+ *out = false;
+ return NANOARROW_OK;
+ }
+ }
+
+ *out = true;
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode MetadataToMap(const char* metadata,
+ std::unordered_map<std::string, std::string>*
out,
+ ArrowError* error) {
+ ArrowMetadataReader reader;
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader,
metadata), error);
+
+ ArrowStringView key, value;
+ size_t metadata_num_keys = 0;
+ while (reader.remaining_keys > 0) {
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader,
&key, &value),
+ error);
+ out->insert({std::string(key.data, key.size_bytes),
+ std::string(value.data, value.size_bytes)});
+ metadata_num_keys++;
+ }
+
+ if (metadata_num_keys != out->size()) {
+ ArrowErrorSet(error,
+ "Comparison of metadata containing duplicate keys without "
+ "considering order is not implemented");
+ return ENOTSUP;
+ }
+
return NANOARROW_OK;
}