This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 89b5932 Add metadata builder functions (#12)
89b5932 is described below
commit 89b59322fe29dc26e4792039a219252622c3a95c
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Aug 9 13:25:31 2022 -0300
Add metadata builder functions (#12)
* add metadata builder functions
* don't copy existing metadata unless needed
* better comments in tests
* fix buffer change
* test builder from existing string, error for null key/value input
* All metadata values are StringViews
* less annoying getvalue
* less annoying string view from const char
* move the string view + helper to the inlined typedefs
* everything is a stringview
* don't use NULL in header
* more consistent inline function definition for ArrowCharView()
---
src/nanoarrow/metadata.c | 178 +++++++++++++++++++++++++++++++++++-----
src/nanoarrow/metadata_test.cc | 89 ++++++++++++++++++--
src/nanoarrow/nanoarrow.h | 46 +++++++----
src/nanoarrow/schema_view.c | 6 +-
src/nanoarrow/typedefs_inline.h | 13 +++
src/nanoarrow/utils_inline.h | 46 +++++++++++
6 files changed, 334 insertions(+), 44 deletions(-)
diff --git a/src/nanoarrow/metadata.c b/src/nanoarrow/metadata.c
index 123a8d8..2f24cbc 100644
--- a/src/nanoarrow/metadata.c
+++ b/src/nanoarrow/metadata.c
@@ -84,29 +84,22 @@ int64_t ArrowMetadataSizeOf(const char* metadata) {
return size;
}
-ArrowErrorCode ArrowMetadataGetValue(const char* metadata, const char* key,
- const char* default_value,
- struct ArrowStringView* value_out) {
- struct ArrowStringView target_key_view = {key, strlen(key)};
- value_out->data = default_value;
- if (default_value != NULL) {
- value_out->n_bytes = strlen(default_value);
- } else {
- value_out->n_bytes = 0;
- }
-
+static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata,
+ struct ArrowStringView*
key,
+ struct ArrowStringView*
value_out) {
struct ArrowMetadataReader reader;
- struct ArrowStringView key_view;
- struct ArrowStringView value;
+ struct ArrowStringView existing_key;
+ struct ArrowStringView existing_value;
ArrowMetadataReaderInit(&reader, metadata);
int64_t size = sizeof(int32_t);
- while (ArrowMetadataReaderRead(&reader, &key_view, &value) == NANOARROW_OK) {
- int key_equal = target_key_view.n_bytes == key_view.n_bytes &&
- strncmp(target_key_view.data, key_view.data,
key_view.n_bytes) == 0;
+ while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) ==
+ NANOARROW_OK) {
+ int key_equal = key->n_bytes == existing_key.n_bytes &&
+ strncmp(key->data, existing_key.data,
existing_key.n_bytes) == 0;
if (key_equal) {
- value_out->data = value.data;
- value_out->n_bytes = value.n_bytes;
+ value_out->data = existing_value.data;
+ value_out->n_bytes = existing_value.n_bytes;
break;
}
}
@@ -114,8 +107,151 @@ ArrowErrorCode ArrowMetadataGetValue(const char*
metadata, const char* key,
return NANOARROW_OK;
}
-char ArrowMetadataHasKey(const char* metadata, const char* key) {
- struct ArrowStringView value;
- ArrowMetadataGetValue(metadata, key, NULL, &value);
+ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct
ArrowStringView key,
+ struct ArrowStringView* value_out) {
+ if (value_out == NULL) {
+ return EINVAL;
+ }
+
+ return ArrowMetadataGetValueInternal(metadata, &key, value_out);
+}
+
+char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) {
+ struct ArrowStringView value = ArrowCharView(NULL);
+ ArrowMetadataGetValue(metadata, key, &value);
return value.data != NULL;
}
+
+ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer,
+ const char* metadata) {
+ ArrowBufferInit(buffer);
+ int result = ArrowBufferAppend(buffer, metadata,
ArrowMetadataSizeOf(metadata));
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+
+ return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer*
buffer,
+ struct
ArrowStringView* key,
+ struct
ArrowStringView* value) {
+ if (value == NULL) {
+ return NANOARROW_OK;
+ }
+
+ int result;
+
+ if (buffer->capacity_bytes == 0) {
+ int32_t zero = 0;
+ result = ArrowBufferAppend(buffer, &zero, sizeof(int32_t));
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+ }
+
+ if (buffer->capacity_bytes < sizeof(int32_t)) {
+ return EINVAL;
+ }
+
+ int32_t n_keys;
+ memcpy(&n_keys, buffer->data, sizeof(int32_t));
+
+ int32_t key_size = key->n_bytes;
+ int32_t value_size = value->n_bytes;
+ result = ArrowBufferReserve(buffer,
+ sizeof(int32_t) + key_size + sizeof(int32_t) +
value_size);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+
+ ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t));
+ ArrowBufferAppendUnsafe(buffer, key->data, key_size);
+ ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t));
+ ArrowBufferAppendUnsafe(buffer, value->data, value_size);
+
+ n_keys++;
+ memcpy(buffer->data, &n_keys, sizeof(int32_t));
+
+ return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer*
buffer,
+ struct ArrowStringView*
key,
+ struct ArrowStringView*
value) {
+ // Inspect the current value to see if we can avoid copying the buffer
+ struct ArrowStringView current_value = ArrowCharView(NULL);
+ int result =
+ ArrowMetadataGetValueInternal((const char*)buffer->data, key,
&current_value);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+
+ // The key should be removed but no key exists
+ if (value == NULL && current_value.data == NULL) {
+ return NANOARROW_OK;
+ }
+
+ // The key/value can be appended because no key exists
+ if (value != NULL && current_value.data == NULL) {
+ return ArrowMetadataBuilderAppendInternal(buffer, key, value);
+ }
+
+ struct ArrowMetadataReader reader;
+ struct ArrowStringView existing_key;
+ struct ArrowStringView existing_value;
+ result = ArrowMetadataReaderInit(&reader, (const char*)buffer->data);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+
+ struct ArrowBuffer new_buffer;
+ result = ArrowMetadataBuilderInit(&new_buffer, NULL);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+
+ while (reader.remaining_keys > 0) {
+ result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&new_buffer);
+ return result;
+ }
+
+ if (key->n_bytes == existing_key.n_bytes &&
+ strncmp((const char*)key->data, (const char*)existing_key.data,
+ existing_key.n_bytes) == 0) {
+ result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value);
+ value = NULL;
+ } else {
+ result =
+ ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key,
&existing_value);
+ }
+
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&new_buffer);
+ return result;
+ }
+ }
+
+ ArrowBufferReset(buffer);
+ ArrowBufferMove(&new_buffer, buffer);
+ return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+ struct ArrowStringView key,
+ struct ArrowStringView value) {
+ return ArrowMetadataBuilderAppendInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+ struct ArrowStringView key,
+ struct ArrowStringView value) {
+ return ArrowMetadataBuilderSetInternal(buffer, &key, &value);
+}
+
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+ struct ArrowStringView key) {
+ return ArrowMetadataBuilderSetInternal(buffer, &key, NULL);
+}
diff --git a/src/nanoarrow/metadata_test.cc b/src/nanoarrow/metadata_test.cc
index 5ac959c..c6b47f0 100644
--- a/src/nanoarrow/metadata_test.cc
+++ b/src/nanoarrow/metadata_test.cc
@@ -25,7 +25,7 @@
using namespace arrow;
-TEST(SchemaTest, Metadata) {
+TEST(MetadataTest, Metadata) {
// (test will only work on little endian)
char simple_metadata[] = {'\1', '\0', '\0', '\0', '\3', '\0', '\0', '\0',
'k', 'e',
'y', '\5', '\0', '\0', '\0', 'v', 'a', 'l',
'u', 'e'};
@@ -33,14 +33,91 @@ TEST(SchemaTest, Metadata) {
EXPECT_EQ(ArrowMetadataSizeOf(nullptr), 0);
EXPECT_EQ(ArrowMetadataSizeOf(simple_metadata), sizeof(simple_metadata));
- EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, "key"), 1);
- EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, "not_a_key"), 0);
+ EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, ArrowCharView("key")), 1);
+ EXPECT_EQ(ArrowMetadataHasKey(simple_metadata, ArrowCharView("not_a_key")),
0);
- struct ArrowStringView value;
- EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, "key", "default_val",
&value),
+ struct ArrowStringView value = ArrowCharView("default_val");
+ EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, ArrowCharView("key"),
&value),
NANOARROW_OK);
EXPECT_EQ(std::string(value.data, value.n_bytes), "value");
- EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, "not_a_key", "default_val",
&value),
+
+ value = ArrowCharView("default_val");
+ EXPECT_EQ(ArrowMetadataGetValue(simple_metadata, ArrowCharView("not_a_key"),
&value),
NANOARROW_OK);
EXPECT_EQ(std::string(value.data, value.n_bytes), "default_val");
}
+
+TEST(MetadataTest, MetadataBuild) {
+ // (test will only work on little endian)
+ char simple_metadata[] = {'\1', '\0', '\0', '\0', '\3', '\0', '\0', '\0',
'k', 'e',
+ 'y', '\5', '\0', '\0', '\0', 'v', 'a', 'l',
'u', 'e'};
+
+ // Metadata builder from copy
+ struct ArrowBuffer metadata_builder;
+ ASSERT_EQ(ArrowMetadataBuilderInit(&metadata_builder, simple_metadata),
NANOARROW_OK);
+ EXPECT_EQ(metadata_builder.size_bytes, sizeof(simple_metadata));
+ EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata,
metadata_builder.size_bytes),
+ 0);
+ ArrowBufferReset(&metadata_builder);
+
+ // Empty metadata
+ ASSERT_EQ(ArrowMetadataBuilderInit(&metadata_builder, nullptr),
NANOARROW_OK);
+ EXPECT_EQ(metadata_builder.size_bytes, 0);
+ EXPECT_EQ(metadata_builder.data, nullptr);
+
+ // Recreate simple_metadata
+ ASSERT_EQ(ArrowMetadataBuilderAppend(&metadata_builder, ArrowCharView("key"),
+ ArrowCharView("value")),
+ NANOARROW_OK);
+ ASSERT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata));
+ EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata,
metadata_builder.size_bytes),
+ 0);
+
+ // Remove a key that doesn't exist
+ ASSERT_EQ(ArrowMetadataBuilderRemove(&metadata_builder,
ArrowCharView("key2")),
+ NANOARROW_OK);
+ ASSERT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata));
+ EXPECT_EQ(memcmp(metadata_builder.data, simple_metadata,
metadata_builder.size_bytes),
+ 0);
+
+ // Add a new key
+ ASSERT_EQ(ArrowMetadataBuilderSet(&metadata_builder, ArrowCharView("key2"),
+ ArrowCharView("value2")),
+ NANOARROW_OK);
+ EXPECT_EQ(metadata_builder.size_bytes, ArrowMetadataSizeOf(simple_metadata) +
+ sizeof(int32_t) + 4 +
sizeof(int32_t) + 6);
+
+ struct ArrowStringView value = ArrowCharView(nullptr);
+ ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+ ArrowCharView("key2"), &value),
+ NANOARROW_OK);
+ EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+ // Set an existing key
+ ASSERT_EQ(ArrowMetadataBuilderSet(&metadata_builder, ArrowCharView("key"),
+ ArrowCharView("value3")),
+ NANOARROW_OK);
+ value = ArrowCharView(nullptr);
+ ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+ ArrowCharView("key"), &value),
+ NANOARROW_OK);
+ EXPECT_EQ(std::string(value.data, value.n_bytes), "value3");
+ value = ArrowCharView(nullptr);
+ ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+ ArrowCharView("key2"), &value),
+ NANOARROW_OK);
+ EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+ // Remove a key that does exist
+ ASSERT_EQ(ArrowMetadataBuilderRemove(&metadata_builder,
ArrowCharView("key")),
+ NANOARROW_OK);
+ EXPECT_EQ(ArrowMetadataHasKey((const char*)metadata_builder.data,
ArrowCharView("key")),
+ false);
+ value = ArrowCharView(nullptr);
+ ASSERT_EQ(ArrowMetadataGetValue((const char*)metadata_builder.data,
+ ArrowCharView("key2"), &value),
+ NANOARROW_OK);
+ EXPECT_EQ(std::string(value.data, value.n_bytes), "value2");
+
+ ArrowBufferReset(&metadata_builder);
+}
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 8958951..a6dbe89 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -87,18 +87,8 @@ const char* ArrowErrorMessage(struct ArrowError* error);
/// \defgroup nanoarrow-utils Utility data structures
-/// \brief An non-owning view of a string
-struct ArrowStringView {
- /// \brief A pointer to the start of the string
- ///
- /// If n_bytes is 0, this value may be NULL.
- const char* data;
-
- /// \brief The size of the string in bytes,
- ///
- /// (Not including the null terminator.)
- int64_t n_bytes;
-};
+/// \brief Create a string view from a null-terminated string
+static inline struct ArrowStringView ArrowCharView(const char* value);
/// \brief Arrow time unit enumerator
///
@@ -207,13 +197,38 @@ ArrowErrorCode ArrowMetadataReaderRead(struct
ArrowMetadataReader* reader,
int64_t ArrowMetadataSizeOf(const char* metadata);
/// \brief Check for a key in schema metadata
-char ArrowMetadataHasKey(const char* metadata, const char* key);
+char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key);
/// \brief Extract a value from schema metadata
-ArrowErrorCode ArrowMetadataGetValue(const char* metadata, const char* key,
- const char* default_value,
+///
+/// If key does not exist in metadata, value_out is unmodified
+ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct
ArrowStringView key,
struct ArrowStringView* value_out);
+/// \brief Initialize a builder for schema metadata from key/value pairs
+///
+/// metadata can be an existing metadata string or NULL to initialize
+/// an empty metadata string.
+ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer, const
char* metadata);
+
+/// \brief Append a key/value pair to a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
+ struct ArrowStringView key,
+ struct ArrowStringView value);
+
+/// \brief Set a key/value pair in a buffer containing serialized metadata
+///
+/// Ensures that the only entry for key in the metadata is set to value.
+/// This function maintains the existing position of (the first instance of)
+/// key if present in the data.
+ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
+ struct ArrowStringView key,
+ struct ArrowStringView value);
+
+/// \brief Remove a key from a buffer containing serialized metadata
+ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
+ struct ArrowStringView key);
+
/// }@
/// \defgroup nanoarrow-schema-view Schema consumer helpers
@@ -498,6 +513,7 @@ ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray*
array, int64_t i,
// Inline function definitions
#include "bitmap_inline.h"
#include "buffer_inline.h"
+#include "utils_inline.h"
#ifdef __cplusplus
}
diff --git a/src/nanoarrow/schema_view.c b/src/nanoarrow/schema_view.c
index 54d586a..7a3ca93 100644
--- a/src/nanoarrow/schema_view.c
+++ b/src/nanoarrow/schema_view.c
@@ -668,9 +668,11 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView*
schema_view,
}
}
- ArrowMetadataGetValue(schema->metadata, "ARROW:extension:name", NULL,
+ schema_view->extension_name = ArrowCharView(NULL);
+ schema_view->extension_metadata = ArrowCharView(NULL);
+ ArrowMetadataGetValue(schema->metadata,
ArrowCharView("ARROW:extension:name"),
&schema_view->extension_name);
- ArrowMetadataGetValue(schema->metadata, "ARROW:extension:metadata", NULL,
+ ArrowMetadataGetValue(schema->metadata,
ArrowCharView("ARROW:extension:metadata"),
&schema_view->extension_metadata);
return NANOARROW_OK;
diff --git a/src/nanoarrow/typedefs_inline.h b/src/nanoarrow/typedefs_inline.h
index c04f909..5aca1ec 100644
--- a/src/nanoarrow/typedefs_inline.h
+++ b/src/nanoarrow/typedefs_inline.h
@@ -166,6 +166,19 @@ enum ArrowType {
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
};
+/// \brief A non-owning view of a string
+struct ArrowStringView {
+ /// \brief A pointer to the start of the string
+ ///
+ /// If n_bytes is 0, this value may be NULL.
+ const char* data;
+
+ /// \brief The size of the string in bytes,
+ ///
+ /// (Not including the null terminator.)
+ int64_t n_bytes;
+};
+
/// \brief Array buffer allocation and deallocation
///
/// Container for allocate, reallocate, and free methods that can be used
diff --git a/src/nanoarrow/utils_inline.h b/src/nanoarrow/utils_inline.h
new file mode 100644
index 0000000..4c61555
--- /dev/null
+++ b/src/nanoarrow/utils_inline.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_UTILS_INLINE_H_INCLUDED
+#define NANOARROW_UTILS_INLINE_H_INCLUDED
+
+#include <string.h>
+
+#include "typedefs_inline.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline struct ArrowStringView ArrowCharView(const char* value) {
+ struct ArrowStringView out;
+
+ out.data = value;
+ if (value) {
+ out.n_bytes = (int64_t)strlen(value);
+ } else {
+ out.n_bytes = 0;
+ }
+
+ return out;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif