Copilot commented on code in PR #738:
URL: https://github.com/apache/arrow-nanoarrow/pull/738#discussion_r2892133911
##########
src/nanoarrow/nanoarrow_ipc.h:
##########
@@ -307,6 +316,9 @@ struct ArrowIpcDecoder {
/// \brief The number of bytes in the forthcoming body message.
int64_t body_size_bytes;
+ /// \brief The last decoded DictionaryBatch
+ struct ArrowIpcDictionaryBatch* dictionary;
Review Comment:
This pointer refers to decoder-owned internal storage (set to a private
struct in `decoder.c`). Consider making it `const struct
ArrowIpcDictionaryBatch*` to discourage callers from mutating internal decoder
state.
```suggestion
const struct ArrowIpcDictionaryBatch* dictionary;
```
##########
src/nanoarrow/nanoarrow_ipc.h:
##########
@@ -307,6 +316,9 @@ struct ArrowIpcDecoder {
/// \brief The number of bytes in the forthcoming body message.
int64_t body_size_bytes;
+ /// \brief The last decoded DictionaryBatch
+ struct ArrowIpcDictionaryBatch* dictionary;
+
/// \brief The last decoded Footer
///
Review Comment:
The comment doesn’t describe lifetime/ownership. Since this points to
decoder-owned memory, please document that it is valid only until the next
`ArrowIpcDecoderDecodeHeader()` call (or `ArrowIpcDecoderReset()`), and that
callers must not free it.
```suggestion
/// \brief The last decoded DictionaryBatch
///
/// The returned pointer is owned and managed by the decoder and remains
/// valid only until the next call to ArrowIpcDecoderDecodeHeader() or
/// ArrowIpcDecoderReset(). Callers must not free or modify the underlying
/// memory.
struct ArrowIpcDictionaryBatch* dictionary;
/// \brief The last decoded Footer
///
/// The returned pointer is owned and managed by the decoder and remains
/// valid only until the next call to ArrowIpcDecoderDecodeHeader() or
/// ArrowIpcDecoderReset(). Callers must not free or modify the underlying
/// memory.
///
```
##########
src/nanoarrow/ipc/decoder_test.cc:
##########
@@ -159,6 +161,51 @@ alignas(8) static uint8_t
kSimpleRecordBatchUncompressible[] = {
0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00,
0x00, 0x00, 0x00, 0x00};
+static uint8_t kDictionarySchema[] = {
+ 0xff, 0xff, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
0x00, 0x00,
+ 0x0a, 0x00, 0x0e, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0a, 0x00,
0x00, 0x00,
+ 0x00, 0x01, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00,
0x0c, 0x00,
+ 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0xb0, 0x00,
0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
0x08, 0x00,
+ 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x8c, 0x00,
0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x41, 0x0a, 0x33, 0x0a,
0x32, 0x36,
+ 0x33, 0x31, 0x37, 0x30, 0x0a, 0x31, 0x39, 0x37, 0x38, 0x38, 0x38, 0x0a,
0x35, 0x0a,
+ 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0a, 0x35, 0x33, 0x31, 0x0a, 0x31, 0x0a,
0x35, 0x33,
+ 0x31, 0x0a, 0x31, 0x0a, 0x32, 0x35, 0x34, 0x0a, 0x31, 0x30, 0x32, 0x36,
0x0a, 0x31,
+ 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35, 0x33, 0x0a, 0x35, 0x0a, 0x6e, 0x61,
0x6d, 0x65,
+ 0x73, 0x0a, 0x31, 0x36, 0x0a, 0x31, 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35,
0x33, 0x0a,
+ 0x38, 0x0a, 0x73, 0x6f, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6c, 0x0a, 0x32,
0x35, 0x34,
+ 0x0a, 0x31, 0x30, 0x32, 0x36, 0x0a, 0x35, 0x31, 0x31, 0x0a, 0x31, 0x36,
0x0a, 0x31,
+ 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35, 0x33, 0x0a, 0x37, 0x0a, 0x63, 0x6f,
0x6c, 0x75,
+ 0x6d, 0x6e, 0x73, 0x0a, 0x32, 0x35, 0x34, 0x0a, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00,
+ 0x72, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
0x10, 0x00,
+ 0x18, 0x00, 0x08, 0x00, 0x06, 0x00, 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00,
0x14, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x05, 0x14, 0x00, 0x00, 0x00,
0x48, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00,
+ 0x08, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6c,
0x00, 0x00,
+ 0x00, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00,
0x00, 0x00,
+ 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x07, 0x00,
0x08, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00,
0x04, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+static uint8_t kDictionaryBatch[] = {
Review Comment:
These FlatBuffers test payloads are not explicitly aligned (unlike other
test buffers in this file that use `alignas(8)`). If the decoder or FlatBuffers
accessors perform aligned reads via pointer casts, this can cause undefined
behavior on strict-alignment architectures. Consider adding `alignas(8)` to
these buffers (or otherwise guaranteeing safe unaligned access).
##########
src/nanoarrow/ipc/decoder.c:
##########
@@ -823,17 +825,132 @@ static int ArrowIpcDecoderSetType(struct ArrowSchema*
schema, ns(Field_table_t)
}
}
+// A fun corner case when decoding dictionaries: the extension metadata lives
with
+// the dictionary (i.e., the non-index type); however, the field metadata still
+// needs to exist on the field.
+static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(
+ struct ArrowSchema* schema) {
+ NANOARROW_DCHECK(schema->dictionary != NULL);
+ struct ArrowMetadataReader reader;
+ NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader,
schema->dictionary->metadata));
+
+ // For the most common case (no metadata), nothing needs to be done here
+ if (reader.remaining_keys == 0) {
+ return NANOARROW_OK;
+ }
+
+ struct ArrowBuffer field_metadata;
+ struct ArrowBuffer extension_metadata;
+ NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&field_metadata, NULL));
+ ArrowErrorCode result = ArrowMetadataBuilderInit(&extension_metadata, NULL);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&field_metadata);
+ return result;
+ }
+
+ const struct ArrowStringView extension_name_key =
ArrowCharView("ARROW:extension:name");
+ const struct ArrowStringView extension_metadata_key =
+ ArrowCharView("ARROW:extension:metadata");
+
+ struct ArrowStringView key;
+ struct ArrowStringView value;
+ while (reader.remaining_keys > 0) {
+ result = ArrowMetadataReaderRead(&reader, &key, &value);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&field_metadata);
+ ArrowBufferReset(&extension_metadata);
+ return result;
+ }
+
+ int key_is_extension_name =
+ key.size_bytes == extension_name_key.size_bytes &&
+ strncmp(key.data, extension_name_key.data, key.size_bytes) == 0;
+ int key_is_extension_metadata =
+ key.size_bytes == extension_metadata_key.size_bytes &&
+ strncmp(key.data, extension_metadata_key.data, key.size_bytes) == 0;
+ if (!key_is_extension_name && !key_is_extension_metadata) {
+ result = ArrowMetadataBuilderAppend(&field_metadata, key, value);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&field_metadata);
+ ArrowBufferReset(&extension_metadata);
+ return result;
+ }
+ } else {
+ result = ArrowMetadataBuilderAppend(&extension_metadata, key, value);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&field_metadata);
+ ArrowBufferReset(&extension_metadata);
+ return result;
+ }
+ }
+ }
+
+ result = ArrowSchemaSetMetadata(schema, (char*)field_metadata.data);
+ if (result != NANOARROW_OK) {
+ ArrowBufferReset(&field_metadata);
+ ArrowBufferReset(&extension_metadata);
+ return result;
+ }
+
+ result = ArrowSchemaSetMetadata(schema->dictionary,
(char*)extension_metadata.data);
+ ArrowBufferReset(&field_metadata);
+ ArrowBufferReset(&extension_metadata);
+
+ return result;
+}
+
+static int ArrowIpcSetDictionaryEncoding(
+ struct ArrowSchema* schema, ns(DictionaryEncoding_table_t
dictionary_encoding),
+ struct ArrowError* error) {
+ switch (
+
org_apache_arrow_flatbuf_DictionaryEncoding_dictionaryKind(dictionary_encoding))
{
+ case ns(DictionaryKind_DenseArray):
+ break;
+ default:
+ ArrowErrorSet(error, "Uexpected value for DictionaryKind");
Review Comment:
Typo in error message: change 'Uexpected' to 'Unexpected' to keep
diagnostics professional and searchable.
```suggestion
ArrowErrorSet(error, "Unexpected value for DictionaryKind");
```
##########
src/nanoarrow/ipc/decoder_test.cc:
##########
@@ -159,6 +161,51 @@ alignas(8) static uint8_t
kSimpleRecordBatchUncompressible[] = {
0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00,
0x00, 0x00, 0x00, 0x00};
+static uint8_t kDictionarySchema[] = {
Review Comment:
These FlatBuffers test payloads are not explicitly aligned (unlike other
test buffers in this file that use `alignas(8)`). If the decoder or FlatBuffers
accessors perform aligned reads via pointer casts, this can cause undefined
behavior on strict-alignment architectures. Consider adding `alignas(8)` to
these buffers (or otherwise guaranteeing safe unaligned access).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]