(arrow-nanoarrow) branch main updated: feat: add Footer decoding (#598)

bkietz Fri, 30 Aug 2024 12:22:51 -0700

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new cf388965 feat: add Footer decoding (#598)
cf388965 is described below

commit cf38896523c2407cc021f552b73cccd8f57dea83
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Fri Aug 30 14:22:42 2024 -0500

    feat: add Footer decoding (#598)
    
    - Adds ArrowIpcDecoderPeekFooter(), ArrowIpcDecoderVerifyFooter(), and
    ArrowIpcDecoderDecodeFooter()
    - Uses these to read IPC files in the integration test executable
---
 src/nanoarrow/integration/ipc_integration.cc |  66 ++++++++---
 src/nanoarrow/ipc/decoder.c                  | 167 +++++++++++++++++++++++----
 src/nanoarrow/ipc/decoder_test.cc            |  91 +++++++++++++++
 src/nanoarrow/ipc/encoder.c                  |   6 +-
 src/nanoarrow/nanoarrow_ipc.h                |  69 ++++++++++-
 5 files changed, 352 insertions(+), 47 deletions(-)

diff --git a/src/nanoarrow/integration/ipc_integration.cc 
b/src/nanoarrow/integration/ipc_integration.cc
index b2a092bd..84f12ee5 100644
--- a/src/nanoarrow/integration/ipc_integration.cc
+++ b/src/nanoarrow/integration/ipc_integration.cc
@@ -95,6 +95,9 @@ int main(int argc, char** argv) try {
 }
 
 struct File {
+  File(FILE* file) : file_{file} {}
+  File() = default;
+
   ~File() {
     if (file_ != nullptr) {
       fclose(file_);
@@ -166,35 +169,62 @@ struct MaterializedArrayStream {
     // Footer).
     File ipc_file;
     NANOARROW_RETURN_NOT_OK(ipc_file.open(path, "rb", error));
-    return FromIpcFile(ipc_file, error);
-  }
+    auto bytes = ipc_file.read();
 
-  ArrowErrorCode FromIpcFile(FILE* ipc_file, struct ArrowError* error) {
-    char prefix[sizeof(NANOARROW_IPC_FILE_PADDED_MAGIC)] = {};
-    if (fread(&prefix, 1, sizeof(prefix), ipc_file) < sizeof(prefix)) {
-      ArrowErrorSet(error, "Expected file of more than %lu bytes, got %ld",
-                    sizeof(prefix), ftell(ipc_file));
+    auto min_size = sizeof(NANOARROW_IPC_FILE_PADDED_MAGIC) + sizeof(int32_t) +
+                    strlen(NANOARROW_IPC_FILE_PADDED_MAGIC);
+    if (bytes.size() < min_size) {
+      ArrowErrorSet(error, "Expected file of more than %lu bytes, got %ld", 
min_size,
+                    bytes.size());
       return EINVAL;
     }
 
-    if (memcmp(&prefix, NANOARROW_IPC_FILE_PADDED_MAGIC, sizeof(prefix)) != 0) 
{
+    if (memcmp(bytes.data(), NANOARROW_IPC_FILE_PADDED_MAGIC,
+               sizeof(NANOARROW_IPC_FILE_PADDED_MAGIC)) != 0) {
       ArrowErrorSet(error, "File did not begin with 'ARROW1\\0\\0'");
       return EINVAL;
     }
 
-    nanoarrow::ipc::UniqueInputStream input_stream;
-    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-        ArrowIpcInputStreamInitFile(input_stream.get(), ipc_file,
-                                    /*close_on_release=*/false),
-        error);
+    nanoarrow::ipc::UniqueDecoder decoder;
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowIpcDecoderInit(decoder.get()), 
error);
+    NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderVerifyFooter(
+        decoder.get(), {{bytes.data()}, static_cast<int64_t>(bytes.size())}, 
error));
+    NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeFooter(
+        decoder.get(), {{bytes.data()}, static_cast<int64_t>(bytes.size())}, 
error));
 
-    nanoarrow::UniqueArrayStream array_stream;
     NANOARROW_RETURN_NOT_OK_WITH_ERROR(
-        ArrowIpcArrayStreamReaderInit(array_stream.get(), input_stream.get(),
-                                      /*options=*/nullptr),
-        error);
+        ArrowSchemaDeepCopy(&decoder->footer->schema, schema.get()), error);
+    NANOARROW_RETURN_NOT_OK(
+        ArrowIpcDecoderSetSchema(decoder.get(), &decoder->footer->schema, 
error));
+    NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+        ArrowIpcDecoderSetEndianness(decoder.get(), decoder->endianness), 
error);
+
+    nanoarrow::UniqueBuffer record_batch_blocks;
+    ArrowBufferMove(&decoder->footer->record_batch_blocks, 
record_batch_blocks.get());
+
+    for (int i = 0;
+         i < record_batch_blocks->size_bytes / sizeof(struct 
ArrowIpcFileBlock); i++) {
+      const auto& block =
+          reinterpret_cast<struct 
ArrowIpcFileBlock*>(record_batch_blocks->data)[i];
+      struct ArrowBufferView metadata_view = {
+          {bytes.data() + block.offset},
+          block.metadata_length,
+      };
+      NANOARROW_RETURN_NOT_OK(
+          ArrowIpcDecoderDecodeHeader(decoder.get(), metadata_view, error));
 
-    return From(array_stream.get(), error);
+      struct ArrowBufferView body_view = {
+          {metadata_view.data.as_uint8 + metadata_view.size_bytes},
+          block.body_length,
+      };
+      nanoarrow::UniqueArray batch;
+      NANOARROW_RETURN_NOT_OK(
+          ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, batch.get(),
+                                     NANOARROW_VALIDATION_LEVEL_FULL, error));
+      batches.push_back(std::move(batch));
+    }
+
+    return NANOARROW_OK;
   }
 
   ArrowErrorCode Write(struct ArrowIpcOutputStream* output_stream, bool 
write_file,
diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c
index 6e37e20d..f2118db2 100644
--- a/src/nanoarrow/ipc/decoder.c
+++ b/src/nanoarrow/ipc/decoder.c
@@ -56,6 +56,8 @@
 // at the beginning of every message header.
 static const int32_t kMessageHeaderPrefixSize = 8;
 
+#define NANOARROW_IPC_MAGIC "ARROW1"
+
 // Internal representation of a parsed "Field" from flatbuffers. This
 // represents a field in a depth-first walk of column arrays and their
 // children.
@@ -95,6 +97,8 @@ struct ArrowIpcDecoderPrivate {
   int64_t n_buffers;
   // A pointer to the last flatbuffers message.
   const void* last_message;
+  // Storage for a Footer
+  struct ArrowIpcFooter footer;
 };
 
 ArrowErrorCode ArrowIpcCheckRuntime(struct ArrowError* error) {
@@ -236,6 +240,7 @@ ArrowErrorCode ArrowIpcDecoderInit(struct ArrowIpcDecoder* 
decoder) {
 
   memset(private_data, 0, sizeof(struct ArrowIpcDecoderPrivate));
   private_data->system_endianness = ArrowIpcSystemEndianness();
+  ArrowIpcFooterInit(&private_data->footer);
   decoder->private_data = private_data;
   return NANOARROW_OK;
 }
@@ -256,6 +261,8 @@ void ArrowIpcDecoderReset(struct ArrowIpcDecoder* decoder) {
       private_data->n_fields = 0;
     }
 
+    ArrowIpcFooterReset(&private_data->footer);
+
     ArrowFree(private_data);
     memset(decoder, 0, sizeof(struct ArrowIpcDecoder));
   }
@@ -959,6 +966,8 @@ static inline void ArrowIpcDecoderResetHeaderInfo(struct 
ArrowIpcDecoder* decode
   decoder->codec = 0;
   decoder->header_size_bytes = 0;
   decoder->body_size_bytes = 0;
+  decoder->footer = NULL;
+  ArrowIpcFooterReset(&private_data->footer);
   private_data->last_message = NULL;
 }
 
@@ -1053,6 +1062,85 @@ ArrowErrorCode ArrowIpcDecoderVerifyHeader(struct 
ArrowIpcDecoder* decoder,
   return NANOARROW_OK;
 }
 
+ArrowErrorCode ArrowIpcDecoderPeekFooter(struct ArrowIpcDecoder* decoder,
+                                         struct ArrowBufferView data,
+                                         struct ArrowError* error) {
+  struct ArrowIpcDecoderPrivate* private_data =
+      (struct ArrowIpcDecoderPrivate*)decoder->private_data;
+
+  ArrowIpcDecoderResetHeaderInfo(decoder);
+  if (data.size_bytes < (int)strlen(NANOARROW_IPC_MAGIC) + 
(int)sizeof(int32_t)) {
+    ArrowErrorSet(error,
+                  "Expected data of at least 10 bytes but only %" PRId64
+                  " bytes are available",
+                  data.size_bytes);
+    return ESPIPE;
+  }
+
+  const char* data_end = data.data.as_char + data.size_bytes;
+  const char* magic = data_end - strlen(NANOARROW_IPC_MAGIC);
+  const char* footer_size_data = magic - sizeof(int32_t);
+
+  if (memcmp(magic, NANOARROW_IPC_MAGIC, strlen(NANOARROW_IPC_MAGIC)) != 0) {
+    ArrowErrorSet(error, "Expected file to end with ARROW1 but got %s", 
data_end);
+    return EINVAL;
+  }
+
+  int32_t footer_size;
+  memcpy(&footer_size, footer_size_data, sizeof(footer_size));
+  if (private_data->system_endianness == NANOARROW_IPC_ENDIANNESS_BIG) {
+    footer_size = bswap32(footer_size);
+  }
+
+  if (footer_size < 0) {
+    ArrowErrorSet(error, "Expected footer size > 0 but found footer size of %d 
bytes",
+                  footer_size);
+    return EINVAL;
+  }
+
+  decoder->header_size_bytes = footer_size;
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowIpcDecoderVerifyFooter(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowBufferView data,
+                                           struct ArrowError* error) {
+  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderPeekFooter(decoder, data, error));
+
+  // Check that data contains at least the entire footer (return ESPIPE to 
signal
+  // that reading more data may help).
+  int32_t footer_and_size_and_magic_size =
+      decoder->header_size_bytes + sizeof(int32_t) + 
strlen(NANOARROW_IPC_MAGIC);
+  if (data.size_bytes < footer_and_size_and_magic_size) {
+    ArrowErrorSet(error,
+                  "Expected >= %d bytes of data but only %" PRId64
+                  " bytes are in the buffer",
+                  footer_and_size_and_magic_size, data.size_bytes);
+    return ESPIPE;
+  }
+
+  const uint8_t* footer_data =
+      data.data.as_uint8 + data.size_bytes - footer_and_size_and_magic_size;
+
+  // Run flatbuffers verification
+  if (ns(Footer_verify_as_root(footer_data, decoder->header_size_bytes) !=
+         flatcc_verify_ok)) {
+    ArrowErrorSet(error, "Footer flatbuffer verification failed");
+    return EINVAL;
+  }
+
+  // Read some basic information from the message
+  ns(Footer_table_t) footer = ns(Footer_as_root(footer_data));
+  if (ns(Footer_schema(footer)) == NULL) {
+    ArrowErrorSet(error, "Footer has no schema");
+    return EINVAL;
+  }
+
+  decoder->metadata_version = ns(Footer_version(footer));
+  decoder->body_size_bytes = 0;
+  return NANOARROW_OK;
+}
+
 ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder* decoder,
                                            struct ArrowBufferView data,
                                            struct ArrowError* error) {
@@ -1126,6 +1214,29 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct 
ArrowIpcDecoder* decoder,
   return NANOARROW_OK;
 }
 
+static ArrowErrorCode ArrowIpcDecoderDecodeSchemaImpl(ns(Schema_table_t) 
schema,
+                                                      struct ArrowSchema* out,
+                                                      struct ArrowError* 
error) {
+  ArrowSchemaInit(out);
+  // Top-level batch schema is typically non-nullable
+  out->flags = 0;
+
+  ns(Field_vec_t) fields = ns(Schema_fields(schema));
+  int64_t n_fields = ns(Schema_vec_len(fields));
+
+  ArrowErrorCode result = ArrowSchemaSetTypeStruct(out, n_fields);
+  if (result != NANOARROW_OK) {
+    ArrowErrorSet(error, "Failed to allocate struct schema with %" PRId64 " 
children",
+                  n_fields);
+    return result;
+  }
+
+  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetChildren(out, fields, error));
+  NANOARROW_RETURN_NOT_OK(
+      ArrowIpcDecoderSetMetadata(out, ns(Schema_custom_metadata(schema)), 
error));
+  return NANOARROW_OK;
+}
+
 ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
                                            struct ArrowSchema* out,
                                            struct ArrowError* error) {
@@ -1138,37 +1249,47 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct 
ArrowIpcDecoder* decoder,
     return EINVAL;
   }
 
-  ns(Schema_table_t) schema = (ns(Schema_table_t))private_data->last_message;
-
-  ns(Field_vec_t) fields = ns(Schema_fields(schema));
-  int64_t n_fields = ns(Schema_vec_len(fields));
-
   struct ArrowSchema tmp;
-  ArrowSchemaInit(&tmp);
-  int result = ArrowSchemaSetTypeStruct(&tmp, n_fields);
-  if (result != NANOARROW_OK) {
-    ArrowSchemaRelease(&tmp);
-    ArrowErrorSet(error, "Failed to allocate struct schema with %" PRId64 " 
children",
-                  n_fields);
-    return result;
-  }
-
-  // Top-level batch schema is typically non-nullable
-  tmp.flags = 0;
+  ArrowErrorCode result = ArrowIpcDecoderDecodeSchemaImpl(
+      (ns(Schema_table_t))private_data->last_message, &tmp, error);
 
-  result = ArrowIpcDecoderSetChildren(&tmp, fields, error);
   if (result != NANOARROW_OK) {
     ArrowSchemaRelease(&tmp);
     return result;
   }
+  ArrowSchemaMove(&tmp, out);
+  return NANOARROW_OK;
+}
 
-  result = ArrowIpcDecoderSetMetadata(&tmp, 
ns(Schema_custom_metadata(schema)), error);
-  if (result != NANOARROW_OK) {
-    ArrowSchemaRelease(&tmp);
-    return result;
-  }
+ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowBufferView data,
+                                           struct ArrowError* error) {
+  struct ArrowIpcDecoderPrivate* private_data =
+      (struct ArrowIpcDecoderPrivate*)decoder->private_data;
 
-  ArrowSchemaMove(&tmp, out);
+  int32_t footer_and_size_and_magic_size =
+      decoder->header_size_bytes + sizeof(int32_t) + 
strlen(NANOARROW_IPC_MAGIC);
+  const uint8_t* footer_data =
+      data.data.as_uint8 + data.size_bytes - footer_and_size_and_magic_size;
+  ns(Footer_table_t) footer = ns(Footer_as_root(footer_data));
+
+  NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeSchemaImpl(
+      ns(Footer_schema(footer)), &private_data->footer.schema, error));
+
+  ns(Block_vec_t) blocks = ns(Footer_recordBatches(footer));
+  int64_t n = ns(Block_vec_len(blocks));
+  
NANOARROW_RETURN_NOT_OK(ArrowBufferResize(&private_data->footer.record_batch_blocks,
+                                            sizeof(struct ArrowIpcFileBlock) * 
n,
+                                            /*shrink_to_fit=*/0));
+  struct ArrowIpcFileBlock* record_batches =
+      (struct ArrowIpcFileBlock*)private_data->footer.record_batch_blocks.data;
+  for (int64_t i = 0; i < n; i++) {
+    record_batches[i].offset = blocks[i].offset;
+    record_batches[i].metadata_length = blocks[i].metaDataLength;
+    record_batches[i].body_length = blocks[i].bodyLength;
+  }
+
+  decoder->footer = &private_data->footer;
   return NANOARROW_OK;
 }
 
diff --git a/src/nanoarrow/ipc/decoder_test.cc 
b/src/nanoarrow/ipc/decoder_test.cc
index dd50201c..8d56db7a 100644
--- a/src/nanoarrow/ipc/decoder_test.cc
+++ b/src/nanoarrow/ipc/decoder_test.cc
@@ -48,6 +48,7 @@ struct ArrowIpcDecoderPrivate {
   struct ArrowIpcField* fields;
   int64_t n_buffers;
   const void* last_message;
+  struct ArrowIpcFooter footer;
 };
 }
 
@@ -959,6 +960,56 @@ TEST_P(ArrowSchemaParameterizedTestFixture, 
NanoarrowIpcNanoarrowSchemaRoundtrip
   EXPECT_EQ(ArrowSchemaToString(roundtripped.get()), 
ArrowSchemaToString(schema.get()));
 }
 
+TEST_P(ArrowSchemaParameterizedTestFixture, 
NanoarrowIpcNanoarrowFooterRoundtrip) {
+  using namespace nanoarrow::literals;
+  const std::shared_ptr<arrow::Schema>& arrow_schema = GetParam();
+
+  nanoarrow::ipc::UniqueFooter footer;
+  ASSERT_TRUE(arrow::ExportSchema(*arrow_schema, &footer->schema).ok());
+
+  struct ArrowIpcFileBlock dummy_block = {1, 2, 3};
+  EXPECT_EQ(
+      ArrowBufferAppend(&footer->record_batch_blocks, &dummy_block, 
sizeof(dummy_block)),
+      NANOARROW_OK);
+
+  nanoarrow::ipc::UniqueEncoder encoder;
+  EXPECT_EQ(ArrowIpcEncoderInit(encoder.get()), NANOARROW_OK);
+
+  struct ArrowError error;
+  EXPECT_EQ(ArrowIpcEncoderEncodeFooter(encoder.get(), footer.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+
+  nanoarrow::UniqueBuffer buffer;
+  EXPECT_EQ(
+      ArrowIpcEncoderFinalizeBuffer(encoder.get(), /*encapsulate=*/false, 
buffer.get()),
+      NANOARROW_OK);
+  EXPECT_EQ(ArrowBufferAppendInt32(buffer.get(), buffer->size_bytes), 
NANOARROW_OK);
+  EXPECT_EQ(ArrowBufferAppendStringView(buffer.get(), "ARROW1"_asv), 
NANOARROW_OK);
+
+  struct ArrowBufferView buffer_view;
+  buffer_view.data.data = buffer->data;
+  buffer_view.size_bytes = buffer->size_bytes;
+
+  nanoarrow::ipc::UniqueDecoder decoder;
+  ArrowIpcDecoderInit(decoder.get());
+  ASSERT_EQ(ArrowIpcDecoderVerifyFooter(decoder.get(), buffer_view, &error), 
NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowIpcDecoderDecodeFooter(decoder.get(), buffer_view, &error), 
NANOARROW_OK)
+      << error.message;
+
+  EXPECT_EQ(ArrowSchemaToString(&decoder->footer->schema),
+            ArrowSchemaToString(&footer->schema));
+  EXPECT_EQ(decoder->footer->record_batch_blocks.size_bytes, 
sizeof(dummy_block));
+
+  struct ArrowIpcFileBlock roundtripped_block;
+  memcpy(&roundtripped_block, decoder->footer->record_batch_blocks.data,
+         sizeof(roundtripped_block));
+  EXPECT_EQ(roundtripped_block.offset, dummy_block.offset);
+  EXPECT_EQ(roundtripped_block.metadata_length, dummy_block.metadata_length);
+  EXPECT_EQ(roundtripped_block.body_length, dummy_block.body_length);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     NanoarrowIpcTest, ArrowSchemaParameterizedTestFixture,
     ::testing::Values(
@@ -1136,3 +1187,43 @@ INSTANTIATE_TEST_SUITE_P(NanoarrowIpcTest, 
ArrowTypeIdParameterizedTestFixture,
                                            NANOARROW_TYPE_DECIMAL128,
                                            NANOARROW_TYPE_DECIMAL256,
                                            
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO));
+
+TEST(NanoarrowIpcTest, NanoarrowIpcFooterDecodingErrors) {
+  struct ArrowError error;
+
+  nanoarrow::ipc::UniqueDecoder decoder;
+  ArrowIpcDecoderInit(decoder.get());
+
+  // not enough data to get the size+magic
+  EXPECT_EQ(ArrowIpcDecoderPeekFooter(decoder.get(), {nullptr, 3}, &error), 
ESPIPE)
+      << error.message;
+
+  // doesn't end with magic
+  EXPECT_EQ(ArrowIpcDecoderPeekFooter(decoder.get(), {"\0\0\0\0blargh", 10}, 
&error),
+            EINVAL)
+      << error.message;
+
+  // negative size
+  EXPECT_EQ(ArrowIpcDecoderPeekFooter(decoder.get(),
+                                      {"\xFF\xFF\xFF\xFF"
+                                       "ARROW1",
+                                       10},
+                                      &error),
+            EINVAL)
+      << error.message;
+
+  // PeekFooter doesn't check for available data
+  EXPECT_EQ(ArrowIpcDecoderPeekFooter(decoder.get(), {"\xFF\xFF\0\0ARROW1", 
10}, &error),
+            NANOARROW_OK)
+      << error.message;
+  EXPECT_EQ(decoder->header_size_bytes, 0xFFFF);
+
+  decoder->header_size_bytes = -1;
+
+  // VerifyFooter *does* check for enough available data
+  EXPECT_EQ(
+      ArrowIpcDecoderVerifyFooter(decoder.get(), {"\xFF\xFF\0\0ARROW1", 10}, 
&error),
+      ESPIPE)
+      << error.message;
+  EXPECT_EQ(decoder->header_size_bytes, 0xFFFF);
+}
diff --git a/src/nanoarrow/ipc/encoder.c b/src/nanoarrow/ipc/encoder.c
index 13b72c74..d02c8cb6 100644
--- a/src/nanoarrow/ipc/encoder.c
+++ b/src/nanoarrow/ipc/encoder.c
@@ -238,8 +238,10 @@ static ArrowErrorCode 
ArrowIpcEncodeFieldType(flatcc_builder_t* builder,
       FLATCC_RETURN_UNLESS_0(
           Timestamp_unit_add(builder, 
(ns(TimeUnit_enum_t))schema_view->time_unit),
           error);
-      FLATCC_RETURN_UNLESS_0(
-          Timestamp_timezone_create_str(builder, schema_view->timezone), 
error);
+      if (schema_view->timezone && schema_view->timezone[0] != 0) {
+        FLATCC_RETURN_UNLESS_0(
+            Timestamp_timezone_create_str(builder, schema_view->timezone), 
error);
+      }
       FLATCC_RETURN_UNLESS_0(Field_type_Timestamp_end(builder), error);
       return NANOARROW_OK;
 
diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h
index b11a0360..ebeb3222 100644
--- a/src/nanoarrow/nanoarrow_ipc.h
+++ b/src/nanoarrow/nanoarrow_ipc.h
@@ -49,6 +49,12 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema)
 #define ArrowIpcDecoderSetEndianness \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetEndianness)
+#define ArrowIpcDecoderPeekFooter \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderPeekFooter)
+#define ArrowIpcDecoderVerifyFooter \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderVerifyFooter)
+#define ArrowIpcDecoderDecodeFooter \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeFooter)
 #define ArrowIpcInputStreamInitBuffer \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcInputStreamInitBuffer)
 #define ArrowIpcInputStreamInitFile \
@@ -222,6 +228,12 @@ struct ArrowIpcDecoder {
   /// \brief The number of bytes in the forthcoming body message.
   int64_t body_size_bytes;
 
+  /// \brief The last decoded Footer
+  ///
+  /// \warning This API is currently only public for use in integration 
testing;
+  ///          use at your own risk.
+  struct ArrowIpcFooter* footer;
+
   /// \brief Private resources managed by this library
   void* private_data;
 };
@@ -569,7 +581,7 @@ ArrowErrorCode ArrowIpcWriterStartFile(struct 
ArrowIpcWriter* writer,
 
 /// \brief Finish writing an IPC file
 ///
-/// Writes the IPC file's Footer, footer size, and ending magic.
+/// Writes the IPC file's footer, footer size, and ending magic.
 ArrowErrorCode ArrowIpcWriterFinalizeFile(struct ArrowIpcWriter* writer,
                                           struct ArrowError* error);
 /// @}
@@ -589,7 +601,7 @@ struct ArrowIpcFileBlock {
   int64_t body_length;
 };
 
-/// \brief A Footer for use in an IPC file
+/// \brief A footer for use in an IPC file
 ///
 /// \warning This API is currently only public for use in integration testing;
 ///          use at your own risk.
@@ -603,7 +615,7 @@ struct ArrowIpcFooter {
   struct ArrowBuffer record_batch_blocks;
 };
 
-/// \brief Initialize a Footer
+/// \brief Initialize a footer
 ///
 /// \warning This API is currently only public for use in integration testing;
 ///          use at your own risk.
@@ -615,7 +627,7 @@ void ArrowIpcFooterInit(struct ArrowIpcFooter* footer);
 ///          use at your own risk.
 void ArrowIpcFooterReset(struct ArrowIpcFooter* footer);
 
-/// \brief Encode a Footer for use in an IPC file
+/// \brief Encode a footer for use in an IPC file
 ///
 /// \warning This API is currently only public for use in integration testing;
 ///          use at your own risk.
@@ -625,6 +637,55 @@ ArrowErrorCode ArrowIpcEncoderEncodeFooter(struct 
ArrowIpcEncoder* encoder,
                                            const struct ArrowIpcFooter* footer,
                                            struct ArrowError* error);
 
+/// \brief Peek at a footer
+///
+/// The last 10 bytes of an Arrow IPC file are the footer size as a little-endian
+/// 32-bit integer followed by the ARROW1 magic. ArrowIpcDecoderPeekFooter() reads
+/// these bytes and returns ESPIPE if data contains fewer than 10 bytes, EINVAL if
+/// the last 10 bytes are not valid, or NANOARROW_OK otherwise. It does not check
+/// that data contains the entire footer; ArrowIpcDecoderVerifyFooter() does.
+///
+/// The footer size will be stored in decoder.header_size_bytes.
+///
+/// \warning This API is currently only public for use in integration testing;
+///          use at your own risk.
+ArrowErrorCode ArrowIpcDecoderPeekFooter(struct ArrowIpcDecoder* decoder,
+                                         struct ArrowBufferView data,
+                                         struct ArrowError* error);
+
+/// \brief Verify a footer
+///
+/// Runs ArrowIpcDecoderPeekFooter() to ensure data is sufficiently large but 
additionally
+/// runs flatbuffer verification to ensure that decoding the data will not 
access
+/// memory outside of the buffer specified by data. 
ArrowIpcDecoderVerifyFooter() will
+/// also set decoder.header_size_bytes and decoder.metadata_version.
+///
+/// Returns as ArrowIpcDecoderPeekFooter() and additionally will return ESPIPE if data
+/// does not contain the entire footer or EINVAL if flatbuffer verification fails.
+///
+/// \warning This API is currently only public for use in integration testing;
+///          use at your own risk.
+ArrowErrorCode ArrowIpcDecoderVerifyFooter(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowBufferView data,
+                                           struct ArrowError* error);
+
+/// \brief Decode a footer
+///
+/// Decodes the content of the footer, locating it via decoder.header_size_bytes
+/// as set by a preceding call to ArrowIpcDecoderPeekFooter() or
+/// ArrowIpcDecoderVerifyFooter(). decoder.footer will be set for access to the
+/// file's schema and record batches. Call ArrowIpcDecoderVerifyFooter() first to
+/// ensure decoding does not access data outside of the specified buffer.
+///
+/// Returns EINVAL if the content of the footer cannot be decoded or ENOTSUP 
if the
+/// content of the footer uses features not supported by this library.
+///
+/// \warning This API is currently only public for use in integration testing;
+///          use at your own risk.
+ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct ArrowIpcDecoder* decoder,
+                                           struct ArrowBufferView data,
+                                           struct ArrowError* error);
+
 #ifdef __cplusplus
 }
 #endif

(arrow-nanoarrow) branch main updated: feat: add Footer decoding (#598)

Reply via email to