qzyu999 commented on code in PR #50121:
URL: https://github.com/apache/arrow/pull/50121#discussion_r3485317746


##########
cpp/src/arrow/extension/variant_test.cc:
##########
@@ -0,0 +1,2128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/extension/variant_internal.h"
+#include "arrow/extension/variant_test_util.h"
+
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow::extension::variant_internal {
+
+// ===========================================================================
+// Test helpers
+// ===========================================================================
+
+/// \brief Build a Variant metadata buffer from a list of dictionary strings.
+///
+/// Layout produced: one header byte, the dictionary size, `strings.size() + 1`
+/// string offsets, then the concatenated string bytes. The dictionary size and
+/// every offset are written little-endian using `offset_size` bytes each.
+///
+/// \param strings Dictionary entries, written in the order given
+/// \param sorted Whether to set the sorted flag (bit 4) in the header
+/// \param offset_size Bytes used for the dictionary size and for each offset
+///   (1-4); values that do not fit in that many bytes are silently truncated
+/// \return The serialized metadata bytes
+std::vector<uint8_t> BuildMetadataBuffer(const std::vector<std::string>& strings,
+                                         bool sorted = false, int32_t offset_size = 1) {
+  std::vector<uint8_t> buffer;
+
+  // Appends the low `offset_size` bytes of `value`, least significant first.
+  const auto append_le = [&buffer, offset_size](uint32_t value) {
+    for (int32_t b = 0; b < offset_size; ++b) {
+      buffer.push_back(static_cast<uint8_t>((value >> (b * 8)) & 0xFF));
+    }
+  };
+
+  // Header byte: version, sorted flag in bit 4, (offset_size - 1) in bits 6-7
+  uint8_t header = kVariantVersion;
+  if (sorted) {
+    header |= (1 << 4);
+  }
+  header |= static_cast<uint8_t>((offset_size - 1) << 6);
+  buffer.push_back(header);
+
+  // Dictionary size
+  const auto dict_size = static_cast<uint32_t>(strings.size());
+  append_le(dict_size);
+
+  // Offsets: running total of string lengths, starting at zero, so there is
+  // one more offset than there are strings.
+  uint32_t running = 0;
+  append_le(running);
+  for (uint32_t i = 0; i < dict_size; ++i) {
+    running += static_cast<uint32_t>(strings[i].size());
+    append_le(running);
+  }
+
+  // String data, back to back with no separators
+  for (const auto& s : strings) {
+    buffer.insert(buffer.end(), s.begin(), s.end());
+  }
+
+  return buffer;
+}
+
+/// \brief Compose the header byte of a primitive value.
+///
+/// The basic type occupies the two low bits and the primitive type ID is
+/// placed in the bits above them.
+uint8_t PrimitiveHeader(PrimitiveType type) {
+  const uint8_t basic_bits = static_cast<uint8_t>(BasicType::kPrimitive);
+  const uint8_t type_id = static_cast<uint8_t>(type);
+  return static_cast<uint8_t>(basic_bits | (type_id << 2));
+}
+
+/// \brief Build a short string value buffer.
+///
+/// The header byte stores the basic type in bits 0-1 and the string length in
+/// the six bits above, so at most 63 bytes can be represented. Passing a longer
+/// string records a (non-fatal) test failure, because the length would not fit
+/// in the header and the resulting buffer would be malformed.
+///
+/// \param s String payload (at most 63 bytes)
+/// \return Header byte followed by the raw bytes of `s`
+std::vector<uint8_t> BuildShortString(const std::string& s) {
+  // EXPECT (not ASSERT) because this helper returns a value.
+  EXPECT_LE(s.size(), 63u) << "short strings hold at most 63 bytes";
+
+  std::vector<uint8_t> buffer;
+  buffer.reserve(1 + s.size());
+  auto len = static_cast<uint8_t>(s.size());
+  uint8_t header = static_cast<uint8_t>(BasicType::kShortString) | (len << 2);
+  buffer.push_back(header);
+  buffer.insert(buffer.end(), s.begin(), s.end());
+  return buffer;
+}
+
+/// \brief Serialize a Variant object value.
+///
+/// The output is: header byte, field count (1 byte, or 4 bytes when there are
+/// more than 255 fields), the field IDs, `num_fields + 1` offsets into the value
+/// area, and finally the field values back to back. Multi-byte integers are
+/// written least-significant byte first.
+///
+/// \param field_ids Dictionary indices for each field name
+/// \param field_values Serialized variant values for each field; indexed in
+///   step with `field_ids` (sizes are not cross-checked)
+/// \param field_id_size Bytes per field ID (1-4)
+/// \param field_offset_size Bytes per offset (1-4)
+std::vector<uint8_t> BuildObject(const std::vector<uint32_t>& field_ids,
+                                 const std::vector<std::vector<uint8_t>>& field_values,
+                                 int32_t field_id_size = 1,
+                                 int32_t field_offset_size = 1) {
+  const auto field_count = static_cast<uint32_t>(field_ids.size());
+  const bool large = field_count > 255;
+
+  std::vector<uint8_t> out;
+
+  // Emits the low `width` bytes of `value`, least significant byte first.
+  const auto put_le = [&out](uint32_t value, int32_t width) {
+    for (int32_t byte = 0; byte < width; ++byte) {
+      out.push_back(static_cast<uint8_t>((value >> (byte * 8)) & 0xFF));
+    }
+  };
+
+  // Header: basic type in bits 0-1, (field_offset_size - 1) in bits 2-3,
+  // (field_id_size - 1) in bits 4-5, large flag in bit 6.
+  uint8_t header = static_cast<uint8_t>(BasicType::kObject);
+  header |= static_cast<uint8_t>((field_offset_size - 1) << 2);
+  header |= static_cast<uint8_t>((field_id_size - 1) << 4);
+  if (large) {
+    header |= (1 << 6);
+  }
+  out.push_back(header);
+
+  // Field count: width depends on the large flag.
+  put_le(field_count, large ? 4 : 1);
+
+  // Field IDs
+  for (const uint32_t id : field_ids) {
+    put_le(id, field_id_size);
+  }
+
+  // Offsets: running total of value sizes, starting from zero.
+  uint32_t running = 0;
+  put_le(running, field_offset_size);
+  for (uint32_t i = 0; i < field_count; ++i) {
+    running += static_cast<uint32_t>(field_values[i].size());
+    put_le(running, field_offset_size);
+  }
+
+  // Field value bytes
+  for (const auto& value : field_values) {
+    out.insert(out.end(), value.begin(), value.end());
+  }
+
+  return out;
+}
+
+/// \brief Serialize a Variant array value.
+///
+/// The output is: header byte, element count (1 byte, or 4 bytes when there are
+/// more than 255 elements), `num_elements + 1` offsets into the element area,
+/// and then the element bytes back to back. Multi-byte integers are written
+/// least-significant byte first.
+///
+/// \param elements Serialized variant values for each element
+/// \param field_offset_size Bytes per offset (1-4)
+std::vector<uint8_t> BuildArray(const std::vector<std::vector<uint8_t>>& elements,
+                                int32_t field_offset_size = 1) {
+  const auto element_count = static_cast<uint32_t>(elements.size());
+  const bool large = element_count > 255;
+
+  std::vector<uint8_t> out;
+
+  // Emits the low `width` bytes of `value`, least significant byte first.
+  const auto put_le = [&out](uint32_t value, int32_t width) {
+    for (int32_t byte = 0; byte < width; ++byte) {
+      out.push_back(static_cast<uint8_t>((value >> (byte * 8)) & 0xFF));
+    }
+  };
+
+  // Header: basic type in bits 0-1, (field_offset_size - 1) in bits 2-3,
+  // large flag in bit 4.
+  uint8_t header = static_cast<uint8_t>(BasicType::kArray);
+  header |= static_cast<uint8_t>((field_offset_size - 1) << 2);
+  if (large) {
+    header |= (1 << 4);
+  }
+  out.push_back(header);
+
+  // Element count: width depends on the large flag.
+  put_le(element_count, large ? 4 : 1);
+
+  // Offsets: running total of element sizes, starting from zero.
+  uint32_t running = 0;
+  put_le(running, field_offset_size);
+  for (uint32_t i = 0; i < element_count; ++i) {
+    running += static_cast<uint32_t>(elements[i].size());
+    put_le(running, field_offset_size);
+  }
+
+  // Element bytes
+  for (const auto& element : elements) {
+    out.insert(out.end(), element.begin(), element.end());
+  }
+
+  return out;
+}
+
+// ===========================================================================
+// Metadata decoding tests
+// ===========================================================================
+
+/// Stateless fixture that groups the metadata decoding tests.
+class VariantMetadataTest : public ::testing::Test {
+};
+
+TEST_F(VariantMetadataTest, EmptyDictionary) {
+  // No strings at all, default flags and offset width.
+  auto encoded = BuildMetadataBuffer({});
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.version, 1);
+  ASSERT_FALSE(decoded.is_sorted);
+  ASSERT_EQ(decoded.offset_size, 1);
+  ASSERT_EQ(decoded.strings.size(), 0);
+}
+
+TEST_F(VariantMetadataTest, SingleString) {
+  // A one-entry dictionary must round-trip its only string.
+  auto encoded = BuildMetadataBuffer({"hello"});
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.strings.size(), 1);
+  ASSERT_EQ(decoded.strings[0], "hello");
+}
+
+TEST_F(VariantMetadataTest, MultipleStrings) {
+  // Entries must come back in the order they were written.
+  auto encoded = BuildMetadataBuffer({"name", "age", "scores"});
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.strings.size(), 3);
+  ASSERT_EQ(decoded.strings[0], "name");
+  ASSERT_EQ(decoded.strings[1], "age");
+  ASSERT_EQ(decoded.strings[2], "scores");
+}
+
+TEST_F(VariantMetadataTest, SortedFlag) {
+  // Same as the default case, but with the sorted bit set in the header.
+  auto encoded = BuildMetadataBuffer({"age", "name", "score"}, /*sorted=*/true);
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_TRUE(decoded.is_sorted);
+}
+
+TEST_F(VariantMetadataTest, OffsetSize2) {
+  // Dictionary size and offsets encoded with two bytes each.
+  auto encoded = BuildMetadataBuffer({"key1", "key2"}, /*sorted=*/false,
+                                     /*offset_size=*/2);
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.offset_size, 2);
+  ASSERT_EQ(decoded.strings.size(), 2);
+  ASSERT_EQ(decoded.strings[0], "key1");
+  ASSERT_EQ(decoded.strings[1], "key2");
+}
+
+TEST_F(VariantMetadataTest, OffsetSize4) {
+  // Dictionary size and offsets encoded with four bytes each.
+  auto encoded = BuildMetadataBuffer({"a", "bb", "ccc"}, /*sorted=*/false,
+                                     /*offset_size=*/4);
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.offset_size, 4);
+  ASSERT_EQ(decoded.strings.size(), 3);
+  ASSERT_EQ(decoded.strings[0], "a");
+  ASSERT_EQ(decoded.strings[1], "bb");
+  ASSERT_EQ(decoded.strings[2], "ccc");
+}
+
+TEST_F(VariantMetadataTest, EmptyStrings) {
+  // Zero-length entries produce repeated offsets; they must still decode.
+  auto encoded = BuildMetadataBuffer({"", "nonempty", ""});
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.strings.size(), 3);
+  ASSERT_EQ(decoded.strings[0], "");
+  ASSERT_EQ(decoded.strings[1], "nonempty");
+  ASSERT_EQ(decoded.strings[2], "");
+}
+
+// Error cases
+
+TEST_F(VariantMetadataTest, NullBuffer) {
+  // A null pointer with zero length must be rejected rather than dereferenced.
+  ASSERT_RAISES(Invalid, DecodeMetadata(/*data=*/nullptr, /*length=*/0));
+}
+
+TEST_F(VariantMetadataTest, EmptyBuffer) {
+  // Valid (non-null) pointer, but a declared length of zero bytes.
+  uint8_t placeholder[1] = {0};
+  ASSERT_RAISES(Invalid, DecodeMetadata(placeholder, 0));
+}
+
+TEST_F(VariantMetadataTest, UnsupportedVersion) {
+  // Header byte 0x02 declares version 2, which the decoder must refuse.
+  uint8_t raw[] = {0x02, 0x00};
+  ASSERT_RAISES(Invalid, DecodeMetadata(raw, sizeof(raw)));
+}
+
+TEST_F(VariantMetadataTest, TruncatedDictionarySize) {
+  // 0x41: version=1 with bits 6-7 = 01, i.e. offset_size=2. The dictionary
+  // size therefore needs two bytes, but only one follows the header.
+  uint8_t raw[] = {0x41, 0x00};
+  ASSERT_RAISES(Invalid, DecodeMetadata(raw, sizeof(raw)));
+}
+
+TEST_F(VariantMetadataTest, TruncatedStringOffsets) {
+  // dict_size=5 promises more offsets than the single byte that follows.
+  uint8_t raw[] = {0x01, 0x05, 0x00};
+  ASSERT_RAISES(Invalid, DecodeMetadata(raw, sizeof(raw)));
+}
+
+TEST_F(VariantMetadataTest, OffsetSize3) {
+  // Three-byte fields: a width with no native integer type.
+  auto encoded = BuildMetadataBuffer({"foo", "bar"}, /*sorted=*/false,
+                                     /*offset_size=*/3);
+  ASSERT_OK_AND_ASSIGN(auto decoded, DecodeMetadata(encoded.data(), encoded.size()));
+  ASSERT_EQ(decoded.offset_size, 3);
+  ASSERT_EQ(decoded.strings.size(), 2);
+  ASSERT_EQ(decoded.strings[0], "foo");
+  ASSERT_EQ(decoded.strings[1], "bar");
+}
+
+TEST_F(VariantMetadataTest, ReservedBit5Set) {
+  // 0x21 = version 1 plus bit 5, which this test expects to be rejected.
+  uint8_t raw[] = {0x21, 0x00, 0x00};
+  ASSERT_RAISES(Invalid, DecodeMetadata(raw, sizeof(raw)));
+}
+
+TEST_F(VariantMetadataTest, NonMonotonicStringOffsets) {
+  // Hand-built metadata whose offsets go backwards (0, 5, 3); offsets must be
+  // non-decreasing, so decoding has to fail.
+  uint8_t raw[] = {
+      0x01,  // header: version=1, offset_size=1
+      0x02,  // dict_size = 2
+      0x00,  // offset[0]
+      0x05,  // offset[1]
+      0x03,  // offset[2] -- smaller than offset[1]
+      // 8 bytes of string data
+      'h', 'e', 'l', 'l', 'o', 'a', 'b', 'c'};
+  ASSERT_RAISES(Invalid, DecodeMetadata(raw, sizeof(raw)));
+}
+
+// ===========================================================================
+// Primitive value decoding tests
+// ===========================================================================
+
+/// Fixture for primitive decoding tests. Supplies a metadata object to which
+/// no dictionary strings are assigned.
+class VariantPrimitiveTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto& md = empty_metadata_;
+    md.version = 1;
+    md.is_sorted = false;
+    md.offset_size = 1;
+  }
+
+  VariantMetadata empty_metadata_;
+};
+
+TEST_F(VariantPrimitiveTest, DecodeNull) {
+  // A null value is just its header byte.
+  uint8_t encoded[] = {PrimitiveHeader(PrimitiveType::kNull)};
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded, sizeof(encoded), &recorder));
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 1);
+  ASSERT_EQ(events[0], "Null");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeTrue) {
+  // Boolean true is encoded entirely in the header byte.
+  uint8_t encoded[] = {PrimitiveHeader(PrimitiveType::kTrue)};
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded, sizeof(encoded), &recorder));
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 1);
+  ASSERT_EQ(events[0], "Bool(true)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeFalse) {
+  // Boolean false is encoded entirely in the header byte.
+  uint8_t encoded[] = {PrimitiveHeader(PrimitiveType::kFalse)};
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded, sizeof(encoded), &recorder));
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 1);
+  ASSERT_EQ(events[0], "Bool(false)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt8) {
+  // Header followed by a single payload byte: 0x2A == 42.
+  uint8_t data[] = {PrimitiveHeader(PrimitiveType::kInt8), 0x2A};
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int8(42)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt8Negative) {
+  // 0xD6 is the two's-complement encoding of -42 in one byte.
+  uint8_t data[] = {PrimitiveHeader(PrimitiveType::kInt8), 0xD6};
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int8(-42)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt16) {
+  // 300 = 0x012C in little-endian: 0x2C, 0x01
+  uint8_t data[] = {PrimitiveHeader(PrimitiveType::kInt16), 0x2C, 0x01};
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int16(300)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt32) {
+  // 100000 = 0x000186A0 in LE: A0 86 01 00
+  uint8_t data[] = {PrimitiveHeader(PrimitiveType::kInt32), 0xA0, 0x86, 0x01, 0x00};
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int32(100000)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt32Max) {
+  const int32_t val = std::numeric_limits<int32_t>::max();
+  // The payload is little-endian on the wire. Write it byte by byte instead of
+  // copying the host representation, which would be wrong on a big-endian
+  // machine.
+  const auto bits = static_cast<uint32_t>(val);
+  uint8_t data[5];
+  data[0] = PrimitiveHeader(PrimitiveType::kInt32);
+  for (int b = 0; b < 4; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int32(" + std::to_string(val) + ")");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeInt64) {
+  const int64_t val = 1234567890123LL;
+  // The payload is little-endian on the wire. Write it byte by byte instead of
+  // copying the host representation, which would be wrong on a big-endian
+  // machine.
+  const auto bits = static_cast<uint64_t>(val);
+  uint8_t data[9];
+  data[0] = PrimitiveHeader(PrimitiveType::kInt64);
+  for (int b = 0; b < 8; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Int64(" + std::to_string(val) + ")");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeFloat) {
+  const float val = 3.14f;
+  // Take the float's bit pattern, then write it as little-endian bytes so the
+  // payload does not depend on host byte order.
+  static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits");
+  uint32_t bits = 0;
+  std::memcpy(&bits, &val, sizeof(bits));
+  uint8_t data[5];
+  data[0] = PrimitiveHeader(PrimitiveType::kFloat);
+  for (int b = 0; b < 4; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  // Float string representation may vary; just check it starts with Float(
+  ASSERT_TRUE(visitor.events[0].find("Float(") == 0);
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDouble) {
+  const double val = 2.718281828459045;
+  // Take the double's bit pattern, then write it as little-endian bytes so the
+  // payload does not depend on host byte order.
+  static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be 64 bits");
+  uint64_t bits = 0;
+  std::memcpy(&bits, &val, sizeof(bits));
+  uint8_t data[9];
+  data[0] = PrimitiveHeader(PrimitiveType::kDouble);
+  for (int b = 0; b < 8; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  // Double string representation may vary; just check the prefix.
+  ASSERT_TRUE(visitor.events[0].find("Double(") == 0);
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDate) {
+  // Days since epoch: 19000 (approximately 2022-01-01)
+  const int32_t days = 19000;
+  // The payload is little-endian on the wire. Write it byte by byte instead of
+  // copying the host representation, which would be wrong on a big-endian
+  // machine.
+  const auto bits = static_cast<uint32_t>(days);
+  uint8_t data[5];
+  data[0] = PrimitiveHeader(PrimitiveType::kDate);
+  for (int b = 0; b < 4; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Date(19000)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeTimestampMicros) {
+  const int64_t micros = 1654041600000000LL;  // some timestamp
+  // The payload is little-endian on the wire. Write it byte by byte instead of
+  // copying the host representation, which would be wrong on a big-endian
+  // machine.
+  const auto bits = static_cast<uint64_t>(micros);
+  uint8_t data[9];
+  data[0] = PrimitiveHeader(PrimitiveType::kTimestampMicros);
+  for (int b = 0; b < 8; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "TimestampMicros(" + std::to_string(micros) + ")");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeTimestampMicrosNTZ) {
+  const int64_t micros = 1654041600000000LL;
+  // The payload is little-endian on the wire. Write it byte by byte instead of
+  // copying the host representation, which would be wrong on a big-endian
+  // machine.
+  const auto bits = static_cast<uint64_t>(micros);
+  uint8_t data[9];
+  data[0] = PrimitiveHeader(PrimitiveType::kTimestampMicrosNTZ);
+  for (int b = 0; b < 8; ++b) {
+    data[1 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "TimestampMicrosNTZ(" + std::to_string(micros) + ")");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDecimal4) {
+  // Spec layout: 1 byte scale, then 4 bytes LE unscaled value
+  uint8_t data[6];
+  data[0] = PrimitiveHeader(PrimitiveType::kDecimal4);
+  data[1] = 2;  // scale = 2
+  const int32_t val = 12345;
+  // Write the unscaled value as explicit little-endian bytes so the buffer is
+  // the same on every host.
+  const auto bits = static_cast<uint32_t>(val);
+  for (int b = 0; b < 4; ++b) {
+    data[2 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Decimal4(scale=2)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDecimal4MaxScale) {
+  // Scale at maximum per spec: 38
+  uint8_t data[6];
+  data[0] = PrimitiveHeader(PrimitiveType::kDecimal4);
+  data[1] = 38;  // scale = 38 (maximum per spec)
+  const int32_t val = 12345;
+  // Write the unscaled value as explicit little-endian bytes so the buffer is
+  // the same on every host.
+  const auto bits = static_cast<uint32_t>(val);
+  for (int b = 0; b < 4; ++b) {
+    data[2 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Decimal4(scale=38)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDecimal8) {
+  // Spec layout: 1 byte scale, then 8 bytes LE unscaled value
+  uint8_t data[10];
+  data[0] = PrimitiveHeader(PrimitiveType::kDecimal8);
+  data[1] = 5;  // scale = 5
+  const int64_t val = 123456789012345LL;
+  // Write the unscaled value as explicit little-endian bytes so the buffer is
+  // the same on every host.
+  const auto bits = static_cast<uint64_t>(val);
+  for (int b = 0; b < 8; ++b) {
+    data[2 + b] = static_cast<uint8_t>((bits >> (b * 8)) & 0xFF);
+  }
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Decimal8(scale=5)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeDecimal16) {
+  // Spec layout: 1 byte scale, then 16 bytes LE unscaled value
+  uint8_t data[18];
+  data[0] = PrimitiveHeader(PrimitiveType::kDecimal16);
+  data[1] = 10;  // scale = 10
+  std::memset(data + 2, 0, 16);
+  data[2] = 0x01;  // low byte = 1
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data, sizeof(data), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Decimal16(scale=10)");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeLongString) {
+  // Long string: primitive type kString with 4-byte length prefix
+  std::string test_str = "hello world, this is a long string";
+  auto str_len = static_cast<uint32_t>(test_str.size());
+
+  std::vector<uint8_t> data;
+  data.push_back(PrimitiveHeader(PrimitiveType::kString));
+  // 4-byte little-endian length
+  for (int b = 0; b < 4; ++b) {
+    data.push_back(static_cast<uint8_t>((str_len >> (b * 8)) & 0xFF));
+  }
+  data.insert(data.end(), test_str.begin(), test_str.end());
+
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data.data(),
+                               static_cast<int64_t>(data.size()), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "String(\"hello world, this is a long string\")");
+}
+
+TEST_F(VariantPrimitiveTest, DecodeBinary) {
+  // Binary: primitive type kBinary with a 4-byte little-endian length prefix.
+  std::vector<uint8_t> bin_bytes = {0x00, 0x01, 0x02, 0x03};
+  auto bin_len = static_cast<uint32_t>(bin_bytes.size());
+
+  std::vector<uint8_t> data;
+  data.push_back(PrimitiveHeader(PrimitiveType::kBinary));
+  for (int b = 0; b < 4; ++b) {
+    data.push_back(static_cast<uint8_t>((bin_len >> (b * 8)) & 0xFF));
+  }
+  data.insert(data.end(), bin_bytes.begin(), bin_bytes.end());
+
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data.data(),
+                               static_cast<int64_t>(data.size()), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "Binary(len=4)");
+}
+
+// Truncation errors
+
+TEST_F(VariantPrimitiveTest, TruncatedInt32) {
+  // An Int32 payload is 4 bytes; supply only 2 after the header.
+  uint8_t encoded[] = {PrimitiveHeader(PrimitiveType::kInt32), 0x00, 0x00};
+  RecordingVisitor recorder;
+  ASSERT_RAISES(Invalid, DecodeVariantValue(empty_metadata_, encoded, sizeof(encoded),
+                                            &recorder));
+}
+
+TEST_F(VariantPrimitiveTest, EmptyValueBuffer) {
+  // No bytes at all: there is not even a header to read.
+  RecordingVisitor recorder;
+  ASSERT_RAISES(Invalid, DecodeVariantValue(empty_metadata_, /*data=*/nullptr,
+                                            /*length=*/0, &recorder));
+}
+
+// ===========================================================================
+// Short string tests
+// ===========================================================================
+
+/// Fixture for short string decoding tests. Supplies a metadata object to
+/// which no dictionary strings are assigned.
+class VariantShortStringTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto& md = empty_metadata_;
+    md.version = 1;
+    md.is_sorted = false;
+    md.offset_size = 1;
+  }
+
+  VariantMetadata empty_metadata_;
+};
+
+TEST_F(VariantShortStringTest, EmptyShortString) {
+  // Zero-length short string: the buffer is just the header byte.
+  auto data = BuildShortString("");
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data.data(),
+                               static_cast<int64_t>(data.size()), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "String(\"\")");
+}
+
+TEST_F(VariantShortStringTest, SimpleShortString) {
+  // Two-byte payload following the header.
+  auto data = BuildShortString("hi");
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data.data(),
+                               static_cast<int64_t>(data.size()), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "String(\"hi\")");
+}
+
+TEST_F(VariantShortStringTest, MaxLengthShortString) {
+  // Maximum short string is 63 bytes
+  std::string max_str(63, 'x');
+  auto data = BuildShortString(max_str);
+  RecordingVisitor visitor;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, data.data(),
+                               static_cast<int64_t>(data.size()), &visitor));
+  // Check the count first so events[0] is never read from an empty list.
+  ASSERT_EQ(visitor.events.size(), 1);
+  ASSERT_EQ(visitor.events[0], "String(\"" + max_str + "\")");
+}
+
+TEST_F(VariantShortStringTest, TruncatedShortString) {
+  // The header declares a 10-byte string, yet only two payload bytes follow
+  // (3 bytes in the whole buffer).
+  uint8_t encoded[] = {static_cast<uint8_t>(BasicType::kShortString) | (10 << 2), 'a',
+                       'b'};
+  RecordingVisitor recorder;
+  ASSERT_RAISES(Invalid, DecodeVariantValue(empty_metadata_, encoded, sizeof(encoded),
+                                            &recorder));
+}
+
+// ===========================================================================
+// Object decoding tests
+// ===========================================================================
+
+/// Fixture for object decoding tests. The dictionary holds three field names
+/// that the tests reference by index: 0 = "name", 1 = "age", 2 = "scores".
+class VariantObjectTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto& md = metadata_;
+    md.version = 1;
+    md.is_sorted = false;
+    md.offset_size = 1;
+    md.strings = {"name", "age", "scores"};
+  }
+
+  VariantMetadata metadata_;
+};
+
+TEST_F(VariantObjectTest, EmptyObject) {
+  // An object with no fields still reports its start and end.
+  auto encoded = BuildObject({}, {});
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 2);
+  ASSERT_EQ(events[0], "StartObject(0)");
+  ASSERT_EQ(events[1], "EndObject");
+}
+
+TEST_F(VariantObjectTest, SingleField) {
+  // {name: "Alice"}, with the value encoded as a short string.
+  auto alice = BuildShortString("Alice");
+  auto encoded = BuildObject({0}, {alice});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {
+      "StartObject(1)", "FieldName(\"name\")", "String(\"Alice\")", "EndObject"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+TEST_F(VariantObjectTest, MultipleFields) {
+  // {name: "Bob", age: 30}; the age is an Int32 written as little-endian bytes.
+  auto bob = BuildShortString("Bob");
+  std::vector<uint8_t> thirty = {PrimitiveHeader(PrimitiveType::kInt32), 30, 0, 0, 0};
+  auto encoded = BuildObject({0, 1}, {bob, thirty});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {
+      "StartObject(2)",     "FieldName(\"name\")", "String(\"Bob\")",
+      "FieldName(\"age\")", "Int32(30)",           "EndObject"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+TEST_F(VariantObjectTest, InvalidFieldId) {
+  // The dictionary has three entries, so field ID 99 is out of range.
+  auto payload = BuildShortString("oops");
+  auto encoded = BuildObject({99}, {payload});
+
+  RecordingVisitor recorder;
+  ASSERT_RAISES(Invalid,
+                DecodeVariantValue(metadata_, encoded.data(),
+                                   static_cast<int64_t>(encoded.size()), &recorder));
+}
+
+TEST_F(VariantObjectTest, ThreeByteOffsetSize) {
+  // {name: "test", age: 42}, encoded with 3-byte field IDs and 3-byte offsets
+  // to cover a width that has no native integer type.
+  auto name_bytes = BuildShortString("test");
+  std::vector<uint8_t> age_bytes = {PrimitiveHeader(PrimitiveType::kInt32), 42, 0, 0, 0};
+  auto encoded = BuildObject({0, 1}, {name_bytes, age_bytes},
+                             /*field_id_size=*/3, /*field_offset_size=*/3);
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {
+      "StartObject(2)",     "FieldName(\"name\")", "String(\"test\")",
+      "FieldName(\"age\")", "Int32(42)",           "EndObject"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+// ===========================================================================
+// Array decoding tests
+// ===========================================================================
+
+/// Fixture for array decoding tests. Supplies a metadata object to which no
+/// dictionary strings are assigned.
+class VariantArrayTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto& md = empty_metadata_;
+    md.version = 1;
+    md.is_sorted = false;
+    md.offset_size = 1;
+  }
+
+  VariantMetadata empty_metadata_;
+};
+
+TEST_F(VariantArrayTest, EmptyArray) {
+  // An array with no elements still reports its start and end.
+  auto encoded = BuildArray({});
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 2);
+  ASSERT_EQ(events[0], "StartArray(0)");
+  ASSERT_EQ(events[1], "EndArray");
+}
+
+TEST_F(VariantArrayTest, SingleElement) {
+  // [42], with the element encoded as an Int32.
+  std::vector<uint8_t> forty_two = {PrimitiveHeader(PrimitiveType::kInt32), 42, 0, 0, 0};
+  auto encoded = BuildArray({forty_two});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {"StartArray(1)", "Int32(42)", "EndArray"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+TEST_F(VariantArrayTest, HeterogeneousElements) {
+  // [42, "hello", true]: elements of different types in one array.
+  std::vector<uint8_t> number = {PrimitiveHeader(PrimitiveType::kInt32), 42, 0, 0, 0};
+  auto text = BuildShortString("hello");
+  std::vector<uint8_t> flag = {PrimitiveHeader(PrimitiveType::kTrue)};
+  auto encoded = BuildArray({number, text, flag});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {"StartArray(3)", "Int32(42)",
+                                             "String(\"hello\")", "Bool(true)",
+                                             "EndArray"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+TEST_F(VariantArrayTest, LargeArrayIsLargeFlag) {
+  // 256 elements is the smallest count that makes BuildArray set is_large and
+  // write a 4-byte element count. Every element is a 1-byte Null, so the
+  // element area is 256 bytes and the last offset needs 2-byte offsets.
+  const std::vector<uint8_t> null_value = {PrimitiveHeader(PrimitiveType::kNull)};
+  std::vector<std::vector<uint8_t>> elements(256, null_value);
+  auto encoded = BuildArray(elements, /*field_offset_size=*/2);
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(empty_metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  // StartArray(256) + 256 Nulls + EndArray = 258 events; spot-check both ends.
+  const auto& events = recorder.events;
+  ASSERT_EQ(events.size(), 258);
+  ASSERT_EQ(events[0], "StartArray(256)");
+  ASSERT_EQ(events[1], "Null");
+  ASSERT_EQ(events[256], "Null");
+  ASSERT_EQ(events[257], "EndArray");
+}
+
+// ===========================================================================
+// Nested structure tests
+// ===========================================================================
+
+/// Fixture for nested structure tests. The dictionary holds three field names
+/// that the tests reference by index: 0 = "name", 1 = "scores", 2 = "inner".
+class VariantNestedTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto& md = metadata_;
+    md.version = 1;
+    md.is_sorted = false;
+    md.offset_size = 1;
+    md.strings = {"name", "scores", "inner"};
+  }
+
+  VariantMetadata metadata_;
+};
+
+TEST_F(VariantNestedTest, ObjectWithNestedArray) {
+  // {name: "Alice", scores: [95, 87]}
+  auto name_bytes = BuildShortString("Alice");
+
+  // The scores array holds two Int32 values.
+  std::vector<uint8_t> first = {PrimitiveHeader(PrimitiveType::kInt32), 95, 0, 0, 0};
+  std::vector<uint8_t> second = {PrimitiveHeader(PrimitiveType::kInt32), 87, 0, 0, 0};
+  auto scores_bytes = BuildArray({first, second});
+
+  auto encoded = BuildObject({0, 1}, {name_bytes, scores_bytes});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  // The array's events are nested between the object's field name and its end.
+  const std::vector<std::string> expected = {
+      "StartObject(2)",        "FieldName(\"name\")", "String(\"Alice\")",
+      "FieldName(\"scores\")", "StartArray(2)",       "Int32(95)",
+      "Int32(87)",             "EndArray",            "EndObject"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+TEST_F(VariantNestedTest, NestedObjects) {
+  // {inner: {name: "deep"}}: build from the innermost value outwards.
+  auto leaf = BuildShortString("deep");
+  auto inner = BuildObject({0}, {leaf});
+  auto encoded = BuildObject({2}, {inner});
+
+  RecordingVisitor recorder;
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(),
+                               static_cast<int64_t>(encoded.size()), &recorder));
+
+  const std::vector<std::string> expected = {
+      "StartObject(1)",      "FieldName(\"inner\")", "StartObject(1)",
+      "FieldName(\"name\")", "String(\"deep\")",     "EndObject",
+      "EndObject"};
+  ASSERT_EQ(recorder.events.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(recorder.events[i], expected[i]);
+  }
+}
+
+// Decodes an array whose elements are single-field objects and checks that each
+// element produces its own complete object event sequence.
+TEST_F(VariantNestedTest, ArrayOfObjects) {
+  // Value under test: [{name: "a"}, {name: "b"}]; field id 0 is "name".
+  auto first_value = BuildShortString("a");
+  auto second_value = BuildShortString("b");
+  auto first_object = BuildObject({0}, {first_value});
+  auto second_object = BuildObject({0}, {second_value});
+
+  auto encoded = BuildArray({first_object, second_object});
+
+  RecordingVisitor visitor;
+  const auto encoded_size = static_cast<int64_t>(encoded.size());
+  ASSERT_OK(DecodeVariantValue(metadata_, encoded.data(), encoded_size, &visitor));
+
+  const std::vector<std::string> expected_events = {
+      "StartArray(2)",                                            //
+      "StartObject(1)", "FieldName(\"name\")", "String(\"a\")", "EndObject",  //
+      "StartObject(1)", "FieldName(\"name\")", "String(\"b\")", "EndObject",  //
+      "EndArray"};
+  ASSERT_EQ(visitor.events.size(), expected_events.size());
+  for (size_t i = 0; i < expected_events.size(); ++i) {
+    ASSERT_EQ(visitor.events[i], expected_events[i]);
+  }
+}
+
+// ===========================================================================
+// Recursion depth limit test
+// ===========================================================================
+
+/// Fixture for the nesting-depth tests. Supplies version-1, unsorted metadata
+/// with a one-byte offset size and a string list holding only "x".
+class VariantDepthTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    metadata_.strings = {"x"};
+    metadata_.offset_size = 1;
+    metadata_.is_sorted = false;
+    metadata_.version = 1;
+  }
+
+  VariantMetadata metadata_;
+};
+
+// Decoding must fail with Invalid once arrays are nested deeper than the
+// decoder's limit.
+TEST_F(VariantDepthTest, ExceedsMaxNestingDepth) {
+  // 130 wraps is past the limit (kMaxNestingDepth=128 per the original author).
+  constexpr int kWrapCount = 130;
+
+  // Start from a bare null and repeatedly wrap it in a 1-element array:
+  // [[[[...null...]]]]. A 2-byte field offset size is used so that the encoded
+  // buffer may grow beyond 255 bytes.
+  std::vector<uint8_t> nested = {PrimitiveHeader(PrimitiveType::kNull)};
+  int remaining = kWrapCount;
+  while (remaining-- > 0) {
+    nested = BuildArray({nested}, /*field_offset_size=*/2);
+  }
+
+  RecordingVisitor visitor;
+  const auto nested_size = static_cast<int64_t>(nested.size());
+  ASSERT_RAISES(Invalid,
+                DecodeVariantValue(metadata_, nested.data(), nested_size, &visitor));
+}
+
+// Decoding a moderately nested array must succeed.
+//
+// NOTE(review): despite the test name, this builds only 50 levels, not exactly
+// the maximum depth, so the boundary itself is not exercised here. Consider
+// renaming the test or adding an exact-boundary case once the limit semantics
+// are confirmed.
+TEST_F(VariantDepthTest, AtMaxNestingDepthSucceeds) {
+  // 50 levels stays well below kMaxNestingDepth=128 and, at roughly 4 bytes per
+  // level, keeps the buffer small enough for the default 1-byte offsets.
+  constexpr int kWrapCount = 50;
+
+  std::vector<uint8_t> nested = {PrimitiveHeader(PrimitiveType::kNull)};
+  int remaining = kWrapCount;
+  while (remaining-- > 0) {
+    nested = BuildArray({nested});
+  }
+
+  RecordingVisitor visitor;
+  const auto nested_size = static_cast<int64_t>(nested.size());
+  ASSERT_OK(DecodeVariantValue(metadata_, nested.data(), nested_size, &visitor));
+}
+
+// ===========================================================================
+// Utility function tests
+// ===========================================================================
+
+/// Stateless fixture that groups the tests for the standalone helper functions
+/// (basic-type inspection, element/field counts, primitive sizes).
+class VariantUtilTest : public ::testing::Test {
+};
+
+// A buffer that starts with a primitive header is reported as kPrimitive.
+TEST_F(VariantUtilTest, GetValueBasicTypePrimitive) {
+  // Int32 header byte followed by a zeroed 4-byte payload.
+  std::vector<uint8_t> encoded = {PrimitiveHeader(PrimitiveType::kInt32), 0, 0, 0, 0};
+  const auto encoded_size = static_cast<int64_t>(encoded.size());
+  ASSERT_OK_AND_ASSIGN(auto basic_type,
+                       GetValueBasicType(encoded.data(), encoded_size));
+  ASSERT_EQ(basic_type, BasicType::kPrimitive);
+}
+
+// A short-string value is reported as kShortString.
+TEST_F(VariantUtilTest, GetValueBasicTypeShortString) {
+  auto encoded = BuildShortString("test");
+  const auto encoded_size = static_cast<int64_t>(encoded.size());
+  ASSERT_OK_AND_ASSIGN(auto basic_type,
+                       GetValueBasicType(encoded.data(), encoded_size));
+  ASSERT_EQ(basic_type, BasicType::kShortString);
+}
+
+// An encoded object is reported as kObject.
+//
+// GetValueBasicType is called with the value bytes only, so no VariantMetadata
+// is constructed here; the field id 0 below is just a placeholder id inside the
+// encoded object.
+TEST_F(VariantUtilTest, GetValueBasicTypeObject) {
+  auto val = BuildShortString("val");
+  auto data = BuildObject({0}, {val});
+  ASSERT_OK_AND_ASSIGN(
+      auto bt, GetValueBasicType(data.data(), static_cast<int64_t>(data.size())));
+  ASSERT_EQ(bt, BasicType::kObject);
+}
+
+// An encoded (empty) array is reported as kArray.
+TEST_F(VariantUtilTest, GetValueBasicTypeArray) {
+  auto encoded = BuildArray({});
+  const auto encoded_size = static_cast<int64_t>(encoded.size());
+  ASSERT_OK_AND_ASSIGN(auto basic_type,
+                       GetValueBasicType(encoded.data(), encoded_size));
+  ASSERT_EQ(basic_type, BasicType::kArray);
+}
+
+// A null, zero-length buffer has no header byte to inspect and must be rejected
+// with Invalid.
+TEST_F(VariantUtilTest, GetValueBasicTypeEmptyBuffer) {
+  ASSERT_RAISES(Invalid, GetValueBasicType(/*data=*/nullptr, /*length=*/0));
+}
+
+// GetObjectFieldCount reports the number of fields stored in an encoded object.
+//
+// The call takes the value bytes only, so no VariantMetadata is constructed
+// here; field ids 0 and 1 are just placeholder ids inside the encoded object.
+TEST_F(VariantUtilTest, GetObjectFieldCount) {
+  auto v1 = BuildShortString("x");
+  auto v2 = BuildShortString("y");
+  auto data = BuildObject({0, 1}, {v1, v2});
+  ASSERT_OK_AND_ASSIGN(
+      auto count, GetObjectFieldCount(data.data(), static_cast<int64_t>(data.size())));
+  ASSERT_EQ(count, 2);
+}
+
+// GetArrayElementCount reports the number of elements stored in an encoded array.
+TEST_F(VariantUtilTest, GetArrayElementCount) {
+  // Three header-only elements: null, true, false.
+  std::vector<uint8_t> null_elem = {PrimitiveHeader(PrimitiveType::kNull)};
+  std::vector<uint8_t> true_elem = {PrimitiveHeader(PrimitiveType::kTrue)};
+  std::vector<uint8_t> false_elem = {PrimitiveHeader(PrimitiveType::kFalse)};
+  auto encoded = BuildArray({null_elem, true_elem, false_elem});
+
+  const auto encoded_size = static_cast<int64_t>(encoded.size());
+  ASSERT_OK_AND_ASSIGN(auto num_elements,
+                       GetArrayElementCount(encoded.data(), encoded_size));
+  ASSERT_EQ(num_elements, 3);
+}
+
+// Checks the payload size PrimitiveValueSize reports for every primitive type
+// listed below; Binary and String are expected to report -1.
+TEST_F(VariantUtilTest, PrimitiveValueSizes) {
+  struct SizeCase {
+    PrimitiveType type;
+    int expected_size;
+  };
+
+  // Kept in the same order as the original one-assertion-per-type list.
+  const std::vector<SizeCase> cases = {
+      {PrimitiveType::kNull, 0},
+      {PrimitiveType::kTrue, 0},
+      {PrimitiveType::kFalse, 0},
+      {PrimitiveType::kInt8, 1},
+      {PrimitiveType::kInt16, 2},
+      {PrimitiveType::kInt32, 4},
+      {PrimitiveType::kInt64, 8},
+      {PrimitiveType::kFloat, 4},
+      {PrimitiveType::kDouble, 8},
+      {PrimitiveType::kDate, 4},
+      {PrimitiveType::kTimestampMicros, 8},
+      {PrimitiveType::kTimestampMicrosNTZ, 8},
+      {PrimitiveType::kTimeNTZ, 8},
+      {PrimitiveType::kTimestampNanos, 8},
+      {PrimitiveType::kTimestampNanosNTZ, 8},
+      {PrimitiveType::kUUID, 16},
+      {PrimitiveType::kDecimal4, 5},
+      {PrimitiveType::kDecimal8, 9},
+      {PrimitiveType::kDecimal16, 17},
+      {PrimitiveType::kBinary, -1},
+      {PrimitiveType::kString, -1},
+  };
+
+  for (size_t i = 0; i < cases.size(); ++i) {
+    ASSERT_EQ(PrimitiveValueSize(cases[i].type), cases[i].expected_size);
+  }
+}
+
+// ===========================================================================
+// Integration: Metadata + Value decoding together
+// ===========================================================================
+
+/// Stateless fixture that groups the tests exercising metadata and value
+/// decoding together.
+class VariantIntegrationTest : public ::testing::Test {
+};
+
+TEST_F(VariantIntegrationTest, FullRoundTrip) {

Review Comment:
   Added. The refactored view classes support composable navigation:
   
   ```cpp
   auto obj = view.as_object();
   auto inner = obj->get("address")->as_object();
   auto city = inner->get("city")->as_string();
   ```
   
   Tests exercise this chaining pattern with multi-level nesting (object -> 
object -> value, object -> array -> value, etc.).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to