qzyu999 commented on code in PR #50122: URL: https://github.com/apache/arrow/pull/50122#discussion_r3485323349
########## cpp/src/arrow/extension/variant_builder_test.cc: ########## @@ -0,0 +1,1180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/variant_internal.h" +#include "arrow/extension/variant_test_util.h" + +#include <cstring> +#include <limits> +#include <string> +#include <vector> + +#include "arrow/testing/gtest_util.h" + +namespace arrow::extension::variant_internal { + +// =========================================================================== +// Helper: decode an EncodedVariant and return visitor events +// =========================================================================== + +/// Encode with builder, decode, return events. +/// Note: Uses .ValueOrDie() because ASSERT_OK_AND_ASSIGN cannot be used +/// in a non-void function. Test-only; will crash with a descriptive message +/// on failure rather than producing a clean test failure. +std::vector<std::string> RoundTrip(VariantBuilder& builder) { + auto result = builder.Finish().ValueOrDie(); + auto metadata = + DecodeMetadata(result.metadata.data(), static_cast<int64_t>(result.metadata.size())) + .ValueOrDie(); + RecordingVisitor visitor; + DecodeVariantValue(metadata, result.value.data(), + static_cast<int64_t>(result.value.size()), &visitor) + .ok(); + return visitor.events; +} + +// =========================================================================== +// Primitive round-trip tests +// =========================================================================== + +class VariantBuilderPrimitiveTest : public ::testing::Test {}; + +TEST_F(VariantBuilderPrimitiveTest, Null) { + VariantBuilder b; + ASSERT_OK(b.Null()); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Null"); +} + +TEST_F(VariantBuilderPrimitiveTest, BoolTrue) { + VariantBuilder b; + ASSERT_OK(b.Bool(true)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Bool(true)"); +} + +TEST_F(VariantBuilderPrimitiveTest, BoolFalse) { + VariantBuilder b; + ASSERT_OK(b.Bool(false)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Bool(false)"); +} + +TEST_F(VariantBuilderPrimitiveTest, IntAutoSizesInt8) { + VariantBuilder b; + ASSERT_OK(b.Int(42)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Int8(42)"); +} + +TEST_F(VariantBuilderPrimitiveTest, IntAutoSizesInt16) { + VariantBuilder b; + ASSERT_OK(b.Int(300)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Int16(300)"); +} + +TEST_F(VariantBuilderPrimitiveTest, IntAutoSizesInt32) { + VariantBuilder b; + ASSERT_OK(b.Int(100000)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Int32(100000)"); +} + +TEST_F(VariantBuilderPrimitiveTest, IntAutoSizesInt64) { + VariantBuilder b; + ASSERT_OK(b.Int(5000000000LL)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Int64(5000000000)"); +} + +TEST_F(VariantBuilderPrimitiveTest, IntNegative) { + VariantBuilder b; + ASSERT_OK(b.Int(-42)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Int8(-42)"); +} + +TEST_F(VariantBuilderPrimitiveTest, ShortString) { + VariantBuilder b; + ASSERT_OK(b.String("hello")); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "String(\"hello\")"); +} + +TEST_F(VariantBuilderPrimitiveTest, LongString) { + std::string long_str(100, 'x'); + VariantBuilder b; + ASSERT_OK(b.String(long_str)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "String(\"" + long_str + "\")"); +} + +TEST_F(VariantBuilderPrimitiveTest, ShortStringBoundary63) { + std::string str63(63, 'a'); + VariantBuilder b; + ASSERT_OK(b.String(str63)); + ASSERT_OK_AND_ASSIGN(auto result, b.Finish()); + // Should use short string encoding: 1 byte header + 63 bytes + ASSERT_EQ(result.value.size(), 64); +} + +TEST_F(VariantBuilderPrimitiveTest, LongStringBoundary64) { + std::string str64(64, 'a'); + VariantBuilder b; + ASSERT_OK(b.String(str64)); + ASSERT_OK_AND_ASSIGN(auto result, b.Finish()); + // Should use long string encoding: 1 byte header + 4 byte length + 64 bytes + ASSERT_EQ(result.value.size(), 69); +} + +TEST_F(VariantBuilderPrimitiveTest, Date) { + VariantBuilder b; + ASSERT_OK(b.Date(19000)); + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "Date(19000)"); +} + +TEST_F(VariantBuilderPrimitiveTest, Double) { + VariantBuilder b; + ASSERT_OK(b.Double(3.14)); + auto events = RoundTrip(b); + ASSERT_TRUE(events[0].find("Double(") == 0); +} + +// =========================================================================== +// Array round-trip tests +// =========================================================================== + +class VariantBuilderArrayTest : public ::testing::Test {}; + +TEST_F(VariantBuilderArrayTest, EmptyArray) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<int64_t> offsets; + ASSERT_OK(b.FinishArray(start, offsets)); + auto events = RoundTrip(b); + ASSERT_EQ(events.size(), 2); + ASSERT_EQ(events[0], "StartArray(0)"); + ASSERT_EQ(events[1], "EndArray"); +} + +TEST_F(VariantBuilderArrayTest, SimpleArray) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<int64_t> offsets; + offsets.push_back(b.NextElement(start)); + ASSERT_OK(b.Int(1)); + offsets.push_back(b.NextElement(start)); + ASSERT_OK(b.Int(2)); + offsets.push_back(b.NextElement(start)); + ASSERT_OK(b.Int(3)); + ASSERT_OK(b.FinishArray(start, offsets)); + + auto events = RoundTrip(b); + ASSERT_EQ(events.size(), 5); + ASSERT_EQ(events[0], "StartArray(3)"); + ASSERT_EQ(events[1], "Int8(1)"); + ASSERT_EQ(events[2], "Int8(2)"); + ASSERT_EQ(events[3], "Int8(3)"); + ASSERT_EQ(events[4], "EndArray"); +} + +TEST_F(VariantBuilderArrayTest, NestedArray) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<int64_t> offsets; + + // First element: nested array [10, 20] + offsets.push_back(b.NextElement(start)); + auto inner_start = b.Offset(); + std::vector<int64_t> inner_offsets; + inner_offsets.push_back(b.NextElement(inner_start)); + ASSERT_OK(b.Int(10)); + inner_offsets.push_back(b.NextElement(inner_start)); + ASSERT_OK(b.Int(20)); + ASSERT_OK(b.FinishArray(inner_start, inner_offsets)); + + // Second element: 30 + offsets.push_back(b.NextElement(start)); + ASSERT_OK(b.Int(30)); + + ASSERT_OK(b.FinishArray(start, offsets)); + + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "StartArray(2)"); + ASSERT_EQ(events[1], "StartArray(2)"); + ASSERT_EQ(events[2], "Int8(10)"); + ASSERT_EQ(events[3], "Int8(20)"); + ASSERT_EQ(events[4], "EndArray"); + ASSERT_EQ(events[5], "Int8(30)"); + ASSERT_EQ(events[6], "EndArray"); +} + +// =========================================================================== +// Object round-trip tests +// =========================================================================== + +class VariantBuilderObjectTest : public ::testing::Test {}; + +TEST_F(VariantBuilderObjectTest, EmptyObject) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + ASSERT_OK(b.FinishObject(start, fields)); + auto events = RoundTrip(b); + ASSERT_EQ(events.size(), 2); + ASSERT_EQ(events[0], "StartObject(0)"); + ASSERT_EQ(events[1], "EndObject"); +} + +TEST_F(VariantBuilderObjectTest, SimpleObject) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + fields.push_back(b.NextField(start, "name")); + ASSERT_OK(b.String("Alice")); + fields.push_back(b.NextField(start, "age")); + ASSERT_OK(b.Int(30)); + ASSERT_OK(b.FinishObject(start, fields)); + + auto events = RoundTrip(b); + // Fields sorted by key: "age" before "name" + ASSERT_EQ(events[0], "StartObject(2)"); + ASSERT_EQ(events[1], "FieldName(\"age\")"); + ASSERT_EQ(events[2], "Int8(30)"); + ASSERT_EQ(events[3], "FieldName(\"name\")"); + ASSERT_EQ(events[4], "String(\"Alice\")"); + ASSERT_EQ(events[5], "EndObject"); +} + +TEST_F(VariantBuilderObjectTest, NestedObject) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + fields.push_back(b.NextField(start, "inner")); + { + auto inner_start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> inner_fields; + inner_fields.push_back(b.NextField(inner_start, "key")); + ASSERT_OK(b.String("value")); + ASSERT_OK(b.FinishObject(inner_start, inner_fields)); + } + ASSERT_OK(b.FinishObject(start, fields)); + + auto events = RoundTrip(b); + ASSERT_EQ(events[0], "StartObject(1)"); + ASSERT_EQ(events[1], "FieldName(\"inner\")"); + ASSERT_EQ(events[2], "StartObject(1)"); + ASSERT_EQ(events[3], "FieldName(\"key\")"); + ASSERT_EQ(events[4], "String(\"value\")"); + ASSERT_EQ(events[5], "EndObject"); + ASSERT_EQ(events[6], "EndObject"); +} + +TEST_F(VariantBuilderObjectTest, DuplicateKeyError) { + VariantBuilder b; + auto start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + fields.push_back(b.NextField(start, "key")); + ASSERT_OK(b.Int(1)); + fields.push_back(b.NextField(start, "key")); + ASSERT_OK(b.Int(2)); + ASSERT_RAISES(Invalid, b.FinishObject(start, fields)); +} + +TEST_F(VariantBuilderObjectTest, FieldsSortedByKey) { + // Insert fields in reverse order; verify they come out sorted + VariantBuilder b; + auto start = b.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + fields.push_back(b.NextField(start, "z_last")); + ASSERT_OK(b.Int(3)); + fields.push_back(b.NextField(start, "a_first")); + ASSERT_OK(b.Int(1)); + fields.push_back(b.NextField(start, "m_middle")); + ASSERT_OK(b.Int(2)); + ASSERT_OK(b.FinishObject(start, fields)); + + auto events = RoundTrip(b); + ASSERT_EQ(events[1], "FieldName(\"a_first\")"); + ASSERT_EQ(events[2], "Int8(1)"); + ASSERT_EQ(events[3], "FieldName(\"m_middle\")"); + ASSERT_EQ(events[4], "Int8(2)"); + ASSERT_EQ(events[5], "FieldName(\"z_last\")"); + ASSERT_EQ(events[6], "Int8(3)"); +} + +// =========================================================================== +// Builder features +// =========================================================================== + +class VariantBuilderFeatureTest : public ::testing::Test {}; + +TEST_F(VariantBuilderFeatureTest, Reset) { + VariantBuilder b; + ASSERT_OK(b.Int(42)); + auto events1 = RoundTrip(b); + ASSERT_EQ(events1[0], "Int8(42)"); + + b.Reset(); + ASSERT_OK(b.String("hello")); + auto events2 = RoundTrip(b); + ASSERT_EQ(events2[0], "String(\"hello\")"); +} + +TEST_F(VariantBuilderFeatureTest, BuilderFromExistingMetadata) { + // First, build a variant to get metadata + VariantBuilder b1; + auto start = b1.Offset(); + std::vector<VariantBuilder::FieldEntry> fields; + fields.push_back(b1.NextField(start, "name")); + ASSERT_OK(b1.String("Alice")); + ASSERT_OK(b1.FinishObject(start, fields)); + ASSERT_OK_AND_ASSIGN(auto encoded1, b1.Finish()); + + // Decode the metadata + ASSERT_OK_AND_ASSIGN(auto meta, + DecodeMetadata(encoded1.metadata.data(), + static_cast<int64_t>(encoded1.metadata.size()))); + + // Build a new variant reusing the same metadata Review Comment: Thanks for the suggestion. After working through the refactoring, I realized this test isn't possible because of how the variant format works, the metadata dictionary contains only key names (string interning for object field names), not value types. The format is self-describing: each value carries its own type tag in its header byte. A "type mismatch between metadata and data" is architecturally impossible because metadata doesn't encode types at all. The refactored `VariantMetadata` struct's docstring makes this explicit: "This is NOT a schema, it contains key names only, not value types." What you can test (and what the validation tests cover) is structural invariant violations: field IDs that exceed the dictionary size, offsets that point out of bounds, truncated values, etc. These are exercised by the `ValidateVariant` tests. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
