Copilot commented on code in PR #80: URL: https://github.com/apache/paimon-cpp/pull/80#discussion_r3393738729
########## src/paimon/core/io/data_file_path_factory_test.cpp: ########## @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_path_factory.h" + +#include <optional> +#include <utility> + +#include "gtest/gtest.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/fs/external_path_provider.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/data/timestamp.h" +#include "paimon/result.h" +#include "paimon/status.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class DataFilePathFactoryTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize the DataFilePathFactory with a parent directory and format identifier + ASSERT_OK(factory_.Init(/*parent=*/"/tmp", /*format_identifier=*/"txt", + /*data_file_prefix=*/"data-", /*external_path_provider=*/nullptr)); + } + + DataFilePathFactory factory_; +}; + +TEST_F(DataFilePathFactoryTest, TestNewPath) { + std::string path1 = factory_.NewPath(); + std::string path2 = factory_.NewPath(); + + // Ensure that the paths are unique + ASSERT_NE(path1, path2); + ASSERT_TRUE(path1.find("/tmp/data-") != std::string::npos); + ASSERT_TRUE(path2.find("/tmp/data-") != std::string::npos); + // test Parent() and NewPathFromName() + ASSERT_EQ(factory_.Parent(), "/tmp"); + ASSERT_EQ(factory_.NewPathFromName("index-file"), "/tmp/index-file"); +} + +TEST_F(DataFilePathFactoryTest, TestNewPathWithDataFilePrefixAndExternalPath) { + DataFilePathFactory factory; Review Comment: The local variable `factory` is never used, while the test re-initializes and uses the fixture member `factory_`. This can trigger unused-variable warnings (and potentially fail builds with `-Werror`). Either remove `factory` or use it for this test instead of `factory_`. ########## src/paimon/core/io/data_file_meta_first_row_id_legacy_serializer.cpp: ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_meta_first_row_id_legacy_serializer.h" + +#include <cassert> +#include <cstdint> +#include <optional> +#include <string> +#include <utility> + +#include "arrow/api.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/serialization_utils.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/status.h" + +namespace paimon { +class Bytes; +class InternalArray; + +const std::shared_ptr<arrow::DataType>& DataFileMetaFirstRowIdLegacySerializer::DataType() { + static std::shared_ptr<arrow::DataType> schema = arrow::struct_( + {arrow::field("_FILE_NAME", arrow::utf8(), /*nullable=*/false), + arrow::field("_FILE_SIZE", arrow::int64(), /*nullable=*/false), + arrow::field("_ROW_COUNT", arrow::int64(), /*nullable=*/false), + arrow::field("_MIN_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_MAX_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_KEY_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_VALUE_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_MIN_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_MAX_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_SCHEMA_ID", arrow::int64(), /*nullable=*/false), + arrow::field("_LEVEL", arrow::int32(), /*nullable=*/false), + arrow::field("_EXTRA_FILES", + arrow::list(arrow::field("item", arrow::utf8(), /*nullable=*/false)), + /*nullable=*/false), + arrow::field("_CREATION_TIME", arrow::timestamp(arrow::TimeUnit::NANO), /*nullable=*/true), + arrow::field("_DELETE_ROW_COUNT", arrow::int64(), /*nullable=*/true), + arrow::field("_EMBEDDED_FILE_INDEX", arrow::binary(), /*nullable=*/true), + arrow::field("_FILE_SOURCE", arrow::int8(), /*nullable=*/true), + arrow::field("_VALUE_STATS_COLS", + arrow::list(arrow::field("item", arrow::utf8(), /*nullable=*/false)), + /*nullable=*/true), + arrow::field("_EXTERNAL_PATH", arrow::utf8(), /*nullable=*/true), + arrow::field("_FIRST_ROW_ID", arrow::int64(), /*nullable=*/true)}); + return schema; +} + +Result<BinaryRow> DataFileMetaFirstRowIdLegacySerializer::ToRow( + const std::shared_ptr<DataFileMeta>& meta) const { + assert(false); + return Status::Invalid("to row for data file meta first row id legacy serializer is invalid"); +} + +Result<std::shared_ptr<DataFileMeta>> DataFileMetaFirstRowIdLegacySerializer::FromRow( + const InternalRow& row) const { + auto file_name = row.GetString(0); + auto file_size = row.GetLong(1); + auto row_count = row.GetLong(2); + auto min_key = row.GetBinary(3); + auto max_key = row.GetBinary(4); + auto key_stats_row = row.GetRow(5, 3); + auto value_stats_row = row.GetRow(6, 3); + auto min_sequence_number = row.GetLong(7); + auto max_sequence_number = row.GetLong(8); + auto schema_id = row.GetLong(9); + auto level = row.GetInt(10); + std::shared_ptr<InternalArray> extra_files = row.GetArray(11); + auto creation_time = row.GetTimestamp(12, 3); Review Comment: The Arrow schema declares `_CREATION_TIME` as `timestamp(NANO)`, but deserialization uses `row.GetTimestamp(12, 3)` (millisecond precision). This mismatch will deserialize legacy snapshots with incorrect timestamps. Fix by aligning the schema and read precision (e.g., change the schema to `TimeUnit::MILLI`, or read with nanosecond precision and convert appropriately). ########## src/paimon/core/io/data_file_path_factory_test.cpp: ########## @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_path_factory.h" + +#include <optional> +#include <utility> + +#include "gtest/gtest.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/fs/external_path_provider.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/data/timestamp.h" +#include "paimon/result.h" +#include "paimon/status.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class DataFilePathFactoryTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize the DataFilePathFactory with a parent directory and format identifier + ASSERT_OK(factory_.Init(/*parent=*/"/tmp", /*format_identifier=*/"txt", + /*data_file_prefix=*/"data-", /*external_path_provider=*/nullptr)); + } + + DataFilePathFactory factory_; +}; + +TEST_F(DataFilePathFactoryTest, TestNewPath) { + std::string path1 = factory_.NewPath(); + std::string path2 = factory_.NewPath(); + + // Ensure that the paths are unique + ASSERT_NE(path1, path2); + ASSERT_TRUE(path1.find("/tmp/data-") != std::string::npos); + ASSERT_TRUE(path2.find("/tmp/data-") != std::string::npos); + // test Parent() and NewPathFromName() + ASSERT_EQ(factory_.Parent(), "/tmp"); + ASSERT_EQ(factory_.NewPathFromName("index-file"), "/tmp/index-file"); +} + +TEST_F(DataFilePathFactoryTest, TestNewPathWithDataFilePrefixAndExternalPath) { + DataFilePathFactory factory; + ASSERT_OK_AND_ASSIGN( + std::unique_ptr<ExternalPathProvider> external_path_provider, + ExternalPathProvider::Create({"/tmp/external_path/"}, "p0=1/p1=0/bucket-0")); + + ASSERT_OK(factory_.Init(/*parent=*/"/tmp/p0=1/p1=0/bucket-0/", /*format_identifier=*/"txt", + /*data_file_prefix=*/"test-data-", std::move(external_path_provider))); Review Comment: The local variable `factory` is never used, while the test re-initializes and uses the fixture member `factory_`. This can trigger unused-variable warnings (and potentially fail builds with `-Werror`). Either remove `factory` or use it for this test instead of `factory_`. ########## src/paimon/core/io/data_file_meta_test.cpp: ########## @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_meta.h" + +#include "gtest/gtest.h" +#include "paimon/status.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { +TEST(DataFileMetaTest, TestAddRowCount) { + DataFileMeta file_meta("data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", /*file_size=*/645, + /*row_count=*/5, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), + /*min_sequence_number=*/0, /*max_sequence_number=*/4, /*schema_id=*/0, + /*level=*/0, /*extra_files=*/{}, + /*creation_time=*/Timestamp(1737111915429ll, 0), + /*delete_row_count=*/2, /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, + /*write_cols=*/std::nullopt); + ASSERT_EQ(3, file_meta.AddRowCount().value()); + // test null delete row count + file_meta.delete_row_count = std::nullopt; + ASSERT_EQ(std::nullopt, file_meta.AddRowCount()); +} + +TEST(DataFileMetaTest, TestFileFormat) { + DataFileMeta file_meta("data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", /*file_size=*/645, + /*row_count=*/5, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), + /*min_sequence_number=*/0, /*max_sequence_number=*/4, /*schema_id=*/0, + /*level=*/0, /*extra_files=*/{}, + /*creation_time=*/Timestamp(1737111915429ll, 0), + /*delete_row_count=*/2, /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, + /*write_cols=*/std::nullopt); + ASSERT_OK_AND_ASSIGN(auto file_format, file_meta.FileFormat()); + ASSERT_EQ("orc", file_format); + file_meta.file_name = "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.parquet"; + ASSERT_OK_AND_ASSIGN(file_format, file_meta.FileFormat()); + ASSERT_EQ("parquet", file_format); + // test invalid data file name + file_meta.file_name = "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0"; + ASSERT_NOK_WITH_MSG(file_meta.FileFormat(), + "cannot find format from file data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0"); +} + +TEST(DataFileMetaTest, TestExternalPathDir) { + DataFileMeta file_meta( + "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", /*file_size=*/645, /*row_count=*/5, + BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), SimpleStats::EmptyStats(), + SimpleStats::EmptyStats(), + /*min_sequence_number=*/0, /*max_sequence_number=*/4, /*schema_id=*/0, + /*level=*/0, /*extra_files=*/{}, /*creation_time=*/Timestamp(1737111915429ll, 0), + /*delete_row_count=*/0, /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/"file:/tmp/bucket-0/data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", + /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + ASSERT_EQ("file:/tmp/bucket-0", file_meta.ExternalPathDir().value()); + file_meta.external_path = std::nullopt; + ASSERT_EQ(std::nullopt, file_meta.ExternalPathDir()); +} + +TEST(DataFileMetaTest, TestGetMaxSequenceNumber) { + auto file_meta1 = std::make_shared<DataFileMeta>( + "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", /*file_size=*/645, + /*row_count=*/5, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), SimpleStats::EmptyStats(), + SimpleStats::EmptyStats(), + /*min_sequence_number=*/0, /*max_sequence_number=*/4, /*schema_id=*/0, + /*level=*/0, /*extra_files=*/std::vector<std::optional<std::string>>(), + /*creation_time=*/Timestamp(1737111915429ll, 0), + /*delete_row_count=*/2, /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + auto file_meta2 = std::make_shared<DataFileMeta>( + "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-1.orc", /*file_size=*/645, + /*row_count=*/5, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), SimpleStats::EmptyStats(), + SimpleStats::EmptyStats(), + /*min_sequence_number=*/5, /*max_sequence_number=*/10, /*schema_id=*/0, + /*level=*/0, /*extra_files=*/std::vector<std::optional<std::string>>(), + /*creation_time=*/Timestamp(1737111915429ll, 0), + /*delete_row_count=*/2, /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + ASSERT_EQ(4, DataFileMeta::GetMaxSequenceNumber({file_meta1})); + ASSERT_EQ(10, DataFileMeta::GetMaxSequenceNumber({file_meta1, file_meta2})); + ASSERT_EQ(-1, DataFileMeta::GetMaxSequenceNumber({})); + ASSERT_EQ(file_meta1, file_meta1); Review Comment: This assertion is tautological and doesn't validate any behavior (it will always pass). Consider removing it or replacing it with an assertion that exercises the intended equality semantics (e.g., comparing dereferenced `DataFileMeta` objects if that's what you want to validate). ########## src/paimon/core/io/data_file_meta_first_row_id_legacy_serializer.cpp: ########## @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_meta_first_row_id_legacy_serializer.h" + +#include <cassert> +#include <cstdint> +#include <optional> +#include <string> +#include <utility> + +#include "arrow/api.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/serialization_utils.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/status.h" + +namespace paimon { +class Bytes; +class InternalArray; + +const std::shared_ptr<arrow::DataType>& DataFileMetaFirstRowIdLegacySerializer::DataType() { + static std::shared_ptr<arrow::DataType> schema = arrow::struct_( + {arrow::field("_FILE_NAME", arrow::utf8(), /*nullable=*/false), + arrow::field("_FILE_SIZE", arrow::int64(), /*nullable=*/false), + arrow::field("_ROW_COUNT", arrow::int64(), /*nullable=*/false), + arrow::field("_MIN_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_MAX_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_KEY_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_VALUE_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_MIN_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_MAX_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_SCHEMA_ID", arrow::int64(), /*nullable=*/false), + arrow::field("_LEVEL", arrow::int32(), /*nullable=*/false), + arrow::field("_EXTRA_FILES", + arrow::list(arrow::field("item", arrow::utf8(), /*nullable=*/false)), + /*nullable=*/false), + arrow::field("_CREATION_TIME", arrow::timestamp(arrow::TimeUnit::NANO), /*nullable=*/true), Review Comment: The Arrow schema declares `_CREATION_TIME` as `timestamp(NANO)`, but deserialization uses `row.GetTimestamp(12, 3)` (millisecond precision). This mismatch will deserialize legacy snapshots with incorrect timestamps. Fix by aligning the schema and read precision (e.g., change the schema to `TimeUnit::MILLI`, or read with nanosecond precision and convert appropriately). ########## src/paimon/core/io/data_file_meta_serializer.cpp: ########## @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_meta_serializer.h" + +#include <cassert> +#include <cstdint> +#include <optional> +#include <string> +#include <utility> + +#include "paimon/common/data/binary_row_writer.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/serialization_utils.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/data/timestamp.h" +#include "paimon/status.h" + +namespace paimon { + +class Bytes; +class InternalArray; +class MemoryPool; + +Result<BinaryRow> DataFileMetaSerializer::ToRow(const std::shared_ptr<DataFileMeta>& meta) const { + BinaryRow row(20); + BinaryRowWriter writer(&row, 32 * 1024, pool_.get()); + writer.WriteString(0, BinaryString::FromString(meta->file_name, pool_.get())); + writer.WriteLong(1, meta->file_size); + writer.WriteLong(2, meta->row_count); + auto min_key_bytes = SerializationUtils::SerializeBinaryRow(meta->min_key, pool_.get()); + writer.WriteBinary(3, *min_key_bytes); + auto max_key_bytes = SerializationUtils::SerializeBinaryRow(meta->max_key, pool_.get()); + writer.WriteBinary(4, *max_key_bytes); + writer.WriteRow(5, meta->key_stats.ToRow()); + writer.WriteRow(6, meta->value_stats.ToRow()); + writer.WriteLong(7, meta->min_sequence_number); + writer.WriteLong(8, meta->max_sequence_number); + writer.WriteLong(9, meta->schema_id); + writer.WriteInt(10, meta->level); + writer.WriteArray(11, InternalRowUtils::ToStringArrayData(meta->extra_files, pool_)); + writer.WriteTimestamp(12, meta->creation_time, 3); + if (meta->delete_row_count == std::nullopt) { + writer.SetNullAt(13); + } else { + writer.WriteLong(13, meta->delete_row_count.value()); + } + if (meta->embedded_index == nullptr) { + writer.SetNullAt(14); + } else { + writer.WriteBinary(14, *meta->embedded_index); + } + if (meta->file_source == std::nullopt) { + writer.SetNullAt(15); + } else { + writer.WriteByte(15, meta->file_source.value().ToByteValue()); + } + if (meta->value_stats_cols == std::nullopt) { + writer.SetNullAt(16); + } else { + writer.WriteArray( + 16, InternalRowUtils::ToNotNullStringArrayData(meta->value_stats_cols.value(), pool_)); + } + if (meta->external_path == std::nullopt) { + writer.SetNullAt(17); + } else { + writer.WriteString(17, BinaryString::FromString(meta->external_path.value(), pool_.get())); + } + if (meta->first_row_id == std::nullopt) { + writer.SetNullAt(18); + } else { + writer.WriteLong(18, meta->first_row_id.value()); + } + if (meta->write_cols == std::nullopt) { + writer.SetNullAt(19); + } else { + writer.WriteArray( + 19, InternalRowUtils::ToNotNullStringArrayData(meta->write_cols.value(), pool_)); + } + writer.Complete(); + return row; +} + +Result<std::shared_ptr<DataFileMeta>> DataFileMetaSerializer::FromRow( + const InternalRow& row) const { + auto file_name = row.GetString(0); + auto file_size = row.GetLong(1); + auto row_count = row.GetLong(2); + auto min_key = row.GetBinary(3); + auto max_key = row.GetBinary(4); + auto key_stats_row = row.GetRow(5, 3); + auto value_stats_row = row.GetRow(6, 3); + auto min_sequence_number = row.GetLong(7); + auto max_sequence_number = row.GetLong(8); + auto schema_id = row.GetLong(9); + auto level = row.GetInt(10); + std::shared_ptr<InternalArray> extra_files = row.GetArray(11); + auto creation_time = row.GetTimestamp(12, 3); + + assert(min_key && max_key && key_stats_row && value_stats_row); + if (extra_files == nullptr) { + return Status::Invalid("extra files is empty"); + } + + std::optional<int64_t> delete_row_count; + if (!row.IsNullAt(13)) { + delete_row_count = row.GetLong(13); + } + std::shared_ptr<Bytes> embedded_file_index; + if (!row.IsNullAt(14)) { + embedded_file_index = row.GetBinary(14); + } + + std::optional<FileSource> file_source; + if (!row.IsNullAt(15)) { + PAIMON_ASSIGN_OR_RAISE(file_source, FileSource::FromByteValue(row.GetByte(15))); + } + + std::optional<std::vector<std::string>> value_stats_cols; + if (!row.IsNullAt(16)) { + std::shared_ptr<InternalArray> array = row.GetArray(16); + if (array == nullptr) { + return Status::Invalid("invalid value stats cols"); + } + value_stats_cols = InternalRowUtils::FromNotNullStringArrayData(array.get()); + } + + std::optional<std::string> external_path; + if (!row.IsNullAt(17)) { + external_path = row.GetString(17).ToString(); + } + std::optional<int64_t> first_row_id; + if (!row.IsNullAt(18)) { + first_row_id = row.GetLong(18); + } + + std::optional<std::vector<std::string>> write_cols; + if (!row.IsNullAt(19)) { + std::shared_ptr<InternalArray> array = row.GetArray(19); + if (array == nullptr) { + return Status::Invalid("invalid write cols"); + } + write_cols = InternalRowUtils::FromNotNullStringArrayData(array.get()); + } + PAIMON_ASSIGN_OR_RAISE(BinaryRow min_values, SerializationUtils::DeserializeBinaryRow(min_key)); + PAIMON_ASSIGN_OR_RAISE(BinaryRow max_values, SerializationUtils::DeserializeBinaryRow(max_key)); + PAIMON_ASSIGN_OR_RAISE(SimpleStats key_stats, + SimpleStats::FromRow(key_stats_row.get(), pool_.get())); + PAIMON_ASSIGN_OR_RAISE(SimpleStats value_stats, + SimpleStats::FromRow(value_stats_row.get(), pool_.get())); + return std::make_shared<DataFileMeta>( + file_name.ToString(), file_size, row_count, min_values, max_values, key_stats, value_stats, + min_sequence_number, max_sequence_number, schema_id, level, + InternalRowUtils::FromStringArrayData(extra_files.get()), creation_time, delete_row_count, + embedded_file_index, file_source, std::optional<std::vector<std::string>>(value_stats_cols), + external_path, first_row_id, write_cols); Review Comment: `value_stats_cols` is already a `std::optional<std::vector<std::string>>`. Wrapping it as `std::optional<std::vector<std::string>>(value_stats_cols)` attempts to construct a `std::vector<std::string>` from an `std::optional<...>` and will not compile. Pass `value_stats_cols` directly. The same pattern appears in the other snapshot/legacy serializers added in this PR and should be fixed consistently. ########## src/paimon/core/io/data_file_meta_10_serializer.cpp: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/data_file_meta_10_serializer.h" + +#include <cassert> +#include <cstdint> +#include <optional> +#include <string> +#include <utility> + +#include "arrow/api.h" +#include "paimon/common/data/binary_string.h" +#include "paimon/common/data/internal_row.h" +#include "paimon/common/utils/internal_row_utils.h" +#include "paimon/common/utils/serialization_utils.h" +#include "paimon/core/manifest/file_source.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/status.h" + +namespace paimon { +class Bytes; +class InternalArray; + +const std::shared_ptr<arrow::DataType>& DataFileMeta10Serializer::DataType() { + static std::shared_ptr<arrow::DataType> schema = arrow::struct_( + {arrow::field("_FILE_NAME", arrow::utf8(), /*nullable=*/false), + arrow::field("_FILE_SIZE", arrow::int64(), /*nullable=*/false), + arrow::field("_ROW_COUNT", arrow::int64(), /*nullable=*/false), + arrow::field("_MIN_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_MAX_KEY", arrow::binary(), /*nullable=*/false), + arrow::field("_KEY_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_VALUE_STATS", SimpleStats::DataType(), /*nullable=*/false), + arrow::field("_MIN_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_MAX_SEQUENCE_NUMBER", arrow::int64(), /*nullable=*/false), + arrow::field("_SCHEMA_ID", arrow::int64(), /*nullable=*/false), + arrow::field("_LEVEL", arrow::int32(), /*nullable=*/false), + arrow::field("_EXTRA_FILES", + arrow::list(arrow::field("item", arrow::utf8(), /*nullable=*/false)), + /*nullable=*/false), + arrow::field("_CREATION_TIME", arrow::timestamp(arrow::TimeUnit::MILLI), + /*nullable=*/true), + arrow::field("_DELETE_ROW_COUNT", arrow::int64(), /*nullable=*/true), + arrow::field("_EMBEDDED_FILE_INDEX", arrow::binary(), /*nullable=*/true), + arrow::field("_FILE_SOURCE", arrow::int8(), /*nullable=*/true), + arrow::field("_VALUE_STATS_COLS", + arrow::list(arrow::field("item", arrow::utf8(), /*nullable=*/false)), + /*nullable=*/true)}); + return schema; +} + +Result<BinaryRow> DataFileMeta10Serializer::ToRow(const std::shared_ptr<DataFileMeta>& meta) const { + assert(false); + return Status::Invalid("to row for data file meta 10 serializer is invalid"); +} Review Comment: Using `assert(false)` here means any accidental call in debug builds will abort the process rather than returning a clean error. Since this serializer is intentionally read-only, consider removing the `assert(false)` and just returning `Status::Invalid`, or if the type hierarchy allows it, avoid exposing `ToRow` for legacy-only serializers (e.g., a separate interface or explicitly disabling it). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
