pitrou commented on code in PR #14803: URL: https://github.com/apache/arrow/pull/14803#discussion_r1039611117
########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. Review Comment: ```suggestion /// \brief A bitmap with a bit set for each data page that has only null values. /// /// The length of this vector is equal to the number of data pages in the column. 
``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. 
Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. Review Comment: ```suggestion /// \brief A vector of encoded lower bounds for each data page in this column. /// /// `null_pages` should be inspected first, as only pages with non-null values /// may have their lower bounds populated. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" Review Comment: Suggestion: use forward-declarations for `ColumnDescriptor` and `ReaderProperties` so that you can minimize transitive inclusions in this `.h`. ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. Review Comment: ```suggestion /// \brief A vector of encoded upper bounds for each data page in this column. 
/// /// `null_pages` should be inspected first, as only pages with non-null values /// may have their upper bounds populated. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. 
For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. Review Comment: ```suggestion /// \brief Whether per-page null count information is available. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. 
+enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. 
+ /// Excluding non-null pages helps binary search if the values are ordered. Review Comment: ```suggestion /// \brief A vector of lower bounds for each data page in this column. /// /// This is like `encoded_min_values`, but with the values decoded according to /// the column's physical type. /// `min_values` and `max_values` can be used together with `boundary_order` /// in order to prune some data pages when searching for specific values. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. 
+ static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. 
Review Comment: ```suggestion /// \brief A vector of upper bounds for each data page in this column. /// /// Just like `min_values`, but for upper bounds instead of lower bounds. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. 
+ virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& max_values() const = 0; + + /// \brief Returns a list of page indices for not-null pages. It is helpful to + /// understand the original page id in the values returned from min_values() + /// and max_values() above. 
+ virtual const std::vector<int32_t> GetNonNullPageIndices() const = 0; +}; + +using BoolColumnIndex = TypedColumnIndex<BooleanType>; +using Int32ColumnIndex = TypedColumnIndex<Int32Type>; +using Int64ColumnIndex = TypedColumnIndex<Int64Type>; +using FloatColumnIndex = TypedColumnIndex<FloatType>; +using DoubleColumnIndex = TypedColumnIndex<DoubleType>; +using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>; +using FLBAColumnIndex = TypedColumnIndex<FLBAType>; + +/// \brief PageLocation is a proxy around format::PageLocation. +struct PARQUET_EXPORT PageLocation { + /// File offset of the data page. + int64_t offset; + /// Total compressed size of the data page and header. + int32_t compressed_page_size; + // row id of the first row in the page within the row group. Review Comment: ```suggestion /// Row id of the first row in the page within the row group. ``` ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include <map> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); + // Decode min and max values into a compact form (i.e. 
w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); + Decode<DType>(plain_decoder, column_index_.min_values[i], &value); + min_values_.emplace_back(value); + Decode<DType>(plain_decoder, column_index_.max_values[i], &value); + max_values_.emplace_back(value); + } + } + } + + const std::vector<bool>& null_pages() const override { + return column_index_.null_pages; + } + + const std::vector<std::string>& encoded_min_values() const override { + return column_index_.min_values; + } + + const std::vector<std::string>& encoded_max_values() const override { + return column_index_.max_values; + } + + BoundaryOrder boundary_order() const override { + return static_cast<BoundaryOrder>(static_cast<int>(column_index_.boundary_order)); + } + + bool has_null_counts() const override { return column_index_.__isset.null_counts; } + + const std::vector<int64_t>& null_counts() const override { + return column_index_.null_counts; + } + + const std::vector<T>& min_values() const override { return min_values_; } + + const std::vector<T>& max_values() const override { return max_values_; } + + const std::vector<int32_t> GetNonNullPageIndices() const override { + return non_null_page_indices_; + } + + private: + /// Wrapped thrift column index. + const format::ColumnIndex column_index_; + /// Decoded typed min/max values. Null pages are set to std::nullopt. + std::vector<T> min_values_; + std::vector<T> max_values_; + /// A list of page indices for not-null pages. 
+ std::vector<int32_t> non_null_page_indices_; +}; + +class OffsetIndexImpl : public OffsetIndex { + public: + explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { + page_locations_.reserve(offset_index.page_locations.size()); + for (const auto& page_location : offset_index.page_locations) { + page_locations_.emplace_back(PageLocation{page_location.offset, + page_location.compressed_page_size, + page_location.first_row_index}); + } + } + + const std::vector<PageLocation>& GetPageLocations() const override { + return page_locations_; + } + + private: + std::vector<PageLocation> page_locations_; +}; + +} // namespace + +// ---------------------------------------------------------------------- +// Public factory functions + +std::unique_ptr<ColumnIndex> ColumnIndex::Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties) { + format::ColumnIndex column_index; + ThriftDeserializer deserializer(properties); + deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(serialized_index), + &index_len, &column_index); + switch (descr.physical_type()) { + case Type::BOOLEAN: + return std::make_unique<TypedColumnIndexImpl<BooleanType>>(descr, column_index); + case Type::INT32: + return std::make_unique<TypedColumnIndexImpl<Int32Type>>(descr, column_index); + case Type::INT64: + return std::make_unique<TypedColumnIndexImpl<Int64Type>>(descr, column_index); + case Type::INT96: + return std::make_unique<TypedColumnIndexImpl<Int96Type>>(descr, column_index); + case Type::FLOAT: + return std::make_unique<TypedColumnIndexImpl<FloatType>>(descr, column_index); + case Type::DOUBLE: + return std::make_unique<TypedColumnIndexImpl<DoubleType>>(descr, column_index); + case Type::BYTE_ARRAY: + return std::make_unique<TypedColumnIndexImpl<ByteArrayType>>(descr, column_index); + case Type::FIXED_LEN_BYTE_ARRAY: + return std::make_unique<TypedColumnIndexImpl<FLBAType>>(descr, column_index); + 
default: + break; + } + DCHECK(false) << "Should not be able to reach this code"; Review Comment: Can use `arrow::Unreachable()` from `arrow/util/unreachable.h`. ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. 
+ virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& max_values() const = 0; + + /// \brief Returns a list of page indices for not-null pages. It is helpful to + /// understand the original page id in the values returned from min_values() + /// and max_values() above. 
+ virtual const std::vector<int32_t> GetNonNullPageIndices() const = 0; +}; + +using BoolColumnIndex = TypedColumnIndex<BooleanType>; +using Int32ColumnIndex = TypedColumnIndex<Int32Type>; +using Int64ColumnIndex = TypedColumnIndex<Int64Type>; +using FloatColumnIndex = TypedColumnIndex<FloatType>; +using DoubleColumnIndex = TypedColumnIndex<DoubleType>; +using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>; +using FLBAColumnIndex = TypedColumnIndex<FLBAType>; + +/// \brief PageLocation is a proxy around format::PageLocation. +struct PARQUET_EXPORT PageLocation { + /// File offset of the data page. + int64_t offset; + /// Total compressed size of the data page and header. + int32_t compressed_page_size; + // row id of the first row in the page within the row group. + int64_t first_row_index; +}; + +/// \brief OffsetIndex is a proxy around format::OffsetIndex. +class PARQUET_EXPORT OffsetIndex { + public: + /// \brief Create a OffsetIndex from a serialized thrift message. + static std::unique_ptr<OffsetIndex> Make(const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~OffsetIndex() = default; + + /// \brief Returns all page locations in the offset index. + virtual const std::vector<PageLocation>& GetPageLocations() const = 0; Review Comment: ```suggestion /// \brief A vector of locations for each data page in this column. virtual const std::vector<PageLocation>& page_locations() const = 0; ``` ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include <map> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); + // Decode min and max values into a compact form (i.e. w/o null page) Review Comment: Hmm, why not, but then `reserve` should be called with the number of non-null data pages (meaning you probably want two separate loops). ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. 
For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. Review Comment: ```suggestion /// \brief An optional vector with the number of null values in each data page. /// /// `has_null_counts` should be called first to determine if this information is available. ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. 
+enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. 
+ /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& max_values() const = 0; + + /// \brief Returns a list of page indices for not-null pages. It is helpful to + /// understand the original page id in the values returned from min_values() + /// and max_values() above. Review Comment: Why expose this `TypedColumnIndex` rather than `ColumnIndex`? ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. 
+ static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. + virtual BoundaryOrder boundary_order() const = 0; + + /// \brief Returns if null count is available. + virtual bool has_null_counts() const = 0; + + /// \brief Returns A list containing the number of null values for each page. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief Returns a list of lower bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief Returns a list of upper bound for the values of every non-null page. + /// Excluding non-null pages helps binary search if the values are ordered. 
+ virtual const std::vector<T>& max_values() const = 0; + + /// \brief Returns a list of page indices for not-null pages. It is helpful to + /// understand the original page id in the values returned from min_values() + /// and max_values() above. + virtual const std::vector<int32_t> GetNonNullPageIndices() const = 0; Review Comment: ```suggestion virtual const std::vector<int32_t>& non_null_page_indices() const = 0; ``` ########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +#include <vector> + +namespace parquet { + +/// \brief BoundaryOrder is a proxy around format::BoundaryOrder. +enum class PARQUET_EXPORT BoundaryOrder { Unordered = 0, Ascending = 1, Descending = 2 }; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. 
+ static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief Returns a list of boolean values to determine the validity of the + /// corresponding min and max values. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief Returns a list of encoded lower bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief Returns a list of encoded upper bound for the values of each page. For null + /// pages the default value is an empty string. Readers must make sure that list entries + /// are populated before using them by inspecting null_pages. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief Returns whether both min_values and max_values are orderd and if so, in which + /// direction. Review Comment: ```suggestion /// \brief The ordering of lower and upper bounds. /// /// The boundary order applies across all lower bounds, and all upper bounds, /// respectively. However, the order between lower bounds and upper bounds /// cannot be derived from this. ``` ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include <map> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); + // Decode min and max values into a compact form (i.e. 
w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); + Decode<DType>(plain_decoder, column_index_.min_values[i], &value); Review Comment: Is it efficient to decode each value separately, or is it possible to batch-decode all min values at once? ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include <map> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); + // Decode min and max values into a compact form (i.e. 
w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); + Decode<DType>(plain_decoder, column_index_.min_values[i], &value); + min_values_.emplace_back(value); + Decode<DType>(plain_decoder, column_index_.max_values[i], &value); + max_values_.emplace_back(value); + } + } + } + + const std::vector<bool>& null_pages() const override { + return column_index_.null_pages; + } + + const std::vector<std::string>& encoded_min_values() const override { + return column_index_.min_values; + } + + const std::vector<std::string>& encoded_max_values() const override { + return column_index_.max_values; + } + + BoundaryOrder boundary_order() const override { + return static_cast<BoundaryOrder>(static_cast<int>(column_index_.boundary_order)); Review Comment: Should use `LoadEnumSafe` here to avoid undefined behavior (UB) on corrupt/malicious data files. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
