pitrou commented on code in PR #14803: URL: https://github.com/apache/arrow/pull/14803#discussion_r1040733335
########## cpp/src/parquet/page_index.h: ########## @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/types.h" + +#include <vector> + +namespace parquet { + +class ColumnDescriptor; +class ReaderProperties; + +/// \brief ColumnIndex is a proxy around format::ColumnIndex. +class PARQUET_EXPORT ColumnIndex { + public: + /// \brief Create a ColumnIndex from a serialized thrift message. + static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, + const void* serialized_index, + uint32_t index_len, + const ReaderProperties& properties); + + virtual ~ColumnIndex() = default; + + /// \brief A bitmap with a bit set for each data page that has only null values. + /// + /// The length of this vector is equal to the number of data pages in the column. + virtual const std::vector<bool>& null_pages() const = 0; + + /// \brief A vector of encoded lower bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their lower bounds populated. + virtual const std::vector<std::string>& encoded_min_values() const = 0; + + /// \brief A vector of encoded upper bounds for each data page in this column. + /// + /// `null_pages` should be inspected first, as only pages with non-null values + /// may have their upper bounds populated. + virtual const std::vector<std::string>& encoded_max_values() const = 0; + + /// \brief The ordering of lower and upper bounds. + /// + /// The boundary order applies accross all lower bounds, and all upper bounds, + /// respectively. However, the order between lower bounds and upper bounds + /// cannot be derived from this. + virtual BoundaryOrder::type boundary_order() const = 0; + + /// \brief Whether per-page null count information is available. + virtual bool has_null_counts() const = 0; + + /// \brief An optional vector with the number of null values in each data page. + /// + /// `has_null_counts` should be called first to determine if this information is + /// available. + virtual const std::vector<int64_t>& null_counts() const = 0; +}; + +/// \brief Typed implementation of ColumnIndex. +template <typename DType> +class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { + public: + using T = typename DType::c_type; + + /// \brief A vector of lower bounds for each data page in this column. + /// + /// This is like `encoded_min_values`, but with the values decoded according to + /// the column's physical type. + /// `min_values` and `max_values` can be used together with `boundary_order` + /// in order to prune some data pages when searching for specific values. + virtual const std::vector<T>& min_values() const = 0; + + /// \brief A vector of upper bounds for each data page in this column. + /// + /// Just like `min_values`, but for upper bounds instead of lower bounds. + virtual const std::vector<T>& max_values() const = 0; + + /// \brief A vector of page indices for not-null pages. + /// + /// It is helpful to understand the original page id in the values returned from + /// min_values() and max_values() above. + virtual const std::vector<int32_t>& non_null_page_indices() const = 0; Review Comment: Again, this doesn't depend on T, it should be exposed on the base class instead. ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include "arrow/util/unreachable.h" + +#include <limits> +#include <map> +#include <numeric> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + DCHECK_LE(src.size(), static_cast<size_t>(std::numeric_limits<uint32_t>::max())); Review Comment: Since this depends on the file's correctness, this should raise an exception instead of asserting out. ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include "arrow/util/unreachable.h" + +#include <limits> +#include <map> +#include <numeric> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + DCHECK_LE(src.size(), static_cast<size_t>(std::numeric_limits<uint32_t>::max())); + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.data()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + size_t num_non_null_pages = std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0U, + [](size_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0U : 1U); + }); + if (num_non_null_pages != 0U) { + min_values_.reserve(num_non_null_pages); + max_values_.reserve(num_non_null_pages); Review Comment: Can also reserve on `non_null_page_indices_`. ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include "arrow/util/unreachable.h" + +#include <limits> +#include <map> +#include <numeric> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + DCHECK_LE(src.size(), static_cast<size_t>(std::numeric_limits<uint32_t>::max())); + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.data()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + size_t num_non_null_pages = std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0U, + [](size_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0U : 1U); + }); + if (num_non_null_pages != 0U) { + min_values_.reserve(num_non_null_pages); + max_values_.reserve(num_non_null_pages); + // Decode min and max values into a compact form (i.e. w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); + Decode<DType>(plain_decoder, column_index_.min_values[i], &value); Review Comment: This seems to assume that `min_values` and `max_values` have the same length as `null_pages`. What if it's not the case? We should detect this situation and raise an exception. ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include <map> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + min_values_.reserve(column_index_.null_pages.size()); + max_values_.reserve(column_index_.null_pages.size()); + // Decode min and max values into a compact form (i.e. w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); + Decode<DType>(plain_decoder, column_index_.min_values[i], &value); + min_values_.emplace_back(value); + Decode<DType>(plain_decoder, column_index_.max_values[i], &value); + max_values_.emplace_back(value); + } + } + } + + const std::vector<bool>& null_pages() const override { + return column_index_.null_pages; + } + + const std::vector<std::string>& encoded_min_values() const override { + return column_index_.min_values; + } + + const std::vector<std::string>& encoded_max_values() const override { + return column_index_.max_values; + } + + BoundaryOrder boundary_order() const override { + return static_cast<BoundaryOrder>(static_cast<int>(column_index_.boundary_order)); + } + + bool has_null_counts() const override { return column_index_.__isset.null_counts; } + + const std::vector<int64_t>& null_counts() const override { + return column_index_.null_counts; + } + + const std::vector<T>& min_values() const override { return min_values_; } + + const std::vector<T>& max_values() const override { return max_values_; } + + const std::vector<int32_t> GetNonNullPageIndices() const override { + return non_null_page_indices_; + } + + private: + /// Wrapped thrift column index. + const format::ColumnIndex column_index_; + /// Decoded typed min/max values. Null pages are set to std::nullopt. + std::vector<T> min_values_; + std::vector<T> max_values_; + /// A list of page indices for not-null pages. + std::vector<int32_t> non_null_page_indices_; +}; + +class OffsetIndexImpl : public OffsetIndex { + public: + explicit OffsetIndexImpl(const format::OffsetIndex& offset_index) { + page_locations_.reserve(offset_index.page_locations.size()); + for (const auto& page_location : offset_index.page_locations) { Review Comment: I'm not sure that is an important concern, and I'm especially not sure why we would care for this for `OffsetIndex` but not for `ColumnIndex` (whose initialization will be more expensive still). ########## cpp/src/parquet/page_index.cc: ########## @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/page_index.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/schema.h" +#include "parquet/statistics.h" +#include "parquet/thrift_internal.h" + +#include "arrow/util/unreachable.h" + +#include <limits> +#include <map> +#include <numeric> + +namespace parquet { + +namespace { + +template <typename DType> +void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder, + const std::string& src, typename DType::c_type* dst) { + decoder->SetData(/*num_values=*/1, reinterpret_cast<const uint8_t*>(src.c_str()), + static_cast<int>(src.size())); + decoder->Decode(dst, /*max_values=*/1); +} + +template <> +void Decode<ByteArrayType>(std::unique_ptr<ByteArrayDecoder>&, const std::string& src, + ByteArray* dst) { + DCHECK_LE(src.size(), static_cast<size_t>(std::numeric_limits<uint32_t>::max())); + dst->len = static_cast<uint32_t>(src.size()); + dst->ptr = reinterpret_cast<const uint8_t*>(src.data()); +} + +template <typename DType> +class TypedColumnIndexImpl : public TypedColumnIndex<DType> { + public: + using T = typename DType::c_type; + + TypedColumnIndexImpl(const ColumnDescriptor& descr, + const format::ColumnIndex& column_index) + : column_index_(column_index) { + size_t num_non_null_pages = std::accumulate( + column_index_.null_pages.cbegin(), column_index_.null_pages.cend(), 0U, + [](size_t num_non_null_pages, bool null_page) { + return num_non_null_pages + (null_page ? 0U : 1U); + }); + if (num_non_null_pages != 0U) { + min_values_.reserve(num_non_null_pages); + max_values_.reserve(num_non_null_pages); + // Decode min and max values into a compact form (i.e. w/o null page) + auto plain_decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, &descr); + T value; + for (size_t i = 0; i < column_index_.null_pages.size(); ++i) { + if (!column_index_.null_pages[i]) { + non_null_page_indices_.emplace_back(static_cast<int32_t>(i)); Review Comment: Before downcasting here we should probably check above that the size doesn't go above the limit (unlikely, but who knows). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
