wgtmac commented on code in PR #35825:
URL: https://github.com/apache/arrow/pull/35825#discussion_r1222363698
##########
cpp/examples/parquet/parquet_arrow/reader_writer.cc:
##########
@@ -137,4 +137,4 @@ int main(int argc, char** argv) {
   read_single_rowgroup();
   read_single_column();
   read_single_column_chunk();
-}
+}

Review Comment:
   Please revert this line.


##########
cpp/src/parquet/types.h:
##########
@@ -65,7 +65,7 @@ struct Type {
     BYTE_ARRAY = 6,
     FIXED_LEN_BYTE_ARRAY = 7,
     // Should always be last element.
-    UNDEFINED = 8

Review Comment:
   Please revert this line.


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {
+  auto path = test::get_data_file("chunked_string_map.parquet");
+
+  TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
+
+  auto reader_properties = default_arrow_reader_properties();
+
+  reader_properties.set_use_binary_large_variants(true);

Review Comment:
   It would be good to create a new ArrowReaderProperties if a non-default one is expected.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not

Review Comment:
   Yes, we need to support C++17. It would be good to add a TODO item here for this.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2241,12 +2285,27 @@ std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor*
   }
 }

+std::shared_ptr<RecordReader> MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr,
+                                                             LevelInfo leaf_info,
+                                                             ::arrow::MemoryPool* pool,
+                                                             bool read_dictionary,
+                                                             bool read_dense_for_nullable) {
+  if (read_dictionary) {
+    return std::make_shared<LargeByteArrayDictionaryRecordReader>(descr, leaf_info, pool,
+                                                                  read_dense_for_nullable);

Review Comment:
   You may use clang-format to format all changed files.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2132,15 +2161,24 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
  private:
   // Helper data structure for accumulating builder chunks
-  typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+  typename EncodingTraits<BAT>::Accumulator accumulator_;
 };

-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
-                                        virtual public DictionaryRecordReader {
+using ByteArrayChunkedRecordReader = ChunkedRecordReader<ByteArrayType>;
+using LargeByteArrayChunkedRecordReader = ChunkedRecordReader<LargeByteArrayType>;
+
+
+template <typename BAT>
+class DictionaryRecordReaderImpl : public TypedRecordReader<BAT>,

Review Comment:
   Rename it to `ByteArrayDictionaryRecordReaderImpl`?


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {
+  auto path = test::get_data_file("chunked_string_map.parquet");
+

Review Comment:
   Could you delete these empty lines in this test?
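The C++17 constraint raised in the column_reader.cc comments above can be enforced today without concepts: the `IsByteArrayType` trait from the diff plus a `static_assert`, with a TODO marking the eventual concept. A minimal self-contained sketch, assuming nothing beyond the standard library; `ChunkedRecordReaderSketch` is a hypothetical stand-in, not the PR's actual class:

```cpp
#include <type_traits>

// Stand-ins for the Parquet physical types referenced in the diff.
struct ByteArrayType;
struct LargeByteArrayType;

template <typename T>
struct IsByteArrayType : std::false_type {};

template <>
struct IsByteArrayType<ByteArrayType> : std::true_type {};

template <>
struct IsByteArrayType<LargeByteArrayType> : std::true_type {};

// TODO: replace the trait with a C++20 concept once the minimum standard allows it.
template <typename BAT>
class ChunkedRecordReaderSketch {
  static_assert(IsByteArrayType<BAT>::value,
                "BAT must be ByteArrayType or LargeByteArrayType");
};
```

With this in place, instantiating the template with any other type fails at compile time with a readable message, which is the same guarantee the commented-out concept would give.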
##########
cpp/src/parquet/types.h:
##########
@@ -761,6 +761,17 @@
 using Int96Type = PhysicalType<Type::INT96>;
 using FloatType = PhysicalType<Type::FLOAT>;
 using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+
+/*
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.
+ * It is used to store ByteArrays with length > 2^31 - 1.

Review Comment:
   I don't think so. It is used to store ByteArrays whose concatenated length may exceed 2^31 - 1.


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1854,220 +1922,237 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
   PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
 }

-class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
-                                 virtual public ByteArrayDecoder {
- public:
-  using BASE = DictDecoderImpl<ByteArrayType>;
-  using BASE::DictDecoderImpl;
+template <>
+void DictDecoderImpl<LargeByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+  auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder);

-  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
-                  int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionary32Builder* builder) override {
-    int result = 0;
-    if (null_count == 0) {
-      PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
-    } else {
-      PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
-                                       valid_bits_offset, builder, &result));
-    }
-    return result;
-  }
+  // Make a BinaryArray referencing the internal dictionary data

Review Comment:
```suggestion
  // Make a LargeBinaryArray referencing the internal dictionary data
```


##########
cpp/src/parquet/encoding.h:
##########
@@ -24,6 +24,7 @@

 #include "arrow/util/spaced.h"

+#include "arrow/type.h"

Review Comment:
   Please check my comment in encoding.cc. I am not sure whether that can help remove this header as well as the `memory_limit` defined below.


##########
cpp/src/parquet/arrow/reader_internal.h:
##########
@@ -109,6 +109,7 @@ struct ReaderContext {
   FileColumnIteratorFactory iterator_factory;
   bool filter_leaves;
   std::shared_ptr<std::unordered_set<int>> included_leaves;
+  bool use_binary_large_variants = false;

Review Comment:
   Is `use_large_binary_variants` more precise? They are called LargeBinary and LargeString.


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {

Review Comment:
   BTW, it would be good to add a test to verify that reading into StringArray and LargeStringArray yields the same string values in a non-overflow case.
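For the equivalence test suggested just above, one possible shape of the comparison step. `AssertSameStringValues` is a hypothetical helper, not part of the PR; it assumes only the stock `::arrow::StringArray` / `::arrow::LargeStringArray` accessors and GoogleTest:

```cpp
#include <cstdint>

#include <gtest/gtest.h>

#include "arrow/array.h"

// Hypothetical helper: check that a StringArray and a LargeStringArray read
// from the same column hold identical values (the non-overflow case).
void AssertSameStringValues(const ::arrow::StringArray& small,
                            const ::arrow::LargeStringArray& large) {
  ASSERT_EQ(small.length(), large.length());
  for (int64_t i = 0; i < small.length(); ++i) {
    ASSERT_EQ(small.IsNull(i), large.IsNull(i));
    if (!small.IsNull(i)) {
      // GetString() copies the i-th value into a std::string for both types.
      ASSERT_EQ(small.GetString(i), large.GetString(i));
    }
  }
}
```

The surrounding test would read the same file twice, once with default properties and once with `set_use_binary_large_variants(true)`, then feed both results to the helper.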
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not
+// available
+//template <typename T>
+//concept ByteArrayTypeConcept = std::is_same<T, ByteArrayType>::value ||
+//                               std::is_same<T, LargeByteArrayType>::value;
+
+template<typename T>
+struct IsByteArrayType : std::false_type {};
+
+template<>
+struct IsByteArrayType<ByteArrayType> : std::true_type {};
+
+template<>
+struct IsByteArrayType<LargeByteArrayType> : std::true_type {};
+
+template<typename BAT>
+struct ByteArrayBuilderTypeTrait {
+  using BuilderType = typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
+                                                ::arrow::LargeBinaryBuilder,
+                                                ::arrow::BinaryBuilder>::type;
+};
+
+template<typename BAT>
+class ChunkedRecordReader : public TypedRecordReader<BAT>,

Review Comment:
   What about naming it `ByteArrayChunkedRecordReaderImpl`?


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2132,15 +2161,24 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
  private:
   // Helper data structure for accumulating builder chunks
-  typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+  typename EncodingTraits<BAT>::Accumulator accumulator_;
 };

-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
-                                        virtual public DictionaryRecordReader {
+using ByteArrayChunkedRecordReader = ChunkedRecordReader<ByteArrayType>;
+using LargeByteArrayChunkedRecordReader = ChunkedRecordReader<LargeByteArrayType>;
+

Review Comment:
   Remove this blank line.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not
+// available
+//template <typename T>
+//concept ByteArrayTypeConcept = std::is_same<T, ByteArrayType>::value ||
+//                               std::is_same<T, LargeByteArrayType>::value;
+
+template<typename T>
+struct IsByteArrayType : std::false_type {};
+
+template<>
+struct IsByteArrayType<ByteArrayType> : std::true_type {};
+
+template<>
+struct IsByteArrayType<LargeByteArrayType> : std::true_type {};
+
+template<typename BAT>
+struct ByteArrayBuilderTypeTrait {
+  using BuilderType = typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
+                                                ::arrow::LargeBinaryBuilder,
+                                                ::arrow::BinaryBuilder>::type;
+};

Review Comment:
```suggestion
template <typename T>
struct IsByteArrayType : std::false_type {};

template <>
struct IsByteArrayType<ByteArrayType> : std::true_type {};

template <>
struct IsByteArrayType<LargeByteArrayType> : std::true_type {};

template <typename BAT>
struct ByteArrayBuilderTypeTrait {
  using BuilderType =
      typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
                                ::arrow::LargeBinaryBuilder, ::arrow::BinaryBuilder>::type;
};
```


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1238,12 +1238,13 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
   return max_values;
 }

-struct ArrowBinaryHelper {
-  explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
+template <typename BAT>
+struct ArrowBinaryHelperBase {
+  explicit ArrowBinaryHelperBase(typename EncodingTraits<BAT>::Accumulator* out) {
     this->out = out;
     this->builder = out->builder.get();
     this->chunk_space_remaining =
-        ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
+        EncodingTraits<BAT>::memory_limit - this->builder->value_data_length();

Review Comment:
   If `memory_limit` is only used here, can't we use `BAT` to check whether it is the large variant? Then we can avoid adding `memory_limit` to EncodingTraits.


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1677,6 +1702,35 @@ void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
   bytes_offsets[dictionary_length_] = offset;
 }

+template <>

Review Comment:
   These lines duplicate the ones above. Could you please consolidate them?


##########
cpp/src/parquet/types.h:
##########
@@ -761,6 +761,17 @@
 using Int96Type = PhysicalType<Type::INT96>;
 using FloatType = PhysicalType<Type::FLOAT>;
 using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+
+/*
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.

Review Comment:
   This is not precise. We may say
   > Parquet uses ByteArrayType for variable length strings and binaries and their lengths will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their large variants (i.e. LargeStringType and LargeBinaryType).


##########
cpp/src/parquet/encoding.h:
##########
@@ -140,15 +143,37 @@ template <>
 struct EncodingTraits<ByteArrayType> {
   using Encoder = ByteArrayEncoder;
   using Decoder = ByteArrayDecoder;
+  using BinaryBuilder = ::arrow::BinaryBuilder;

Review Comment:
   Is this actually required? Can we use decltype to get it from Accumulator?
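Both ideas above (deriving the chunk limit from `BAT` instead of adding `memory_limit` to EncodingTraits, and recovering the builder type from the Accumulator via `decltype`) are expressible in C++17. A sketch under assumptions: it presumes the PR's `LargeByteArrayType` and `EncodingTraits` are in scope, and the `int64_t` bound for the large variant is an illustrative guess, not necessarily the PR's final choice:

```cpp
#include <cstdint>
#include <limits>
#include <type_traits>
#include <utility>

// Sketch 1: choose the chunk limit from BAT directly, so EncodingTraits does
// not need a memory_limit member. kBinaryMemoryLimit is the existing arrow
// constant seen in the diff; the int64_t bound for the large variant is an
// assumption.
template <typename BAT>
constexpr int64_t BinaryMemoryLimit() {
  if constexpr (std::is_same_v<BAT, LargeByteArrayType>) {
    return std::numeric_limits<int64_t>::max();
  } else {
    return ::arrow::kBinaryMemoryLimit;
  }
}

// Sketch 2: recover the builder type from the Accumulator via decltype,
// assuming Accumulator::builder is a std::unique_ptr<...Builder> as the
// constructor in the diff implies.
template <typename BAT>
using AccumulatorBuilderType =
    typename decltype(std::declval<typename EncodingTraits<BAT>::Accumulator>()
                          .builder)::element_type;
```

If both sketches hold up, neither the `memory_limit` member nor the `BinaryBuilder` alias would need to be added to `EncodingTraits<ByteArrayType>`.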
##########
cpp/src/arrow/array/builder_dict.h:
##########
@@ -724,6 +724,7 @@
 using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
 using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
+using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;

Review Comment:
   SGTM
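As a rough illustration of what the new alias provides (32-bit dictionary indices over LargeBinary values), a hypothetical usage sketch; `BuildExample` is not part of the PR and only assumes the standard builder API:

```cpp
#include <memory>

#include "arrow/array/builder_dict.h"
#include "arrow/status.h"

// Hypothetical example: dictionary-encode a few values with the new alias.
// Repeated values are memoized, so "foo" is stored once in the dictionary.
::arrow::Status BuildExample(std::shared_ptr<::arrow::Array>* out) {
  ::arrow::LargeBinaryDictionary32Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  ARROW_RETURN_NOT_OK(builder.Append("bar"));
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  // The result is a DictionaryArray: int32 indices over large_binary values.
  return builder.Finish(out);
}
```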