wgtmac commented on code in PR #35825:
URL: https://github.com/apache/arrow/pull/35825#discussion_r1222363698
##########
cpp/examples/parquet/parquet_arrow/reader_writer.cc:
##########
@@ -137,4 +137,4 @@ int main(int argc, char** argv) {
   read_single_rowgroup();
   read_single_column();
   read_single_column_chunk();
-}
+}

Review Comment:
   Please revert this line.


##########
cpp/src/parquet/types.h:
##########
@@ -65,7 +65,7 @@ struct Type {
     BYTE_ARRAY = 6,
     FIXED_LEN_BYTE_ARRAY = 7,
     // Should always be last element.
-    UNDEFINED = 8

Review Comment:
   Please revert this line.


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {
+  auto path = test::get_data_file("chunked_string_map.parquet");
+
+  TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
+
+  auto reader_properties = default_arrow_reader_properties();
+
+  reader_properties.set_use_binary_large_variants(true);

Review Comment:
   It would be good to create a new ArrowReaderProperties if a non-default one is expected.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not

Review Comment:
   Yes, we need to support C++17. It would be good to add a TODO item here for this.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2241,12 +2285,27 @@ std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor*
   }
 }

+std::shared_ptr<RecordReader> MakeLargeByteArrayRecordReader(const ColumnDescriptor* descr,
+                                                             LevelInfo leaf_info,
+                                                             ::arrow::MemoryPool* pool,
+                                                             bool read_dictionary,
+                                                             bool read_dense_for_nullable) {
+  if (read_dictionary) {
+    return std::make_shared<LargeByteArrayDictionaryRecordReader>(descr, leaf_info, pool,
+                                                                  read_dense_for_nullable);

Review Comment:
   You may use clang-format to format all changed files.


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2132,15 +2161,24 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
  private:
   // Helper data structure for accumulating builder chunks
-  typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+  typename EncodingTraits<BAT>::Accumulator accumulator_;
 };

-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
-                                        virtual public DictionaryRecordReader {
+using ByteArrayChunkedRecordReader = ChunkedRecordReader<ByteArrayType>;
+using LargeByteArrayChunkedRecordReader = ChunkedRecordReader<LargeByteArrayType>;
+
+
+template <typename BAT>
+class DictionaryRecordReaderImpl : public TypedRecordReader<BAT>,

Review Comment:
   Rename it to `ByteArrayDictionaryRecordReaderImpl`?


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {
+  auto path = test::get_data_file("chunked_string_map.parquet");
+

Review Comment:
   Could you delete these empty lines in this test?
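The C++17 constraint raised in the column_reader.cc comments above can be enforced today without concepts: the `IsByteArrayType` trait from the diff plus a `static_assert`, with a TODO marking the eventual concept. A minimal self-contained sketch, assuming nothing beyond the standard library; `ChunkedRecordReaderSketch` is a hypothetical stand-in, not the PR's actual class:

```cpp
#include <type_traits>

// Stand-ins for the Parquet physical types referenced in the diff.
struct ByteArrayType;
struct LargeByteArrayType;

template <typename T>
struct IsByteArrayType : std::false_type {};

template <>
struct IsByteArrayType<ByteArrayType> : std::true_type {};

template <>
struct IsByteArrayType<LargeByteArrayType> : std::true_type {};

// TODO: replace the trait with a C++20 concept once the minimum standard allows it.
template <typename BAT>
class ChunkedRecordReaderSketch {
  static_assert(IsByteArrayType<BAT>::value,
                "BAT must be ByteArrayType or LargeByteArrayType");
};
```

With this in place, instantiating the template with any other type fails at compile time with a readable message, which is the same guarantee the commented-out concept would give.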
##########
cpp/src/parquet/types.h:
##########
@@ -761,6 +761,17 @@
 using Int96Type = PhysicalType<Type::INT96>;
 using FloatType = PhysicalType<Type::FLOAT>;
 using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+
+/*
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.
+ * It is used to store ByteArrays with length > 2^31 - 1.

Review Comment:
   I don't think so. It is used to store ByteArrays whose concatenated length may exceed 2^31 - 1.


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1854,220 +1922,237 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
   PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
 }

-class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
-                                 virtual public ByteArrayDecoder {
- public:
-  using BASE = DictDecoderImpl<ByteArrayType>;
-  using BASE::DictDecoderImpl;
+template <>
+void DictDecoderImpl<LargeByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+  auto binary_builder = checked_cast<::arrow::LargeBinaryDictionary32Builder*>(builder);

-  int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
-                  int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionary32Builder* builder) override {
-    int result = 0;
-    if (null_count == 0) {
-      PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
-    } else {
-      PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
-                                       valid_bits_offset, builder, &result));
-    }
-    return result;
-  }
+  // Make a BinaryArray referencing the internal dictionary data

Review Comment:
```suggestion
  // Make a LargeBinaryArray referencing the internal dictionary data
```


##########
cpp/src/parquet/encoding.h:
##########
@@ -24,6 +24,7 @@

 #include "arrow/util/spaced.h"

+#include "arrow/type.h"

Review Comment:
   Please check my comment in encoding.cc. I am not sure whether that can help remove this header as well as the `memory_limit` defined below.


##########
cpp/src/parquet/arrow/reader_internal.h:
##########
@@ -109,6 +109,7 @@ struct ReaderContext {
   FileColumnIteratorFactory iterator_factory;
   bool filter_leaves;
   std::shared_ptr<std::unordered_set<int>> included_leaves;
+  bool use_binary_large_variants = false;

Review Comment:
   Is `use_large_binary_variants` more precise? They are called LargeBinary and LargeString.


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -3862,6 +3868,18 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }

+TEST(TestArrowParquet, LargeByteArray) {

Review Comment:
   BTW, it would be good to add a test to verify that reading into StringArray and LargeStringArray yields the same string values in a non-overflow case.
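For the equivalence test suggested just above, one possible shape of the comparison step. `AssertSameStringValues` is a hypothetical helper, not part of the PR; it assumes only the stock `::arrow::StringArray` / `::arrow::LargeStringArray` accessors and GoogleTest:

```cpp
#include <cstdint>

#include <gtest/gtest.h>

#include "arrow/array.h"

// Hypothetical helper: check that a StringArray and a LargeStringArray read
// from the same column hold identical values (the non-overflow case).
void AssertSameStringValues(const ::arrow::StringArray& small,
                            const ::arrow::LargeStringArray& large) {
  ASSERT_EQ(small.length(), large.length());
  for (int64_t i = 0; i < small.length(); ++i) {
    ASSERT_EQ(small.IsNull(i), large.IsNull(i));
    if (!small.IsNull(i)) {
      // GetString() copies the i-th value into a std::string for both types.
      ASSERT_EQ(small.GetString(i), large.GetString(i));
    }
  }
}
```

The surrounding test would read the same file twice, once with default properties and once with `set_use_binary_large_variants(true)`, then feed both results to the helper.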
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not
+// available
+//template <typename T>
+//concept ByteArrayTypeConcept = std::is_same<T, ByteArrayType>::value ||
+//                               std::is_same<T, LargeByteArrayType>::value;
+
+template<typename T>
+struct IsByteArrayType : std::false_type {};
+
+template<>
+struct IsByteArrayType<ByteArrayType> : std::true_type {};
+
+template<>
+struct IsByteArrayType<LargeByteArrayType> : std::true_type {};
+
+template<typename BAT>
+struct ByteArrayBuilderTypeTrait {
+  using BuilderType = typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
+                                                ::arrow::LargeBinaryBuilder,
+                                                ::arrow::BinaryBuilder>::type;
+};
+
+template<typename BAT>
+class ChunkedRecordReader : public TypedRecordReader<BAT>,

Review Comment:
   What about naming it `ByteArrayChunkedRecordReaderImpl`?


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2132,15 +2161,24 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
  private:
   // Helper data structure for accumulating builder chunks
-  typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+  typename EncodingTraits<BAT>::Accumulator accumulator_;
 };

-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
-                                        virtual public DictionaryRecordReader {
+using ByteArrayChunkedRecordReader = ChunkedRecordReader<ByteArrayType>;
+using LargeByteArrayChunkedRecordReader = ChunkedRecordReader<LargeByteArrayType>;
+

Review Comment:
   Remove this blank line.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2093,15 +2093,44 @@ class FLBARecordReader : public TypedRecordReader<FLBAType>,
   std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
 };

-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
-                                     virtual public BinaryRecordReader {
+// Below concept could be used to simplify type assertion, but it seems like c++20 is not
+// available
+//template <typename T>
+//concept ByteArrayTypeConcept = std::is_same<T, ByteArrayType>::value ||
+//                               std::is_same<T, LargeByteArrayType>::value;
+
+template<typename T>
+struct IsByteArrayType : std::false_type {};
+
+template<>
+struct IsByteArrayType<ByteArrayType> : std::true_type {};
+
+template<>
+struct IsByteArrayType<LargeByteArrayType> : std::true_type {};
+
+template<typename BAT>
+struct ByteArrayBuilderTypeTrait {
+  using BuilderType = typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
+                                                ::arrow::LargeBinaryBuilder,
+                                                ::arrow::BinaryBuilder>::type;
+};

Review Comment:
```suggestion
template <typename T>
struct IsByteArrayType : std::false_type {};

template <>
struct IsByteArrayType<ByteArrayType> : std::true_type {};

template <>
struct IsByteArrayType<LargeByteArrayType> : std::true_type {};

template <typename BAT>
struct ByteArrayBuilderTypeTrait {
  using BuilderType =
      typename std::conditional<std::is_same<BAT, LargeByteArrayType>::value,
                                ::arrow::LargeBinaryBuilder, ::arrow::BinaryBuilder>::type;
};
```


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1238,12 +1238,13 @@ int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
   return max_values;
 }

-struct ArrowBinaryHelper {
-  explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
+template <typename BAT>
+struct ArrowBinaryHelperBase {
+  explicit ArrowBinaryHelperBase(typename EncodingTraits<BAT>::Accumulator* out) {
     this->out = out;
     this->builder = out->builder.get();
     this->chunk_space_remaining =
-        ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
+        EncodingTraits<BAT>::memory_limit - this->builder->value_data_length();

Review Comment:
   If `memory_limit` is only used here, can't we use `BAT` to check whether it is the large variant? Then we can avoid adding `memory_limit` to EncodingTraits.


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1677,6 +1702,35 @@ void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
   bytes_offsets[dictionary_length_] = offset;
 }

+template <>

Review Comment:
   These lines duplicate the ones above. Could you please consolidate them?


##########
cpp/src/parquet/types.h:
##########
@@ -761,6 +761,17 @@
 using Int96Type = PhysicalType<Type::INT96>;
 using FloatType = PhysicalType<Type::FLOAT>;
 using DoubleType = PhysicalType<Type::DOUBLE>;
 using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+
+/*
+ * Parquet does not have a LARGE_BYTE_ARRAY_TYPE, but arrow does.

Review Comment:
   This is not precise. We may say
   > Parquet uses ByteArrayType for variable length strings and binaries and their lengths will not exceed 2^31 - 1. However, arrow supports StringType/BinaryType and their large variants (i.e. LargeStringType and LargeBinaryType).


##########
cpp/src/parquet/encoding.h:
##########
@@ -140,15 +143,37 @@ template <>
 struct EncodingTraits<ByteArrayType> {
   using Encoder = ByteArrayEncoder;
   using Decoder = ByteArrayDecoder;
+  using BinaryBuilder = ::arrow::BinaryBuilder;

Review Comment:
   Is this actually required? Can we use decltype to get it from Accumulator?
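Both ideas above (deriving the chunk limit from `BAT` instead of adding `memory_limit` to EncodingTraits, and recovering the builder type from the Accumulator via `decltype`) are expressible in C++17. A sketch under assumptions: it presumes the PR's `LargeByteArrayType` and `EncodingTraits` are in scope, and the `int64_t` bound for the large variant is an illustrative guess, not necessarily the PR's final choice:

```cpp
#include <cstdint>
#include <limits>
#include <type_traits>
#include <utility>

// Sketch 1: choose the chunk limit from BAT directly, so EncodingTraits does
// not need a memory_limit member. kBinaryMemoryLimit is the existing arrow
// constant seen in the diff; the int64_t bound for the large variant is an
// assumption.
template <typename BAT>
constexpr int64_t BinaryMemoryLimit() {
  if constexpr (std::is_same_v<BAT, LargeByteArrayType>) {
    return std::numeric_limits<int64_t>::max();
  } else {
    return ::arrow::kBinaryMemoryLimit;
  }
}

// Sketch 2: recover the builder type from the Accumulator via decltype,
// assuming Accumulator::builder is a std::unique_ptr<...Builder> as the
// constructor in the diff implies.
template <typename BAT>
using AccumulatorBuilderType =
    typename decltype(std::declval<typename EncodingTraits<BAT>::Accumulator>()
                          .builder)::element_type;
```

If both sketches hold up, neither the `memory_limit` member nor the `BinaryBuilder` alias would need to be added to `EncodingTraits<ByteArrayType>`.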
##########
cpp/src/arrow/array/builder_dict.h:
##########
@@ -724,6 +724,7 @@
 using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
 using StringDictionaryBuilder = DictionaryBuilder<StringType>;
 using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
 using StringDictionary32Builder = Dictionary32Builder<StringType>;
+using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;

Review Comment:
   SGTM
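As a rough illustration of what the new alias provides (32-bit dictionary indices over LargeBinary values), a hypothetical usage sketch; `BuildExample` is not part of the PR and only assumes the standard builder API:

```cpp
#include <memory>

#include "arrow/array/builder_dict.h"
#include "arrow/status.h"

// Hypothetical example: dictionary-encode a few values with the new alias.
// Repeated values are memoized, so "foo" is stored once in the dictionary.
::arrow::Status BuildExample(std::shared_ptr<::arrow::Array>* out) {
  ::arrow::LargeBinaryDictionary32Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  ARROW_RETURN_NOT_OK(builder.Append("bar"));
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  // The result is a DictionaryArray: int32 indices over large_binary values.
  return builder.Finish(out);
}
```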