pitrou commented on code in PR #35886:
URL: https://github.com/apache/arrow/pull/35886#discussion_r1229753260


##########
cpp/examples/arrow/parquet_read_write.cc:
##########
@@ -165,11 +165,44 @@ arrow::Status WriteInBatches(std::string path_to_file) {
   return arrow::Status::OK();
 }
 
+arrow::Status WriteWithCodecOptions(std::string path_to_file) {
+  using parquet::ArrowWriterProperties;
+  using parquet::WriterProperties;
+
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, GetTable());
+
+  // Customize codec options with compression level and window bits
+  // the default window bits is 15, here we set a small number of 12 for some 
use scenario
+  // with limited hisotry buffer size
+  auto codec_options = std::make_shared<arrow::util::GZipCodecOptions>();
+  codec_options->compression_level_ = 9;
+  codec_options->window_bits = 12;
+
+  // Choose compression
+  std::shared_ptr<WriterProperties> props = WriterProperties::Builder()
+                                                
.compression(arrow::Compression::GZIP)
+                                                ->codec_options(codec_options)
+                                                ->build();
+
+  // Opt to store Arrow schema for easier reads back into Arrow
+  std::shared_ptr<ArrowWriterProperties> arrow_props =
+      ArrowWriterProperties::Builder().store_schema()->build();
+
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  ARROW_ASSIGN_OR_RAISE(outfile, 
arrow::io::FileOutputStream::Open(path_to_file));
+
+  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(),
+                                                 arrow::default_memory_pool(), 
outfile,
+                                                 /*chunk_size=*/3, props, 
arrow_props));
+  return arrow::Status::OK();
+}
+
 arrow::Status RunExamples(std::string path_to_file) {
   ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file));
   ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file));
   ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file));
   ARROW_RETURN_NOT_OK(ReadInBatches(path_to_file));
+  ARROW_RETURN_NOT_OK(WriteWithCodecOptions(path_to_file));

Review Comment:
   I don't think it is useful to add this example. We're not showcasing other 
options in this file, either.



##########
cpp/src/arrow/util/compression_brotli.cc:
##########
@@ -221,6 +226,14 @@ class BrotliCodec : public Codec {
     return ptr;
   }
 
+  Status Init() override {
+    if (window_bits_ < BROTLI_MIN_WINDOW_BITS || window_bits_ > 
BROTLI_MAX_WINDOW_BITS) {
+      return Status::Invalid("window_bits should be between ", 
BROTLI_MIN_WINDOW_BITS,

Review Comment:
   Nit: make the context more explicit in the error message?
   ```suggestion
         return Status::Invalid("Brotli window_bits should be between ", 
BROTLI_MIN_WINDOW_BITS,
   ```



##########
cpp/src/parquet/column_writer_test.cc:
##########
@@ -118,8 +118,9 @@ class TestPrimitiveWriter : public 
PrimitiveTypedTest<TestType> {
 
     metadata_ = ColumnChunkMetaDataBuilder::Make(writer_properties_, 
this->descr_);
     std::unique_ptr<PageWriter> pager = PageWriter::Open(
-        sink_, column_properties.compression(), 
Codec::UseDefaultCompressionLevel(),
-        metadata_.get(), /* row_group_ordinal */ -1, /* column_chunk_ordinal*/ 
-1,
+        sink_, column_properties.compression(), 
std::make_shared<CodecOptions>(),

Review Comment:
   This change isn't necessary AFAICT.



##########
cpp/src/parquet/column_writer_test.cc:
##########
@@ -162,6 +163,25 @@ class TestPrimitiveWriter : public 
PrimitiveTypedTest<TestType> {
     ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, 
enable_checksum));
   }
 
+  void TestRequiredWithCodecOptions(
+      Encoding::type encoding, Compression::type compression, bool 
enable_dictionary,
+      bool enable_statistics, int64_t num_rows = SMALL_SIZE,
+      int compression_level = Codec::UseDefaultCompressionLevel(),
+      ::arrow::util::GZipFormat::type format = ::arrow::util::GZipFormat::GZIP,
+      int window_bits = ::arrow::util::kGZipDefaultWindowBits,
+      bool enable_checksum = false) {

Review Comment:
   Why not simply take a `const CodecOptions&` here?



##########
cpp/src/parquet/properties.h:
##########
@@ -154,6 +154,7 @@ class PARQUET_EXPORT ColumnProperties {
         statistics_enabled_(statistics_enabled),
         max_stats_size_(max_stats_size),
         compression_level_(Codec::UseDefaultCompressionLevel()),
+        codec_options_(std::make_shared<CodecOptions>()),

Review Comment:
   Can we just leave this null-initialized by default?



##########
cpp/src/parquet/column_writer_test.cc:
##########
@@ -522,6 +560,11 @@ TYPED_TEST(TestPrimitiveWriter, 
RequiredPlainWithStatsAndGzipCompression) {
   this->TestRequiredWithSettings(Encoding::PLAIN, Compression::GZIP, false, 
true,
                                  LARGE_SIZE);
 }
+
+TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCodecOptions) {
+  this->TestRequiredWithCodecOptions(Encoding::PLAIN, Compression::GZIP, 
false, false,
+                                     LARGE_SIZE, 10, 
::arrow::util::GZipFormat::GZIP, 12);

Review Comment:
   For readability, can you please pass parameter names explicitly?
   ```suggestion
     this->TestRequiredWithCodecOptions(Encoding::PLAIN, Compression::GZIP, 
false, false,
                                        LARGE_SIZE, /*xxx=*/ 10, /*xxx=*/ 
::arrow::util::GZipFormat::GZIP, /*xxx=*/ 12);
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -431,6 +445,49 @@ class PARQUET_EXPORT WriterProperties {
       return this->compression_level(path->ToDotString(), compression_level);
     }
 
+    /// \brief Specify the default codec options for the compressor in
+    /// every column. Previously only compression level is supported to be 
customized,
+    /// with CodecOptions, more specific properties could be set by users.
+    ///
+    /// For compression level, it could be set by 
codec_options.compression_level. In
+    /// case a column does not have an explicitly specified compression level, 
the default
+    /// one would be used.
+    ///
+    /// The provided compression level is compressor specific. The user would
+    /// have to familiarize oneself with the available levels for the selected
+    /// compressor.  If the compressor does not allow for selecting different
+    /// compression levels, calling this function would not have any effect.
+    /// Parquet and Arrow do not validate the passed compression level.  If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+    ///
+    /// For GZip Codec, users could set window_bits and format with 
GZipCodecOptions.
+    /// For other Codecs, CodecOptions could be used to set compression_level. 
More
+    /// specific codec options to be added.

Review Comment:
   This section does not seem particularly informative either.



##########
cpp/src/parquet/properties.h:
##########
@@ -618,6 +677,7 @@ class PARQUET_EXPORT WriterProperties {
     std::unordered_map<std::string, Encoding::type> encodings_;
     std::unordered_map<std::string, Compression::type> codecs_;
     std::unordered_map<std::string, int32_t> codecs_compression_level_;

Review Comment:
   Let's remove this and only keep `codec_options_`?



##########
cpp/src/parquet/properties.h:
##########
@@ -431,6 +445,49 @@ class PARQUET_EXPORT WriterProperties {
       return this->compression_level(path->ToDotString(), compression_level);
     }

Review Comment:
   Can you change the docstrings for `compression_level()` to also mention the 
existence of `codec_options()` as an alternative?
   For example:
   > If other compressor-specific options need to be set in addition to the 
compression level, use the `codec_options` method.



##########
cpp/src/parquet/properties_test.cc:
##########
@@ -70,6 +70,37 @@ TEST(TestWriterProperties, AdvancedHandling) {
   ASSERT_EQ(ParquetDataPageVersion::V2, props->data_page_version());
 }
 
+TEST(TestWriterProperties, SetCodecOptions) {
+  WriterProperties::Builder builder;
+  builder.compression("gzip", Compression::GZIP);
+  builder.compression("zstd", Compression::ZSTD);
+  builder.compression("brotli", Compression::BROTLI);
+  auto gzip_codec_options = 
std::make_shared<::arrow::util::GZipCodecOptions>();
+  gzip_codec_options->compression_level_ = 9;
+  gzip_codec_options->window_bits = 12;
+  builder.codec_options("gzip", gzip_codec_options);
+  auto codec_options = std::make_shared<CodecOptions>();
+  builder.codec_options(codec_options);
+  auto brotli_codec_options = 
std::make_shared<::arrow::util::BrotliCodecOptions>();
+  brotli_codec_options->compression_level_ = 11;
+  brotli_codec_options->window_bits = 20;
+  builder.codec_options("brotli", brotli_codec_options);
+  std::shared_ptr<WriterProperties> props = builder.build();
+
+  ASSERT_EQ(9,
+            
props->codec_options(ColumnPath::FromDotString("gzip"))->compression_level_);
+  ASSERT_EQ(12, std::dynamic_pointer_cast<::arrow::util::GZipCodecOptions>(
+                    props->codec_options(ColumnPath::FromDotString("gzip")))
+                    ->window_bits);
+  ASSERT_EQ(Codec::UseDefaultCompressionLevel(),
+            
props->codec_options(ColumnPath::FromDotString("zstd"))->compression_level_);

Review Comment:
   This is not very interesting as the default compression level would be used 
in any case. Can you pass a non-default value above?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -36,7 +36,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
+

Review Comment:
   Can you remove the spurious newline?



##########
cpp/src/arrow/util/compression_zlib.cc:
##########
@@ -461,6 +468,10 @@ class GZipCodec : public Codec {
   }
 
   Status Init() override {
+    if (window_bits_ < kGZipMinWindowBits || window_bits_ > 
kGZipMaxWindowBits) {
+      return Status::Invalid("window_bits should be between ", 
kGZipMinWindowBits,

Review Comment:
   Nit
   ```suggestion
         return Status::Invalid("GZip window_bits should be between ", 
kGZipMinWindowBits,
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -174,6 +175,16 @@ class PARQUET_EXPORT ColumnProperties {
 
   void set_compression_level(int compression_level) {
     compression_level_ = compression_level;

Review Comment:
   +1, I think we can just keep the codec options.



##########
cpp/src/parquet/properties.h:
##########
@@ -431,6 +445,49 @@ class PARQUET_EXPORT WriterProperties {
       return this->compression_level(path->ToDotString(), compression_level);
     }
 
+    /// \brief Specify the default codec options for the compressor in
+    /// every column. Previously only compression level is supported to be 
customized,
+    /// with CodecOptions, more specific properties could be set by users.
+    ///
+    /// For compression level, it could be set by 
codec_options.compression_level. In
+    /// case a column does not have an explicitly specified compression level, 
the default
+    /// one would be used.

Review Comment:
   This seems needlessly wordy.
   ```suggestion
       /// \brief Specify the default codec options for the compressor in
       /// every column.
       ///
       /// The codec options allow configuring the compression level as well
       /// as other codec-specific options.
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -431,6 +445,49 @@ class PARQUET_EXPORT WriterProperties {
       return this->compression_level(path->ToDotString(), compression_level);
     }
 
+    /// \brief Specify the default codec options for the compressor in
+    /// every column. Previously only compression level is supported to be 
customized,
+    /// with CodecOptions, more specific properties could be set by users.
+    ///
+    /// For compression level, it could be set by 
codec_options.compression_level. In
+    /// case a column does not have an explicitly specified compression level, 
the default
+    /// one would be used.
+    ///
+    /// The provided compression level is compressor specific. The user would
+    /// have to familiarize oneself with the available levels for the selected
+    /// compressor.  If the compressor does not allow for selecting different
+    /// compression levels, calling this function would not have any effect.
+    /// Parquet and Arrow do not validate the passed compression level.  If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.

Review Comment:
   Remove this section?



##########
cpp/src/parquet/column_writer.h:
##########
@@ -87,8 +88,9 @@ class PARQUET_EXPORT PageWriter {
 
   static std::unique_ptr<PageWriter> Open(
       std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
-      int compression_level, ColumnChunkMetaDataBuilder* metadata,
-      int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+      const std::shared_ptr<CodecOptions>& codec_options,

Review Comment:
   Also, unless the `PageWriter` needs to keep ownership of the codec options, 
this should simply be `const CodecOptions& = {}`.



##########
cpp/src/parquet/column_writer.h:
##########
@@ -87,8 +88,9 @@ class PARQUET_EXPORT PageWriter {
 
   static std::unique_ptr<PageWriter> Open(
       std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
-      int compression_level, ColumnChunkMetaDataBuilder* metadata,
-      int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+      const std::shared_ptr<CodecOptions>& codec_options,

Review Comment:
   +1



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to