wgtmac commented on code in PR #35886:
URL: https://github.com/apache/arrow/pull/35886#discussion_r1231802272
##########
cpp/src/parquet/column_writer.h:
##########
@@ -87,8 +88,9 @@ class PARQUET_EXPORT PageWriter {
static std::unique_ptr<PageWriter> Open(
std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+ const std::shared_ptr<CodecOptions>& codec_options,
Review Comment:
In addition to the comments above, should be consolidate `Compression::type
codec` and `const std::shared_ptr<CodecOptions>& codec_options`? codec_options
should know codec.
##########
cpp/src/parquet/column_writer.h:
##########
@@ -21,6 +21,7 @@
#include <cstring>
#include <memory>
+#include "arrow/type_fwd.h"
Review Comment:
Is this required? It should be included by `parquet/platform.h`.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,50 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
Review Comment:
That sounds reasonable for now. But it would be harder to use once we add
more parameters.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,46 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+constexpr int kGZipDefaultWindowBits = 15;
+
+class ARROW_EXPORT GZipCodecOptions : public CodecOptions {
+ public:
+ GZipFormat::type gzip_format = GZipFormat::GZIP;
+ int window_bits = kGZipDefaultWindowBits;
+};
+
+// ----------------------------------------------------------------------
+// brotli codec options implementation
+
+constexpr int kBrotliDefaultWindowBits = 22;
Review Comment:
Same with my previous comment. We can use `BROTLI_DEFAULT_WINDOW` in the
`.cc` file
##########
cpp/src/parquet/column_writer.h:
##########
@@ -87,8 +88,9 @@ class PARQUET_EXPORT PageWriter {
static std::unique_ptr<PageWriter> Open(
std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
Review Comment:
Should we mark the old one as deprecated? The new one should always be
preferable and we don't want to maintain different overloads in the long term.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,46 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+constexpr int kGZipDefaultWindowBits = 15;
+
+class ARROW_EXPORT GZipCodecOptions : public CodecOptions {
+ public:
+ GZipFormat::type gzip_format = GZipFormat::GZIP;
+ int window_bits = kGZipDefaultWindowBits;
Review Comment:
This could be easily out of sync with the upstream. What about using
`std::optional<int>` and always use the library default value if it is unset?
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,46 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
Review Comment:
```suggestion
// GZip codec options implementation
```
##########
cpp/src/parquet/properties.h:
##########
@@ -192,6 +197,8 @@ class PARQUET_EXPORT ColumnProperties {
int compression_level() const { return compression_level_; }
Review Comment:
+1 on this. `compression_level_` is duplicated.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,46 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
Review Comment:
It was an internal class in the past. As it is moved to a public header now,
changing it to enum class would be preferable.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]