pitrou commented on code in PR #35886:
URL: https://github.com/apache/arrow/pull/35886#discussion_r1219302368
##########
cpp/src/arrow/util/compression_brotli.cc:
##########
@@ -221,6 +226,13 @@ class BrotliCodec : public Codec {
return ptr;
}
+ Status Init() override {
+ if (window_bits_ < BROTLI_MIN_WINDOW_BITS || window_bits_ >
BROTLI_MAX_WINDOW_BITS) {
+ return Status::Invalid("window_bits should be within 10 ~ 24");
Review Comment:
Let's make the error message respect the macro values:
```suggestion
return Status::Invalid("window_bits should be between ",
BROTLI_MIN_WINDOW_BITS,
" and ", BROTLI_MAX_WINDOW_BITS);
```
##########
cpp/src/arrow/util/compression_lz4.cc:
##########
@@ -17,13 +17,13 @@
#include "arrow/util/compression_internal.h"
-#include <cstdint>
-#include <cstring>
-#include <memory>
-
#include <lz4.h>
#include <lz4frame.h>
#include <lz4hc.h>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <memory>
Review Comment:
Those changes shouldn't be necessary? (also, why `iostream`?)
##########
cpp/src/arrow/util/compression_zlib.cc:
##########
@@ -46,6 +45,9 @@ namespace {
// Maximum window size
constexpr int WINDOW_BITS = 15;
+// Minimum window size
+constexpr int kGZipMinWindowBits = 9;
Review Comment:
Can you also rename the above to `kGZipMaxWindowBits`?
##########
cpp/src/arrow/util/compression.h:
##########
@@ -124,7 +168,15 @@ class ARROW_EXPORT Codec {
/// \brief Create a codec for the given compression algorithm
static Result<std::unique_ptr<Codec>> Create(
- Compression::type codec, int compression_level =
kUseDefaultCompressionLevel);
+ Compression::type codec,
+ const std::shared_ptr<CodecOptions>& codec_options =
+ std::make_shared<CodecOptions>(kUseDefaultCompressionLevel));
+
+ /// \brief Create a codec for the given compression algorithm
+ /// \deprecated and left for backwards compatibility.
+ ARROW_DEPRECATED("Use CodecOptions to create Codec")
Review Comment:
I don't think this should be deprecated, since it's a perfectly good API for
common usage (most people will never change the windows bits or any other
advanced compression parameters).
##########
cpp/src/arrow/flight/flight_benchmark.cc:
##########
@@ -432,7 +432,9 @@ int main(int argc, char** argv) {
const int level = level_str.empty() ?
arrow::util::kUseDefaultCompressionLevel
: std::stoi(level_str);
const auto type =
arrow::util::Codec::GetCompressionType(name).ValueOrDie();
- auto codec = arrow::util::Codec::Create(type, level).ValueOrDie();
+ auto codec = arrow::util::Codec::Create(
+ type, std::make_shared<arrow::util::CodecOptions>(level))
+ .ValueOrDie();
Review Comment:
These changes (and other similar ones) can be reverted if we remove the
deprecation.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,50 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+constexpr int kGZipDefaultWindowBits = 15;
+
+class GZipCodecOptions : public CodecOptions {
+ public:
+ ~GZipCodecOptions() = default;
Review Comment:
I don't think you need to explicitly define the destructor?
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,50 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+constexpr int kGZipDefaultWindowBits = 15;
+
+class GZipCodecOptions : public CodecOptions {
+ public:
+ ~GZipCodecOptions() = default;
+
+ GZipFormat::type gzip_format = GZipFormat::GZIP;
+ int window_bits = kGZipDefaultWindowBits;
+};
+
+// ----------------------------------------------------------------------
+// brotli codec options implementation
+
+constexpr int kBrotliDefaultWindowBits = 22;
+
+class BrotliCodecOptions : public CodecOptions {
Review Comment:
Same here?
```suggestion
class ARROW_EXPORT BrotliCodecOptions : public CodecOptions {
```
##########
cpp/src/parquet/column_writer.h:
##########
@@ -21,6 +21,7 @@
#include <cstring>
#include <memory>
+#include "arrow/util/compression.h"
Review Comment:
```suggestion
#include "arrow/type_fwd.h"
```
##########
cpp/src/arrow/util/compression_zlib.cc:
##########
@@ -55,8 +57,8 @@ constexpr int DETECT_CODEC = 32;
constexpr int kGZipMinCompressionLevel = 1;
constexpr int kGZipMaxCompressionLevel = 9;
-int CompressionWindowBitsForFormat(GZipFormat::type format) {
- int window_bits = WINDOW_BITS;
+int CompressionWindowBitsForFormat(GZipFormat::type format, int
input_window_bits) {
+ int window_bits = input_window_bits;
Review Comment:
This is a bit convoluted
```suggestion
int CompressionWindowBitsForFormat(GZipFormat::type format, int window_bits)
{
```
##########
cpp/src/arrow/util/compression.cc:
##########
@@ -135,6 +135,92 @@ Result<int>
Codec::DefaultCompressionLevel(Compression::type codec_type) {
return codec->default_compression_level();
}
+Result<std::unique_ptr<Codec>> Codec::Create(
+ Compression::type codec_type, const std::shared_ptr<CodecOptions>&
codec_options) {
+ if (!IsAvailable(codec_type)) {
+ if (codec_type == Compression::LZO) {
+ return Status::NotImplemented("LZO codec not implemented");
+ }
+
+ auto name = GetCodecAsString(codec_type);
+ if (name == "unknown") {
+ return Status::Invalid("Unrecognized codec");
+ }
+
+ return Status::NotImplemented("Support for codec '",
GetCodecAsString(codec_type),
+ "' not built");
+ }
+
+ auto compression_level = codec_options->compression_level_;
+ if (compression_level != kUseDefaultCompressionLevel &&
+ !SupportsCompressionLevel(codec_type)) {
+ return Status::Invalid("Codec '", GetCodecAsString(codec_type),
+ "' doesn't support setting a compression level.");
+ }
+
+ std::unique_ptr<Codec> codec;
+ switch (codec_type) {
+ case Compression::UNCOMPRESSED:
+ return nullptr;
+ case Compression::SNAPPY:
+#ifdef ARROW_WITH_SNAPPY
+ codec = internal::MakeSnappyCodec();
+#endif
+ break;
+ case Compression::GZIP: {
+#ifdef ARROW_WITH_ZLIB
+ std::shared_ptr<GZipCodecOptions> opt =
+ std::dynamic_pointer_cast<GZipCodecOptions>(codec_options);
+ codec = internal::MakeGZipCodec(compression_level,
+ opt ? opt->gzip_format :
GZipFormat::GZIP,
+ opt ? opt->window_bits :
kGZipDefaultWindowBits);
+#endif
+ break;
+ }
+ case Compression::BROTLI: {
+#ifdef ARROW_WITH_BROTLI
+ std::shared_ptr<BrotliCodecOptions> opt =
+ std::dynamic_pointer_cast<BrotliCodecOptions>(codec_options);
+ codec = internal::MakeBrotliCodec(
+ compression_level, opt ? opt->window_bits :
kBrotliDefaultWindowBits);
+#endif
+ break;
+ }
+ case Compression::LZ4:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4RawCodec(compression_level);
+#endif
+ break;
+ case Compression::LZ4_FRAME:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4FrameCodec(compression_level);
+#endif
+ break;
+ case Compression::LZ4_HADOOP:
+#ifdef ARROW_WITH_LZ4
+ codec = internal::MakeLz4HadoopRawCodec();
+#endif
+ break;
+ case Compression::ZSTD:
+#ifdef ARROW_WITH_ZSTD
+ codec = internal::MakeZSTDCodec(compression_level);
+#endif
+ break;
+ case Compression::BZ2:
+#ifdef ARROW_WITH_BZ2
+ codec = internal::MakeBZ2Codec(compression_level);
+#endif
+ break;
+ default:
+ break;
+ }
+
+ DCHECK_NE(codec, nullptr);
+ RETURN_NOT_OK(codec->Init());
+ return std::move(codec);
+}
+
+// Deprecated and use CodecOptions to create Codec instead
Result<std::unique_ptr<Codec>> Codec::Create(Compression::type codec_type,
Review Comment:
There's a lot of repetition below. Can you please implement this overload in
terms of the one above?
##########
cpp/src/parquet/types.h:
##########
@@ -487,6 +487,12 @@ bool IsCodecSupported(Compression::type codec);
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec,
+ const std::shared_ptr<CodecOptions>&
codec_options);
Review Comment:
`const CodecOptions&`
##########
cpp/src/parquet/types.h:
##########
@@ -487,6 +487,12 @@ bool IsCodecSupported(Compression::type codec);
PARQUET_EXPORT
std::unique_ptr<Codec> GetCodec(Compression::type codec);
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec,
+ const std::shared_ptr<CodecOptions>&
codec_options);
+
+/// \deprecated and left for backwards compatibility.
+ARROW_DEPRECATED("Use CodecOptions to create Codec")
Review Comment:
Again, I'm not sure it's useful to deprecate this one.
##########
cpp/src/parquet/column_writer.h:
##########
@@ -87,8 +88,9 @@ class PARQUET_EXPORT PageWriter {
static std::unique_ptr<PageWriter> Open(
std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
Review Comment:
Let's not remove this API but create a new one taking `const CodecOptions&`,
to avoid breaking API compatibility.
##########
r/src/compression.cpp:
##########
@@ -23,7 +23,8 @@
// [[arrow::export]]
std::shared_ptr<arrow::util::Codec>
util___Codec__Create(arrow::Compression::type codec,
R_xlen_t
compression_level) {
- return ValueOrStop(arrow::util::Codec::Create(codec, compression_level));
+ return ValueOrStop(arrow::util::Codec::Create(
+ codec, std::make_shared<CodecOptions>(CodecOptions(compression_level))));
Review Comment:
Can probably revert this.
##########
cpp/src/arrow/util/compression_zlib.cc:
##########
@@ -461,6 +468,9 @@ class GZipCodec : public Codec {
}
Status Init() override {
+ if (window_bits_ < kGZipMinWindowBits || window_bits_ > WINDOW_BITS) {
+ return Status::Invalid("window_bits should be within 9 ~ 15");
Review Comment:
Here as well, please incorporate the macro values in the error message.
##########
cpp/src/arrow/util/compression_zlib.cc:
##########
@@ -17,15 +17,14 @@
#include "arrow/util/compression_internal.h"
+#include <zconf.h>
+#include <zlib.h>
Review Comment:
Nit, but can you undo the include ordering change? It's a bit gratuitous.
Also, it makes things more readable to separate stdlib imports from third-party
library imports.
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,50 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
+
+// ----------------------------------------------------------------------
+// gzip codec options implementation
+
+struct GZipFormat {
+ enum type {
+ ZLIB,
+ DEFLATE,
+ GZIP,
+ };
+};
+
+constexpr int kGZipDefaultWindowBits = 15;
+
+class GZipCodecOptions : public CodecOptions {
Review Comment:
Probably need to ARROW_EXPORT this again for Windows.
```suggestion
class ARROW_EXPORT GZipCodecOptions : public CodecOptions {
```
##########
cpp/src/arrow/util/compression.h:
##########
@@ -107,6 +107,50 @@ class ARROW_EXPORT Decompressor {
// XXX add methods for buffer size heuristics?
};
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+ CodecOptions(int compression_level = kUseDefaultCompressionLevel) {
+ compression_level_ = compression_level;
+ }
+ virtual ~CodecOptions() = default;
+
+ int compression_level_;
+};
Review Comment:
Or simply:
```c++
class ARROW_EXPORT CodecOptions {
public:
virtual ~CodecOptions() = default;
int compression_level_ = kUseDefaultCompressionLevel;
};
```
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -249,9 +249,9 @@ int LevelEncoder::Encode(int batch_size, const int16_t*
levels) {
class SerializedPageWriter : public PageWriter {
public:
SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink,
Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder*
metadata,
- int16_t row_group_ordinal, int16_t column_chunk_ordinal,
- bool use_page_checksum_verification,
+ const std::shared_ptr<CodecOptions>& codec_options,
Review Comment:
No need for `shared_ptr` here.
```suggestion
const CodecOptions& codec_options,
```
##########
cpp/src/arrow/util/compression.h:
##########
@@ -124,7 +168,15 @@ class ARROW_EXPORT Codec {
/// \brief Create a codec for the given compression algorithm
static Result<std::unique_ptr<Codec>> Create(
- Compression::type codec, int compression_level =
kUseDefaultCompressionLevel);
+ Compression::type codec,
+ const std::shared_ptr<CodecOptions>& codec_options =
+ std::make_shared<CodecOptions>(kUseDefaultCompressionLevel));
Review Comment:
It's not necessary to mandate `shared_ptr` here. Codecs won't take
ownership of the pointer, so you can pass a const reference instead:
```c++
static Result<std::unique_ptr<Codec>> Create(
Compression::type codec, const CodecOptions& codec_options = {});
```
##########
cpp/src/parquet/types.cc:
##########
@@ -51,9 +51,30 @@ bool IsCodecSupported(Compression::type codec) {
}
std::unique_ptr<Codec> GetCodec(Compression::type codec) {
- return GetCodec(codec, Codec::UseDefaultCompressionLevel());
+ return GetCodec(codec, std::make_shared<CodecOptions>());
}
+std::unique_ptr<Codec> GetCodec(Compression::type codec,
+ const std::shared_ptr<CodecOptions>&
codec_options) {
+ std::unique_ptr<Codec> result;
+ if (codec == Compression::LZO) {
+ throw ParquetException(
+ "While LZO compression is supported by the Parquet format in "
+ "general, it is currently not supported by the C++ implementation.");
+ }
+
+ if (!IsCodecSupported(codec)) {
+ std::stringstream ss;
+ ss << "Codec type " << Codec::GetCodecAsString(codec)
+ << " not supported in Parquet format";
+ throw ParquetException(ss.str());
+ }
+
+ PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, codec_options));
+ return result;
+}
+
+// Deprecated and use CodecOptions to create Codec instead
std::unique_ptr<Codec> GetCodec(Compression::type codec, int
compression_level) {
Review Comment:
Here as well, can you implement this overload in terms of the one above?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]