Repository: parquet-cpp Updated Branches: refs/heads/master 339287aec -> 6787492ec
PARQUET-992: Do not transitively include zlib.h in public API Use PIMPL pattern to hide zlib from public API Author: Wes McKinney <[email protected]> Closes #333 from wesm/gzip-pimpl and squashes the following commits: adbca51 [Wes McKinney] Make the appveyor build matrix a little smaller 7d88204 [Wes McKinney] cpplint 39a85bd [Wes McKinney] Use override for virtuals 1064669 [Wes McKinney] Fix up GZIP pimpl 6de81df [Wes McKinney] WIP converting GZipCodec to PIMPL to hide zlib.h Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/6787492e Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/6787492e Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/6787492e Branch: refs/heads/master Commit: 6787492eccef3ba6ff891a6dad4f32ba09d268e7 Parents: 339287a Author: Wes McKinney <[email protected]> Authored: Wed May 17 14:45:18 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Wed May 17 14:45:18 2017 -0400 ---------------------------------------------------------------------- CMakeLists.txt | 11 +- appveyor.yml | 17 ++- ci/msvc-build.bat | 4 +- cmake_modules/FindBrotli.cmake | 41 +++--- src/parquet/arrow/schema.cc | 7 +- src/parquet/arrow/schema.h | 7 +- src/parquet/column/scanner.h | 5 +- src/parquet/compression.cc | 262 +++++++++++++++++++++--------------- src/parquet/compression.h | 55 +++----- src/parquet/public-api-test.cc | 6 + 10 files changed, 226 insertions(+), 189 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c0f3d7..3728a0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -346,6 +346,10 @@ enable_testing() # Dependencies ############################################################ +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/src +) + if (PARQUET_MINIMAL_DEPENDENCY) set(IGNORE_OPTIONAL_PACKAGES ON) message(STATUS "Build using minimal dependencies") @@ -438,13 +442,6 @@ include(GenerateExportHeader) add_compiler_export_flags() ############################################################ -# Includes - -include_directories( - ${CMAKE_CURRENT_SOURCE_DIR}/src -) - -############################################################ # "make lint" target ############################################################ if (UNIX) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/appveyor.yml ---------------------------------------------------------------------- diff --git a/appveyor.yml b/appveyor.yml index 53d198d..ba78a02 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -23,24 +23,27 @@ environment: - GENERATOR: NMake Makefiles PYTHON: "3.5" ARCH: "64" - + CONFIGURATION: "Release" + - GENERATOR: Visual Studio 14 2015 Win64 + PYTHON: "3.5" + ARCH: "64" + CONFIGURATION: "Debug" - GENERATOR: Visual Studio 14 2015 Win64 PYTHON: "3.5" ARCH: "64" + CONFIGURATION: "Release" + - GENERATOR: Visual Studio 14 2015 Win64 + PYTHON: "3.5" + ARCH: "64" + CONFIGURATION: "Toolchain" MSVC_DEFAULT_OPTIONS: ON BOOST_ROOT: C:\Libraries\boost_1_63_0 BOOST_LIBRARYDIR: C:\Libraries\boost_1_63_0\lib64-msvc-14.0 -configuration: - - Debug - - Release - - Toolchain - init: - set MINICONDA=C:\Miniconda35-x64 - set PATH=%MINICONDA%;%MINICONDA%/Scripts;%MINICONDA%/Library/bin;%PATH% - if "%GENERATOR%"=="NMake Makefiles" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x64 - - if "%CONFIGURATION%"=="Toolchain" conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge build_script: - call ci\msvc-build.bat http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/ci/msvc-build.bat ---------------------------------------------------------------------- diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index 6789b46..04743e6 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -23,6 +23,8 @@ cd build SET PARQUET_TEST_DATA=%APPVEYOR_BUILD_FOLDER%\data if "%CONFIGURATION%" == "Toolchain" ( + conda install -y boost-cpp=1.63 brotli=0.6.0 zlib=1.2.11 snappy=1.1.4 thrift-cpp=0.10.0 -c conda-forge + set PARQUET_BUILD_TOOLCHAIN=%MINICONDA%/Library cmake -G "%GENERATOR%" ^ @@ -48,4 +50,4 @@ if NOT "%CONFIGURATION%" == "Toolchain" ( if "%CONFIGURATION%" == "Release" ( ctest -VV || exit /B ) -) \ No newline at end of file +) http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/cmake_modules/FindBrotli.cmake ---------------------------------------------------------------------- diff --git a/cmake_modules/FindBrotli.cmake b/cmake_modules/FindBrotli.cmake index 669c3f3..0a991b4 100644 --- a/cmake_modules/FindBrotli.cmake +++ b/cmake_modules/FindBrotli.cmake @@ -38,24 +38,25 @@ elseif ( Brotli_HOME ) list( APPEND _brotli_roots ${Brotli_HOME} ) endif() -# Try the parameterized roots, if they exist -if ( _brotli_roots ) - find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h - PATHS ${_brotli_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "include" ) - find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc - PATHS ${_brotli_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) - find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec - PATHS ${_brotli_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) - find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon - PATHS ${_brotli_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) -else () - find_path( BROTLI_INCLUDE_DIR NAMES brotli.h ) - find_library( BROTLI_LIBRARIES NAMES brotlienc ) -endif () +find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h + PATHS ${_brotli_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + +find_library( BROTLI_LIBRARY_ENC NAMES libbrotlienc.a brotlienc + PATHS ${_brotli_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) + +find_library( BROTLI_LIBRARY_DEC NAMES libbrotlidec.a brotlidec + PATHS ${_brotli_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) + +find_library( BROTLI_LIBRARY_COMMON NAMES libbrotlicommon.a brotlicommon + PATHS ${_brotli_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" ) set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC} ${BROTLI_LIBRARY_COMMON}) @@ -63,8 +64,6 @@ set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC} if (BROTLI_INCLUDE_DIR AND (PARQUET_MINIMAL_DEPENDENCY OR BROTLI_LIBRARIES)) set(BROTLI_FOUND TRUE) get_filename_component( BROTLI_LIBS ${BROTLI_LIBRARY_ENC} PATH ) - set(BROTLI_HEADER_NAME brotli.h) - set(BROTLI_HEADER ${BROTLI_INCLUDE_DIR}/${BROTLI_HEADER_NAME}) set(BROTLI_LIB_NAME brotli) if (MSVC AND NOT BROTLI_MSVC_STATIC_LIB_SUFFIX) set(BROTLI_MSVC_STATIC_LIB_SUFFIX _static) @@ -84,7 +83,7 @@ endif () if (BROTLI_FOUND) if (NOT Brotli_FIND_QUIETLY) if (PARQUET_MINIMAL_DEPENDENCY) - message(STATUS "Found the Brotli header: ${BROTLI_HEADER}") + message(STATUS "Found the Brotli headers: ${BROTLI_INCLUDE_DIR}") else () message(STATUS "Found the Brotli library: ${BROTLI_LIBRARIES}") endif () http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc index 6aeff17..83968bc 100644 --- a/src/parquet/arrow/schema.cc +++ b/src/parquet/arrow/schema.cc @@ -371,13 +371,12 @@ Status FromParquetSchema(const SchemaDescriptor* parquet_schema, } Status FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::vector<int>& column_indices, - std::shared_ptr<::arrow::Schema>* out) { + const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out) { return FromParquetSchema(parquet_schema, column_indices, nullptr, out); } -Status FromParquetSchema(const SchemaDescriptor* parquet_schema, - std::shared_ptr<::arrow::Schema>* out) { +Status FromParquetSchema( + const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out) { return FromParquetSchema(parquet_schema, nullptr, out); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/arrow/schema.h ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/schema.h b/src/parquet/arrow/schema.h index 30dee20..69ca661 100644 --- a/src/parquet/arrow/schema.h +++ b/src/parquet/arrow/schema.h @@ -59,12 +59,11 @@ namespace arrow { // Without metadata ::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::vector<int>& column_indices, - std::shared_ptr<::arrow::Schema>* out); + const std::vector<int>& column_indices, std::shared_ptr<::arrow::Schema>* out); // Without metadata or indices -::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - std::shared_ptr<::arrow::Schema>* out); +::arrow::Status PARQUET_EXPORT FromParquetSchema( + const SchemaDescriptor* parquet_schema, std::shared_ptr<::arrow::Schema>* out); ::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, const WriterProperties& properties, schema::NodePtr* out); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/column/scanner.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h index 75b08b6..914f2ad 100644 --- a/src/parquet/column/scanner.h +++ b/src/parquet/column/scanner.h @@ -103,9 +103,8 @@ class PARQUET_EXPORT TypedScanner : public Scanner { bool NextLevels(int16_t* def_level, int16_t* rep_level) { if (level_offset_ == levels_buffered_) { - levels_buffered_ = typed_reader_->ReadBatch( - batch_size_, def_levels_.data(), rep_levels_.data(), - values_, &values_buffered_); + levels_buffered_ = typed_reader_->ReadBatch(batch_size_, def_levels_.data(), + rep_levels_.data(), values_, &values_buffered_); value_offset_ = 0; level_offset_ = 0; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.cc ---------------------------------------------------------------------- diff --git a/src/parquet/compression.cc b/src/parquet/compression.cc index c71f161..7d219fe 100644 --- a/src/parquet/compression.cc +++ b/src/parquet/compression.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "parquet/compression.h" + #include <cstdint> #include <memory> #include <string> @@ -22,13 +24,15 @@ #include <brotli/decode.h> #include <brotli/encode.h> #include <snappy.h> +#include <zlib.h> -#include "parquet/compression.h" #include "parquet/exception.h" #include "parquet/types.h" namespace parquet { +Codec::~Codec() {} + std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) { std::unique_ptr<Codec> result; switch (codec_type) { @@ -68,140 +72,188 @@ static constexpr int GZIP_CODEC = 16; // Determine if this is libz or gzip from header. static constexpr int DETECT_CODEC = 32; -GZipCodec::GZipCodec(Format format) - : format_(format), compressor_initialized_(false), decompressor_initialized_(false) {} - -GZipCodec::~GZipCodec() { - EndCompressor(); - EndDecompressor(); -} +class GZipCodec::GZipCodecImpl { + public: + explicit GZipCodecImpl(GZipCodec::Format format) + : format_(format), + compressor_initialized_(false), + decompressor_initialized_(false) {} -void GZipCodec::InitCompressor() { - EndDecompressor(); - memset(&stream_, 0, sizeof(stream_)); - - int ret; - // Initialize to run specified format - int window_bits = WINDOW_BITS; - if (format_ == DEFLATE) { - window_bits = -window_bits; - } else if (format_ == GZIP) { - window_bits += GZIP_CODEC; + ~GZipCodecImpl() { + EndCompressor(); + EndDecompressor(); } - if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9, - Z_DEFAULT_STRATEGY)) != Z_OK) { - throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg)); + + void InitCompressor() { + EndDecompressor(); + memset(&stream_, 0, sizeof(stream_)); + + int ret; + // Initialize to run specified format + int window_bits = WINDOW_BITS; + if (format_ == DEFLATE) { + window_bits = -window_bits; + } else if (format_ == GZIP) { + window_bits += GZIP_CODEC; + } + if ((ret = deflateInit2(&stream_, Z_DEFAULT_COMPRESSION, Z_DEFLATED, window_bits, 9, + Z_DEFAULT_STRATEGY)) != Z_OK) { + throw ParquetException("zlib deflateInit failed: " + std::string(stream_.msg)); + } + + compressor_initialized_ = true; } - compressor_initialized_ = true; -} + void EndCompressor() { + if (compressor_initialized_) { (void)deflateEnd(&stream_); } + compressor_initialized_ = false; + } -void GZipCodec::EndCompressor() { - if (compressor_initialized_) { (void)deflateEnd(&stream_); } - compressor_initialized_ = false; -} + void InitDecompressor() { + EndCompressor(); + memset(&stream_, 0, sizeof(stream_)); + int ret; -void GZipCodec::InitDecompressor() { - EndCompressor(); - memset(&stream_, 0, sizeof(stream_)); - int ret; + // Initialize to run either deflate or zlib/gzip format + int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC; + if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { + throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg)); + } + decompressor_initialized_ = true; + } - // Initialize to run either deflate or zlib/gzip format - int window_bits = format_ == DEFLATE ? -WINDOW_BITS : WINDOW_BITS | DETECT_CODEC; - if ((ret = inflateInit2(&stream_, window_bits)) != Z_OK) { - throw ParquetException("zlib inflateInit failed: " + std::string(stream_.msg)); + void EndDecompressor() { + if (decompressor_initialized_) { (void)inflateEnd(&stream_); } + decompressor_initialized_ = false; } - decompressor_initialized_ = true; -} -void GZipCodec::EndDecompressor() { - if (decompressor_initialized_) { (void)inflateEnd(&stream_); } - decompressor_initialized_ = false; -} + void Decompress(int64_t input_length, const uint8_t* input, int64_t output_length, + uint8_t* output) { + if (!decompressor_initialized_) { InitDecompressor(); } + if (output_length == 0) { + // The zlib library does not allow *output to be NULL, even when output_length + // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an + // error, so bail early if no output is expected. Note that we don't signal + // an error if the input actually contains compressed data. + return; + } -void GZipCodec::Decompress( - int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) { - if (!decompressor_initialized_) { InitDecompressor(); } - if (output_length == 0) { - // The zlib library does not allow *output to be NULL, even when output_length - // is 0 (inflate() will return Z_STREAM_ERROR). We don't consider this an - // error, so bail early if no output is expected. Note that we don't signal - // an error if the input actually contains compressed data. - return; + // Reset the stream for this block + if (inflateReset(&stream_) != Z_OK) { + throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg)); + } + + int ret = 0; + // gzip can run in streaming mode or non-streaming mode. We only + // support the non-streaming use case where we present it the entire + // compressed input and a buffer big enough to contain the entire + // compressed output. In the case where we don't know the output, + // we just make a bigger buffer and try the non-streaming mode + // from the beginning again. + while (ret != Z_STREAM_END) { + stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input)); + stream_.avail_in = input_length; + stream_.next_out = reinterpret_cast<Bytef*>(output); + stream_.avail_out = output_length; + + // We know the output size. In this case, we can use Z_FINISH + // which is more efficient. + ret = inflate(&stream_, Z_FINISH); + if (ret == Z_STREAM_END || ret != Z_OK) break; + + // Failure, buffer was too small + std::stringstream ss; + ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length + << " OutputLength=" << output_length; + throw ParquetException(ss.str()); + } + + // Failure for some other reason + if (ret != Z_STREAM_END) { + std::stringstream ss; + ss << "GZipCodec failed: "; + if (stream_.msg != NULL) ss << stream_.msg; + throw ParquetException(ss.str()); + } } - // Reset the stream for this block - if (inflateReset(&stream_) != Z_OK) { - throw ParquetException("zlib inflateReset failed: " + std::string(stream_.msg)); + int64_t MaxCompressedLen(int64_t input_length, const uint8_t* input) { + // Most be in compression mode + if (!compressor_initialized_) { InitCompressor(); } + // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase) + return deflateBound(&stream_, static_cast<uLong>(input_length)); } - int ret = 0; - // gzip can run in streaming mode or non-streaming mode. We only - // support the non-streaming use case where we present it the entire - // compressed input and a buffer big enough to contain the entire - // compressed output. In the case where we don't know the output, - // we just make a bigger buffer and try the non-streaming mode - // from the beginning again. - while (ret != Z_STREAM_END) { + int64_t Compress(int64_t input_length, const uint8_t* input, int64_t output_length, + uint8_t* output) { + if (!compressor_initialized_) { InitCompressor(); } stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input)); stream_.avail_in = input_length; stream_.next_out = reinterpret_cast<Bytef*>(output); stream_.avail_out = output_length; - // We know the output size. In this case, we can use Z_FINISH - // which is more efficient. - ret = inflate(&stream_, Z_FINISH); - if (ret == Z_STREAM_END || ret != Z_OK) break; + int64_t ret = 0; + if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) { + if (ret == Z_OK) { + // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too + // small + throw ParquetException("zlib deflate failed, output buffer too small"); + } + std::stringstream ss; + ss << "zlib deflate failed: " << stream_.msg; + throw ParquetException(ss.str()); + } - // Failure, buffer was too small - std::stringstream ss; - ss << "Too small a buffer passed to GZipCodec. InputLength=" << input_length - << " OutputLength=" << output_length; - throw ParquetException(ss.str()); - } + if (deflateReset(&stream_) != Z_OK) { + throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg)); + } - // Failure for some other reason - if (ret != Z_STREAM_END) { - std::stringstream ss; - ss << "GZipCodec failed: "; - if (stream_.msg != NULL) ss << stream_.msg; - throw ParquetException(ss.str()); + // Actual output length + return output_length - stream_.avail_out; } + + private: + // zlib is stateful and the z_stream state variable must be initialized + // before + z_stream stream_; + + // Realistically, this will always be GZIP, but we leave the option open to + // configure + GZipCodec::Format format_; + + // These variables are mutually exclusive. When the codec is in "compressor" + // state, compressor_initialized_ is true while decompressor_initialized_ is + // false. When it's decompressing, the opposite is true. + // + // Indeed, this is slightly hacky, but the alternative is having separate + // Compressor and Decompressor classes. If this ever becomes an issue, we can + // perform the refactoring then + bool compressor_initialized_; + bool decompressor_initialized_; +}; + +GZipCodec::GZipCodec(Format format) { + impl_.reset(new GZipCodecImpl(format)); +} + +GZipCodec::~GZipCodec() {} + +void GZipCodec::Decompress( + int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) { + return impl_->Decompress(input_length, input, output_length, output); } int64_t GZipCodec::MaxCompressedLen(int64_t input_length, const uint8_t* input) { - // Most be in compression mode - if (!compressor_initialized_) { InitCompressor(); } - // TODO(wesm): deal with zlib < 1.2.3 (see Impala codebase) - return deflateBound(&stream_, static_cast<uLong>(input_length)); + return impl_->MaxCompressedLen(input_length, input); } int64_t GZipCodec::Compress( int64_t input_length, const uint8_t* input, int64_t output_length, uint8_t* output) { - if (!compressor_initialized_) { InitCompressor(); } - stream_.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input)); - stream_.avail_in = input_length; - stream_.next_out = reinterpret_cast<Bytef*>(output); - stream_.avail_out = output_length; - - int64_t ret = 0; - if ((ret = deflate(&stream_, Z_FINISH)) != Z_STREAM_END) { - if (ret == Z_OK) { - // will return Z_OK (and stream.msg NOT set) if stream.avail_out is too - // small - throw ParquetException("zlib deflate failed, output buffer too small"); - } - std::stringstream ss; - ss << "zlib deflate failed: " << stream_.msg; - throw ParquetException(ss.str()); - } - - if (deflateReset(&stream_) != Z_OK) { - throw ParquetException("zlib deflateReset failed: " + std::string(stream_.msg)); - } + return impl_->Compress(input_length, input, output_length, output); +} - // Actual output length - return output_length - stream_.avail_out; +const char* GZipCodec::name() const { + return "gzip"; } // ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/compression.h ---------------------------------------------------------------------- diff --git a/src/parquet/compression.h b/src/parquet/compression.h index f006e54..f0a38b6 100644 --- a/src/parquet/compression.h +++ b/src/parquet/compression.h @@ -18,19 +18,18 @@ #ifndef PARQUET_COMPRESSION_CODEC_H #define PARQUET_COMPRESSION_CODEC_H -#include <zlib.h> - #include <cstdint> #include <memory> #include "parquet/exception.h" #include "parquet/types.h" +#include "parquet/util/visibility.h" namespace parquet { -class Codec { +class PARQUET_EXPORT Codec { public: - virtual ~Codec() {} + virtual ~Codec(); static std::unique_ptr<Codec> Create(Compression::type codec); @@ -48,15 +47,15 @@ class Codec { // Snappy codec. class PARQUET_EXPORT SnappyCodec : public Codec { public: - virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer); + void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) override; - virtual int64_t Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer); + int64_t Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer) override; - virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input); + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - virtual const char* name() const { return "snappy"; } + const char* name() const { return "snappy"; } }; // Brotli codec. @@ -86,38 +85,20 @@ class PARQUET_EXPORT GZipCodec : public Codec { explicit GZipCodec(Format format = GZIP); virtual ~GZipCodec(); - virtual void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output_buffer); + void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) override; - virtual int64_t Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer); + int64_t Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer) override; - virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input); + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - virtual const char* name() const { return "gzip"; } + const char* name() const override; private: - // zlib is stateful and the z_stream state variable must be initialized - // before - z_stream stream_; - - // Realistically, this will always be GZIP, but we leave the option open to - // configure - Format format_; - - // These variables are mutually exclusive. When the codec is in "compressor" - // state, compressor_initialized_ is true while decompressor_initialized_ is - // false. When it's decompressing, the opposite is true. - // - // Indeed, this is slightly hacky, but the alternative is having separate - // Compressor and Decompressor classes. If this ever becomes an issue, we can - // perform the refactoring then - void InitCompressor(); - void InitDecompressor(); - void EndCompressor(); - void EndDecompressor(); - bool compressor_initialized_; - bool decompressor_initialized_; + // The gzip compressor is stateful + class GZipCodecImpl; + std::unique_ptr<GZipCodecImpl> impl_; }; } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/6787492e/src/parquet/public-api-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/public-api-test.cc b/src/parquet/public-api-test.cc index a5c88da..4d6f675 100644 --- a/src/parquet/public-api-test.cc +++ b/src/parquet/public-api-test.cc @@ -34,6 +34,12 @@ TEST(TestPublicAPI, DoesNotExportDCHECK) { #endif } +TEST(TestPublicAPI, DoesNotIncludeZlib) { +#ifdef ZLIB_H + FAIL() << "zlib.h should not be transitively included"; +#endif +} + void ThrowsParquetException() { throw parquet::ParquetException("This function throws"); }
