Repository: parquet-cpp Updated Branches: refs/heads/master 3e0e5da1c -> 1219fa48f
PARQUET-769: Add support for Brotli compression Author: Uwe L. Korn <[email protected]> Closes #194 from xhochy/PARQUET-769 and squashes the following commits: aad390f [Uwe L. Korn] Pass buffer sizes also as in parameter 9847171 [Uwe L. Korn] make format 855250d [Uwe L. Korn] make format 40e93de [Uwe L. Korn] Add FindBrotli 47b9d03 [Uwe L. Korn] PARQUET-769: Add support for Brotli compression Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/1219fa48 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/1219fa48 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/1219fa48 Branch: refs/heads/master Commit: 1219fa48ff8193829cd5ac5cf64b012de527eb24 Parents: 3e0e5da Author: Uwe L. Korn <[email protected]> Authored: Sat Nov 26 14:30:12 2016 -0500 Committer: Wes McKinney <[email protected]> Committed: Sat Nov 26 14:30:12 2016 -0500 ---------------------------------------------------------------------- CMakeLists.txt | 14 ++++ cmake_modules/FindBrotli.cmake | 105 +++++++++++++++++++++++++ src/parquet/column/column-writer-test.cc | 10 +++ src/parquet/compression/brotli-codec.cc | 53 +++++++++++++ src/parquet/compression/codec-test.cc | 4 + src/parquet/compression/codec.cc | 2 +- src/parquet/compression/codec.h | 14 ++++ src/parquet/file/file-deserialize-test.cc | 3 +- src/parquet/file/file-serialize-test.cc | 4 + thirdparty/build_thirdparty.sh | 9 +++ thirdparty/download_thirdparty.sh | 5 ++ thirdparty/set_thirdparty_env.sh | 1 + thirdparty/versions.sh | 4 + 13 files changed, 226 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index f0e14ae..a9fe089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,6 +294,16 @@ 
include_directories(SYSTEM ${SNAPPY_INCLUDE_DIR}) add_library(snappystatic STATIC IMPORTED) set_target_properties(snappystatic PROPERTIES IMPORTED_LOCATION ${SNAPPY_STATIC_LIB}) +## Brotli +find_package(Brotli REQUIRED) +include_directories(SYSTEM ${BROTLI_INCLUDE_DIR}) +add_library(brotlistatic_enc STATIC IMPORTED) +set_target_properties(brotlistatic_enc PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_ENC}) +add_library(brotlistatic_dec STATIC IMPORTED) +set_target_properties(brotlistatic_dec PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_DEC}) +add_library(brotlistatic_common STATIC IMPORTED) +set_target_properties(brotlistatic_common PROPERTIES IMPORTED_LOCATION ${BROTLI_LIBRARY_COMMON}) + ## ZLIB find_package(ZLIB REQUIRED) include_directories(SYSTEM ${ZLIB_INCLUDE_DIRS}) @@ -512,6 +522,7 @@ set(LIBPARQUET_SRCS src/parquet/column/statistics.cc src/parquet/compression/codec.cc + src/parquet/compression/brotli-codec.cc src/parquet/compression/snappy-codec.cc src/parquet/compression/gzip-codec.cc @@ -539,6 +550,9 @@ set(LIBPARQUET_LINK_LIBS set(LIBPARQUET_PRIVATE_LINK_LIBS parquet_thrift + brotlistatic_dec + brotlistatic_enc + brotlistatic_common snappystatic thriftstatic zlibstatic http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/cmake_modules/FindBrotli.cmake ---------------------------------------------------------------------- diff --git a/cmake_modules/FindBrotli.cmake b/cmake_modules/FindBrotli.cmake new file mode 100644 index 0000000..9df15ee --- /dev/null +++ b/cmake_modules/FindBrotli.cmake @@ -0,0 +1,105 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Tries to find Brotli headers and libraries.
+#
+# Usage of this module as follows:
+#
+#  find_package(Brotli)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Brotli_HOME - When set, this path is inspected instead of standard library
+#                locations as the root of the Brotli installation.
+#                The environment variable BROTLI_HOME overrides this variable.
+#
+# This module defines
+#  BROTLI_INCLUDE_DIR, directory containing headers
+#  BROTLI_LIBRARIES, list of BROTLI_LIBRARY_ENC, BROTLI_LIBRARY_DEC, BROTLI_LIBRARY_COMMON
+#  BROTLI_LIBS, directory containing brotli libraries
+#  BROTLI_STATIC_LIB, BROTLI_SHARED_LIB, expected static / shared library paths
+#  BROTLI_FOUND, whether brotli has been found
+
+if( NOT "$ENV{BROTLI_HOME}" STREQUAL "")
+    file( TO_CMAKE_PATH "$ENV{BROTLI_HOME}" _native_path )
+    list( APPEND _brotli_roots ${_native_path} )
+elseif ( Brotli_HOME )
+    list( APPEND _brotli_roots ${Brotli_HOME} )
+endif()
+
+# Search only the parameterized roots when given, else the default system paths
+if ( _brotli_roots )
+    find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "include" )
+    find_library( BROTLI_LIBRARY_ENC NAMES brotlienc
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+    find_library( BROTLI_LIBRARY_DEC NAMES brotlidec
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+    find_library( BROTLI_LIBRARY_COMMON NAMES brotlicommon
+        PATHS ${_brotli_roots} NO_DEFAULT_PATH
+        PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}" "lib" )
+else ()
+    find_path( BROTLI_INCLUDE_DIR NAMES brotli/decode.h )
+    find_library( BROTLI_LIBRARY_ENC NAMES brotlienc )
+    find_library( BROTLI_LIBRARY_DEC NAMES brotlidec )
+    find_library( BROTLI_LIBRARY_COMMON NAMES brotlicommon )
+endif ()
+
+set(BROTLI_LIBRARIES ${BROTLI_LIBRARY_ENC} ${BROTLI_LIBRARY_DEC} ${BROTLI_LIBRARY_COMMON})
+
+# All parts are required; a list with a -NOTFOUND entry can still evaluate true in if()
+if (BROTLI_INCLUDE_DIR AND BROTLI_LIBRARY_ENC AND BROTLI_LIBRARY_DEC AND BROTLI_LIBRARY_COMMON)
+  set(BROTLI_FOUND TRUE)
+  get_filename_component( BROTLI_LIBS ${BROTLI_LIBRARY_ENC} PATH )
+  set(BROTLI_LIB_NAME libbrotli)
+  set(BROTLI_STATIC_LIB
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}enc.a
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}dec.a
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}common.a)
+  set(BROTLI_SHARED_LIB
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}enc${CMAKE_SHARED_LIBRARY_SUFFIX}
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}dec${CMAKE_SHARED_LIBRARY_SUFFIX}
+      ${BROTLI_LIBS}/${BROTLI_LIB_NAME}common${CMAKE_SHARED_LIBRARY_SUFFIX})
+else ()
+  set(BROTLI_FOUND FALSE)
+endif ()
+
+if (BROTLI_FOUND)
+  if (NOT Brotli_FIND_QUIETLY)
+    message(STATUS "Found the Brotli library: ${BROTLI_LIBRARIES}")
+  endif ()
+else ()
+  if (NOT Brotli_FIND_QUIETLY)
+    if ( _brotli_roots )
+      set(BROTLI_ERR_MSG "Could not find the Brotli library. Looked in ${_brotli_roots}.")
+    else ()
+      set(BROTLI_ERR_MSG "Could not find the Brotli library. Looked in system search paths.")
+    endif ()
+    if (Brotli_FIND_REQUIRED)
+      message(FATAL_ERROR "${BROTLI_ERR_MSG}")
+    else ()
+      message(STATUS "${BROTLI_ERR_MSG}")
+    endif ()
+  endif ()
+endif ()
+
+mark_as_advanced(
+  BROTLI_INCLUDE_DIR
+  BROTLI_LIBRARY_ENC
+  BROTLI_LIBRARY_DEC
+  BROTLI_LIBRARY_COMMON
+)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 0a20ac1..5a65175 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -259,6 +259,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithSnappyCompression) {
       Encoding::PLAIN, Compression::SNAPPY, false, false, LARGE_SIZE);
 }
 
+TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithBrotliCompression) {
+  this->TestRequiredWithSettings(
+      Encoding::PLAIN, Compression::BROTLI, false, false, LARGE_SIZE);
+}
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCompression) {
   this->TestRequiredWithSettings(
       Encoding::PLAIN, Compression::GZIP, false, false, LARGE_SIZE);
@@ -274,6 +279,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndSnappyCompression) {
       Encoding::PLAIN, Compression::SNAPPY, false, true, LARGE_SIZE);
 }
 
+TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndBrotliCompression) {
+  this->TestRequiredWithSettings(
+      Encoding::PLAIN, Compression::BROTLI, false, true, LARGE_SIZE);
+}
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) {
   this->TestRequiredWithSettings(
       Encoding::PLAIN, Compression::GZIP, false, true, LARGE_SIZE);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/brotli-codec.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/brotli-codec.cc b/src/parquet/compression/brotli-codec.cc
new file mode 100644
index 0000000..24ff230
--- /dev/null
+++ b/src/parquet/compression/brotli-codec.cc
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <brotli/decode.h>
+#include <brotli/encode.h>
+
+#include "parquet/compression/codec.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+
+void BrotliCodec::Decompress(
+    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
+  // Seeded with output_len and passed by pointer; brotli's API treats it as in/out.
+  size_t decoded = output_len;
+  const auto status = BrotliDecoderDecompress(input_len, input, &decoded, output_buffer);
+  if (status == BROTLI_DECODER_RESULT_SUCCESS) { return; }
+  throw parquet::ParquetException("Corrupt brotli compressed data.");
+}
+
+int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
+  return BrotliEncoderMaxCompressedSize(input_len);
+}
+
+int64_t BrotliCodec::Compress(int64_t input_len, const uint8_t* input,
+    int64_t output_buffer_len, uint8_t* output_buffer) {
+  // TODO: Make quality configurable. We use 8 as a default as it is the best
+  // trade-off for Parquet workload
+  const int kQuality = 8;
+  size_t compressed_size = output_buffer_len;
+  const auto ok = BrotliEncoderCompress(kQuality, BROTLI_DEFAULT_WINDOW,
+      BROTLI_DEFAULT_MODE, input_len, input, &compressed_size, output_buffer);
+  if (ok != BROTLI_FALSE) { return compressed_size; }
+  throw parquet::ParquetException("Brotli compression failure.");
+}
+
+}  // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec-test.cc b/src/parquet/compression/codec-test.cc
index 2f7cc1a..f2be84b 100644
--- a/src/parquet/compression/codec-test.cc
+++ b/src/parquet/compression/codec-test.cc
@@ -73,6 +73,10 @@ TEST(TestCompressors, Snappy) {
   CheckCodec<SnappyCodec>();
 }
 
+TEST(TestCompressors, Brotli) {
+  CheckCodec<BrotliCodec>();
+}
+
 TEST(TestCompressors, GZip) {
   CheckCodec<GZipCodec>();
 }
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec.cc
----------------------------------------------------------------------
diff --git
a/src/parquet/compression/codec.cc b/src/parquet/compression/codec.cc index f5aaefd..a7e5fba 100644 --- a/src/parquet/compression/codec.cc +++ b/src/parquet/compression/codec.cc @@ -38,7 +38,7 @@ std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) { ParquetException::NYI("LZO codec not implemented"); break; case Compression::BROTLI: - ParquetException::NYI("BROTLI codec not implemented"); + result.reset(new BrotliCodec()); break; default: ParquetException::NYI("Unrecognized codec"); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/compression/codec.h ---------------------------------------------------------------------- diff --git a/src/parquet/compression/codec.h b/src/parquet/compression/codec.h index ca823c5..e803a8c 100644 --- a/src/parquet/compression/codec.h +++ b/src/parquet/compression/codec.h @@ -59,6 +59,20 @@ class SnappyCodec : public Codec { virtual const char* name() const { return "snappy"; } }; +// Brotli codec. +class BrotliCodec : public Codec { + public: + void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) override; + + int64_t Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + const char* name() const override { return "brotli"; } +}; + // GZip codec. 
class GZipCodec : public Codec { public: http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/file/file-deserialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-deserialize-test.cc b/src/parquet/file/file-deserialize-test.cc index 8f832df..5d97cd9 100644 --- a/src/parquet/file/file-deserialize-test.cc +++ b/src/parquet/file/file-deserialize-test.cc @@ -165,7 +165,8 @@ TEST_F(TestPageSerde, TestFailLargePageHeaders) { } TEST_F(TestPageSerde, Compression) { - Compression::type codec_types[2] = {Compression::GZIP, Compression::SNAPPY}; + Compression::type codec_types[3] = { + Compression::GZIP, Compression::SNAPPY, Compression::BROTLI}; // This is a dummy number data_page_header_.num_values = 32; http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/src/parquet/file/file-serialize-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc index 42a73c9..3a11cd8 100644 --- a/src/parquet/file/file-serialize-test.cc +++ b/src/parquet/file/file-serialize-test.cc @@ -119,6 +119,10 @@ TYPED_TEST(TestSerialize, SmallFileSnappy) { this->FileSerializeTest(Compression::SNAPPY); } +TYPED_TEST(TestSerialize, SmallFileBrotli) { + this->FileSerializeTest(Compression::BROTLI); +} + TYPED_TEST(TestSerialize, SmallFileGzip) { this->FileSerializeTest(Compression::GZIP); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/build_thirdparty.sh ---------------------------------------------------------------------- diff --git a/thirdparty/build_thirdparty.sh b/thirdparty/build_thirdparty.sh index 4a91516..727e722 100755 --- a/thirdparty/build_thirdparty.sh +++ b/thirdparty/build_thirdparty.sh @@ -33,6 +33,7 @@ else for arg in "$@"; do case $arg in "arrow") F_ARROW=1 ;; + "brotli") F_BROTLI=1 ;; "zlib") F_ZLIB=1 ;; "gbenchmark") F_GBENCHMARK=1 ;; 
"gtest") F_GTEST=1 ;; @@ -144,5 +145,13 @@ if [ -n "$F_ALL" -o -n "$F_ARROW" ]; then # : fi +# build brotli +if [ -n "$F_ALL" -o -n "$F_BROTLI" ]; then + cd $TP_DIR/$BROTLI_BASEDIR + cmake -DCMAKE_INSTALL_PREFIX=$PREFIX -DBUILD_SHARED_LIBS=OFF . + make -j$PARALLEL install + # : +fi + echo "---------------------" echo "Thirdparty dependencies built and installed into $PREFIX successfully" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/download_thirdparty.sh ---------------------------------------------------------------------- diff --git a/thirdparty/download_thirdparty.sh b/thirdparty/download_thirdparty.sh index 3483321..4831bbf 100755 --- a/thirdparty/download_thirdparty.sh +++ b/thirdparty/download_thirdparty.sh @@ -42,6 +42,11 @@ if [ ! -d ${ARROW_BASEDIR} ]; then download_extract_and_cleanup $ARROW_URL fi +if [ ! -d ${BROTLI_BASEDIR} ]; then + echo "Fetching brotli" + download_extract_and_cleanup $BROTLI_URL +fi + if [ ! -d ${SNAPPY_BASEDIR} ]; then echo "Fetching snappy" download_extract_and_cleanup $SNAPPY_URL http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/set_thirdparty_env.sh ---------------------------------------------------------------------- diff --git a/thirdparty/set_thirdparty_env.sh b/thirdparty/set_thirdparty_env.sh index 547ed54..e8a6068 100644 --- a/thirdparty/set_thirdparty_env.sh +++ b/thirdparty/set_thirdparty_env.sh @@ -25,6 +25,7 @@ if [ -z "$THIRDPARTY_DIR" ]; then fi export ARROW_HOME=$THIRDPARTY_DIR/installed +export BROTLI_HOME=$THIRDPARTY_DIR/installed export SNAPPY_HOME=$THIRDPARTY_DIR/installed export ZLIB_HOME=$THIRDPARTY_DIR/installed # build script doesn't support building thrift on OSX http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/1219fa48/thirdparty/versions.sh ---------------------------------------------------------------------- diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh index 855b6f7..ff5644e 100755 --- a/thirdparty/versions.sh +++ 
b/thirdparty/versions.sh @@ -19,6 +19,10 @@ ARROW_VERSION="d946e7917d55cb220becd6469ae93430f2e60764" ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz" ARROW_BASEDIR="arrow-${ARROW_VERSION}" +BROTLI_VERSION="5db62dcc9d386579609540cdf8869e95ad334bbd" +BROTLI_URL="https://github.com/google/brotli/archive/${BROTLI_VERSION}.tar.gz" +BROTLI_BASEDIR="brotli-${BROTLI_VERSION}" + SNAPPY_VERSION=1.1.3 SNAPPY_URL="https://github.com/google/snappy/releases/download/${SNAPPY_VERSION}/snappy-${SNAPPY_VERSION}.tar.gz" SNAPPY_BASEDIR=snappy-$SNAPPY_VERSION
