[arrow] Diff for: [GitHub] kszucs closed pull request #3403: ARROW-4260: [Python] NumPy buffer protocol failure
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 387dd55d18..78f5e41928 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -95,11 +95,12 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y ^ +conda create -n arrow -q -y -c conda-forge ^ + --file=ci\conda_env_python.yml ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython hypothesis ^ - thrift-cpp=0.11.0 boost-cpp ^ - -c conda-forge + numpy=1.14 ^ + thrift-cpp=0.11 ^ + boost-cpp call activate arrow @@ -109,9 +110,9 @@ set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds - conda install -q -y --file=ci\conda_env_cpp.yml ^ -python=%PYTHON% ^ --c conda-forge + conda install -q -y -c conda-forge ^ +--file=ci\conda_env_cpp.yml ^ +python=%PYTHON% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library ) diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 644170775d..d9d7e548dd 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -72,7 +72,7 @@ popd pushd python -pip install pickle5 +pip install -r requirements.txt pickle5 set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% set PYARROW_CMAKE_GENERATOR=%GENERATOR% diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index e9a1122755..c3e2d1903c 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -50,7 +50,7 @@ conda create -y -q -p $CONDA_ENV_DIR \ nomkl \ cmake \ pip \ - numpy=1.13.1 \ + numpy=1.14 \ python=${PYTHON_VERSION} \ ${CONDA_JVM_DEPS} @@ -124,7 +124,7 @@ $ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/arrow-python-test pushd $ARROW_PYTHON_DIR # Other stuff pip install -pip install -q -r requirements.txt +pip install -r requirements.txt if [ "$PYTHON_VERSION" == "3.6" ]; then pip install -q pickle5 diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 466d2e9562..264b51c32f 100644 --- 
a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -23,6 +23,7 @@ from collections import OrderedDict from datetime import date, datetime, time, timedelta +from distutils.version import LooseVersion import hypothesis as h import hypothesis.extra.pytz as tzst @@ -2224,8 +2225,6 @@ def test_safe_unsafe_casts(self): def _fully_loaded_dataframe_example(): -from distutils.version import LooseVersion - index = pd.MultiIndex.from_arrays([ pd.date_range('2000-01-01', periods=5).repeat(2), np.tile(np.array(['foo', 'bar'], dtype=object), 5) @@ -2271,6 +2270,8 @@ def _check_serialize_components_roundtrip(df): tm.assert_frame_equal(df, deserialized) +@pytest.mark.skipif(LooseVersion(np.__version__) >= '0.16', +reason='Until numpy/numpy#12745 is resolved') def test_serialize_deserialize_pandas(): # ARROW-1784, serialize and deserialize DataFrame by decomposing # BlockManager With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs closed pull request #3405: ARROW-4266: [Python][CI] Disable ORC tests in dask integration test
diff --git a/integration/dask/runtest.sh b/integration/dask/runtest.sh index 9a37e0a67b..baf9ccf445 100755 --- a/integration/dask/runtest.sh +++ b/integration/dask/runtest.sh @@ -29,6 +29,7 @@ python -c "import pyarrow.parquet" # pytest -sv --pyargs dask.bytes.tests.test_hdfs # pytest -sv --pyargs dask.bytes.tests.test_local -pytest -v --pyargs dask.dataframe.io.tests.test_orc +# TODO(kszucs): re-enable it, for more see ARROW-3910 +# pytest -v --pyargs dask.dataframe.io.tests.test_orc pytest -v --pyargs dask.dataframe.io.tests.test_parquet pytest -v --pyargs dask.dataframe.tests.test_dataframe With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3395: ARROW-4258: [Python] Safe cast fails from numpy float64 array with nans to integer
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index aada6bf598..a944b80914 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -443,8 +443,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { -RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); +RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, + cast_options_, pool_, data)); } return Status::OK(); @@ -477,8 +477,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } @@ -518,8 +518,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 264b51c32f..9bee9053c5 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -2224,6 +2224,15 @@ def test_safe_unsafe_casts(self): assert table.column('B').type == pa.int32() +def test_safe_cast_from_float_with_nans_to_int(): +# TODO(kszucs): write tests for creating Date32 and Date64 arrays, see +# ARROW-4258 and 
https://github.com/apache/arrow/pull/3395 +values = pd.Series([1, 2, None, 4]) +arr = pa.Array.from_pandas(values, type=pa.int32(), safe=True) +expected = pa.array([1, 2, None, 4], type=pa.int32()) +assert arr.equals(expected) + + def _fully_loaded_dataframe_example(): index = pd.MultiIndex.from_arrays([ pd.date_range('2000-01-01', periods=5).repeat(2), diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index e239055209..53ecae217e 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -82,6 +82,7 @@ def FromPlasma(): @pytest.mark.plasma @pytest.mark.tensorflow +@pytest.mark.skip(reason='Until ARROW-4259 is resolved') def test_plasma_tf_op(use_gpu=False): import pyarrow.plasma as plasma import tensorflow as tf With regards, Apache Git Services
[arrow] Diff for: [GitHub] kou closed pull request #3390: ARROW-4228: [GLib] Add garrow_list_data_type_get_field()
diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 5ddc1c3dd8..675900a5be 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -88,9 +88,26 @@ garrow_list_data_type_new(GArrowField *field) * @list_data_type: A #GArrowListDataType. * * Returns: (transfer full): The field of value. + * + * Deprecated: 0.13.0: + * Use garrow_list_data_type_get_field() instead. */ GArrowField * garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) +{ + return garrow_list_data_type_get_field(list_data_type); +} + +/** + * garrow_list_data_type_get_field: + * @list_data_type: A #GArrowListDataType. + * + * Returns: (transfer full): The field of value. + * + * Since: 0.13.0 + */ +GArrowField * +garrow_list_data_type_get_field(GArrowListDataType *list_data_type) { auto data_type = GARROW_DATA_TYPE(list_data_type); auto arrow_data_type = garrow_data_type_get_raw(data_type); diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index f60a9cdeb6..beb312597d 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -22,6 +22,7 @@ #include #include #include +#include G_BEGIN_DECLS @@ -67,7 +68,12 @@ struct _GArrowListDataTypeClass GType garrow_list_data_type_get_type (void) G_GNUC_CONST; GArrowListDataType *garrow_list_data_type_new (GArrowField *field); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_13_FOR(garrow_list_data_type_get_field) GArrowField *garrow_list_data_type_get_value_field (GArrowListDataType *list_data_type); +#endif +GARROW_AVAILABLE_IN_0_13 +GArrowField *garrow_list_data_type_get_field (GArrowListDataType *list_data_type); #define GARROW_TYPE_STRUCT_DATA_TYPE (garrow_struct_data_type_get_type()) diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 501827d06e..827b9c9a81 100644 --- a/c_glib/arrow-glib/version.h.in +++ 
b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_0_13: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.13.0 + */ +#define GARROW_VERSION_0_13 G_ENCODE_VERSION(0, 13) + /** * GARROW_VERSION_0_12: * @@ -175,6 +184,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_13 +# define GARROW_DEPRECATED_IN_0_13 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_13_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_13 +# define GARROW_DEPRECATED_IN_0_13_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_13 +# define GARROW_AVAILABLE_IN_0_13 GARROW_UNAVAILABLE(0, 13) +#else +# define GARROW_AVAILABLE_IN_0_13 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_12 # define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index f9f01fe23e..1016703001 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -163,6 +163,10 @@ Index of deprecated API + +Index of new symbols in 0.13.0 + + Index of new symbols in 0.12.0 diff --git a/c_glib/test/test-list-data-type.rb b/c_glib/test/test-list-data-type.rb index 2d96fcb21e..78df28a144 100644 --- a/c_glib/test/test-list-data-type.rb +++ b/c_glib/test/test-list-data-type.rb @@ -30,14 +30,14 @@ def test_to_s assert_equal("list", @data_type.to_s) end - def test_value_field + def test_field assert_equal([ @field, @field_data_type, ], [ - @data_type.value_field, - @data_type.value_field.data_type, + @data_type.field, + @data_type.field.data_type, ]) end end With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs merged pull request #3406: [CI] Temporary fix for conda-forge migration
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 78f5e41928..5b653a9050 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -95,7 +95,7 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y -c conda-forge ^ +conda create -n arrow -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_python.yml ^ python=%PYTHON% ^ numpy=1.14 ^ @@ -110,7 +110,7 @@ set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds - conda install -q -y -c conda-forge ^ + conda install -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_cpp.yml ^ python=%PYTHON% diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 4cae2cb1ea..2a46d130ea 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -39,7 +39,7 @@ if defined need_vcvarsall ( ) ) -if "%GENERATOR%"=="Ninja" conda install -y -q ninja +if "%GENERATOR%"=="Ninja" conda install -y -q ninja -c conda-forge/label/cf201901 if "%USE_CLCACHE%" == "true" ( @rem Use clcache for faster builds diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 415406c4ac..d55a24dccf 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -19,8 +19,8 @@ @rem Validate cmake script behaviour on missed lib in toolchain set CONDA_ENV=arrow-cmake-tests-libs -conda create -n %CONDA_ENV% -q -y -conda install -n %CONDA_ENV% -q -y -c conda-forge boost-cpp +conda create -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 +conda install -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 boost-cpp call activate %CONDA_ENV% set BUILD_DIR=cpp\build-cmake-test @@ -161,8 +161,8 @@ call deactivate @rem Validate libs availability in conda toolchain set CONDA_ENV=arrow-cmake-tests-toolchain -conda create -n %CONDA_ENV% -q -y -conda install -n %CONDA_ENV% -q -y -c conda-forge ^ +conda create -n %CONDA_ENV% -q -y 
-c conda-forge/label/cf201901 +conda install -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_cpp.yml call activate %CONDA_ENV% diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index d9d7e548dd..6629060a47 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -105,7 +105,7 @@ popd call deactivate -conda create -n wheel_test -q -y python=%PYTHON% || exit /B +conda create -n wheel_test -q -y -c conda-forge/label/cf201901 python=%PYTHON% || exit /B call activate wheel_test diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index e8dd0cdc80..8507f779b3 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -23,7 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh -conda create -n meson -y -q python=3.6 +conda create -n meson -y -q -c conda-forge/label/cf201901 python=3.6 conda activate meson pip install meson @@ -36,7 +36,7 @@ else autoconf-archive \ gtk-doc-tools \ libgirepository1.0-dev - conda install -q -y ninja + conda install -q -y ninja -c conda-forge/label/cf201901 fi gem install test-unit gobject-introspection diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 82031e8fd3..aa50d5966d 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -27,7 +27,7 @@ if [ ! 
-e $CPP_TOOLCHAIN ]; then fi # Set up C++ toolchain from conda-forge packages for faster builds -conda create -y -q -p $CPP_TOOLCHAIN \ +conda create -y -q -p $CPP_TOOLCHAIN -c conda-forge/label/cf201901 \ --file=$TRAVIS_BUILD_DIR/ci/conda_env_cpp.yml \ ${CONDA_LLVM} \ ccache \ diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 342db58b5d..5571ebc0b0 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -43,14 +43,14 @@ popd pushd $ARROW_INTEGRATION_DIR CONDA_ENV_NAME=arrow-integration-test -conda create -y -q -n $CONDA_ENV_NAME python=3.5 +conda create -y -q -n $CONDA_ENV_NAME -c conda-forge/label/cf201901 python=3.5 conda activate $CONDA_ENV_NAME # faster builds, please -conda install -y nomkl +conda install -y nomkl -c conda-forge/label/cf201901 # Expensive dependencies install from Continuum package repo -conda install -y pip numpy six +conda install -y pip numpy six -c conda-forge/label/cf201901 # ARROW-4008: Create a directory to write temporary files since /tmp can be # unstable in Travis CI diff --git a/ci/travis_script_manylinux.sh b/ci/travis_script_manylinux.sh index 588d0f9a7b..59b818af69 100755 --- a/ci/travis_script_manylinux.sh +++ b/ci/travis_script_manylinux.sh @@ -34,7 +34,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh PYTHON_VERSION=3.6
[arrow] Diff for: [GitHub] robertnishihara closed pull request #3410: ARROW-4269: [Python] Fix serialization in pandas 0.22
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 6bbe1c7bc8..9b261c1bf7 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -174,26 +174,27 @@ def _deserialize_pandas_series(data): custom_serializer=_pickle_to_buffer, custom_deserializer=_load_pickle_from_buffer) -if hasattr(pd.core.arrays, 'interval'): -context.register_type( -pd.core.arrays.interval.IntervalArray, -'pd.core.arrays.interval.IntervalArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) - -if hasattr(pd.core.arrays, 'period'): -context.register_type( -pd.core.arrays.period.PeriodArray, -'pd.core.arrays.period.PeriodArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) - -if hasattr(pd.core.arrays, 'datetimes'): -context.register_type( -pd.core.arrays.datetimes.DatetimeArray, -'pd.core.arrays.datetimes.DatetimeArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) +if hasattr(pd.core, 'arrays'): +if hasattr(pd.core.arrays, 'interval'): +context.register_type( +pd.core.arrays.interval.IntervalArray, +'pd.core.arrays.interval.IntervalArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) + +if hasattr(pd.core.arrays, 'period'): +context.register_type( +pd.core.arrays.period.PeriodArray, +'pd.core.arrays.period.PeriodArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) + +if hasattr(pd.core.arrays, 'datetimes'): +context.register_type( +pd.core.arrays.datetimes.DatetimeArray, +'pd.core.arrays.datetimes.DatetimeArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) context.register_type( pd.DataFrame, 'pd.DataFrame', With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs closed pull request #3402: WIP ARROW-4260: [Python] NumPy buffer protocol failure
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 387dd55d18..6681f5673f 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -97,7 +97,7 @@ if "%JOB%" == "Build_Debug" ( conda create -n arrow -q -y ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython hypothesis ^ + --file=ci\conda_env_python.yml ^ thrift-cpp=0.11.0 boost-cpp ^ -c conda-forge diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index b51f5c32f3..0a5b57167f 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -18,8 +18,9 @@ cython cloudpickle hypothesis -numpy +numpy=1.15.4 pandas pytest setuptools setuptools_scm +six \ No newline at end of file With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3397: ARROW-4257: [Release] Update release verification script to check binaries on Bintray
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index c8b9c54c82..28fb1e5eb0 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -87,24 +87,56 @@ fetch_archive() { shasum -a 512 -c ${dist_name}.tar.gz.sha512 } +bintray() { + local command=$1 + shift + local path=$1 + shift + local url=https://bintray.com/api/v1${path} + echo "${command} ${url}" 1>&2 + curl \ +--fail \ +--basic \ +--user "${BINTRAY_USER}:${BINTRAY_PASSWORD}" \ +--header "Content-Type: application/json" \ +--request ${command} \ +${url} \ +"$@" | \ + jq . +} + +download_bintray_files() { + local target=$1 + + local version_name=${VERSION}-rc${RC_NUMBER} + + local files=$( +bintray \ + GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \ + jq -r ".[].path") + + for file in ${files}; do +mkdir -p "$(dirname ${file})" +curl \ + --fail \ + --location \ + --output ${file} \ + https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file} + done +} + verify_binary_artifacts() { - # --show-progress not supported on wget < 1.16 - wget --help | grep -q '\--show-progress' && \ - _WGET_PROGRESS_OPT="-q --show-progress" || _WGET_PROGRESS_OPT="" - - # download the binaries folder for the current RC - rcname=apache-arrow-${VERSION}-rc${RC_NUMBER} - wget -P "$rcname" \ ---quiet \ ---no-host-directories \ ---cut-dirs=5 \ -$_WGET_PROGRESS_OPT \ ---no-parent \ ---reject 'index.html*' \ ---recursive "$ARROW_DIST_URL/$rcname/binaries/" + local download_dir=binaries + mkdir -p ${download_dir} + pushd ${download_dir} + + # takes longer on slow network + for target in centos debian python ubuntu; do +download_bintray_files ${target} + done # verify the signature and the checksums of each artifact - find $rcname/binaries -name '*.asc' | while read sigfile; do + find . 
-name '*.asc' | while read sigfile; do artifact=${sigfile/.asc/} gpg --verify $sigfile $artifact || exit 1 @@ -112,10 +144,14 @@ verify_binary_artifacts() { # basename of the artifact pushd $(dirname $artifact) base_artifact=$(basename $artifact) -shasum -a 256 -c $base_artifact.sha256 || exit 1 +if [ -f $base_artifact.sha256 ]; then + shasum -a 256 -c $base_artifact.sha256 || exit 1 +fi shasum -a 512 -c $base_artifact.sha512 || exit 1 popd done + + popd } setup_tempdir() { @@ -343,7 +379,14 @@ if [ "$ARTIFACT" == "source" ]; then test_integration test_rust else - # takes longer on slow network + if [ -z "${BINTRAY_PASSWORD}" ]; then +echo "BINTRAY_PASSWORD is empty" +exit 1 + fi + + : ${BINTRAY_USER:=$USER} + : ${BINTRAY_REPOSITORY:=apache/arrow} + verify_binary_artifacts fi With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3398: ARROW-4246: [Plasma][Python] PlasmaClient.list returns wrong information with CUDA enabled Plasma
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 17155b2dff..8e90fb301c 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -93,10 +93,6 @@ struct ObjectTableEntry { int64_t data_size; /// Size of the object metadata in bytes. int64_t metadata_size; -#ifdef PLASMA_CUDA - /// IPC GPU handle to share with clients. - std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; -#endif /// Number of clients currently using this object. int ref_count; /// Unix epoch of when this object was created. @@ -108,6 +104,13 @@ struct ObjectTableEntry { ObjectState state; /// The digest of the object. Used to see if two objects are the same. unsigned char digest[kDigestSize]; + +#ifdef PLASMA_CUDA + /// Put CUDA related members at the last to create Python bindings easily. + + /// IPC GPU handle to share with clients. + std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; +#endif }; /// Mapping from ObjectIDs to information about the object. With regards, Apache Git Services
[arrow] Diff for: [GitHub] kou closed pull request #3401: ARROW-4246: [Plasma][Python][Follow-up] Ensure plasma::ObjectTableEntry always has the same size regardless of whether built with CUDA support
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 8e90fb301c..dfbd90c3aa 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -72,6 +72,12 @@ enum class ObjectState : int { PLASMA_SEALED }; +namespace internal { + +struct CudaIpcPlaceholder {}; + +} // namespace internal + /// This type is used by the Plasma store. It is here because it is exposed to /// the eviction policy. struct ObjectTableEntry { @@ -106,10 +112,10 @@ struct ObjectTableEntry { unsigned char digest[kDigestSize]; #ifdef PLASMA_CUDA - /// Put CUDA related members at the last to create Python bindings easily. - /// IPC GPU handle to share with clients. std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; +#else + std::shared_ptr ipc_handle; #endif }; diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index cfaa39c96e..4f64f202ce 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -45,6 +45,8 @@ PLASMA_WAIT_TIMEOUT = 2 ** 30 cdef extern from "plasma/common.h" nogil: +cdef cppclass CCudaIpcPlaceholder" plasma::internal::CudaIpcPlaceholder": +pass cdef cppclass CUniqueID" plasma::UniqueID": @@ -79,6 +81,7 @@ cdef extern from "plasma/common.h" nogil: int64_t create_time int64_t construct_duration CObjectState state +shared_ptr[CCudaIpcPlaceholder] ipc_handle ctypedef unordered_map[CUniqueID, unique_ptr[CObjectTableEntry]] \ CObjectTable" plasma::ObjectTable" With regards, Apache Git Services
[arrow] Diff for: [GitHub] robertnishihara closed pull request #3392: ARROW-4249: [Plasma] Clean up client namespace
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 38925fef92..17155b2dff 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -33,7 +33,6 @@ #include "plasma/compat.h" #include "arrow/status.h" -#include "arrow/util/logging.h" #ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index d2794e89d3..cc425428ec 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -22,6 +22,7 @@ #include #include "arrow/status.h" +#include "arrow/util/logging.h" #include "plasma/common.h" #include "plasma/plasma_generated.h" diff --git a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc index fa376ec43c..d552994e54 100644 --- a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc +++ b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc @@ -28,6 +28,8 @@ #include #include +#include "arrow/util/logging.h" + #include "plasma/client.h" constexpr jsize OBJECT_ID_SIZE = sizeof(plasma::ObjectID) / sizeof(jbyte); diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index 852be33938..bf4eec7891 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -33,6 +33,7 @@ #include "arrow/adapters/tensorflow/convert.h" #include "arrow/api.h" #include "arrow/io/api.h" +#include "arrow/util/logging.h" // These headers do not include Python.h #include "arrow/python/deserialize.h" With regards, Apache Git Services
[arrow] Diff for: [GitHub] xhochy closed pull request #3393: ARROW-4256: [Release] Fix Windows verification script for 0.12 release
diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index cc25b045dc..c85ece4465 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -46,12 +46,11 @@ call conda create -p %_VERIFICATION_CONDA_ENV% -f -q -y python=%PYTHON% || exit call activate %_VERIFICATION_CONDA_ENV% || exit /B call conda install -y ^ - six pytest setuptools numpy pandas cython ^ - thrift-cpp flatbuffers rapidjson ^ - cmake ^ - git ^ - boost-cpp ^ - snappy zlib brotli gflags lz4-c zstd -c conda-forge || exit /B + python=3.7 ^ + git ^ + --file=ci\conda_env_cpp.yml ^ + --file=ci\conda_env_python.yml ^ + -c conda-forge || exit /B set GENERATOR=Visual Studio 14 2015 Win64 set CONFIGURATION=release With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520553530 Thanks for the suggestions for simplifying the script, @kou. I did them and the test build seemed to succeed: see https://enpiar.com/arrow-site/ for what was deployed. You may want to review the diff on the gh-pages branch there for what your proposed `rsync` command made. I think it's right but it deleted a lot of files--there seemed to be a copy of the website in a `_site` directory, and then there were some stale/misdated copies of blog posts that got removed too. https://github.com/nealrichardson/arrow-site/commit/b4431a6f2f2e2aa0bcec24e1f9b4b7130f9bc210 is huge (> 2M lines deleted) so it's probably easier to review starting around here in the Travis log: https://travis-ci.org/nealrichardson/arrow-site/builds/570970342#L6417 I spot-checked some of the deleted blog posts and there are copies of them with a different date (e.g. the 0.3.0 release post is at https://enpiar.com/arrow-site/blog/2017/05/08/0.3-release/, while in production there is the post at 2017/05/08 and then a (somewhat broken) copy at 05/07 (http://arrow.apache.org/blog/2017/05/07/0.3-release/), which seems to be what `rsync` removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521025692 IDK why that file mode would change, but it sounds like a good idea. No reason for a .png to be 755 is there? /events/ looks like it hasn't been touched since 2016. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521493249 +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #11: ARROW-6246: [Website] Add link to R documentation site
nealrichardson opened a new pull request #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521471371 It seems that `img/arrow.png`'s mode has been changed by https://github.com/apache/arrow-site/commit/dea4fb2e708151fc3f9ef07f44f6520770dc283b#diff-6ad51cc5e7d889c484fcb3597709f7d7 . Could you restore the mode of `img/arrow.png` to `0644`? Then we can merge this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521471569 If we want to maintain `/events/` again, we should migrate the source of `/events/` to a Jekyll-based one. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521457081 @kou @wesm are we good here? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #11: ARROW-6246: [Website] Add link to R documentation site
wesm merged pull request #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #11: ARROW-6246: [Website] Add link to R documentation site
wesm commented on issue #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11#issuecomment-521651223 LGTM, thanks This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory
wesm commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10#issuecomment-520646619 Thanks =) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #10: ARROW-6217: [Website] Remove needless _site/ directory
wesm merged pull request #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520659486 #10 has been merged. Could you rebase your fork? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520661062 Here's the diff: https://github.com/nealrichardson/arrow-site/commit/a0d9803c06df8099fb916f24fff579edddf4fefe Note that many of the line changes are the insertion of the relative path on my `gh-pages` branch. Here's the commit summary that names all of the deleted files: https://travis-ci.org/nealrichardson/arrow-site/jobs/570970343#L309 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou opened a new pull request #10: ARROW-6217: [Website] Remove needless _site/ directory
kou opened a new pull request #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520629972 I've created #10 to remove the needless `_site/` directory. Once we merge it, we can confirm the diff for this change. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520609697 I suspect those duplicate (stale) blog posts are from before the timezone was hardcoded (https://github.com/apache/arrow-site/commit/aa19f5d845e2be843af649e340a7821ada7d9424). All but one are in 2017. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520634784 fine, you can have the -2,000,000 line diff :P FTR http://arrow.apache.org/_site/ does exist and shows a version of 0.12. https://github.com/apache/arrow-site/commit/99f824603cc862d3cef2eb6bad5a89aadd929951 is the commit that created that directory and the only one that has touched it, so definitely looks like an accident. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory
nealrichardson commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10#issuecomment-520634975 +1 from me FWIW. https://github.com/apache/arrow-site/commit/99f824603cc862d3cef2eb6bad5a89aadd929951 is the commit that created that directory and the only one that has touched it, so definitely looks like an accident. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
nealrichardson commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7#issuecomment-521335374 This has been revised to build the CRAN published 0.14.1.1 package, and it removes all changes outside of the new `docs/r/` directory. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
wesm merged pull request #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
wesm commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7#issuecomment-521335846 Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm closed pull request #15: Create README.md
wesm closed pull request #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #15: Create README.md
wesm commented on issue #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15#issuecomment-523662542 This branch is only for publishing the website. There is a README already at https://github.com/apache/arrow-site/tree/master see also https://issues.apache.org/jira/browse/INFRA-18914 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson opened a new pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16 This change enables the use of [GitHub deploy keys](https://developer.github.com/v3/guides/managing-deploy-keys/#deploy-keys), which are tied to a repository and not an individual (in contrast with personal access tokens). Once this is merged, we can add a key pair to the repository settings in GitHub and Travis (INFRA may be required to do one or both of those) and then commits to the website source in the `master` branch will automatically build and push the generated static site to the `asf-site` branch. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316460102 ## File path: build-and-deploy.sh ## @@ -3,13 +3,25 @@ set -ev if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then -if [ -z "${GITHUB_PAT}" ]; then +if [ -z "${GITHUB_PAT}" ] && [ -z "${DEPLOY_KEY}" ]; then # Don't build because we can't publish -echo "To publish the site, you must set a GITHUB_PAT at" +echo "To publish the site, you must set a GITHUB_PAT or DEPLOY_KEY at" echo "https://travis-ci.org/${TRAVIS_REPO_SLUG}/settings"; exit 1 fi +if [ "${DEPLOY_KEY}" != "" ]; then Review comment: Could you use one of `-z "..."` style or `"..." != ""` style in this script for consistency? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] fsaintjacques merged pull request #14: Fix PR link on powered_by page in source
fsaintjacques merged pull request #14: Fix PR link on powered_by page in source URL: https://github.com/apache/arrow-site/pull/14 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals opened a new pull request #15: Create README.md
RandomFractals opened a new pull request #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #14: Fix PR link on powered_by page in source
nealrichardson opened a new pull request #14: Fix PR link on powered_by page in source URL: https://github.com/apache/arrow-site/pull/14 See https://github.com/apache/arrow/issues/5156 and https://github.com/apache/arrow-site/pull/13 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction
nealrichardson opened a new pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction URL: https://github.com/apache/arrow-site/pull/12 This is publishing the latest changes to the website (since we don't have the auto build and deploy set up on apache/arrow-site yet). It has the side effect of deleting README.md, which was stale, along with a bunch of other unused artifacts, using the `rsync -a --delete --exclude '/.git/' --exclude '/docs/'` command that Kou suggested on https://github.com/apache/arrow-site/pull/9. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316479684 ## File path: build-and-deploy.sh ## @@ -35,7 +35,22 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +# Stick it in "scripts" because Jekyll ignores it +echo $DEPLOY_KEY > scripts/deploy_key +# Hack to make the key from the env var have real newlines +sed -i 's/\\n/\ +/g' scripts/deploy_key +chmod 600 scripts/deploy_key +eval $(ssh-agent -s) +ssh-add scripts/deploy_key Review comment: We can use `ssh-add -` here: ```bash echo "${DEPLOY_KEY}" | sed -e 's/\\n/\n/g' | ssh-add - ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316486584 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Seems to work. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316489942 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Thanks for always managing to teach me a new bash trick :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou merged pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou merged pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316489518 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Thanks for the try! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316482335 ## File path: build-and-deploy.sh ## @@ -35,7 +35,22 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +# Stick it in "scripts" because Jekyll ignores it +echo $DEPLOY_KEY > scripts/deploy_key +# Hack to make the key from the env var have real newlines +sed -i 's/\\n/\ +/g' scripts/deploy_key +chmod 600 scripts/deploy_key +eval $(ssh-agent -s) +ssh-add scripts/deploy_key Review comment: Done, nice to avoid making that file This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316484106 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Can we use `s/\\n/\n/g` here? It works well with bash on my Debian GNU/Linux: ```bash #!/bin/bash DEPLOY_KEY="-BEGIN\ OPENSSH\ PRIVATE\ KEY-\\naaa\\nbbb" echo "${DEPLOY_KEY}" echo "${DEPLOY_KEY}" | sed -e 's/\\n/\n/g' ``` ```console $ /tmp/a.sh -BEGIN\ OPENSSH\ PRIVATE\ KEY-\naaa\nbbb -BEGIN\ OPENSSH\ PRIVATE\ KEY- aaa bbb ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316477513 ## File path: build-and-deploy.sh ## @@ -3,13 +3,25 @@ set -ev if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then -if [ -z "${GITHUB_PAT}" ]; then +if [ -z "${GITHUB_PAT}" ] && [ -z "${DEPLOY_KEY}" ]; then # Don't build because we can't publish -echo "To publish the site, you must set a GITHUB_PAT at" +echo "To publish the site, you must set a GITHUB_PAT or DEPLOY_KEY at" echo "https://travis-ci.org/${TRAVIS_REPO_SLUG}/settings"; exit 1 fi +if [ "${DEPLOY_KEY}" != "" ]; then Review comment: Thanks! Done, PTAL. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316486127 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Didn't work on my mac but it doesn't have to as long as it works on Travis ;) I'll give it a shot. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #15: Create README.md
RandomFractals commented on issue #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15#issuecomment-523818984 There was no README.md there yesterday :) Thanks for addressing it in that jira ticket and master branch! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals opened a new pull request #17: Updated powered-by.md to include Data Preview
RandomFractals opened a new pull request #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17 Resolves https://github.com/RandomFractals/vscode-data-preview/issues/139 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] saintstack commented on a change in pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
saintstack commented on a change in pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18#discussion_r318292197 ## File path: committers.html ## @@ -18,24 +18,12 @@ Committers Dremio -Todd Lipcon -PMC -todd -Cloudera - - Ted Dunning PMC tdunning MapR -Michael Stack -PMC -stack -Cloudera - - Review comment: Thanks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm opened a new pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
wesm opened a new pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18 A couple of inactive PMC members have requested to be moved to Emeritus status. They can always be restored to PMC status at a later time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-525516786 Hi! Please let me know if there is anything outstanding I need to do for you to accept this powered-by.md PR. I've tried to follow your instructions on that site page & added my Data Preview tool in alphabetical order to the list of great tools and frameworks using Apache Arrow you already have listed there. Thanks! cc @wesm This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
wesm merged pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #17: Updated powered-by.md to include Data Preview
wesm merged pull request #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-525517650 thank you @wesm This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #18: Update committer/PMC roster. Move a couple people to Emeritus status
RandomFractals commented on issue #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18#issuecomment-525494883 hi guys! I could use your prompt attention to this PR too. thanks in advance! https://github.com/apache/arrow-site/pull/17 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm opened a new pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm opened a new pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19 The dates will need to be changed for the actual publication date. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-527286878 cc @hatemhelal @xhochy for any review. Note that we have dropped BinaryArray read performance in the non-dictionary case. Not sure why that is yet. I opened https://issues.apache.org/jira/browse/ARROW-6417 to investigate ![20190903_parquet_read_perf](https://user-images.githubusercontent.com/329591/64141564-2b9a4b80-cdce-11e9-94ea-bfcc0dea0b23.png) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-527287458 In light of the mixed performance results the post might need a new title to reframe around the dictionary read improvements This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320453511 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: Lowercase is the official styling of the project name =) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320429697 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430467 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320433177 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432915 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually Review comment: This seems to start a new section, where you've moved on from "background" and are starting to discuss your optimizations. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432044 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: lowercase "pandas" reads odd to start a sentence This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430931 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320431812 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we Review comment: I'd move this paragraph to after the list of Jiras. This is an odd way to start a "summary" This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430285 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432165 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. Review comment: Link to that please This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320429352 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320431254 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 Review comment: ```suggestion This post reviews the work that was done and shows benchmarks comparing Arrow 0.11.0 ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320453000 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder` without
[GitHub] [arrow-site] kou merged pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction
kou merged pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction URL: https://github.com/apache/arrow-site/pull/12 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm merged pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321321708 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly
[GitHub] [arrow-site] wesm commented on issue #17: Updated powered-by.md to include Data Preview
wesm commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-528422201 Next site update This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-528421046 @wesm & the site maintainers: I still don't see this addition on your powered-by site: https://arrow.apache.org/powered_by/ Is it just queued up for your next site update? Can you please provide an ETA on that? Thanks in advance! Taras This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321296155 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary Review comment: I think it makes sense to drop the arrow namespace and use `DictionaryArray` throughout the post: ```suggestion
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321294831 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: Shameless plug... ```suggestion many repeated values. MATLAB and pandas users will know this as the [Categorical type][8] ``` Can add link to this doc page too: https://www.mathworks.com/help/matlab/categorical-arrays.html Sorry this messes up the citation numbering... This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321301937 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321296528 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321242756 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) Review comment: ```suggestion Direct writing of `arrow::DictionaryArray` to Parquet column writers ([ARROW-3246][5]) ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321302667 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321303184 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-528024557 I'll address the comments and update the benchmark results with ARROW-6417 taken into account so we can publish this ~tomorrow This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #21: ARROW-6497: [Website] On change to master branch, automatically make PR to asf-site
kou commented on issue #21: ARROW-6497: [Website] On change to master branch, automatically make PR to asf-site URL: https://github.com/apache/arrow-site/pull/21#issuecomment-530261322 I don't try it yet but we may be able to use GitHub Actions and the default `GITHUB_TOKEN` in GitHub Actions instead of this Travis CI and pull request approach. See also: https://help.github.com/en/articles/virtual-environments-for-github-actions#github_token-secret I'll take a look into this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #22: ARROW-6505: [Website] Add new committers
wesm merged pull request #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #22: ARROW-6505: [Website] Add new committers
wesm commented on issue #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22#issuecomment-529949292 I added the missing affiliations. +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on issue #22: ARROW-6505: [Website] Add new committers
mrkn commented on issue #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22#issuecomment-529980970 Thanks, @wesm! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on a change in pull request #22: Add new committers
mrkn commented on a change in pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22#discussion_r322564482 ## File path: committers.html ## @@ -251,6 +251,24 @@ Committers TBD Dremio + +Ben Kietzman +Committer +bkietz + + + +Kenta Murata +Committer +mrkn +Speee, Inc. + + +Neal Richardson +Committer +npr + Review comment: @nealrichardson Could you tell me your affiliation? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn opened a new pull request #22: Add new committers
mrkn opened a new pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22 I'd like to add new committers on the list. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on a change in pull request #22: Add new committers
mrkn commented on a change in pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22#discussion_r322564323 ## File path: committers.html ## @@ -251,6 +251,24 @@ Committers TBD Dremio + +Ben Kietzman +Committer +bkietz + Review comment: @bkietz Could you tell me your affiliation? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
nealrichardson commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531883882 https://kou.github.io/arrow-site/ works for me (now) so maybe whatever the problem was is fixed? If it works, then I'm +1 on the change. I'd just ask that you update the README to note this change and update the instructions there (remove the stuff about deploy keys and PATs, at minimum). Also, is there any setup required to make this work? Do I have to do something to enable github actions on my fork? Do we have to do something to enable it on apache/arrow-site? Are you certain we are permitted to do that on apache/arrow-site? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
kou commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531973905 I've removed all automatic deployment documentation from the README, because we don't need to do anything manually. This GitHub Action runs on apache/arrow-site, so contributors need to do nothing. apache/arrow-site is already GitHub Actions ready. I'm also adding a step to this GitHub Action to comment the preview GitHub Pages URL. If we don't have this, we need to describe the preview URL in README. It doesn't work yet. I'll leave a comment here when it works. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
wesm commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531951936 @nealrichardson it appears that GitHub Actions may allow us to benefit from the capabilities of Azure Devops without having to jump through hoops with ASF Infra (previously we've been unable to use it because of repo permissions issue -- Azure Devops / Pipelines require that their app have write access to the repo) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags
kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags URL: https://github.com/apache/arrow-site/pull/23#discussion_r324439945 ## File path: .gitignore ## @@ -8,3 +8,4 @@ build/ .bundle/ ruby/ .DS_Store +themes/ Review comment: Is this needed? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags
kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags URL: https://github.com/apache/arrow-site/pull/23#discussion_r324440167 ## File path: build-and-deploy.sh ## @@ -28,7 +28,13 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; # because we can infer it based on GitHub Pages conventions if [ "${BASE_URL}" = "" ]; then BASE_URL=$(echo $TRAVIS_REPO_SLUG | sed -e 's@.*/@/@') +FULL_URL="https://"$(echo $TRAVIS_REPO_SLUG | sed 's@/.*@.github.io@') +else +# Everything is shoved into BASE_URL so this can be empty +FULL_URL= fi +# FULL_URL is for the opengraph tags, which can't be relative +perl -pe 's@^url:.*@url: '"${FULL_URL}"'@' -i _config.yml Review comment: How about using override config file instead of rewrite the original config file? ```shell custom_config_yml="_config.override.yml" touch ${custom_config_yml} if [ "${TRAVIS_REPO_SLUG}" = "apache/arrow-site" ]; then # Production TARGET_BRANCH=asf-site BASE_URL= else # On a fork, so we'll deploy to GitHub Pages TARGET_BRANCH=gh-pages # You could supply an alternate BASE_URL, but that's not necessary # because we can infer it based on GitHub Pages conventions if [ "${BASE_URL}" = "" ]; then BASE_URL=$(echo $TRAVIS_REPO_SLUG | sed -e 's@.*/@/@') echo "url: https://$(echo $TRAVIS_REPO_SLUG | sed 's@/.*@.github.io@')" >> ${custom_config_yml} fi fi # Build JEKYLL_ENV=production \ bundle exec \ jekyll build \ --baseurl="${BASE_URL}" \ --config=_config.yml,${custom_config_yml} ``` See also: https://jekyllrb.com/docs/configuration/options/#build-command-options This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services