[arrow] Diff for: [GitHub] kszucs closed pull request #3403: ARROW-4260: [Python] NumPy buffer protocol failure
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 387dd55d18..78f5e41928 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -95,11 +95,12 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y ^ +conda create -n arrow -q -y -c conda-forge ^ + --file=ci\conda_env_python.yml ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython hypothesis ^ - thrift-cpp=0.11.0 boost-cpp ^ - -c conda-forge + numpy=1.14 ^ + thrift-cpp=0.11 ^ + boost-cpp call activate arrow @@ -109,9 +110,9 @@ set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds - conda install -q -y --file=ci\conda_env_cpp.yml ^ -python=%PYTHON% ^ --c conda-forge + conda install -q -y -c conda-forge ^ +--file=ci\conda_env_cpp.yml ^ +python=%PYTHON% set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library ) diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index 644170775d..d9d7e548dd 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -72,7 +72,7 @@ popd pushd python -pip install pickle5 +pip install -r requirements.txt pickle5 set PYARROW_CXXFLAGS=%ARROW_CXXFLAGS% set PYARROW_CMAKE_GENERATOR=%GENERATOR% diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index e9a1122755..c3e2d1903c 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -50,7 +50,7 @@ conda create -y -q -p $CONDA_ENV_DIR \ nomkl \ cmake \ pip \ - numpy=1.13.1 \ + numpy=1.14 \ python=${PYTHON_VERSION} \ ${CONDA_JVM_DEPS} @@ -124,7 +124,7 @@ $ARROW_CPP_BUILD_DIR/$ARROW_BUILD_TYPE/arrow-python-test pushd $ARROW_PYTHON_DIR # Other stuff pip install -pip install -q -r requirements.txt +pip install -r requirements.txt if [ "$PYTHON_VERSION" == "3.6" ]; then pip install -q pickle5 diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 466d2e9562..264b51c32f 100644 --- 
a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -23,6 +23,7 @@ from collections import OrderedDict from datetime import date, datetime, time, timedelta +from distutils.version import LooseVersion import hypothesis as h import hypothesis.extra.pytz as tzst @@ -2224,8 +2225,6 @@ def test_safe_unsafe_casts(self): def _fully_loaded_dataframe_example(): -from distutils.version import LooseVersion - index = pd.MultiIndex.from_arrays([ pd.date_range('2000-01-01', periods=5).repeat(2), np.tile(np.array(['foo', 'bar'], dtype=object), 5) @@ -2271,6 +2270,8 @@ def _check_serialize_components_roundtrip(df): tm.assert_frame_equal(df, deserialized) +@pytest.mark.skipif(LooseVersion(np.__version__) >= '0.16', +reason='Until numpy/numpy#12745 is resolved') def test_serialize_deserialize_pandas(): # ARROW-1784, serialize and deserialize DataFrame by decomposing # BlockManager With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs closed pull request #3405: ARROW-4266: [Python][CI] Disable ORC tests in dask integration test
diff --git a/integration/dask/runtest.sh b/integration/dask/runtest.sh index 9a37e0a67b..baf9ccf445 100755 --- a/integration/dask/runtest.sh +++ b/integration/dask/runtest.sh @@ -29,6 +29,7 @@ python -c "import pyarrow.parquet" # pytest -sv --pyargs dask.bytes.tests.test_hdfs # pytest -sv --pyargs dask.bytes.tests.test_local -pytest -v --pyargs dask.dataframe.io.tests.test_orc +# TODO(kszucs): re-enable it, for more see ARROW-3910 +# pytest -v --pyargs dask.dataframe.io.tests.test_orc pytest -v --pyargs dask.dataframe.io.tests.test_parquet pytest -v --pyargs dask.dataframe.tests.test_dataframe With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3395: ARROW-4258: [Python] Safe cast fails from numpy float64 array with nans to integer
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index aada6bf598..a944b80914 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -443,8 +443,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { -RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); +RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, + cast_options_, pool_, data)); } return Status::OK(); @@ -477,8 +477,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } @@ -518,8 +518,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* d } else { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), _type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); } } diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 264b51c32f..9bee9053c5 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -2224,6 +2224,15 @@ def test_safe_unsafe_casts(self): assert table.column('B').type == pa.int32() +def test_safe_cast_from_float_with_nans_to_int(): +# TODO(kszucs): write tests for creating Date32 and Date64 arrays, see +# ARROW-4258 and 
https://github.com/apache/arrow/pull/3395 +values = pd.Series([1, 2, None, 4]) +arr = pa.Array.from_pandas(values, type=pa.int32(), safe=True) +expected = pa.array([1, 2, None, 4], type=pa.int32()) +assert arr.equals(expected) + + def _fully_loaded_dataframe_example(): index = pd.MultiIndex.from_arrays([ pd.date_range('2000-01-01', periods=5).repeat(2), diff --git a/python/pyarrow/tests/test_plasma_tf_op.py b/python/pyarrow/tests/test_plasma_tf_op.py index e239055209..53ecae217e 100644 --- a/python/pyarrow/tests/test_plasma_tf_op.py +++ b/python/pyarrow/tests/test_plasma_tf_op.py @@ -82,6 +82,7 @@ def FromPlasma(): @pytest.mark.plasma @pytest.mark.tensorflow +@pytest.mark.skip(reason='Until ARROW-4259 is resolved') def test_plasma_tf_op(use_gpu=False): import pyarrow.plasma as plasma import tensorflow as tf With regards, Apache Git Services
[arrow] Diff for: [GitHub] kou closed pull request #3390: ARROW-4228: [GLib] Add garrow_list_data_type_get_field()
diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 5ddc1c3dd8..675900a5be 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -88,9 +88,26 @@ garrow_list_data_type_new(GArrowField *field) * @list_data_type: A #GArrowListDataType. * * Returns: (transfer full): The field of value. + * + * Deprecated: 0.13.0: + * Use garrow_list_data_type_get_field() instead. */ GArrowField * garrow_list_data_type_get_value_field(GArrowListDataType *list_data_type) +{ + return garrow_list_data_type_get_field(list_data_type); +} + +/** + * garrow_list_data_type_get_field: + * @list_data_type: A #GArrowListDataType. + * + * Returns: (transfer full): The field of value. + * + * Since: 0.13.0 + */ +GArrowField * +garrow_list_data_type_get_field(GArrowListDataType *list_data_type) { auto data_type = GARROW_DATA_TYPE(list_data_type); auto arrow_data_type = garrow_data_type_get_raw(data_type); diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h index f60a9cdeb6..beb312597d 100644 --- a/c_glib/arrow-glib/composite-data-type.h +++ b/c_glib/arrow-glib/composite-data-type.h @@ -22,6 +22,7 @@ #include #include #include +#include G_BEGIN_DECLS @@ -67,7 +68,12 @@ struct _GArrowListDataTypeClass GType garrow_list_data_type_get_type (void) G_GNUC_CONST; GArrowListDataType *garrow_list_data_type_new (GArrowField *field); +#ifndef GARROW_DISABLE_DEPRECATED +GARROW_DEPRECATED_IN_0_13_FOR(garrow_list_data_type_get_field) GArrowField *garrow_list_data_type_get_value_field (GArrowListDataType *list_data_type); +#endif +GARROW_AVAILABLE_IN_0_13 +GArrowField *garrow_list_data_type_get_field (GArrowListDataType *list_data_type); #define GARROW_TYPE_STRUCT_DATA_TYPE (garrow_struct_data_type_get_type()) diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 501827d06e..827b9c9a81 100644 --- a/c_glib/arrow-glib/version.h.in +++ 
b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,15 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_0_13: + * + * You can use this macro value for compile time API version check. + * + * Since: 0.13.0 + */ +#define GARROW_VERSION_0_13 G_ENCODE_VERSION(0, 13) + /** * GARROW_VERSION_0_12: * @@ -175,6 +184,20 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_13 +# define GARROW_DEPRECATED_IN_0_13 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_0_13_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_0_13 +# define GARROW_DEPRECATED_IN_0_13_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_0_13 +# define GARROW_AVAILABLE_IN_0_13 GARROW_UNAVAILABLE(0, 13) +#else +# define GARROW_AVAILABLE_IN_0_13 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_12 # define GARROW_DEPRECATED_IN_0_12 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_0_12_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index f9f01fe23e..1016703001 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -163,6 +163,10 @@ Index of deprecated API + +Index of new symbols in 0.13.0 + + Index of new symbols in 0.12.0 diff --git a/c_glib/test/test-list-data-type.rb b/c_glib/test/test-list-data-type.rb index 2d96fcb21e..78df28a144 100644 --- a/c_glib/test/test-list-data-type.rb +++ b/c_glib/test/test-list-data-type.rb @@ -30,14 +30,14 @@ def test_to_s assert_equal("list", @data_type.to_s) end - def test_value_field + def test_field assert_equal([ @field, @field_data_type, ], [ - @data_type.value_field, - @data_type.value_field.data_type, + @data_type.field, + @data_type.field.data_type, ]) end end With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs merged pull request #3406: [CI] Temporary fix for conda-forge migration
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 78f5e41928..5b653a9050 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -95,7 +95,7 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y -c conda-forge ^ +conda create -n arrow -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_python.yml ^ python=%PYTHON% ^ numpy=1.14 ^ @@ -110,7 +110,7 @@ set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds - conda install -q -y -c conda-forge ^ + conda install -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_cpp.yml ^ python=%PYTHON% diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 4cae2cb1ea..2a46d130ea 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -39,7 +39,7 @@ if defined need_vcvarsall ( ) ) -if "%GENERATOR%"=="Ninja" conda install -y -q ninja +if "%GENERATOR%"=="Ninja" conda install -y -q ninja -c conda-forge/label/cf201901 if "%USE_CLCACHE%" == "true" ( @rem Use clcache for faster builds diff --git a/ci/appveyor-cpp-test-cmake-script.bat b/ci/appveyor-cpp-test-cmake-script.bat index 415406c4ac..d55a24dccf 100644 --- a/ci/appveyor-cpp-test-cmake-script.bat +++ b/ci/appveyor-cpp-test-cmake-script.bat @@ -19,8 +19,8 @@ @rem Validate cmake script behaviour on missed lib in toolchain set CONDA_ENV=arrow-cmake-tests-libs -conda create -n %CONDA_ENV% -q -y -conda install -n %CONDA_ENV% -q -y -c conda-forge boost-cpp +conda create -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 +conda install -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 boost-cpp call activate %CONDA_ENV% set BUILD_DIR=cpp\build-cmake-test @@ -161,8 +161,8 @@ call deactivate @rem Validate libs availability in conda toolchain set CONDA_ENV=arrow-cmake-tests-toolchain -conda create -n %CONDA_ENV% -q -y -conda install -n %CONDA_ENV% -q -y -c conda-forge ^ +conda create -n %CONDA_ENV% -q -y 
-c conda-forge/label/cf201901 +conda install -n %CONDA_ENV% -q -y -c conda-forge/label/cf201901 ^ --file=ci\conda_env_cpp.yml call activate %CONDA_ENV% diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat index d9d7e548dd..6629060a47 100644 --- a/ci/cpp-msvc-build-main.bat +++ b/ci/cpp-msvc-build-main.bat @@ -105,7 +105,7 @@ popd call deactivate -conda create -n wheel_test -q -y python=%PYTHON% || exit /B +conda create -n wheel_test -q -y -c conda-forge/label/cf201901 python=%PYTHON% || exit /B call activate wheel_test diff --git a/ci/travis_before_script_c_glib.sh b/ci/travis_before_script_c_glib.sh index e8dd0cdc80..8507f779b3 100755 --- a/ci/travis_before_script_c_glib.sh +++ b/ci/travis_before_script_c_glib.sh @@ -23,7 +23,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh -conda create -n meson -y -q python=3.6 +conda create -n meson -y -q -c conda-forge/label/cf201901 python=3.6 conda activate meson pip install meson @@ -36,7 +36,7 @@ else autoconf-archive \ gtk-doc-tools \ libgirepository1.0-dev - conda install -q -y ninja + conda install -q -y ninja -c conda-forge/label/cf201901 fi gem install test-unit gobject-introspection diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 82031e8fd3..aa50d5966d 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -27,7 +27,7 @@ if [ ! 
-e $CPP_TOOLCHAIN ]; then fi # Set up C++ toolchain from conda-forge packages for faster builds -conda create -y -q -p $CPP_TOOLCHAIN \ +conda create -y -q -p $CPP_TOOLCHAIN -c conda-forge/label/cf201901 \ --file=$TRAVIS_BUILD_DIR/ci/conda_env_cpp.yml \ ${CONDA_LLVM} \ ccache \ diff --git a/ci/travis_script_integration.sh b/ci/travis_script_integration.sh index 342db58b5d..5571ebc0b0 100755 --- a/ci/travis_script_integration.sh +++ b/ci/travis_script_integration.sh @@ -43,14 +43,14 @@ popd pushd $ARROW_INTEGRATION_DIR CONDA_ENV_NAME=arrow-integration-test -conda create -y -q -n $CONDA_ENV_NAME python=3.5 +conda create -y -q -n $CONDA_ENV_NAME -c conda-forge/label/cf201901 python=3.5 conda activate $CONDA_ENV_NAME # faster builds, please -conda install -y nomkl +conda install -y nomkl -c conda-forge/label/cf201901 # Expensive dependencies install from Continuum package repo -conda install -y pip numpy six +conda install -y pip numpy six -c conda-forge/label/cf201901 # ARROW-4008: Create a directory to write temporary files since /tmp can be # unstable in Travis CI diff --git a/ci/travis_script_manylinux.sh b/ci/travis_script_manylinux.sh index 588d0f9a7b..59b818af69 100755 --- a/ci/travis_script_manylinux.sh +++ b/ci/travis_script_manylinux.sh @@ -34,7 +34,7 @@ source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh PYTHON_VERSION=3.6
[arrow] Diff for: [GitHub] robertnishihara closed pull request #3410: ARROW-4269: [Python] Fix serialization in pandas 0.22
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 6bbe1c7bc8..9b261c1bf7 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -174,26 +174,27 @@ def _deserialize_pandas_series(data): custom_serializer=_pickle_to_buffer, custom_deserializer=_load_pickle_from_buffer) -if hasattr(pd.core.arrays, 'interval'): -context.register_type( -pd.core.arrays.interval.IntervalArray, -'pd.core.arrays.interval.IntervalArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) - -if hasattr(pd.core.arrays, 'period'): -context.register_type( -pd.core.arrays.period.PeriodArray, -'pd.core.arrays.period.PeriodArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) - -if hasattr(pd.core.arrays, 'datetimes'): -context.register_type( -pd.core.arrays.datetimes.DatetimeArray, -'pd.core.arrays.datetimes.DatetimeArray', -custom_serializer=_pickle_to_buffer, -custom_deserializer=_load_pickle_from_buffer) +if hasattr(pd.core, 'arrays'): +if hasattr(pd.core.arrays, 'interval'): +context.register_type( +pd.core.arrays.interval.IntervalArray, +'pd.core.arrays.interval.IntervalArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) + +if hasattr(pd.core.arrays, 'period'): +context.register_type( +pd.core.arrays.period.PeriodArray, +'pd.core.arrays.period.PeriodArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) + +if hasattr(pd.core.arrays, 'datetimes'): +context.register_type( +pd.core.arrays.datetimes.DatetimeArray, +'pd.core.arrays.datetimes.DatetimeArray', +custom_serializer=_pickle_to_buffer, +custom_deserializer=_load_pickle_from_buffer) context.register_type( pd.DataFrame, 'pd.DataFrame', With regards, Apache Git Services
[arrow] Diff for: [GitHub] kszucs closed pull request #3402: WIP ARROW-4260: [Python] NumPy buffer protocol failure
diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 387dd55d18..6681f5673f 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -97,7 +97,7 @@ if "%JOB%" == "Build_Debug" ( conda create -n arrow -q -y ^ python=%PYTHON% ^ - six pytest setuptools numpy pandas cython hypothesis ^ + --file=ci\conda_env_python.yml ^ thrift-cpp=0.11.0 boost-cpp ^ -c conda-forge diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index b51f5c32f3..0a5b57167f 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -18,8 +18,9 @@ cython cloudpickle hypothesis -numpy +numpy=1.15.4 pandas pytest setuptools setuptools_scm +six \ No newline at end of file With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3397: ARROW-4257: [Release] Update release verification script to check binaries on Bintray
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index c8b9c54c82..28fb1e5eb0 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -87,24 +87,56 @@ fetch_archive() { shasum -a 512 -c ${dist_name}.tar.gz.sha512 } +bintray() { + local command=$1 + shift + local path=$1 + shift + local url=https://bintray.com/api/v1${path} + echo "${command} ${url}" 1>&2 + curl \ +--fail \ +--basic \ +--user "${BINTRAY_USER}:${BINTRAY_PASSWORD}" \ +--header "Content-Type: application/json" \ +--request ${command} \ +${url} \ +"$@" | \ + jq . +} + +download_bintray_files() { + local target=$1 + + local version_name=${VERSION}-rc${RC_NUMBER} + + local files=$( +bintray \ + GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \ + jq -r ".[].path") + + for file in ${files}; do +mkdir -p "$(dirname ${file})" +curl \ + --fail \ + --location \ + --output ${file} \ + https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file} + done +} + verify_binary_artifacts() { - # --show-progress not supported on wget < 1.16 - wget --help | grep -q '\--show-progress' && \ - _WGET_PROGRESS_OPT="-q --show-progress" || _WGET_PROGRESS_OPT="" - - # download the binaries folder for the current RC - rcname=apache-arrow-${VERSION}-rc${RC_NUMBER} - wget -P "$rcname" \ ---quiet \ ---no-host-directories \ ---cut-dirs=5 \ -$_WGET_PROGRESS_OPT \ ---no-parent \ ---reject 'index.html*' \ ---recursive "$ARROW_DIST_URL/$rcname/binaries/" + local download_dir=binaries + mkdir -p ${download_dir} + pushd ${download_dir} + + # takes longer on slow network + for target in centos debian python ubuntu; do +download_bintray_files ${target} + done # verify the signature and the checksums of each artifact - find $rcname/binaries -name '*.asc' | while read sigfile; do + find . 
-name '*.asc' | while read sigfile; do artifact=${sigfile/.asc/} gpg --verify $sigfile $artifact || exit 1 @@ -112,10 +144,14 @@ verify_binary_artifacts() { # basename of the artifact pushd $(dirname $artifact) base_artifact=$(basename $artifact) -shasum -a 256 -c $base_artifact.sha256 || exit 1 +if [ -f $base_artifact.sha256 ]; then + shasum -a 256 -c $base_artifact.sha256 || exit 1 +fi shasum -a 512 -c $base_artifact.sha512 || exit 1 popd done + + popd } setup_tempdir() { @@ -343,7 +379,14 @@ if [ "$ARTIFACT" == "source" ]; then test_integration test_rust else - # takes longer on slow network + if [ -z "${BINTRAY_PASSWORD}" ]; then +echo "BINTRAY_PASSWORD is empty" +exit 1 + fi + + : ${BINTRAY_USER:=$USER} + : ${BINTRAY_REPOSITORY:=apache/arrow} + verify_binary_artifacts fi With regards, Apache Git Services
[arrow] Diff for: [GitHub] wesm closed pull request #3398: ARROW-4246: [Plasma][Python] PlasmaClient.list returns wrong information with CUDA enabled Plasma
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 17155b2dff..8e90fb301c 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -93,10 +93,6 @@ struct ObjectTableEntry { int64_t data_size; /// Size of the object metadata in bytes. int64_t metadata_size; -#ifdef PLASMA_CUDA - /// IPC GPU handle to share with clients. - std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; -#endif /// Number of clients currently using this object. int ref_count; /// Unix epoch of when this object was created. @@ -108,6 +104,13 @@ struct ObjectTableEntry { ObjectState state; /// The digest of the object. Used to see if two objects are the same. unsigned char digest[kDigestSize]; + +#ifdef PLASMA_CUDA + /// Put CUDA related members at the last to create Python bindings easily. + + /// IPC GPU handle to share with clients. + std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; +#endif }; /// Mapping from ObjectIDs to information about the object. With regards, Apache Git Services
[arrow] Diff for: [GitHub] kou closed pull request #3401: ARROW-4246: [Plasma][Python][Follow-up] Ensure plasma::ObjectTableEntry always has the same size regardless of whether built with CUDA support
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 8e90fb301c..dfbd90c3aa 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -72,6 +72,12 @@ enum class ObjectState : int { PLASMA_SEALED }; +namespace internal { + +struct CudaIpcPlaceholder {}; + +} // namespace internal + /// This type is used by the Plasma store. It is here because it is exposed to /// the eviction policy. struct ObjectTableEntry { @@ -106,10 +112,10 @@ struct ObjectTableEntry { unsigned char digest[kDigestSize]; #ifdef PLASMA_CUDA - /// Put CUDA related members at the last to create Python bindings easily. - /// IPC GPU handle to share with clients. std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; +#else + std::shared_ptr ipc_handle; #endif }; diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index cfaa39c96e..4f64f202ce 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -45,6 +45,8 @@ PLASMA_WAIT_TIMEOUT = 2 ** 30 cdef extern from "plasma/common.h" nogil: +cdef cppclass CCudaIpcPlaceholder" plasma::internal::CudaIpcPlaceholder": +pass cdef cppclass CUniqueID" plasma::UniqueID": @@ -79,6 +81,7 @@ cdef extern from "plasma/common.h" nogil: int64_t create_time int64_t construct_duration CObjectState state +shared_ptr[CCudaIpcPlaceholder] ipc_handle ctypedef unordered_map[CUniqueID, unique_ptr[CObjectTableEntry]] \ CObjectTable" plasma::ObjectTable" With regards, Apache Git Services
[arrow] Diff for: [GitHub] robertnishihara closed pull request #3392: ARROW-4249: [Plasma] Clean up client namespace
diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index 38925fef92..17155b2dff 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -33,7 +33,6 @@ #include "plasma/compat.h" #include "arrow/status.h" -#include "arrow/util/logging.h" #ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index d2794e89d3..cc425428ec 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -22,6 +22,7 @@ #include #include "arrow/status.h" +#include "arrow/util/logging.h" #include "plasma/common.h" #include "plasma/plasma_generated.h" diff --git a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc index fa376ec43c..d552994e54 100644 --- a/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc +++ b/cpp/src/plasma/lib/java/org_apache_arrow_plasma_PlasmaClientJNI.cc @@ -28,6 +28,8 @@ #include #include +#include "arrow/util/logging.h" + #include "plasma/client.h" constexpr jsize OBJECT_ID_SIZE = sizeof(plasma::ObjectID) / sizeof(jbyte); diff --git a/python/pyarrow/tensorflow/plasma_op.cc b/python/pyarrow/tensorflow/plasma_op.cc index 852be33938..bf4eec7891 100644 --- a/python/pyarrow/tensorflow/plasma_op.cc +++ b/python/pyarrow/tensorflow/plasma_op.cc @@ -33,6 +33,7 @@ #include "arrow/adapters/tensorflow/convert.h" #include "arrow/api.h" #include "arrow/io/api.h" +#include "arrow/util/logging.h" // These headers do not include Python.h #include "arrow/python/deserialize.h" With regards, Apache Git Services
[arrow] Diff for: [GitHub] xhochy closed pull request #3393: ARROW-4256: [Release] Fix Windows verification script for 0.12 release
diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat index cc25b045dc..c85ece4465 100644 --- a/dev/release/verify-release-candidate.bat +++ b/dev/release/verify-release-candidate.bat @@ -46,12 +46,11 @@ call conda create -p %_VERIFICATION_CONDA_ENV% -f -q -y python=%PYTHON% || exit call activate %_VERIFICATION_CONDA_ENV% || exit /B call conda install -y ^ - six pytest setuptools numpy pandas cython ^ - thrift-cpp flatbuffers rapidjson ^ - cmake ^ - git ^ - boost-cpp ^ - snappy zlib brotli gflags lz4-c zstd -c conda-forge || exit /B + python=3.7 ^ + git ^ + --file=ci\conda_env_cpp.yml ^ + --file=ci\conda_env_python.yml ^ + -c conda-forge || exit /B set GENERATOR=Visual Studio 14 2015 Win64 set CONFIGURATION=release With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520553530 Thanks for the suggestions for simplifying the script, @kou. I did them and the test build seemed to succeed: see https://enpiar.com/arrow-site/ for what was deployed. You may want to review the diff on the gh-pages branch there for what your proposed `rsync` command made. I think it's right but it deleted a lot of files--there seemed to be a copy of the website in a `_site` directory, and then there were some stale/misdated copies of blog posts that got removed too. https://github.com/nealrichardson/arrow-site/commit/b4431a6f2f2e2aa0bcec24e1f9b4b7130f9bc210 is huge (> 2M lines deleted) so it's probably easier to review starting around here in the Travis log: https://travis-ci.org/nealrichardson/arrow-site/builds/570970342#L6417 I spot-checked some of the deleted blog posts and there are copies of them with a different date (e.g. the 0.3.0 release post is at https://enpiar.com/arrow-site/blog/2017/05/08/0.3-release/, while in production there is the post at 2017/05/08 and then a (somewhat broken) copy at 05/07 (http://arrow.apache.org/blog/2017/05/07/0.3-release/), which seems to be what `rsync` removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521025692 IDK why that file mode would change, but it sounds like a good idea. No reason for a .png to be 755 is there? /events/ looks like it hasn't been touched since 2016. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521493249 +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #11: ARROW-6246: [Website] Add link to R documentation site
nealrichardson opened a new pull request #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521471371 It seems that `img/arrow.png`'s mode has been changed by https://github.com/apache/arrow-site/commit/dea4fb2e708151fc3f9ef07f44f6520770dc283b#diff-6ad51cc5e7d889c484fcb3597709f7d7 . Could you restore the mode of `img/arrow.png` to `0644`? Then we can merge this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521471569 If we want to maintain `/events/` again, we should migrate the source of `/events/` to a Jekyll-based one. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-521457081 @kou @wesm are we good here? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #11: ARROW-6246: [Website] Add link to R documentation site
wesm merged pull request #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #11: ARROW-6246: [Website] Add link to R documentation site
wesm commented on issue #11: ARROW-6246: [Website] Add link to R documentation site URL: https://github.com/apache/arrow-site/pull/11#issuecomment-521651223 LGTM, thanks This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory
wesm commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10#issuecomment-520646619 Thanks =) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #10: ARROW-6217: [Website] Remove needless _site/ directory
wesm merged pull request #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520659486 #10 has been merged. Could you rebase your fork? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520661062 Here's the diff: https://github.com/nealrichardson/arrow-site/commit/a0d9803c06df8099fb916f24fff579edddf4fefe Note that many of the line changes are the insertion of the relative path on my `gh-pages` branch. Here's the commit summary that names all of the deleted files: https://travis-ci.org/nealrichardson/arrow-site/jobs/570970343#L309 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou opened a new pull request #10: ARROW-6217: [Website] Remove needless _site/ directory
kou opened a new pull request #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
kou commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520629972 I've created #10 to remove the needless `_site/` directory. Once we merge it, we can confirm the diff for this change. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520609697 I suspect those duplicate (stale) blog posts are from before the timezone was hardcoded (https://github.com/apache/arrow-site/commit/aa19f5d845e2be843af649e340a7821ada7d9424). All but one are in 2017. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2)
nealrichardson commented on issue #9: ARROW-4473: [Website] Support test site deployment (take 2) URL: https://github.com/apache/arrow-site/pull/9#issuecomment-520634784 fine, you can have the -2,000,000 line diff :P FTR http://arrow.apache.org/_site/ does exist and shows a version of 0.12. https://github.com/apache/arrow-site/commit/99f824603cc862d3cef2eb6bad5a89aadd929951 is the commit that created that directory and the only one that has touched it, so definitely looks like an accident. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory
nealrichardson commented on issue #10: ARROW-6217: [Website] Remove needless _site/ directory URL: https://github.com/apache/arrow-site/pull/10#issuecomment-520634975 +1 from me FWIW. https://github.com/apache/arrow-site/commit/99f824603cc862d3cef2eb6bad5a89aadd929951 is the commit that created that directory and the only one that has touched it, so definitely looks like an accident. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
nealrichardson commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7#issuecomment-521335374 This has been revised to build the CRAN published 0.14.1.1 package, and it removes all changes outside of the new `docs/r/` directory. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
wesm merged pull request #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site
wesm commented on issue #7: ARROW-6139: [Documentation][R] Build R docs (pkgdown) site and add to arrow-site URL: https://github.com/apache/arrow-site/pull/7#issuecomment-521335846 Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm closed pull request #15: Create README.md
wesm closed pull request #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #15: Create README.md
wesm commented on issue #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15#issuecomment-523662542 This branch is only for publishing the website. There is a README already at https://github.com/apache/arrow-site/tree/master see also https://issues.apache.org/jira/browse/INFRA-18914 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson opened a new pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16 This change enables the use of [GitHub deploy keys](https://developer.github.com/v3/guides/managing-deploy-keys/#deploy-keys), which are tied to a repository and not an individual (in contrast with personal access tokens). Once this is merged, we can add a key pair to the repository settings in GitHub and Travis (INFRA may be required to do one or both of those) and then commits to the website source in the `master` branch will automatically build and push the generated static site to the `asf-site` branch. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316460102 ## File path: build-and-deploy.sh ## @@ -3,13 +3,25 @@ set -ev if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then -if [ -z "${GITHUB_PAT}" ]; then +if [ -z "${GITHUB_PAT}" ] && [ -z "${DEPLOY_KEY}" ]; then # Don't build because we can't publish -echo "To publish the site, you must set a GITHUB_PAT at" +echo "To publish the site, you must set a GITHUB_PAT or DEPLOY_KEY at" echo "https://travis-ci.org/${TRAVIS_REPO_SLUG}/settings"; exit 1 fi +if [ "${DEPLOY_KEY}" != "" ]; then Review comment: Could you use one of `-z "..."` style or `"..." != ""` style in this script for consistency? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] fsaintjacques merged pull request #14: Fix PR link on powered_by page in source
fsaintjacques merged pull request #14: Fix PR link on powered_by page in source URL: https://github.com/apache/arrow-site/pull/14 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals opened a new pull request #15: Create README.md
RandomFractals opened a new pull request #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #14: Fix PR link on powered_by page in source
nealrichardson opened a new pull request #14: Fix PR link on powered_by page in source URL: https://github.com/apache/arrow-site/pull/14 See https://github.com/apache/arrow/issues/5156 and https://github.com/apache/arrow-site/pull/13 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson opened a new pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction
nealrichardson opened a new pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction URL: https://github.com/apache/arrow-site/pull/12 This is publishing the latest changes to the website (since we don't have the auto build and deploy set up on apache/arrow-site yet). It has the side effect of deleting README.md, which was stale, along with a bunch of other unused artifacts, using the `rsync -a --delete --exclude '/.git/' --exclude '/docs/'` command that Kou suggested on https://github.com/apache/arrow-site/pull/9. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316479684 ## File path: build-and-deploy.sh ## @@ -35,7 +35,22 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +# Stick it in "scripts" because Jekyll ignores it +echo $DEPLOY_KEY > scripts/deploy_key +# Hack to make the key from the env var have real newlines +sed -i 's/\\n/\ +/g' scripts/deploy_key +chmod 600 scripts/deploy_key +eval $(ssh-agent -s) +ssh-add scripts/deploy_key Review comment: We can use `ssh-add -` here: ```bash echo "${DEPLOY_KEY}" | sed -e 's/\\n/\n/g' | ssh-add - ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316486584 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Seems to work. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316489942 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Thanks for always managing to teach me a new bash trick :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou merged pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou merged pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316489518 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Thanks for the try! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316482335 ## File path: build-and-deploy.sh ## @@ -35,7 +35,22 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +# Stick it in "scripts" because Jekyll ignores it +echo $DEPLOY_KEY > scripts/deploy_key +# Hack to make the key from the env var have real newlines +sed -i 's/\\n/\ +/g' scripts/deploy_key +chmod 600 scripts/deploy_key +eval $(ssh-agent -s) +ssh-add scripts/deploy_key Review comment: Done, nice to avoid making that file This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
kou commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316484106 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Can we use `s/\\n/\n/g` here? It works well with bash on my Debian GNU/Linux: ```bash #!/bin/bash DEPLOY_KEY="-BEGIN\ OPENSSH\ PRIVATE\ KEY-\\naaa\\nbbb" echo "${DEPLOY_KEY}" echo "${DEPLOY_KEY}" | sed -e 's/\\n/\n/g' ``` ```console $ /tmp/a.sh -BEGIN\ OPENSSH\ PRIVATE\ KEY-\naaa\nbbb -BEGIN\ OPENSSH\ PRIVATE\ KEY- aaa bbb ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316477513 ## File path: build-and-deploy.sh ## @@ -3,13 +3,25 @@ set -ev if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then -if [ -z "${GITHUB_PAT}" ]; then +if [ -z "${GITHUB_PAT}" ] && [ -z "${DEPLOY_KEY}" ]; then # Don't build because we can't publish -echo "To publish the site, you must set a GITHUB_PAT at" +echo "To publish the site, you must set a GITHUB_PAT or DEPLOY_KEY at" echo "https://travis-ci.org/${TRAVIS_REPO_SLUG}/settings"; exit 1 fi +if [ "${DEPLOY_KEY}" != "" ]; then Review comment: Thanks! Done, PTAL. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site
nealrichardson commented on a change in pull request #16: ARROW-6260: [Website] Use deploy key on Travis to build and push to asf-site URL: https://github.com/apache/arrow-site/pull/16#discussion_r316486127 ## File path: build-and-deploy.sh ## @@ -35,7 +35,18 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; JEKYLL_ENV=production bundle exec jekyll build --baseurl="${BASE_URL}" # Publish -git clone -b ${TARGET_BRANCH} https://${GITHUB_PAT}@github.com/$TRAVIS_REPO_SLUG.git OUTPUT +if [ "${DEPLOY_KEY}" != "" ]; then +echo "Setting deploy key" +eval $(ssh-agent -s) +# Hack to make the key from the env var have real newlines +echo "${DEPLOY_KEY}" | sed -e 's/\\n/\ +/g' | ssh-add - Review comment: Didn't work on my mac but it doesn't have to as long as it works on Travis ;) I'll give it a shot. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #15: Create README.md
RandomFractals commented on issue #15: Create README.md URL: https://github.com/apache/arrow-site/pull/15#issuecomment-523818984 There was no README.md there yesterday :) Thanks for addressing it in that jira ticket and master branch! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals opened a new pull request #17: Updated powered-by.md to include Data Preview
RandomFractals opened a new pull request #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17 Resolves https://github.com/RandomFractals/vscode-data-preview/issues/139 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] saintstack commented on a change in pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
saintstack commented on a change in pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18#discussion_r318292197 ## File path: committers.html ## @@ -18,24 +18,12 @@ Committers Dremio -Todd Lipcon -PMC -todd -Cloudera - - Ted Dunning PMC tdunning MapR -Michael Stack -PMC -stack -Cloudera - - Review comment: Thanks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm opened a new pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
wesm opened a new pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18 A couple of inactive PMC members have requested to be moved to Emeritus status. They can always be restored to PMC status at a later time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-525516786 Hi! Please let me know if there is anything outstanding I need to do for you to accept this powered-by.md PR. I've tried to follow your instructions on that site page & added my Data Preview tool in alphabetical order to the list of great tools and frameworks using Apache Arrow you already have listed there. Thanks! cc @wesm This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status
wesm merged pull request #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #17: Updated powered-by.md to include Data Preview
wesm merged pull request #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-525517650 thank you @wesm This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #18: Update committer/PMC roster. Move a couple people to Emeritus status
RandomFractals commented on issue #18: Update committer/PMC roster. Move a couple people to Emeritus status URL: https://github.com/apache/arrow-site/pull/18#issuecomment-525494883 hi guys! I could use your prompt attention to this PR too. thanks in advance! https://github.com/apache/arrow-site/pull/17 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm opened a new pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm opened a new pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19 The dates will need to be changed for the actual publication date. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-527286878 cc @hatemhelal @xhochy for any review. Note that we have dropped BinaryArray read performance in the non-dictionary case. Not sure why that is yet. I opened https://issues.apache.org/jira/browse/ARROW-6417 to investigate ![20190903_parquet_read_perf](https://user-images.githubusercontent.com/329591/64141564-2b9a4b80-cdce-11e9-94ea-bfcc0dea0b23.png) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-527287458 In light of the mixed performance results the post might need a new title to reframe around the dictionary read improvements This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320453511 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: Lowercase is the official styling of the project name =) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320429697 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430467 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320433177 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432915 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually Review comment: This seems to start a new section, where you've moved on from "background" and are starting to discuss your optimizations. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432044 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: lowercase "pandas" reads odd to start a sentence This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430931 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320431812 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we Review comment: I'd move this paragraph to after the list of Jiras. This is an odd way to start a "summary" This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320430285 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320432165 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. Review comment: Link to that please This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320429352 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder`
[GitHub] [arrow-site] nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
nealrichardson commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320431254 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 Review comment: ```suggestion This post reviews the work that was done and shows benchmarks comparing Arrow 0.11.0 ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r320453000 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,233 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on string-heavy data coming in Apache Arrow 0.15" +date: "2019-09-01 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string types, including native support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +We discuss the work that was done and show benchmarks comparing Arrow 0.11.0 +(released in October, 2018) with the current development version (to be +released soon as Arrow 0.15.0). + +# Summary of work + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the Parquet +specification. 
+ +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly written into an + Arrow `DictionaryBuilder` without
[GitHub] [arrow-site] kou merged pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction
kou merged pull request #12: ARROW-6225: [Website] Update arrow-site/README and any other places to point website contributors in right direction URL: https://github.com/apache/arrow-site/pull/12 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm merged pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321321708 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be directly
[GitHub] [arrow-site] wesm commented on issue #17: Updated powered-by.md to include Data Preview
wesm commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-528422201 Next site update This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview
RandomFractals commented on issue #17: Updated powered-by.md to include Data Preview URL: https://github.com/apache/arrow-site/pull/17#issuecomment-528421046 @wesm & the site maintainers: I still don't see this addition on your powered-by site: https://arrow.apache.org/powered_by/ Is it just queued up for your next site update? Can you please provide an ETA on that? Thanks in advance! Taras This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321296155 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary Review comment: I think it makes sense to drop the arrow namespace and use `DictionaryArray` throughout the post: ```suggestion
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321294831 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] Review comment: Shameless plug... ```suggestion many repeated values. MATLAB and pandas users will know this as the [Categorical type][8] ``` Can add link to this doc page too: https://www.mathworks.com/help/matlab/categorical-arrays.html Sorry this messes up the citation numbering... This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321301937 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321296528 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321242756 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) Review comment: ```suggestion Direct writing of `arrow::DictionaryArray` to Parquet column writers ([ARROW-3246][5]) ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321302667 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
hatemhelal commented on a change in pull request #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#discussion_r321303184 ## File path: _posts/2019-09-03-faster-strings-cpp-parquet.md ## @@ -0,0 +1,238 @@ +--- +layout: post +title: "Faster C++ Apache Parquet performance on dictionary-encoded string data coming in Apache Arrow 0.15" +date: "2019-09-05 00:00:00 -0600" +author: Wes McKinney +categories: [application] +--- + + +We have been implementing a series of optimizations in the Apache Parquet C++ +internals to improve read and write efficiency (both performance and memory +use) for Arrow columnar binary and string data, with new "native" support for +Arrow's dictionary types. This should have a big impact on users of the C++, +MATLAB, Python, R, and Ruby interfaces to Parquet files. + +This post reviews work that was done and shows benchmarks comparing Arrow +0.12.1 with the current development version (to be released soon as Arrow +0.15.0). + +# Summary of work + +One of the largest and most complex optimizations involves encoding and +decoding Parquet files' internal dictionary-encoded data streams to and from +Arrow's in-memory dictionary-encoded `DictionaryArray` +representation. Dictionary encoding is a compression strategy in Parquet, and +there is no formal "dictionary" or "categorical" type. I will go into more +detail about this below. 
+ +Some of the particular JIRA issues related to this work include: + +- Vectorize comparators for computing statistics ([PARQUET-1523][1]) +- Read binary directly data directly into DictionaryBuilder + ([ARROW-3769][2]) +- Writing Parquet's dictionary indices directly into DictionaryBuilder + ([ARROW-3772][3]) +- Write dense (non-dictionary) Arrow arrays directly into Parquet data encoders + ([ARROW-6152][4]) +- Direct writing of arrow::DictionaryArray to Parquet column writers ([ARROW-3246][5]) +- Supporting changing dictionaries ([ARROW-3144][6]) +- Internal IO optimizations and improved raw `BYTE_ARRAY` encoding performance + ([ARROW-4398][7]) + +One of the challenges of developing the Parquet C++ library is that we +maintain low-level read and write APIs that do not involve the Arrow columnar +data structures. So we have had to take care to do Arrow-related optimizations +without impacting non-Arrow Parquet users, which includes database systems like +Clickhouse and Vertica. + +# Background: how Parquet files do dictionary encoding + +Many direct and indirect users of Apache Arrow use dictionary encoding to +improve performance and memory use on binary or string data types that include +many repeated values. pandas users will know this as the [Categorical type][8] +while in R such encoding is known as [`factor`][9]. In the Arrow C++ library +and various bindings we have the `DictionaryArray` object for representing such +data in memory. + +For example, an array such as + +``` +['apple', 'orange', 'apple', NULL, 'orange', 'orange'] +``` + +has dictionary-encoded form + +``` +dictionary: ['apple', 'orange'] +indices: [0, 1, 0, NULL, 1, 1] +``` + +The [Parquet format uses dictionary encoding][10] to compress data, and it is +used for all Parquet data types, not just binary or string data. 
Parquet +further uses bit-packing and run-length encoding (RLE) to compress the +dictionary indices, so if you had data like + +``` +['apple', 'apple', 'apple', 'apple', 'apple', 'apple', 'orange'] +``` + +the indices would be encoded like + +``` +[rle-run=(6, 0), + bit-packed-run=[1]] +``` + +The full details of the rle-bitpacking encoding are found in the [Parquet +specification][10]. + +When writing a Parquet file, most implementations will use dictionary encoding +to compress a column until the dictionary itself reaches a certain size +threshold, usually around 1 megabyte. At this point, the column writer will +"fall back" to `PLAIN` encoding where values are written end-to-end in "data +pages" and then usually compressed with Snappy or Gzip. See the following rough +diagram: + + + + + +# Faster reading and writing of dictionary-encoded data + +When reading a Parquet file, the dictionary-encoded portions are usually +materialized to their non-dictionary-encoded form, causing binary or string +values to be duplicated in memory. So an obvious (but not trivial) optimization +is to skip this "dense" materialization. There are several issues to deal with: + +* A Parquet file often contains multiple ColumnChunks for each semantic column, + and the dictionary values may be different in each ColumnChunk +* We must gracefully handle the "fall back" portion which is not + dictionary-encoded + +We pursued several avenues to help with this: + +* Allowing each `arrow::DictionaryArray` to have a different dictionary + (before, the dictionary was part of the `DictionaryType`, which caused + problems) +* We enabled the Parquet dictionary indices to be
[GitHub] [arrow-site] wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15
wesm commented on issue #19: ARROW-6419: [Website] Blog post about Parquet C++ read performance improvements in Arrow 0.15 URL: https://github.com/apache/arrow-site/pull/19#issuecomment-528024557 I'll address the comments and update the benchmark results with ARROW-6417 taken into account so we can publish this ~tomorrow This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #21: ARROW-6497: [Website] On change to master branch, automatically make PR to asf-site
kou commented on issue #21: ARROW-6497: [Website] On change to master branch, automatically make PR to asf-site URL: https://github.com/apache/arrow-site/pull/21#issuecomment-530261322 I don't try it yet but we may be able to use GitHub Actions and the default `GITHUB_TOKEN` in GitHub Actions instead of this Travis CI and pull request approach. See also: https://help.github.com/en/articles/virtual-environments-for-github-actions#github_token-secret I'll take a look into this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm merged pull request #22: ARROW-6505: [Website] Add new committers
wesm merged pull request #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #22: ARROW-6505: [Website] Add new committers
wesm commented on issue #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22#issuecomment-529949292 I added the missing affiliations. +1 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on issue #22: ARROW-6505: [Website] Add new committers
mrkn commented on issue #22: ARROW-6505: [Website] Add new committers URL: https://github.com/apache/arrow-site/pull/22#issuecomment-529980970 Thanks, @wesm! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on a change in pull request #22: Add new committers
mrkn commented on a change in pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22#discussion_r322564482 ## File path: committers.html ## @@ -251,6 +251,24 @@ Committers TBD Dremio + +Ben Kietzman +Committer +bkietz + + + +Kenta Murata +Committer +mrkn +Speee, Inc. + + +Neal Richardson +Committer +npr + Review comment: @nealrichardson Could you tell me your affiliation? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn opened a new pull request #22: Add new committers
mrkn opened a new pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22 I'd like to add new committers on the list. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] mrkn commented on a change in pull request #22: Add new committers
mrkn commented on a change in pull request #22: Add new committers URL: https://github.com/apache/arrow-site/pull/22#discussion_r322564323 ## File path: committers.html ## @@ -251,6 +251,24 @@ Committers TBD Dremio + +Ben Kietzman +Committer +bkietz + Review comment: @bkietz Could you tell me your affiliation? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] nealrichardson commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
nealrichardson commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531883882 https://kou.github.io/arrow-site/ works for me (now) so maybe whatever the problem was is fixed? If it works, then I'm +1 on the change. I'd just ask that you update the README to note this change and update the instructions there (remove the stuff about deploy keys and PATs, at minimum). Also, is there any setup required to make this work? Do I have to do something to enable github actions on my fork? Do we have to do something to enable it on apache/arrow-site? Are you certain we are permitted to do that on apache/arrow-site? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
kou commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531973905 I've removed all automatic deployment documentation from the README, because we don't need to do anything manually. This GitHub Action runs on apache/arrow-site, so contributors need to do nothing. apache/arrow-site is already GitHub Actions ready. I'm also adding a step to this GitHub Action to comment the preview GitHub Pages URL. If we don't have this, we need to describe the preview URL in README. It doesn't work yet. I'll leave a comment here when it works. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] wesm commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions
wesm commented on issue #24: ARROW-6569: [Website] Add support for auto deployment by GitHub Actions URL: https://github.com/apache/arrow-site/pull/24#issuecomment-531951936 @nealrichardson it appears that GitHub Actions may allow us to benefit from the capabilities of Azure Devops without having to jump through hoops with ASF Infra (previously we've been unable to use it because of repo permissions issue -- Azure Devops / Pipelines require that their app have write access to the repo) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags
kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags URL: https://github.com/apache/arrow-site/pull/23#discussion_r324439945 ## File path: .gitignore ## @@ -8,3 +8,4 @@ build/ .bundle/ ruby/ .DS_Store +themes/ Review comment: Is this needed? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services
[GitHub] [arrow-site] kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags
kou commented on a change in pull request #23: ARROW-6127: [Website] Add favicons and meta tags URL: https://github.com/apache/arrow-site/pull/23#discussion_r324440167 ## File path: build-and-deploy.sh ## @@ -28,7 +28,13 @@ if [ "${TRAVIS_BRANCH}" = "master" ] && [ "${TRAVIS_PULL_REQUEST}" = "false" ]; # because we can infer it based on GitHub Pages conventions if [ "${BASE_URL}" = "" ]; then BASE_URL=$(echo $TRAVIS_REPO_SLUG | sed -e 's@.*/@/@') +FULL_URL="https://"$(echo $TRAVIS_REPO_SLUG | sed 's@/.*@.github.io@') +else +# Everything is shoved into BASE_URL so this can be empty +FULL_URL= fi +# FULL_URL is for the opengraph tags, which can't be relative +perl -pe 's@^url:.*@url: '"${FULL_URL}"'@' -i _config.yml Review comment: How about using override config file instead of rewrite the original config file? ```shell custom_config_yml="_config.override.yml" touch ${custom_config_yml} if [ "${TRAVIS_REPO_SLUG}" = "apache/arrow-site" ]; then # Production TARGET_BRANCH=asf-site BASE_URL= else # On a fork, so we'll deploy to GitHub Pages TARGET_BRANCH=gh-pages # You could supply an alternate BASE_URL, but that's not necessary # because we can infer it based on GitHub Pages conventions if [ "${BASE_URL}" = "" ]; then BASE_URL=$(echo $TRAVIS_REPO_SLUG | sed -e 's@.*/@/@') echo "url: https://$(echo $TRAVIS_REPO_SLUG | sed 's@/.*@.github.io@')" >> ${custom_config_yml} fi fi # Build JEKYLL_ENV=production \ bundle exec \ jekyll build \ --baseurl="${BASE_URL}" \ --config=_config.yml,${custom_config_yml} ``` See also: https://jekyllrb.com/docs/configuration/options/#build-command-options This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services