[arrow] branch master updated: ARROW-3126: [Python] Make Buffered* IO classes available to Python, incorporate into input_stream, output_stream factory functions

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 7fcad2c  ARROW-3126: [Python] Make Buffered* IO classes available to 
Python, incorporate into input_stream, output_stream factory functions
7fcad2c is described below

commit 7fcad2c29e3c3ac99b2f6c1f1fddc91c05b7f2b3
Author: Krisztián Szűcs 
AuthorDate: Wed Jan 9 22:38:12 2019 -0600

ARROW-3126: [Python] Make Buffered* IO classes available to Python, 
incorporate into input_stream, output_stream factory functions

We should also add benchmarks in a follow-up PR.
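
For reference, here is a minimal sketch of how the new buffering option is meant to be used from Python (the file path and buffer size are made up; per the commit log, buffer_size=0 means unbuffered):

```python
# Minimal sketch (hypothetical path): the input_stream/output_stream factories
# wrap the raw file in a buffered stream when buffer_size > 0.
import pyarrow as pa

with pa.output_stream("/tmp/example.bin", buffer_size=64 * 1024) as sink:
    sink.write(b"some bytes")

with pa.input_stream("/tmp/example.bin", buffer_size=64 * 1024) as source:
    data = source.read()
```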

Author: Krisztián Szűcs 
Author: Wes McKinney 

Closes #3252 from kszucs/ARROW-3126 and squashes the following commits:

50118a639  Fix API in file-benchmark.cc
d3917d9e5  Code review comments, buffer_size=0 means 
unbuffered
88bed90ef  lint
5842eae0e  remove test runner script
fd729abdb  don't typehint _detect_compression
3d1e386ce  tests
5e8b38551  fix failing test
e458db5a6  python support for buffered input and output 
streams
---
 cpp/CMakeLists.txt   |   2 +-
 cpp/src/arrow/io/api.h   |   1 +
 cpp/src/arrow/io/buffered-test.cc|   5 +-
 cpp/src/arrow/io/buffered.cc |  30 +++---
 cpp/src/arrow/io/buffered.h  |  20 ++--
 cpp/src/arrow/io/file-benchmark.cc   |   9 +-
 python/pyarrow/includes/libarrow.pxd |  16 +++
 python/pyarrow/io.pxi| 195 +--
 python/pyarrow/tests/test_io.py  |  86 +++
 9 files changed, 234 insertions(+), 130 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0e4f395..08868af 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -80,7 +80,6 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR 
INFER_FOUND)
   # See http://clang.llvm.org/docs/JSONCompilationDatabase.html
   set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
 endif()
-
 # --
 # cmake options
 
@@ -358,6 +357,7 @@ endif()
 if (ARROW_USE_CCACHE)
   find_program(CCACHE_FOUND ccache)
   if(CCACHE_FOUND)
+message(STATUS "Using ccache: ${CCACHE_FOUND}")
 set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_FOUND})
 set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_FOUND})
   endif(CCACHE_FOUND)
diff --git a/cpp/src/arrow/io/api.h b/cpp/src/arrow/io/api.h
index 0d5742a..cf1be33 100644
--- a/cpp/src/arrow/io/api.h
+++ b/cpp/src/arrow/io/api.h
@@ -18,6 +18,7 @@
 #ifndef ARROW_IO_API_H
 #define ARROW_IO_API_H
 
+#include "arrow/io/buffered.h"
 #include "arrow/io/compressed.h"
 #include "arrow/io/file.h"
 #include "arrow/io/hdfs.h"
diff --git a/cpp/src/arrow/io/buffered-test.cc 
b/cpp/src/arrow/io/buffered-test.cc
index 074833d..7b9ab0c 100644
--- a/cpp/src/arrow/io/buffered-test.cc
+++ b/cpp/src/arrow/io/buffered-test.cc
@@ -105,7 +105,8 @@ class TestBufferedOutputStream : public 
FileTestFixture {
   lseek(fd_, 0, SEEK_END);
 #endif
 }
-ASSERT_OK(BufferedOutputStream::Create(file, buffer_size, _));
+ASSERT_OK(BufferedOutputStream::Create(buffer_size, default_memory_pool(), 
file,
+   _));
   }
 
   void WriteChunkwise(const std::string& datastr, const 
std::valarray& sizes) {
@@ -321,7 +322,7 @@ class TestBufferedInputStream : public 
FileTestFixture {
 std::shared_ptr file_in;
 ASSERT_OK(ReadableFile::Open(path_, _in));
 raw_ = file_in;
-ASSERT_OK(BufferedInputStream::Create(raw_, buffer_size, pool, 
_));
+ASSERT_OK(BufferedInputStream::Create(buffer_size, pool, raw_, 
_));
   }
 
  protected:
diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc
index f3eae39..0b1431f 100644
--- a/cpp/src/arrow/io/buffered.cc
+++ b/cpp/src/arrow/io/buffered.cc
@@ -91,8 +91,8 @@ class BufferedBase {
 
 class BufferedOutputStream::Impl : public BufferedBase {
  public:
-  explicit Impl(std::shared_ptr raw)
-  : BufferedBase(default_memory_pool()), raw_(std::move(raw)) {}
+  explicit Impl(std::shared_ptr raw, MemoryPool* pool)
+  : BufferedBase(pool), raw_(std::move(raw)) {}
 
   Status Close() {
 std::lock_guard guard(lock_);
@@ -173,14 +173,16 @@ class BufferedOutputStream::Impl : public BufferedBase {
   std::shared_ptr raw_;
 };
 
-BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw)
-: impl_(new BufferedOutputStream::Impl(std::move(raw))) {}
+BufferedOutputStream::BufferedOutputStream(std::shared_ptr raw,
+   MemoryPool* pool) {
+  impl_.reset(new Impl(std::move(raw), pool));
+}
 
-Status BufferedOutputStream::Create(std::shared_ptr raw,
-int64_t buffer_size,
+Status BufferedOutputStream::Create(int64_t buffer_size, MemoryPool* pool,
+ 

[arrow] branch master updated: ARROW-4065: [C++] arrowTargets.cmake is broken

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 8ab1493  ARROW-4065: [C++] arrowTargets.cmake is broken
8ab1493 is described below

commit 8ab1493c810ae354ce085c2c2052676f349b168a
Author: Kousuke Saruta 
AuthorDate: Wed Jan 9 22:30:39 2019 -0600

ARROW-4065: [C++] arrowTargets.cmake is broken

When we build Arrow's C++ library using CMake, arrowTargets.cmake is generated and installed, but it is broken.

The following is part of the generated arrowTargets.cmake:

```
# Create imported target arrow_shared
add_library(arrow_shared SHARED IMPORTED)

set_target_properties(arrow_shared PROPERTIES
  INTERFACE_LINK_LIBRARIES "dl;pthreadshared"
)

# Create imported target arrow_static
add_library(arrow_static STATIC IMPORTED)

set_target_properties(arrow_static PROPERTIES
  INTERFACE_LINK_LIBRARIES 
"glog_static;zstd_static;zlib_shared;snappy_static;lz4_static;brotli_dec_static;brotli_enc_static;brotli_common_static;double-conversion_static;boost_system_shared;boost_filesystem_shared;boost_regex_shared;jemalloc_static;rt;pthreadshared"
)
```

There is no INTERFACE_INCLUDE_DIRECTORIES, and the linker doesn't recognize pthreadshared because the actual library name for pthread is libpthread.so or libpthread.a. The *_static and *_shared names are also wrong.

After this fix, we can build apps that link to Arrow using CMake, with a CMakeLists.txt like the following:

```
cmake_minimum_required(VERSION ...)
project(...)
...
find_package(arrow)
add_executable(your_excellent_app ...)
target_link_libraries(your_excellent_app arrow_shared) # or arrow_static
...
```

`$ cmake -D CMAKE_PREFIX_PATH=/path/to/arrow /path/to/CMakeLists.txt`
`$ cmake --build .`

Author: Kousuke Saruta 

Closes #3212 from sarutak/improve-cmake-config-file-generation and squashes 
the following commits:

0213d2666  Fix cpp/CMakeLists.txt, src/arrow/CMakeLists.txt 
and BuildUtils.cmake to enable building apps which links to Arrow using 
arrowTargets.cmake
---
 cpp/CMakeLists.txt | 80 ++
 cpp/cmake_modules/BuildUtils.cmake | 30 +++---
 cpp/src/arrow/CMakeLists.txt   |  4 +-
 3 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4232af3..0e4f395 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -690,46 +690,59 @@ endif(UNIX)
 
 
 set(ARROW_LINK_LIBS)
+set(ARROW_SHARED_INSTALL_INTERFACE_LIBS)
+set(ARROW_STATIC_INSTALL_INTERFACE_LIBS)
 
 # Libraries to link statically with libarrow.so
 set(ARROW_STATIC_LINK_LIBS double-conversion_static)
+set(ARROW_STATIC_INSTALL_INTERFACE_LIBS double-conversion)
 
 if (ARROW_WITH_BROTLI)
-  SET(ARROW_STATIC_LINK_LIBS
+  list(APPEND
+ARROW_STATIC_LINK_LIBS
 brotli_dec_static
 brotli_enc_static
-brotli_common_static
-${ARROW_STATIC_LINK_LIBS})
+brotli_common_static)
+  list(APPEND
+ARROW_STATIC_INSTALL_INTERFACE_LIBS
+brotlidec
+brotlienc
+brotlicommon)
 endif()
 
 if (ARROW_WITH_BZ2)
-  SET(ARROW_STATIC_LINK_LIBS bz2_static ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS bz2_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS bz2)
 endif()
 
 if (ARROW_WITH_LZ4)
-  SET(ARROW_STATIC_LINK_LIBS lz4_static ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS lz4_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS lz4)
 endif()
 
 if (ARROW_WITH_SNAPPY)
-  SET(ARROW_STATIC_LINK_LIBS snappy_static ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS snappy_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS snappy)
 endif()
 
 if (ARROW_WITH_ZLIB)
-  SET(ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY} ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS ${ZLIB_LIBRARY})
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS z)
 endif()
 
 if (ARROW_WITH_ZSTD)
-  SET(ARROW_STATIC_LINK_LIBS zstd_static ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS zstd_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS zstd)
 endif()
 
 if (ARROW_ORC)
-  SET(ARROW_STATIC_LINK_LIBS
-${ARROW_STATIC_LINK_LIBS}
-orc_static)
+  list(APPEND ARROW_STATIC_LINK_LIBS orc_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc)
 endif()
 
 if (ARROW_USE_GLOG)
-  SET(ARROW_STATIC_LINK_LIBS glog_static ${ARROW_STATIC_LINK_LIBS})
+  list(APPEND ARROW_STATIC_LINK_LIBS glog_static)
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS glog)
   add_definitions("-DARROW_USE_GLOG")
 endif()
 
@@ -746,15 +759,24 @@ set(ARROW_SHARED_PRIVATE_LINK_LIBS
   

[arrow] branch master updated: ARROW-3428: [Python] Fix from_pandas conversion from float to bool

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 2b361fb  ARROW-3428: [Python] Fix from_pandas conversion from float to 
bool
2b361fb is described below

commit 2b361fb2e5b4321a6cdcbdbf457181702fd97eaa
Author: Bryan Cutler 
AuthorDate: Wed Jan 9 22:07:14 2019 -0600

ARROW-3428: [Python] Fix from_pandas conversion from float to bool

When `from_pandas` converts data to boolean, the values are read into a `uint8_t` and then checked. When the values are floating-point numbers, not all bits are checked, which can cause incorrect results.
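
As an illustration, a minimal sketch (hypothetical data) of the conversion path this fix targets: a pandas float column converted to an Arrow boolean column, where values such as 0.5 were previously misread because only one byte of each value was inspected:

```python
import pandas as pd
import pyarrow as pa

# Force the float64 column to convert to boolean during from_pandas.
df = pd.DataFrame({'x': [0.0, 0.5, 1.0, 2.0]})
table = pa.Table.from_pandas(df, schema=pa.schema([('x', pa.bool_())]),
                             preserve_index=False)
print(table.column('x'))  # expected: false, true, true, true
```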

Author: Bryan Cutler 

Closes #2698 from BryanCutler/python-from_pandas-float-to-bool-ARROW-3428 
and squashes the following commits:

f3d472626  added test with fix that passes, but fails other 
tests
---
 cpp/src/arrow/compute/kernels/cast-test.cc  | 19 +
 cpp/src/arrow/python/numpy_to_arrow.cc  | 66 +
 cpp/src/arrow/python/type_traits.h  |  1 +
 python/pyarrow/tests/test_convert_pandas.py | 39 ++---
 4 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc 
b/cpp/src/arrow/compute/kernels/cast-test.cc
index 781e0af..c3a0df5 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) {
   AssertBufferSame(*arr, *result, 1);
 }
 
+TEST_F(TestCast, FromBoolean) {
+  CastOptions options;
+
+  vector is_valid(20, true);
+  is_valid[3] = false;
+
+  vector v1(is_valid.size(), true);
+  vector e1(is_valid.size(), 1);
+  for (size_t i = 0; i < v1.size(); ++i) {
+if (i % 3 == 1) {
+  v1[i] = false;
+  e1[i] = 0;
+}
+  }
+
+  CheckCase(boolean(), v1, is_valid, 
int32(), e1,
+   options);
+}
+
 TEST_F(TestCast, ToBoolean) {
   CastOptions options;
   for (auto type : kNumericTypes) {
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index aa28b6e..aada6bf 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -63,6 +63,7 @@ namespace arrow {
 
 using internal::checked_cast;
 using internal::CopyBitmap;
+using internal::GenerateBitsUnrolled;
 
 namespace py {
 
@@ -246,6 +247,11 @@ class NumPyConverter {
 return Status::OK();
   }
 
+  // Called before ConvertData to ensure Numpy input buffer is in expected
+  // Arrow layout
+  template 
+  Status PrepareInputData(std::shared_ptr* data);
+
   // --
   // Traditional visitor conversion for non-object arrays
 
@@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t 
length, MemoryPool* po
 }  // namespace
 
 template 
-inline Status NumPyConverter::ConvertData(std::shared_ptr* data) {
+inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) {
   if (is_strided()) {
 RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data));
+  } else if (dtype_->type_num == NPY_BOOL) {
+int64_t nbytes = BitUtil::BytesForBits(length_);
+std::shared_ptr buffer;
+RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, ));
+
+Ndarray1DIndexer values(arr_);
+int64_t i = 0;
+const auto generate = [, ]() -> bool { return values[i++] > 0; };
+GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
+
+*data = buffer;
   } else {
 // Can zero-copy
 *data = std::make_shared(reinterpret_cast(arr_));
   }
 
+  return Status::OK();
+}
+
+template 
+inline Status NumPyConverter::ConvertData(std::shared_ptr* data) {
+  RETURN_NOT_OK(PrepareInputData(data));
+
   std::shared_ptr input_type;
   RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), 
_type));
 
@@ -427,37 +451,11 @@ inline Status 
NumPyConverter::ConvertData(std::shared_ptr* data) {
 }
 
 template <>
-inline Status 
NumPyConverter::ConvertData(std::shared_ptr* data) {
-  int64_t nbytes = BitUtil::BytesForBits(length_);
-  std::shared_ptr buffer;
-  RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, ));
-
-  Ndarray1DIndexer values(arr_);
-
-  uint8_t* bitmap = buffer->mutable_data();
-
-  memset(bitmap, 0, nbytes);
-  for (int i = 0; i < length_; ++i) {
-if (values[i] > 0) {
-  BitUtil::SetBit(bitmap, i);
-}
-  }
-
-  *data = buffer;
-  return Status::OK();
-}
-
-template <>
 inline Status NumPyConverter::ConvertData(std::shared_ptr* 
data) {
-  if (is_strided()) {
-RETURN_NOT_OK(CopyStridedArray(arr_, length_, pool_, data));
-  } else {
-// Can zero-copy
-*data = std::make_shared(reinterpret_cast(arr_));
-  }
-
   std::shared_ptr input_type;
 
+  RETURN_NOT_OK(PrepareInputData(data));
+
   auto date_dtype = 

[arrow] branch master updated: ARROW-2968: [R] Multi-threaded conversion from Arrow table to R data.frame

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 3b61349  ARROW-2968: [R] Multi-threaded conversion from Arrow table to 
R data.frame
3b61349 is described below

commit 3b61349b3c16d43003e493c7e2aec9348e7e7343
Author: Romain Francois 
AuthorDate: Wed Jan 9 22:00:12 2019 -0600

ARROW-2968: [R] Multi-threaded conversion from Arrow table to R data.frame

The `as_tibble()` methods for `arrow::RecordBatch` and `arrow::Table` gained a `use_threads` argument. When set to `TRUE`, the columns of a record batch or table are converted to R vectors in parallel.

We cannot allocate R data structures in parallel (including scalar strings), so the conversion goes like this:

```
for each column:
  - allocate the R vector host for the array
  - if that can be done in parallel, fill the R vector with data from the 
array

fill serially all columns that could not be filled in parallel

wait for all columns to be full
```

I believe this is better (although perhaps harder to explain) than
  - allocate all the vectors
  - fill them in parallel
because we don't have to wait for all the vectors to be allocated before starting to fill them.

I believe the Python implementation does the same, in `DataFrameBlockCreator::Convert`:

```
RETURN_NOT_OK(CreateBlocks());
RETURN_NOT_OK(WriteTableToBlocks());
```

I've had to split the implementation of `Array__as_vector` into two steps:

 - Allocate: this must happen on the main thread (or alternatively we would need to mutex R)
 - Ingest: for most array types, this can be done in parallel
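
The allocate-serially / fill-in-parallel pattern described above, sketched in Python with a thread pool (illustrative only; this is neither the R nor the C++ implementation, and the column names and sizes are made up):

```python
from concurrent.futures import ThreadPoolExecutor
import numpy as np

columns = {"a": range(1000), "b": range(1000)}  # stand-ins for Arrow arrays

def fill(dst, src):
    # "Ingest": copy data into an already-allocated destination vector.
    dst[:] = np.fromiter(src, dtype=np.float64, count=len(dst))

out, futures = {}, []
with ThreadPoolExecutor() as pool:
    for name, src in columns.items():
        out[name] = np.empty(1000)  # "Allocate": done serially on this thread
        # Start filling this column as soon as its destination exists.
        futures.append(pool.submit(fill, out[name], src))
    for f in futures:
        f.result()  # wait for all columns to be full
```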

Author: Romain Francois 

Closes #3332 from romainfrancois/2968/threads and squashes the following 
commits:

8261f2907  sprinkle use_threads in functions that call 
as_tibble()
3205de2d8  lint
590baf5a6  using string_view
cd0dd343e  no need for checkBuffers
29546cd5d  Some more refactoring of the Converters
5557b7974  refactor the Converter api, so that all 
Converters are implementations of the base class Converter.
e2ed26b78  lint
2a5815e03  moving parallel_ingest() to a static method of 
the Converter classes
2613d4ec4  null_count already local variable
62a842054  + to_r_index lambda, with comment about why +1
52c725fc8  default_value() marked constexpr
11e82e769  lint
d22b9c551  parallel version of Table__to_dataframe
2455bd057  parallel version of RecordBatch__to_dataframe
380d3a5bc  simplify ArrayVector__as_vector.
85881a3e2  simplify ArrayVector_To_Vector
7074b36e9  reinstate Converter_Timestamp so that 
ArrayVector__as_vector can be simplified
cf7e76bae  + parallel_ingest() to indicate if
ingest for a given converter can be done in parallel
baaaefe1b  Rework Converter api
e650b7934  + arrow::r::inspect(SEXP) for debugging
a335dfdfc  Factor out Array -> R vector code in separate 
file
1212e28a9  .Ingest() return an Invalid status 
instead of throwing an exception
39bf76403  .Ingest() return a Status instead of 
void
f68b79376  replaced DictionaryArrays_to_Vector and 
Converter_Dictionary_Int32Indices by Converter_Dictionary
d25a0e6b5  replace Date32ArrayVector_to_Vector by 
Converter_Date32
85e48c0c7  lint
18b921e6f  + Get/Set ThreadPoolCapacity
---
 r/NAMESPACE|   2 +
 r/R/RcppExports.R  |  57 +-
 r/R/RecordBatch.R  |   4 +-
 r/R/Table.R|   4 +-
 r/R/feather.R  |   5 +-
 r/R/parquet.R  |   5 +-
 r/R/read_table.R   |   4 +-
 r/man/GetCpuThreadPoolCapacity.Rd  |  18 +
 r/man/SetCpuThreadPoolCapacity.Rd  |  17 +
 r/man/read_feather.Rd  |   5 +-
 r/man/read_parquet.Rd  |   4 +-
 r/man/read_table.Rd|   4 +-
 r/src/RcppExports.cpp  | 120 ++--
 r/src/array.cpp| 496 ---
 r/src/array__to_vector.cpp | 697 +
 r/src/arrow_types.h|  12 +-
 r/src/recordbatch.cpp  |  16 -
 r/src/symbols.cpp  |   9 +
 r/src/table.cpp|  17 -
 r/src/threadpool.cpp   |  44 ++
 r/tests/testthat/test-RecordBatch.R|   1 -
 .../testthat/test-cputhreadpoolcapacity.R} |  25 +-
 22 files changed, 942 insertions(+), 624 deletions(-)

diff --git a/r/NAMESPACE 

[arrow] branch master updated: ARROW-4215: [GLib] Fix typos in documentation

2019-01-09 Thread shiro
This is an automated email from the ASF dual-hosted git repository.

shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new db29723  ARROW-4215: [GLib] Fix typos in documentation
db29723 is described below

commit db29723f661174eefd04077666347a9bbaca5be1
Author: Kouhei Sutou 
AuthorDate: Thu Jan 10 11:49:29 2019 +0900

ARROW-4215: [GLib] Fix typos in documentation

This solves the following warnings:

arrow-glib/basic-data-type.cpp:1070: warning: multi-line since docs 
found
arrow-glib/decimal128.cpp:37: warning: Section decimal is not defined 
in the arrow-glib-sections.txt file.

Author: Kouhei Sutou 

Closes #3361 from kou/glib-fix-document and squashes the following commits:

edd43c8a   Fix typos in documentation
---
 c_glib/arrow-glib/basic-data-type.cpp | 2 +-
 c_glib/arrow-glib/decimal128.cpp  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/c_glib/arrow-glib/basic-data-type.cpp 
b/c_glib/arrow-glib/basic-data-type.cpp
index 2a59996..861bbaf 100644
--- a/c_glib/arrow-glib/basic-data-type.cpp
+++ b/c_glib/arrow-glib/basic-data-type.cpp
@@ -1065,7 +1065,7 @@ 
garrow_decimal_data_type_class_init(GArrowDecimalDataTypeClass *klass)
  *
  * Since: 0.10.0
  *
- * Deprecate: 0.12.0:
+ * Deprecated: 0.12.0:
  *   Use garrow_decimal128_data_type_new() instead.
  */
 GArrowDecimalDataType *
diff --git a/c_glib/arrow-glib/decimal128.cpp b/c_glib/arrow-glib/decimal128.cpp
index a49dba5..32bdf5f 100644
--- a/c_glib/arrow-glib/decimal128.cpp
+++ b/c_glib/arrow-glib/decimal128.cpp
@@ -27,8 +27,8 @@
 G_BEGIN_DECLS
 
 /**
- * SECTION: decimal
- * @title: Decimal classes
+ * SECTION: decimal128
+ * @title: 128-bit decimal class
  * @include: arrow-glib/arrow-glib.h
  *
  * #GArrowDecimal128 is a 128-bit decimal class.



[arrow] branch master updated: ARROW-4177: [C++] Add ThreadPool and TaskGroup microbenchmarks

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new b29ecdc  ARROW-4177: [C++] Add ThreadPool and TaskGroup microbenchmarks
b29ecdc is described below

commit b29ecdce6e096618aeb110878367906b3b4b48a5
Author: Antoine Pitrou 
AuthorDate: Wed Jan 9 19:30:29 2019 -0600

ARROW-4177: [C++] Add ThreadPool and TaskGroup microbenchmarks

These benchmarks measure the number of tasks per second that can be executed, depending on task cost and number of threads. They show that for short tasks (< 10 µs) scalability can be poor, and even negative for very short tasks (< 1 µs).

Also includes an optimization of ThreadedTaskGroup to avoid taking a lock 
on the hot path.
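
For a rough feel of what such a microbenchmark measures, here is a small Python sketch of tasks-per-second versus per-task cost using a thread pool. It is illustrative only: because of Python's GIL the absolute scaling is very different from the C++ ThreadPool, so only the shape of the measurement carries over.

```python
import time
from concurrent.futures import ThreadPoolExecutor

def tasks_per_second(task_cost_s, n_tasks=10_000, n_threads=4):
    def task(_):
        end = time.perf_counter() + task_cost_s
        while time.perf_counter() < end:
            pass  # busy-wait to simulate a fixed amount of work per task
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        list(pool.map(task, range(n_tasks)))
    return n_tasks / (time.perf_counter() - start)

for cost in (1e-6, 1e-5, 1e-4):
    print(f"task cost {cost:g}s: {tasks_per_second(cost):,.0f} tasks/s")
```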

Sample output (8-core AMD CPU, Ubuntu 18.04):
```

---
Benchmark   
 Time   CPU Iterations

---
BM_WorkloadCost/task_cost:1000/repeats:1   
724 ns724 ns 987295   1.31655M items/s
BM_WorkloadCost/task_cost:1/repeats:1 
7331 ns   7330 ns  88982133.23k items/s
BM_WorkloadCost/task_cost:10/repeats:1   
73279 ns  73267 ns   9182   13.3288k items/s

BM_ThreadPoolSpawn/threads:1/task_cost:1000/repeats:1/real_time  
163842359 ns   41132762 ns  4   1.16414M items/s
BM_ThreadPoolSpawn/threads:2/task_cost:1000/repeats:1/real_time  
158705340 ns  103873994 ns  7   1.20182M items/s
BM_ThreadPoolSpawn/threads:4/task_cost:1000/repeats:1/real_time  
447998576 ns  370986805 ns  2   435.969k items/s
BM_ThreadPoolSpawn/threads:8/task_cost:1000/repeats:1/real_time  
674500180 ns  543967794 ns  1   289.568k items/s
BM_ThreadPoolSpawn/threads:1/task_cost:1/repeats:1/real_time 
150078690 ns4887868 ns  5   130.147k items/s
BM_ThreadPoolSpawn/threads:2/task_cost:1/repeats:1/real_time  
84446492 ns5402850 ns  8   231.297k items/s
BM_ThreadPoolSpawn/threads:4/task_cost:1/repeats:1/real_time  
46164089 ns4912818 ns 15   423.104k items/s
BM_ThreadPoolSpawn/threads:8/task_cost:1/repeats:1/real_time  
22703512 ns7074437 ns 31   860.317k items/s
BM_ThreadPoolSpawn/threads:1/task_cost:10/repeats:1/real_time
149733023 ns 515907 ns  4   13.0506k items/s
BM_ThreadPoolSpawn/threads:2/task_cost:10/repeats:1/real_time 
81157195 ns 448091 ns  924.078k items/s
BM_ThreadPoolSpawn/threads:4/task_cost:10/repeats:1/real_time 
45600571 ns 521094 ns 16   42.8526k items/s
BM_ThreadPoolSpawn/threads:8/task_cost:10/repeats:1/real_time 
20867873 ns 359547 ns 32   93.6416k items/s

BM_SerialTaskGroup/task_cost:1000/repeats:1/real_time  
8366557 ns8362959 ns 66   1.13998M items/s
BM_SerialTaskGroup/task_cost:1/repeats:1/real_time 
8346475 ns8345288 ns 75117.12k items/s
BM_SerialTaskGroup/task_cost:10/repeats:1/real_time
8409974 ns8408879 ns 80   11.7281k items/s

BM_ThreadedTaskGroup/threads:1/task_cost:1000/repeats:1/real_time 
12932016 ns6283623 ns 60   755.227k items/s
BM_ThreadedTaskGroup/threads:2/task_cost:1000/repeats:1/real_time 
10622580 ns8631946 ns 58   919.419k items/s
BM_ThreadedTaskGroup/threads:4/task_cost:1000/repeats:1/real_time 
25544253 ns   20347053 ns 25382.34k items/s
BM_ThreadedTaskGroup/threads:8/task_cost:1000/repeats:1/real_time 
36215077 ns   29435817 ns 19   269.683k items/s
BM_ThreadedTaskGroup/threads:1/task_cost:1/repeats:1/real_time 
9830469 ns 476288 ns 69   99.4397k items/s
BM_ThreadedTaskGroup/threads:2/task_cost:1/repeats:1/real_time 
5446608 ns 546159 ns116   179.477k items/s
BM_ThreadedTaskGroup/threads:4/task_cost:1/repeats:1/real_time 
2858316 ns 666944 ns247   341.998k items/s
BM_ThreadedTaskGroup/threads:8/task_cost:1/repeats:1/real_time 
1544885 ns 526298 ns452   632.759k items/s
BM_ThreadedTaskGroup/threads:1/task_cost:10/repeats:1/real_time
9506192 ns  53110 ns 69   10.3756k items/s
BM_ThreadedTaskGroup/threads:2/task_cost:10/repeats:1/real_time
5262119 ns  67967 ns116   18.7439k items/s

[arrow] branch master updated: ARROW-3959: [Rust] Add date/time data types

2019-01-09 Thread agrove
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 87ceb3c  ARROW-3959: [Rust] Add date/time data types
87ceb3c is described below

commit 87ceb3ca904c9e9a839ff1cc724d3139c1958047
Author: Andy Grove 
AuthorDate: Wed Jan 9 16:49:04 2019 -0700

ARROW-3959: [Rust] Add date/time data types

This only adds the date/time types to the DataType enum, as well as JSON serialization for metadata.

This PR also implements `Schema::to_json`

Author: Andy Grove 

Closes #3340 from andygrove/ARROW-3959 and squashes the following commits:

945498e  merge from master and implement Hash for DateUnit, 
TimeUnit, etc.
b05d6a0  Merge branch 'master' into ARROW-3959
312885e  Timestamp now uses TimeUnit
c3e092b  Merge branch 'master' into ARROW-3959
d289cbb  improve test
2d36927  update unit test
d51bc82  fix mistake
f4bbf10  Add date/time data types
---
 rust/arrow/src/datatypes.rs | 146 +++-
 1 file changed, 145 insertions(+), 1 deletion(-)

diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 05db6ce..5008a97 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -56,11 +56,36 @@ pub enum DataType {
 Float16,
 Float32,
 Float64,
+Timestamp(TimeUnit),
+Date(DateUnit),
+Time32(TimeUnit),
+Time64(TimeUnit),
+Interval(IntervalUnit),
 Utf8,
 List(Box),
 Struct(Vec),
 }
 
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
+pub enum DateUnit {
+Day,
+Millisecond,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
+pub enum TimeUnit {
+Second,
+Millisecond,
+Microsecond,
+Nanosecond,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash)]
+pub enum IntervalUnit {
+YearMonth,
+DayTime,
+}
+
 /// Contains the meta-data for a single relative type.
 ///
 /// The `Schema` object is an ordered collection of `Field` objects.
@@ -175,6 +200,47 @@ impl DataType {
 "floatingpoint precision missing or 
invalid".to_string(),
 )),
 },
+Some(s) if s == "timestamp" => match map.get("unit") {
+Some(p) if p == "SECOND" => 
Ok(DataType::Timestamp(TimeUnit::Second)),
+Some(p) if p == "MILLISECOND" => 
Ok(DataType::Timestamp(TimeUnit::Millisecond)),
+Some(p) if p == "MICROSECOND" => 
Ok(DataType::Timestamp(TimeUnit::Microsecond)),
+Some(p) if p == "NANOSECOND" => 
Ok(DataType::Timestamp(TimeUnit::Nanosecond)),
+_ => Err(ArrowError::ParseError(
+"timestamp unit missing or invalid".to_string(),
+)),
+},
+Some(s) if s == "date" => match map.get("unit") {
+Some(p) if p == "DAY" => Ok(DataType::Date(DateUnit::Day)),
+Some(p) if p == "MILLISECOND" => 
Ok(DataType::Date(DateUnit::Millisecond)),
+_ => Err(ArrowError::ParseError(
+"date unit missing or invalid".to_string(),
+)),
+},
+Some(s) if s == "time" => {
+let unit = match map.get("unit") {
+Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
+Some(p) if p == "MILLISECOND" => 
Ok(TimeUnit::Millisecond),
+Some(p) if p == "MICROSECOND" => 
Ok(TimeUnit::Microsecond),
+Some(p) if p == "NANOSECOND" => 
Ok(TimeUnit::Nanosecond),
+_ => Err(ArrowError::ParseError(
+"time unit missing or invalid".to_string(),
+)),
+};
+match map.get("bitWidth") {
+Some(p) if p == "32" => Ok(DataType::Time32(unit?)),
+Some(p) if p == "64" => Ok(DataType::Time32(unit?)),
+_ => Err(ArrowError::ParseError(
+"time bitWidth missing or invalid".to_string(),
+)),
+}
+}
+Some(s) if s == "interval" => match map.get("unit") {
+Some(p) if p == "DAY_TIME" => 
Ok(DataType::Interval(IntervalUnit::DayTime)),
+Some(p) if p == "YEAR_MONTH" => 
Ok(DataType::Interval(IntervalUnit::YearMonth)),
+_ => Err(ArrowError::ParseError(
+"interval unit missing or invalid".to_string(),
+)),
+},
 Some(s) if s == "int" => match map.get("isSigned") {
  

[arrow] branch master updated (b8aeb79 -> 84b221d)

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from b8aeb79  ARROW-854: [Format] Add tentative SparseTensor format
 add 84b221d  ARROW-4138: [Python] Fix setuptools_scm version customization 
on Windows

No new revisions were added by this update.

Summary of changes:
 python/setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)



[arrow] branch master updated: ARROW-854: [Format] Add tentative SparseTensor format

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new b8aeb79  ARROW-854: [Format] Add tentative SparseTensor format
b8aeb79 is described below

commit b8aeb79e94a5a507aeec55d0b6c6bf5d7f0100b2
Author: Kenta Murata 
AuthorDate: Wed Jan 9 16:18:19 2019 -0600

ARROW-854: [Format] Add tentative SparseTensor format

I'm interested in making a language-agnostic sparse tensor format. I believe one of the suitable places to do this is Apache Arrow, so let me propose my idea here.

First of all, in my investigation I found that there is no common memory layout for sparse tensor representations. This means we need some kind of conversion to share sparse tensors among different systems, even if the data format is logically the same. It is the same situation as with dataframes, which is why I believe Apache Arrow is a suitable place.

There are many formats for representing a sparse tensor. Most of them are specialized for matrices, which have two dimensions; there are few formats for general sparse tensors with more than two dimensions.

I think the COO format is a suitable starting point because COO can handle any number of dimensions, and many systems support it. In my investigation, the systems that support COO are SciPy, dask, pydata/sparse, TensorFlow, and PyTorch.

Additionally, the CSR format for matrices may also be worth supporting from the start. The reason is that CSR is efficient for extracting row slices, which may be important for extracting samples from tidy data, and it is supported by SciPy, MXNet, and R's Matrix library.
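
As a concrete illustration of the two layouts mentioned above, here is how SciPy represents the same small matrix in COO and CSR form (a sketch with made-up values, independent of the Arrow format proposed here):

```python
# Sketch (made-up matrix): the same data in COO (coordinates + values) and
# CSR (row pointers + column indices + values) form, using SciPy.
import numpy as np
from scipy import sparse

dense = np.array([[1, 0, 2],
                  [0, 0, 3],
                  [4, 5, 0]])

coo = sparse.coo_matrix(dense)
print(coo.row, coo.col, coo.data)         # [0 0 1 2 2] [0 2 2 0 1] [1 2 3 4 5]

csr = sparse.csr_matrix(dense)
print(csr.indptr, csr.indices, csr.data)  # [0 2 3 5] [0 2 2 0 1] [1 2 3 4 5]
```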

I added my prototype definition of the SparseTensor format in this pull request. I designed the prototype format to be extensible so that we can support additional sparse formats. At a minimum, I think we will need to support an additional sparse tensor format for more than two dimensions besides COO, so we will need this extensibility.

Author: Kenta Murata 

Closes #2546 from mrkn/sparse_tensor_proposal and squashes the following 
commits:

148bff822  make format
d57e56fc6  Merge sparse_tensor_format.h into sparse_tensor.h
880bbc4eb  Rename too-verbose function name
c83ea6aaf  Add type aliases of sparse tensor types
90e8b3166  Rename sparse tensor classes
07a651863  Use substitution instead of constructor call
37a0a14c6  Remove needless function declaration
97e85bd35  Use std::make_shared
3dd434c83  Capitalize member function name
6ef6ad065  Apply code formatter
6f291581e  Mark APIs for sparse tensor as EXPERIMENTAL
ff3ea71c5  Rename length to non_zero_length in SparseTensor
f78230344  Return Status::IOError instead of DCHECK if 
message header type is not matched
7e814de36  Put EXPERIMENTAL mark in comments
357860d8c  Fix typo in comments
43d8eea44  Fix coding style
99b1d1d4d  Add missing ARROW_EXPORT specifiers
401ae8023  Fix SparseCSRIndex::ToString and add tests
9e457acd3  Remove needless virtual specifiers
3b1db7d32  Add SparseTensorBase::Equals
d6a8c3805  Unify Tensor.fbs and SparseTensor.fbs
b3a62ebfa  Fix format
6bc9e296f  Support IPC read and write of SparseTensor
1d9042709  Fix format
51a83bfee  Add SparseTensorFormat
93c03adad  Add SparseIndex::ToString()
021b46be0  Add SparseTensorBase
ed3984dd4  Add SparseIndex::format_type
4251b4d08  Add SparseCSRIndex
433c9b441  Change COO index matrix to column-major in a 
format description
392a25b7c  Implement SparseTensor and SparseCOOIndex
b24f3c342  Insert additional padding in sparse tensor format
c508db086  Write sparse tensor format in IPC.md
2b50040f5  Add an example of the CSR format in comment
76c56dd35  Make indptr of CSR a buffer
d7e653f17  Add an example of COO format in comment
866b2c13a  Add header comments in SparseTensor.fbs
aa9b8a4d0  Add SparseTensor.fbs in FBS_SRC
1f16ffed8  Fix syntax error in SparseTensor.fbs
c3bc6edfa  Add tentative SparseTensor format
---
 cpp/src/arrow/CMakeLists.txt   |   2 +
 cpp/src/arrow/compare.cc   |  93 +++
 cpp/src/arrow/compare.h|   4 +
 cpp/src/arrow/ipc/message.cc   |   2 +
 cpp/src/arrow/ipc/message.h|   2 +-
 cpp/src/arrow/ipc/metadata-internal.cc | 148 +++
 cpp/src/arrow/ipc/metadata-internal.h  |  12 +
 cpp/src/arrow/ipc/read-write-test.cc   | 112 
 cpp/src/arrow/ipc/reader.cc| 119 +
 cpp/src/arrow/ipc/reader.h |  17 ++
 cpp/src/arrow/ipc/writer.cc| 101 
 cpp/src/arrow/ipc/writer.h |  15 ++
 cpp/src/arrow/sparse_tensor-test.cc| 244 ++
 cpp/src/arrow/sparse_tensor.cc | 452 

[arrow] branch master updated: ARROW-3997: [Documentation] Clarify dictionary index type

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 6b496f7  ARROW-3997: [Documentation] Clarify dictionary index type
6b496f7 is described below

commit 6b496f7c1929a0a371fe708ae653228a9e722150
Author: Antoine Pitrou 
AuthorDate: Wed Jan 9 16:16:40 2019 -0600

ARROW-3997: [Documentation] Clarify dictionary index type

Mandate signed integers for dictionary index types, without constraining 
integer width.
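
For reference, a minimal pyarrow sketch (made-up values) of a dictionary-encoded array whose indices use a signed integer type; the index width itself is not constrained:

```python
import pyarrow as pa

indices = pa.array([0, 0, 0, 1, 1, 1, 0], type=pa.int8())   # signed indices
dictionary = pa.array(['a', 'b'])
arr = pa.DictionaryArray.from_arrays(indices, dictionary)
print(arr.type)  # e.g. dictionary<values=string, indices=int8, ordered=0>
```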

Author: Antoine Pitrou 

Closes #3355 from pitrou/ARROW-3997-dictionary-encoding-doc and squashes 
the following commits:

4e05e2642  ARROW-3997:  Clarify dictionary index type
---
 docs/source/format/Layout.rst | 31 ---
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/docs/source/format/Layout.rst b/docs/source/format/Layout.rst
index 69cbf06..f3e5290 100644
--- a/docs/source/format/Layout.rst
+++ b/docs/source/format/Layout.rst
@@ -614,13 +614,13 @@ Dictionary encoding
 ---
 
 When a field is dictionary encoded, the values are represented by an array of
-Int32 representing the index of the value in the dictionary.  The Dictionary is
-received as one or more DictionaryBatches with the id referenced by a
-dictionary attribute defined in the metadata (Message.fbs) in the Field
-table.  The dictionary has the same layout as the type of the field would
-dictate. Each entry in the dictionary can be accessed by its index in the
-DictionaryBatches.  When a Schema references a Dictionary id, it must send at
-least one DictionaryBatch for this id.
+signed integers representing the index of the value in the dictionary.
+The Dictionary is received as one or more DictionaryBatches with the id
+referenced by a dictionary attribute defined in the metadata (Message.fbs)
+in the Field table.  The dictionary has the same layout as the type of the
+field would dictate. Each entry in the dictionary can be accessed by its
+index in the DictionaryBatches.  When a Schema references a Dictionary id,
+it must send at least one DictionaryBatch for this id.
 
 As an example, you could have the following data: ::
 
@@ -640,16 +640,17 @@ As an example, you could have the following data: ::
 In dictionary-encoded form, this could appear as: ::
 
 data List (dictionary-encoded, dictionary id i)
-indices: [0, 0, 0, 1, 1, 1, 0]
+   type: Int32
+   values:
+   [0, 0, 0, 1, 1, 1, 0]
 
 dictionary i
-
-type: List
-
-[
- ['a', 'b'],
- ['c', 'd', 'e'],
-]
+   type: List
+   values:
+   [
+['a', 'b'],
+['c', 'd', 'e'],
+   ]
 
 References
 --



[arrow] branch master updated: ARROW-4118: [Python] Fix benchmark setup for "asv run"

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 3330d66  ARROW-4118: [Python] Fix benchmark setup for "asv run"
3330d66 is described below

commit 3330d660643a034168b472b52aebfe0fea84b8cf
Author: Antoine Pitrou 
AuthorDate: Wed Jan 9 16:14:25 2019 -0600

ARROW-4118: [Python] Fix benchmark setup for "asv run"

"conda activate" unfortunately isn't available from a non-interactive 
shell, and running bash as interactive doesn't look like a workable solution.

Also fix a setup slowness issue in the Parquet benchmarks, and fix a C++ 
ABI issue by downloading packages from Anaconda rather than conda-forge.

Author: Antoine Pitrou 

Closes #3357 from pitrou/ARROW-4118-fix-asv-run and squashes the following 
commits:

b07b68e61  ARROW-4118:  Fix benchmark setup for "asv run"
---
 docs/source/python/benchmarks.rst | 24 +---
 python/asv-build.sh   | 17 -
 python/asv.conf.json  |  4 +++-
 python/benchmarks/parquet.py  | 16 +---
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/docs/source/python/benchmarks.rst 
b/docs/source/python/benchmarks.rst
index 7672294..12205c5 100644
--- a/docs/source/python/benchmarks.rst
+++ b/docs/source/python/benchmarks.rst
@@ -19,35 +19,37 @@ Benchmarks
 ==
 
 The ``pyarrow`` package comes with a suite of benchmarks meant to
-run with `asv`_.  You'll need to install the ``asv`` package first
+run with `ASV`_.  You'll need to install the ``asv`` package first
 (``pip install asv`` or ``conda install -c conda-forge asv``).
 
-The benchmarks are run using `asv`_ which is also their only requirement.
-
 Running the benchmarks
 --
 
-To run the benchmarks, call ``asv run --python=same``. You cannot use the
-plain ``asv run`` command at the moment as asv cannot handle python packages
-in subdirectories of a repository.
+To run the benchmarks for a locally-built Arrow, run ``asv dev`` or
+``asv run --python=same``.
 
-Running with arbitrary revisions
-
+Running for arbitrary Git revisions
+---
 
 ASV allows to store results and generate graphs of the benchmarks over
-the project's evolution.  For this you have the latest development version of 
ASV:
+the project's evolution.  You need to have the latest development version of 
ASV:
 
 .. code::
 
 pip install git+https://github.com/airspeed-velocity/asv
 
+The build scripts assume that Conda's ``activate`` script is on the PATH
+(the ``conda activate`` command unfortunately isn't available from
+non-interactive scripts).
+
 Now you should be ready to run ``asv run`` or whatever other command
-suits your needs.
+suits your needs.  Note that this can be quite long, as each Arrow needs
+to be rebuilt for each Git revision you're running the benchmarks for.
 
 Compatibility
 -
 
 We only expect the benchmarking setup to work with Python 3.6 or later,
-on a Unix-like system.
+on a Unix-like system with bash.
 
 .. _asv: https://asv.readthedocs.org/
diff --git a/python/asv-build.sh b/python/asv-build.sh
index 7b55456..90c7872 100755
--- a/python/asv-build.sh
+++ b/python/asv-build.sh
@@ -21,7 +21,9 @@ set -e
 
 # ASV doesn't activate its conda environment for us
 if [ -z "$ASV_ENV_DIR" ]; then exit 1; fi
-conda activate $ASV_ENV_DIR
+# Avoid "conda activate" because it's only set up in interactive shells
+# (https://github.com/conda/conda/issues/8072)
+source activate $ASV_ENV_DIR
 echo "== Conda Prefix for benchmarks: " $CONDA_PREFIX " =="
 
 # Build Arrow C++ libraries
@@ -32,6 +34,8 @@ export ORC_HOME=$CONDA_PREFIX
 export PROTOBUF_HOME=$CONDA_PREFIX
 export BOOST_ROOT=$CONDA_PREFIX
 
+export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=1"
+
 pushd ../cpp
 mkdir -p build
 pushd build
@@ -40,9 +44,11 @@ cmake -GNinja \
   -DCMAKE_BUILD_TYPE=release \
   -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
   -DARROW_CXXFLAGS=$CXXFLAGS \
-  -DARROW_PYTHON=ON \
-  -DARROW_PLASMA=ON \
-  -DARROW_BUILD_TESTS=OFF \
+  -DARROW_USE_GLOG=off \
+  -DARROW_PARQUET=on \
+  -DARROW_PYTHON=on \
+  -DARROW_PLASMA=on \
+  -DARROW_BUILD_TESTS=off \
   ..
 cmake --build . --target install
 
@@ -52,7 +58,8 @@ popd
 # Build pyarrow wrappers
 export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1
 export PYARROW_BUILD_TYPE=release
-export PYARROW_PARALLEL=4
+export PYARROW_PARALLEL=8
+export PYARROW_WITH_PARQUET=1
 export PYARROW_WITH_PLASMA=1
 
 python setup.py clean
diff --git a/python/asv.conf.json b/python/asv.conf.json
index 40938ee..09031c8 100644
--- a/python/asv.conf.json
+++ b/python/asv.conf.json
@@ -35,6 +35,7 @@
 // of the repository.
 "repo_subdir": "python",
 
+// Custom build commands for 

[arrow] branch master updated: ARROW-3233: [Python] Add prose documentation for CUDA support

2019-01-09 Thread apitrou
This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new bcfacaa  ARROW-3233: [Python] Add prose documentation for CUDA support
bcfacaa is described below

commit bcfacaafcb181a39d43dbb3d0540c018a5afe157
Author: Antoine Pitrou 
AuthorDate: Wed Jan 9 23:12:31 2019 +0100

ARROW-3233: [Python] Add prose documentation for CUDA support

It will be harder to add generated API docs without requiring CUDA support 
on the machine building the docs.

Author: Antoine Pitrou 

Closes #3359 from pitrou/ARROW-3233-pyarrow-cuda-doc and squashes the 
following commits:

40b63f0f  ARROW-3233:  Add prose documentation for CUDA 
support
---
 docs/source/python/cuda.rst   | 159 ++
 docs/source/python/index.rst  |   1 +
 docs/source/python/memory.rst |   3 +
 3 files changed, 163 insertions(+)

diff --git a/docs/source/python/cuda.rst b/docs/source/python/cuda.rst
new file mode 100644
index 000..b0150c1
--- /dev/null
+++ b/docs/source/python/cuda.rst
@@ -0,0 +1,159 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.cuda
+
+CUDA Integration
+
+
+Arrow is not limited to CPU buffers (located in the computer's main memory,
+also named "host memory").  It also has provisions for accessing buffers
+located on a CUDA-capable GPU device (in "device memory").
+
+.. note::
+   This functionality is optional and must have been enabled at build time.
+   If this is not done by your package manager, you might have to build Arrow
+   yourself.
+
+CUDA Contexts
+-
+
+A CUDA context represents access to a particular CUDA-capable device.
+For example, this is creating a CUDA context accessing CUDA device number 0::
+
+   >>> from pyarrow import cuda
+   >>> ctx = cuda.Context(0)
+   >>>
+
+CUDA Buffers
+
+
+A CUDA buffer can be created by copying data from host memory to the memory
+of a CUDA device, using the :meth:`Context.buffer_from_data` method.
+The source data can be any Python buffer-like object, including Arrow buffers::
+
+   >>> import numpy as np
+   >>> arr = np.arange(4, dtype=np.int32)
+   >>> arr.nbytes
+   16
+   >>> cuda_buf = ctx.buffer_from_data(arr)
+   >>> type(cuda_buf)
+   pyarrow._cuda.CudaBuffer
+   >>> cuda_buf.size # The buffer's size in bytes
+   16
+   >>> cuda_buf.address  # The buffer's address in device memory
+   30088364544
+   >>> cuda_buf.context.device_number
+   0
+
+Conversely, you can copy a CUDA buffer back to host memory, getting a regular
+CPU buffer::
+
+   >>> buf = cuda_buf.copy_to_host()
+   >>> type(buf)
+   pyarrow.lib.Buffer
+   >>> np.frombuffer(buf, dtype=np.int32)
+   array([0, 1, 2, 3], dtype=int32)
+
+.. warning::
+   Many Arrow functions expect a CPU buffer but will not check the buffer's
+   actual type.  You will get a crash if you pass a CUDA buffer to such a
+   function::
+
+  >>> pa.py_buffer(b"x" * 16).equals(cuda_buf)
+  Segmentation fault
+
+Numba Integration
+-
+
+There is not much you can do directly with Arrow CUDA buffers from Python,
+but they support interoperation with `Numba `_,
+a JIT compiler which can turn Python code into optimized CUDA kernels.
+
+Arrow to Numba
+~~
+
+First let's define a Numba CUDA kernel operating on an ``int32`` array.  Here,
+we will simply increment each array element (assuming the array is writable)::
+
+   import numba.cuda
+
+   @numba.cuda.jit
+   def increment_by_one(an_array):
+   pos = numba.cuda.grid(1)
+   if pos < an_array.size:
+   an_array[pos] += 1
+
+Then we need to wrap our CUDA buffer into a Numba "device array" with the right
+array metadata (shape, strides and datatype).  This is necessary so that Numba
+can identify the array's characteristics and compile the kernel with the
+appropriate type declarations.
+
+In this case the metadata can simply be got from the original Numpy array.
+Note the GPU data isn't copied, just pointed to::
+
+   >>> from 

[arrow] branch master updated: ARROW-4209: [Gandiva] Avoid struct return param in IR

2019-01-09 Thread wesm
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 361285d  ARROW-4209: [Gandiva] Avoid struct return param in IR
361285d is described below

commit 361285d86c345b3943eee8e63d3f9a782e7bf6da
Author: Pindikura Ravindra 
AuthorDate: Wed Jan 9 10:09:48 2019 -0600

ARROW-4209: [Gandiva] Avoid struct return param in IR

Author: Pindikura Ravindra 

Closes #3356 from pravindra/struct and squashes the following commits:

f437acd0  ARROW-4209:  Avoid struct return param in IR
---
 cpp/src/gandiva/decimal_ir.cc  | 30 +++---
 cpp/src/gandiva/precompiled/decimal_wrapper.cc | 20 ++---
 2 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/cpp/src/gandiva/decimal_ir.cc b/cpp/src/gandiva/decimal_ir.cc
index 38b35a6..d10158a 100644
--- a/cpp/src/gandiva/decimal_ir.cc
+++ b/cpp/src/gandiva/decimal_ir.cc
@@ -218,27 +218,23 @@ DecimalIR::ValueWithOverflow 
DecimalIR::AddWithOverflowCheck(const ValueFull& x,
 // This is pretty complex, so use CPP fns.
 llvm::Value* DecimalIR::AddLarge(const ValueFull& x, const ValueFull& y,
  const ValueFull& out) {
-  std::vector args;
-
+  auto block = ir_builder()->GetInsertBlock();
+  auto out_high_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_hi", 
block);
+  auto out_low_ptr = new llvm::AllocaInst(types()->i64_type(), 0, "out_low", 
block);
   auto x_split = ValueSplit::MakeFromInt128(this, x.value());
-  args.push_back(x_split.high());
-  args.push_back(x_split.low());
-  args.push_back(x.precision());
-  args.push_back(x.scale());
-
   auto y_split = ValueSplit::MakeFromInt128(this, y.value());
-  args.push_back(y_split.high());
-  args.push_back(y_split.low());
-  args.push_back(y.precision());
-  args.push_back(y.scale());
 
-  args.push_back(out.precision());
-  args.push_back(out.scale());
-
-  auto split = ir_builder()->CreateCall(
-  module()->getFunction("add_large_decimal128_decimal128"), args);
+  std::vector args = {
+  x_split.high(),  x_split.low(), x.precision(), x.scale(),
+  y_split.high(),  y_split.low(), y.precision(), y.scale(),
+  out.precision(), out.scale(),   out_high_ptr,  out_low_ptr,
+  };
+  
ir_builder()->CreateCall(module()->getFunction("add_large_decimal128_decimal128"),
+   args);
 
-  auto sum = ValueSplit::MakeFromStruct(this, split).AsInt128(this);
+  auto out_high = ir_builder()->CreateLoad(out_high_ptr);
+  auto out_low = ir_builder()->CreateLoad(out_low_ptr);
+  auto sum = ValueSplit(out_high, out_low).AsInt128(this);
   ADD_TRACE_128("AddLarge : sum", sum);
   return sum;
 }
diff --git a/cpp/src/gandiva/precompiled/decimal_wrapper.cc 
b/cpp/src/gandiva/precompiled/decimal_wrapper.cc
index fdc751f..0118100 100644
--- a/cpp/src/gandiva/precompiled/decimal_wrapper.cc
+++ b/cpp/src/gandiva/precompiled/decimal_wrapper.cc
@@ -20,24 +20,18 @@
 
 extern "C" {
 
-/// TODO : Passing around structs in IR can be fragile due to c-abi 
compatibility issues.
-/// This seems to work for now, but will need to revisit if we hit issues.
-struct DecimalSplit {
-  int64_t high_bits;
-  uint64_t low_bits;
-};
-
 FORCE_INLINE
-DecimalSplit add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low,
- int32_t x_precision, int32_t 
x_scale,
- int64_t y_high, uint64_t y_low,
- int32_t y_precision, int32_t 
y_scale,
- int32_t out_precision, int32_t 
out_scale) {
+void add_large_decimal128_decimal128(int64_t x_high, uint64_t x_low, int32_t 
x_precision,
+ int32_t x_scale, int64_t y_high, uint64_t 
y_low,
+ int32_t y_precision, int32_t y_scale,
+ int32_t out_precision, int32_t out_scale,
+ int64_t* out_high, uint64_t* out_low) {
   gandiva::Decimal128Full x(x_high, x_low, x_precision, x_scale);
   gandiva::Decimal128Full y(y_high, y_low, y_precision, y_scale);
 
   arrow::Decimal128 out = gandiva::decimalops::Add(x, y, out_precision, 
out_scale);
-  return DecimalSplit{out.high_bits(), out.low_bits()};
+  *out_high = out.high_bits();
+  *out_low = out.low_bits();
 }
 
 }  // extern "C"



[arrow] branch master updated: ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper isdir() and isfile()

2019-01-09 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new af925d9  ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper 
isdir() and isfile()
af925d9 is described below

commit af925d9395bd8f5cf435f379e389633bd3acfdfd
Author: Dmitry Vukolov 
AuthorDate: Wed Jan 9 13:58:48 2019 +0100

ARROW-2038: [Python] Strip s3:// scheme in S3FSWrapper isdir() and isfile()

This fixes an exception from ParquetDataset arising when the supplied path contains the `s3://` scheme specifier. The issue stemmed from the fact that while the underlying S3FileSystem supports both types of paths, with and without an explicit `s3://`, its function calls always return paths stripped of the scheme. This broke the logic in isdir() and isfile().

An alternative solution would be to strip the scheme in parquet.py (by 
adding it to _URI_STRIP_SCHEMES). This however would require additional code 
changes along the lines of:

```python
_URI_STRIP_SCHEMES = ('hdfs', 's3')

def _parse_uri(path):
path = _stringify_path(path)
parsed_uri = urlparse(path)
if parsed_uri.scheme in _URI_STRIP_SCHEMES:
scheme = '{0}://'.format(parsed_uri.scheme)
path = parsed_uri.geturl().replace(scheme, '', 1)
return path
else:
# ARROW-4073: On Windows returning the path with the scheme
# stripped removes the drive letter, if any
return path
```

Not sure if that would have any impact on handling HDFS. Therefore this 
patch proposes a safer, more localised approach, already used in other parts of 
S3FSWrapper.
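
For context, a minimal sketch (hypothetical bucket and path) of the call that previously raised when the path carried an explicit `s3://` scheme:

```python
import s3fs
import pyarrow.parquet as pq

fs = s3fs.S3FileSystem()
# With this fix, the scheme-prefixed and bare forms behave the same.
dataset = pq.ParquetDataset('s3://my-bucket/my-dataset/', filesystem=fs)
table = dataset.read()
```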

Author: Dmitry Vukolov 

Closes #3286 from dvukolov/master and squashes the following commits:

8de916c5  Strip s3:// scheme in S3FSWrapper isdir() and 
isfile()
---
 python/pyarrow/filesystem.py | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index 98efb1e..92a65ce 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -319,7 +319,7 @@ class S3FSWrapper(DaskFileSystem):
 
 @implements(FileSystem.isdir)
 def isdir(self, path):
-path = _stringify_path(path)
+path = _sanitize_s3(_stringify_path(path))
 try:
 contents = self.fs.ls(path)
 if len(contents) == 1 and contents[0] == path:
@@ -331,7 +331,7 @@ class S3FSWrapper(DaskFileSystem):
 
 @implements(FileSystem.isfile)
 def isfile(self, path):
-path = _stringify_path(path)
+path = _sanitize_s3(_stringify_path(path))
 try:
 contents = self.fs.ls(path)
 return len(contents) == 1 and contents[0] == path
@@ -345,7 +345,7 @@ class S3FSWrapper(DaskFileSystem):
 Generator version of what is in s3fs, which yields a flattened list of
 files
 """
-path = _stringify_path(path).replace('s3://', '')
+path = _sanitize_s3(_stringify_path(path))
 directories = set()
 files = set()
 
@@ -371,6 +371,13 @@ class S3FSWrapper(DaskFileSystem):
 yield tup
 
 
+def _sanitize_s3(path):
+if path.startswith('s3://'):
+return path.replace('s3://', '')
+else:
+return path
+
+
 def _ensure_filesystem(fs):
 fs_type = type(fs)