[arrow] branch master updated: PARQUET-1467: [C++] Remove defunct ChunkedAllocator code
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new 98bdde8  PARQUET-1467: [C++] Remove defunct ChunkedAllocator code
98bdde8 is described below

commit 98bdde873a9f7ff3854a52f096dabe3c52be6e3a
Author:     Wes McKinney
AuthorDate: Sun Dec 2 21:44:36 2018 -0600

    PARQUET-1467: [C++] Remove defunct ChunkedAllocator code

    It does not seem that memory allocation on the dictionary encoding path
    requires something so elaborate right now.

    Author: Wes McKinney

    Closes #3069 from wesm/PARQUET-1467 and squashes the following commits:

    f37ed0756  Remove defunct memory allocator code
---
 cpp/src/parquet/column_writer.cc      |   6 +-
 cpp/src/parquet/column_writer.h       |   1 -
 cpp/src/parquet/encoding-benchmark.cc |   3 +-
 cpp/src/parquet/encoding-internal.h   |   9 +-
 cpp/src/parquet/encoding-test.cc      |   8 +-
 cpp/src/parquet/test-util.h           |   5 +-
 cpp/src/parquet/util/memory-test.cc   | 216 ---
 cpp/src/parquet/util/memory.cc        | 232 --
 cpp/src/parquet/util/memory.h         | 143 -
 9 files changed, 8 insertions(+), 615 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index a45613f..857673d 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -353,7 +353,6 @@ ColumnWriter::ColumnWriter(ColumnChunkMetaDataBuilder* metadata,
       encoding_(encoding),
       properties_(properties),
       allocator_(properties->memory_pool()),
-      pool_(properties->memory_pool()),
       num_buffered_values_(0),
       num_buffered_encoded_values_(0),
       rows_written_(0),
@@ -546,8 +545,7 @@ TypedColumnWriter<Type>::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata,
       break;
     case Encoding::PLAIN_DICTIONARY:
     case Encoding::RLE_DICTIONARY:
-      current_encoder_.reset(
-          new DictEncoder<Type>(descr_, &pool_, properties->memory_pool()));
+      current_encoder_.reset(new DictEncoder<Type>(descr_, properties->memory_pool()));
       break;
     default:
       ParquetException::NYI("Selected encoding is not supported");
@@ -582,8 +580,6 @@ void TypedColumnWriter<Type>::WriteDictionaryPage() {
   std::shared_ptr<Buffer> buffer =
       AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
   dict_encoder->WriteDict(buffer->mutable_data());
-  // TODO Get rid of this deep call
-  dict_encoder->mem_pool()->FreeAll();

   DictionaryPage page(buffer, dict_encoder->num_entries(),
                       properties_->dictionary_index_encoding());
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index 41bc7bd..3c69dd3 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -186,7 +186,6 @@ class PARQUET_EXPORT ColumnWriter {
   LevelEncoder level_encoder_;

   ::arrow::MemoryPool* allocator_;
-  ChunkedAllocator pool_;

   // The total number of values stored in the data page. This is the maximum of
   // the number of encoded definition levels or encoded values. For
diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc
index 364cdba..f8d2839 100644
--- a/cpp/src/parquet/encoding-benchmark.cc
+++ b/cpp/src/parquet/encoding-benchmark.cc
@@ -104,11 +104,10 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
   typedef typename Type::c_type T;
   int num_values = static_cast<int>(values.size());

-  ChunkedAllocator pool;
   MemoryPool* allocator = default_memory_pool();
   std::shared_ptr<ColumnDescriptor> descr = Int64Schema(Repetition::REQUIRED);

-  DictEncoder<Type> encoder(descr.get(), &pool, allocator);
+  DictEncoder<Type> encoder(descr.get(), allocator);
   for (int i = 0; i < num_values; ++i) {
     encoder.Put(values[i]);
   }
diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h
index b06ad41..e2dfc23 100644
--- a/cpp/src/parquet/encoding-internal.h
+++ b/cpp/src/parquet/encoding-internal.h
@@ -465,12 +465,10 @@ class DictEncoder : public Encoder<DType> {
  public:
   typedef typename DType::c_type T;

-  // XXX pool is unused
-  explicit DictEncoder(const ColumnDescriptor* desc, ChunkedAllocator* pool = nullptr,
+  explicit DictEncoder(const ColumnDescriptor* desc,
                        ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool())
       : Encoder<DType>(desc, Encoding::PLAIN_DICTIONARY, allocator),
         allocator_(allocator),
-        pool_(pool),
         dict_encoded_size_(0),
         type_length_(desc->type_length()),
         memo_table_(INITIAL_HASH_TABLE_SIZE) {}
@@ -538,8 +536,6 @@ class DictEncoder : public Encoder<DType> {
   /// dict_encoded_size() bytes.
   void WriteDict(uint8_t* buffer);

-  ChunkedAllocator* mem_pool() { return pool_; }
-
   /// The number of entries in the dictionary.
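To make the simplified interface concrete, here is a minimal sketch against the parquet-cpp internal headers as of this commit. `WriteInt64Dictionary` is an illustrative helper, not part of the patch:

```cpp
// Sketch: after PARQUET-1467, DictEncoder takes only a ColumnDescriptor and an
// arrow::MemoryPool -- no ChunkedAllocator, and no mem_pool()->FreeAll() cleanup.
#include "parquet/encoding-internal.h"  // internal header, as in this commit

// Illustrative helper (not from the patch): dictionary-encode three values and
// serialize the dictionary into a caller-provided buffer.
void WriteInt64Dictionary(const parquet::ColumnDescriptor* descr, uint8_t* out) {
  parquet::DictEncoder<parquet::Int64Type> encoder(descr,
                                                   ::arrow::default_memory_pool());
  encoder.Put(42);  // repeated values share a single dictionary entry
  encoder.Put(42);
  encoder.Put(7);
  // The caller must supply at least encoder.dict_encoded_size() bytes in `out`.
  encoder.WriteDict(out);
}
```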
[arrow] branch master updated: ARROW-3922: [C++] Micro-optimizations to BitUtil::GetBit
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new 1621868  ARROW-3922: [C++] Micro-optimizations to BitUtil::GetBit
1621868 is described below

commit 16218682336681cad79f143bd01484cf96332eb7
Author:     Wes McKinney
AuthorDate: Sun Dec 2 17:05:30 2018 -0600

    ARROW-3922: [C++] Micro-optimizations to BitUtil::GetBit

    The results are fairly noisy so I couldn't really measure any improvement,
    but it seems to be at most 0.5-1%. I changed the NaiveBitmapReader to use
    `BitUtil::GetBit` so it is an apples-to-apples comparison. On my laptop
    (with CPU throttling disabled) the difference is within 1 standard
    deviation of the mean, so not statistically significant. Since the
    generated assembly is smaller, I would say it's a reasonable change.

    after

    ```
    --------------------------------------------------------------------------------------------------------
    Benchmark                                                    Time      CPU  Iterations
    --------------------------------------------------------------------------------------------------------
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12255 us  12255 us  111  155.636MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12367 us  12367 us  111  154.228MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12243 us  12243 us  111  155.786MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12235 us  12236 us  111  155.885MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12426 us  12426 us  111  153.491MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12372 us  12372 us  111  154.164MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12283 us  12283 us  111  155.285MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12340 us  12340 us  111  154.567MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12389 us  12390 us  111  153.946MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12489 us  12489 us  111  152.722MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10_mean   12340 us  12340 us  111  154.571MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10_median 12353 us  12354 us  111  154.397MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10_stddev    85 us     85 us  111  1085.01kB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10717 us  10717 us  132  177.969MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10656 us  10657 us  132  178.982MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10835 us  10836 us  132  176.028MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10735 us  10735 us  132  177.672MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10700 us  10700 us  132  178.253MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             11481 us  11481 us  132  166.131MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10850 us  10850 us  132  175.797MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10591 us  10591 us  132  180.095MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10684 us  10684 us  132  178.523MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10             10705 us  10705 us  132  178.167MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10_mean        10795 us  10796 us  132  176.762MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10_median      10711 us  10711 us  132  178.068MB/s
    BM_BitmapReader/100/min_time:1.000/repeats:10_stddev        253 us    253 us  132  3.94562MB/s
    ```

    previous

    ```
    --------------------------------------------------------------------------------------------------------
    Benchmark                                                    Time      CPU  Iterations
    --------------------------------------------------------------------------------------------------------
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12349 us  12348 us  107  154.464MB/s
    BM_NaiveBitmapReader/100/min_time:1.000/repeats:10        12289 us  12288 us
    ```
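The new GetBit body itself is not shown in this excerpt, so the following is a sketch of the general shape of such a micro-optimization — replacing a mask-table lookup with a shift-and-mask — and not the verbatim Arrow source:

```cpp
#include <cstdint>

// Table-lookup form: needs a load from the kBitmask array plus a comparison.
static const uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
inline bool GetBitLookup(const uint8_t* bits, int64_t i) {
  return (bits[i / 8] & kBitmask[i % 8]) != 0;
}

// Shift-and-mask form: the target bit is shifted down to position 0 and
// masked, so there is no table load and the generated code is slightly smaller.
inline bool GetBitShift(const uint8_t* bits, int64_t i) {
  return (bits[i >> 3] >> (i & 0x07)) & 1;
}
```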
[arrow] branch master updated: ARROW-3684: [Go] Add chunking ability to CSV reader
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new fd2f798  ARROW-3684: [Go] Add chunking ability to CSV reader
fd2f798 is described below

commit fd2f798adccaf1644aa32a9b9cf2873a8f540142
Author:     Sebastien Binet
AuthorDate: Sun Dec 2 11:35:53 2018 -0600

    ARROW-3684: [Go] Add chunking ability to CSV reader

    Author: Sebastien Binet

    Closes #3019 from sbinet/issue-3684 and squashes the following commits:

    64e88515e  ARROW-3684: Add chunking ability to CSV reader
---
 go/arrow/csv/csv.go      |  98 +++-
 go/arrow/csv/csv_test.go | 295 +++
 2 files changed, 387 insertions(+), 6 deletions(-)

diff --git a/go/arrow/csv/csv.go b/go/arrow/csv/csv.go
index 36f3abd..79f2280 100644
--- a/go/arrow/csv/csv.go
+++ b/go/arrow/csv/csv.go
@@ -17,8 +17,6 @@
 // Package csv reads CSV files and presents the extracted data as records.
 package csv

-// TODO: implement a row chunker to accumulate N rows into the current record.
-
 import (
 	"encoding/csv"
 	"errors"
@@ -61,6 +59,19 @@ func WithAllocator(mem memory.Allocator) Option {
 	}
 }

+// WithChunk specifies the chunk size used while parsing CSV files.
+//
+// If n is zero or 1, no chunking will take place and the reader will create
+// one record per row.
+// If n is greater than 1, chunks of n rows will be read.
+// If n is negative, the reader will load the whole CSV file into memory and
+// create one big record with all the rows.
+func WithChunk(n int) Option {
+	return func(r *Reader) {
+		r.chunk = n
+	}
+}
+
 // Reader wraps encoding/csv.Reader and creates array.Records from a schema.
 type Reader struct {
 	r      *csv.Reader
@@ -71,6 +82,10 @@ type Reader struct {
 	cur    array.Record
 	err    error

+	chunk int
+	done  bool
+	next  func() bool
+
 	mem memory.Allocator
 }

@@ -82,7 +97,7 @@ type Reader struct {
 func NewReader(r io.Reader, schema *arrow.Schema, opts ...Option) *Reader {
 	validate(schema)

-	rr := &Reader{r: csv.NewReader(r), schema: schema, refs: 1}
+	rr := &Reader{r: csv.NewReader(r), schema: schema, refs: 1, chunk: 1}
 	for _, opt := range opts {
 		opt(rr)
 	}
@@ -93,6 +108,14 @@ func NewReader(r io.Reader, schema *arrow.Schema, opts ...Option) *Reader {

 	rr.bld = array.NewRecordBuilder(rr.mem, rr.schema)

+	switch {
+	case rr.chunk < 0:
+		rr.next = rr.nextall
+	case rr.chunk > 1:
+		rr.next = rr.nextn
+	default:
+		rr.next = rr.next1
+	}
 	return rr
 }

@@ -117,13 +140,20 @@ func (r *Reader) Next() bool {
 		r.cur = nil
 	}

-	if r.err != nil {
+	if r.err != nil || r.done {
 		return false
 	}

+	return r.next()
+}
+
+// next1 reads one row from the CSV file and creates a single Record
+// from that row.
+func (r *Reader) next1() bool {
 	var recs []string
 	recs, r.err = r.r.Read()
 	if r.err != nil {
+		r.done = true
 		if r.err == io.EOF {
 			r.err = nil
 		}
@@ -132,8 +162,65 @@ func (r *Reader) Next() bool {

 	r.validate(recs)
 	r.read(recs)
+	r.cur = r.bld.NewRecord()

-	return r.err == nil
+	return true
+}
+
+// nextall reads the whole CSV file into memory and creates one single
+// Record from all the CSV rows.
+func (r *Reader) nextall() bool {
+	defer func() {
+		r.done = true
+	}()
+
+	var (
+		recs [][]string
+	)
+
+	recs, r.err = r.r.ReadAll()
+	if r.err != nil {
+		return false
+	}
+
+	for _, rec := range recs {
+		r.validate(rec)
+		r.read(rec)
+	}
+	r.cur = r.bld.NewRecord()
+
+	return true
+}
+
+// nextn reads n rows from the CSV file, where n is the chunk size, and creates
+// a Record from these rows.
+func (r *Reader) nextn() bool {
+	var (
+		recs []string
+		n    = 0
+	)
+
+	for i := 0; i < r.chunk && !r.done; i++ {
+		recs, r.err = r.r.Read()
+		if r.err != nil {
+			r.done = true
+			break
+		}
+
+		r.validate(recs)
+		r.read(recs)
+		n++
+	}
+
+	if r.err != nil {
+		r.done = true
+		if r.err == io.EOF {
+			r.err = nil
+		}
+	}
+
+	r.cur = r.bld.NewRecord()
+	return n > 0
+}

 func (r *Reader) validate(recs []string) {
@@ -193,7 +280,6 @@ func (r *Reader) read(recs []string) {
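A usage sketch of the new option. The input file name and schema fields are illustrative, not from the commit:

```go
package main

import (
	"fmt"
	"os"

	"github.com/apache/arrow/go/arrow"
	"github.com/apache/arrow/go/arrow/csv"
)

func main() {
	f, err := os.Open("data.csv") // hypothetical input file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Illustrative schema matching the CSV layout.
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "i64", Type: arrow.PrimitiveTypes.Int64},
		{Name: "str", Type: arrow.BinaryTypes.String},
	}, nil)

	// WithChunk(10): each call to Next yields one Record of up to 10 rows.
	// WithChunk(-1) would instead load the whole file into a single Record.
	r := csv.NewReader(f, schema, csv.WithChunk(10))
	defer r.Release()

	for r.Next() {
		rec := r.Record() // valid until the next call to Next
		fmt.Println(rec.NumRows())
	}
}
```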
[arrow] branch master updated: ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new 67b9215  ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
67b9215 is described below

commit 67b92151d3a39b1908acc949c7192aa4fe77229c
Author:     Wes McKinney
AuthorDate: Sun Dec 2 10:41:24 2018 -0600

    ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray

    I'm not sure if all compilers will be smart enough to do loop unswitching
    here. If it ends up being a bottleneck, I suggest rewriting in a follow-up
    patch. The BinaryArray overflow issue (a ChunkedArray is not being
    produced) is still present here; we will need to address that in
    ARROW-2970.

    This patch also includes symbol export macros particular to the
    arrow_python shared library. These are needed so that global data members
    in arrow.dll can be accessed in arrow_python.dll.

    Author: Wes McKinney

    Closes #3063 from wesm/ARROW-3890 and squashes the following commits:

    dac4995f9  Windows needs arrow.lib in addition to arrow_python.lib now because of the new export flags
    91dbea8bb  Add libarrow_python-specific visibility macros so that global data members from arrow.dll can be accessed correctly in arrow_python.dll
    062c3836f  Clarify comment
    cfbd30bbe  Handle case where user passes UTF-8 encoded numpy.str_ type array to pyarrow.array with type=pyarrow.string()
---
 cpp/src/arrow/python/CMakeLists.txt             |  3 +-
 cpp/src/arrow/python/arrow_to_pandas.h          | 12 ++---
 cpp/src/arrow/python/benchmark.h                |  4 +-
 cpp/src/arrow/python/common.h                   | 18 +++----
 cpp/src/arrow/python/config.h                   |  6 +--
 cpp/src/arrow/python/decimal.h                  | 16 +++---
 cpp/src/arrow/python/deserialize.h              | 12 ++---
 cpp/src/arrow/python/helpers.h                  | 30 ++--
 cpp/src/arrow/python/inference.cc               |  6 +--
 cpp/src/arrow/python/inference.h                | 12 ++---
 cpp/src/arrow/python/init.h                     |  4 +-
 cpp/src/arrow/python/io.h                       |  8 ++--
 cpp/src/arrow/python/numpy_convert.h            | 20 
 cpp/src/arrow/python/numpy_to_arrow.cc          | 28 +--
 cpp/src/arrow/python/numpy_to_arrow.h           |  6 +--
 cpp/src/arrow/python/pyarrow.h                  | 62 +
 cpp/src/arrow/python/python_to_arrow.h          |  6 +--
 cpp/src/arrow/python/serialize.h                | 10 ++--
 cpp/src/arrow/python/{config.h => visibility.h} | 39 +++-
 python/pyarrow/__init__.py                      |  2 +-
 python/pyarrow/tests/test_array.py              | 27 +++
 21 files changed, 189 insertions(+), 142 deletions(-)

diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index ff63eb0..7f4603a 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -76,7 +76,7 @@ ADD_ARROW_LIB(arrow_python

 foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES})
   target_compile_definitions(${LIB_TARGET}
-    PRIVATE ARROW_EXPORTING)
+    PRIVATE ARROW_PYTHON_EXPORTING)
 endforeach()

 if (ARROW_BUILD_STATIC AND MSVC)
@@ -112,6 +112,7 @@ install(FILES
   pyarrow.h
   serialize.h
   type_traits.h
+  visibility.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python")

 # pkg-config support
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 138b010..753bf48 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -27,7 +27,7 @@
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"

 namespace arrow {
@@ -57,16 +57,16 @@ struct PandasOptions {
         use_threads(false) {}
 };

-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr,
                             PyObject* py_ref, PyObject** out);

-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertChunkedArrayToPandas(PandasOptions options,
                                    const std::shared_ptr<ChunkedArray>& col,
                                    PyObject* py_ref, PyObject** out);

-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
                              PyObject* py_ref, PyObject** out);

@@ -76,7 +76,7 @@ Status ConvertColumnToPandas(PandasOptions options,
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
                             MemoryPool* pool, PyObject** out);

@@ -84,7 +84,7 @@ Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>&
 ///
 /// Explicitly name columns that should be a categorical
 /// This
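The contents of the new visibility.h are not shown in this excerpt. The sketch below illustrates the standard per-DLL export-macro pattern that the `PRIVATE ARROW_PYTHON_EXPORTING` compile definition above plugs into — an assumption inferred from that definition, not a copy of the file:

```cpp
// ARROW_PYTHON_EXPORTING is defined only while compiling libarrow_python
// itself, so its symbols are dllexport'ed there and dllimport'ed by consumers.
// This is what lets global data members cross the arrow.dll /
// arrow_python.dll boundary on Windows.
#if defined(_WIN32)
#  if defined(ARROW_PYTHON_EXPORTING)
#    define ARROW_PYTHON_EXPORT __declspec(dllexport)
#  else
#    define ARROW_PYTHON_EXPORT __declspec(dllimport)
#  endif
#else  // gcc/clang: keep symbols visible under -fvisibility=hidden builds
#  define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
#endif

// Usage, as in arrow_to_pandas.h above:
//   ARROW_PYTHON_EXPORT
//   Status ConvertArrayToPandas(...);
```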
[arrow] branch master updated: ARROW-3912: [Plasma][GLib] Add support for creating and referring objects
This is an automated email from the ASF dual-hosted git repository.

shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

The following commit(s) were added to refs/heads/master by this push:
     new 8f02a1b  ARROW-3912: [Plasma][GLib] Add support for creating and referring objects
8f02a1b is described below

commit 8f02a1b28ac10a2baa7250e00806d8663daaa3d6
Author:     Kouhei Sutou
AuthorDate: Sun Dec 2 19:39:18 2018 +0900

    ARROW-3912: [Plasma][GLib] Add support for creating and referring objects

    Author: Kouhei Sutou

    Closes #3056 from kou/glib-plasma-create-get and squashes the following commits:

    e6bc59ff  Add missing status check for Disconnect()
    623ce411  Add missing return on error
    d90d8960  Add missing status check for Release()
    1d5bf107  Add missing include
    7ac27db6  Support old GObject Introspection
    84df1849  Support old GObject Introspection
    26b2926e  Add support for creating and referring objects
---
 c_glib/arrow-gpu-glib/meson.build                  |  36 +-
 c_glib/configure.ac                                |   4 +
 c_glib/doc/plasma-glib/Makefile.am                 |   7 +
 c_glib/doc/plasma-glib/meson.build                 |   3 +
 c_glib/doc/plasma-glib/plasma-glib-docs.xml        |   8 +-
 c_glib/plasma-glib/Makefile.am                     |  59 ++-
 c_glib/plasma-glib/client.cpp                      | 330 -
 c_glib/plasma-glib/client.h                        |  36 +-
 c_glib/plasma-glib/client.hpp                      |   8 +-
 c_glib/plasma-glib/meson.build                     |  54 ++-
 c_glib/plasma-glib/object.cpp                      | 538 +
 c_glib/plasma-glib/object.h                        |  89
 c_glib/plasma-glib/{client.hpp => object.hpp}      |  22 +-
 c_glib/plasma-glib/plasma-glib.h                   |   1 +
 c_glib/plasma-glib/plasma-glib.hpp                 |   1 +
 c_glib/plasma-glib/plasma-glib.pc.in               |   2 +-
 c_glib/test/plasma/test-plasma-client.rb           |  58 ++-
 ...sma-client.rb => test-plasma-created-object.rb} |  29 +-
 ...ma-client.rb => test-plasma-referred-object.rb} |  24 +-
 19 files changed, 1230 insertions(+), 79 deletions(-)

diff --git a/c_glib/arrow-gpu-glib/meson.build b/c_glib/arrow-gpu-glib/meson.build
index e6b170e..680982e 100644
--- a/c_glib/arrow-gpu-glib/meson.build
+++ b/c_glib/arrow-gpu-glib/meson.build
@@ -57,19 +57,23 @@ pkgconfig.generate(filebase: 'arrow-gpu-glib',
                    requires: ['arrow-glib', 'arrow-gpu'],
                    libraries: [libarrow_gpu_glib])

-gnome.generate_gir(libarrow_gpu_glib,
-                   dependencies: declare_dependency(sources: arrow_glib_gir),
-                   sources: sources + c_headers,
-                   namespace: 'ArrowGPU',
-                   nsversion: api_version,
-                   identifier_prefix: 'GArrowGPU',
-                   symbol_prefix: 'garrow_gpu',
-                   export_packages: 'arrow-gpu-glib',
-                   includes: [
-                     'Arrow-1.0',
-                   ],
-                   install: true,
-                   extra_args: [
-                     '--warn-all',
-                     '--include-uninstalled=./arrow-glib/Arrow-1.0.gir',
-                   ])
+gir_dependencies = [
+  declare_dependency(sources: arrow_glib_gir),
+]
+gir_extra_args = [
+  '--warn-all',
+  '--include-uninstalled=./arrow-glib/Arrow-1.0.gir',
+]
+arrow_gpu_glib_gir = gnome.generate_gir(libarrow_gpu_glib,
+                                        dependencies: gir_dependencies,
+                                        sources: sources + c_headers,
+                                        namespace: 'ArrowGPU',
+                                        nsversion: api_version,
+                                        identifier_prefix: 'GArrowGPU',
+                                        symbol_prefix: 'garrow_gpu',
+                                        export_packages: 'arrow-gpu-glib',
+                                        includes: [
+                                          'Arrow-1.0',
+                                        ],
+                                        install: true,
+                                        extra_args: gir_extra_args)
diff --git a/c_glib/configure.ac b/c_glib/configure.ac
index badf9e9..b84e3d3 100644
--- a/c_glib/configure.ac
+++ b/c_glib/configure.ac
@@ -223,8 +223,12 @@ fi
 AM_CONDITIONAL([HAVE_ARROW_GPU], [test "$HAVE_ARROW_GPU" = "yes"])
 if test "$HAVE_ARROW_GPU" = "yes"; then
+  ARROW_GPU_GLIB_PACKAGE="arrow-gpu-glib"
   AC_DEFINE(HAVE_ARROW_GPU, [1], [Define to 1 if Apache Arrow supports GPU.])
+else
+  ARROW_GPU_GLIB_PACKAGE=""
 fi
+AC_SUBST(ARROW_GPU_GLIB_PACKAGE)

 AM_CONDITIONAL([HAVE_GANDIVA], [test "$HAVE_GANDIVA" = "yes"])
 if test "$HAVE_GANDIVA" = "yes"; then
diff --git a/c_glib/doc/plasma-glib/Makefile.am
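The client.cpp and object.cpp changes that actually add the create/refer API are not included in this excerpt. The sketch below shows how such an API might be driven from C; every function name and signature here is an assumption inferred from the file names in the diffstat (client.h, object.h, test-plasma-created-object.rb) and from later plasma-glib documentation, not taken from the commit:

```c
/* Hedged sketch -- signatures are assumptions, not from this commit.
 * Error handling follows the usual GLib GError convention. */
#include <plasma-glib/plasma-glib.h>

int
main(void)
{
  GError *error = NULL;

  /* Connect to a running plasma_store; the socket path is illustrative. */
  GPlasmaClient *client = gplasma_client_new("/tmp/plasma.sock", &error);
  if (!client) {
    g_print("connect failed: %s\n", error->message);
    g_error_free(error);
    return 1;
  }

  /* The two operations this commit adds: create an object in the store,
   * then refer to an existing object by its ID. */
  GPlasmaObjectID *id =
      gplasma_object_id_new((const guint8 *)"id-0", 4, &error);
  GPlasmaCreatedObject *object =
      gplasma_client_create(client, id, 64, NULL, NULL, &error);
  /* ... fill the created object's data buffer, then seal it so that
   * gplasma_client_refer_object() can see it from other clients ... */

  g_object_unref(object);
  g_object_unref(id);
  g_object_unref(client);
  return 0;
}
```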