[arrow] branch master updated: ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values
This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 25b4a468 ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values 25b4a468 is described below commit 25b4a46805a3be01c83e53a92524d4d7b021c74d Author: Wes McKinney AuthorDate: Wed Jun 12 17:14:40 2019 -0500 ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values In investigating the innocuous bug report from ARROW-4324 I stumbled on a pile of hacks and flawed design around type inference ``` test_list = [np.dtype('int32').type(10), np.dtype('float32').type(0.5)] test_array = pa.array(test_list) # Expected # test_array # # [ # 10, # 0.5 # ] # Got # test_array # # [ # 10, # 0 # ] ``` It turns out there are several issues: * There was a kludge around handling the `numpy.nan` value which is a PyFloat, not a NumPy float64 scalar * Type inference assumed "NaN is null", which should not be hard coded, so I added a flag to switch between pandas semantics and non-pandas * Mixing NumPy scalar values and non-NumPy scalars (like our evil friend numpy.nan) caused the output type to be simply incorrect. For example `[np.float16(1.5), 2.5]` would yield `pa.float16()` output type. Yuck. I inserted some hacks to force what I believe to be the correct behavior and fixed a couple unit tests that actually exhibited buggy behavior before (see within). I don't have time to do the "right thing" right now which is to more or less rewrite the hot path of `arrow/python/inference.cc`, so at least this gets the unit tests asserting what is correct so that refactoring will be more productive later. 
Author: Wes McKinney Closes #4527 from wesm/ARROW-4324 and squashes the following commits: e396958b0 Add unit test for passing pandas Series with from_pandas=False 754468a5d Set from_pandas to None by default in pyarrow.array so that user wishes can be respected e1b839339 Remove outdated unit test, add Python unit test that shows behavior from ARROW-2240 that's been changed 4bc8c8193 Triage type inference logic in presence of a mix of NumPy dtype-having objects and other typed values, pending more serious refactor in ARROW-5564 --- cpp/src/arrow/python/arrow_to_pandas.cc | 14 +-- cpp/src/arrow/python/inference.cc| 134 +++ cpp/src/arrow/python/inference.h | 6 +- cpp/src/arrow/python/numpy-internal.h| 9 ++ cpp/src/arrow/python/python-test.cc | 14 --- cpp/src/arrow/python/python_to_arrow.cc | 2 +- python/pyarrow/array.pxi | 41 +--- python/pyarrow/tests/test_convert_builtin.py | 42 +++-- python/pyarrow/tests/test_pandas.py | 7 ++ 9 files changed, 182 insertions(+), 87 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index d556664..fa35a6e 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -173,19 +173,11 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out) } } -static inline PyArray_Descr* GetSafeNumPyDtype(int type) { - if (type == NPY_DATETIME) { -// It is not safe to mutate the result of DescrFromType -return PyArray_DescrNewFromType(type); - } else { -return PyArray_DescrFromType(type); - } -} static inline PyObject* NewArray1DFromType(DataType* arrow_type, int type, int64_t length, void* data) { npy_intp dims[1] = {length}; - PyArray_Descr* descr = GetSafeNumPyDtype(type); + PyArray_Descr* descr = internal::GetSafeNumPyDtype(type); if (descr == nullptr) { // Error occurred, trust error state is set return nullptr; @@ -244,7 +236,7 @@ class PandasBlock { Status AllocateNDArray(int npy_type, int ndim = 2) { PyAcquireGIL lock; 
-PyArray_Descr* descr = GetSafeNumPyDtype(npy_type); +PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); PyObject* block_arr; if (ndim == 2) { @@ -1220,7 +1212,7 @@ class CategoricalBlock : public PandasBlock { PyAcquireGIL lock; -PyArray_Descr* descr = GetSafeNumPyDtype(npy_type); +PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); if (descr == nullptr) { // Error occurred, trust error state is set return Status::OK(); diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index 6cf8bed..4ec4d9d 100644 --- a/cpp/src/arrow/python/inference.cc +++
[arrow] branch master updated: ARROW-5339: [C++] Add jemalloc URL to thirdparty/versions.txt so download_dependencies.sh gets it
This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 4ea86ff ARROW-5339: [C++] Add jemalloc URL to thirdparty/versions.txt so download_dependencies.sh gets it 4ea86ff is described below commit 4ea86ffeb8fa7a371243ac65052536c409ac2c1f Author: Wes McKinney AuthorDate: Wed Jun 12 14:48:09 2019 -0500 ARROW-5339: [C++] Add jemalloc URL to thirdparty/versions.txt so download_dependencies.sh gets it I confirmed that I'm able to build the project with jemalloc with no networking disabled with this change Author: Wes McKinney Closes #4533 from wesm/ARROW-5339 and squashes the following commits: 6b627ea1d Add jemalloc URL to thirdparty/versions.txt --- cpp/thirdparty/versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index f064b50..23001d0 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -64,6 +64,7 @@ DEPENDENCIES=( "ARROW_GLOG_URL glog-${GLOG_VERSION}.tar.gz https://github.com/google/glog/archive/${GLOG_VERSION}.tar.gz; "ARROW_GRPC_URL grpc-${GRPC_VERSION}.tar.gz https://github.com/grpc/grpc/archive/${GRPC_VERSION}.tar.gz; "ARROW_GTEST_URL gtest-${GTEST_VERSION}.tar.gz https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz; + "ARROW_JEMALLOC_URL jemalloc-${JEMALLOC_VERSION}.tar.gz https://github.com/jemalloc/jemalloc/archive/${JEMALLOC_VERSION}.tar.gz; "ARROW_LZ4_URL lz4-${LZ4_VERSION}.tar.gz https://github.com/lz4/lz4/archive/${LZ4_VERSION}.tar.gz; "ARROW_ORC_URL orc-${ORC_VERSION}.tar.gz https://github.com/apache/orc/archive/rel/release-${ORC_VERSION}.tar.gz; "ARROW_PROTOBUF_URL protobuf-${PROTOBUF_VERSION}.tar.gz https://github.com/google/protobuf/releases/download/${PROTOBUF_VERSION}/protobuf-all-${PROTOBUF_VERSION:1}.tar.gz;
[arrow] branch master updated: ARROW-5556: [Doc] [Python] Document JSON reader
This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new ac4a9ef ARROW-5556: [Doc] [Python] Document JSON reader ac4a9ef is described below commit ac4a9ef1096128a1a4b563eaf83f1503e1db953f Author: Antoine Pitrou AuthorDate: Wed Jun 12 10:57:27 2019 -0500 ARROW-5556: [Doc] [Python] Document JSON reader Author: Antoine Pitrou Closes #4521 from pitrou/ARROW-5556-py-json-docs and squashes the following commits: 40064de97 Address review comments 65f357a6b ARROW-5556: Document JSON reader --- docs/source/python/api/formats.rst | 18 +- docs/source/python/csv.rst | 2 +- docs/source/python/index.rst | 1 + docs/source/python/json.rst| 114 + python/pyarrow/_csv.pyx| 13 +++-- python/pyarrow/_json.pyx | 5 +- 6 files changed, 141 insertions(+), 12 deletions(-) diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst index 8de30ec..f8aab4a 100644 --- a/docs/source/python/api/formats.rst +++ b/docs/source/python/api/formats.rst @@ -18,13 +18,13 @@ Tabular File Formats -.. currentmodule:: pyarrow.csv - .. _api.csv: CSV Files - +.. currentmodule:: pyarrow.csv + .. autosummary:: :toctree: ../generated/ @@ -46,7 +46,19 @@ Feather Files read_feather write_feather -.. currentmodule:: pyarrow +.. _api.json: + +JSON Files +-- + +.. currentmodule:: pyarrow.json + +.. autosummary:: + :toctree: ../generated/ + + ReadOptions + ParseOptions + read_json .. _api.parquet: diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index 17023b1..96a79e6 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -21,7 +21,7 @@ Reading CSV files = -Arrow provides preliminary support for reading data from CSV files. +Arrow supports reading columnar data from CSV files. 
The features currently offered are the following: * multi-threaded or single-threaded reading diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 7f227c5..09367f4 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -43,6 +43,7 @@ files into Arrow structures. pandas timestamps csv + json parquet cuda extending diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst new file mode 100644 index 000..e4abbff --- /dev/null +++ b/docs/source/python/json.rst @@ -0,0 +1,114 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.json +.. _json: + +Reading JSON files +== + +Arrow supports reading columnar data from JSON files. In this context, a +JSON file consists of multiple JSON objects, one per line, representing +individual data rows. For example, this file represents two rows of data +with four columns "a", "b", "c", "d": + +.. 
code-block:: json + + {"a": 1, "b": 2.0, "c": "foo", "d": false} + {"a": 4, "b": -5.5, "c": null, "d": true} + +The features currently offered are the following: + +* multi-threaded or single-threaded reading +* automatic decompression of input files (based on the filename extension, + such as ``my_data.json.gz``) +* sophisticated type inference (see below) + + +Usage +- + +JSON reading functionality is available through the :mod:`pyarrow.json` module. +In many cases, you will simply call the :func:`read_json` function +with the file path you want to read from:: + + >>> from pyarrow import json + >>> fn = 'my_data.json' + >>> table = json.read_json(fn) + >>> table + pyarrow.Table + a: int64 + b: double + c: string + d: bool + >>> table.to_pandas() + ab c d + 0 1 2.0 foo False + 1 4 -5.5 None True + + +Automatic Type Inference + + +Arrow :ref:`data types ` are inferred from the JSON types and +values of each column: + +* JSON null values convert to the ``null`` type, but can fall back to any + other
[arrow] branch master updated: ARROW-5574: [R] documentation error for read_arrow()
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 24c42f7 ARROW-5574: [R] documentation error for read_arrow() 24c42f7 is described below commit 24c42f71e219614b08becb8f635aeeb2d2a4ddc1 Author: Romain Francois AuthorDate: Wed Jun 12 17:04:02 2019 +0200 ARROW-5574: [R] documentation error for read_arrow() @wesm this should fix the current R related issues on travis. I'll merge once travis gives the ✅ Author: Romain Francois Closes #4530 from romainfrancois/ARROW-5574/read_arrow_doc_error and squashes the following commits: cd83b1da use_threads= argument of read_arrow() is obsolete --- r/DESCRIPTION| 2 +- r/R/read_table.R | 4 ++-- r/man/arrow_available.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index f38f0de..103a63b 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -59,8 +59,8 @@ Collate: 'Struct.R' 'Table.R' 'array.R' -'arrowExports.R' 'arrow-package.R' +'arrowExports.R' 'buffer.R' 'io.R' 'compression.R' diff --git a/r/R/read_table.R b/r/R/read_table.R index d5122a8..57ef5ec 100644 --- a/r/R/read_table.R +++ b/r/R/read_table.R @@ -83,6 +83,6 @@ read_table.fs_path <- function(stream) { #' @rdname read_table #' @export -read_arrow <- function(stream, use_threads = TRUE){ - as.data.frame(read_table(stream)) +read_arrow <- function(stream){ + as_tibble(read_table(stream)) } diff --git a/r/man/arrow_available.Rd b/r/man/arrow_available.Rd index af0f938..26a01ca 100644 --- a/r/man/arrow_available.Rd +++ b/r/man/arrow_available.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/zzz.R +% Please edit documentation in R/arrow-package.R \name{arrow_available} \alias{arrow_available} \title{Is the C++ Arrow library available}
[arrow] branch master updated: ARROW-4194: [Format][Docs] Remove duplicated / out-of-date logical type information from documentation
This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new c2da956 ARROW-4194: [Format][Docs] Remove duplicated / out-of-date logical type information from documentation c2da956 is described below commit c2da956ac061fe8128edf8ce7527a563e791e170 Author: Wes McKinney AuthorDate: Wed Jun 12 09:19:33 2019 -0500 ARROW-4194: [Format][Docs] Remove duplicated / out-of-date logical type information from documentation This documentation is not being properly maintained and it duplicates the point-of-truth information in format/Schema.fbs. I also split out the integration-testing related JSON stuff to a separate section and opened ARROW-5563 about sprucing that up. Author: Wes McKinney Closes #4523 from wesm/remove-outdated-metadata-stuff and squashes the following commits: 276412604 Remove duplicated / out-of-date logical type information from documentation and direct readers to Schema.fbs --- docs/source/format/Metadata.rst | 245 1 file changed, 70 insertions(+), 175 deletions(-) diff --git a/docs/source/format/Metadata.rst b/docs/source/format/Metadata.rst index 293d011..b6c2a5f 100644 --- a/docs/source/format/Metadata.rst +++ b/docs/source/format/Metadata.rst @@ -65,96 +65,6 @@ the columns. The Flatbuffers IDL for a field is: :: The ``type`` is the logical type of the field. Nested types, such as List, Struct, and Union, have a sequence of child fields. 
-A JSON representation of the schema is also provided: - -Field: :: - -{ - "name" : "name_of_the_field", - "nullable" : false, - "type" : /* Type */, - "children" : [ /* Field */ ], -} - -Type: :: - -{ - "name" : "null|struct|list|union|int|floatingpoint|utf8|binary|fixedsizebinary|bool|decimal|date|time|timestamp|interval" - // fields as defined in the Flatbuffer depending on the type name -} - -Union: :: - -{ - "name" : "union", - "mode" : "Sparse|Dense", - "typeIds" : [ /* integer */ ] -} - -The ``typeIds`` field in the Union are the codes used to denote each type, which -may be different from the index of the child array. This is so that the union -type ids do not have to be enumerated from 0. - -Int: :: - -{ - "name" : "int", - "bitWidth" : /* integer */, - "isSigned" : /* boolean */ -} - -FloatingPoint: :: - -{ - "name" : "floatingpoint", - "precision" : "HALF|SINGLE|DOUBLE" -} - -Decimal: :: - -{ - "name" : "decimal", - "precision" : /* integer */, - "scale" : /* integer */ -} - -Timestamp: :: - -{ - "name" : "timestamp", - "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND" -} - -Date: :: - -{ - "name" : "date", - "unit" : "DAY|MILLISECOND" -} - -Time: :: - -{ - "name" : "time", - "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND", - "bitWidth": /* integer: 32 or 64 */ -} - -Interval: :: - -{ - "name" : "interval", - "unit" : "YEAR_MONTH|DAY_TIME" -} - -Schema: :: - -{ - "fields" : [ -/* Field */ - ] -} - Record data headers --- @@ -280,117 +190,102 @@ categories: * Types having equivalent memory layout to a physical nested type (e.g. strings use the list representation, but logically are not nested types) -Integers - +Refer to `Schema.fbs`_ for up-to-date descriptions of each built-in +logical type. 
-In the first version of Arrow we provide the standard 8-bit through 64-bit size -standard C integer types, both signed and unsigned: +Integration Testing +--- -* Signed types: Int8, Int16, Int32, Int64 -* Unsigned types: UInt8, UInt16, UInt32, UInt64 +A JSON representation of the schema is provided for cross-language +integration testing purposes. -The IDL looks like: :: +Field: :: -table Int { - bitWidth: int; - is_signed: bool; +{ + "name" : "name_of_the_field", + "nullable" : false, + "type" : /* Type */, + "children" : [ /* Field */ ], } -The integer endianness is currently set globally at the schema level. If a -schema is set to be little-endian, then all integer types occurring within must -be little-endian. Integers that are part of other data representations, such as -list offsets and union types, must have the same endianness as the entire -record batch. - -Floating point numbers -~~ - -We provide 3 types of floating point numbers as fixed bit-width primitive array - -- Half precision, 16-bit width -- Single precision, 32-bit width -- Double precision, 64-bit width - -The IDL looks like: :: - -enum Precision:int {HALF, SINGLE, DOUBLE} +Type: :: -table FloatingPoint { - precision:
[arrow] branch master updated: ARROW-5553: [Ruby] Use the official packages to install Apache Arrow
This is an automated email from the ASF dual-hosted git repository. shiro pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new adba5bb ARROW-5553: [Ruby] Use the official packages to install Apache Arrow adba5bb is described below commit adba5bb4d46403e2d3df431375b6c654ad0570f6 Author: Sutou Kouhei AuthorDate: Wed Jun 12 21:00:40 2019 +0900 ARROW-5553: [Ruby] Use the official packages to install Apache Arrow packages.red-data-tools.org is deprecated. Author: Sutou Kouhei Closes #4526 from kou/ruby-update-package-information and squashes the following commits: 898ef4e54 Use the official packages to install Apache Arrow --- ruby/red-arrow-cuda/README.md | 4 +--- ruby/red-arrow/README.md | 4 +--- ruby/red-gandiva/README.md| 4 +--- ruby/red-parquet/README.md| 4 +--- ruby/red-plasma/README.md | 4 +--- 5 files changed, 5 insertions(+), 15 deletions(-) diff --git a/ruby/red-arrow-cuda/README.md b/ruby/red-arrow-cuda/README.md index 76fa51c..f05e664 100644 --- a/ruby/red-arrow-cuda/README.md +++ b/ruby/red-arrow-cuda/README.md @@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow ## Install -Install Apache Arrow CUDA GLib before install Red Arrow CUDA. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow CUDA GLib. - -Note that the Apache Arrow CUDA GLib packages are "unofficial". "Official" packages will be released in the future. +Install Apache Arrow CUDA GLib before install Red Arrow CUDA. Install Apache Arrow GLib before install Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. 
Install Red Arrow CUDA after you install Apache Arrow CUDA GLib: diff --git a/ruby/red-arrow/README.md b/ruby/red-arrow/README.md index 95ec396..20ca83f 100644 --- a/ruby/red-arrow/README.md +++ b/ruby/red-arrow/README.md @@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow ## Install -Install Apache Arrow GLib before install Red Arrow. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow GLib. - -Note that the Apache Arrow GLib packages are "unofficial". "Official" packages will be released in the future. +Install Apache Arrow GLib before install Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. Install Red Arrow after you install Apache Arrow GLib: diff --git a/ruby/red-gandiva/README.md b/ruby/red-gandiva/README.md index d6ab944..91174ee 100644 --- a/ruby/red-gandiva/README.md +++ b/ruby/red-gandiva/README.md @@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Gandi ## Install -Install Gandiva GLib before install Red Gandiva. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Gandiva GLib. - -Note that the Gandiva GLib packages are "unofficial". "Official" packages will be released in the future. +Install Gandiva GLib before install Red Gandiva. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. Install Red Gandiva after you install Gandiva GLib: diff --git a/ruby/red-parquet/README.md b/ruby/red-parquet/README.md index 4fb8438..434dab9 100644 --- a/ruby/red-parquet/README.md +++ b/ruby/red-parquet/README.md @@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Parqu ## Install -Install Apache Parquet GLib before install Red Parquet. 
Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Parquet GLib. - -Note that the Apache Parquet GLib packages are "unofficial". "Official" packages will be released in the future. +Install Apache Parquet GLib before install Red Parquet. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. Install Red Parquet after you install Apache Parquet GLib: diff --git a/ruby/red-plasma/README.md b/ruby/red-plasma/README.md index 19038af..e8939f6 100644 --- a/ruby/red-plasma/README.md +++ b/ruby/red-plasma/README.md @@ -33,9 +33,7 @@ gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Plasm ## Install -Install Plasma GLib before install Red Plasma. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Plasma GLib. - -Note that the Plasma GLib packages are "unofficial". "Official" packages will be released in the future. +Install Plasma GLib before install Red Plasma. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. Install Red Plasma after you
[arrow] branch master updated (f6e3c43 -> 9709e96)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from f6e3c43 ARROW-5504 [R]: move use_threads argument to global option add 9709e96 ARROW-5465: [Crossbow] Support writing submitted job definition yaml to a file No new revisions were added by this update. Summary of changes: dev/tasks/conda-recipes/appveyor.yml | 4 +- dev/tasks/conda-recipes/travis.linux.yml | 4 +- dev/tasks/conda-recipes/travis.osx.yml| 4 +- dev/tasks/crossbow.py | 181 -- dev/tasks/docker-tests/travis.linux.yml | 10 +- dev/tasks/gandiva-jars/travis.linux.yml | 5 +- dev/tasks/gandiva-jars/travis.osx.yml | 4 +- dev/tasks/linux-packages/travis.linux.yml | 3 +- dev/tasks/python-wheels/appveyor.yml | 5 +- dev/tasks/python-wheels/travis.linux.yml | 3 +- dev/tasks/python-wheels/travis.osx.yml| 3 +- 11 files changed, 130 insertions(+), 96 deletions(-)
[arrow] branch master updated: ARROW-5504 [R]: move use_threads argument to global option
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new f6e3c43 ARROW-5504 [R]: move use_threads argument to global option f6e3c43 is described below commit f6e3c437f0b330fb8ffbc6959838bc1fe3bf24f4 Author: Romain Francois AuthorDate: Wed Jun 12 13:51:44 2019 +0200 ARROW-5504 [R]: move use_threads argument to global option At this point the function is not exported or documented and threads are always used, users would need to set `options(arrow.use_threads)` to turn them off. Author: Romain Francois Closes #4515 from romainfrancois/ARROW-5504/use_threads and squashes the following commits: 445364ab s/as_tibble()/as.data.frame()/ 4afa7d4d + option_use_threads() function --- r/DESCRIPTION | 2 +- r/NAMESPACE | 1 + r/R/ChunkedArray.R | 1 - r/R/R6.R| 6 -- r/R/RecordBatch.R | 2 +- r/R/Table.R | 2 +- r/R/array.R | 1 - r/R/{zzz.R => arrow-package.R} | 13 +++-- r/R/csv.R | 6 ++ r/R/feather.R | 5 ++--- r/R/json.R | 2 +- r/R/parquet.R | 6 ++ r/R/read_table.R| 2 -- r/README.Rmd| 2 +- r/README.md | 22 +++--- r/man/arrow-package.Rd | 40 r/man/csv_read_options.Rd | 4 +--- r/man/read_feather.Rd | 5 + r/man/read_parquet.Rd | 5 + r/man/read_table.Rd | 4 +--- r/tests/testthat/test-json.R| 4 ++-- r/tests/testthat/test-parquet.R | 7 --- 22 files changed, 80 insertions(+), 62 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index c38e5a1..f38f0de 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -60,6 +60,7 @@ Collate: 'Table.R' 'array.R' 'arrowExports.R' +'arrow-package.R' 'buffer.R' 'io.R' 'compression.R' @@ -75,4 +76,3 @@ Collate: 'read_table.R' 'reexports-bit64.R' 'write_arrow.R' -'zzz.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index d535ea9..3f91568 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -180,6 +180,7 @@ importFrom(purrr,map_int) importFrom(rlang,"%||%") importFrom(rlang,abort) 
importFrom(rlang,dots_n) +importFrom(rlang,is_false) importFrom(rlang,list2) importFrom(rlang,warn) useDynLib(arrow, .registration = TRUE) diff --git a/r/R/ChunkedArray.R b/r/R/ChunkedArray.R index 339a416..69a0224 100644 --- a/r/R/ChunkedArray.R +++ b/r/R/ChunkedArray.R @@ -60,7 +60,6 @@ #' @param \dots Vectors to coerce #' @param type currently ignored #' -#' @importFrom rlang list2 %||% #' @export chunked_array <- function(..., type = NULL){ shared_ptr(`arrow::ChunkedArray`, ChunkedArray__from_list(list2(...), type)) diff --git a/r/R/R6.R b/r/R/R6.R index 26c679f..e343116 100644 --- a/r/R/R6.R +++ b/r/R/R6.R @@ -16,12 +16,6 @@ # under the License. #' @include enums.R -#' @importFrom R6 R6Class -#' @importFrom glue glue -#' @importFrom purrr map map_int map2 -#' @importFrom rlang dots_n -#' @importFrom assertthat assert_that - `arrow::Object` <- R6Class("arrow::Object", public = list( initialize = function(xp) self$set_pointer(xp), diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R index 3ebd81b..d60c823 100644 --- a/r/R/RecordBatch.R +++ b/r/R/RecordBatch.R @@ -86,7 +86,7 @@ #' @export `as.data.frame.arrow::RecordBatch` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){ - RecordBatch__to_dataframe(x, use_threads = use_threads) + RecordBatch__to_dataframe(x, use_threads = option_use_threads()) } #' Create an [arrow::RecordBatch][arrow__RecordBatch] from a data frame diff --git a/r/R/Table.R b/r/R/Table.R index 4c434b0..6d50394 100644 --- a/r/R/Table.R +++ b/r/R/Table.R @@ -67,7 +67,7 @@ table <- function(..., schema = NULL){ #' @export `as.data.frame.arrow::Table` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){ - Table__to_dataframe(x, use_threads = use_threads) + Table__to_dataframe(x, use_threads = option_use_threads()) } #' @export diff --git a/r/R/array.R b/r/R/array.R index ccb8521..244cee0 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -122,7 +122,6 @@ #' @param x R object #' @param type Explicit 
[type][arrow__DataType], or NULL (the default) to infer from the data #' -#' @importFrom rlang warn #' @export array <- function(x, type = NULL){ `arrow::Array`$dispatch(Array__from_vector(x, type)) diff --git a/r/R/zzz.R b/r/R/arrow-package.R similarity index 76% rename from r/R/zzz.R rename to r/R/arrow-package.R index eab9ad4..41cbc2a 100644 --- a/r/R/zzz.R +++
[arrow] branch master updated: ARROW-5190 [R]: Discussion: tibble dependency in R package
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 5750f5a ARROW-5190 [R]: Discussion: tibble dependency in R package 5750f5a is described below commit 5750f5aff5cf7ea34ba2263f6969919895eae033 Author: Romain Francois AuthorDate: Wed Jun 12 13:38:59 2019 +0200 ARROW-5190 [R]: Discussion: tibble dependency in R package `tibble` is now on Suggests. The code still makes tibbles, so that if `tibble` is otherwise loaded, data frames are nicely print, etc... but does not need to specifically `Imports` it. Author: Romain Francois Closes #4454 from romainfrancois/ARROW-5190/tibble and squashes the following commits: 18b1e05e s/as_tibble/as.data.frame/ in tests 66e63038 as.data.frame() needs arguments row.names = NULL, optional = FALSE 240ef90d move tibble dependency to Suggests --- r/DESCRIPTION | 7 +++ r/NAMESPACE | 6 ++ r/R/RecordBatch.R | 2 +- r/R/Table.R | 2 +- r/R/feather.R | 2 +- r/R/parquet.R | 2 +- r/R/read_table.R| 2 +- r/R/reexports-tibble.R | 20 r/man/reexports.Rd | 5 + r/tests/testthat/test-RecordBatch.R | 6 +++--- r/tests/testthat/test-Table.R | 10 +- r/tests/testthat/test-feather.R | 8 +--- r/tests/testthat/test-parquet.R | 2 +- 13 files changed, 25 insertions(+), 49 deletions(-) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 58a208c..c38e5a1 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -21,9 +21,9 @@ Encoding: UTF-8 LazyData: true SystemRequirements: C++11 LinkingTo: -Rcpp (>= 1.0.0) +Rcpp (>= 1.0.1) Imports: -Rcpp (>= 1.0.0), +Rcpp (>= 1.0.1), rlang, purrr, assertthat, @@ -31,12 +31,12 @@ Imports: R6, vctrs (>= 0.1.0), fs, -tibble, crayon, bit64 Roxygen: list(markdown = TRUE) RoxygenNote: 6.1.1 Suggests: +tibble, covr, pkgdown, rmarkdown, @@ -74,6 +74,5 @@ Collate: 'read_record_batch.R' 'read_table.R' 'reexports-bit64.R' -'reexports-tibble.R' 
'write_arrow.R' 'zzz.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 13071b9..d535ea9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -38,8 +38,8 @@ S3method(RecordBatchStreamReader,raw) S3method(RecordBatchStreamWriter,"arrow::io::OutputStream") S3method(RecordBatchStreamWriter,character) S3method(RecordBatchStreamWriter,fs_path) -S3method(as_tibble,"arrow::RecordBatch") -S3method(as_tibble,"arrow::Table") +S3method(as.data.frame,"arrow::RecordBatch") +S3method(as.data.frame,"arrow::Table") S3method(buffer,"arrow::Buffer") S3method(buffer,complex) S3method(buffer,default) @@ -114,7 +114,6 @@ export(TimeUnit) export(Type) export(array) export(arrow_available) -export(as_tibble) export(boolean) export(buffer) export(cast_options) @@ -183,5 +182,4 @@ importFrom(rlang,abort) importFrom(rlang,dots_n) importFrom(rlang,list2) importFrom(rlang,warn) -importFrom(tibble,as_tibble) useDynLib(arrow, .registration = TRUE) diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R index 2b9148a..3ebd81b 100644 --- a/r/R/RecordBatch.R +++ b/r/R/RecordBatch.R @@ -85,7 +85,7 @@ } #' @export -`as_tibble.arrow::RecordBatch` <- function(x, use_threads = TRUE, ...){ +`as.data.frame.arrow::RecordBatch` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){ RecordBatch__to_dataframe(x, use_threads = use_threads) } diff --git a/r/R/Table.R b/r/R/Table.R index 87e87ac..4c434b0 100644 --- a/r/R/Table.R +++ b/r/R/Table.R @@ -66,7 +66,7 @@ table <- function(..., schema = NULL){ } #' @export -`as_tibble.arrow::Table` <- function(x, use_threads = TRUE, ...){ +`as.data.frame.arrow::Table` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){ Table__to_dataframe(x, use_threads = use_threads) } diff --git a/r/R/feather.R b/r/R/feather.R index 4a1d9de..c65ea9e 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -169,7 +169,7 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) 
{ read_feather <- function(file, columns = NULL, as_tibble = TRUE, use_threads = TRUE, ...){ out <- FeatherTableReader(file, ...)$Read(columns) if (isTRUE(as_tibble)) { -out <- as_tibble(out, use_threads = use_threads) +out <- as.data.frame(out, use_threads = use_threads) } out } diff --git a/r/R/parquet.R b/r/R/parquet.R index d7f389f..8caf356 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -39,7 +39,7 @@ read_parquet <- function(file, as_tibble = TRUE, use_threads = TRUE, ...) { tab <- shared_ptr(`arrow::Table`, read_parquet_file(file)) if (isTRUE(as_tibble))
[arrow] branch master updated: ARROW-3780 [R]: Failed to fetch data: invalid data when collecting int16
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 2cb776d ARROW-3780 [R]: Failed to fetch data: invalid data when collecting int16 2cb776d is described below commit 2cb776d6aa95cbc32da45cd72990d273cdc0c82f Author: Romain Francois AuthorDate: Wed Jun 12 13:37:00 2019 +0200 ARROW-3780 [R]: Failed to fetch data: invalid data when collecting int16 same as #4505 which for some reason has disappeared Author: Romain Francois Closes #4529 from romainfrancois/ARROW-3780/STOP_IF_NULL and squashes the following commits: ca9caf26 STOP_IF_NULL() macro is never used --- r/src/arrow_types.h | 5 - 1 file changed, 5 deletions(-) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 867a89f..ca1e2d6 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -43,11 +43,6 @@ struct symbols { #define STOP_IF_NOT_OK(s) STOP_IF_NOT(s.ok(), s.ToString()) template -inline void STOP_IF_NULL(T* ptr) { - STOP_IF_NOT(ptr, "invalid data"); -} - -template struct NoDelete { inline void operator()(T* ptr) {} };
[arrow] branch master updated: ARROW-3815 [R]: refine record batch factory
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 72287df ARROW-3815 [R]: refine record batch factory 72287df is described below commit 72287dfef7109bfa49affd4fc7c1016a86b25416 Author: Romain Francois AuthorDate: Wed Jun 12 13:34:09 2019 +0200 ARROW-3815 [R]: refine record batch factory so that `array()` when given an `arrow::Array` is identity ``` r library(arrow, warn.conflicts = FALSE) array(array(1:10)) #> arrow::Array #> [ #> 1, #> 2, #> 3, #> 4, #> 5, #> 6, #> 7, #> 8, #> 9, #> 10 #> ] ``` so that we may supply already made `arrow::Array` in `record_batch()` ```r batch <- record_batch(x = 1:10, y = arrow::array(1:10)) as_tibble(batch) #> # A tibble: 10 x 2 #>x y #> #> 1 1 1 #> 2 2 2 #> 3 3 3 #> 4 4 4 #> 5 5 5 #> 6 6 6 #> 7 7 7 #> 8 8 8 #> 9 9 9 #> 101010 ``` Created on 2019-06-10 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000) Author: Romain Francois Closes #4508 from romainfrancois/ARROW-3815/record_batch and squashes the following commits: 189ab978 more idiomatic C++11 range foor loop 9e479c81 linting 4fb4f332 additional chunked_array() test f17ae031 Array__from_vector() recosgnise arrow::Array 2925a740 InferType recognizes arrow::Array --- .travis.yml | 1 + r/src/array_from_vector.cpp | 12 r/tests/testthat/test-Array.R| 5 + r/tests/testthat/test-RecordBatch.R | 5 + r/tests/testthat/test-chunkedarray.R | 7 +++ 5 files changed, 30 insertions(+) diff --git a/.travis.yml b/.travis.yml index 74b5bad..36a2dcc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -362,6 +362,7 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig +- export CXX11FLAGS=-Wall - pushd 
${TRAVIS_BUILD_DIR}/r after_success: - Rscript ../ci/travis_upload_r_coverage.R diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 0c21e94..0d4e318 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -763,6 +763,13 @@ std::shared_ptr GetFactorType(SEXP factor) { std::shared_ptr InferType(SEXP x) { switch (TYPEOF(x)) { +case ENVSXP: + if (Rf_inherits(x, "arrow::Array")) { + Rcpp::ConstReferenceSmartPtrInputParameter> array( +x); +return static_cast>(array)->type(); + } + break; case LGLSXP: return boolean(); case INTSXP: @@ -915,6 +922,11 @@ bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr& typ std::shared_ptr Array__from_vector( SEXP x, const std::shared_ptr& type, bool type_infered) { + // short circuit if `x` is already an Array + if (Rf_inherits(x, "arrow::Array")) { +return Rcpp::ConstReferenceSmartPtrInputParameter>(x); + } + // special case when we can just use the data from the R vector // directly. This still needs to handle the null bitmap if (arrow::r::can_reuse_memory(x, type)) { diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 68fd9dc..fd2fe51 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -410,3 +410,8 @@ test_that("Array$as_vector() converts to integer (ARROW-3794)", { expect_equal(a$type, uint8()) expect_equal(a$as_vector(), 0:255) }) + +test_that("array() recognise arrow::Array (ARROW-3815)", { + a <- array(1:10) + expect_equal(a, array(a)) +}) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index c0295ba..a492f0f 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -145,3 +145,8 @@ test_that("RecordBatch dim() and nrow() (ARROW-3816)", { expect_equal(dim(batch), c(10L, 2L)) expect_equal(nrow(batch), 10L) }) + +test_that("record_batch() handles arrow::Array", { + batch <- record_batch(x = 1:10, y = arrow::array(1:10)) + 
expect_equal(batch$schema, schema(x = int32(), y = int32())) +}) diff --git a/r/tests/testthat/test-chunkedarray.R b/r/tests/testthat/test-chunkedarray.R index 5e22e5c..9505b8f 100644 --- a/r/tests/testthat/test-chunkedarray.R +++ b/r/tests/testthat/test-chunkedarray.R @@ -265,3 +265,10 @@ test_that("chunked_array() handles 0 chunks if given a type", { expect_equal(a$length(), 0L) } }) + +test_that("chunked_array()
[arrow] branch master updated: ARROW-5503 [R]: add read_json()
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 0b552a0 ARROW-5503 [R]: add read_json() 0b552a0 is described below commit 0b552a0795db43c0bfbcb65be032793c8172e2df Author: Romain Francois AuthorDate: Wed Jun 12 13:27:17 2019 +0200 ARROW-5503 [R]: add read_json() ``` r library(arrow, warn.conflicts = FALSE) tf <- tempfile() writeLines(' { "hello": 3.5, "world": false, "yo": "thing" } { "hello": 3.25, "world": null } { "hello": 3.125, "world": null, "yo": "\u5fcd" } { "hello": 0.0, "world": true, "yo": null } ', tf) tab <- read_json_arrow(tf) tab #> arrow::Table tab$schema #> arrow::Schema #> hello: double #> world: bool #> yo: string as_tibble(tab) #> # A tibble: 4 x 3 #> hello world yo #> #> 1 3.5 FALSE thing #> 2 3.25 NA #> 3 3.12 NA忍 #> 4 0TRUE ``` Created on 2019-06-11 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000) Author: Romain Francois Closes #4518 from romainfrancois/ARROW-5503/read_json and squashes the following commits: 0314b6a0 #include should be protected by ARROW_R_WITH_ARROW 14cf419e + read_json_arrow(as_tibble=) d469467a use ] 57d4d076 + read_json_arrow() --- r/DESCRIPTION | 1 + r/NAMESPACE | 9 +++ r/R/arrowExports.R| 16 r/R/json.R| 159 ++ r/man/arrow__json__TableReader.Rd | 18 + r/man/json_parse_options.Rd | 14 r/man/json_read_options.Rd| 16 r/man/json_table_reader.Rd| 21 + r/man/read_json_arrow.Rd | 16 r/src/arrowExports.cpp| 66 r/src/arrow_types.h | 1 + r/src/json.cpp| 63 +++ r/tests/testthat/test-json.R | 106 + 13 files changed, 506 insertions(+) diff --git a/r/DESCRIPTION b/r/DESCRIPTION index d566f74..58a208c 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -67,6 +67,7 @@ Collate: 'csv.R' 'dictionary.R' 'feather.R' +'json.R' 'memory_pool.R' 'message.R' 'parquet.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index bc4f677..13071b9 
100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -53,6 +53,11 @@ S3method(csv_table_reader,default) S3method(csv_table_reader,fs_path) S3method(dim,"arrow::RecordBatch") S3method(dim,"arrow::Table") +S3method(json_table_reader,"arrow::io::InputStream") +S3method(json_table_reader,"arrow::json::TableReader") +S3method(json_table_reader,character) +S3method(json_table_reader,default) +S3method(json_table_reader,fs_path) S3method(length,"arrow::Array") S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") @@ -132,6 +137,9 @@ export(int16) export(int32) export(int64) export(int8) +export(json_parse_options) +export(json_read_options) +export(json_table_reader) export(list_of) export(mmap_create) export(mmap_open) @@ -139,6 +147,7 @@ export(null) export(read_arrow) export(read_csv_arrow) export(read_feather) +export(read_json_arrow) export(read_message) export(read_parquet) export(read_record_batch) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 6359f90..52ff492 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -612,6 +612,22 @@ io___FixedSizeBufferWriter__initialize <- function(buffer){ .Call(`_arrow_io___FixedSizeBufferWriter__initialize` , buffer) } +json___ReadOptions__initialize <- function(options){ +.Call(`_arrow_json___ReadOptions__initialize` , options) +} + +json___ParseOptions__initialize <- function(options){ +.Call(`_arrow_json___ParseOptions__initialize` , options) +} + +json___TableReader__Make <- function(input, read_options, parse_options){ +.Call(`_arrow_json___TableReader__Make` , input, read_options, parse_options) +} + +json___TableReader__Read <- function(table_reader){ +.Call(`_arrow_json___TableReader__Read` , table_reader) +} + MemoryPool__default <- function(){ .Call(`_arrow_MemoryPool__default` ) } diff --git a/r/R/json.R b/r/R/json.R new file mode 100644 index 000..2de8b94 --- /dev/null +++ b/r/R/json.R @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +#
[arrow] branch master updated: ARROW-3794 [R]: Consider mapping INT8 to integer() not raw()
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 365ec5b ARROW-3794 [R]: Consider mapping INT8 to integer() not raw() 365ec5b is described below commit 365ec5b0bd8c27f4b1d80d8d34071a8a9b62179b Author: Romain Francois AuthorDate: Wed Jun 12 12:39:37 2019 +0200 ARROW-3794 [R]: Consider mapping INT8 to integer() not raw() R vectors of raw types are converted to arrays of type uint8 ``` r arrow::array(as.raw(1:10))$type #> arrow::UInt8 #> uint8 ``` but otoh, arrays of type uint8 are converted to R integer vectors. ``` r library(arrow, warn.conflicts = FALSE) a <- array((-128):127)$cast(int8()) str(a$as_vector()) #> int [1:256] -128 -127 -126 -125 -124 -123 -122 -121 -120 -119 ... a <- array(0:255)$cast(uint8()) str(a$as_vector()) #> int [1:256] 0 1 2 3 4 5 6 7 8 9 ... ``` Created on 2019-06-10 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000) Author: Romain Francois Closes #4507 from romainfrancois/ARROW-3794/INT8 and squashes the following commits: 19ff70b1 not echo line number in arrowExports.cpp, as this would create some unnecessary changes 524bcb06 better tests 68a4aa2b raw R vectors are converted to uint8 arrays uint8 arrays are converted to integer vectors --- r/src/array__to_vector.cpp | 7 --- r/src/array_from_vector.cpp| 2 +- r/tests/testthat/test-Array.R | 13 + r/tests/testthat/test-read-write.R | 11 --- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp index 65670e8..17d0600 100644 --- a/r/src/array__to_vector.cpp +++ b/r/src/array__to_vector.cpp @@ -530,9 +530,6 @@ class Converter_Int64 : public Converter { std::shared_ptr Converter::Make(const ArrayVector& arrays) { switch (arrays[0]->type_id()) { // direct support -case Type::INT8: - return std::make_shared>(arrays); - 
case Type::INT32: return std::make_shared>(arrays); @@ -557,6 +554,10 @@ std::shared_ptr Converter::Make(const ArrayVector& arrays) { return std::make_shared(arrays); // promotions to integer vector +case Type::INT8: + return std::make_shared>( + arrays); + case Type::UINT8: return std::make_shared>( arrays); diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp index 98e59ff..0c21e94 100644 --- a/r/src/array_from_vector.cpp +++ b/r/src/array_from_vector.cpp @@ -897,7 +897,7 @@ std::shared_ptr Array__from_vector_reuse_memory(SEXP x) { } return MakeSimpleArray(x); case RAWSXP: - return MakeSimpleArray(x); + return MakeSimpleArray(x); default: break; } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index b15b4bd..68fd9dc 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -397,3 +397,16 @@ test_that("array() does not convert doubles to integer", { } }) +test_that("array() converts raw vectors to uint8 arrays (ARROW-3794)", { + expect_equal(array(as.raw(1:10))$type, uint8()) +}) + +test_that("Array$as_vector() converts to integer (ARROW-3794)", { + a <- array((-128):127)$cast(int8()) + expect_equal(a$type, int8()) + expect_equal(a$as_vector(), (-128):127) + + a <- array(0:255)$cast(uint8()) + expect_equal(a$type, uint8()) + expect_equal(a$as_vector(), 0:255) +}) diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index c56a7d3..fb442a2 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -76,7 +76,7 @@ test_that("arrow::table round trip", { chunked_array_raw <- col_raw$data() expect_equal(chunked_array_raw$length(), 10L) expect_equal(chunked_array_raw$null_count, 0L) - expect_equal(chunked_array_raw$as_vector(), tbl$raw) + expect_equal(chunked_array_raw$as_vector(), as.integer(tbl$raw)) # arrow::Array chunks_raw <- chunked_array_raw$chunks @@ -88,7 +88,9 @@ test_that("arrow::table round trip", { write_arrow(tbl, tf) res <- 
read_arrow(tf) - expect_identical(tbl, res) + expect_identical(tbl$int, res$int) + expect_identical(tbl$dbl, res$dbl) + expect_identical(as.integer(tbl$raw), res$raw) unlink(tf) }) @@ -119,7 +121,10 @@ test_that("arrow::table round trip handles NA in integer and numeric", { write_arrow(tbl, tf) res <- read_arrow(tf) - expect_identical(tbl, res) + expect_identical(tbl$int, res$int) + expect_identical(tbl$dbl, res$dbl) + expect_identical(as.integer(tbl$raw), res$raw) + expect_true(is.na(res$int[1])) expect_true(is.na(res$dbl[6])) expect_true(is.na(res$dbl[10]))
[arrow] branch master updated: ARROW-5451: [C++][Gandiva] Support cast/round functions for decimal
This is an automated email from the ASF dual-hosted git repository. ravindra pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 01dcb45 ARROW-5451: [C++][Gandiva] Support cast/round functions for decimal 01dcb45 is described below commit 01dcb455c9e89854625322a1f053468f430a039b Author: Pindikura Ravindra AuthorDate: Wed Jun 12 14:15:15 2019 +0530 ARROW-5451: [C++][Gandiva] Support cast/round functions for decimal Author: Pindikura Ravindra Closes #4418 from pravindra/cast and squashes the following commits: 1f499dfb ARROW-5451: Support cast/round functions for decimal --- cpp/src/gandiva/decimal_ir.cc | 143 +++ cpp/src/gandiva/decimal_ir.h| 3 + cpp/src/gandiva/function_registry_arithmetic.cc | 29 +- cpp/src/gandiva/function_registry_math_ops.cc | 12 +- cpp/src/gandiva/precompiled/decimal_ops.cc | 239 +++ cpp/src/gandiva/precompiled/decimal_ops.h | 29 ++ cpp/src/gandiva/precompiled/decimal_ops_test.cc | 502 cpp/src/gandiva/precompiled/decimal_wrapper.cc | 126 ++ cpp/src/gandiva/tests/decimal_test.cc | 185 - 9 files changed, 1253 insertions(+), 15 deletions(-) diff --git a/cpp/src/gandiva/decimal_ir.cc b/cpp/src/gandiva/decimal_ir.cc index 6344332..d291611 100644 --- a/cpp/src/gandiva/decimal_ir.cc +++ b/cpp/src/gandiva/decimal_ir.cc @@ -502,8 +502,63 @@ Status DecimalIR::BuildCompare(const std::string& function_name, return Status::OK(); } +Status DecimalIR::BuildDecimalFunction(const std::string& function_name, + llvm::Type* return_type, + std::vector in_types) { + auto i64 = types()->i64_type(); + auto i128 = types()->i128_type(); + auto function = BuildFunction(function_name, return_type, in_types); + + auto entry = llvm::BasicBlock::Create(*context(), "entry", function); + ir_builder()->SetInsertPoint(entry); + + std::vector args; + int arg_idx = 0; + auto arg_iter = function->arg_begin(); + for (auto& type : in_types) { +if (type.type == i128) { + 
// split i128 arg into two int64s. + auto split = ValueSplit::MakeFromInt128(this, _iter[arg_idx]); + args.push_back(split.high()); + args.push_back(split.low()); +} else { + args.push_back(_iter[arg_idx]); +} +++arg_idx; + } + + auto internal_name = function_name + "_internal"; + llvm::Value* result = nullptr; + if (return_type == i128) { +// for i128 ret, replace with two int64* args, and join them. +auto block = ir_builder()->GetInsertBlock(); +auto out_high_ptr = new llvm::AllocaInst(i64, 0, "out_hi", block); +auto out_low_ptr = new llvm::AllocaInst(i64, 0, "out_low", block); +args.push_back(out_high_ptr); +args.push_back(out_low_ptr); + +// Make call to pre-compiled IR function. +ir_builder()->CreateCall(module()->getFunction(internal_name), args); + +auto out_high = ir_builder()->CreateLoad(out_high_ptr); +auto out_low = ir_builder()->CreateLoad(out_low_ptr); +result = ValueSplit(out_high, out_low).AsInt128(this); + } else { +DCHECK_NE(return_type, types()->void_type()); + +// Make call to pre-compiled IR function. +result = ir_builder()->CreateCall(module()->getFunction(internal_name), args); + } + ir_builder()->CreateRet(result); + return Status::OK(); +} + Status DecimalIR::AddFunctions(Engine* engine) { auto decimal_ir = std::make_shared(engine); + auto i128 = decimal_ir->types()->i128_type(); + auto i32 = decimal_ir->types()->i32_type(); + auto i64 = decimal_ir->types()->i64_type(); + auto f64 = decimal_ir->types()->double_type(); // Populate global variables used by decimal operations. 
decimal_ir->AddGlobals(engine); @@ -531,6 +586,94 @@ Status DecimalIR::AddFunctions(Engine* engine) { llvm::ICmpInst::ICMP_SGT)); ARROW_RETURN_NOT_OK(decimal_ir->BuildCompare( "greater_than_or_equal_to_decimal128_decimal128", llvm::ICmpInst::ICMP_SGE)); + + ARROW_RETURN_NOT_OK(decimal_ir->BuildDecimalFunction("abs_decimal128", i128, + { + {"x_value", i128}, + {"x_precision", i32}, + {"x_scale", i32}, + })); + + ARROW_RETURN_NOT_OK(decimal_ir->BuildDecimalFunction("ceil_decimal128", i128, + { + {"x_value", i128}, + {"x_precision", i32}, +
[arrow] branch master updated: ARROW-4787: [C++] Add support for Null in MemoTable and related kernels
This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new ceaed81 ARROW-4787: [C++] Add support for Null in MemoTable and related kernels ceaed81 is described below commit ceaed810e6e7a654d0f45db4c8e2f4bae7edefe5 Author: François Saint-Jacques AuthorDate: Wed Jun 12 10:36:47 2019 +0200 ARROW-4787: [C++] Add support for Null in MemoTable and related kernels Author: François Saint-Jacques Closes #4480 from fsaintjacques/ARROW-4787-hash-nulls and squashes the following commits: 3e2b30eca Fix python unique test 9cb73b8fe Add option to skip memotable in HashKernel 4f7aa5331 Fix MVCC again 16aa21181 Fix MVCC warnings 8736606f2 Fix typo de1957681 Add null supports to kernels 2322ead49 Add GetNull and GetOrInsertNull to MemoTable b85b8f4f3 Add kKeyNotFound to memo table --- cpp/src/arrow/compute/kernels/hash-test.cc | 71 - cpp/src/arrow/compute/kernels/hash.cc | 200 - cpp/src/arrow/util/bit-util.cc | 17 +++ cpp/src/arrow/util/bit-util.h | 7 +- cpp/src/arrow/util/hashing-test.cc | 127 +--- cpp/src/arrow/util/hashing.h | 226 - python/pyarrow/tests/test_array.py | 2 +- 7 files changed, 460 insertions(+), 190 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash-test.cc b/cpp/src/arrow/compute/kernels/hash-test.cc index 61553c1..0752eee 100644 --- a/cpp/src/arrow/compute/kernels/hash-test.cc +++ b/cpp/src/arrow/compute/kernels/hash-test.cc @@ -91,10 +91,11 @@ void CheckValueCounts(FunctionContext* ctx, const std::shared_ptr& typ const std::vector& out_values, const std::vector& out_is_valid, const std::vector& out_counts) { + std::vector all_valids(out_is_valid.size(), true); std::shared_ptr input = _MakeArray(type, in_values, in_is_valid); std::shared_ptr ex_values = _MakeArray(type, out_values, out_is_valid); std::shared_ptr ex_counts = - _MakeArray(int64(), out_counts, out_is_valid); + 
_MakeArray(int64(), out_counts, all_valids); std::shared_ptr result; ASSERT_OK(ValueCounts(ctx, input, )); @@ -141,17 +142,17 @@ TYPED_TEST(TestHashKernelPrimitive, Unique) { using T = typename TypeParam::c_type; auto type = TypeTraits::type_singleton(); CheckUnique(>ctx_, type, {2, 1, 2, 1}, {true, false, true, true}, -{2, 1}, {}); +{2, 0, 1}, {1, 0, 1}); CheckUnique(>ctx_, type, {2, 1, 3, 1}, {false, false, true, true}, -{3, 1}, {}); +{0, 3, 1}, {0, 1, 1}); } TYPED_TEST(TestHashKernelPrimitive, ValueCounts) { using T = typename TypeParam::c_type; auto type = TypeTraits::type_singleton(); CheckValueCounts(>ctx_, type, {2, 1, 2, 1, 2, 3, 4}, - {true, false, true, true, true, true, false}, {2, 1, 3}, - {}, {3, 1, 1}); + {true, false, true, true, true, true, false}, + {2, 0, 1, 3}, {1, 0, 1, 1}, {3, 2, 1, 1}); CheckValueCounts(>ctx_, type, {}, {}, {}, {}, {}); CheckValueCountsNull(>ctx_, type); } @@ -160,8 +161,8 @@ TYPED_TEST(TestHashKernelPrimitive, DictEncode) { using T = typename TypeParam::c_type; auto type = TypeTraits::type_singleton(); CheckDictEncode(>ctx_, type, {2, 1, 2, 1, 2, 3}, -{true, false, true, true, true, true}, {2, 1, 3}, {}, -{0, 0, 0, 1, 0, 2}); +{true, false, true, true, true, true}, {2, 1, 3}, +{1, 1, 1}, {0, 0, 0, 1, 0, 2}); } TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { @@ -193,35 +194,38 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { TEST_F(TestHashKernel, UniqueTimeTimestamp) { CheckUnique(>ctx_, time32(TimeUnit::SECOND), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}); CheckUnique(>ctx_, time64(TimeUnit::NANO), {2, 1, 2, 1}, - {true, false, true, true}, {2, 1}, {}); + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}); CheckUnique(>ctx_, timestamp(TimeUnit::NANO), - {2, 1, 2, 1}, {true, false, true, true}, {2, 1}, - {}); + {2, 1, 2, 1}, {true, false, true, true}, {2, 0, 1}, + {1, 0, 1}); }
[arrow] branch master updated: ARROW-4845: [R] Compiler warnings on Windows MingW64
This is an automated email from the ASF dual-hosted git repository. romainfrancois pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 5b0034d ARROW-4845: [R] Compiler warnings on Windows MingW64 5b0034d is described below commit 5b0034d677d0ad33896d4a54499189d404b7fec0 Author: Romain Francois AuthorDate: Wed Jun 12 10:29:23 2019 +0200 ARROW-4845: [R] Compiler warnings on Windows MingW64 Author: Romain Francois Closes #4511 from romainfrancois/ARROW-4845/sign_compare and squashes the following commits: 7052b9e7 not echo line number in arrowExports.cpp, as this would create some unnecessary changes 04a71581 more idiomatic c++ range based for loop e646f82f using size_t instead of int64_t, -Wsign-compare --- r/data-raw/codegen.R | 2 +- r/src/arrowExports.cpp | 416 - r/src/recordbatch.cpp | 16 +- 3 files changed, 219 insertions(+), 215 deletions(-) diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 17fa0f0..3e56e33 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -72,7 +72,7 @@ cpp_functions_definitions <- decorations %>% select(name, return_type, args, file, line) %>% pmap_chr(function(name, return_type, args, file, line){ glue::glue(' -// {basename(file)}:{line} +// {basename(file)} #if defined(ARROW_R_WITH_ARROW) {return_type} {name}({real_params}); RcppExport SEXP _arrow_{name}({sexp_params}){{ diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 0ddac30..37e9a3d 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4,7 +4,7 @@ using namespace Rcpp; -// array.cpp:25 +// array.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Array__Slice1(const std::shared_ptr& array, int offset); RcppExport SEXP _arrow_Array__Slice1(SEXP array_sexp, SEXP offset_sexp){ @@ -20,7 +20,7 @@ RcppExport SEXP _arrow_Array__Slice1(SEXP array_sexp, SEXP offset_sexp){ } #endif -// array.cpp:31 +// array.cpp #if 
defined(ARROW_R_WITH_ARROW) std::shared_ptr Array__Slice2(const std::shared_ptr& array, int offset, int length); RcppExport SEXP _arrow_Array__Slice2(SEXP array_sexp, SEXP offset_sexp, SEXP length_sexp){ @@ -37,7 +37,7 @@ RcppExport SEXP _arrow_Array__Slice2(SEXP array_sexp, SEXP offset_sexp, SEXP len } #endif -// array.cpp:37 +// array.cpp #if defined(ARROW_R_WITH_ARROW) bool Array__IsNull(const std::shared_ptr& x, int i); RcppExport SEXP _arrow_Array__IsNull(SEXP x_sexp, SEXP i_sexp){ @@ -53,7 +53,7 @@ RcppExport SEXP _arrow_Array__IsNull(SEXP x_sexp, SEXP i_sexp){ } #endif -// array.cpp:40 +// array.cpp #if defined(ARROW_R_WITH_ARROW) bool Array__IsValid(const std::shared_ptr& x, int i); RcppExport SEXP _arrow_Array__IsValid(SEXP x_sexp, SEXP i_sexp){ @@ -69,7 +69,7 @@ RcppExport SEXP _arrow_Array__IsValid(SEXP x_sexp, SEXP i_sexp){ } #endif -// array.cpp:45 +// array.cpp #if defined(ARROW_R_WITH_ARROW) int Array__length(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__length(SEXP x_sexp){ @@ -84,7 +84,7 @@ RcppExport SEXP _arrow_Array__length(SEXP x_sexp){ } #endif -// array.cpp:48 +// array.cpp #if defined(ARROW_R_WITH_ARROW) int Array__offset(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__offset(SEXP x_sexp){ @@ -99,7 +99,7 @@ RcppExport SEXP _arrow_Array__offset(SEXP x_sexp){ } #endif -// array.cpp:51 +// array.cpp #if defined(ARROW_R_WITH_ARROW) int Array__null_count(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__null_count(SEXP x_sexp){ @@ -114,7 +114,7 @@ RcppExport SEXP _arrow_Array__null_count(SEXP x_sexp){ } #endif -// array.cpp:54 +// array.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr Array__type(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__type(SEXP x_sexp){ @@ -129,7 +129,7 @@ RcppExport SEXP _arrow_Array__type(SEXP x_sexp){ } #endif -// array.cpp:59 +// array.cpp #if defined(ARROW_R_WITH_ARROW) std::string Array__ToString(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__ToString(SEXP x_sexp){ 
@@ -144,7 +144,7 @@ RcppExport SEXP _arrow_Array__ToString(SEXP x_sexp){ } #endif -// array.cpp:64 +// array.cpp #if defined(ARROW_R_WITH_ARROW) arrow::Type::type Array__type_id(const std::shared_ptr& x); RcppExport SEXP _arrow_Array__type_id(SEXP x_sexp){ @@ -159,7 +159,7 @@ RcppExport SEXP _arrow_Array__type_id(SEXP x_sexp){ } #endif -// array.cpp:69 +// array.cpp #if defined(ARROW_R_WITH_ARROW) bool Array__Equals(const std::shared_ptr& lhs, const std::shared_ptr& rhs); RcppExport SEXP _arrow_Array__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ @@ -175,7 +175,7 @@ RcppExport SEXP _arrow_Array__Equals(SEXP lhs_sexp, SEXP rhs_sexp){ } #endif -// array.cpp:75 +// array.cpp #if defined(ARROW_R_WITH_ARROW) bool Array__ApproxEquals(const std::shared_ptr& lhs, const