[arrow-datafusion] branch master updated: python `lit` function to support bool and byte vec (#1152)
This is an automated email from the ASF dual-hosted git repository. jiayuliu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git The following commit(s) were added to refs/heads/master by this push: new f455357 python `lit` function to support bool and byte vec (#1152) f455357 is described below commit f455357bf159763a19312bab2c9238bc101792e0 Author: Jiayu Liu AuthorDate: Thu Oct 21 13:04:41 2021 +0800 python `lit` function to support bool and byte vec (#1152) * python lit function to support bool and byte vec * update per comment --- datafusion/src/logical_plan/expr.rs | 12 ++ python/Cargo.lock | 224 python/src/functions.rs | 48 +--- python/tests/test_functions.py | 11 +- 4 files changed, 156 insertions(+), 139 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index d50d533..011068d 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1407,6 +1407,18 @@ impl Literal for String { } } +impl Literal for Vec<u8> { +fn lit(&self) -> Expr { +Expr::Literal(ScalarValue::Binary(Some((*self).to_owned()))) +} +} + +impl Literal for &[u8] { +fn lit(&self) -> Expr { +Expr::Literal(ScalarValue::Binary(Some((*self).to_owned()))) +} +} + impl Literal for ScalarValue { fn lit(&self) -> Expr { Expr::Literal(self.clone()) diff --git a/python/Cargo.lock b/python/Cargo.lock index 6daefea..6ae2702 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -51,24 +51,19 @@ checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" [[package]] name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" - -[[package]] -name = "arrayvec" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4dc07131ffa69b8072d35f5007352af944213cde02545e2103680baed38fcd" [[package]] name = "arrow" -version = 
"5.3.0" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4091f84cacfdbd5238e161d314e585820269926f79e05d184db8a2898782d44" +checksum = "337e668497751234149fd607f5cb41a6ae7b286b6329589126fe67f0ac55d637" dependencies = [ "bitflags", "chrono", + "comfy-table", "csv", "flatbuffers", "hex", @@ -77,7 +72,6 @@ dependencies = [ "lexical-core", "multiversion", "num", - "prettytable-rs", "rand 0.8.4", "regex", "serde", @@ -97,17 +91,6 @@ dependencies = [ ] [[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - -[[package]] name = "autocfg" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -137,24 +120,13 @@ dependencies = [ ] [[package]] -name = "blake2b_simd" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587" -dependencies = [ - "arrayref", - "arrayvec 0.5.2", - "constant_time_eq", -] - -[[package]] name = "blake3" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcd555c66291d5f836dbb6883b48660ece810fe25a31f3bdfb911945dff2691f" dependencies = [ "arrayref", - "arrayvec 0.7.1", + "arrayvec", "cc", "cfg-if", "constant_time_eq", @@ -238,6 +210,17 @@ dependencies = [ ] [[package]] +name = "comfy-table" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11e95a3e867422fd8d04049041f5671f94d53c32a9dcd82e2be268714942f3f3" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] name = "constant_time_eq" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -262,16 +245,6 @@ dependencies = [ ] [[package]] -name = "crossbeam-utils" -version = 
"0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" -dependencies = [ - "cfg-if", - "lazy_static", -] - -[[package]] name = "crypto-mac" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -355,23 +328,6 @@ dependencies = [ ] [[package]] -name = "dirs" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum =
[arrow] branch master updated (6f478d0 -> f893fa2)
This is an automated email from the ASF dual-hosted git repository. kou pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 6f478d0 ARROW-14396: [R][Doc] Remove relic note in write_dataset that columns cannot be renamed add f893fa2 ARROW-14401: [C++] Fix bundled crc32c's include path No new revisions were added by this update. Summary of changes: cpp/cmake_modules/ThirdpartyToolchain.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
[arrow] branch master updated (9841dc8 -> 6f478d0)
This is an automated email from the ASF dual-hosted git repository. npr pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 9841dc8 ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays add 6f478d0 ARROW-14396: [R][Doc] Remove relic note in write_dataset that columns cannot be renamed No new revisions were added by this update. Summary of changes: r/R/dataset-write.R| 8 +++- r/man/arrow-package.Rd | 6 +- r/man/write_dataset.Rd | 8 +++- 3 files changed, 7 insertions(+), 15 deletions(-)
[arrow] branch master updated: ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays
This is an automated email from the ASF dual-hosted git repository. zeroshade pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/master by this push: new 9841dc8 ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays 9841dc8 is described below commit 9841dc864c62115d68706750b86ced5e142804f6 Author: Matthew Topol AuthorDate: Wed Oct 20 15:07:27 2021 -0400 ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays Closes #11488 from zeroshade/extra-comparisons Authored-by: Matthew Topol Signed-off-by: Matthew Topol --- go/arrow/array/compare.go | 140 + go/arrow/array/compare_test.go | 81 2 files changed, 221 insertions(+) diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go index c4ee046..89c81ef 100644 --- a/go/arrow/array/compare.go +++ b/go/arrow/array/compare.go @@ -65,6 +65,136 @@ func RecordApproxEqual(left, right Record, opts ...EqualOption) bool { return true } +// helper function to evaluate a function on two chunked object having possibly different +// chunk layouts. the function passed in will be called for each corresponding slice of the +// two chunked arrays and if the function returns false it will end the loop early. 
+func chunkedBinaryApply(left, right *Chunked, fn func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool) { + var ( + pos int64 + length int64 = int64(left.length) + leftIdx, rightIdx int + leftPos, rightPos int64 + ) + + for pos < length { + var cleft, cright Interface + for { + cleft, cright = left.Chunk(leftIdx), right.Chunk(rightIdx) + if leftPos == int64(cleft.Len()) { + leftPos = 0 + leftIdx++ + continue + } + if rightPos == int64(cright.Len()) { + rightPos = 0 + rightIdx++ + continue + } + break + } + + sz := int64(min(cleft.Len()-int(leftPos), cright.Len()-int(rightPos))) + pos += sz + if !fn(cleft, leftPos, leftPos+sz, cright, rightPos, rightPos+sz) { + return + } + + leftPos += sz + rightPos += sz + } +} + +// ChunkedEqual reports whether two chunked arrays are equal regardless of their chunkings +func ChunkedEqual(left, right *Chunked) bool { + switch { + case left == right: + return true + case left.length != right.length: + return false + case left.nulls != right.nulls: + return false + case !arrow.TypeEqual(left.dtype, right.dtype): + return false + } + + var isequal bool + chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool { + isequal = ArraySliceEqual(left, lbeg, lend, right, rbeg, rend) + return isequal + }) + + return isequal +} + +// ChunkedApproxEqual reports whether two chunked arrays are approximately equal regardless of their chunkings +// for non-floating point arrays, this is equivalent to ChunkedEqual +func ChunkedApproxEqual(left, right *Chunked, opts ...EqualOption) bool { + switch { + case left == right: + return true + case left.length != right.length: + return false + case left.nulls != right.nulls: + return false + case !arrow.TypeEqual(left.dtype, right.dtype): + return false + } + + var isequal bool + chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, right Interface, rbeg, rend int64) bool { + isequal = 
ArraySliceApproxEqual(left, lbeg, lend, right, rbeg, rend, opts...) + return isequal + }) + + return isequal +} + +// TableEqual returns if the two tables have the same data in the same schema +func TableEqual(left, right Table) bool { + switch { + case left.NumCols() != right.NumCols(): + return false + case left.NumRows() != right.NumRows(): + return false + } + + for i := 0; int64(i) < left.NumCols(); i++ { + lc := left.Column(i) + rc := right.Column(i) + if !lc.field.Equal(rc.field) { + return false + } + + if !ChunkedEqual(lc.data, rc.data) { + return false +
[arrow] branch master updated (4ac62d5 -> a8e1c81)
This is an automated email from the ASF dual-hosted git repository. emkornfield pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 4ac62d5 ARROW-14393: [C++] GTest linking errors during the source release verification add a8e1c81 ARROW-14345: [C++] Implement streaming reads No new revisions were added by this update. Summary of changes: cpp/src/arrow/filesystem/gcsfs.cc | 63 +++-- cpp/src/arrow/filesystem/gcsfs_internal.cc | 4 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 109 +++-- 3 files changed, 162 insertions(+), 14 deletions(-)
[arrow] 01/03: [Release] Update CHANGELOG.md for 6.0.0
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to annotated tag apache-arrow-6.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git commit fa2e00a1591c9f991a4aff339dadb5affd335eb1 Author: Krisztián Szűcs AuthorDate: Wed Oct 20 17:21:42 2021 +0200 [Release] Update CHANGELOG.md for 6.0.0 --- CHANGELOG.md | 601 +++ 1 file changed, 601 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ed715d..0fbb382 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,605 @@ +# Apache Arrow 6.0.0 (2021-10-20) + +## Bug Fixes + +* [ARROW-6946](https://issues.apache.org/jira/browse/ARROW-6946) - [Go] Run tests with assert build tag enabled +* [ARROW-8452](https://issues.apache.org/jira/browse/ARROW-8452) - [Go][Integration] Go JSON producer generates incorrect nullable flag for nested types +* [ARROW-8453](https://issues.apache.org/jira/browse/ARROW-8453) - [Integration][Go] Recursive nested types unsupported +* [ARROW-8999](https://issues.apache.org/jira/browse/ARROW-8999) - [Python][C++] Non-deterministic segfault in "AMD64 MacOS 10.15 Python 3.7" build +* [ARROW-9948](https://issues.apache.org/jira/browse/ARROW-9948) - [C++] Decimal128 does not check scale range when rescaling; can cause buffer overflow +* [ARROW-10213](https://issues.apache.org/jira/browse/ARROW-10213) - [C++] Temporal cast from timestamp to date rounds instead of extracting date component +* [ARROW-10373](https://issues.apache.org/jira/browse/ARROW-10373) - [C++] ValidateFull() does not validate null\_count +* [ARROW-10773](https://issues.apache.org/jira/browse/ARROW-10773) - [R] parallel as.data.frame.Table hangs indefinitely on Windows +* [ARROW-11518](https://issues.apache.org/jira/browse/ARROW-11518) - [C++] [Parquet] Parquet reader crashes when reading boolean columns +* [ARROW-11579](https://issues.apache.org/jira/browse/ARROW-11579) - [R] read\_feather hanging on Windows +* [ARROW-11634](https://issues.apache.org/jira/browse/ARROW-11634) - 
[C++][Parquet] Parquet statistics (min/max) for dictionary columns are incorrect +* [ARROW-11729](https://issues.apache.org/jira/browse/ARROW-11729) - [R] Add examples to the datasets documentation +* [ARROW-12011](https://issues.apache.org/jira/browse/ARROW-12011) - [C++][Python] Crashes and incorrect results when converting large integers to dates +* [ARROW-12072](https://issues.apache.org/jira/browse/ARROW-12072) - (ipc.Writer).Write panics with \`arrow/array: index out of range\` +* [ARROW-12087](https://issues.apache.org/jira/browse/ARROW-12087) - [C++] Fix sort\_indices, array\_sort\_indices timestamp support discrepancy +* [ARROW-12513](https://issues.apache.org/jira/browse/ARROW-12513) - [C++][Parquet] Parquet Writer always puts null\_count=0 in Parquet statistics for dictionary-encoded array with nulls +* [ARROW-12540](https://issues.apache.org/jira/browse/ARROW-12540) - [C++] Implement cast from date32[day] to utf8 +* [ARROW-12636](https://issues.apache.org/jira/browse/ARROW-12636) - [JS] ESM Tree-Shaking produces broken code +* [ARROW-12700](https://issues.apache.org/jira/browse/ARROW-12700) - [R] Read/Write\_feather stuck forever after bad write, R, Win32 +* [ARROW-12837](https://issues.apache.org/jira/browse/ARROW-12837) - [C++] Array::ToString() segfaults with null buffer. 
+* [ARROW-13134](https://issues.apache.org/jira/browse/ARROW-13134) - [C++] SSL-related arrow-s3fs-test failures with aws-sdk-cpp 1.9.51 +* [ARROW-13151](https://issues.apache.org/jira/browse/ARROW-13151) - [Python] Unable to read single child field of struct column from Parquet +* [ARROW-13198](https://issues.apache.org/jira/browse/ARROW-13198) - [C++][Dataset] Async scanner occasionally segfaulting in CI +* [ARROW-13293](https://issues.apache.org/jira/browse/ARROW-13293) - [R] open\_dataset followed by collect hangs (while compute works) +* [ARROW-13304](https://issues.apache.org/jira/browse/ARROW-13304) - [C++] Unable to install nightly on Ubuntu 21.04 due to day of week options +* [ARROW-13336](https://issues.apache.org/jira/browse/ARROW-13336) - [Doc][Python] make clean doesn't clean up "generated" documentation +* [ARROW-13422](https://issues.apache.org/jira/browse/ARROW-13422) - [R] Clarify README about S3 support on Windows +* [ARROW-13424](https://issues.apache.org/jira/browse/ARROW-13424) - [C++] conda-forge benchmark library rejected +* [ARROW-13425](https://issues.apache.org/jira/browse/ARROW-13425) - [Dev][Archery] Archery import pandas which imports pyarrow +* [ARROW-13429](https://issues.apache.org/jira/browse/ARROW-13429) - [C++][Gandiva] Gandiva crashes when compiling If-else expression with binary type +* [ARROW-13430](https://issues.apache.org/jira/browse/ARROW-13430) - [Integration][Go] Various errors in the integration tests +* [ARROW-13436](https://issues.apache.org/jira/browse/ARROW-13436) - [Python][Doc] Clarify what should be
[arrow] 03/03: [Release] Update versions for 6.0.0
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to annotated tag apache-arrow-6.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git commit 5a6f5919e68d1c0f2672c9c711858e5cbe0944cf Author: Krisztián Szűcs AuthorDate: Wed Oct 20 17:21:50 2021 +0200 [Release] Update versions for 6.0.0 --- c_glib/meson.build | 2 +- ci/scripts/PKGBUILD | 2 +- cpp/CMakeLists.txt | 2 +- cpp/vcpkg.json | 2 +- csharp/Directory.Build.props | 2 +- dev/tasks/homebrew-formulae/apache-arrow.rb | 2 +- dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb | 2 +- java/adapter/avro/pom.xml| 2 +- java/adapter/jdbc/pom.xml| 2 +- java/adapter/orc/pom.xml | 2 +- java/algorithm/pom.xml | 2 +- java/c/pom.xml | 2 +- java/compression/pom.xml | 2 +- java/dataset/pom.xml | 2 +- java/flight/flight-core/pom.xml | 2 +- java/flight/flight-grpc/pom.xml | 2 +- java/format/pom.xml | 2 +- java/gandiva/pom.xml | 2 +- java/memory/memory-core/pom.xml | 2 +- java/memory/memory-netty/pom.xml | 2 +- java/memory/memory-unsafe/pom.xml| 2 +- java/memory/pom.xml | 2 +- java/performance/pom.xml | 4 ++-- java/plasma/pom.xml | 2 +- java/pom.xml | 2 +- java/tools/pom.xml | 2 +- java/vector/pom.xml | 2 +- js/package.json | 2 +- matlab/CMakeLists.txt| 2 +- python/setup.py | 2 +- r/DESCRIPTION| 2 +- r/NEWS.md| 2 +- ruby/red-arrow-cuda/lib/arrow-cuda/version.rb| 2 +- ruby/red-arrow-dataset/lib/arrow-dataset/version.rb | 2 +- ruby/red-arrow-flight/lib/arrow-flight/version.rb| 2 +- ruby/red-arrow/lib/arrow/version.rb | 2 +- ruby/red-gandiva/lib/gandiva/version.rb | 2 +- ruby/red-parquet/lib/parquet/version.rb | 2 +- ruby/red-plasma/lib/plasma/version.rb| 2 +- 39 files changed, 40 insertions(+), 40 deletions(-) diff --git a/c_glib/meson.build b/c_glib/meson.build index 0e090c9..fb92181 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,7 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -version = '6.0.0-SNAPSHOT' +version = '6.0.0' if 
version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 246b679..ff84553 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=5.0.0.9000 +pkgver=6.0.0 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c787794..ba8c36e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -47,7 +47,7 @@ if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() -set(ARROW_VERSION "6.0.0-SNAPSHOT") +set(ARROW_VERSION "6.0.0") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index 723f3a4..35907e1 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow", - "version-string": "6.0.0-SNAPSHOT", + "version-string": "6.0.0", "dependencies": [ "abseil", { diff --git a/csharp/Directory.Build.props b/csharp/Directory.Build.props index c42ff55..893208c 100644 --- a/csharp/Directory.Build.props +++ b/csharp/Directory.Build.props @@ -29,7 +29,7 @@ Apache Arrow library Copyright 2016-2019 The Apache Software Foundation The Apache Software Foundation -6.0.0-SNAPSHOT +6.0.0 diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index ca3f831..7023b5d 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -1,7 +1,7 @@ class ApacheArrow < Formula desc "Columnar in-memory analytics layer designed to accelerate big data" homepage "https://arrow.apache.org/" - url
[arrow] 02/03: [Release] Update .deb/.rpm changelogs for 6.0.0
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to annotated tag apache-arrow-6.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git commit b03fce8921b873ed64c4b7f577ade7d54ec8a42b Author: Krisztián Szűcs AuthorDate: Wed Oct 20 17:21:45 2021 +0200 [Release] Update .deb/.rpm changelogs for 6.0.0 --- dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog | 5 + .../apache-arrow-release/yum/apache-arrow-release.spec.in | 3 +++ dev/tasks/linux-packages/apache-arrow/debian/changelog | 6 ++ dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++ 4 files changed, 17 insertions(+) diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index e69de29..d22ad35 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -0,0 +1,5 @@ +apache-arrow-apt-source (6.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Krisztián Szűcs Wed, 20 Oct 2021 15:21:43 - diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 8edb453..071ec26 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -91,6 +91,9 @@ else fi %changelog +* Wed Oct 20 2021 Krisztián Szűcs - 6.0.0-1 +- New upstream release. + * Mon Jan 18 2021 Krisztián Szűcs - 3.0.0-1 - New upstream release. 
diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 2adfc44..1df8b4a 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (6.0.0-1) unstable; urgency=low + + * New upstream release. + + -- Krisztián Szűcs Wed, 20 Oct 2021 15:21:43 - + apache-arrow (3.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 629fbb1..de65184 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -834,6 +834,9 @@ Documentation for Apache Parquet GLib. %{_datadir}/gtk-doc/html/parquet-glib/ %changelog +* Wed Oct 20 2021 Krisztián Szűcs - 6.0.0-1 +- New upstream release. + * Mon Jan 18 2021 Krisztián Szűcs - 3.0.0-1 - New upstream release.
[arrow] annotated tag apache-arrow-6.0.0 updated (0f8bbdb -> 6714759)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to annotated tag apache-arrow-6.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git. *** WARNING: tag apache-arrow-6.0.0 was modified! *** from 0f8bbdb (tag) to 6714759 (tag) tagging 5a6f5919e68d1c0f2672c9c711858e5cbe0944cf (commit) replaces apache-arrow-3.0.0 by Krisztián Szűcs on Wed Oct 20 17:21:51 2021 +0200 - Log - [Release] Apache Arrow Release 6.0.0 --- omit 421dd6d [Release] Update versions for 6.0.0 omit 4b59043 [Release] Update .deb/.rpm changelogs for 6.0.0 omit d04c2c0 [Release] Update CHANGELOG.md for 6.0.0 add 98b0e99 ARROW-13784: [Python] Table.from_arrays should raise an error when array is empty but names is not add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow conversions add 29892ba ARROW-14004: [Python][Doc] Document nullable dtypes handling and usage of types_mapper in to_pandas conversion add eb3c1bd ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include path add 65e69ac ARROW-13317: [Python] Improve documentation on what 'use_threads' does in 'read_feather' add ae943c3 ARROW-13436: [Python][Doc] Clarify what should be expected if read_table is passed an empty list of columns add b2e1285 MINOR: [R] Fix sed for cross-OS compatibility add 77da17b MINOR: [Docs] Uncomment the docs about file visitor when writing Datasets (#11480) add 80ecf33 ARROW-14397: [C++] Fix valgrind error in test utility add 4ac62d5 ARROW-14393: [C++] GTest linking errors during the source release verification new fa2e00a [Release] Update CHANGELOG.md for 6.0.0 new b03fce8 [Release] Update .deb/.rpm changelogs for 6.0.0 new 5a6f591 [Release] Update versions for 6.0.0 This update added new revisions after undoing existing revisions. That is to say, some revisions that were in the old version of the annotated tag are not in the new version. 
This situation occurs when a user --force pushes a change and generates a repository containing something like this: * -- * -- B -- O -- O -- O (0f8bbdb) \ N -- N -- N refs/tags/apache-arrow-6.0.0 (6714759) You should already have received notification emails for all of the O revisions, and so the following emails describe only the N revisions from the common base, B. Any revisions marked "omit" are not gone; other references still refer to them. Any revisions marked "discard" are gone forever. The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: CHANGELOG.md | 401 +++-- ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} | 77 ++-- cpp/cmake_modules/ThirdpartyToolchain.cmake| 22 +- cpp/src/arrow/compute/kernels/vector_sort_test.cc | 2 +- cpp/thirdparty/versions.txt| 4 +- .../apache-arrow-apt-source/debian/changelog | 2 +- .../yum/apache-arrow-release.spec.in | 2 +- .../linux-packages/apache-arrow/debian/changelog | 2 +- .../linux-packages/apache-arrow/yum/arrow.spec.in | 2 +- dev/tasks/tasks.yml| 8 + docker-compose.yml | 21 ++ docs/source/python/data.rst| 1 + docs/source/python/dataset.rst | 44 +-- docs/source/python/pandas.rst | 177 - python/pyarrow/feather.py | 4 +- python/pyarrow/parquet.py | 4 +- python/pyarrow/table.pxi | 8 +- python/pyarrow/tests/test_table.py | 17 + r/Makefile | 2 +- 19 files changed, 513 insertions(+), 287 deletions(-) copy ci/docker/{fedora-33-cpp.dockerfile => ubuntu-20.04-cpp-minimal.dockerfile} (56%)
svn commit: r50529 - in /dev/arrow/apache-arrow-6.0.0-rc1: ./ apache-arrow-6.0.0.tar.gz apache-arrow-6.0.0.tar.gz.asc apache-arrow-6.0.0.tar.gz.sha256 apache-arrow-6.0.0.tar.gz.sha512
Author: kszucs Date: Wed Oct 20 15:25:30 2021 New Revision: 50529 Log: Apache Arrow 6.0.0 RC1 Added: dev/arrow/apache-arrow-6.0.0-rc1/ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz (with props) dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz == Binary file - no diff available. Propchange: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz -- svn:mime-type = application/octet-stream Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc == --- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc (added) +++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc Wed Oct 20 15:25:30 2021 @@ -0,0 +1,16 @@ +-BEGIN PGP SIGNATURE- + +iQIzBAABCAAdFiEEJl+Aq4T+AxJ+FPARJbzKUiDYQHkFAmFwNNUACgkQJbzKUiDY +QHnZJhAA+HFVu1WH8gZYkhvqwBSxxiDalgV5b6EKimtN6Xr9OcRuV3oXwrl8wohM +ZT+8uwgIqzTmykGOwIiKlgqCVWYJNo/CuWq89Y+Fu9JRbrU+VeYqNJe/bSQrYErj +vi7LeX5bQVRpNyBBSOIMAmteP+k6omV1uVU4BD9U00HfgrwPbAyK09m6bfwg1l/U +b20mtUNC/fMFnQUwlMDPjsXZ9UQ+34eLNnNxQHhQNAo0FwH5RMcEhFLJaQOsr6vn +hBHIIYs5WKqlhIIFRByApWxlUr0Nb8SpqL38FgU2ePRwfusTJ4SXRDAb1Xus28Ot +JViyaqiOSg9vkWdYBwLC4CM45X51aSG3BT7WF5urFErHaTwepHl8uD6IeQ96y2sN +/yZw94S8T83P6E69zwvWMW2F2VT550UqabUrPQp9gUK7wezjUz6/P3kszGGqKawp +JHD2cFNn2zG9Gw9oFO4z7gOjA58Nbbb2Na9R5C4EcmDhf1287gcP0r+7b7tFabib +LuWf5Vcz7KxIi6B7TAJ8pGLj8M2+yJCyAhBe6UsfJdr1speKcxdbbYuh3Gkc7zfs +oN6jTeO6WjHkfMiDBJ2/BmuJzspEchz3wZ2RBgzTq95KY0Yq9dhG1cX5g3eEhBr6 +9ugw/TDgNPKNylHohcyXm8IU5XQHJeZ1gnI62HUG5W2DJd1XRrs= +=+dIx +-END PGP SIGNATURE- Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 == --- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 (added) +++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 Wed Oct 20 15:25:30 2021 @@ -0,0 +1 @@ 
+40e25a16b61c103ccd6bab4a7d0bd15a0568e565865402c37a1e3a782c5ff6b2 apache-arrow-6.0.0.tar.gz Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 == --- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 (added) +++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 Wed Oct 20 15:25:30 2021 @@ -0,0 +1 @@ +f2eca86a4fae2dd8e28195c5b6af0edb7c52422870af80ad58e85fc7834dd9216792e8f8f4985c9a84c6888ab051f2c4c71e2433ef14d6a7bc06fc5b1184fd35 apache-arrow-6.0.0.tar.gz
[arrow] branch release-6.0.0 updated (c8f882c -> 4ac62d5)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch release-6.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git. from c8f882c ARROW-14386: [Packaging][Java] Ensure using installed devtoolset version add 98b0e99 ARROW-13784: [Python] Table.from_arrays should raise an error when array is empty but names is not add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow conversions add 29892ba ARROW-14004: [Python][Doc] Document nullable dtypes handling and usage of types_mapper in to_pandas conversion add eb3c1bd ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include path add 65e69ac ARROW-13317: [Python] Improve documentation on what 'use_threads' does in 'read_feather' add ae943c3 ARROW-13436: [Python][Doc] Clarify what should be expected if read_table is passed an empty list of columns add b2e1285 MINOR: [R] Fix sed for cross-OS compatibility add 77da17b MINOR: [Docs] Uncomment the docs about file visitor when writing Datasets (#11480) add 80ecf33 ARROW-14397: [C++] Fix valgrind error in test utility add 4ac62d5 ARROW-14393: [C++] GTest linking errors during the source release verification No new revisions were added by this update. Summary of changes: ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} | 77 +++-- cpp/cmake_modules/ThirdpartyToolchain.cmake| 22 ++- cpp/src/arrow/compute/kernels/vector_sort_test.cc | 2 +- cpp/thirdparty/versions.txt| 4 +- dev/tasks/tasks.yml| 8 + docker-compose.yml | 21 +++ docs/source/python/data.rst| 1 + docs/source/python/dataset.rst | 44 ++--- docs/source/python/pandas.rst | 177 - python/pyarrow/feather.py | 4 +- python/pyarrow/parquet.py | 4 +- python/pyarrow/table.pxi | 8 +- python/pyarrow/tests/test_table.py | 17 ++ r/Makefile | 2 +- 14 files changed, 304 insertions(+), 87 deletions(-) copy ci/docker/{fedora-33-cpp.dockerfile => ubuntu-20.04-cpp-minimal.dockerfile} (56%)
[arrow] branch master updated (80ecf33 -> 4ac62d5)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 80ecf33 ARROW-14397: [C++] Fix valgrind error in test utility add 4ac62d5 ARROW-14393: [C++] GTest linking errors during the source release verification No new revisions were added by this update. Summary of changes: cpp/thirdparty/versions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
[arrow] branch master updated (77da17b -> 80ecf33)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 77da17b MINOR: [Docs] Uncomment the docs about file visitor when writing Datasets (#11480) add 80ecf33 ARROW-14397: [C++] Fix valgrind error in test utility No new revisions were added by this update. Summary of changes: cpp/src/arrow/compute/kernels/vector_sort_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
[arrow] branch master updated (b2e1285 -> 77da17b)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from b2e1285 MINOR: [R] Fix sed for cross-OS compatibility add 77da17b MINOR: [Docs] Uncomment the docs about file visitor when writing Datasets (#11480) No new revisions were added by this update. Summary of changes: docs/source/python/dataset.rst | 44 +- 1 file changed, 22 insertions(+), 22 deletions(-)
[arrow] branch master updated (ae943c3 -> b2e1285)
This is an automated email from the ASF dual-hosted git repository. jonkeane pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from ae943c3 ARROW-13436: [Python][Doc] Clarify what should be expected if read_table is passed an empty list of columns add b2e1285 MINOR: [R] Fix sed for cross-OS compatibility No new revisions were added by this update. Summary of changes: r/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
[arrow] branch master updated (65e69ac -> ae943c3)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 65e69ac ARROW-13317: [Python] Improve documentation on what 'use_threads' does in 'read_feather' add ae943c3 ARROW-13436: [Python][Doc] Clarify what should be expected if read_table is passed an empty list of columns No new revisions were added by this update. Summary of changes: python/pyarrow/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
[arrow] branch master updated (eb3c1bd -> 65e69ac)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from eb3c1bd ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include path add 65e69ac ARROW-13317: [Python] Improve documentation on what 'use_threads' does in 'read_feather' No new revisions were added by this update. Summary of changes: python/pyarrow/feather.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
[arrow] branch master updated (29892ba -> eb3c1bd)
This is an automated email from the ASF dual-hosted git repository. kszucs pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 29892ba ARROW-14004: [Python][Doc] Document nullable dtypes handling and usage of types_mapper in to_pandas conversion add eb3c1bd ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include path No new revisions were added by this update. Summary of changes: ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} | 77 -- cpp/cmake_modules/ThirdpartyToolchain.cmake| 22 ++- dev/tasks/tasks.yml| 8 +++ docker-compose.yml | 21 ++ 4 files changed, 74 insertions(+), 54 deletions(-) copy ci/docker/{fedora-33-cpp.dockerfile => ubuntu-20.04-cpp-minimal.dockerfile} (56%)
[arrow] branch master updated (54bacf9d -> 29892ba)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow conversions add 29892ba ARROW-14004: [Python][Doc] Document nullable dtypes handling and usage of types_mapper in to_pandas conversion No new revisions were added by this update. Summary of changes: docs/source/python/pandas.rst | 117 ++ 1 file changed, 117 insertions(+)
[arrow] branch master updated (98b0e99 -> 54bacf9d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from 98b0e99 ARROW-13784: [Python] Table.from_arrays should raise an error when array is empty but names is not add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow conversions No new revisions were added by this update. Summary of changes: docs/source/python/data.rst | 1 + docs/source/python/pandas.rst | 60 --- 2 files changed, 58 insertions(+), 3 deletions(-)
[arrow-datafusion] branch master updated: Multiple files per partitions for CSV Avro Json (#1138)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git The following commit(s) were added to refs/heads/master by this push: new 4b577f3 Multiple files per partitions for CSV Avro Json (#1138) 4b577f3 is described below commit 4b577f374ce0922f61608be25d8d91c59a65c2cf Author: rdettai AuthorDate: Wed Oct 20 12:54:14 2021 +0200 Multiple files per partitions for CSV Avro Json (#1138) * [feat] multi file partition for csv avro json * [fix] typos * [fix] aliasing closure trait --- ballista/rust/core/proto/ballista.proto| 4 +- .../core/src/serde/physical_plan/from_proto.rs | 12 +- .../rust/core/src/serde/physical_plan/to_proto.rs | 22 +- datafusion/src/datasource/file_format/avro.rs | 3 +- datafusion/src/datasource/file_format/csv.rs | 3 +- datafusion/src/datasource/file_format/json.rs | 3 +- datafusion/src/physical_plan/file_format/avro.rs | 158 datafusion/src/physical_plan/file_format/csv.rs| 146 .../src/physical_plan/file_format/file_stream.rs | 265 + datafusion/src/physical_plan/file_format/json.rs | 138 +++ datafusion/src/physical_plan/file_format/mod.rs| 1 + datafusion/src/test/mod.rs | 19 +- 12 files changed, 436 insertions(+), 338 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 49b65cf..338c5a6 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -615,7 +615,7 @@ message ParquetScanExecNode { } message CsvScanExecNode { - repeated PartitionedFile files = 1; + repeated FileGroup file_groups = 1; Schema schema = 2; bool has_header = 3; uint32 batch_size = 4; @@ -626,7 +626,7 @@ message CsvScanExecNode { } message AvroScanExecNode { - repeated PartitionedFile files = 1; + repeated FileGroup file_groups = 1; Schema schema = 2; uint32 batch_size = 4; repeated uint32 projection = 6; diff --git 
a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 75dd915..dce354a 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -125,10 +125,10 @@ impl TryInto> for ::PhysicalPlanNode { Ok(Arc::new(CsvExec::new( Arc::new(LocalFileSystem {}), -scan.files +scan.file_groups .iter() -.map(|f| f.into()) -.collect::>(), +.map(|p| p.into()) +.collect::>>(), statistics, schema, scan.has_header, @@ -165,10 +165,10 @@ impl TryInto> for ::PhysicalPlanNode { Ok(Arc::new(AvroExec::new( Arc::new(LocalFileSystem {}), -scan.files +scan.file_groups .iter() -.map(|f| f.into()) -.collect::>(), +.map(|p| p.into()) +.collect::>>(), statistics, schema, Some(projection), diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index e5e6347..52285ee 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -244,14 +244,15 @@ impl TryInto for Arc { ))), }) } else if let Some(exec) = plan.downcast_ref::() { +let file_groups = exec +.file_groups() +.iter() +.map(|p| p.as_slice().into()) +.collect(); Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::CsvScan( protobuf::CsvScanExecNode { -files: exec -.files() -.iter() -.map(|f| f.into()) -.collect::>(), +file_groups, statistics: Some((()).into()), limit: exec .limit() @@ -301,14 +302,15 @@ impl TryInto for Arc { )), }) } else if let Some(exec) = plan.downcast_ref::() { +let file_groups = exec +.file_groups() +.iter() +.map(|p| p.as_slice().into()) +.collect(); Ok(protobuf::PhysicalPlanNode
[arrow-datafusion] branch master updated: Add ScalarValue support for arbitrary list elements (#1142)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git The following commit(s) were added to refs/heads/master by this push: new 8bab267 Add ScalarValue support for arbitrary list elements (#1142) 8bab267 is described below commit 8bab2676e070ee3cfc55d2ec0877c724d4daf568 Author: Jon Mease AuthorDate: Wed Oct 20 06:45:48 2021 -0400 Add ScalarValue support for arbitrary list elements (#1142) * clippy fix * clippy fixes * Rebase and review cleanup --- datafusion/src/scalar.rs | 349 +-- 1 file changed, 339 insertions(+), 10 deletions(-) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 31c48a6..00586bf 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -777,6 +777,11 @@ impl ScalarValue { DataType::List(fields) if fields.data_type() == ::LargeUtf8 => { build_array_list_string!(LargeStringBuilder, LargeUtf8) } +DataType::List(_) => { +// Fallback case handling homogeneous lists with any ScalarValue element type +let list_array = ScalarValue::iter_to_array_list(scalars, _type)?; +Arc::new(list_array) +} DataType::Struct(fields) => { // Initialize a Vector to store the ScalarValues for each column let mut columns: Vec> = @@ -833,6 +838,73 @@ impl ScalarValue { Ok(array) } +fn iter_to_array_list( +scalars: impl IntoIterator, +data_type: , +) -> Result> { +let mut offsets = Int32Array::builder(0); +if let Err(err) = offsets.append_value(0) { +return Err(DataFusionError::ArrowError(err)); +} + +let mut elements: Vec = Vec::new(); +let mut valid = BooleanBufferBuilder::new(0); +let mut flat_len = 0i32; +for scalar in scalars { +if let ScalarValue::List(values, _) = scalar { +match values { +Some(values) => { +let element_array = ScalarValue::iter_to_array(*values)?; + +// Add new offset index +flat_len += element_array.len() as i32; +if let Err(err) = offsets.append_value(flat_len) { +return 
Err(DataFusionError::ArrowError(err)); +} + +elements.push(element_array); + +// Element is valid +valid.append(true); +} +None => { +// Repeat previous offset index +if let Err(err) = offsets.append_value(flat_len) { +return Err(DataFusionError::ArrowError(err)); +} + +// Element is null +valid.append(false); +} +} +} else { +return Err(DataFusionError::Internal(format!( +"Expected ScalarValue::List element. Received {:?}", +scalar +))); +} +} + +// Concatenate element arrays to create single flat array +let element_arrays: Vec< Array> = +elements.iter().map(|a| a.as_ref()).collect(); +let flat_array = match arrow::compute::concat(_arrays) { +Ok(flat_array) => flat_array, +Err(err) => return Err(DataFusionError::ArrowError(err)), +}; + +// Build ListArray using ArrayData so we can specify a flat inner array, and offset indices +let offsets_array = offsets.finish(); +let array_data = ArrayDataBuilder::new(data_type.clone()) +.len(offsets_array.len() - 1) +.null_bit_buffer(valid.finish()) +.add_buffer(offsets_array.data().buffers()[0].clone()) +.add_child_data(flat_array.data().clone()); + +let list_array = ListArray::from(array_data.build()?); +Ok(list_array) +} + /// Converts a scalar value into an array of `size` rows. pub fn to_array_of_size(, size: usize) -> ArrayRef { match self { @@ -945,7 +1017,15 @@ impl ScalarValue { ::LargeUtf8 => { build_list!(LargeStringBuilder, LargeUtf8, values, size) } -dt => panic!("Unexpected DataType for list {:?}", dt), +_ => ScalarValue::iter_to_array_list( +repeat(self.clone()).take(size), +::List(Box::new(Field::new( +"item", +data_type.as_ref().clone(), +true, +))), +) +
[GitHub] [arrow-site] alamb commented on pull request #154: Update datafusion website, add datafusion roadmap, etc
alamb commented on pull request #154: URL: https://github.com/apache/arrow-site/pull/154#issuecomment-947540721 Thanks @houqp ! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[arrow-site] branch asf-site updated: add datafusion roadmap (#154)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/arrow-site.git The following commit(s) were added to refs/heads/asf-site by this push: new 201ebf0 add datafusion roadmap (#154) 201ebf0 is described below commit 201ebf0d7238c89c3749ee228bc8583008678970 Author: QP Hou AuthorDate: Wed Oct 20 03:36:09 2021 -0700 add datafusion roadmap (#154) --- datafusion/_modules/index.html | 7 + datafusion/_sources/cli/index.rst.txt | 6 +- datafusion/_sources/community/communication.md.txt | 17 ++ datafusion/_sources/index.rst.txt | 1 + datafusion/_sources/specification/roadmap.md.txt | 99 .../_sources/user-guide/example-usage.md.txt | 4 - datafusion/_sources/user-guide/library.md.txt | 5 +- datafusion/cli/index.html | 16 +- datafusion/community/communication.html| 24 ++ datafusion/genindex.html | 32 ++- datafusion/index.html | 8 + datafusion/objects.inv | Bin 1632 -> 1694 bytes datafusion/py-modindex.html| 5 + datafusion/python/api/dataframe.html | 10 + datafusion/python/api/execution_context.html | 10 + datafusion/python/api/expression.html | 10 + datafusion/python/api/functions.html | 10 + .../python/generated/datafusion.DataFrame.html | 10 + .../generated/datafusion.ExecutionContext.html | 10 + .../python/generated/datafusion.Expression.html| 10 + .../python/generated/datafusion.functions.html | 44 datafusion/search.html | 5 + datafusion/searchindex.js | 2 +- .../roadmap.html} | 250 ++--- datafusion/user-guide/example-usage.html | 14 +- datafusion/user-guide/library.html | 15 +- 26 files changed, 525 insertions(+), 99 deletions(-) diff --git a/datafusion/_modules/index.html b/datafusion/_modules/index.html index 7bcb0b0..0233d14 100644 --- a/datafusion/_modules/index.html +++ b/datafusion/_modules/index.html @@ -319,6 +319,11 @@ + + Roadmap + + + DataFusion’s Invariants @@ -392,6 +397,8 @@ All modules for which code is available builtins +datafusion.functions +functions 
diff --git a/datafusion/_sources/cli/index.rst.txt b/datafusion/_sources/cli/index.rst.txt index 93ae173..2b91430 100644 --- a/datafusion/_sources/cli/index.rst.txt +++ b/datafusion/_sources/cli/index.rst.txt @@ -53,7 +53,7 @@ Usage .. code-block:: bash -DataFusion 5.0.0-SNAPSHOT +DataFusion 5.1.0-SNAPSHOT DataFusion is an in-memory query engine that uses Apache Arrow as the memory model. It supports executing SQL queries against CSV and Parquet files as well as querying directly against in-memory data. @@ -68,8 +68,10 @@ Usage OPTIONS: -c, --batch-size The batch size of each query, or use DataFusion default -p, --data-path Path to your data, default to current directory --f, --file Execute commands from file, then exit +-f, --file ... Execute commands from file(s), then exit --format Output format [default: table] [possible values: csv, tsv, table, json, ndjson] +--host Ballista scheduler host +--port Ballista scheduler port Type `exit` or `quit` to exit the CLI. diff --git a/datafusion/_sources/community/communication.md.txt b/datafusion/_sources/community/communication.md.txt index bbf07a1..7d8e58a 100644 --- a/datafusion/_sources/community/communication.md.txt +++ b/datafusion/_sources/community/communication.md.txt @@ -52,6 +52,23 @@ server ([invite link](https://discord.gg/Qw5gKqHxUM)) in case you are not able to join the Slack workspace. If you need an invite to the Slack workspace, you can also ask for one in our Discord server. +### Sync up Zoom calls + +We have biweekly sync calls every other Thursdays at 16:00 UTC +(starting September 30, 2021) on Zoom [Meeting Link](https://influxdata.zoom.us/j/94666921249) + +The [agenda](https://docs.google.com/document/d/1atCVnoff5SR4eM4Lwf2M1BBJTY6g3_HUNR6qswYJW_U/edit) +is available if you would like to add a topic for discussion or see what is planned. + +The goals of these calls are: + +1. Help "put a face to the name" of some of other contributors we are working with +2. 
Discuss / synchronize on the goals and major initiatives from different stakeholders to identify areas where more alignment is needed + +No decisions are made on the call and anything of substance will be discussed on this mailing list or in github issues / google docs. + +We
[GitHub] [arrow-site] alamb merged pull request #154: Update datafusion website, add datafusion roadmap, etc
alamb merged pull request #154: URL: https://github.com/apache/arrow-site/pull/154 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[arrow-rs] branch master updated: Use kernel utility for parsing timestamps in csv reader. (#832)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git The following commit(s) were added to refs/heads/master by this push: new 4cfe621 Use kernel utility for parsing timestamps in csv reader. (#832) 4cfe621 is described below commit 4cfe621902eaed08abc609013b85b3d0a42de3c8 Author: Navin AuthorDate: Wed Oct 20 21:34:50 2021 +1100 Use kernel utility for parsing timestamps in csv reader. (#832) * Use kernel utility for parsing timestamps in csvs. * Remove cruft. * Cleanup. * Lint. * Remove erroneous stringify. --- arrow/src/csv/reader.rs | 101 ++-- 1 file changed, 97 insertions(+), 4 deletions(-) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 7bd12eb..b68ac1b 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -52,6 +52,7 @@ use std::sync::Arc; use crate::array::{ ArrayRef, BooleanArray, DictionaryArray, PrimitiveArray, StringArray, }; +use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; @@ -694,8 +695,7 @@ impl Parser for TimestampNanosecondType { fn parse(string: ) -> Option { match Self::DATA_TYPE { DataType::Timestamp(TimeUnit::Nanosecond, None) => { -let date_time = string.parse::().ok()?; -Self::Native::from_i64(date_time.timestamp_nanos()) +string_to_timestamp_nanos(string).ok() } _ => None, } @@ -706,8 +706,8 @@ impl Parser for TimestampMicrosecondType { fn parse(string: ) -> Option { match Self::DATA_TYPE { DataType::Timestamp(TimeUnit::Microsecond, None) => { -let date_time = string.parse::().ok()?; -Self::Native::from_i64(date_time.timestamp_nanos() / 1000) +let nanos = string_to_timestamp_nanos(string).ok(); +nanos.map(|x| x / 1000) } _ => None, } @@ -979,6 +979,7 @@ mod tests { use crate::array::*; use crate::compute::cast; use crate::datatypes::Field; +use chrono::{prelude::*, 
LocalResult}; #[test] fn test_csv() { @@ -1371,6 +1372,98 @@ mod tests { ); } +/// Interprets a naive_datetime (with no explicit timezone offset) +/// using the local timezone and returns the timestamp in UTC (0 +/// offset) +fn naive_datetime_to_timestamp(naive_datetime: ) -> i64 { +// Note: Use chrono APIs that are different than +// naive_datetime_to_timestamp to compute the utc offset to +// try and double check the logic +let utc_offset_secs = match Local.offset_from_local_datetime(naive_datetime) { +LocalResult::Single(local_offset) => { +local_offset.fix().local_minus_utc() as i64 +} +_ => panic!( +"Unexpected failure converting {} to local datetime", +naive_datetime +), +}; +let utc_offset_nanos = utc_offset_secs * 1_000_000_000; +naive_datetime.timestamp_nanos() - utc_offset_nanos +} + +#[test] +fn test_parse_timestamp_microseconds() { +assert_eq!( + parse_item::("1970-01-01T00:00:00Z").unwrap(), +0 +); +let naive_datetime = NaiveDateTime::new( +NaiveDate::from_ymd(2018, 11, 13), +NaiveTime::from_hms_nano(17, 11, 10, 0), +); +assert_eq!( + parse_item::("2018-11-13T17:11:10").unwrap(), +naive_datetime_to_timestamp(_datetime) / 1000 +); +assert_eq!( +parse_item::("2018-11-13 17:11:10").unwrap(), +naive_datetime_to_timestamp(_datetime) / 1000 +); +let naive_datetime = NaiveDateTime::new( +NaiveDate::from_ymd(2018, 11, 13), +NaiveTime::from_hms_nano(17, 11, 10, 1100), +); +assert_eq!( + parse_item::("2018-11-13T17:11:10.011").unwrap(), +naive_datetime_to_timestamp(_datetime) / 1000 +); +let naive_datetime = NaiveDateTime::new( +NaiveDate::from_ymd(1900, 2, 28), +NaiveTime::from_hms_nano(12, 34, 56, 0), +); +assert_eq!( + parse_item::("1900-02-28T12:34:56").unwrap(), +naive_datetime_to_timestamp(_datetime) / 1000 +); +} + +#[test] +fn test_parse_timestamp_nanoseconds() { +assert_eq!( + parse_item::("1970-01-01T00:00:00Z").unwrap(), +0 +); +let naive_datetime = NaiveDateTime::new( +NaiveDate::from_ymd(2018, 11, 13), +NaiveTime::from_hms_nano(17, 11, 10, 0), +); +
[arrow-datafusion] branch master updated: Dependency upgrades (#1148)
This is an automated email from the ASF dual-hosted git repository. dheres pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git The following commit(s) were added to refs/heads/master by this push: new 27044a0 Dependency upgrades (#1148) 27044a0 is described below commit 27044a05ad6172d78eee19d0ade600971bfb26b3 Author: Daniël Heres AuthorDate: Wed Oct 20 10:45:52 2021 +0200 Dependency upgrades (#1148) --- ballista/rust/executor/Cargo.toml | 2 +- ballista/rust/scheduler/Cargo.toml | 2 +- benchmarks/Cargo.toml | 2 +- datafusion-cli/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 231b05f..5d26f39 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -36,7 +36,7 @@ async-trait = "0.1.36" ballista-core = { path = "../core", version = "0.6.0" } configure_me = "0.4.0" datafusion = { path = "../../../datafusion", version = "5.1.0" } -env_logger = "0.8" +env_logger = "0.9" futures = "0.3" log = "0.4" snmalloc-rs = {version = "0.2", features= ["cache-friendly"], optional = true} diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index c840772..10664f1 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -36,7 +36,7 @@ ballista-core = { path = "../core", version = "0.6.0" } clap = "2" configure_me = "0.4.0" datafusion = { path = "../../../datafusion", version = "5.1.0" } -env_logger = "0.8" +env_logger = "0.9" etcd-client = { version = "0.7", optional = true } futures = "0.3" http = "0.2" diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 19a67a5..ce882f6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -36,7 +36,7 @@ ballista = { path = "../ballista/rust/client" } structopt = { version = "0.3", default-features = false } tokio = { version = "^1.0", features = ["macros", "rt", 
"rt-multi-thread"] } futures = "0.3" -env_logger = "^0.8" +env_logger = "0.9" mimalloc = { version = "0.1", optional = true, default-features = false } snmalloc-rs = {version = "0.2", optional = true, features= ["cache-friendly"] } diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 22196ca..8b7ac19 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -28,7 +28,7 @@ repository = "https://github.com/apache/arrow-datafusion; [dependencies] clap = "2.33" -rustyline = "8.0" +rustyline = "9.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion", version = "5.1.0" } arrow = { version = "6.0.0" }
[arrow] branch master updated (c8f882c -> 98b0e99)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git. from c8f882c ARROW-14386: [Packaging][Java] Ensure using installed devtoolset version add 98b0e99 ARROW-13784: [Python] Table.from_arrays should raise an error when array is empty but names is not No new revisions were added by this update. Summary of changes: python/pyarrow/table.pxi | 8 ++-- python/pyarrow/tests/test_table.py | 17 + 2 files changed, 23 insertions(+), 2 deletions(-)