[arrow-datafusion] branch master updated: python `lit` function to support bool and byte vec (#1152)

2021-10-20 Thread jiayuliu
This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new f455357  python `lit` function to support bool and byte vec (#1152)
f455357 is described below

commit f455357bf159763a19312bab2c9238bc101792e0
Author: Jiayu Liu 
AuthorDate: Thu Oct 21 13:04:41 2021 +0800

python `lit` function to support bool and byte vec (#1152)

* python lit function to support bool and byte vec

* update per comment
---
 datafusion/src/logical_plan/expr.rs |  12 ++
 python/Cargo.lock   | 224 
 python/src/functions.rs |  48 +---
 python/tests/test_functions.py  |  11 +-
 4 files changed, 156 insertions(+), 139 deletions(-)

diff --git a/datafusion/src/logical_plan/expr.rs 
b/datafusion/src/logical_plan/expr.rs
index d50d533..011068d 100644
--- a/datafusion/src/logical_plan/expr.rs
+++ b/datafusion/src/logical_plan/expr.rs
@@ -1407,6 +1407,18 @@ impl Literal for String {
 }
 }
 
+impl Literal for Vec<u8> {
+fn lit(&self) -> Expr {
+Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())))
+}
+}
+
+impl Literal for &[u8] {
+fn lit(&self) -> Expr {
+Expr::Literal(ScalarValue::Binary(Some((*self).to_owned())))
+}
+}
+
 impl Literal for ScalarValue {
 fn lit(&self) -> Expr {
 Expr::Literal(self.clone())
diff --git a/python/Cargo.lock b/python/Cargo.lock
index 6daefea..6ae2702 100644
--- a/python/Cargo.lock
+++ b/python/Cargo.lock
@@ -51,24 +51,19 @@ checksum = 
"a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544"
 
 [[package]]
 name = "arrayvec"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
-
-[[package]]
-name = "arrayvec"
 version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "be4dc07131ffa69b8072d35f5007352af944213cde02545e2103680baed38fcd"
 
 [[package]]
 name = "arrow"
-version = "5.3.0"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4091f84cacfdbd5238e161d314e585820269926f79e05d184db8a2898782d44"
+checksum = "337e668497751234149fd607f5cb41a6ae7b286b6329589126fe67f0ac55d637"
 dependencies = [
  "bitflags",
  "chrono",
+ "comfy-table",
  "csv",
  "flatbuffers",
  "hex",
@@ -77,7 +72,6 @@ dependencies = [
  "lexical-core",
  "multiversion",
  "num",
- "prettytable-rs",
  "rand 0.8.4",
  "regex",
  "serde",
@@ -97,17 +91,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
-
-[[package]]
 name = "autocfg"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -137,24 +120,13 @@ dependencies = [
 ]
 
 [[package]]
-name = "blake2b_simd"
-version = "0.5.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587"
-dependencies = [
- "arrayref",
- "arrayvec 0.5.2",
- "constant_time_eq",
-]
-
-[[package]]
 name = "blake3"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dcd555c66291d5f836dbb6883b48660ece810fe25a31f3bdfb911945dff2691f"
 dependencies = [
  "arrayref",
- "arrayvec 0.7.1",
+ "arrayvec",
  "cc",
  "cfg-if",
  "constant_time_eq",
@@ -238,6 +210,17 @@ dependencies = [
 ]
 
 [[package]]
+name = "comfy-table"
+version = "4.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11e95a3e867422fd8d04049041f5671f94d53c32a9dcd82e2be268714942f3f3"
+dependencies = [
+ "strum",
+ "strum_macros",
+ "unicode-width",
+]
+
+[[package]]
 name = "constant_time_eq"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -262,16 +245,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "crossbeam-utils"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
-dependencies = [
- "cfg-if",
- "lazy_static",
-]
-
-[[package]]
 name = "crypto-mac"
 version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -355,23 +328,6 @@ dependencies = [
 ]
 
 [[package]]
-name = "dirs"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fd78930633bd1c6e35c4b42b1df7b0cbc6bc191146e512bb3bedf243fcc3901"
-dependencies = [
- "libc",
- "redox_users",
- "winapi",
-]
-
-[[package]]
-name = "encode_unicode"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = 

[arrow] branch master updated (6f478d0 -> f893fa2)

2021-10-20 Thread kou
This is an automated email from the ASF dual-hosted git repository.

kou pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 6f478d0  ARROW-14396: [R][Doc] Remove relic note in write_dataset that 
columns cannot be renamed
 add f893fa2  ARROW-14401: [C++] Fix bundled crc32c's include path

No new revisions were added by this update.

Summary of changes:
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


[arrow] branch master updated (9841dc8 -> 6f478d0)

2021-10-20 Thread npr
This is an automated email from the ASF dual-hosted git repository.

npr pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 9841dc8  ARROW-14400: [Go] Equals and ApproxEquals for Tables and 
Chunked Arrays
 add 6f478d0  ARROW-14396: [R][Doc] Remove relic note in write_dataset that 
columns cannot be renamed

No new revisions were added by this update.

Summary of changes:
 r/R/dataset-write.R| 8 +++-
 r/man/arrow-package.Rd | 6 +-
 r/man/write_dataset.Rd | 8 +++-
 3 files changed, 7 insertions(+), 15 deletions(-)


[arrow] branch master updated: ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays

2021-10-20 Thread zeroshade
This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
 new 9841dc8  ARROW-14400: [Go] Equals and ApproxEquals for Tables and 
Chunked Arrays
9841dc8 is described below

commit 9841dc864c62115d68706750b86ced5e142804f6
Author: Matthew Topol 
AuthorDate: Wed Oct 20 15:07:27 2021 -0400

ARROW-14400: [Go] Equals and ApproxEquals for Tables and Chunked Arrays

Closes #11488 from zeroshade/extra-comparisons

Authored-by: Matthew Topol 
Signed-off-by: Matthew Topol 
---
 go/arrow/array/compare.go  | 140 +
 go/arrow/array/compare_test.go |  81 
 2 files changed, 221 insertions(+)

diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index c4ee046..89c81ef 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -65,6 +65,136 @@ func RecordApproxEqual(left, right Record, opts 
...EqualOption) bool {
return true
 }
 
+// helper function to evaluate a function on two chunked object having 
possibly different
+// chunk layouts. the function passed in will be called for each corresponding 
slice of the
+// two chunked arrays and if the function returns false it will end the loop 
early.
+func chunkedBinaryApply(left, right *Chunked, fn func(left Interface, lbeg, 
lend int64, right Interface, rbeg, rend int64) bool) {
+   var (
+   pos   int64
+   length int64 = int64(left.length)
+   leftIdx, rightIdx int
+   leftPos, rightPos int64
+   )
+
+   for pos < length {
+   var cleft, cright Interface
+   for {
+   cleft, cright = left.Chunk(leftIdx), 
right.Chunk(rightIdx)
+   if leftPos == int64(cleft.Len()) {
+   leftPos = 0
+   leftIdx++
+   continue
+   }
+   if rightPos == int64(cright.Len()) {
+   rightPos = 0
+   rightIdx++
+   continue
+   }
+   break
+   }
+
+   sz := int64(min(cleft.Len()-int(leftPos), 
cright.Len()-int(rightPos)))
+   pos += sz
+   if !fn(cleft, leftPos, leftPos+sz, cright, rightPos, 
rightPos+sz) {
+   return
+   }
+
+   leftPos += sz
+   rightPos += sz
+   }
+}
+
+// ChunkedEqual reports whether two chunked arrays are equal regardless of 
their chunkings
+func ChunkedEqual(left, right *Chunked) bool {
+   switch {
+   case left == right:
+   return true
+   case left.length != right.length:
+   return false
+   case left.nulls != right.nulls:
+   return false
+   case !arrow.TypeEqual(left.dtype, right.dtype):
+   return false
+   }
+
+   var isequal bool
+   chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, 
right Interface, rbeg, rend int64) bool {
+   isequal = ArraySliceEqual(left, lbeg, lend, right, rbeg, rend)
+   return isequal
+   })
+
+   return isequal
+}
+
+// ChunkedApproxEqual reports whether two chunked arrays are approximately 
equal regardless of their chunkings
+// for non-floating point arrays, this is equivalent to ChunkedEqual
+func ChunkedApproxEqual(left, right *Chunked, opts ...EqualOption) bool {
+   switch {
+   case left == right:
+   return true
+   case left.length != right.length:
+   return false
+   case left.nulls != right.nulls:
+   return false
+   case !arrow.TypeEqual(left.dtype, right.dtype):
+   return false
+   }
+
+   var isequal bool
+   chunkedBinaryApply(left, right, func(left Interface, lbeg, lend int64, 
right Interface, rbeg, rend int64) bool {
+   isequal = ArraySliceApproxEqual(left, lbeg, lend, right, rbeg, 
rend, opts...)
+   return isequal
+   })
+
+   return isequal
+}
+
+// TableEqual returns if the two tables have the same data in the same schema
+func TableEqual(left, right Table) bool {
+   switch {
+   case left.NumCols() != right.NumCols():
+   return false
+   case left.NumRows() != right.NumRows():
+   return false
+   }
+
+   for i := 0; int64(i) < left.NumCols(); i++ {
+   lc := left.Column(i)
+   rc := right.Column(i)
+   if !lc.field.Equal(rc.field) {
+   return false
+   }
+
+   if !ChunkedEqual(lc.data, rc.data) {
+   return false
+  

[arrow] branch master updated (4ac62d5 -> a8e1c81)

2021-10-20 Thread emkornfield
This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 4ac62d5  ARROW-14393: [C++] GTest linking errors during the source 
release verification
 add a8e1c81  ARROW-14345: [C++] Implement streaming reads

No new revisions were added by this update.

Summary of changes:
 cpp/src/arrow/filesystem/gcsfs.cc  |  63 +++--
 cpp/src/arrow/filesystem/gcsfs_internal.cc |   4 +-
 cpp/src/arrow/filesystem/gcsfs_test.cc | 109 +++--
 3 files changed, 162 insertions(+), 14 deletions(-)


[arrow] 01/03: [Release] Update CHANGELOG.md for 6.0.0

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to annotated tag apache-arrow-6.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit fa2e00a1591c9f991a4aff339dadb5affd335eb1
Author: Krisztián Szűcs 
AuthorDate: Wed Oct 20 17:21:42 2021 +0200

[Release] Update CHANGELOG.md for 6.0.0
---
 CHANGELOG.md | 601 +++
 1 file changed, 601 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ed715d..0fbb382 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,605 @@
 
+# Apache Arrow 6.0.0 (2021-10-20)
+
+## Bug Fixes
+
+* [ARROW-6946](https://issues.apache.org/jira/browse/ARROW-6946) - [Go] Run 
tests with assert build tag enabled
+* [ARROW-8452](https://issues.apache.org/jira/browse/ARROW-8452) - 
[Go][Integration] Go JSON producer generates incorrect nullable flag for nested 
types
+* [ARROW-8453](https://issues.apache.org/jira/browse/ARROW-8453) - 
[Integration][Go] Recursive nested types unsupported
+* [ARROW-8999](https://issues.apache.org/jira/browse/ARROW-8999) - 
[Python][C++] Non-deterministic segfault in "AMD64 MacOS 10.15 Python 3.7" build
+* [ARROW-9948](https://issues.apache.org/jira/browse/ARROW-9948) - [C++] 
Decimal128 does not check scale range when rescaling; can cause buffer overflow
+* [ARROW-10213](https://issues.apache.org/jira/browse/ARROW-10213) - [C++] 
Temporal cast from timestamp to date rounds instead of extracting date component
+* [ARROW-10373](https://issues.apache.org/jira/browse/ARROW-10373) - [C++] 
ValidateFull() does not validate null\_count
+* [ARROW-10773](https://issues.apache.org/jira/browse/ARROW-10773) - [R] 
parallel as.data.frame.Table hangs indefinitely on Windows
+* [ARROW-11518](https://issues.apache.org/jira/browse/ARROW-11518) - [C++] 
[Parquet] Parquet reader crashes when reading boolean columns
+* [ARROW-11579](https://issues.apache.org/jira/browse/ARROW-11579) - [R] 
read\_feather hanging on Windows
+* [ARROW-11634](https://issues.apache.org/jira/browse/ARROW-11634) - 
[C++][Parquet] Parquet statistics (min/max) for dictionary columns are incorrect
+* [ARROW-11729](https://issues.apache.org/jira/browse/ARROW-11729) - [R] Add 
examples to the datasets documentation
+* [ARROW-12011](https://issues.apache.org/jira/browse/ARROW-12011) - 
[C++][Python] Crashes and incorrect results when converting large integers to 
dates
+* [ARROW-12072](https://issues.apache.org/jira/browse/ARROW-12072) - 
(ipc.Writer).Write panics with \`arrow/array: index out of range\`
+* [ARROW-12087](https://issues.apache.org/jira/browse/ARROW-12087) - [C++] Fix 
sort\_indices, array\_sort\_indices timestamp support discrepancy
+* [ARROW-12513](https://issues.apache.org/jira/browse/ARROW-12513) - 
[C++][Parquet] Parquet Writer always puts null\_count=0 in Parquet statistics 
for dictionary-encoded array with nulls
+* [ARROW-12540](https://issues.apache.org/jira/browse/ARROW-12540) - [C++] 
Implement cast from date32[day] to utf8 
+* [ARROW-12636](https://issues.apache.org/jira/browse/ARROW-12636) - [JS] ESM 
Tree-Shaking produces broken code
+* [ARROW-12700](https://issues.apache.org/jira/browse/ARROW-12700) - [R] 
Read/Write\_feather stuck forever after bad write, R, Win32
+* [ARROW-12837](https://issues.apache.org/jira/browse/ARROW-12837) - [C++] 
Array::ToString() segfaults with null buffer.
+* [ARROW-13134](https://issues.apache.org/jira/browse/ARROW-13134) - [C++] 
SSL-related arrow-s3fs-test failures with aws-sdk-cpp 1.9.51
+* [ARROW-13151](https://issues.apache.org/jira/browse/ARROW-13151) - [Python] 
Unable to read single child field of struct column from Parquet
+* [ARROW-13198](https://issues.apache.org/jira/browse/ARROW-13198) - 
[C++][Dataset] Async scanner occasionally segfaulting in CI
+* [ARROW-13293](https://issues.apache.org/jira/browse/ARROW-13293) - [R] 
open\_dataset followed by collect hangs (while compute works)
+* [ARROW-13304](https://issues.apache.org/jira/browse/ARROW-13304) - [C++] 
Unable to install nightly on Ubuntu 21.04 due to day of week options
+* [ARROW-13336](https://issues.apache.org/jira/browse/ARROW-13336) - 
[Doc][Python] make clean doesn't clean up "generated" documentation
+* [ARROW-13422](https://issues.apache.org/jira/browse/ARROW-13422) - [R] 
Clarify README about S3 support on Windows
+* [ARROW-13424](https://issues.apache.org/jira/browse/ARROW-13424) - [C++] 
conda-forge benchmark library rejected
+* [ARROW-13425](https://issues.apache.org/jira/browse/ARROW-13425) - 
[Dev][Archery] Archery import pandas which imports pyarrow
+* [ARROW-13429](https://issues.apache.org/jira/browse/ARROW-13429) - 
[C++][Gandiva] Gandiva crashes when compiling If-else expression with binary 
type
+* [ARROW-13430](https://issues.apache.org/jira/browse/ARROW-13430) - 
[Integration][Go] Various errors in the integration tests
+* [ARROW-13436](https://issues.apache.org/jira/browse/ARROW-13436) - 
[Python][Doc] Clarify what should be 

[arrow] 03/03: [Release] Update versions for 6.0.0

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to annotated tag apache-arrow-6.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 5a6f5919e68d1c0f2672c9c711858e5cbe0944cf
Author: Krisztián Szűcs 
AuthorDate: Wed Oct 20 17:21:50 2021 +0200

[Release] Update versions for 6.0.0
---
 c_glib/meson.build   | 2 +-
 ci/scripts/PKGBUILD  | 2 +-
 cpp/CMakeLists.txt   | 2 +-
 cpp/vcpkg.json   | 2 +-
 csharp/Directory.Build.props | 2 +-
 dev/tasks/homebrew-formulae/apache-arrow.rb  | 2 +-
 dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb | 2 +-
 java/adapter/avro/pom.xml| 2 +-
 java/adapter/jdbc/pom.xml| 2 +-
 java/adapter/orc/pom.xml | 2 +-
 java/algorithm/pom.xml   | 2 +-
 java/c/pom.xml   | 2 +-
 java/compression/pom.xml | 2 +-
 java/dataset/pom.xml | 2 +-
 java/flight/flight-core/pom.xml  | 2 +-
 java/flight/flight-grpc/pom.xml  | 2 +-
 java/format/pom.xml  | 2 +-
 java/gandiva/pom.xml | 2 +-
 java/memory/memory-core/pom.xml  | 2 +-
 java/memory/memory-netty/pom.xml | 2 +-
 java/memory/memory-unsafe/pom.xml| 2 +-
 java/memory/pom.xml  | 2 +-
 java/performance/pom.xml | 4 ++--
 java/plasma/pom.xml  | 2 +-
 java/pom.xml | 2 +-
 java/tools/pom.xml   | 2 +-
 java/vector/pom.xml  | 2 +-
 js/package.json  | 2 +-
 matlab/CMakeLists.txt| 2 +-
 python/setup.py  | 2 +-
 r/DESCRIPTION| 2 +-
 r/NEWS.md| 2 +-
 ruby/red-arrow-cuda/lib/arrow-cuda/version.rb| 2 +-
 ruby/red-arrow-dataset/lib/arrow-dataset/version.rb  | 2 +-
 ruby/red-arrow-flight/lib/arrow-flight/version.rb| 2 +-
 ruby/red-arrow/lib/arrow/version.rb  | 2 +-
 ruby/red-gandiva/lib/gandiva/version.rb  | 2 +-
 ruby/red-parquet/lib/parquet/version.rb  | 2 +-
 ruby/red-plasma/lib/plasma/version.rb| 2 +-
 39 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/c_glib/meson.build b/c_glib/meson.build
index 0e090c9..fb92181 100644
--- a/c_glib/meson.build
+++ b/c_glib/meson.build
@@ -23,7 +23,7 @@ project('arrow-glib', 'c', 'cpp',
   'cpp_std=c++11',
 ])
 
-version = '6.0.0-SNAPSHOT'
+version = '6.0.0'
 if version.endswith('-SNAPSHOT')
   version_numbers = version.split('-')[0].split('.')
   version_tag = version.split('-')[1]
diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD
index 246b679..ff84553 100644
--- a/ci/scripts/PKGBUILD
+++ b/ci/scripts/PKGBUILD
@@ -18,7 +18,7 @@
 _realname=arrow
 pkgbase=mingw-w64-${_realname}
 pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}"
-pkgver=5.0.0.9000
+pkgver=6.0.0
 pkgrel=8000
 pkgdesc="Apache Arrow is a cross-language development platform for in-memory 
data (mingw-w64)"
 arch=("any")
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c787794..ba8c36e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -47,7 +47,7 @@ if(POLICY CMP0074)
   cmake_policy(SET CMP0074 NEW)
 endif()
 
-set(ARROW_VERSION "6.0.0-SNAPSHOT")
+set(ARROW_VERSION "6.0.0")
 
 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION 
"${ARROW_VERSION}")
 
diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json
index 723f3a4..35907e1 100644
--- a/cpp/vcpkg.json
+++ b/cpp/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "arrow",
-  "version-string": "6.0.0-SNAPSHOT",
+  "version-string": "6.0.0",
   "dependencies": [
 "abseil",
 {
diff --git a/csharp/Directory.Build.props b/csharp/Directory.Build.props
index c42ff55..893208c 100644
--- a/csharp/Directory.Build.props
+++ b/csharp/Directory.Build.props
@@ -29,7 +29,7 @@
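     <Product>Apache Arrow library</Product>
     <Copyright>Copyright 2016-2019 The Apache Software Foundation</Copyright>
     <Company>The Apache Software Foundation</Company>
-    <Version>6.0.0-SNAPSHOT</Version>
+    <Version>6.0.0</Version>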
   
 
   
diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb 
b/dev/tasks/homebrew-formulae/apache-arrow.rb
index ca3f831..7023b5d 100644
--- a/dev/tasks/homebrew-formulae/apache-arrow.rb
+++ b/dev/tasks/homebrew-formulae/apache-arrow.rb
@@ -1,7 +1,7 @@
 class ApacheArrow < Formula
   desc "Columnar in-memory analytics layer designed to accelerate big data"
   homepage "https://arrow.apache.org/"
-  url 

[arrow] 02/03: [Release] Update .deb/.rpm changelogs for 6.0.0

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to annotated tag apache-arrow-6.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit b03fce8921b873ed64c4b7f577ade7d54ec8a42b
Author: Krisztián Szűcs 
AuthorDate: Wed Oct 20 17:21:45 2021 +0200

[Release] Update .deb/.rpm changelogs for 6.0.0
---
 dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog   | 5 +
 .../apache-arrow-release/yum/apache-arrow-release.spec.in   | 3 +++
 dev/tasks/linux-packages/apache-arrow/debian/changelog  | 6 ++
 dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 3 +++
 4 files changed, 17 insertions(+)

diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog 
b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog
index e69de29..d22ad35 100644
--- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog
+++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog
@@ -0,0 +1,5 @@
+apache-arrow-apt-source (6.0.0-1) unstable; urgency=low
+
+  * New upstream release.
+
+ -- Krisztián Szűcs   Wed, 20 Oct 2021 15:21:43 
-
diff --git 
a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in
 
b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in
index 8edb453..071ec26 100644
--- 
a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in
+++ 
b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in
@@ -91,6 +91,9 @@ else
 fi
 
 %changelog
+* Wed Oct 20 2021 Krisztián Szűcs  - 6.0.0-1
+- New upstream release.
+
 * Mon Jan 18 2021 Krisztián Szűcs  - 3.0.0-1
 - New upstream release.
 
diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog 
b/dev/tasks/linux-packages/apache-arrow/debian/changelog
index 2adfc44..1df8b4a 100644
--- a/dev/tasks/linux-packages/apache-arrow/debian/changelog
+++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog
@@ -1,3 +1,9 @@
+apache-arrow (6.0.0-1) unstable; urgency=low
+
+  * New upstream release.
+
+ -- Krisztián Szűcs   Wed, 20 Oct 2021 15:21:43 
-
+
 apache-arrow (3.0.0-1) unstable; urgency=low
 
   * New upstream release.
diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in 
b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
index 629fbb1..de65184 100644
--- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
+++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in
@@ -834,6 +834,9 @@ Documentation for Apache Parquet GLib.
 %{_datadir}/gtk-doc/html/parquet-glib/
 
 %changelog
+* Wed Oct 20 2021 Krisztián Szűcs  - 6.0.0-1
+- New upstream release.
+
 * Mon Jan 18 2021 Krisztián Szűcs  - 3.0.0-1
 - New upstream release.
 


[arrow] annotated tag apache-arrow-6.0.0 updated (0f8bbdb -> 6714759)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to annotated tag apache-arrow-6.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git.


*** WARNING: tag apache-arrow-6.0.0 was modified! ***

from 0f8bbdb  (tag)
  to 6714759  (tag)
 tagging 5a6f5919e68d1c0f2672c9c711858e5cbe0944cf (commit)
 replaces apache-arrow-3.0.0
  by Krisztián Szűcs
  on Wed Oct 20 17:21:51 2021 +0200

- Log -
[Release] Apache Arrow Release 6.0.0
---

omit 421dd6d  [Release] Update versions for 6.0.0
omit 4b59043  [Release] Update .deb/.rpm changelogs for 6.0.0
omit d04c2c0  [Release] Update CHANGELOG.md for 6.0.0
 add 98b0e99  ARROW-13784: [Python] Table.from_arrays should raise an error 
when array is empty but names is not
 add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow 
conversions
 add 29892ba  ARROW-14004: [Python][Doc] Document nullable dtypes handling 
and usage of types_mapper in to_pandas conversion
 add eb3c1bd  ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include 
path
 add 65e69ac  ARROW-13317: [Python] Improve documentation on what 
'use_threads' does in 'read_feather'
 add ae943c3  ARROW-13436: [Python][Doc] Clarify what should be expected if 
read_table is passed an empty list of columns
 add b2e1285  MINOR: [R] Fix sed for cross-OS compatibility
 add 77da17b  MINOR: [Docs] Uncomment the docs about file visitor when 
writing Datasets (#11480)
 add 80ecf33  ARROW-14397: [C++] Fix valgrind error in test utility
 add 4ac62d5  ARROW-14393: [C++] GTest linking errors during the source 
release verification
 new fa2e00a  [Release] Update CHANGELOG.md for 6.0.0
 new b03fce8  [Release] Update .deb/.rpm changelogs for 6.0.0
 new 5a6f591  [Release] Update versions for 6.0.0

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
annotated tag are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (0f8bbdb)
            \
             N -- N -- N   refs/tags/apache-arrow-6.0.0 (6714759)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGELOG.md   | 401 +++--
 ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} |  77 ++--
 cpp/cmake_modules/ThirdpartyToolchain.cmake|  22 +-
 cpp/src/arrow/compute/kernels/vector_sort_test.cc  |   2 +-
 cpp/thirdparty/versions.txt|   4 +-
 .../apache-arrow-apt-source/debian/changelog   |   2 +-
 .../yum/apache-arrow-release.spec.in   |   2 +-
 .../linux-packages/apache-arrow/debian/changelog   |   2 +-
 .../linux-packages/apache-arrow/yum/arrow.spec.in  |   2 +-
 dev/tasks/tasks.yml|   8 +
 docker-compose.yml |  21 ++
 docs/source/python/data.rst|   1 +
 docs/source/python/dataset.rst |  44 +--
 docs/source/python/pandas.rst  | 177 -
 python/pyarrow/feather.py  |   4 +-
 python/pyarrow/parquet.py  |   4 +-
 python/pyarrow/table.pxi   |   8 +-
 python/pyarrow/tests/test_table.py |  17 +
 r/Makefile |   2 +-
 19 files changed, 513 insertions(+), 287 deletions(-)
 copy ci/docker/{fedora-33-cpp.dockerfile => 
ubuntu-20.04-cpp-minimal.dockerfile} (56%)


svn commit: r50529 - in /dev/arrow/apache-arrow-6.0.0-rc1: ./ apache-arrow-6.0.0.tar.gz apache-arrow-6.0.0.tar.gz.asc apache-arrow-6.0.0.tar.gz.sha256 apache-arrow-6.0.0.tar.gz.sha512

2021-10-20 Thread kszucs
Author: kszucs
Date: Wed Oct 20 15:25:30 2021
New Revision: 50529

Log:
Apache Arrow 6.0.0 RC1

Added:
dev/arrow/apache-arrow-6.0.0-rc1/
dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz   (with props)
dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc
dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256
dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512

Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz
==
Binary file - no diff available.

Propchange: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz
--
svn:mime-type = application/octet-stream

Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc
==
--- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc (added)
+++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.asc Wed Oct 20 
15:25:30 2021
@@ -0,0 +1,16 @@
+-----BEGIN PGP SIGNATURE-----
+
+iQIzBAABCAAdFiEEJl+Aq4T+AxJ+FPARJbzKUiDYQHkFAmFwNNUACgkQJbzKUiDY
+QHnZJhAA+HFVu1WH8gZYkhvqwBSxxiDalgV5b6EKimtN6Xr9OcRuV3oXwrl8wohM
+ZT+8uwgIqzTmykGOwIiKlgqCVWYJNo/CuWq89Y+Fu9JRbrU+VeYqNJe/bSQrYErj
+vi7LeX5bQVRpNyBBSOIMAmteP+k6omV1uVU4BD9U00HfgrwPbAyK09m6bfwg1l/U
+b20mtUNC/fMFnQUwlMDPjsXZ9UQ+34eLNnNxQHhQNAo0FwH5RMcEhFLJaQOsr6vn
+hBHIIYs5WKqlhIIFRByApWxlUr0Nb8SpqL38FgU2ePRwfusTJ4SXRDAb1Xus28Ot
+JViyaqiOSg9vkWdYBwLC4CM45X51aSG3BT7WF5urFErHaTwepHl8uD6IeQ96y2sN
+/yZw94S8T83P6E69zwvWMW2F2VT550UqabUrPQp9gUK7wezjUz6/P3kszGGqKawp
+JHD2cFNn2zG9Gw9oFO4z7gOjA58Nbbb2Na9R5C4EcmDhf1287gcP0r+7b7tFabib
+LuWf5Vcz7KxIi6B7TAJ8pGLj8M2+yJCyAhBe6UsfJdr1speKcxdbbYuh3Gkc7zfs
+oN6jTeO6WjHkfMiDBJ2/BmuJzspEchz3wZ2RBgzTq95KY0Yq9dhG1cX5g3eEhBr6
+9ugw/TDgNPKNylHohcyXm8IU5XQHJeZ1gnI62HUG5W2DJd1XRrs=
+=+dIx
+-----END PGP SIGNATURE-----

Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256
==
--- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 (added)
+++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha256 Wed Oct 
20 15:25:30 2021
@@ -0,0 +1 @@
+40e25a16b61c103ccd6bab4a7d0bd15a0568e565865402c37a1e3a782c5ff6b2  
apache-arrow-6.0.0.tar.gz

Added: dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512
==
--- dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 (added)
+++ dev/arrow/apache-arrow-6.0.0-rc1/apache-arrow-6.0.0.tar.gz.sha512 Wed Oct 
20 15:25:30 2021
@@ -0,0 +1 @@
+f2eca86a4fae2dd8e28195c5b6af0edb7c52422870af80ad58e85fc7834dd9216792e8f8f4985c9a84c6888ab051f2c4c71e2433ef14d6a7bc06fc5b1184fd35
  apache-arrow-6.0.0.tar.gz




[arrow] branch release-6.0.0 updated (c8f882c -> 4ac62d5)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch release-6.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from c8f882c  ARROW-14386: [Packaging][Java] Ensure using installed 
devtoolset version
 add 98b0e99  ARROW-13784: [Python] Table.from_arrays should raise an error 
when array is empty but names is not
 add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow 
conversions
 add 29892ba  ARROW-14004: [Python][Doc] Document nullable dtypes handling 
and usage of types_mapper in to_pandas conversion
 add eb3c1bd  ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include 
path
 add 65e69ac  ARROW-13317: [Python] Improve documentation on what 
'use_threads' does in 'read_feather'
 add ae943c3  ARROW-13436: [Python][Doc] Clarify what should be expected if 
read_table is passed an empty list of columns
 add b2e1285  MINOR: [R] Fix sed for cross-OS compatibility
 add 77da17b  MINOR: [Docs] Uncomment the docs about file visitor when 
writing Datasets (#11480)
 add 80ecf33  ARROW-14397: [C++] Fix valgrind error in test utility
 add 4ac62d5  ARROW-14393: [C++] GTest linking errors during the source 
release verification

No new revisions were added by this update.

Summary of changes:
 ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} |  77 +++--
 cpp/cmake_modules/ThirdpartyToolchain.cmake|  22 ++-
 cpp/src/arrow/compute/kernels/vector_sort_test.cc  |   2 +-
 cpp/thirdparty/versions.txt|   4 +-
 dev/tasks/tasks.yml|   8 +
 docker-compose.yml |  21 +++
 docs/source/python/data.rst|   1 +
 docs/source/python/dataset.rst |  44 ++---
 docs/source/python/pandas.rst  | 177 -
 python/pyarrow/feather.py  |   4 +-
 python/pyarrow/parquet.py  |   4 +-
 python/pyarrow/table.pxi   |   8 +-
 python/pyarrow/tests/test_table.py |  17 ++
 r/Makefile |   2 +-
 14 files changed, 304 insertions(+), 87 deletions(-)
 copy ci/docker/{fedora-33-cpp.dockerfile => 
ubuntu-20.04-cpp-minimal.dockerfile} (56%)


[arrow] branch master updated (80ecf33 -> 4ac62d5)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 80ecf33  ARROW-14397: [C++] Fix valgrind error in test utility
 add 4ac62d5  ARROW-14393: [C++] GTest linking errors during the source 
release verification

No new revisions were added by this update.

Summary of changes:
 cpp/thirdparty/versions.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)


[arrow] branch master updated (77da17b -> 80ecf33)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 77da17b  MINOR: [Docs] Uncomment the docs about file visitor when 
writing Datasets (#11480)
 add 80ecf33  ARROW-14397: [C++] Fix valgrind error in test utility

No new revisions were added by this update.

Summary of changes:
 cpp/src/arrow/compute/kernels/vector_sort_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


[arrow] branch master updated (b2e1285 -> 77da17b)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from b2e1285  MINOR: [R] Fix sed for cross-OS compatibility
 add 77da17b  MINOR: [Docs] Uncomment the docs about file visitor when 
writing Datasets (#11480)

No new revisions were added by this update.

Summary of changes:
 docs/source/python/dataset.rst | 44 +-
 1 file changed, 22 insertions(+), 22 deletions(-)


[arrow] branch master updated (ae943c3 -> b2e1285)

2021-10-20 Thread jonkeane
This is an automated email from the ASF dual-hosted git repository.

jonkeane pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from ae943c3  ARROW-13436: [Python][Doc] Clarify what should be expected if 
read_table is passed an empty list of columns
 add b2e1285  MINOR: [R] Fix sed for cross-OS compatibility

No new revisions were added by this update.

Summary of changes:
 r/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


[arrow] branch master updated (65e69ac -> ae943c3)

2021-10-20 Thread jorisvandenbossche
This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 65e69ac  ARROW-13317: [Python] Improve documentation on what 
'use_threads' does in 'read_feather'
 add ae943c3  ARROW-13436: [Python][Doc] Clarify what should be expected if 
read_table is passed an empty list of columns

No new revisions were added by this update.

Summary of changes:
 python/pyarrow/parquet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)


[arrow] branch master updated (eb3c1bd -> 65e69ac)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from eb3c1bd  ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include 
path
 add 65e69ac  ARROW-13317: [Python] Improve documentation on what 
'use_threads' does in 'read_feather'

No new revisions were added by this update.

Summary of changes:
 python/pyarrow/feather.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)


[arrow] branch master updated (29892ba -> eb3c1bd)

2021-10-20 Thread kszucs
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 29892ba  ARROW-14004: [Python][Doc] Document nullable dtypes handling 
and usage of types_mapper in to_pandas conversion
 add eb3c1bd  ARROW-14392: [C++] Bundled gRPC misses bundled Abseil include 
path

No new revisions were added by this update.

Summary of changes:
 ...kerfile => ubuntu-20.04-cpp-minimal.dockerfile} | 77 --
 cpp/cmake_modules/ThirdpartyToolchain.cmake| 22 ++-
 dev/tasks/tasks.yml|  8 +++
 docker-compose.yml | 21 ++
 4 files changed, 74 insertions(+), 54 deletions(-)
 copy ci/docker/{fedora-33-cpp.dockerfile => 
ubuntu-20.04-cpp-minimal.dockerfile} (56%)


[arrow] branch master updated (54bacf9d -> 29892ba)

2021-10-20 Thread jorisvandenbossche
This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow 
conversions
 add 29892ba  ARROW-14004: [Python][Doc] Document nullable dtypes handling 
and usage of types_mapper in to_pandas conversion

No new revisions were added by this update.

Summary of changes:
 docs/source/python/pandas.rst | 117 ++
 1 file changed, 117 insertions(+)


[arrow] branch master updated (98b0e99 -> 54bacf9d)

2021-10-20 Thread jorisvandenbossche
This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from 98b0e99  ARROW-13784: [Python] Table.from_arrays should raise an error 
when array is empty but names is not
 add 54bacf9d ARROW-10094: [Python][Doc] Document missing pandas to arrow 
conversions

No new revisions were added by this update.

Summary of changes:
 docs/source/python/data.rst   |  1 +
 docs/source/python/pandas.rst | 60 ---
 2 files changed, 58 insertions(+), 3 deletions(-)


[arrow-datafusion] branch master updated: Multiple files per partitions for CSV Avro Json (#1138)

2021-10-20 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 4b577f3  Multiple files per partitions for CSV Avro Json (#1138)
4b577f3 is described below

commit 4b577f374ce0922f61608be25d8d91c59a65c2cf
Author: rdettai 
AuthorDate: Wed Oct 20 12:54:14 2021 +0200

Multiple files per partitions for CSV Avro Json (#1138)

* [feat] multi file partition for csv avro json

* [fix] typos

* [fix] aliasing closure trait
---
 ballista/rust/core/proto/ballista.proto|   4 +-
 .../core/src/serde/physical_plan/from_proto.rs |  12 +-
 .../rust/core/src/serde/physical_plan/to_proto.rs  |  22 +-
 datafusion/src/datasource/file_format/avro.rs  |   3 +-
 datafusion/src/datasource/file_format/csv.rs   |   3 +-
 datafusion/src/datasource/file_format/json.rs  |   3 +-
 datafusion/src/physical_plan/file_format/avro.rs   | 158 
 datafusion/src/physical_plan/file_format/csv.rs| 146 
 .../src/physical_plan/file_format/file_stream.rs   | 265 +
 datafusion/src/physical_plan/file_format/json.rs   | 138 +++
 datafusion/src/physical_plan/file_format/mod.rs|   1 +
 datafusion/src/test/mod.rs |  19 +-
 12 files changed, 436 insertions(+), 338 deletions(-)

diff --git a/ballista/rust/core/proto/ballista.proto 
b/ballista/rust/core/proto/ballista.proto
index 49b65cf..338c5a6 100644
--- a/ballista/rust/core/proto/ballista.proto
+++ b/ballista/rust/core/proto/ballista.proto
@@ -615,7 +615,7 @@ message ParquetScanExecNode {
 }
 
 message CsvScanExecNode {
-  repeated PartitionedFile files = 1;
+  repeated FileGroup file_groups = 1;
   Schema schema = 2;
   bool has_header = 3;
   uint32 batch_size = 4;
@@ -626,7 +626,7 @@ message CsvScanExecNode {
 }
 
 message AvroScanExecNode {
-  repeated PartitionedFile files = 1;
+  repeated FileGroup file_groups = 1;
   Schema schema = 2;
   uint32 batch_size = 4;
   repeated uint32 projection = 6;
diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs 
b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
index 75dd915..dce354a 100644
--- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs
+++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
@@ -125,10 +125,10 @@ impl TryInto> for 
::PhysicalPlanNode {
 
 Ok(Arc::new(CsvExec::new(
 Arc::new(LocalFileSystem {}),
-scan.files
+scan.file_groups
 .iter()
-.map(|f| f.into())
-.collect::>(),
+.map(|p| p.into())
+.collect::>>(),
 statistics,
 schema,
 scan.has_header,
@@ -165,10 +165,10 @@ impl TryInto> for 
::PhysicalPlanNode {
 
 Ok(Arc::new(AvroExec::new(
 Arc::new(LocalFileSystem {}),
-scan.files
+scan.file_groups
 .iter()
-.map(|f| f.into())
-.collect::>(),
+.map(|p| p.into())
+.collect::>>(),
 statistics,
 schema,
 Some(projection),
diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs 
b/ballista/rust/core/src/serde/physical_plan/to_proto.rs
index e5e6347..52285ee 100644
--- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs
+++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs
@@ -244,14 +244,15 @@ impl TryInto for Arc {
 ))),
 })
 } else if let Some(exec) = plan.downcast_ref::() {
+let file_groups = exec
+.file_groups()
+.iter()
+.map(|p| p.as_slice().into())
+.collect();
 Ok(protobuf::PhysicalPlanNode {
 physical_plan_type: Some(PhysicalPlanType::CsvScan(
 protobuf::CsvScanExecNode {
-files: exec
-.files()
-.iter()
-.map(|f| f.into())
-.collect::>(),
+file_groups,
 statistics: Some((()).into()),
 limit: exec
 .limit()
@@ -301,14 +302,15 @@ impl TryInto for Arc {
 )),
 })
 } else if let Some(exec) = plan.downcast_ref::() {
+let file_groups = exec
+.file_groups()
+.iter()
+.map(|p| p.as_slice().into())
+.collect();
 Ok(protobuf::PhysicalPlanNode 

[arrow-datafusion] branch master updated: Add ScalarValue support for arbitrary list elements (#1142)

2021-10-20 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 8bab267  Add ScalarValue support for arbitrary list elements (#1142)
8bab267 is described below

commit 8bab2676e070ee3cfc55d2ec0877c724d4daf568
Author: Jon Mease 
AuthorDate: Wed Oct 20 06:45:48 2021 -0400

Add ScalarValue support for arbitrary list elements (#1142)

* clippy fix

* clippy fixes

* Rebase and review cleanup
---
 datafusion/src/scalar.rs | 349 +--
 1 file changed, 339 insertions(+), 10 deletions(-)

diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs
index 31c48a6..00586bf 100644
--- a/datafusion/src/scalar.rs
+++ b/datafusion/src/scalar.rs
@@ -777,6 +777,11 @@ impl ScalarValue {
 DataType::List(fields) if fields.data_type() == 
::LargeUtf8 => {
 build_array_list_string!(LargeStringBuilder, LargeUtf8)
 }
+DataType::List(_) => {
+// Fallback case handling homogeneous lists with any 
ScalarValue element type
+let list_array = ScalarValue::iter_to_array_list(scalars, 
_type)?;
+Arc::new(list_array)
+}
 DataType::Struct(fields) => {
 // Initialize a Vector to store the ScalarValues for each 
column
 let mut columns: Vec> =
@@ -833,6 +838,73 @@ impl ScalarValue {
 Ok(array)
 }
 
+fn iter_to_array_list(
+scalars: impl IntoIterator,
+data_type: ,
+) -> Result> {
+let mut offsets = Int32Array::builder(0);
+if let Err(err) = offsets.append_value(0) {
+return Err(DataFusionError::ArrowError(err));
+}
+
+let mut elements: Vec = Vec::new();
+let mut valid = BooleanBufferBuilder::new(0);
+let mut flat_len = 0i32;
+for scalar in scalars {
+if let ScalarValue::List(values, _) = scalar {
+match values {
+Some(values) => {
+let element_array = 
ScalarValue::iter_to_array(*values)?;
+
+// Add new offset index
+flat_len += element_array.len() as i32;
+if let Err(err) = offsets.append_value(flat_len) {
+return Err(DataFusionError::ArrowError(err));
+}
+
+elements.push(element_array);
+
+// Element is valid
+valid.append(true);
+}
+None => {
+// Repeat previous offset index
+if let Err(err) = offsets.append_value(flat_len) {
+return Err(DataFusionError::ArrowError(err));
+}
+
+// Element is null
+valid.append(false);
+}
+}
+} else {
+return Err(DataFusionError::Internal(format!(
+"Expected ScalarValue::List element. Received {:?}",
+scalar
+)));
+}
+}
+
+// Concatenate element arrays to create single flat array
+let element_arrays: Vec< Array> =
+elements.iter().map(|a| a.as_ref()).collect();
+let flat_array = match arrow::compute::concat(_arrays) {
+Ok(flat_array) => flat_array,
+Err(err) => return Err(DataFusionError::ArrowError(err)),
+};
+
+// Build ListArray using ArrayData so we can specify a flat inner 
array, and offset indices
+let offsets_array = offsets.finish();
+let array_data = ArrayDataBuilder::new(data_type.clone())
+.len(offsets_array.len() - 1)
+.null_bit_buffer(valid.finish())
+.add_buffer(offsets_array.data().buffers()[0].clone())
+.add_child_data(flat_array.data().clone());
+
+let list_array = ListArray::from(array_data.build()?);
+Ok(list_array)
+}
+
 /// Converts a scalar value into an array of `size` rows.
 pub fn to_array_of_size(, size: usize) -> ArrayRef {
 match self {
@@ -945,7 +1017,15 @@ impl ScalarValue {
 ::LargeUtf8 => {
 build_list!(LargeStringBuilder, LargeUtf8, values, size)
 }
-dt => panic!("Unexpected DataType for list {:?}", dt),
+_ => ScalarValue::iter_to_array_list(
+repeat(self.clone()).take(size),
+::List(Box::new(Field::new(
+"item",
+data_type.as_ref().clone(),
+true,
+))),
+)
+ 

[GitHub] [arrow-site] alamb commented on pull request #154: Update datafusion website, add datafusion roadmap, etc

2021-10-20 Thread GitBox


alamb commented on pull request #154:
URL: https://github.com/apache/arrow-site/pull/154#issuecomment-947540721


   Thanks @houqp !


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[arrow-site] branch asf-site updated: add datafusion roadmap (#154)

2021-10-20 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/arrow-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 201ebf0  add datafusion roadmap (#154)
201ebf0 is described below

commit 201ebf0d7238c89c3749ee228bc8583008678970
Author: QP Hou 
AuthorDate: Wed Oct 20 03:36:09 2021 -0700

add datafusion roadmap (#154)
---
 datafusion/_modules/index.html |   7 +
 datafusion/_sources/cli/index.rst.txt  |   6 +-
 datafusion/_sources/community/communication.md.txt |  17 ++
 datafusion/_sources/index.rst.txt  |   1 +
 datafusion/_sources/specification/roadmap.md.txt   |  99 
 .../_sources/user-guide/example-usage.md.txt   |   4 -
 datafusion/_sources/user-guide/library.md.txt  |   5 +-
 datafusion/cli/index.html  |  16 +-
 datafusion/community/communication.html|  24 ++
 datafusion/genindex.html   |  32 ++-
 datafusion/index.html  |   8 +
 datafusion/objects.inv | Bin 1632 -> 1694 bytes
 datafusion/py-modindex.html|   5 +
 datafusion/python/api/dataframe.html   |  10 +
 datafusion/python/api/execution_context.html   |  10 +
 datafusion/python/api/expression.html  |  10 +
 datafusion/python/api/functions.html   |  10 +
 .../python/generated/datafusion.DataFrame.html |  10 +
 .../generated/datafusion.ExecutionContext.html |  10 +
 .../python/generated/datafusion.Expression.html|  10 +
 .../python/generated/datafusion.functions.html |  44 
 datafusion/search.html |   5 +
 datafusion/searchindex.js  |   2 +-
 .../roadmap.html}  | 250 ++---
 datafusion/user-guide/example-usage.html   |  14 +-
 datafusion/user-guide/library.html |  15 +-
 26 files changed, 525 insertions(+), 99 deletions(-)

diff --git a/datafusion/_modules/index.html b/datafusion/_modules/index.html
index 7bcb0b0..0233d14 100644
--- a/datafusion/_modules/index.html
+++ b/datafusion/_modules/index.html
@@ -319,6 +319,11 @@
 
 
  
+  
+   Roadmap
+  
+ 
+ 
   
DataFusion’s Invariants
   
@@ -392,6 +397,8 @@
 
   All modules for which code is available
 builtins
+datafusion.functions
+functions
 
 
   
diff --git a/datafusion/_sources/cli/index.rst.txt 
b/datafusion/_sources/cli/index.rst.txt
index 93ae173..2b91430 100644
--- a/datafusion/_sources/cli/index.rst.txt
+++ b/datafusion/_sources/cli/index.rst.txt
@@ -53,7 +53,7 @@ Usage
 
 .. code-block:: bash
 
-DataFusion 5.0.0-SNAPSHOT
+DataFusion 5.1.0-SNAPSHOT
 DataFusion is an in-memory query engine that uses Apache Arrow as the 
memory model. It supports executing SQL queries
 against CSV and Parquet files as well as querying directly against 
in-memory data.
 
@@ -68,8 +68,10 @@ Usage
 OPTIONS:
 -c, --batch-size The batch size of each query, or use 
DataFusion default
 -p, --data-path   Path to your data, default to current 
directory
--f, --file Execute commands from file, then exit
+-f, --file ... Execute commands from file(s), then 
exit
 --format Output format [default: table]  
[possible values: csv, tsv, table, json, ndjson]
+--host Ballista scheduler host
+--port Ballista scheduler port
 
 Type `exit` or `quit` to exit the CLI.
 
diff --git a/datafusion/_sources/community/communication.md.txt 
b/datafusion/_sources/community/communication.md.txt
index bbf07a1..7d8e58a 100644
--- a/datafusion/_sources/community/communication.md.txt
+++ b/datafusion/_sources/community/communication.md.txt
@@ -52,6 +52,23 @@ server ([invite link](https://discord.gg/Qw5gKqHxUM)) in 
case you are not able
 to join the Slack workspace. If you need an invite to the Slack workspace, you
 can also ask for one in our Discord server.
 
+### Sync up Zoom calls
+
+We have biweekly sync calls every other Thursday at 16:00 UTC
+(starting September 30, 2021) on Zoom [Meeting 
Link](https://influxdata.zoom.us/j/94666921249)
+
+The [agenda](https://docs.google.com/document/d/1atCVnoff5SR4eM4Lwf2M1BBJTY6g3_HUNR6qswYJW_U/edit)
+is available if you would like to add a topic for discussion or see what is 
planned.
+
+The goals of these calls are:
+
+1. Help "put a face to the name" of some of the other contributors we are working 
with
+2. Discuss / synchronize on the goals and major initiatives from different 
stakeholders to identify areas where more alignment is needed
+
+No decisions are made on the call and anything of substance will be discussed 
on this mailing list or in github issues / google docs.
+
+We 

[GitHub] [arrow-site] alamb merged pull request #154: Update datafusion website, add datafusion roadmap, etc

2021-10-20 Thread GitBox


alamb merged pull request #154:
URL: https://github.com/apache/arrow-site/pull/154


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[arrow-rs] branch master updated: Use kernel utility for parsing timestamps in csv reader. (#832)

2021-10-20 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
 new 4cfe621  Use kernel utility for parsing timestamps in csv reader. 
(#832)
4cfe621 is described below

commit 4cfe621902eaed08abc609013b85b3d0a42de3c8
Author: Navin 
AuthorDate: Wed Oct 20 21:34:50 2021 +1100

Use kernel utility for parsing timestamps in csv reader. (#832)

* Use kernel utility for parsing timestamps in csvs.

* Remove cruft.

* Cleanup.

* Lint.

* Remove erroneous stringify.
---
 arrow/src/csv/reader.rs | 101 ++--
 1 file changed, 97 insertions(+), 4 deletions(-)

diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs
index 7bd12eb..b68ac1b 100644
--- a/arrow/src/csv/reader.rs
+++ b/arrow/src/csv/reader.rs
@@ -52,6 +52,7 @@ use std::sync::Arc;
 use crate::array::{
 ArrayRef, BooleanArray, DictionaryArray, PrimitiveArray, StringArray,
 };
+use crate::compute::kernels::cast_utils::string_to_timestamp_nanos;
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 use crate::record_batch::RecordBatch;
@@ -694,8 +695,7 @@ impl Parser for TimestampNanosecondType {
 fn parse(string: ) -> Option {
 match Self::DATA_TYPE {
 DataType::Timestamp(TimeUnit::Nanosecond, None) => {
-let date_time = string.parse::().ok()?;
-Self::Native::from_i64(date_time.timestamp_nanos())
+string_to_timestamp_nanos(string).ok()
 }
 _ => None,
 }
@@ -706,8 +706,8 @@ impl Parser for TimestampMicrosecondType {
 fn parse(string: ) -> Option {
 match Self::DATA_TYPE {
 DataType::Timestamp(TimeUnit::Microsecond, None) => {
-let date_time = string.parse::().ok()?;
-Self::Native::from_i64(date_time.timestamp_nanos() / 1000)
+let nanos = string_to_timestamp_nanos(string).ok();
+nanos.map(|x| x / 1000)
 }
 _ => None,
 }
@@ -979,6 +979,7 @@ mod tests {
 use crate::array::*;
 use crate::compute::cast;
 use crate::datatypes::Field;
+use chrono::{prelude::*, LocalResult};
 
 #[test]
 fn test_csv() {
@@ -1371,6 +1372,98 @@ mod tests {
 );
 }
 
+/// Interprets a naive_datetime (with no explicit timezone offset)
+/// using the local timezone and returns the timestamp in UTC (0
+/// offset)
+fn naive_datetime_to_timestamp(naive_datetime: ) -> i64 {
+// Note: Use chrono APIs that are different than
+// naive_datetime_to_timestamp to compute the utc offset to
+// try and double check the logic
+let utc_offset_secs = match 
Local.offset_from_local_datetime(naive_datetime) {
+LocalResult::Single(local_offset) => {
+local_offset.fix().local_minus_utc() as i64
+}
+_ => panic!(
+"Unexpected failure converting {} to local datetime",
+naive_datetime
+),
+};
+let utc_offset_nanos = utc_offset_secs * 1_000_000_000;
+naive_datetime.timestamp_nanos() - utc_offset_nanos
+}
+
+#[test]
+fn test_parse_timestamp_microseconds() {
+assert_eq!(
+
parse_item::("1970-01-01T00:00:00Z").unwrap(),
+0
+);
+let naive_datetime = NaiveDateTime::new(
+NaiveDate::from_ymd(2018, 11, 13),
+NaiveTime::from_hms_nano(17, 11, 10, 0),
+);
+assert_eq!(
+
parse_item::("2018-11-13T17:11:10").unwrap(),
+naive_datetime_to_timestamp(_datetime) / 1000
+);
+assert_eq!(
+parse_item::("2018-11-13 
17:11:10").unwrap(),
+naive_datetime_to_timestamp(_datetime) / 1000
+);
+let naive_datetime = NaiveDateTime::new(
+NaiveDate::from_ymd(2018, 11, 13),
+NaiveTime::from_hms_nano(17, 11, 10, 1100),
+);
+assert_eq!(
+
parse_item::("2018-11-13T17:11:10.011").unwrap(),
+naive_datetime_to_timestamp(_datetime) / 1000
+);
+let naive_datetime = NaiveDateTime::new(
+NaiveDate::from_ymd(1900, 2, 28),
+NaiveTime::from_hms_nano(12, 34, 56, 0),
+);
+assert_eq!(
+
parse_item::("1900-02-28T12:34:56").unwrap(),
+naive_datetime_to_timestamp(_datetime) / 1000
+);
+}
+
+#[test]
+fn test_parse_timestamp_nanoseconds() {
+assert_eq!(
+
parse_item::("1970-01-01T00:00:00Z").unwrap(),
+0
+);
+let naive_datetime = NaiveDateTime::new(
+NaiveDate::from_ymd(2018, 11, 13),
+NaiveTime::from_hms_nano(17, 11, 10, 0),
+);
+  

[arrow-datafusion] branch master updated: Dependency upgrades (#1148)

2021-10-20 Thread dheres
This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
 new 27044a0  Dependency upgrades (#1148)
27044a0 is described below

commit 27044a05ad6172d78eee19d0ade600971bfb26b3
Author: Daniël Heres 
AuthorDate: Wed Oct 20 10:45:52 2021 +0200

Dependency upgrades (#1148)
---
 ballista/rust/executor/Cargo.toml  | 2 +-
 ballista/rust/scheduler/Cargo.toml | 2 +-
 benchmarks/Cargo.toml  | 2 +-
 datafusion-cli/Cargo.toml  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ballista/rust/executor/Cargo.toml 
b/ballista/rust/executor/Cargo.toml
index 231b05f..5d26f39 100644
--- a/ballista/rust/executor/Cargo.toml
+++ b/ballista/rust/executor/Cargo.toml
@@ -36,7 +36,7 @@ async-trait = "0.1.36"
 ballista-core = { path = "../core", version = "0.6.0" }
 configure_me = "0.4.0"
 datafusion = { path = "../../../datafusion", version = "5.1.0" }
-env_logger = "0.8"
+env_logger = "0.9"
 futures = "0.3"
 log = "0.4"
 snmalloc-rs = {version = "0.2", features= ["cache-friendly"], optional = true}
diff --git a/ballista/rust/scheduler/Cargo.toml 
b/ballista/rust/scheduler/Cargo.toml
index c840772..10664f1 100644
--- a/ballista/rust/scheduler/Cargo.toml
+++ b/ballista/rust/scheduler/Cargo.toml
@@ -36,7 +36,7 @@ ballista-core = { path = "../core", version = "0.6.0" }
 clap = "2"
 configure_me = "0.4.0"
 datafusion = { path = "../../../datafusion", version = "5.1.0" }
-env_logger = "0.8"
+env_logger = "0.9"
 etcd-client = { version = "0.7", optional = true }
 futures = "0.3"
 http = "0.2"
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 19a67a5..ce882f6 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -36,7 +36,7 @@ ballista = { path = "../ballista/rust/client" }
 structopt = { version = "0.3", default-features = false }
 tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread"] }
 futures = "0.3"
-env_logger = "^0.8"
+env_logger = "0.9"
 mimalloc = { version = "0.1", optional = true, default-features = false }
 snmalloc-rs = {version = "0.2", optional = true, features= ["cache-friendly"] }
 
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 22196ca..8b7ac19 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -28,7 +28,7 @@ repository = "https://github.com/apache/arrow-datafusion"
 
 [dependencies]
 clap = "2.33"
-rustyline = "8.0"
+rustyline = "9.0"
 tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", 
"sync"] }
 datafusion = { path = "../datafusion", version = "5.1.0" }
 arrow = { version = "6.0.0"  }


[arrow] branch master updated (c8f882c -> 98b0e99)

2021-10-20 Thread jorisvandenbossche
This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.


from c8f882c  ARROW-14386: [Packaging][Java] Ensure using installed 
devtoolset version
 add 98b0e99  ARROW-13784: [Python] Table.from_arrays should raise an error 
when array is empty but names is not

No new revisions were added by this update.

Summary of changes:
 python/pyarrow/table.pxi   |  8 ++--
 python/pyarrow/tests/test_table.py | 17 +
 2 files changed, 23 insertions(+), 2 deletions(-)