(arrow) branch main updated (01d2fa0d46 -> b51e997df7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 01d2fa0d46 GH-41307: [Java] Use org.apache:apache parent pom version 31 (#41772) add b51e997df7 GH-41960: Expose new S3 option check_directory_existence_before_creation (#41972) No new revisions were added by this update. Summary of changes: python/pyarrow/_s3fs.pyx| 20 python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 5 + 3 files changed, 22 insertions(+), 4 deletions(-)
(arrow) branch main updated (37d0acdccb -> 0b5f0a2af1)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 37d0acdccb GH-41983: [Dev] Run issue labeling bot only when opening an issue (not editing) (#41986) add 0b5f0a2af1 GH-41502: [Python] Fix reading column index with decimal values (#41503) No new revisions were added by this update. Summary of changes: python/pyarrow/pandas_compat.py | 5 + python/pyarrow/tests/test_pandas.py | 11 +++ 2 files changed, 16 insertions(+)
(arrow) branch main updated: GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 255dbf990c GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757) 255dbf990c is described below commit 255dbf990c3d3e5fb1270a2a11efe0af2be195ab Author: Alenka Frim AuthorDate: Fri May 31 10:09:54 2024 +0200 GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757) ### Rationale for this change When constructing a `MapArray` with `FromArrays` one can not supply a `null_bitmap`. ### What changes are included in this PR? Optional `null_bitmap` argument is added to `MapArray::FromArrays`. ### Are these changes tested? TODO (have them locally, need to clean them up and commit. ### Are there any user-facing changes? No. * GitHub Issue: #41684 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_list_test.cc | 17 + cpp/src/arrow/array/array_nested.cc| 45 +++--- cpp/src/arrow/array/array_nested.h | 9 --- python/pyarrow/array.pxi | 11 ++--- python/pyarrow/includes/libarrow.pxd | 8 -- python/pyarrow/tests/test_array.py | 34 + 6 files changed, 102 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index e79ce6fe17..55f91dc341 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1368,6 +1368,23 @@ TEST_F(TestMapArray, FromArrays) { ASSERT_EQ(keys_with_null->length(), tmp_items->length()); ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1, keys_with_null, tmp_items, pool_)); + + // With null_bitmap + ASSERT_OK_AND_ASSIGN(auto map7, MapArray::FromArrays(offsets1, keys, items, pool_, + offsets3->data()->buffers[0])); + ASSERT_OK(map7->Validate()); + MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys, items, 
+ offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected7, *map7); + + // Null bitmap and offset with null + ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_, + offsets3->data()->buffers[0])); + + // Null bitmap and offset with offset + ASSERT_RAISES(NotImplemented, +MapArray::FromArrays(offsets3->Slice(2), keys, items, pool_, + offsets3->data()->buffers[0])); } TEST_F(TestMapArray, FromArraysEquality) { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 67a499c2b8..bb5c6bf018 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -807,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, Result> MapArray::FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, -MemoryPool* pool) { +MemoryPool* pool, const std::shared_ptr& null_bitmap) { using offset_type = typename MapType::offset_type; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -827,6 +827,15 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } + if (null_bitmap != nullptr && offsets->null_count() > 0) { +return Status::Invalid( +"Ambiguous to specify both validity map and offsets with nulls"); + } + + if (null_bitmap != nullptr && offsets->offset() != 0) { +return Status::NotImplemented("Null bitmap with offsets slice not supported."); + } + if (offsets->null_count() > 0) { ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); @@ -836,24 +845,32 @@ Result> MapArray::FromArraysInternal( using OffsetArrayType = typename TypeTraits::ArrayType; const auto& typed_offsets = checked_cast(*offsets); - auto buffers = BufferVector({nullptr, typed_offsets.values()}); + + BufferVector buffers; + int64_t null_count; + if (null_bitmap != nullptr) { +buffers = BufferVector({std::move(null_bitmap), 
typed_offsets.values()}); +null_count = null_bitmap->size(); + } else { +buffers = BufferVector({null_bitmap, typed_offsets.values()}); +null_count = 0; + } return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, -items, /*null_count=*/0, offsets-&g
(arrow) branch main updated: GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 31fe24dd33 GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685) 31fe24dd33 is described below commit 31fe24dd3345d387ba52d46c2915a909a5667813 Author: Joris Van den Bossche AuthorDate: Fri May 31 09:48:54 2024 +0200 GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685) ### Rationale for this change Add bindings for the C++ `arrow::Device` and `arrow::MemoryManager` classes. ### What changes are included in this PR? Basic bindings by adding the `pyarrow.Device` and `pyarrow.MemoryManager` classes, and just tested for CPU. What is not included here are additional methods on the `MemoryManager` class (eg to allocate or copy buffers), and this is also not yet tested for CUDA. Planning to do this as follow-ups, and first doing those basic bindings should enable further enhancements to be done in parallel. ### Are these changes tested? Yes, for the CPU device only. 
* GitHub Issue: #41126 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/__init__.py | 3 + python/pyarrow/device.pxi| 162 +++ python/pyarrow/includes/libarrow.pxd | 35 python/pyarrow/io.pxi| 33 +++ python/pyarrow/lib.pxd | 20 + python/pyarrow/lib.pyx | 3 + python/pyarrow/tests/test_device.py | 43 ++ python/pyarrow/tests/test_misc.py| 2 + 8 files changed, 301 insertions(+) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 936f473697..e52e0d242b 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -236,6 +236,9 @@ from pyarrow.lib import (null, bool_, RunEndEncodedScalar, ExtensionScalar) # Buffers, allocation +from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, + default_cpu_memory_manager) + from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, Codec, compress, decompress, allocate_buffer) diff --git a/python/pyarrow/device.pxi b/python/pyarrow/device.pxi new file mode 100644 index 00..6e60347520 --- /dev/null +++ b/python/pyarrow/device.pxi @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + + +cpdef enum DeviceAllocationType: +CPU = CDeviceAllocationType_kCPU +CUDA = CDeviceAllocationType_kCUDA +CUDA_HOST = CDeviceAllocationType_kCUDA_HOST +OPENCL = CDeviceAllocationType_kOPENCL +VULKAN = CDeviceAllocationType_kVULKAN +METAL = CDeviceAllocationType_kMETAL +VPI = CDeviceAllocationType_kVPI +ROCM = CDeviceAllocationType_kROCM +ROCM_HOST = CDeviceAllocationType_kROCM_HOST +EXT_DEV = CDeviceAllocationType_kEXT_DEV +CUDA_MANAGED = CDeviceAllocationType_kCUDA_MANAGED +ONEAPI = CDeviceAllocationType_kONEAPI +WEBGPU = CDeviceAllocationType_kWEBGPU +HEXAGON = CDeviceAllocationType_kHEXAGON + + +cdef object _wrap_device_allocation_type(CDeviceAllocationType device_type): +return DeviceAllocationType( device_type) + + +cdef class Device(_Weakrefable): +""" +Abstract interface for hardware devices + +This object represents a device with access to some memory spaces. +When handling a Buffer or raw memory address, it allows deciding in which +context the raw memory address should be interpreted +(e.g. CPU-accessible memory, or embedded memory on some particular GPU). +""" + +def __init__(self): +raise TypeError("Do not call Device's constructor directly, " +"use the device attribute of the MemoryManager instead.") + +cdef void init(s
(arrow) branch main updated: GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 065a6da852 GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759) 065a6da852 is described below commit 065a6da8520bd65fb4f59b2e3e496fe1124ac685 Author: Antoine Pitrou AuthorDate: Wed May 22 10:37:52 2024 +0200 GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759) ### Rationale for this change In PR #40094 (issue GH-39978), we forgot to update the `write_table` docstring with an accurate description of the supported data types for BYTE_STREAM_SPLIT. ### Are these changes tested? No (only a doc change). ### Are there any user-facing changes? No. * GitHub Issue: #41748 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/parquet/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index f54a203c87..81798b1544 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -797,8 +797,9 @@ use_byte_stream_split : bool or list, default False Specify if the byte_stream_split encoding should be used in general or only for some columns. If both dictionary and byte_stream_stream are enabled, then dictionary is preferred. -The byte_stream_split encoding is valid only for floating-point data types -and should be combined with a compression codec. +The byte_stream_split encoding is valid for integer, floating-point +and fixed-size binary data types (including decimals); it should be +combined with a compression codec so as to achieve size reduction. column_encoding : string or dict, default None Specify the encoding scheme on a per column basis. 
Can only be used when ``use_dictionary`` is set to False, and
(arrow) branch main updated (1f07404dac -> e254c43c09)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 1f07404dac GH-41321: [C++][Parquet] More strict Parquet level checking (#41346) add e254c43c09 GH-41389: [Python] Expose byte_width and bit_width of ExtensionType in terms of the storage type (#41413) No new revisions were added by this update. Summary of changes: python/pyarrow/includes/libarrow.pxd| 2 ++ python/pyarrow/tests/test_extension_type.py | 30 +++-- python/pyarrow/types.pxi| 18 + 3 files changed, 48 insertions(+), 2 deletions(-)
(arrow) branch main updated: GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 14b8ca5317 GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689) 14b8ca5317 is described below commit 14b8ca53171435113a0f0f0c4ff1063d12543bc4 Author: Joris Van den Bossche AuthorDate: Fri May 17 14:35:02 2024 +0200 GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689) ### Rationale for this change Some CMakeLists.txt files are not included in the pre-commit hook (causing failures on CI through archery if you rely on the pre-commit hook locally) ### What changes are included in this PR? Include all CMakeLists.txt files by default anywhere in the repo, and explicitly exclude the ones we don't want (vendored files). In practice, compared to the current set of files covered by the hook, those new files are included in the search: 'cpp/CMakeLists.txt', 'java/CMakeLists.txt', 'matlab/CMakeLists.txt', 'python/CMakeLists.txt' ### Are these changes tested? 
Yes * GitHub Issue: #41688 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .pre-commit-config.yaml | 7 ++- dev/archery/archery/utils/lint.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dcc1c9816..1e4b91e27e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,17 +116,14 @@ repos: name: CMake Format files: >- ( + ?.*CMakeLists\.txt$| ?^ci/.*/.*\.cmake$| ?^cpp/.*/.*\.cmake\.in$| ?^cpp/.*/.*\.cmake$| - ?^cpp/.*/CMakeLists\.txt$| - ?^go/.*/CMakeLists\.txt$| - ?^java/.*/CMakeLists\.txt$| - ?^matlab/.*/CMakeLists\.txt$| - ?^python/.*/CMakeLists\.txt$| ) exclude: >- ( + ?^ci/conan/all/.*CMakeLists\.txt$| ?^cpp/cmake_modules/FindNumPy\.cmake$| ?^cpp/cmake_modules/FindPythonLibsNew\.cmake$| ?^cpp/cmake_modules/UseCython\.cmake$| diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 108c9ded36..92b7f79fc1 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -157,7 +157,7 @@ def cmake_linter(src, fix=False): 'go/**/CMakeLists.txt', 'java/**/CMakeLists.txt', 'matlab/**/CMakeLists.txt', -'python/CMakeLists.txt', +'python/**/CMakeLists.txt', ], exclude_patterns=[ 'cpp/cmake_modules/FindNumPy.cmake',
(arrow) branch main updated: MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2dbc5e26dc MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500) 2dbc5e26dc is described below commit 2dbc5e26dcbc6826b4eb7a330fa8090836f6b727 Author: William Ayd AuthorDate: Fri May 17 04:24:56 2024 -0400 MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500) ### Rationale for this change This should simplify the number of steps users have to go through to get a working Python installation from source Authored-by: Will Ayd Signed-off-by: Joris Van den Bossche --- docs/source/developers/python.rst | 29 - docs/source/python/data.rst | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index be9fac067c..e84cd25201 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -302,10 +302,24 @@ created above (stored in ``$ARROW_HOME``): .. code-block:: - $ mkdir arrow/cpp/build - $ pushd arrow/cpp/build - $ cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DCMAKE_INSTALL_LIBDIR=lib \ + $ cmake -S arrow/cpp -B arrow/cpp/build \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + --preset ninja-release-python + $ cmake --build arrow/cpp/build --target install + +``ninja-release-python`` is not the only preset available - if you would like a +build with more features like CUDA, Flight and Gandiva support you may opt for +the ``ninja-release-python-maximal`` preset. If you wanted less features, (i.e. +removing ORC and dataset support) you could opt for +``ninja-release-python-minimal``. Changing the word ``release`` to ``debug`` +with any of the aforementioned presets will generate a debug build of Arrow. 
+ +The presets are provided as a convenience, but you may instead opt to +specify the individual components: + +.. code-block:: + $ cmake -S arrow/cpp -B arrow/cpp/build \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DCMAKE_BUILD_TYPE=Debug \ -DARROW_BUILD_TESTS=ON \ -DARROW_COMPUTE=ON \ @@ -321,11 +335,8 @@ created above (stored in ``$ARROW_HOME``): -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DPARQUET_REQUIRE_ENCRYPTION=ON \ - .. - $ make -j4 - $ make install - $ popd + -DPARQUET_REQUIRE_ENCRYPTION=ON + $ cmake --build arrow/cpp/build --target install -j4 There are a number of optional components that can be switched ON by adding flags with ``ON``: diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index f17475138c..598c8c125f 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -561,7 +561,7 @@ schema without having to get any of the batches.:: It can also be sent between languages using the :ref:`C stream interface `. -Conversion of RecordBatch do Tensor +Conversion of RecordBatch to Tensor --- Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily
(arrow) branch main updated: GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6a9e2d53b5 GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538) 6a9e2d53b5 is described below commit 6a9e2d53b5cdd0f387bfcd44e9549f122fac93e5 Author: Jacob Hayes AuthorDate: Fri May 17 03:07:02 2024 -0400 GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538) ### Rationale for this change Fixes the dropped `pa.schema` metadata reported in #38575, which was introduced in #37797. ### What changes are included in this PR? Passes through the `metadata` to the short-circuited `Schema` created with `_import_from_c_capsule`. ### Are these changes tested? Yes - added `metadata` to the existing test. ### Are there any user-facing changes? I'm not sure this quite rises to the `(b) a bug that caused incorrect or invalid data to be produced,` condition, but I added that note to be safe since the resulting schema is "incorrect" (and broke some round-trip tests on my end after a pyarrow update): **This PR contains a "Critical Fix".** * GitHub Issue: #38575 Lead-authored-by: Jacob Hayes Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 5 - python/pyarrow/types.pxi | 5 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 4f66a6f416..f7b6040f51 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -1331,10 +1331,13 @@ def test_schema_import_c_schema_interface(): def __arrow_c_schema__(self): return self.schema.__arrow_c_schema__() -schema = pa.schema([pa.field("field_name", pa.int32())]) +schema = pa.schema([pa.field("field_name", pa.int32())], metadata={"a": "b"}) +assert schema.metadata 
== {b"a": b"b"} wrapped_schema = Wrapper(schema) assert pa.schema(wrapped_schema) == schema +assert pa.schema(wrapped_schema).metadata == {b"a": b"b"} +assert pa.schema(wrapped_schema, metadata={"a": "c"}).metadata == {b"a": b"c"} def test_field_import_c_schema_interface(): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 018099ae7e..480f19c81d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5332,7 +5332,10 @@ def schema(fields, metadata=None): if isinstance(fields, Mapping): fields = fields.items() elif hasattr(fields, "__arrow_c_schema__"): -return Schema._import_from_c_capsule(fields.__arrow_c_schema__()) +result = Schema._import_from_c_capsule(fields.__arrow_c_schema__()) +if metadata is not None: +result = result.with_metadata(metadata) +return result for item in fields: if isinstance(item, tuple):
(arrow-site) branch asf-site updated: MINOR: Update docs/python/install.html with GH-41105 (#521)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/arrow-site.git The following commit(s) were added to refs/heads/asf-site by this push: new 22b975f4ca7 MINOR: Update docs/python/install.html with GH-41105 (#521) 22b975f4ca7 is described below commit 22b975f4ca718883b472a78dc64933b8a7cc3586 Author: Bryce Mecum AuthorDate: Thu May 16 22:57:52 2024 -0800 MINOR: Update docs/python/install.html with GH-41105 (#521) My attempt at updating docs/python/install.html with teh changes in https://github.com/apache/arrow/pull/41135. I generated the docs locally, copied the generated install.html into arrow-site, and then only committed the hunks I know changed. I didn't commit the entire changed file since the diff included many more changes, some of which looked like they'd break the page. --- docs/python/install.html | 115 +++ 1 file changed, 115 insertions(+) diff --git a/docs/python/install.html b/docs/python/install.html index d012eceb315..124d1fdf796 100644 --- a/docs/python/install.html +++ b/docs/python/install.html @@ -1549,6 +1549,13 @@ Linux distributions. We strongly recommend using a 64-bit system. conda install -c conda-forge pyarrow + +Note +While the pyarrow https://conda-forge.org/;>conda-forge package is +the right choice for most users, both a minimal and maximal variant of the +package exist, either of which may be better for your use case. See +Differences between conda-forge packages. + Using Pip# @@ -1597,6 +1604,114 @@ a custom path to the database from Python: + +Differences between conda-forge packages# +On https://conda-forge.org/;>conda-forge, PyArrow is published as three +separate packages, each providing varying levels of functionality. This is in +contrast to PyPi, where only a single PyArrow package is provided. 
+The purpose of this split is to minimize the size of the installed package for +most users (pyarrow), provide a smaller, minimal package for specialized use +cases (pyarrow-core), while still providing a complete package for users who +require it (pyarrow-all). What was historically pyarrow on +https://conda-forge.org/;>conda-forge is now pyarrow-all, though most +users can continue using pyarrow. +The pyarrow-core package includes the following functionality: + +Data Types and In-Memory Data Model +Compute Functions (i.e., pyarrow.compute) +Memory and IO Interfaces +Streaming, Serialization, and IPC (i.e., pyarrow.ipc) +Filesystem Interface (i.e., pyarrow.fs. Note: It’s planned to move cloud fileystems (i.e., S3, GCSFile formats: Arrow/Feather, JSON, CSV, ORC (but not Parquet) + +The pyarrow package adds the following: + +Acero (i.e., pyarrow.acero) +Tabular Datasets (i.e., pyarrow.dataset) +Parquet (i.e., pyarrow.parquet) +Substrait (i.e., pyarrow.substrait) + +Finally, pyarrow-all adds: + +Arrow Flight RPC and Flight SQL (i.e., pyarrow.flight) +Gandiva (i.e., pyarrow.gandiva) + +The following table lists the functionality provided by each package and may be +useful when deciding to use one package over another or when +Creating A Custom Selection. + + +Component +Package +pyarrow-core +pyarrow +pyarrow-all + +Core +pyarrow-core +✓ +✓ +✓ + +Parquet +libparquet + +✓ +✓ + +Dataset +libarrow-dataset + +✓ +✓ + +Acero +libarrow-acero + +✓ +✓ + +Substrait +libarrow-substrait + +✓ +✓ + +Flight +libarrow-flight + + +✓ + +Flight SQL +libarrow-flight-sql + + +✓ + +Gandiva +libarrow-gandiva + + +✓ + + + + +Creating A Custom Selection# +If you know which components you need and want to control what’s installed, you +can create a custom selection of packages to include only the extra features you +need. 
For example, to install pyarrow-core and add support for reading and +writing Parquet, install libparquet alongside pyarrow-core: +conda install -c conda-forge pyarrow-core libparquet + + +Or if you wish to use pyarrow but need support for Flight RPC: +conda install -c conda-forge pyarrow libarrow-flight + + + +
(arrow) branch main updated: GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 1c546fb3c1 GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) 1c546fb3c1 is described below commit 1c546fb3c130fc6a4f3e06ad31dc49d923785104 Author: Joris Van den Bossche AuthorDate: Thu May 16 14:15:57 2024 +0200 GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) ### Rationale for this change Currently, when building pyarrow from source, one needs to manually enable the optional components through setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones where enabled in the Arrow C++ build. ### What changes are included in this PR? Set defaults for the various `PYARROW_BUILD_` based on the `ARROW_` setting. Keep the current `PYARROW_WITH_` environment variables working to allow to override this default. ### Are there any user-facing changes? 
No * GitHub Issue: #41480 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/appveyor-cpp-build.bat | 1 - python/CMakeLists.txt | 115 +-- python/setup.py | 134 +- 3 files changed, 123 insertions(+), 127 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437..f688fbb63a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357a..07acb9e31a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT 
CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +macro(define_option name description arrow_option) + set("PYARROW_${name}" + "AUTO" + CACHE STRING ${description}) + + if("${PYARROW_${name}}" STREQUAL "AUTO") +# by default, first check if env variable exists, otherwise use Arrow C++ config +set(env_variable "PYARROW_WITH_${name}") +if(DEFINED ENV{${env_variable}}) + if($ENV{${env_variable}}) +set("PYARROW_BUILD_${name}" ON) + else() +set("PYARROW_BUILD_${name}" OFF) + endif() +else() + if(${arrow_option}) +set("PYARROW_BUILD_${name}" ON) + else() +set("PYARROW_BUILD_${name}" OFF) + endif() +endif() + else() +if("${PYARROW_${name}}") + set("PYARROW_BUILD_${name}" ON) +else() + set("PYARROW_BUILD_${name}" OFF) +endif() + endif() +endmacro() + +define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) +define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) +define_option(DATA
(arrow-nanoarrow) branch main updated: fix(python): Add iterator for null/na type (#467)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new 65e90b7f fix(python): Add iterator for null/na type (#467) 65e90b7f is described below commit 65e90b7f340ca40901a30fee577453c08abdba77 Author: Dewey Dunnington AuthorDate: Tue May 14 12:58:41 2024 -0300 fix(python): Add iterator for null/na type (#467) Closes #465 --- python/src/nanoarrow/iterator.py | 6 +- python/tests/test_iterator.py| 5 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py index 2364ea82..3ff1714f 100644 --- a/python/src/nanoarrow/iterator.py +++ b/python/src/nanoarrow/iterator.py @@ -17,7 +17,7 @@ import warnings from functools import cached_property -from itertools import islice +from itertools import islice, repeat from typing import Iterable, Tuple from nanoarrow._lib import CArrayView, CArrowType @@ -482,6 +482,9 @@ class PyIterator(ArrayViewBaseIterator): else: return iter(items) +def _null_iter(self, offset, length): +return repeat(None, length) + class RowTupleIterator(PyIterator): """Iterate over rows of a struct array (stream) where each row is a @@ -545,6 +548,7 @@ def _get_tzinfo(tz_string, strategy=None): _ITEMS_ITER_LOOKUP = { +CArrowType.NA: "_null_iter", CArrowType.BINARY: "_binary_iter", CArrowType.LARGE_BINARY: "_binary_iter", CArrowType.STRING: "_string_iter", diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py index ff0b34e2..fe6e8bbd 100644 --- a/python/tests/test_iterator.py +++ b/python/tests/test_iterator.py @@ -513,3 +513,8 @@ def test_iterator_extension(): with pytest.warns(UnregisteredExtensionWarning): assert list(iter_py(extension_array)) == [1, 2, 3] + + +def test_iterator_null(): +array = na.c_array_from_buffers(na.null(), 3, []) +assert list(iter_py(array)) == 
[None, None, None]
(arrow) branch main updated (fd84ec0b1a -> d7c22601e7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from fd84ec0b1a GH-39129 [Python] pa.array: add check for byte-swapped numpy arrays inside python objects (#41549) add d7c22601e7 GH-41464: [Python] Fix StructArray.sort() for by=None (#41495) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 7 +++ python/pyarrow/tests/test_array.py | 8 2 files changed, 11 insertions(+), 4 deletions(-)
(arrow) branch main updated (fc7c723bab -> fd84ec0b1a)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from fc7c723bab MINOR: [Go] Bump golang.org/x/tools from 0.20.0 to 0.21.0 in /go (#41639) add fd84ec0b1a GH-39129 [Python] pa.array: add check for byte-swapped numpy arrays inside python objects (#41549) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 python/pyarrow/tests/test_array.py | 24 ++ 2 files changed, 28 insertions(+)
(arrow) branch main updated (52321377cc -> b719408f4a)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 52321377cc GH-40997: [C++] Get null_bit_id according to are_cols_in_encoding_order in NullUpdateColumnToRow_avx2 (#40998) add b719408f4a GH-40560: [Python] RunEndEncodedArray.from_arrays: bugfix for Array arguments (#40560) (#41093) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_array.py | 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3c67091f93 GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) 3c67091f93 is described below commit 3c67091f93223f2d12f5a73d3e5bc51e7b389a00 Author: Thomas A Caswell AuthorDate: Thu May 2 08:18:21 2024 -0400 GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) ### Rationale for this change These methods are not actually used and will be removed from Cython in an upcoming release. Closes #41491 ### What changes are included in this PR? ### Are these changes tested? Trust CI ### Are there any user-facing changes? No, this code should never be actually used. * GitHub Issue: #41491 Authored-by: Thomas A Caswell Signed-off-by: Joris Van den Bossche --- python/pyarrow/io.pxi | 47 +-- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7890bf4b2d..9e8026deb4 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1446,27 +1446,6 @@ cdef class Buffer(_Weakrefable): buffer.strides = self.strides buffer.suboffsets = NULL -def __getsegcount__(self, Py_ssize_t *len_out): -if len_out != NULL: -len_out[0] = self.size -return 1 - -def __getreadbuffer__(self, Py_ssize_t idx, void **p): -if idx != 0: -raise SystemError("accessing nonexistent buffer segment") -if p != NULL: -p[0] = self.buffer.get().data() -return self.size - -def __getwritebuffer__(self, Py_ssize_t idx, void **p): -if not self.buffer.get().is_mutable(): -raise SystemError("trying to write an immutable buffer") -if idx != 0: -raise SystemError("accessing nonexistent buffer segment") -if p != NULL: -p[0] = self.buffer.get().data() -return self.size - cdef class ResizableBuffer(Buffer): """ @@ -2142,21 +2121,21 @@ cdef class 
CacheOptions(_Weakrefable): Parameters -- hole_size_limit : int, default 8KiB -The maximum distance in bytes between two consecutive ranges; beyond +The maximum distance in bytes between two consecutive ranges; beyond this value, ranges are not combined. range_size_limit : int, default 32MiB -The maximum size in bytes of a combined range; if combining two -consecutive ranges would produce a range of a size greater than this, +The maximum size in bytes of a combined range; if combining two +consecutive ranges would produce a range of a size greater than this, they are not combined lazy : bool, default True lazy = false: request all byte ranges when PreBuffer or WillNeed is called. -lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader -needs them. -lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the +lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader +needs them. +lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the range that is currently being read. prefetch_limit : int, default 0 -The maximum number of ranges to be prefetched. This is only used for -lazy cache to asynchronously read some ranges after reading the target +The maximum number of ranges to be prefetched. This is only used for +lazy cache to asynchronously read some ranges after reading the target range. """ @@ -2227,19 +2206,19 @@ cdef class CacheOptions(_Weakrefable): """ Create suitable CacheOptions based on provided network metrics. -Typically this will be used with object storage solutions like Amazon S3, +Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. Parameters -- time_to_first_byte_millis : int -Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call -setup latency of a new read request. The value is a positive integer. 
+Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call +setup latency of a new read request. The value is a positive integer. transfer_bandwidth_mib_per_sec : int -Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is
(arrow) branch main updated: GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 250291500b GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461) 250291500b is described below commit 250291500b6a7d5d934901acef708cef2eb1dc08 Author: Rossi Sun AuthorDate: Wed May 1 14:39:35 2024 +0800 GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461) ### Rationale for this change See #41463 and https://github.com/apache/arrow/pull/40237#issuecomment-2084577090 ### What changes are included in this PR? Skip test for platforms that have no threading support. ### Are these changes tested? Change is test. ### Are there any user-facing changes? None. * GitHub Issue: #41463 Authored-by: Ruoxi Sun Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index af64908b59..7e25ad61fa 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -827,6 +827,9 @@ TEST_F(TestArray, TestFillFromScalar) { // GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same // scalar instance. TEST_F(TestArray, TestConcurrentFillFromScalar) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); for (auto seed : {0u, 0xdeadbeef, 42u}) {
(arrow) branch main updated: GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b609de374c GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459) b609de374c is described below commit b609de374c7c00e1537eb8092e1ff2db718d2b61 Author: Joris Van den Bossche AuthorDate: Tue Apr 30 13:42:31 2024 +0200 GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459) Small follow-up fix for the failure introduced by https://github.com/apache/arrow/pull/40356 * GitHub Issue: #40342 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0e635b2c8a..dbfb6ed114 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -18,6 +18,7 @@ # cython: language_level = 3 from cpython.datetime cimport datetime, PyDateTime_DateTime +from cython cimport binding from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint @@ -421,6 +422,7 @@ cdef class FileSystem(_Weakrefable): "SubTreeFileSystem") @staticmethod +@binding(True) # Required for cython < 3 def _from_uri(uri): fs, _path = FileSystem.from_uri(uri) return fs
(arrow-nanoarrow) branch main updated: feat(python): add back nanoarrow.array(..) constructor (#441)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new c677d4d3 feat(python): add back nanoarrow.array(..) constructor (#441) c677d4d3 is described below commit c677d4d396e75d362a626db6c56207ef4ee4befa Author: Joris Van den Bossche AuthorDate: Tue Apr 23 21:15:49 2024 +0200 feat(python): add back nanoarrow.array(..) constructor (#441) Closes https://github.com/apache/arrow-nanoarrow/issues/434 - Co-authored-by: Dewey Dunnington --- python/src/nanoarrow/__init__.py | 3 ++- python/src/nanoarrow/array.py| 40 +++- python/tests/test_array.py | 5 + 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 5f99dc22..1e220932 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -73,7 +73,7 @@ from nanoarrow.schema import ( decimal256, struct, ) -from nanoarrow.array import Array +from nanoarrow.array import array, Array from nanoarrow._version import __version__ # noqa: F401 # Helps Sphinx automatically populate an API reference section @@ -125,4 +125,5 @@ __all__ = [ "uint64", "uint8", "Array", +"array", ] diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py index e38dc9c0..d3730e07 100644 --- a/python/src/nanoarrow/array.py +++ b/python/src/nanoarrow/array.py @@ -97,7 +97,7 @@ class Array: The Array is nanoarrow's high-level in-memory array representation whose scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data -interface. See :func:`array` for class details. +interface. 
The :class:`Array` class is nanoarrow's high-level in-memory array representation, encompassing the role of PyArrow's ``Array``, @@ -498,3 +498,41 @@ class Array: """ self._assert_one_chunk("inspect") print(_repr_utils.array_inspect(c_array(self))) + + +def array(obj, schema=None, device=None) -> Array: +""" +Create a nanoarrow.Array from array-like input. + +The :class:`Array` class is nanoarrow's high-level in-memory array +representation whose scope maps to that of a fully-consumed +ArrowArrayStream in the Arrow C Data interface. Note that an +:class:`Array` is not necessarily contiguous in memory (i.e., +it may consist of zero or more ``ArrowArray``s). +See :class:`Array` for class details. + +Parameters +-- +obj : array or array stream-like +An array-like or array stream-like object. This can be any object +supporting the Arrow PyCapsule interface, the Python buffer +protocol, or an iterable of Python objects. +schema : schema-like, optional +An optional schema. This can be a Schema object, or object +implementing the Arrow PyCapsule interface for schemas +(i.e. having the ``__arrow_c_schema__`` protocol method). +device : Device, optional +The device associated with the buffers held by this Array. +Defaults to the CPU device. + +Examples + + +>>> import nanoarrow as na +>>> na.array([1, 2, 3], na.int32()) +nanoarrow.Array[3] +1 +2 +3 +""" +return Array(obj, schema=schema, device=device) diff --git a/python/tests/test_array.py b/python/tests/test_array.py index 553a6350..f99b38f5 100644 --- a/python/tests/test_array.py +++ b/python/tests/test_array.py @@ -38,6 +38,11 @@ def test_array_construct(): iter(array) +def test_array_constructor(): +array = na.array([1, 2, 3], na.int32()) +assert array.schema.type == na.Type.INT32 + + def test_array_empty(): array = na.Array([], na.int32()) assert array.schema.type == na.Type.INT32
(arrow-nanoarrow) branch main updated: feat(python): function to inspect a single-chunk Array (#436)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new 821b580a feat(python): function to inspect a single-chunk Array (#436) 821b580a is described below commit 821b580a2fd964b1e4536bbaed927e208dcff6cc Author: Joris Van den Bossche AuthorDate: Mon Apr 22 20:02:29 2024 +0200 feat(python): function to inspect a single-chunk Array (#436) --- python/src/nanoarrow/_repr_utils.py | 43 + python/src/nanoarrow/array.py | 8 +++ python/tests/test_array.py | 19 3 files changed, 70 insertions(+) diff --git a/python/src/nanoarrow/_repr_utils.py b/python/src/nanoarrow/_repr_utils.py index 3209a341..bd090af5 100644 --- a/python/src/nanoarrow/_repr_utils.py +++ b/python/src/nanoarrow/_repr_utils.py @@ -248,3 +248,46 @@ def device_repr(device): device_type = f"- device_type: {device.device_type.name} <{device.device_type_id}>" device_id = f"- device_id: {device.device_id}" return "\n".join([title_line, device_type, device_id]) + + +def array_inspect(array, indent=0, max_char_width=80): +array_view = array.view() + +if max_char_width < 20: +max_char_width = 20 + +indent_str = " " * indent +class_label = "ArrowArray" +if array._addr() == 0: +return f"<{class_label} >" +elif not array.is_valid(): +return f"<{class_label} >" + +schema_string = array.schema._to_string( +max_chars=max_char_width - indent - 23, recursive=True +) +lines = [f"<{class_label} {schema_string}>"] +for attr in ("length", "offset", "null_count"): +attr_repr = repr(getattr(array, attr)) +lines.append(f"{indent_str}- {attr}: {attr_repr}") + +lines.append(f"{indent_str}- buffers[{array_view.n_buffers}]:") +for i, buffer in enumerate(array_view.buffers): +buffer_type = array_view.buffer_type(i) +lines.append( +f"{indent_str} - {buffer_type} " +f"<{buffer_view_repr(buffer, max_char_width - indent - 4 - len(buffer))}>" 
+) + +if array.dictionary: +dictionary_repr = array_inspect(array.dictionary, indent=indent + 2) +lines.append(f"{indent_str}- dictionary: {dictionary_repr}") +else: +lines.append(f"{indent_str}- dictionary: NULL") + +lines.append(f"{indent_str}- children[{array.n_children}]:") +for child in array.children: +child_repr = array_inspect(child, indent=indent + 4) +lines.append(f"{indent_str} {repr(child.schema.name)}: {child_repr}") + +return "\n".join(lines) diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py index af2e3cd4..e38dc9c0 100644 --- a/python/src/nanoarrow/array.py +++ b/python/src/nanoarrow/array.py @@ -490,3 +490,11 @@ class Array: def __repr__(self) -> str: return self.to_string() + +def inspect(self): +""" +Print the details of the array (type, length, offset, buffers, +and children arrays). +""" +self._assert_one_chunk("inspect") +print(_repr_utils.array_inspect(c_array(self))) diff --git a/python/tests/test_array.py b/python/tests/test_array.py index ee88d20d..553a6350 100644 --- a/python/tests/test_array.py +++ b/python/tests/test_array.py @@ -280,3 +280,22 @@ def test_array_repr_long(): assert len(repr_lines) == 2 assert repr_lines[1].endswith("...") assert len(repr_lines[1]) == 80 + + +def test_array_inspect(capsys): +array = na.Array(range(10), na.int32()) +array.inspect() +captured = capsys.readouterr() +assert captured.out.startswith("") + +# with children +c_array = na.c_array_from_buffers( +na.struct({f"col{i}": na.int32() for i in range(100)}), +length=1, +buffers=[None], +children=[na.c_array([123456], na.int32())] * 100, +) +array = na.Array(c_array) +array.inspect() +captured = capsys.readouterr() +assert captured.out.startswith("
(arrow-nanoarrow) branch ci-upload-nightly-wheels deleted (was fa35ec5d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git was fa35ec5d Merge remote-tracking branch 'upstream/main' into ci-upload-nightly-wheels The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(arrow-nanoarrow) branch main updated (b921dae1 -> db6630b7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git from b921dae1 Update dist/ for commit 626e219dac259ebf4109c8a4188ddbc6ce93cd4a add db6630b7 ci(python): upload nightly python packages (#429) No new revisions were added by this update. Summary of changes: .github/workflows/python-wheels.yaml | 38 ++-- 1 file changed, 36 insertions(+), 2 deletions(-)
(arrow-nanoarrow) branch ci-upload-nightly-wheels updated (a830d78e -> fa35ec5d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git from a830d78e fix syntax add 8e8e38d3 chore(python): Restructure buffer packing to support nulls and improve performance (#426) add 3a78aa45 fix: Relax comparison strictness such that integration tests pass (#399) add b5d2742e fix: Ensure negative return values from snprintf() are not used as indexes (#418) add 917e8e7d Update dist/ for commit b5d2742e2d0aee71c2ca5a277169e53c335f6c43 add 09481518 feat(python): Create string/binary arrays from iterables (#430) add fa35ec5d Merge remote-tracking branch 'upstream/main' into ci-upload-nightly-wheels No new revisions were added by this update. Summary of changes: dist/nanoarrow.c | 68 +++-- dist/nanoarrow_ipc.c | 30 +++ dist/nanoarrow_testing.hpp | 174 +++-- .../src/nanoarrow/nanoarrow_ipc_decoder.c | 30 +++ python/bootstrap.py| 2 +- python/src/nanoarrow/_lib.pyx | 266 +++- python/src/nanoarrow/c_lib.py | 83 ++- python/tests/test_c_array.py | 113 - python/tests/test_c_buffer.py | 34 ++- python/tests/test_iterator.py | 37 +-- src/nanoarrow/array.c | 26 +- src/nanoarrow/array_test.cc| 35 +++ src/nanoarrow/integration/c_data_integration.cc| 10 + src/nanoarrow/nanoarrow_testing.hpp| 174 +++-- src/nanoarrow/nanoarrow_testing_test.cc| 274 - src/nanoarrow/schema.c | 35 ++- src/nanoarrow/utils.c | 7 + 17 files changed, 1253 insertions(+), 145 deletions(-)
(arrow) branch main updated: GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new eb47fd653f GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897) eb47fd653f is described below commit eb47fd653fbbe03efc18daf5488369cb87752f96 Author: Joris Van den Bossche AuthorDate: Tue Apr 16 09:59:51 2024 +0200 GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897) ### Rationale for this change Avoiding using pandas internals to create Block objects ourselves, using a new API for pandas>=3 * GitHub Issue: #35081 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas-shim.pxi | 7 +++- python/pyarrow/pandas_compat.py | 75 +++-- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 0409e133ad..74f0d981b5 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 -bint _is_v1, _is_ge_v21 +bint _is_v1, _is_ge_v21, _is_ge_v3 def __init__(self): self._lock = Lock() @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') +self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v21 +def is_ge_v3(self): +self._check_import() +return self._is_ge_v3 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5bd0dfcf6b..00fa19604e 
100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_): # Converting pyarrow.Table efficiently to pandas.DataFrame -def _reconstruct_block(item, columns=None, extension_columns=None): +def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True): """ Construct a pandas Block from the `item` dictionary coming from pyarrow's serialization or returned by arrow::python::ConvertTableToPandas. @@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None): block_arr = item.get('block', None) placement = item['placement'] if 'dictionary' in item: -cat = _pandas_api.categorical_type.from_codes( +arr = _pandas_api.categorical_type.from_codes( block_arr, categories=item['dictionary'], ordered=item['ordered']) -block = _int.make_block(cat, placement=placement) elif 'timezone' in item: unit, _ = np.datetime_data(block_arr.dtype) dtype = make_datetimetz(unit, item['timezone']) if _pandas_api.is_ge_v21(): -pd_arr = _pandas_api.pd.array( +arr = _pandas_api.pd.array( block_arr.view("int64"), dtype=dtype, copy=False ) -block = _int.make_block(pd_arr, placement=placement) else: -block = _int.make_block(block_arr, placement=placement, -klass=_int.DatetimeTZBlock, -dtype=dtype) +arr = block_arr +if return_block: +block = _int.make_block(block_arr, placement=placement, +klass=_int.DatetimeTZBlock, +dtype=dtype) +return block elif 'py_array' in item: # create ExtensionBlock arr = item['py_array'] @@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None): if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") -pd_ext_arr = pandas_dtype.__from_arrow__(arr) -block = _int.make_block(pd_ext_arr, placement=placement) +arr = pandas_dtype.__from_arrow__(arr) else: -block = _int.make_block(block_arr, placement=placement) +arr = 
block_arr -return block +if return_block: +return _int.make_block(arr, placement=placement) +else: +return arr, place
(arrow-nanoarrow) branch ci-upload-nightly-wheels created (now a830d78e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git at a830d78e fix syntax No new revisions were added by this update.
(arrow) branch main updated: GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b842b530d1 GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818) b842b530d1 is described below commit b842b530d14a752697f4283c33f16f2f293713ff Author: Joris Van den Bossche AuthorDate: Mon Apr 15 10:22:15 2024 +0200 GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818) ### Rationale for this change See https://github.com/apache/arrow/issues/38010#issuecomment-2010601912 for more context. Right now for _consuming_ ArrowSchema-compatible objects that implement the PyCapsule interface, we only have the private `_import_from_c_capsule` (on Schema, Field, DataType) and we check for the protocol in the public `pa.schema(..)`. But that means you currently can only consume objects that represent the schema of a batch (struct type), and not schemas of individual arrays. ### What changes are included in this PR? Expand the `pa.field(..)` constructor to accept objects implementing the protocol method. ### Are these changes tested? 
TODO * GitHub Issue: #38010 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/extending_types.rst | 29 ++-- python/pyarrow/table.pxi | 37 ++ python/pyarrow/tests/test_array.py | 2 +- python/pyarrow/tests/test_cffi.py | 12 -- python/pyarrow/tests/test_table.py | 41 ++ python/pyarrow/tests/test_types.py | 22 ++ python/pyarrow/types.pxi | 18 ++- 7 files changed, 151 insertions(+), 10 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index b7261005e6..8df0ef0b1f 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -37,14 +37,14 @@ under the hood, you can implement the following methods on those objects: - ``__arrow_c_schema__`` for schema or type-like objects. - ``__arrow_c_array__`` for arrays and record batches (contiguous tables). -- ``__arrow_c_stream__`` for chunked tables or streams of data. +- ``__arrow_c_stream__`` for chunked arrays, tables and streams of data. Those methods return `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`__ objects, and more details on the exact semantics can be found in the :ref:`specification `. When your data structures have those methods defined, the PyArrow constructors -(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +(see below) will recognize those objects as supporting this protocol, and convert them to PyArrow data structures zero-copy. And the same can be true for any other library supporting this protocol on ingesting data. @@ -53,6 +53,31 @@ support for this protocol by checking for the presence of those methods, and therefore accept any Arrow data (instead of hardcoding support for a specific Arrow producer such as PyArrow). 
+For consuming data through this protocol with PyArrow, the following constructors +can be used to create the various PyArrow objects: + +++---++ +| Result class | PyArrow constructor | Supported protocol | +++===++ +| :class:`Array` | :func:`pyarrow.array` | array | +++---++ +| :class:`ChunkedArray` | :func:`pyarrow.chunked_array` | array, stream | +++---++ +| :class:`RecordBatch` | :func:`pyarrow.record_batch` | array | +++---++ +| :class:`Table` | :func:`pyarrow.table` | array, stream | +++---++ +| :class:`RecordBatchReader` | :meth:`pyarrow.RecordBatchReader.from_stream` | stream | +++---++ +| :class:`Field` | :func:`pyarrow.field` |
(arrow) branch main updated (6e1b62509b -> 831b94a65e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 6e1b62509b GH-40801: [Docs] Clarify device identifier documentation in the Arrow C Device data interface (#41101) add 831b94a65e GH-40866: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add support for row-major (#40867) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc| 79 ++--- cpp/src/arrow/record_batch.h | 4 +- cpp/src/arrow/record_batch_test.cc | 212 ++- python/pyarrow/includes/libarrow.pxd | 3 +- python/pyarrow/table.pxi | 36 -- python/pyarrow/tests/test_table.py | 74 +++- 6 files changed, 326 insertions(+), 82 deletions(-)
(arrow) branch main updated (aeb1618a30 -> 75a100a113)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from aeb1618a30 GH-41020: [C++] Introduce portable compiler assumptions (#41021) add 75a100a113 GH-38768: [Python] Empty slicing an array backwards beyond the start is now empty (#40682) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 29 + python/pyarrow/tests/test_array.py | 1 + 2 files changed, 2 insertions(+), 28 deletions(-)
(arrow) branch main updated: GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 96f686b81b GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) 96f686b81b is described below commit 96f686b81ba148f4d434846f0b9e161c538f131d Author: Alenka Frim AuthorDate: Fri Mar 29 08:30:03 2024 +0100 GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) ### Rationale for this change The conversion from `RecordBatch` to `Tensor` class exists but it doesn't support record batches with validity bitmaps. This PR adds support for an option to convert null values to NaN. ### What changes are included in this PR? This PR adds a `null_to_nan` option in `RecordBatch::ToTensor` so that null values are converted to NaN in the resulting `Tensor`. This for example works: ```python >>> import pyarrow as pa >>> batch = pa.record_batch( ... [ ... pa.array([1, 2, 3, 4, None], type=pa.int32()), ... pa.array([10, 20, 30, 40, None], type=pa.float32()), ... ], names = ["a", "b"] ... 
) >>> batch pyarrow.RecordBatch a: int32 b: float a: [1,2,3,4,null] b: [10,20,30,40,null] >>> batch.to_tensor(null_to_nan=True) type: double shape: (5, 2) strides: (8, 40) >>> batch.to_tensor(null_to_nan=True).to_numpy() array([[ 1., 10.], [ 2., 20.], [ 3., 30.], [ 4., 40.], [nan, nan]]) ``` but default would raise: ```python >>> batch.to_tensor() Traceback (most recent call last): File "", line 1, in File "pyarrow/table.pxi", line 3421, in pyarrow.lib.RecordBatch.to_tensor a: int32 File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status return check_status(status) File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status raise convert_status(status) pyarrow.lib.ArrowTypeError: Can only convert a RecordBatch with no nulls. Set null_to_nan to true to convert nulls to nan ``` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40061 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc| 47 -- cpp/src/arrow/record_batch.h | 6 ++- cpp/src/arrow/record_batch_test.cc | 76 +++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/table.pxi | 49 +-- python/pyarrow/tests/test_table.py | 48 ++- 6 files changed, 208 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 0d8bda9b66..6f3b8e75a2 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,6 +18,7 @@ #include "arrow/record_batch.h" #include +#include #include #include #include @@ -261,12 +262,19 @@ struct ConvertColumnsToTensorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - if constexpr (std::is_same_v) { -memcpy(out_values, in_values.data(), in_values.size_bytes()); -out_values += in_values.size(); + if (in_data.null_count == 0) { +if constexpr (std::is_same_v) { + memcpy(out_values, 
in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); +} else { + for (In in_value : in_values) { +*out_values++ = static_cast(in_value); + } +} } else { -for (In in_value : in_values) { - *out_values++ = static_cast(in_value); +for (int64_t i = 0; i < in_data.length; ++i) { + *out_values++ = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); } } return Status::OK(); @@ -286,16 +294,20 @@ inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) { } } -Result> RecordBatch::ToTensor(MemoryPool* pool) const { +Result> RecordBatch::ToTensor(bool null_to_nan, + MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for RecordBatches without columns/schema is not " "suppo
(arrow) branch main updated: GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ed8c3630db GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ed8c3630db is described below commit ed8c3630dbe2261bed9123a4ccfc7df0e3f031bd Author: Alenka Frim AuthorDate: Fri Mar 29 08:29:28 2024 +0100 GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ### Rationale for this change The work on the conversion from `Table`/`RecordBatch` to `Tensor` is progressing and we have to make sure to add information to the documentation. ### What changes are included in this PR? I propose to add - new page (`converting_recordbatch_to_tensor.rst`) in the `cpp/examples` section, - added section (Conversion of RecordBatch do Tensor) in the `docs/source/python/data.rst` the content above would be updated as the features are added in the future (row-major conversion, `Table::ToTensor`, DLPack support for `Tensor` class, etc.) ### Are these changes tested? It will be tested with the crossbow preview-docs job. ### Are there any user-facing changes? No, just documentation. * GitHub Issue: #40841 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../examples/converting_recordbatch_to_tensor.rst | 46 +++ docs/source/cpp/examples/index.rst | 1 + docs/source/python/data.rst| 52 ++ 3 files changed, 99 insertions(+) diff --git a/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst new file mode 100644 index 00..2be27096cf --- /dev/null +++ b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. 
or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Conversion of ``RecordBatch`` to ``Tensor`` instances += + +Arrow provides a method to convert ``RecordBatch`` objects to a ``Tensor`` +with two dimensions: + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + ASSERT_OK(tensor->Validate()); + +The conversion supports signed and unsigned integer types plus float types. +In case the ``RecordBatch`` has null values the conversion succeeds if +``null_to_nan`` parameter is set to ``true``. In this case all +types will be promoted to a floating-point data type. + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + +Currently only column-major conversion is supported. 
diff --git a/docs/source/cpp/examples/index.rst b/docs/source/cpp/examples/index.rst index b886a0d29e..90b00bbdf6 100644 --- a/docs/source/cpp/examples/index.rst +++ b/docs/source/cpp/examples/index.rst @@ -27,3 +27,4 @@ Examples dataset_skyhook_scan_example row_columnar_conversion std::tuple-like ranges to Arrow + Converting RecordBatch to Tensor diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 2cc33561d4..9156157fcd 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -560,3 +560,55 @@ schema without having to get any of the batches.:: x: int64 It can also be sent between languages using the :ref:`C stream interface `. + +Conversion of RecordBatch do Tensor +--- + +Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily +adjacent to other arrays. A different memory structure that is used in machine learning +libraries is a two dimensional array (also called a 2-dim tensor or a matrix) which takes +only one contiguous block of memory. + +For
(arrow) branch main updated (aae2557e30 -> a407a6b45e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from aae2557e30 GH-39377: [C++] IO: Reuse same buffer in CompressedInputStream (#39807) add a407a6b45e GH-40698: [C++] Create registry for Devices to map DeviceType to MemoryManager in C Device Data import (#40699) No new revisions were added by this update. Summary of changes: cpp/src/arrow/buffer_test.cc | 13 + cpp/src/arrow/c/bridge.cc| 11 --- cpp/src/arrow/c/bridge.h | 12 cpp/src/arrow/device.cc | 63 cpp/src/arrow/device.h | 28 ++ cpp/src/arrow/gpu/cuda_memory.cc | 19 cpp/src/arrow/gpu/cuda_memory.h | 4 ++- cpp/src/arrow/gpu/cuda_test.cc | 15 ++ 8 files changed, 139 insertions(+), 26 deletions(-)
(arrow) branch main updated (32437a5aeb -> 434f87274e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 32437a5aeb GH-40205: [Python] ListView arrow-to-pandas conversion (#40482) add 434f87274e GH-40060: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add support for different data types (#40359) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc | 91 -- cpp/src/arrow/record_batch_test.cc | 128 + python/pyarrow/table.pxi | 3 + python/pyarrow/tests/test_table.py | 97 +++- 4 files changed, 268 insertions(+), 51 deletions(-)
(arrow) branch main updated (dbff1f4a3e -> 32437a5aeb)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dbff1f4a3e GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash (#40697) add 32437a5aeb GH-40205: [Python] ListView arrow-to-pandas conversion (#40482) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 44 +--- python/pyarrow/tests/test_pandas.py| 82 ++ 2 files changed, 115 insertions(+), 11 deletions(-)
(arrow) branch main updated (5e1a4fd8a4 -> 7d4d744794)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 5e1a4fd8a4 GH-40767: [C++][Parquet] Simplify PageWriter and ColumnWriter creation (#40768) add 7d4d744794 GH-40720: [Python] Simplify and improve perf of creation of the column names in Table.to_pandas (#40721) No new revisions were added by this update. Summary of changes: python/pyarrow/pandas_compat.py | 67 +++-- 1 file changed, 17 insertions(+), 50 deletions(-)
(arrow) branch main updated: GH-40357: [C++] Add benchmark for ToTensor conversions (#40358)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new fc87fd75d6 GH-40357: [C++] Add benchmark for ToTensor conversions (#40358) fc87fd75d6 is described below commit fc87fd75d6602562e64abf8744890332e35f979e Author: Alenka Frim AuthorDate: Tue Mar 26 08:59:50 2024 +0100 GH-40357: [C++] Add benchmark for ToTensor conversions (#40358) ### Rationale for this change We should add benchmarks to be sure not to cause regressions while working on additional implementations of `RecordBatch::ToTensor` and `Table::ToTensor`. ### What changes are included in this PR? New `cpp/src/arrow/to_tensor_benchmark.cc file`. * GitHub Issue: #40357 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/tensor_benchmark.cc | 68 +++ 2 files changed, 69 insertions(+) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 3d1b621db0..4bf1008af4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1175,6 +1175,7 @@ add_arrow_benchmark(builder_benchmark) add_arrow_benchmark(compare_benchmark) add_arrow_benchmark(memory_pool_benchmark) add_arrow_benchmark(type_benchmark) +add_arrow_benchmark(tensor_benchmark) # # Recurse into sub-directories diff --git a/cpp/src/arrow/tensor_benchmark.cc b/cpp/src/arrow/tensor_benchmark.cc new file mode 100644 index 00..91a9270ef3 --- /dev/null +++ b/cpp/src/arrow/tensor_benchmark.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { + +template +static void BatchToTensorSimple(benchmark::State& state) { + using CType = typename ValueType::c_type; + std::shared_ptr ty = TypeTraits::type_singleton(); + + const int64_t num_cols = state.range(1); + const int64_t num_rows = state.range(0) / num_cols / sizeof(CType); + arrow::random::RandomArrayGenerator gen_{42}; + + std::vector> fields = {}; + std::vector> columns = {}; + + for (int64_t i = 0; i < num_cols; ++i) { +fields.push_back(field("f" + std::to_string(i), ty)); +columns.push_back(gen_.ArrayOf(ty, num_rows)); + } + auto schema = std::make_shared(std::move(fields)); + auto batch = RecordBatch::Make(schema, num_rows, columns); + + for (auto _ : state) { +ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + } + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); + state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); +} + +void SetArgs(benchmark::internal::Benchmark* bench) { + for (int64_t size : {kL1Size, kL2Size}) { +for (int64_t num_columns : {3, 30, 300}) { + bench->Args({size, num_columns}); + bench->ArgNames({"size", "num_columns"}); +} + } +} + 
+BENCHMARK_TEMPLATE(BatchToTensorSimple, Int8Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int16Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int32Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int64Type)->Apply(SetArgs); + +} // namespace arrow
(arrow) branch main updated (dada4e1aad -> cc9d52ca1f)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dada4e1aad GH-40659: [Python][C++] Support conversion of pyarrow.RunEndEncodedArray to numpy/pandas (#40661) add cc9d52ca1f GH-36399: [Python] Add missing `shape` property to `RecordBatch` (#40643) No new revisions were added by this update. Summary of changes: python/pyarrow/table.pxi | 42 - python/pyarrow/tests/test_table.py | 189 + 2 files changed, 129 insertions(+), 102 deletions(-)
(arrow) branch main updated (54c4cedd45 -> dada4e1aad)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 54c4cedd45 GH-40328: [C++][Parquet] Allow use of FileDecryptionProperties after the CryptoFactory is destroyed (#40329) add dada4e1aad GH-40659: [Python][C++] Support conversion of pyarrow.RunEndEncodedArray to numpy/pandas (#40661) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 25 ++ python/pyarrow/tests/test_array.py | 22 +++ 2 files changed, 47 insertions(+)
(arrow) branch main updated: GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new e52017a727 GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179) e52017a727 is described below commit e52017a72735d502c3ac3323d9d1fc61a15a6ae0 Author: Alenka Frim AuthorDate: Wed Mar 20 08:59:14 2024 +0100 GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179) ### Rationale for this change There is a section in the [Arrow C++ documentation with the instructions](https://arrow.apache.org/docs/dev/cpp/build_system.html#runtime-dependencies) on how to download and extract text version of the IANA timezone database and on Windows. We should provide a function in PyArrow that a user would call to download and extract the timezone database from Python. ### What changes are included in this PR? Function `download_tzdata_on_windows()` added to python/pyarrow/util.py that downloads and extracts timezone database to a standard location in `%USERPROFILE%\Downloads\tzdata` on Widnows. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #37328 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/install.rst| 3 ++- python/pyarrow/tests/test_util.py | 22 +- python/pyarrow/util.py| 28 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index 4555977ece..4b966e6d26 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -82,7 +82,8 @@ tzdata on Windows While Arrow uses the OS-provided timezone database on Linux and macOS, it requires a user-provided database on Windows. 
To download and extract the text version of the IANA timezone database follow the instructions in the C++ -:ref:`download-timezone-database`. +:ref:`download-timezone-database` or use pyarrow utility function +`pyarrow.util.download_tzdata_on_windows()` that does the same. By default, the timezone database will be detected at ``%USERPROFILE%\Downloads\tzdata``. If the database has been downloaded in a different location, you will need to set diff --git a/python/pyarrow/tests/test_util.py b/python/pyarrow/tests/test_util.py index 9fccb76112..e584b04111 100644 --- a/python/pyarrow/tests/test_util.py +++ b/python/pyarrow/tests/test_util.py @@ -16,14 +16,17 @@ # under the License. import gc +import os import signal +import shutil import sys import textwrap import weakref import pytest -from pyarrow.util import doc, _break_traceback_cycle_from_frame +from pyarrow.util import (doc, _break_traceback_cycle_from_frame, + download_tzdata_on_windows) from pyarrow.tests.util import disabled_gc @@ -207,3 +210,20 @@ def test_signal_refcycle(): assert wr() is not None _break_traceback_cycle_from_frame(sys._getframe(0)) assert wr() is None + + +@pytest.mark.skipif(sys.platform != "win32", +reason="Timezone database is already provided.") +def test_download_tzdata_on_windows(): +tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata") + +# Download timezone database and remove data in case it already exists +if (os.path.exists(tzdata_path)): +shutil.rmtree(tzdata_path) +download_tzdata_on_windows() + +# Inspect the folder +assert os.path.exists(tzdata_path) +assert os.path.exists(os.path.join(tzdata_path, "windowsZones.xml")) +assert os.path.exists(os.path.join(tzdata_path, "europe")) +assert 'version' in os.listdir(tzdata_path) diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index bb693cd663..89780da10f 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -228,3 +228,31 @@ def _break_traceback_cycle_from_frame(frame): # us visit the outer 
frame). refs = gc.get_referrers(frame) refs = frame = this_frame = None + + +def download_tzdata_on_windows(): +r""" +Download and extract latest IANA timezone database into the +location expected by Arrow which is %USERPROFILE%\Downloads\tzdata. +""" +if sys.platform != 'win32': +raise TypeError(f"Timezone database is already provided by {sys.platform}") + +import tarfile + +tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata") +tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz") +os.makedirs(tzdata_path, exist
(arrow) branch main updated: GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new dd3d3cd1be GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341) dd3d3cd1be is described below commit dd3d3cd1be27da7c872bfced553f25b8a0240021 Author: Alenka Frim AuthorDate: Wed Mar 20 08:44:08 2024 +0100 GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341) ### Rationale for this change We want to enable the construction of a Run-End Encoded arrays with `pyarrow.array `constructor ### What changes are included in this PR? Added a check for Run-End Encoded Type in the `pyarrow.array` constructor code. ### Are these changes tested? Yes, added test_run_end_encoded_from_array_with_type. ### Are there any user-facing changes? No. * GitHub Issue: #40273 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 18 +++--- python/pyarrow/tests/test_array.py | 39 ++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index def4c5e9ba..59d2e91ef6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -336,11 +336,23 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if pandas_api.have_pandas: values, type = pandas_api.compat.get_datetimetz_type( values, obj.dtype, type) -result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, - pool) +if type and type.id == _Type_RUN_END_ENCODED: +arr = _ndarray_to_array( +values, mask, type.value_type, c_from_pandas, safe, pool) +result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) +else: +result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, + pool) else: +if type and type.id == 
_Type_RUN_END_ENCODED: +arr = _sequence_to_array( +obj, mask, size, type.value_type, pool, from_pandas) +result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) # ConvertPySequence does strict conversion if type is explicitly passed -result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) +else: +result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a8cd20720e..999c1af453 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3580,6 +3580,45 @@ def test_run_end_encoded_from_buffers(): 1, offset, children) +def test_run_end_encoded_from_array_with_type(): +run_ends = [1, 3, 6] +values = [1, 2, 3] +ree_type = pa.run_end_encoded(pa.int32(), pa.int64()) +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +arr = [1, 2, 2, 3, 3, 3] +result = pa.array(arr, type=ree_type) +assert result.equals(expected) +result = pa.array(np.array(arr), type=ree_type) +assert result.equals(expected) + +ree_type_2 = pa.run_end_encoded(pa.int16(), pa.float32()) +result = pa.array(arr, type=ree_type_2) +assert not result.equals(expected) +expected_2 = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type_2) +assert result.equals(expected_2) + +run_ends = [1, 3, 5, 6] +values = [1, 2, 3, None] +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +arr = [1, 2, 2, 3, 3, None] +result = pa.array(arr, type=ree_type) +assert result.equals(expected) + +run_ends = [1, 3, 4, 5, 6] +values = [1, 2, None, 3, None] +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +mask = pa.array([False, False, False, True, False, True]) +result = pa.array(arr, type=ree_type, mask=mask) +assert result.equals(expected) + + 
@pytest.mark.parametrize(('list_array_type', 'list_type_factory
(arrow) branch main updated: GH-39958: [Python][CI] Remove upper pin on pytest (#40487)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6d5cfb2b2f GH-39958: [Python][CI] Remove upper pin on pytest (#40487) 6d5cfb2b2f is described below commit 6d5cfb2b2fc3a2cbb0bb7ecc9aff24e2834ade66 Author: Alenka Frim AuthorDate: Tue Mar 19 15:13:16 2024 +0100 GH-39958: [Python][CI] Remove upper pin on pytest (#40487) ### Rationale for this change The latest version of pytest (`8.0.0`) is breaking our CI: - S3 fixture from out test suite fails - `doctest-cython` check fails ### What changes are included in this PR? - added `allow_bucket_creation=True` to the `s3_example_fs` fixture - removed the pin on pytest, except for the doc builds ### Are these changes tested? Yes. ### Are there any user-facing changes? No Closes: - Closes https://github.com/apache/arrow/issues/39958 - Closes https://github.com/apache/arrow/issues/39957 * GitHub Issue: #39958 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_python.txt | 2 +- docker-compose.yml | 10 ++ python/pyarrow/tests/parquet/conftest.py | 1 + python/requirements-test.txt | 2 +- python/requirements-wheel-test.txt | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 19e94d7d3e..4366e30010 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,7 +23,7 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 +pytest pytest-faulthandler s3fs>=2023.10.0 setuptools diff --git a/docker-compose.yml b/docker-compose.yml index eb434b9062..9b0610fe55 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1208,15 +1208,17 @@ services: LANG: "C.UTF-8" BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" - # GH-31506/GH-33609: Remove --disable-warnings once - # 
https://github.com/lgpage/pytest-cython/issues/24 is resolved - # and a new version that includes the fix is released. - PYTEST_ARGS: "--doctest-modules --doctest-cython --disable-warnings" + PYTEST_ARGS: "--doctest-modules --doctest-cython" volumes: *conda-volumes +# pytest is installed with an upper pin of 8.0.0 because +# newer version breaks cython doctesting, see: +# https://github.com/lgpage/pytest-cython/issues/58 +# Remove pip install pytest~=7 when upstream issue is resolved command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && +pip install pytest~=7.4 && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && /arrow/ci/scripts/python_test.sh /arrow"] diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index 461c24af22..767e7f6b69 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -81,6 +81,7 @@ def s3_example_fs(s3_server): host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http_override={}:{}" +"_bucket_creation=True" .format(access_key, secret_key, host, port) ) fs, path = FileSystem.from_uri(uri) diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 2108d70a54..975477c422 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,5 +1,5 @@ cffi hypothesis pandas -pytest<8 +pytest pytz diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index a1046bc18c..46bedc13ba 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -1,7 +1,7 @@ cffi cython hypothesis -pytest<8 +pytest pytz tzdata; sys_platform == 'win32'
(arrow) branch main updated: GH-34235: [Python] Correct test marker for join_asof tests (#40666)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ed47ad22c8 GH-34235: [Python] Correct test marker for join_asof tests (#40666) ed47ad22c8 is described below commit ed47ad22c8537b32abf27580e75fcf514be11f7e Author: Joris Van den Bossche AuthorDate: Tue Mar 19 13:57:56 2024 +0100 GH-34235: [Python] Correct test marker for join_asof tests (#40666) Small follow-up on https://github.com/apache/arrow/pull/34234 fixing the marker for a newly added test, fixing the minimal builds * GitHub Issue: #34235 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index d0a7ccacac..72e8cb73e1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2901,6 +2901,7 @@ def test_table_join_asof_by_length_mismatch(): ) +@pytest.mark.dataset def test_table_join_asof_by_type_mismatch(): t1 = pa.table({ "colA": [1, 2, 6], @@ -2922,6 +2923,7 @@ def test_table_join_asof_by_type_mismatch(): ) +@pytest.mark.dataset def test_table_join_asof_on_type_mismatch(): t1 = pa.table({ "colA": [1, 2, 6],
(arrow) branch main updated (00a48217e9 -> 681be03cfc)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 00a48217e9 GH-38768: [Python] Slicing an array backwards beyond the start now includes first item. (#39240) add 681be03cfc GH-34235: [Python] Add `join_asof` binding (#34234) No new revisions were added by this update. Summary of changes: cpp/src/arrow/acero/asof_join_node.cc | 2 +- python/pyarrow/_acero.pyx | 79 ++ python/pyarrow/_dataset.pyx| 64 +++ python/pyarrow/acero.py| 87 +++ python/pyarrow/includes/libarrow_acero.pxd | 7 ++ python/pyarrow/table.pxi | 86 +++ python/pyarrow/tests/test_acero.py | 35 +++ python/pyarrow/tests/test_dataset.py | 114 python/pyarrow/tests/test_table.py | 163 + 9 files changed, 636 insertions(+), 1 deletion(-)
(arrow) branch main updated (03c771a626 -> 00a48217e9)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 03c771a626 GH-37989: [Python] Plug reference leaks when creating Arrow array from Python list of dicts (#40412) add 00a48217e9 GH-38768: [Python] Slicing an array backwards beyond the start now includes first item. (#39240) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 40 +- python/pyarrow/tests/test_array.py | 1 + 2 files changed, 23 insertions(+), 18 deletions(-)
(arrow) branch main updated (7f361fd806 -> 03c771a626)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 7f361fd806 GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) add 03c771a626 GH-37989: [Python] Plug reference leaks when creating Arrow array from Python list of dicts (#40412) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7f361fd806 GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) 7f361fd806 is described below commit 7f361fd80651010f3dc91ec6302f661a16892291 Author: Judah Rand <17158624+judahr...@users.noreply.github.com> AuthorDate: Fri Mar 15 15:15:38 2024 + GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) ### Rationale for this change These methods are present on `Table` but missing on `RecordBatch`: * `add_column` * `append_column` * `remove_column` * `set_column` * `drop_columns` * `rename_columns` * `cast` We also should probably accept a `dict` as input to `pa.record_batch` like we do for `pa.table`. ### What changes are included in this PR? Add the methods. ### Are these changes tested? Yes. * Parent issue: https://github.com/apache/arrow/issues/36399 * Related: #30559 * Closes #30915 * GitHub Issue: #30915 Lead-authored-by: Judah Rand <17158624+judahr...@users.noreply.github.com> Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc| 29 +++ cpp/src/arrow/record_batch.h | 7 + cpp/src/arrow/record_batch_test.cc | 26 ++ python/pyarrow/includes/libarrow.pxd | 7 + python/pyarrow/table.pxi | 474 +++ python/pyarrow/tests/test_table.py | 202 +-- 6 files changed, 561 insertions(+), 184 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index d23b2b584b..d52ebe053b 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -395,6 +395,35 @@ Result> RecordBatch::ReplaceSchema( return RecordBatch::Make(std::move(schema), num_rows(), columns()); } +std::vector RecordBatch::ColumnNames() const { + std::vector names(num_columns()); + for (int i = 0; i < num_columns(); ++i) { +names[i] = 
schema()->field(i)->name(); + } + return names; +} + +Result> RecordBatch::RenameColumns( +const std::vector& names) const { + int n = num_columns(); + + if (static_cast(names.size()) != n) { +return Status::Invalid("tried to rename a record batch of ", n, " columns but only ", + names.size(), " names were provided"); + } + + ArrayVector columns(n); + FieldVector fields(n); + + for (int i = 0; i < n; ++i) { +columns[i] = column(i); +fields[i] = schema()->field(i)->WithName(names[i]); + } + + return RecordBatch::Make(::arrow::schema(std::move(fields)), num_rows(), + std::move(columns)); +} + Result> RecordBatch::SelectColumns( const std::vector& indices) const { int n = static_cast(indices.size()); diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 8a2c1ba6d7..16d721caad 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -227,6 +227,13 @@ class ARROW_EXPORT RecordBatch { /// \return PrettyPrint representation suitable for debugging std::string ToString() const; + /// \brief Return names of all columns + std::vector ColumnNames() const; + + /// \brief Rename columns with provided names + Result> RenameColumns( + const std::vector& names) const; + /// \brief Return new record batch with specified columns Result> SelectColumns( const std::vector& indices) const; diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index db68a9a937..45cf7cae65 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -315,6 +315,32 @@ TEST_F(TestRecordBatch, RemoveColumn) { AssertBatchesEqual(*new_batch, *batch4); } +TEST_F(TestRecordBatch, RenameColumns) { + const int length = 10; + + auto field1 = field("f1", int32()); + auto field2 = field("f2", uint8()); + auto field3 = field("f3", int16()); + + auto schema1 = ::arrow::schema({field1, field2, field3}); + + random::RandomArrayGenerator gen(42); + + auto array1 = gen.ArrayOf(int32(), length); + auto array2 = 
gen.ArrayOf(uint8(), length); + auto array3 = gen.ArrayOf(int16(), length); + + auto batch = RecordBatch::Make(schema1, length, {array1, array2, array3}); + EXPECT_THAT(batch->ColumnNames(), testing::ElementsAre("f1", "f2", "f3")); + + ASSERT_OK_AND_ASSIGN(auto renamed, batch->RenameColumns({"zero", "one", "two"})); + EXPECT_THAT(renamed->ColumnNames(), testing::ElementsAre("zero", "one", &quo
(arrow) branch main updated: GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 0402e306a9 GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292) 0402e306a9 is described below commit 0402e306a9d9f57ff22c87bf8689b8e7203483e5 Author: Joris Van den Bossche AuthorDate: Fri Mar 15 15:17:14 2024 +0100 GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292) ### Rationale for this change `pa.table(dict)` works, but `pa.record_batch(dict)` is not supported. Let's make this consistent. Also harmonized the documentation for the `data` argument for both functions. * GitHub Issue: #40291 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/table.pxi | 49 +--- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index dfd549befc..9f60150427 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -5109,10 +5109,10 @@ def record_batch(data, names=None, schema=None, metadata=None): Parameters -- -data : pandas.DataFrame, list, Arrow-compatible table -A DataFrame, list of arrays or chunked arrays, or a tabular object -implementing the Arrow PyCapsule Protocol (has an -``__arrow_c_array__`` method). +data : dict, list, pandas.DataFrame, Arrow-compatible table +A mapping of strings to Arrays or Python lists, a list of Arrays, +a pandas DataFame, or any tabular object implementing the +Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` method). names : list, default None Column names if list of arrays passed as data. Mutually exclusive with 'schema' argument. 
@@ -5137,16 +5137,16 @@ def record_batch(data, names=None, schema=None, metadata=None): >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) >>> names = ["n_legs", "animals"] -Creating a RecordBatch from a list of arrays with names: +Construct a RecordBatch from a python dictionary: ->>> pa.record_batch([n_legs, animals], names=names) +>>> pa.record_batch({"n_legs": n_legs, "animals": animals}) pyarrow.RecordBatch n_legs: int64 animals: string n_legs: [2,2,4,4,5,100] animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] ->>> pa.record_batch([n_legs, animals], names=["n_legs", "animals"]).to_pandas() +>>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() n_legsanimals 0 2 Flamingo 1 2 Parrot @@ -5155,6 +5155,16 @@ def record_batch(data, names=None, schema=None, metadata=None): 4 5 Brittle stars 5 100 Centipede +Creating a RecordBatch from a list of arrays with names: + +>>> pa.record_batch([n_legs, animals], names=names) +pyarrow.RecordBatch +n_legs: int64 +animals: string + +n_legs: [2,2,4,4,5,100] +animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + Creating a RecordBatch from a list of arrays with names and metadata: >>> my_metadata={"n_legs": "How many legs does an animal have?"} @@ -5231,6 +5241,11 @@ def record_batch(data, names=None, schema=None, metadata=None): if isinstance(data, (list, tuple)): return RecordBatch.from_arrays(data, names=names, schema=schema, metadata=metadata) +elif isinstance(data, dict): +if names is not None: +raise ValueError( +"The 'names' argument is not valid when passing a dictionary") +return RecordBatch.from_pydict(data, schema=schema, metadata=metadata) elif hasattr(data, "__arrow_c_array__"): if schema is not None: requested_schema = schema.__arrow_c_schema__() @@ -5241,7 +5256,7 @@ def record_batch(data, names=None, schema=None, metadata=None): if schema is not None and batch.schema != schema: # __arrow_c_array__ coerces schema 
with best effort, so we might # need to cast it if the producer wasn't able to cast to exact schema. -batch = Table.from_batches([batch]).cast(schema).to_batches()[0] +batch = batch.cast(schema) return bat
(arrow) branch main updated (a1fd4c4964 -> fd1e9ca81f)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from a1fd4c4964 MINOR: [Go] Bump github.com/andybalholm/brotli from 1.0.5 to 1.1.0 in /go (#40531) add fd1e9ca81f GH-39444: [Python] Fix parquet import in encryption test (#40505) No new revisions were added by this update. Summary of changes: dev/tasks/tasks.yml | 1 + python/pyarrow/tests/test_dataset_encryption.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-)
(arrow) branch main updated (dd6d7288e4 -> 9f6dc1feb5)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dd6d7288e4 GH-39444: [C++][Parquet] Fix crash in Modular Encryption (#39623) add 9f6dc1feb5 GH-39968: [Python][FS][Azure] Minimal Python bindings for `AzureFileSystem` (#40021) No new revisions were added by this update. Summary of changes: ci/docker/alpine-linux-3.16-cpp.dockerfile| 1 + ci/docker/fedora-39-cpp.dockerfile| 1 + ci/docker/linux-apt-docs.dockerfile | 1 + ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + cpp/src/arrow/filesystem/api.h| 5 +- cpp/src/arrow/filesystem/azurefs_test.cc | 2 + cpp/src/arrow/filesystem/type_fwd.h | 7 +- cpp/src/arrow/util/config.h.cmake | 1 + python/CMakeLists.txt | 4 + python/pyarrow/__init__.py| 3 +- python/pyarrow/_azurefs.pyx | 134 ++ python/pyarrow/_fs.pyx| 3 + python/pyarrow/conftest.py| 9 +- python/pyarrow/fs.py | 4 + python/pyarrow/includes/libarrow_fs.pxd | 16 +++ python/pyarrow/tests/conftest.py | 31 ++ python/pyarrow/tests/test_fs.py | 78 ++- python/setup.py | 8 ++ 19 files changed, 303 insertions(+), 7 deletions(-) create mode 100644 python/pyarrow/_azurefs.pyx
(arrow) branch main updated: GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 788200a434 GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429) 788200a434 is described below commit 788200a434462325c9feff4b52203520a90694e4 Author: Joris Van den Bossche AuthorDate: Wed Mar 13 14:20:52 2024 +0100 GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429) ### Rationale for this change From debugging the failure, it seems this is due to pandas changing a filter operation to sometimes preserve a RangeIndex now instead of returning an Integer64Index. And the conversion to Arrow changes based on that (RangeIndex is metadata only by default, integer index becomes a column) Therefore making the tests more robust to ensure there is always at least one non-partition column in the DataFrame, so it doesn't depend on the index whether the result is empty or not. 
* GitHub Issue: #40428 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/parquet/test_dataset.py | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index 30dae05124..47e608a140 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -107,9 +107,9 @@ def test_filters_equivalency(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), -'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 3), -}, columns=['integer', 'string', 'boolean']) +'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), +'values': np.arange(30), +}) _generate_partition_directories(local, base_path, partition_spec, df) @@ -312,9 +312,9 @@ def test_filters_inclusive_set(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), -'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 3), -}, columns=['integer', 'string', 'boolean']) +'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), +'values': np.arange(30), +}) _generate_partition_directories(local, base_path, partition_spec, df)
(arrow) branch main updated (acdf2a7f68 -> a421314900)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from acdf2a7f68 GH-40312: [Python] Add ListView documentation to user guide (#40313) add a421314900 GH-40376: [Python] Update for NumPy 2.0 ABI change in PyArray_Descr->elsize (#40418) No new revisions were added by this update. Summary of changes: .env | 2 +- python/CMakeLists.txt | 3 +++ python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 10 ++ python/pyarrow/src/arrow/python/numpy_convert.cc | 6 +++--- python/pyarrow/src/arrow/python/numpy_interop.h| 7 +++ python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 21 - 6 files changed, 32 insertions(+), 17 deletions(-)
(arrow) branch main updated (b202ede131 -> 6121b3fd06)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from b202ede131 GH-40458: [Release][Docs] Changes for version and warning banner should not affect minor releases (#40459) add 6121b3fd06 GH-40485: [Python][CI] Skip failing test_dateutil_tzinfo_to_string (#40486) No new revisions were added by this update. Summary of changes: python/pyarrow/tests/test_types.py | 5 + 1 file changed, 5 insertions(+)
(arrow) branch main updated: GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new e950eb4baa GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378) e950eb4baa is described below commit e950eb4baa73b9ab4e498e71354738c56287c48d Author: Joris Van den Bossche AuthorDate: Thu Mar 7 13:36:53 2024 +0100 GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378) ### Rationale for this change Use a proper (non-deprecated) way of installing from git with an "extra", which also fixes the currently failing installation. * GitHub Issue: #40377 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/install_dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 478c1d5997..b89e43cfb3 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -27,7 +27,8 @@ fi dask=$1 if [ "${dask}" = "upstream_devel" ]; then - pip install https://github.com/dask/dask/archive/main.tar.gz#egg=dask[dataframe] + pip install "dask[dataframe] @ git+https://github.com/dask/dask.git; + pip install -U git+https://github.com/dask-contrib/dask-expr.git elif [ "${dask}" = "latest" ]; then pip install dask[dataframe] else
(arrow) branch main updated (ef6ea6beed -> 3d467ac7bf)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from ef6ea6beed GH-40345: [FlightRPC][C++][Java][Go] Add URI scheme to reuse connection (#40084) add 3d467ac7bf GH-20127: [Python][CI] Remove legacy hdfs tests from hdfs and hypothesis setup (#40363) No new revisions were added by this update. Summary of changes: ci/scripts/integration_hdfs.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
(arrow) branch main updated (4ce9a5edd2 -> 3ba6d286ca)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 4ce9a5edd2 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) add 3ba6d286ca GH-40059: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor (#40064) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc| 92 ++ cpp/src/arrow/record_batch.h | 8 ++ cpp/src/arrow/record_batch_test.cc | 229 +++ python/pyarrow/includes/libarrow.pxd | 2 + python/pyarrow/table.pxi | 14 +++ python/pyarrow/tests/test_table.py | 142 ++ 6 files changed, 487 insertions(+)
(arrow) branch main updated: GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 4ce9a5edd2 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) 4ce9a5edd2 is described below commit 4ce9a5edd2710fb8bf0c642fd0e3863b01c2ea20 Author: Antoine Pitrou AuthorDate: Tue Mar 5 08:56:25 2024 +0100 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) ### Rationale for this change `Tensor.__getbuffer__` would silently assume that `Py_ssize_t` is the same width as `int64_t`, which is true only on 64-bit platforms. ### What changes are included in this PR? Create an internal buffer of `Py_ssize_t` values mirroring a Tensor's shape and strides, to avoid relying on the aforementioned assumption. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40153 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/lib.pxd| 2 ++ python/pyarrow/tensor.pxi | 17 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 48350212c2..b1187a77c2 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -295,6 +295,8 @@ cdef class Tensor(_Weakrefable): cdef readonly: DataType type +bytes _ssize_t_shape +bytes _ssize_t_strides cdef void init(self, const shared_ptr[CTensor]& sp_tensor) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 1afce7f4a1..6fb4fc99d7 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+# Avoid name clash with `pa.struct` function +import struct as _struct + cdef class Tensor(_Weakrefable): """ @@ -40,6 +43,14 @@ cdef class Tensor(_Weakrefable): self.sp_tensor = sp_tensor self.tp = sp_tensor.get() self.type = pyarrow_wrap_data_type(self.tp.type()) +self._ssize_t_shape = self._make_shape_or_strides_buffer(self.shape) +self._ssize_t_strides = self._make_shape_or_strides_buffer(self.strides) + +def _make_shape_or_strides_buffer(self, values): +""" +Make a bytes object holding an array of `values` cast to `Py_ssize_t`. +""" +return _struct.pack(f"{len(values)}n", *values) def __repr__(self): return """ @@ -282,10 +293,8 @@ strides: {0.strides}""".format(self) buffer.readonly = 0 else: buffer.readonly = 1 -# NOTE: This assumes Py_ssize_t == int64_t, and that the shape -# and strides arrays lifetime is tied to the tensor's -buffer.shape = ()[0] -buffer.strides = ()[0] +buffer.shape = cp.PyBytes_AsString(self._ssize_t_shape) +buffer.strides = cp.PyBytes_AsString(self._ssize_t_strides) buffer.suboffsets = NULL
(arrow) branch main updated: GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2b194ad222 GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825) 2b194ad222 is described below commit 2b194ad222f4dc8ecf2eb73539ab8cab5b1fc5e7 Author: Alenka Frim AuthorDate: Mon Mar 4 13:33:18 2024 +0100 GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825) This PR removes the `pyarrow.filesystem` and `pyarrow.hdfs` filesystems that have been deprecated since 2.0.0. * Closes: #20127 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Antoine Pitrou Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/filesystems_deprecated.rst | 88 docs/source/python/index.rst | 1 - python/CMakeLists.txt | 1 - python/pyarrow/__init__.py | 47 +- python/pyarrow/_hdfsio.pyx | 478 --- python/pyarrow/filesystem.py | 511 - python/pyarrow/fs.py | 25 +- python/pyarrow/hdfs.py | 240 -- python/pyarrow/io.pxi | 13 + python/pyarrow/parquet/core.py | 30 +- python/pyarrow/tests/parquet/test_basic.py | 5 +- python/pyarrow/tests/parquet/test_dataset.py | 137 +++--- .../pyarrow/tests/parquet/test_parquet_writer.py | 43 -- python/pyarrow/tests/test_filesystem.py| 75 --- python/pyarrow/tests/test_hdfs.py | 451 -- python/setup.py| 1 - 16 files changed, 93 insertions(+), 2053 deletions(-) diff --git a/docs/source/python/filesystems_deprecated.rst b/docs/source/python/filesystems_deprecated.rst deleted file mode 100644 index c51245341b..00 --- a/docs/source/python/filesystems_deprecated.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. 
regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Filesystem Interface (legacy) -= - -.. warning:: - This section documents the deprecated filesystem layer. You should - use the :ref:`new filesystem layer ` instead. - -.. _hdfs: - -Hadoop File System (HDFS) -- - -PyArrow comes with bindings to a C++-based interface to the Hadoop File -System. You connect like so: - -.. code-block:: python - - import pyarrow as pa - fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path) - with fs.open(path, 'rb') as f: - # Do something with f - -By default, ``pyarrow.hdfs.HadoopFileSystem`` uses libhdfs, a JNI-based -interface to the Java Hadoop client. This library is loaded **at runtime** -(rather than at link / library load time, since the library may not be in your -LD_LIBRARY_PATH), and relies on some environment variables. - -* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has - `lib/native/libhdfs.so`. - -* ``JAVA_HOME``: the location of your Java SDK installation. - -* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is - installed somewhere other than ``$HADOOP_HOME/lib/native``. - -* ``CLASSPATH``: must contain the Hadoop jars. You can set these using: - -.. 
code-block:: shell - -export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob` - -If ``CLASSPATH`` is not set, then it will be set automatically if the -``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set. - -HDFS API - - -.. currentmodule:: pyarrow - -.. autosummary:: - :toctree: generated/ - - hdfs.connect - HadoopFileSystem.cat - HadoopFileSystem.chmod - HadoopFileSystem.chown - HadoopFileSystem.delete - HadoopFileSystem.df - HadoopFileSystem.disk_usage - HadoopFileSystem.download - HadoopFileSystem.exists - HadoopFileSystem.get_capacity - HadoopFileS
(arrow) branch main updated: GH-39855: [Python] ListView support for pa.array() (#40160)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7c4f4c2bb1 GH-39855: [Python] ListView support for pa.array() (#40160) 7c4f4c2bb1 is described below commit 7c4f4c2bb140fb51a6c26908f2420a972c7f48e0 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Fri Mar 1 02:30:01 2024 -0500 GH-39855: [Python] ListView support for pa.array() (#40160) ### Rationale for this change Add pa.array() instantiation support for ListView and LargeListView formats. ### What changes are included in this PR? * pa.array() supports creating ListView and LargeListView types * ListArray, LargeListArray now have their size initialized before adding elements during python-to-arrow conversion. This allows these types to be convertible to ListViewArray and LargeListViewArray types. ### Are these changes tested? Yes, unit tested. ### Are there any user-facing changes? Yes, new feature added. 
* Closes: #39855 * GitHub Issue: #39855 Authored-by: Dane Pitkin Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 23 +++- python/pyarrow/tests/strategies.py | 4 +- python/pyarrow/tests/test_array.py | 147 +++-- python/pyarrow/tests/test_convert_builtin.py | 44 -- python/pyarrow/tests/test_scalars.py | 43 +++--- 5 files changed, 218 insertions(+), 43 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 3c4d59d659..a0bae2f501 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -581,7 +581,8 @@ struct PyConverterTrait< }; template -struct PyConverterTrait> { +struct PyConverterTrait< +T, enable_if_t::value || is_list_view_type::value>> { using type = PyListConverter; }; @@ -803,7 +804,6 @@ class PyListConverter : public ListConverter { return this->list_builder_->AppendNull(); } -RETURN_NOT_OK(this->list_builder_->Append()); if (PyArray_Check(value)) { RETURN_NOT_OK(AppendNdarray(value)); } else if (PySequence_Check(value)) { @@ -824,6 +824,21 @@ class PyListConverter : public ListConverter { } protected: + // MapType does not support args in the Append() method + Status AppendTo(const MapType*, int64_t size) { return this->list_builder_->Append(); } + + // FixedSizeListType does not support args in the Append() method + Status AppendTo(const FixedSizeListType*, int64_t size) { +return this->list_builder_->Append(); + } + + // ListType requires the size argument in the Append() method + // in order to be convertible to a ListViewType. ListViewType + // requires the size argument in the Append() method always. 
+ Status AppendTo(const BaseListType*, int64_t size) { +return this->list_builder_->Append(true, size); + } + Status ValidateBuilder(const MapType*) { if (this->list_builder_->key_builder()->null_count() > 0) { return Status::Invalid("Invalid Map: key field cannot contain null values"); @@ -836,11 +851,14 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); return this->value_converter_->Extend(value, size); } Status AppendIterable(PyObject* value) { +auto size = static_cast(PyObject_Size(value)); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); PyObject* iterator = PyObject_GetIter(value); OwnedRef iter_ref(iterator); while (PyObject* item = PyIter_Next(iterator)) { @@ -857,6 +875,7 @@ class PyListConverter : public ListConverter { return Status::Invalid("Can only convert 1-dimensional array values"); } const int64_t size = PyArray_SIZE(ndarray); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); const auto value_type = this->value_converter_->builder()->type(); diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index bb88a4dcb7..7affe815a2 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -167,7 +167,9 @@ def list_types(item_strategy=primitive_types): pa.list_, item_strategy, st.integers(min_value=0, max_value=16) -) +), +st.builds(pa.list_v
(arrow) branch main updated (99c5412a6a -> d6b9051fa0)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 99c5412a6a GH-39979: [Python] Low-level bindings for exporting/importing the C Device Interface (#39980) add d6b9051fa0 GH-40066: [Python] Support `requested_schema` in `__arrow_c_stream__()` (#40070) No new revisions were added by this update. Summary of changes: python/pyarrow/includes/libarrow_python.pxd | 8 python/pyarrow/ipc.pxi | 39 ++--- python/pyarrow/src/arrow/python/ipc.cc | 66 python/pyarrow/src/arrow/python/ipc.h | 20 + python/pyarrow/table.pxi| 23 ++ python/pyarrow/tests/test_cffi.py | 18 +++- python/pyarrow/tests/test_ipc.py| 68 +++-- python/pyarrow/tests/test_table.py | 32 +- 8 files changed, 261 insertions(+), 13 deletions(-)
(arrow) branch main updated: GH-40266: [Python] Mark ListView as a nested type (#40265)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new d519a4cb05 GH-40266: [Python] Mark ListView as a nested type (#40265) d519a4cb05 is described below commit d519a4cb05773dc6ef36e02c963b5e27c73d06e5 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Wed Feb 28 04:37:34 2024 -0500 GH-40266: [Python] Mark ListView as a nested type (#40265) ### Rationale for this change ListView types are nested, so `is_nested()` should return True. ### What changes are included in this PR? * `pa.types.is_nested(pa.list_view())` returns True ### Are these changes tested? Yes, unit tested. ### Are there any user-facing changes? Yes. * GitHub Issue: #40266 Authored-by: Dane Pitkin Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 2 ++ python/pyarrow/types.py| 1 + 2 files changed, 3 insertions(+) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e048ed6fa5..a79702a8ca 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -216,6 +216,8 @@ def test_is_nested_or_struct(): assert types.is_nested(pa.list_(pa.int32())) assert types.is_nested(pa.list_(pa.int32(), 3)) assert types.is_nested(pa.large_list(pa.int32())) +assert types.is_nested(pa.list_view(pa.int32())) +assert types.is_nested(pa.large_list_view(pa.int32())) assert not types.is_nested(pa.int32()) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 6c262b49cb..66b1ec3395 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -41,6 +41,7 @@ _TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} _NESTED_TYPES = {lib.Type_LIST, lib.Type_FIXED_SIZE_LIST, lib.Type_LARGE_LIST, + lib.Type_LIST_VIEW, 
lib.Type_LARGE_LIST_VIEW, lib.Type_STRUCT, lib.Type_MAP} | _UNION_TYPES
(arrow) branch main updated (3f7b2884dc -> 06d841ee7d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 3f7b2884dc GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) add 06d841ee7d MINOR: [Documentation][C++][Python][R] Clarify docstrings around max_chunksize (#40251) No new revisions were added by this update. Summary of changes: cpp/src/arrow/ipc/writer.h | 4 ++-- cpp/src/arrow/table.h | 4 ++-- python/pyarrow/_flight.pyx | 4 ++-- python/pyarrow/ipc.pxi | 4 ++-- python/pyarrow/table.pxi | 12 ++-- r/R/flight.R | 3 ++- r/man/flight_put.Rd| 3 ++- 7 files changed, 18 insertions(+), 16 deletions(-)
(arrow) branch main updated (06d841ee7d -> c57115de8d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 06d841ee7d MINOR: [Documentation][C++][Python][R] Clarify docstrings around max_chunksize (#40251) add c57115de8d GH-40142: [Python] Allow FileInfo instances to be passed to dataset init (#40143) No new revisions were added by this update. Summary of changes: python/pyarrow/_dataset.pyx | 34 +--- python/pyarrow/dataset.py| 16 +++-- python/pyarrow/includes/libarrow_dataset.pxd | 8 +++ python/pyarrow/tests/test_dataset.py | 10 4 files changed, 58 insertions(+), 10 deletions(-)
(arrow) branch main updated: GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3f7b2884dc GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) 3f7b2884dc is described below commit 3f7b2884dccb4c0164092b754a2a76ccbb900154 Author: Hussein Awala AuthorDate: Tue Feb 27 14:28:55 2024 +0100 GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) ### Rationale for this change ### What changes are included in this PR? This PR fixes a minor bug in `types.is_nested` which doesn't consider the `FIXED_SIZE_LIST` type as nested type. ### Are these changes tested? ### Are there any user-facing changes? * Closes: #40171 Authored-by: hussein-awala Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 1 + python/pyarrow/types.py| 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 0add578608..e048ed6fa5 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -214,6 +214,7 @@ def test_is_nested_or_struct(): assert types.is_nested(struct_ex) assert types.is_nested(pa.list_(pa.int32())) +assert types.is_nested(pa.list_(pa.int32(), 3)) assert types.is_nested(pa.large_list(pa.int32())) assert not types.is_nested(pa.int32()) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 0f68ca9fe5..6c262b49cb 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -40,8 +40,8 @@ _TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES | _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} -_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, - lib.Type_MAP} | _UNION_TYPES +_NESTED_TYPES = {lib.Type_LIST, lib.Type_FIXED_SIZE_LIST, 
lib.Type_LARGE_LIST, + lib.Type_STRUCT, lib.Type_MAP} | _UNION_TYPES @doc(datatype="null")
(arrow) branch main updated: MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 5f3688351f MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156) 5f3688351f is described below commit 5f3688351f3adfba9a84d9e0bd65b300eabe35d2 Author: Joris Van den Bossche AuthorDate: Tue Feb 27 09:15:55 2024 +0100 MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156) ### Rationale for this change Attempt to draw more attention to the fact that the buffer listing / number of buffers differ between the main Format spec and the C Data Interface, for the Binary View layout. Triggered by feedback from implementing this in duckdb at https://github.com/duckdb/duckdb/pull/10481#discussion_r1489245865 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/format/CDataInterface.rst | 7 ++- docs/source/format/Columnar.rst | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index ef4bf1cf32..fd9952b037 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -467,7 +467,10 @@ It has the following fields: Mandatory. The number of physical buffers backing this array. The number of buffers is a function of the data type, as described in the - :ref:`Columnar format specification `. + :ref:`Columnar format specification `, except for the + the binary or utf-8 view type, which has one additional buffer compared + to the Columnar format specification (see + :ref:`c-data-interface-binary-view-arrays`). Buffers of children arrays are not included. @@ -552,6 +555,8 @@ parameterized extension types). 
The ``ArrowArray`` structure exported from an extension array simply points to the storage data of the extension array. +.. _c-data-interface-binary-view-arrays: + Binary view arrays -- diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 84f251968f..7b74b972f2 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -409,6 +409,9 @@ All integers (length, buffer index, and offset) are signed. This layout is adapted from TU Munich's `UmbraDB`_. +Note that this layout uses one additional buffer to store the variadic buffer +lengths in the :ref:`Arrow C data interface `. + .. _variable-size-list-layout: Variable-size List Layout
(arrow) branch main updated: GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 8e53451cc4 GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093) 8e53451cc4 is described below commit 8e53451cc48081df20fdf52b82edcc52ea778ec5 Author: Joris Van den Bossche AuthorDate: Thu Feb 22 10:19:17 2024 +0100 GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093) Last step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the arrow->pandas/numpy conversion code path. * Closes: #40092 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 22 +++--- python/pyarrow/tests/test_pandas.py| 14 ++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index e979342b88..2115cd8015 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -133,6 +133,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { +return PyUnicode_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { @@ -147,6 +154,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { +return PyBytes_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { @@ -1154,7 +1168,8 @@ struct ObjectWriterVisitor { } template - enable_if_t::value || is_fixed_size_binary_type::value, + enable_if_t::value || 
is_binary_view_like_type::value || + is_fixed_size_binary_type::value, Status> Visit(const Type& type) { auto WrapValue = [](const std::string_view& view, PyObject** out) { @@ -1355,8 +1370,7 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value || - std::is_base_of::value, + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", @@ -2086,8 +2100,10 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& break; case Type::STRING:// fall through case Type::LARGE_STRING: // fall through +case Type::STRING_VIEW: // fall through case Type::BINARY:// fall through case Type::LARGE_BINARY: +case Type::BINARY_VIEW: case Type::NA: // fall through case Type::FIXED_SIZE_BINARY:// fall through case Type::STRUCT: // fall through diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 89a241a27e..fdfd123a8c 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1760,6 +1760,20 @@ class TestConvertStringLikeTypes: _check_pandas_roundtrip( df, schema=pa.schema([('a', pa.large_string())])) +def test_binary_view(self): +s = pd.Series([b'123', b'', b'a', None]) +_check_series_roundtrip(s, type_=pa.binary_view()) +df = pd.DataFrame({'a': s}) +_check_pandas_roundtrip( +df, schema=pa.schema([('a', pa.binary_view())])) + +def test_string_view(self): +s = pd.Series(['123', '', 'a', None]) +_check_series_roundtrip(s, type_=pa.string_view()) +df = pd.DataFrame({'a': s}) +_check_pandas_roundtrip( +df, schema=pa.schema([('a', pa.string_view())])) + def test_table_empty_str(self): values = ['', '', '', '', ''] df = pd.DataFrame({'strings': values})
(arrow) branch main updated: GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6a22a1dee7 GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167) 6a22a1dee7 is described below commit 6a22a1dee78b0f7daa7e4d8793d663e29a5712a6 Author: Divyansh200102 <146909065+divyansh200...@users.noreply.github.com> AuthorDate: Wed Feb 21 20:00:24 2024 +0530 GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167) ### Rationale for this change To fix the show source button links to 404 page problem ### What changes are included in this PR? The show source button link will be removed. ### Are these changes tested? Not yet ### Are there any user-facing changes? Yes * Closes: #39291 * GitHub Issue: #39291 Authored-by: Divyansh200102 Signed-off-by: Joris Van den Bossche --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5af7b7955f..c6be6cb94c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -414,7 +414,7 @@ html_baseurl = "https://arrow.apache.org/docs/; # If true, links to the reST sources are added to the pages. # -# html_show_sourcelink = True +html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #
(arrow) branch main updated: GH-39999: [Python] Fix tests for pandas with CoW / nightly integration tests (#40000)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 40cb0a22c1 GH-3: [Python] Fix tests for pandas with CoW / nightly integration tests (#4) 40cb0a22c1 is described below commit 40cb0a22c1685a1861652b68b6eb394903cf3cba Author: Joris Van den Bossche AuthorDate: Fri Feb 9 09:04:16 2024 +0100 GH-3: [Python] Fix tests for pandas with CoW / nightly integration tests (#4) ### Rationale for this change Fixing a failing test with pandas nightly because of CoW changes. * Closes: #3 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 8106219057..676cc96151 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3650,7 +3650,8 @@ def test_singleton_blocks_zero_copy(): prior_allocation = pa.total_allocated_bytes() result = t.to_pandas() -assert result['f0'].values.flags.writeable +# access private `_values` because the public `values` is made read-only by pandas +assert result['f0']._values.flags.writeable assert pa.total_allocated_bytes() > prior_allocation
(arrow-site) branch main updated: Fix errant line with pantab note (#472)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-site.git The following commit(s) were added to refs/heads/main by this push: new 2cdbe04f91b Fix errant line with pantab note (#472) 2cdbe04f91b is described below commit 2cdbe04f91b956476dc47445b678d155da0eb940 Author: William Ayd AuthorDate: Thu Feb 8 10:11:23 2024 -0500 Fix errant line with pantab note (#472) Follow up to https://github.com/apache/arrow-site/pull/471 this wasn't meant to be there --- powered_by.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/powered_by.md b/powered_by.md index 1ba4278d0ea..edb3ff53f9a 100644 --- a/powered_by.md +++ b/powered_by.md @@ -155,9 +155,8 @@ short description of your use case. supports reading and writing Parquet files using pyarrow. Several pandas core developers are also contributors to Apache Arrow. * **[pantab][52]:** Allows high performance read/writes of popular dataframe libraries - like pandas, polars pyarrow, etc... to/from Tableau's Hyper database. pantab uses nanoarrow + like pandas, polars, pyarrow, etc... to/from Tableau's Hyper database. pantab uses nanoarrow and the Arrow PyCapsule interface to make that exchange process seamless. - core developers are also contributors to Apache Arrow. * **[Parseable][51]:** Log analytics platform built for scale and usability. Ingest logs from anywhere and unify logs with Parseable. Parseable uses Arrow as the intermediary, in-memory data format for log data ingestion. * **[Perspective][23]:** Perspective is a streaming data visualization engine in JavaScript for building real-time & user-configurable analytics entirely in the browser. * **[Petastorm][28]:** Petastorm enables single machine or distributed training
(arrow) branch main updated: GH-39812: [Python] Add bindings for ListView and LargeListView (#39813)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 42e35f101e GH-39812: [Python] Add bindings for ListView and LargeListView (#39813) 42e35f101e is described below commit 42e35f101e87e689dcc48981abf81bc32c41d162 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Thu Feb 8 09:44:19 2024 -0500 GH-39812: [Python] Add bindings for ListView and LargeListView (#39813) ### Rationale for this change Add bindings to the ListView and LargeListView array formats. ### What changes are included in this PR? * Add initial implementation for ListView and LargeListView * Add basic unit tests ### Are these changes tested? * Basic unit tests only (follow up PRs will be needed to implement full functionality) ### Are there any user-facing changes? Yes, documentation is updated in this PR to include the new PyArrow objects. 
* Closes: #39812 Lead-authored-by: Dane Pitkin Co-authored-by: Dane Pitkin <48041712+danepit...@users.noreply.github.com> Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst| 4 + docs/source/python/api/datatypes.rst | 4 + python/pyarrow/__init__.py | 14 +- python/pyarrow/array.pxi | 574 +++ python/pyarrow/includes/libarrow.pxd | 90 ++ python/pyarrow/lib.pxd | 18 ++ python/pyarrow/lib.pyx | 2 + python/pyarrow/public-api.pxi| 4 + python/pyarrow/scalar.pxi| 10 + python/pyarrow/tests/test_array.py | 71 + python/pyarrow/tests/test_misc.py| 4 + python/pyarrow/tests/test_scalars.py | 8 +- python/pyarrow/tests/test_types.py | 49 +++ python/pyarrow/types.pxi | 171 +++ python/pyarrow/types.py | 10 + 15 files changed, 1027 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcf..e6f6c3dbbd 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21..62bf4b7723 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. 
decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3..2ee97ddb66 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ from pyarrow.lib import (null, bool_, binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ from pyarrow.lib import (null, bool_, field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +203,9 @@ from pyarrow.lib
(arrow) branch main updated: GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7e2fe4fe76 GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853) 7e2fe4fe76 is described below commit 7e2fe4fe7634c359017213b79255c9040786fc06 Author: Joris Van den Bossche AuthorDate: Wed Feb 7 15:21:37 2024 +0100 GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853) Next step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the python->arrow conversion code path. * Closes: #39852 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 35 ++ python/pyarrow/tests/test_convert_builtin.py | 19 ++-- python/pyarrow/tests/test_scalars.py | 28 +++-- 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d1d94ac17a..3c4d59d659 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -486,6 +486,10 @@ class PyValue { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { +return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -499,8 +503,8 @@ class PyValue { } template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { + static enable_if_t::value || is_string_view_type::value, Status> + Convert(const T*, const O& options, I obj, PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 
and validate that // any binary values are utf8 @@ -570,18 +574,12 @@ struct PyConverterTrait; template struct PyConverterTrait< -T, -enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value && !is_binary_view_like_type::value) || -std::is_same::value>> { +T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && +!is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - // not implemented -}; - template struct PyConverterTrait> { using type = PyListConverter; @@ -699,11 +697,22 @@ class PyPrimitiveConverter:: PyBytesView view_; }; +template +struct OffsetTypeTrait { + using type = typename T::offset_type; +}; + +template +struct OffsetTypeTrait> { + using type = int64_t; +}; + template -class PyPrimitiveConverter> +class PyPrimitiveConverter< +T, enable_if_t::value || is_binary_view_like_type::value>> : public PrimitiveConverter { public: - using OffsetType = typename T::offset_type; + using OffsetType = typename OffsetTypeTrait::type; Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 49c4f1a6e7..55ea28f50f 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -763,6 +763,16 @@ def test_sequence_unicode(): assert arr.to_pylist() == data +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) +def test_sequence_unicode_explicit_type(ty): +data = ['foo', 'bar', None, 'mañana'] +arr = pa.array(data, type=ty) +assert len(arr) == 4 +assert arr.null_count == 1 +assert arr.type == ty +assert arr.to_pylist() == data + + def check_array_mixed_unicode_bytes(binary_type, string_type): values = ['qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] @@ -787,6 +797,7 @@ def 
check_array_mixed_unicode_bytes(binary_type, string_type): def test_array_mixed_unicode_bytes(): check_array_mixed_unicode_bytes(pa.binary(), pa.string()) check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) +check_array_mixed_unicode_bytes(pa.binary_view(), pa.string_view()) @pytest.mark.large_memory @@ -818,7 +829,7 @@ def test_large_binary_value(ty): @pytest.mark.large_memory -@pytest.mark.parametrize("ty", [pa.binary(
(arrow) branch main updated (a1c1773b72 -> 4ceb661013)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from a1c1773b72 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) add 4ceb661013 GH-39880: [Python][CI] Pin moto<5 for dask integration tests (#39881) No new revisions were added by this update. Summary of changes: ci/scripts/install_dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new a1c1773b72 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) a1c1773b72 is described below commit a1c1773b724e4d78faf9a097247c7e976cd2cbfa Author: Joris Van den Bossche AuthorDate: Thu Feb 1 14:53:35 2024 +0100 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) ### Rationale for this change Ensure we can build pyarrow against numpy 2.0 nightly (update pyproject.toml to allow this), and test this by building our nightly wheels with numpy nightly. This also ensures that other projects that use our nightly wheels to test together with numpy nightly can do that (numpy 2.0 changes the ABI, so to run with numpy 2.0, your package needs to be built with numpy 2.x; currently pyarrow installed with our nightly wheel will fail to import when also numpy nightly is installed). See the parent issue https://github.com/apache/arrow/issues/39532 for details, and https://numpy.org/devdocs/dev/depending_on_numpy.html#numpy-2-0-specific-advice for a direct link to the NumPy guidelines on updating build dependencies for NumPy 2.0. 
* Closes: #39555 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- ci/docker/python-wheel-manylinux.dockerfile | 5 +++-- ci/docker/python-wheel-windows-vs2017.dockerfile | 3 ++- ci/scripts/python_wheel_macos_build.sh | 5 - python/pyproject.toml| 7 ++- python/requirements-build.txt| 3 ++- python/requirements-wheel-build.txt | 3 ++- python/setup.py | 2 +- 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 0a50d450c2..a07c727ac7 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -28,7 +28,7 @@ ENV MANYLINUX_VERSION=${manylinux} RUN yum install -y dnf # Install basic dependencies -RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers +RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget # A system Python is required for ninja and vcpkg in this Dockerfile. 
# On manylinux2014 base images, system Python is 2.7.5, while @@ -97,4 +97,5 @@ SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] COPY python/requirements-wheel-build.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple; diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index faf07800c9..067105b3a7 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -88,7 +88,8 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ -RUN python -m pip install -r arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple; # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index fd845c512d..8123a9fdf1 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -50,12 +50,15 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release pip install \ --upgrade \ --only-binary=:all: \ --target 
$PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ - -r ${source_dir}/python/requirements-wheel-build.txt + -r ${source_dir}/python/requirements-wheel-build.txt \ + --pre \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-whe
(arrow) branch main updated: GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3d45ac9653 GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819) 3d45ac9653 is described below commit 3d45ac96534fc76b820b488aa02182e6b93a388f Author: y.yoshida5 <39612448+yo1...@users.noreply.github.com> AuthorDate: Thu Feb 1 22:36:59 2024 +0900 GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819) ### Rationale for this change / What changes are included in this PR? To expose force_virtual_addressing in PyArrow. ### Are these changes tested? Existing unit tests are not broken, and a new test case have been added. ### Are there any user-facing changes? pyarrow.fs.S3FileSystem: it becomes possible to specify the argument 'force_virtual_addressing'. * Closes: #39779 Authored-by: yo1956 Signed-off-by: Joris Van den Bossche --- python/pyarrow/_s3fs.pyx| 11 ++- python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 4 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 13b8c748cb..f5bab99a49 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -245,6 +245,11 @@ cdef class S3FileSystem(FileSystem): retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) The retry strategy to use with S3; fail after max_attempts. Available strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy. +force_virtual_addressing : bool, default False +Whether to use virtual addressing of buckets. +If true, then virtual addressing is always enabled. +If false, then virtual addressing is only enabled if `endpoint_override` is empty. +This can be used for non-AWS backends that only support virtual hosted-style access. 
Examples @@ -268,7 +273,9 @@ cdef class S3FileSystem(FileSystem): role_arn=None, session_name=None, external_id=None, load_frequency=900, proxy_options=None, allow_bucket_creation=False, allow_bucket_deletion=False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)): + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy( + max_attempts=3), + force_virtual_addressing=False): cdef: optional[CS3Options] options shared_ptr[CS3FileSystem] wrapped @@ -380,6 +387,7 @@ cdef class S3FileSystem(FileSystem): options.value().allow_bucket_creation = allow_bucket_creation options.value().allow_bucket_deletion = allow_bucket_deletion +options.value().force_virtual_addressing = force_virtual_addressing if isinstance(retry_strategy, AwsStandardS3RetryStrategy): options.value().retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy( @@ -447,6 +455,7 @@ cdef class S3FileSystem(FileSystem): opts.proxy_options.username), 'password': frombytes( opts.proxy_options.password)}, +force_virtual_addressing=opts.force_virtual_addressing, ),) ) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index cb30f4e750..7876fb0f96 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -167,6 +167,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool background_writes c_bool allow_bucket_creation c_bool allow_bucket_deletion +c_bool force_virtual_addressing shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ab10addfc3..6ba5137e4f 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1186,6 +1186,10 @@ def test_s3_options(pickle_module): assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2 assert fs2 != fs +fs = S3FileSystem(endpoint_override='localhost:8999', 
force_virtual_addressing=True) +assert isinstance(fs, S3FileSystem) +assert pickle_module.loads(pickle_module.dumps(fs)) == fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError):
(arrow) branch main updated: GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 44d5597a0e GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850) 44d5597a0e is described below commit 44d5597a0e8a4d635f1aec82ba885f61b5c17829 Author: Alenka Frim AuthorDate: Thu Feb 1 14:35:32 2024 +0100 GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850) ### Rationale for this change Removing the use of `pytest-lazy-fixture` in our test suite as it is unmaintained. Changes in this PR include: - Remove the use of `pytest-lazy-fixture` - Remove marks from fixtures to avoid future error, see ``` PytestRemovedIn9Warning: Marks applied to fixtures have no effect See docs: https://docs.pytest.org/en/stable/deprecations.html#applying-a-mark-to-a-fixture-function ``` - Catch two different warnings in `def test_legacy_int_type()` ### Are these changes tested? The changes affect the tests so they must pass. ### Are there any user-facing changes? No. 
* Closes: #39849 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_python.txt | 3 +-- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 - python/pyarrow/tests/conftest.py| 7 +++--- python/pyarrow/tests/test_dataset.py| 3 --- python/pyarrow/tests/test_extension_type.py | 5 + python/pyarrow/tests/test_fs.py | 34 ++--- python/pyarrow/tests/test_ipc.py| 6 ++--- python/requirements-test.txt| 1 - python/requirements-wheel-test.txt | 1 - 9 files changed, 25 insertions(+), 36 deletions(-) diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 5fdd21d2bd..59e2def1bf 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,9 +23,8 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 # pytest-lazy-fixture broken on pytest 8.0.0 +pytest<8 pytest-faulthandler -pytest-lazy-fixture s3fs>=2023.10.0 setuptools setuptools_scm<8.0.0 diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index b8ffbfdb71..367445c595 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -340,7 +340,6 @@ outputs: # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - pytest -- pytest-lazy-fixture - backports.zoneinfo # [py<39] - boto3 - cffi diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a5941e8c8d..0da757a4bc 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -24,7 +24,6 @@ import time import urllib.request import pytest -from pytest_lazyfixture import lazy_fixture import hypothesis as h from ..conftest import groups, defaults @@ -259,13 +258,13 @@ def gcs_server(): @pytest.fixture( params=[ -lazy_fixture('builtin_pickle'), -lazy_fixture('cloudpickle') +'builtin_pickle', +'cloudpickle' ], scope='session' ) def pickle_module(request): -return request.param +return 
request.getfixturevalue(request.param) @pytest.fixture(scope='session') diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a4838d63a6..a9054f0b17 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -100,7 +100,6 @@ def assert_dataset_fragment_convenience_methods(dataset): @pytest.fixture -@pytest.mark.parquet def mockfs(): mockfs = fs._MockFileSystem() @@ -221,7 +220,6 @@ def multisourcefs(request): @pytest.fixture -@pytest.mark.parquet def dataset(mockfs): format = ds.ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) @@ -2692,7 +2690,6 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) @pytest.fixture -@pytest.mark.parquet def s3_example_simple(s3_server): from pyarrow.fs import FileSystem diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a88e20eefe..d8c792ef00 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1485,10 +1485,7 @@ def test_legacy_int_type(): batch = pa.RecordBatch.from_arrays([ext_arr], names=['ext']) buf = ipc_write_batch(batch) -with pytest.warns( -RuntimeWarning, -match="pickle-based deserialization of pyarrow.PyExtensionType " -
(arrow) branch main updated: GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 787afa1594 GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652) 787afa1594 is described below commit 787afa1594586d2d556d21471647f9cd2c55b18f Author: Joris Van den Bossche AuthorDate: Tue Jan 30 12:54:19 2024 +0100 GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652) ### Rationale for this change First step for https://github.com/apache/arrow/issues/39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, such that those can already be represented in pyarrow. (I exposed a variant of StringBuilder as well, just for now to be able to create test data) * Closes: #39651 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst | 4 ++ docs/source/python/api/datatypes.rst | 4 ++ python/pyarrow/__init__.py | 7 ++-- python/pyarrow/array.pxi | 14 +++ python/pyarrow/builder.pxi | 66 ++ python/pyarrow/includes/libarrow.pxd | 9 python/pyarrow/lib.pxd | 8 python/pyarrow/lib.pyx | 2 + python/pyarrow/scalar.pxi | 10 + python/pyarrow/src/arrow/python/helpers.cc | 2 + python/pyarrow/tests/test_builder.py | 21 +- python/pyarrow/tests/test_misc.py | 4 ++ python/pyarrow/tests/test_scalars.py | 28 - python/pyarrow/tests/test_types.py | 8 python/pyarrow/types.pxi | 32 +++ python/pyarrow/types.py| 10 + 16 files changed, 223 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 73b5e063ff..b858862dcf 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,6 +63,8 @@ may expose data type-specific methods or properties. 
FixedSizeBinaryArray LargeBinaryArray LargeStringArray + BinaryViewArray, + StringViewArray, Time32Array Time64Array Date32Array @@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties. FixedSizeBinaryScalar LargeBinaryScalar LargeStringScalar + BinaryViewScalar + StringViewScalar Time32Scalar Time64Scalar Date32Scalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 4066ef3142..642c243b21 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas. large_binary large_string large_utf8 + binary_view + string_view decimal128 list_ large_list @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category is_large_binary is_large_unicode is_large_string + is_binary_view + is_string_view is_fixed_size_binary is_map is_dictionary diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9da94885ec..4dbd1258d3 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -163,7 +163,7 @@ from pyarrow.lib import (null, bool_, time32, time64, timestamp, date32, date64, duration, month_day_nano_interval, float16, float32, float64, - binary, string, utf8, + binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, list_, large_list, map_, struct, @@ -205,6 +205,7 @@ from pyarrow.lib import (null, bool_, FixedSizeListArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, @@ -223,8 +224,8 @@ from pyarrow.lib import (null, bool_, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, MonthDayNanoIntervalScalar, - BinaryScalar, LargeBinaryScalar, - StringScalar, LargeStringScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + 
StringScalar, LargeStringScalar, StringViewScalar
(arrow) branch main updated: GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new c6ab28677d GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758) c6ab28677d is described below commit c6ab28677ddf22799f3db277137708ac5b070acd Author: Joris Van den Bossche AuthorDate: Tue Jan 30 09:16:53 2024 +0100 GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758) ### Rationale for this change Fixing the pinning syntax so we get the latest 0.14.x version (which is currently 0.14.4) * Closes: #39640 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_sphinx.txt| 2 +- docs/requirements.txt | 2 +- docs/source/python/api/compute.rst | 2 +- docs/source/python/compute.rst | 4 ++-- docs/source/python/pandas.rst | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index d0f494d2e0..0e50875fc1 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme=0.14.1 +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/requirements.txt b/docs/requirements.txt index aee2eb662c..5d6fec7ddf 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme==0.14.1 +pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b879643017..928c607d13 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -590,4 +590,4 @@ User-Defined Functions :toctree: ../generated/ register_scalar_function - ScalarUdfContext + UdfContext diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 
e8a5b613c6..c02059a4f8 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -445,9 +445,9 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of -:class:`pyarrow.compute.ScalarUdfContext`. +:class:`pyarrow.compute.UdfContext`. This context exposes several useful attributes, particularly a -:attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for +:attr:`~pyarrow.compute.UdfContext.memory_pool` to be used for allocations in the context of the user-defined function. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index fda90c4f2a..23a4b73bd0 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -197,7 +197,7 @@ use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow .. ipython:: python - df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)}) + df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) df.dtypes df
(arrow) branch main updated: GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new c67d0260d4 GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760) c67d0260d4 is described below commit c67d0260d4e96472b5cbdff66ca67ead2b9abe4c Author: Alenka Frim AuthorDate: Thu Jan 25 10:21:57 2024 +0100 GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760) This PR rearranges if-else blocks in the `table` function (`table.pxi`) so that pandas dataframe object comes before checking for `__arrow_c_stream__` and `__arrow_c_array__`. * Closes: #39732 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/table.pxi | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d98c93e1c0..3c450d61a7 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -5202,7 +5202,17 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None): raise ValueError( "The 'names' argument is not valid when passing a dictionary") return Table.from_pydict(data, schema=schema, metadata=metadata) +elif _pandas_api.is_data_frame(data): +if names is not None or metadata is not None: +raise ValueError( +"The 'names' and 'metadata' arguments are not valid when " +"passing a pandas DataFrame") +return Table.from_pandas(data, schema=schema, nthreads=nthreads) elif hasattr(data, "__arrow_c_stream__"): +if names is not None or metadata is not None: +raise ValueError( +"The 'names' and 'metadata' arguments are not valid when " +"using Arrow PyCapsule Interface") if schema is not None: requested = schema.__arrow_c_schema__() else: @@ -5216,14 +5226,12 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None): table = table.cast(schema) return table elif hasattr(data, 
"__arrow_c_array__"): -batch = record_batch(data, schema) -return Table.from_batches([batch]) -elif _pandas_api.is_data_frame(data): if names is not None or metadata is not None: raise ValueError( "The 'names' and 'metadata' arguments are not valid when " -"passing a pandas DataFrame") -return Table.from_pandas(data, schema=schema, nthreads=nthreads) +"using Arrow PyCapsule Interface") +batch = record_batch(data, schema) +return Table.from_batches([batch]) else: raise TypeError( "Expected pandas DataFrame, python dictionary or list of arrays")
(arrow) branch main updated: GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7e9f265878 GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360) 7e9f265878 is described below commit 7e9f2658786b966685ddedf6b90415968f207b75 Author: Rok Mihevc AuthorDate: Tue Jan 23 12:43:05 2024 +0100 GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360) ### Rationale for this change When defining `StructArray`'s field builders for `ISOCalendar` we don't pre-allocate memory and then use unsafe append. This causes the resulting array to be at most 32 rows long. ### What changes are included in this PR? This introduces required memory pre-allocation in the `ISOCalendar` c++ kernel. ### Are these changes tested? This adds a test for the Python wrapper. ### Are there any user-facing changes? Fixes the behavior of `iso_calendar` kernel. 
* Closes: #38655 Lead-authored-by: Rok Mihevc Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc | 2 +- python/pyarrow/tests/test_compute.py | 13 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index a88ce38936..f49e201492 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -1510,7 +1510,7 @@ struct ISOCalendar { for (int i = 0; i < 3; i++) { field_builders.push_back( checked_cast(struct_builder->field_builder(i))); - RETURN_NOT_OK(field_builders[i]->Reserve(1)); + RETURN_NOT_OK(field_builders[i]->Reserve(in.length)); } auto visit_null = [&]() { return struct_builder->AppendNull(); }; std::function visit_value; diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 34d4da580f..4b58dc65ba 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2263,6 +2263,19 @@ def test_extract_datetime_components(): _check_datetime_components(timestamps, timezone) +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_iso_calendar_longer_array(unit): +# https://github.com/apache/arrow/issues/38655 +# ensure correct result for array length > 32 +arr = pa.array([datetime.datetime(2022, 1, 2, 9)]*50, pa.timestamp(unit)) +result = pc.iso_calendar(arr) +expected = pa.StructArray.from_arrays( +[[2021]*50, [52]*50, [7]*50], +names=['iso_year', 'iso_week', 'iso_day_of_week'] +) +assert result.equals(expected) + + @pytest.mark.pandas @pytest.mark.skipif(sys.platform == "win32" and not util.windows_has_tzdata(), reason="Timezone database is not installed on Windows")
(arrow) branch main updated: MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new eed53bbd59 MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757) eed53bbd59 is described below commit eed53bbd59957a80c8f55fe4d265cd2371fbea11 Author: Joris Van den Bossche AuthorDate: Tue Jan 23 12:32:57 2024 +0100 MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757) Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/format/CDeviceDataInterface.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst index 76b7132681..b5b7229a67 100644 --- a/docs/source/format/CDeviceDataInterface.rst +++ b/docs/source/format/CDeviceDataInterface.rst @@ -341,8 +341,8 @@ Notes: * \(1) Currently unknown if framework has an event type to support. * \(2) Extension Device has producer defined semantics and thus if - synchronization is needed for an extension device, the producer - should document the type. + synchronization is needed for an extension device, the producer + should document the type. Semantics
(arrow) branch main updated: GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 96645ebc50 GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636) 96645ebc50 is described below commit 96645ebc5037b6b4eab127c274f4871bbef99d77 Author: Antoine Pitrou AuthorDate: Wed Jan 17 11:26:37 2024 +0100 GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636) ### Rationale for this change `PyArray_DescrFromScalar` returns a new reference, so we should be careful to decref it when we don't use it anymore. ### Are these changes tested? No. ### Are there any user-facing changes? No. * Closes: #39599 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 3 +- python/pyarrow/includes/libarrow_python.pxd| 2 +- python/pyarrow/src/arrow/python/inference.cc | 5 +- python/pyarrow/src/arrow/python/numpy_convert.cc | 77 ++ python/pyarrow/src/arrow/python/numpy_convert.h| 6 +- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 11 ++-- python/pyarrow/src/arrow/python/python_to_arrow.cc | 6 +- python/pyarrow/types.pxi | 6 +- 8 files changed, 48 insertions(+), 68 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5c2d22aef1..1416f5f434 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -66,8 +66,7 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values, dtype = values.dtype if type is None and dtype != object: -with nogil: -check_status(NumPyDtypeToArrow(dtype, _type)) +c_type = GetResultValue(NumPyDtypeToArrow(dtype)) if type is not None: c_type = type.sp_type diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index e3179062a1..906f0b7d28 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -73,7 
+73,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: object obj, object mask, const PyConversionOptions& options, CMemoryPool* pool) -CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) +CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype) CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, c_bool from_pandas, diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 9537aec574..10116f9afa 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -468,10 +468,7 @@ class TypeInferrer { if (numpy_dtype_count_ > 0) { // All NumPy scalars and Nones/nulls if (numpy_dtype_count_ + none_count_ == total_count_) { -std::shared_ptr type; -RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), )); -*out = type; -return Status::OK(); +return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out); } // The "bad path": data contains a mix of NumPy scalars and diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 4970680764..dfee88c092 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -59,12 +59,11 @@ NumPyBuffer::~NumPyBuffer() { #define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ case NPY_##NPY_NAME:\ -*out = FACTORY(); \ -break; +return FACTORY(); namespace { -Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { +Result> GetTensorType(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, _Type)) { return Status::TypeError("Did not pass numpy.dtype object"); } @@ -84,11 +83,8 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); -default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); -} } - return 
Status::OK(); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } Status GetNumPyType(const DataType& type, int* type_num) { @@ -120,15 +116,21 @@ Status GetNumPyType(const DataType& type, int* type_num) { } // namespace -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { +Result> NumPyScalarToArrowDataType(PyObject* scalar) { + PyArray_Descr
(arrow) branch main updated: GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 63b769 GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609) 63b769 is described below commit 63b769f3ad6c724305b4182526307ab025d5 Author: Alenka Frim AuthorDate: Wed Jan 17 11:12:41 2024 +0100 GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609) Fixes left deprecation warnings coming from the pandas development version, by updating our test code to avoid the deprecated patterns. * Closes: #36412 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 15 ++ python/pyarrow/tests/parquet/test_datetime.py | 4 +-- python/pyarrow/tests/test_compute.py | 6 ++-- python/pyarrow/tests/test_dataset.py | 6 ++-- python/pyarrow/tests/test_pandas.py | 42 +++ 5 files changed, 35 insertions(+), 38 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 39dee85492..61e6318e29 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -967,20 +967,9 @@ def _extract_index_level(table, result_table, field_name, # The serialized index column was removed by the user return result_table, None, None -pd = _pandas_api.pd - col = table.column(i) -values = col.to_pandas(types_mapper=types_mapper).values - -if hasattr(values, 'flags') and not values.flags.writeable: -# ARROW-1054: in pandas 0.19.2, factorize will reject -# non-writeable arrays when calling MultiIndex.from_arrays -values = values.copy() - -if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None: -index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz) -else: -index_level = pd.Series(values, 
dtype=values.dtype, copy=False) +index_level = col.to_pandas(types_mapper=types_mapper) +index_level.name = None result_table = result_table.remove_column( result_table.schema.get_field_index(field_name) ) diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index 6a9cbd4f73..0896eb37e6 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -116,7 +116,7 @@ def test_coerce_timestamps(tempdir): df_expected = df.copy() for i, x in enumerate(df_expected['datetime64']): if isinstance(x, np.ndarray): -df_expected['datetime64'][i] = x.astype('M8[us]') +df_expected.loc[i, 'datetime64'] = x.astype('M8[us]') tm.assert_frame_equal(df_expected, df_read) @@ -429,7 +429,7 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): # nanosecond timestamps by default n = 9 df = pd.DataFrame({'x': range(n)}, - index=pd.date_range('2017-01-01', freq='1n', periods=n)) + index=pd.date_range('2017-01-01', freq='ns', periods=n)) tb = pa.Table.from_pandas(df) filename = tempdir / 'written.parquet' diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d1eb605c71..34d4da580f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2360,10 +2360,10 @@ def _check_temporal_rounding(ts, values, unit): unit_shorthand = { "nanosecond": "ns", "microsecond": "us", -"millisecond": "L", +"millisecond": "ms", "second": "s", "minute": "min", -"hour": "H", +"hour": "h", "day": "D" } greater_unit = { @@ -2371,7 +2371,7 @@ def _check_temporal_rounding(ts, values, unit): "microsecond": "ms", "millisecond": "s", "second": "min", -"minute": "H", +"minute": "h", "hour": "d", } ta = pa.array(ts) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ae2146c0bd..d473299f20 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ 
-178,12 +178,14 @@ def multisourcefs(request): # simply split the dataframe into four chunks to construct a data source # from each chunk into its ow
(arrow) branch main updated: GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 72ed58449e GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535) 72ed58449e is described below commit 72ed58449ea71aab1343d9adce19f177f20705cf Author: Joris Van den Bossche AuthorDate: Wed Jan 10 09:13:02 2024 +0100 GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535) ### Rationale for this change Removing usage of `np.core`, as that is deprecated and will be removed in numpy 2.0. For this specific case, we can just hardcode the list of data types instead of using a numpy api (this list doesn't typically change). * Closes: #39533 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 3757d81a47..39dee85492 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -30,7 +30,6 @@ import re import warnings import numpy as np -from numpy.core.numerictypes import sctypes as _np_sctypes import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes # noqa @@ -789,9 +788,10 @@ def table_to_dataframe( # Set of the string repr of all numpy dtypes that can be stored in a pandas # dataframe (complex not included since not supported by Arrow) _pandas_supported_numpy_types = { -str(np.dtype(typ)) -for typ in (_np_sctypes['int'] + _np_sctypes['uint'] + _np_sctypes['float'] + -['object', 'bool']) +"int8", "int16", "int32", "int64", +"uint8", "uint16", "uint32", "uint64", +"float16", "float32", "float64", +"object", "bool" }
(arrow) branch main updated: GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 32d785ff40 GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538) 32d785ff40 is described below commit 32d785ff405e3cc31866faa38bc2704eb44fda60 Author: Joris Van den Bossche AuthorDate: Wed Jan 10 09:11:11 2024 +0100 GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538) ### Rationale for this change PyArrow wheels for the 15.0.0 release will not be compatible with future numpy 2.0 packages, therefore it is recommended to add this upper pin now for _releases_. We will keep the more flexible pin on the development branch (by reverting this commit on main, but so it can be cherry-picked in the release branch) * Closes: #39537 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index b1c825d84d..51eb40af08 100755 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ class BinaryDistribution(Distribution): install_requires = ( -'numpy >= 1.16.6', +'numpy >= 1.16.6, <2', )
(arrow) branch main updated: GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 48f704e2a3 GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498) 48f704e2a3 is described below commit 48f704e2a316131d180e0c2198c00671756c Author: Alenka Frim AuthorDate: Mon Jan 8 17:38:26 2024 +0100 GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498) Update version checks and assertions of pyarrow array equality for pandas failing tests on the CI: [test-conda-python-3.10-pandas-nightly](https://github.com/ursacomputing/crossbow/actions/runs/7391976015/job/20109720695) * Closes: #39437 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/parquet/test_pandas.py | 10 +++--- python/pyarrow/tests/test_pandas.py | 16 ++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index f194d12876..b5913bf5c6 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -404,6 +404,10 @@ caratcut color clarity depth table price x y z @pytest.mark.pandas def test_backwards_compatible_column_metadata_handling(datadir): +if Version("2.2.0") <= Version(pd.__version__): +# TODO: regression in pandas +# https://github.com/pandas-dev/pandas/issues/56775 +pytest.skip("Regression in pandas 2.2.0") expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -504,9 +508,9 @@ def test_categories_with_string_pyarrow_dtype(tempdir): df2 = df2.astype("category") # categories should be converted to pa.Array -assert pa.array(df1["x"]) == pa.array(df2["x"]) -assert 
pa.array(df1["x"].cat.categories.values) == pa.array( -df2["x"].cat.categories.values) +assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist() +assert pa.array(df1["x"].cat.categories.values).to_pylist() == pa.array( +df2["x"].cat.categories.values).to_pylist() path = str(tempdir / 'cat.parquet') pq.write_table(pa.table(df1), path) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 342beaaeb5..3353bebce7 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -261,6 +261,12 @@ class TestConvertMetadata: with warnings.catch_warnings(): warnings.simplefilter(action="error") +# make_block deprecation in pandas, still under discussion +# https://github.com/pandas-dev/pandas/pull/56422 +# https://github.com/pandas-dev/pandas/issues/40226 +warnings.filterwarnings( +"ignore", "make_block is deprecated", DeprecationWarning +) _check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns(self): @@ -311,6 +317,12 @@ class TestConvertMetadata: with warnings.catch_warnings(): warnings.simplefilter(action="error") +# make_block deprecation in pandas, still under discussion +# https://github.com/pandas-dev/pandas/pull/56422 +# https://github.com/pandas-dev/pandas/issues/40226 +warnings.filterwarnings( +"ignore", "make_block is deprecated", DeprecationWarning +) _check_pandas_roundtrip(df, preserve_index=True) def test_integer_index_column(self): @@ -465,7 +477,7 @@ class TestConvertMetadata: preserve_index=True) def test_binary_column_name(self): -if Version("2.0.0") <= Version(pd.__version__) < Version("2.2.0"): +if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"): # TODO: regression in pandas, hopefully fixed in next version # https://issues.apache.org/jira/browse/ARROW-18394 # https://github.com/pandas-dev/pandas/issues/50127 @@ -3095,7 +3107,7 @@ def _fully_loaded_dataframe_example(): @pytest.mark.parametrize('columns', ([b'foo'], ['foo'])) def 
test_roundtrip_with_bytes_unicode(columns): -if Version("2.0.0") <= Version(pd.__version__) < Version("2.2.0"): +if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"): # TODO: regression in pandas, hopefully fixed in next version # https://issues.apache.org/jira/browse/ARROW-18394 # https://github.com/pandas-dev/pandas/issues/50127
(arrow) branch main updated (dc40e5fba1 -> 60b89ff0c9)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dc40e5fba1 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) add 60b89ff0c9 GH-33500: [Python] add `Table.to/from_struct_array` (#38520) No new revisions were added by this update. Summary of changes: python/pyarrow/table.pxi | 54 +++ python/pyarrow/tests/test_table.py | 75 ++ 2 files changed, 129 insertions(+)
(arrow) branch main updated: GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new dc40e5fba1 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) dc40e5fba1 is described below commit dc40e5fba1c9ace6da3de14158bb6195bed6fc58 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 16:49:14 2024 +0100 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) ### Rationale for this change In contrast to Array, RecordBatch and Schema, for the C Stream (mapping to RecordBatchReader) we haven't an equivalent factory function that can accept any Arrow-compatible object and turn it into a pyarrow object through the PyCapsule Protocol. For that reason, this proposes an explicit constructor class method for this: `RecordBatchReader.from_stream` (this is a quite generic name, so other name suggestions are certainly welcome). ### Are these changes tested? TODO * Closes: #39217 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/ipc.pxi | 43 + python/pyarrow/tests/test_array.py | 4 ++-- python/pyarrow/tests/test_ipc.py | 44 ++ python/pyarrow/tests/test_table.py | 12 +-- 4 files changed, 95 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index ae52f5cf34..da9636dfc8 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -883,6 +883,49 @@ cdef class RecordBatchReader(_Weakrefable): self.reader = c_reader return self +@staticmethod +def from_stream(data, schema=None): +""" +Create RecordBatchReader from a Arrow-compatible stream object. + +This accepts objects implementing the Arrow PyCapsule Protocol for +streams, i.e. objects that have a ``__arrow_c_stream__`` method. 
+ +Parameters +-- +data : Arrow-compatible stream object +Any object that implements the Arrow PyCapsule Protocol for +streams. +schema : Schema, default None +The schema to which the stream should be casted, if supported +by the stream object. + +Returns +--- +RecordBatchReader +""" + +if not hasattr(data, "__arrow_c_stream__"): +raise TypeError( +"Expected an object implementing the Arrow PyCapsule Protocol for " +"streams (i.e. having a `__arrow_c_stream__` method), " +f"got {type(data)!r}." +) + +if schema is not None: +if not hasattr(schema, "__arrow_c_schema__"): +raise TypeError( +"Expected an object implementing the Arrow PyCapsule Protocol for " +"schema (i.e. having a `__arrow_c_schema__` method), " +f"got {type(schema)!r}." +) +requested = schema.__arrow_c_schema__() +else: +requested = None + +capsule = data.__arrow_c_stream__(requested) +return RecordBatchReader._import_from_c_capsule(capsule) + @staticmethod def from_batches(Schema schema not None, batches): """ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d598630dc2..3dcbf399f3 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3351,8 +3351,8 @@ def test_c_array_protocol(): def __init__(self, data): self.data = data -def __arrow_c_array__(self, requested_type=None): -return self.data.__arrow_c_array__(requested_type) +def __arrow_c_array__(self, requested_schema=None): +return self.data.__arrow_c_array__(requested_schema) # Can roundtrip through the C array protocol arr = ArrayWrapper(pa.array([1, 2, 3], type=pa.int64())) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 450d26e3b7..f75ec8158a 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -1194,3 +1194,47 @@ def test_py_record_batch_reader(): with pytest.raises(TypeError): reader = pa.RecordBatchReader.from_batches(None, batches) pass + + +def 
test_record_batch_reader_from_arrow_stream(): + +class StreamWrapper: +def __init__(self, batches): +self.batches = batches + +def __arrow_c_stream__(self, requested
(arrow) branch main updated: GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ffcfabdb95 GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065) ffcfabdb95 is described below commit ffcfabdb956d72707557a1fcf113c6b7cb118f50 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 16:06:59 2024 +0100 GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065) ### Rationale for this change Currently when filtering with a nested field reference, we were taking the corresponding parquet SchemaField for just the first index of the nested path, i.e. the parent node in the Parquet schema. But logically, filtering on statistics only works for a primitive leaf node. This PR changes that logic to iterate over all indices of the FieldPath, if nested, to ensure we use the actual corresponding child leaf node of the ParquetSchema to get the statistics from. ### Are there any user-facing changes? 
No, only improving performance by doing the filtering at the row group stage, instead of afterwards on the read data * Closes: #39064 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/dataset/file_parquet.cc | 39 ++ cpp/src/arrow/dataset/file_parquet.h | 8 ++ cpp/src/arrow/dataset/file_parquet_test.cc | 6 + python/pyarrow/tests/test_dataset.py | 36 +++ 4 files changed, 79 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 1c2fd2dea6..0ce0850292 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -161,7 +161,8 @@ bool IsNan(const Scalar& value) { } std::optional ColumnChunkStatisticsAsExpression( -const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) { +const FieldRef& field_ref, const SchemaField& schema_field, +const parquet::RowGroupMetaData& metadata) { // For the remaining of this function, failure to extract/parse statistics // are ignored by returning nullptr. The goal is two fold. First // avoid an optimization which breaks the computation. 
Second, allow the @@ -180,7 +181,8 @@ std::optional ColumnChunkStatisticsAsExpression( return std::nullopt; } - return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); + return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, field_ref, + *statistics); } void AddColumnIndices(const SchemaField& schema_field, @@ -360,8 +362,9 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( -const Field& field, const parquet::Statistics& statistics) { - auto field_expr = compute::field_ref(field.name()); +const Field& field, const FieldRef& field_ref, +const parquet::Statistics& statistics) { + auto field_expr = compute::field_ref(field_ref); // Optimize for corner case where all values are nulls if (statistics.num_values() == 0 && statistics.null_count() > 0) { @@ -418,6 +421,13 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } +std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( +const Field& field, const parquet::Statistics& statistics) { + const auto field_name = field.name(); + return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)), +statistics); +} + ParquetFileFormat::ParquetFileFormat() : FileFormat(std::make_shared()) {} @@ -810,7 +820,7 @@ Status ParquetFileFragment::SetMetadata( manifest_ = std::move(manifest); statistics_expressions_.resize(row_groups_->size(), compute::literal(true)); - statistics_expressions_complete_.resize(physical_schema_->num_fields(), false); + statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false); for (int row_group : *row_groups_) { // Ensure RowGroups are indexing valid RowGroups before augmenting. 
@@ -900,16 +910,25 @@ Result> ParquetFileFragment::TestRowGroups( ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOneOrNone(*physical_schema_)); if (match.empty()) continue; -if (statistics_expressions_complete_[match[0]]) continue; -statistics_expressions_complete_[match[0]] = true; +const SchemaField* schema_field = _->schema_fields[match[0]]; + +for (size_t i = 1; i < match.indices().size(); ++i) { + if (schema
(arrow) branch main updated: GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6ce3c3f884 GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501) 6ce3c3f884 is described below commit 6ce3c3f8840cdd5294f22a6e662b6d2c0ff0a077 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 15:29:04 2024 +0100 GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501) ### Rationale for this change The latest pydata-sphinx-theme release 0.15 of a few days ago had some breakages. So let's pin to 0.14.x until 0.15 has stabilized. * Closes: #39500 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/conda_env_sphinx.txt | 2 +- docs/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index af1bfe9b78..0e50875fc1 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/requirements.txt b/docs/requirements.txt index 37a50d51dd..da2327a6df 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme +pydata-sphinx-theme==0.14 sphinx-autobuild sphinx-design sphinx-copybutton
(arrow) branch main updated: GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new a288364d97 GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272) a288364d97 is described below commit a288364d971ab9a6a3f05a903a5df83ebeddf0a0 Author: Alenka Frim AuthorDate: Mon Jan 8 14:26:13 2024 +0100 GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272) ### What changes are included in this PR? This PR updates the PrettyPrint for Timestamp type so that "Z" is printed at the end of the output string if the timezone has been defined. This way we add minimum information about the values being stored in UTC. ### Are these changes tested? Yes. ### Are there any user-facing changes? There is a change in how `TimestampArray` prints out the data. With this change "Z" would be added to the end of the string if the timezone is defined. 
* Closes: #30117 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/pretty_print_test.cc | 6 +++--- cpp/src/arrow/util/formatting.h| 7 ++- cpp/src/arrow/util/formatting_util_test.cc | 28 python/pyarrow/tests/test_types.py | 11 +++ 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 0db6ae4867..5d2256e8c5 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -350,10 +350,10 @@ TEST_F(TestPrettyPrint, DateTimeTypes) { std::vector values = { 0, 1, 2, 678 + 100 * (5 + 60 * (4 + 60 * (3 + 24 * int64_t(1, 4}; static const char* expected = R"expected([ - 1970-01-01 00:00:00.00, - 1970-01-01 00:00:00.01, + 1970-01-01 00:00:00.00Z, + 1970-01-01 00:00:00.01Z, null, - 1970-01-02 03:04:05.000678, + 1970-01-02 03:04:05.000678Z, null ])expected"; CheckPrimitive(timestamp(TimeUnit::MICRO, "Transylvania"), diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 9dcc6463fb..71bae74629 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -470,7 +470,8 @@ class StringFormatter { using value_type = int64_t; explicit StringFormatter(const DataType* type) - : unit_(checked_cast(*type).unit()) {} + : unit_(checked_cast(*type).unit()), +timezone_(checked_cast(*type).timezone()) {} template Return operator()(Duration, value_type value, Appender&& append) { @@ -503,6 +504,9 @@ class StringFormatter { std::array buffer; char* cursor = buffer.data() + buffer_size; +if (timezone_.size() > 0) { + detail::FormatOneChar('Z', ); +} detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), ); detail::FormatOneChar(' ', ); detail::Format_MM_DD(timepoint_days, ); @@ -516,6 +520,7 @@ class StringFormatter { private: TimeUnit::type unit_; + std::string timezone_; }; template diff --git 
a/cpp/src/arrow/util/formatting_util_test.cc b/cpp/src/arrow/util/formatting_util_test.cc index 9afbc91063..13f57a495d 100644 --- a/cpp/src/arrow/util/formatting_util_test.cc +++ b/cpp/src/arrow/util/formatting_util_test.cc @@ -522,6 +522,34 @@ TEST(Formatting, Timestamp) { AssertFormatting(formatter, -2203932304LL * 10LL + 8, "1900-02-28 12:34:56.8"); } + + { +auto timestamp_types = {timestamp(TimeUnit::SECOND, "US/Eastern"), +timestamp(TimeUnit::SECOND, "+01:00")}; +for (auto ty : timestamp_types) { + StringFormatter formatter(ty.get()); + + AssertFormatting(formatter, 0, "1970-01-01 00:00:00Z"); +} + } + + { +auto ty = timestamp(TimeUnit::MILLI, "Pacific/Maruesas"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.000Z"); + } + + { +auto ty = timestamp(TimeUnit::MICRO, "-42:00"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.00Z"); + } + + { +auto ty = timestamp(TimeUnit::NANO, "Mars/Mariner_Valley"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.0Z"); + } } TEST(Formatting, Interval) { diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 7600f1dd33..c8a52c6b62 100644 --- a/python/pyarrow/te
(arrow) branch main updated: GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6b93c4a0e8 GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321) 6b93c4a0e8 is described below commit 6b93c4a0e8cb5110c6c4d3746f4e8bb0a8b76ec8 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 14:21:10 2024 +0100 GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321) ### Rationale for this change This usage probably stems from a long time ago that it was required to specify the Block type, but nowadays it's good enough to just specify the dtype, and thus cutting down on our usage of internal pandas objects. Part of #35081 * Closes: #38341 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 80e313be02..3757d81a47 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -717,9 +717,15 @@ def _reconstruct_block(item, columns=None, extension_columns=None): elif 'timezone' in item: unit, _ = np.datetime_data(block_arr.dtype) dtype = make_datetimetz(unit, item['timezone']) -block = _int.make_block(block_arr, placement=placement, -klass=_int.DatetimeTZBlock, -dtype=dtype) +if _pandas_api.is_ge_v21(): +pd_arr = _pandas_api.pd.array( +block_arr.view("int64"), dtype=dtype, copy=False +) +block = _int.make_block(pd_arr, placement=placement) +else: +block = _int.make_block(block_arr, placement=placement, +klass=_int.DatetimeTZBlock, +dtype=dtype) elif 'py_array' in item: # create ExtensionBlock arr = item['py_array']
(arrow) branch main updated: GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2f9f892a00 GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) 2f9f892a00 is described below commit 2f9f892a0075d990a1b42dc97a97d490b6b08345 Author: Joris Van den Bossche AuthorDate: Thu Dec 21 15:53:41 2023 +0100 GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) ### Rationale for this change While the Arrow PyCapsule protocol itself is defined in the specification part of the docs, this PR adds a section about it in the Python user guide as well (referring to the specification for most details), where users might typically look for Python specific docs. * Closes: #39196 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- .../format/CDataInterface/PyCapsuleInterface.rst | 2 ++ docs/source/python/extending_types.rst | 32 ++ 2 files changed, 34 insertions(+) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 0c1a01d7c6..03095aa2e9 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -16,6 +16,8 @@ .. under the License. +.. 
_arrow-pycapsule-interface: + = The Arrow PyCapsule Interface = diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index ee92cebcb5..b7261005e6 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -21,6 +21,38 @@ Extending pyarrow = +Controlling conversion to (Py)Arrow with the PyCapsule Interface + + +The :ref:`Arrow C data interface ` allows moving Arrow data between +different implementations of Arrow. This is a generic, cross-language interface not +specific to Python, but for Python libraries this interface is extended with a Python +specific layer: :ref:`arrow-pycapsule-interface`. + +This Python interface ensures that different libraries that support the C Data interface +can export Arrow data structures in a standard way and recognize each other's objects. + +If you have a Python library providing data structures that hold Arrow-compatible data +under the hood, you can implement the following methods on those objects: + +- ``__arrow_c_schema__`` for schema or type-like objects. +- ``__arrow_c_array__`` for arrays and record batches (contiguous tables). +- ``__arrow_c_stream__`` for chunked tables or streams of data. + +Those methods return `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`__ +objects, and more details on the exact semantics can be found in the +:ref:`specification `. + +When your data structures have those methods defined, the PyArrow constructors +(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +supporting this protocol, and convert them to PyArrow data structures zero-copy. And the +same can be true for any other library supporting this protocol on ingesting data. 
+ +Similarly, if your library has functions that accept user-provided data, you can add +support for this protocol by checking for the presence of those methods, and +therefore accept any Arrow data (instead of hardcoding support for a specific +Arrow producer such as PyArrow). + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol
(arrow) branch main updated: MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b1fcba1b39 MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274) b1fcba1b39 is described below commit b1fcba1b395e0aedddcdab19958c14809d780d4c Author: Rok Mihevc AuthorDate: Wed Dec 20 11:06:57 2023 +0100 MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274) ### Rationale for this change local_timestamp kernel docs are linked in [cpp](https://arrow.apache.org/docs/cpp/compute.html#timezone-handling) but not in [python docs](https://arrow.apache.org/docs/python/api/compute.html#timezone-handling). ### What changes are included in this PR? This adds a rst link in python docs ### Are these changes tested? No ### Are there any user-facing changes? Change will be visible in the docs Authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- docs/source/python/api/compute.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 4ee364fcf6..b879643017 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -468,6 +468,7 @@ Timezone Handling :toctree: ../generated/ assume_timezone + local_timestamp Associative Transforms --
(arrow) branch main updated: GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 9cb78addf7 GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059) 9cb78addf7 is described below commit 9cb78addf7fcd662de1579db9dff55bd1a420fe4 Author: Alenka Frim AuthorDate: Tue Dec 19 09:45:41 2023 +0100 GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059) ### Rationale for this change `Time32Type` and `Time64Type` unit docs are not correctly documented. ### What changes are included in this PR? Update the docstrings for `Time32Type` and `Time64Type` `unit`. * Closes: #38683 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/types.pxi | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a0ddf09d69..912ee39f7d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1108,6 +1108,9 @@ cdef class Time32Type(DataType): """ Concrete class for time32 data types. +Supported time unit resolutions are 's' [second] +and 'ms' [millisecond]. + Examples Create an instance of time32 type: @@ -1124,7 +1127,7 @@ cdef class Time32Type(DataType): @property def unit(self): """ -The time unit ('s', 'ms', 'us' or 'ns'). +The time unit ('s' or 'ms'). Examples @@ -1140,6 +1143,9 @@ cdef class Time64Type(DataType): """ Concrete class for time64 data types. +Supported time unit resolutions are 'us' [microsecond] +and 'ns' [nanosecond]. + Examples Create an instance of time64 type: @@ -1156,7 +1162,7 @@ cdef class Time64Type(DataType): @property def unit(self): """ -The time unit ('s', 'ms', 'us' or 'ns'). +The time unit ('us' or 'ns'). Examples
(arrow) branch main updated: GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new f5dd3d4a1c GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276) f5dd3d4a1c is described below commit f5dd3d4a1c0efb7c8587287da0c536988bcd1559 Author: Alenka Frim AuthorDate: Tue Dec 19 09:45:00 2023 +0100 GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276) ### Rationale for this change `S3FileSystem` equals `None` currently causes bus error. ### What changes are included in this PR? Add `not None` to `FileSystem.equals` signature. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #38535 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 +- python/pyarrow/tests/test_fs.py | 7 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index ef8db31bfc..395f488144 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -505,7 +505,7 @@ cdef class FileSystem(_Weakrefable): cdef inline shared_ptr[CFileSystem] unwrap(self) nogil: return self.wrapped -def equals(self, FileSystem other): +def equals(self, FileSystem other not None): """ Parameters -- diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 59c9c44942..d0fa253e31 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -542,6 +542,13 @@ def test_filesystem_equals(): assert SubTreeFileSystem('/base', fs0) != SubTreeFileSystem('/other', fs0) +def test_filesystem_equals_none(fs): +with pytest.raises(TypeError, match="got NoneType"): +fs.equals(None) + +assert fs is not None + + def test_subtree_filesystem(): localfs = LocalFileSystem()
(arrow) branch main updated: GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3236c129d1 GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627) 3236c129d1 is described below commit 3236c129d1cbe3f73359278d1459a3f20e5c4df0 Author: Thomas Newton AuthorDate: Thu Dec 14 14:12:17 2023 + GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627) ### Rationale for this change Resolves: https://github.com/apache/arrow/issues/36441 ### What changes are included in this PR? - Add python bindings for `CacheOptions` from the C++ side. - Allow setting `cache_options` on `ParquetFragmentScanOptions` from the python side. - Adjust some of the comments on `CacheOptions` ### Are these changes tested? Yes. I added python side tests for these newly available configs similar to other configs. I have not added an integration test that ensures setting the configs on the python side leads to correctly using them on the C++ side. ### Are there any user-facing changes? Yes. The are new configs available on the python side but the defaults are unchanged. I've added/updated docstrings where relevant. 
* Closes: #36441 Lead-authored-by: Thomas Newton Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/io/caching.h | 10 ++- python/pyarrow/__init__.py | 2 +- python/pyarrow/_dataset_parquet.pyx | 21 +- python/pyarrow/_parquet.pxd | 6 +- python/pyarrow/includes/libarrow.pxd | 16 + python/pyarrow/io.pxi| 134 +++ python/pyarrow/lib.pxd | 12 python/pyarrow/tests/test_dataset.py | 28 +--- python/pyarrow/tests/test_io.py | 59 +++ 9 files changed, 271 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/io/caching.h b/cpp/src/arrow/io/caching.h index 9c1b8fe88b..e2b911fafd 100644 --- a/cpp/src/arrow/io/caching.h +++ b/cpp/src/arrow/io/caching.h @@ -42,6 +42,11 @@ struct ARROW_EXPORT CacheOptions { /// size greater than this, they are not combined int64_t range_size_limit; /// \brief A lazy cache does not perform any I/O until requested. + /// lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + /// lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + /// needs them. + /// lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + /// range that is currently being read. bool lazy; /// \brief The maximum number of ranges to be prefetched. This is only used /// for lazy cache to asynchronously read some ranges after reading the target range. @@ -56,9 +61,10 @@ struct ARROW_EXPORT CacheOptions { /// \brief Construct CacheOptions from network storage metrics (e.g. S3). /// /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in - /// milliseconds, also called call setup latency of a new S3 request. + /// milliseconds, also called call setup latency of a new read request. /// The value is a positive integer. - /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec. + /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec + /// (per connection). 
/// The value is a positive integer. /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction /// (per connection) to maximize the net data load. diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index cd66abcb44..9da94885ec 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -243,7 +243,7 @@ from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, # I/O from pyarrow.lib import (NativeFile, PythonFile, - BufferedInputStream, BufferedOutputStream, + BufferedInputStream, BufferedOutputStream, CacheOptions, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, FixedSizeBufferWriter, diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index d458ac4ee7..61e051f56c 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -42,6 +42,7 @@ from pyarrow._dataset cimport ( FileWriteOptions, Fragment, FragmentScanOptions, +CacheOptions, Partitioning, PartitioningFactory, WrittenFile @@ -693,6 +694,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): parallel using a background
(arrow) branch main updated: GH-39096: [Python] Release GIL in `.nbytes` (#39097)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6e61c5e216 GH-39096: [Python] Release GIL in `.nbytes` (#39097) 6e61c5e216 is described below commit 6e61c5e2163c8509411143752afc7f3bb37184cb Author: Hendrik Makait AuthorDate: Thu Dec 7 14:18:06 2023 +0100 GH-39096: [Python] Release GIL in `.nbytes` (#39097) ### Rationale for this change The `.nbytes` holds the GIL while computing the data size in C++, which has caused performance issues in Dask because threads were blocking each other. See #39096 ### Are these changes tested? I am not sure if additional tests are necessary here. If so, I'm happy to add them but would welcome some pointers. ### Are there any user-facing changes? No * Closes: #39096 Authored-by: Hendrik Makait Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 5 +++-- python/pyarrow/table.pxi | 15 +-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9d62bed51f..789e30d3e9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1206,8 +1206,9 @@ cdef class Array(_PandasConvertible): cdef: CResult[int64_t] c_size_res -c_size_res = ReferencedBufferSize(deref(self.ap)) -size = GetResultValue(c_size_res) +with nogil: +c_size_res = ReferencedBufferSize(deref(self.ap)) +size = GetResultValue(c_size_res) return size def get_total_buffer_size(self): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f93f595090..2f8d1abd1f 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -248,8 +248,9 @@ cdef class ChunkedArray(_PandasConvertible): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = 
ReferencedBufferSize(deref(self.chunked_array)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -2386,8 +2387,9 @@ cdef class RecordBatch(_Tabular): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.batch)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = ReferencedBufferSize(deref(self.batch)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -4337,8 +4339,9 @@ cdef class Table(_Tabular): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.table)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = ReferencedBufferSize(deref(self.table)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self):
(arrow) branch main updated: GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new cf80bd1135 GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845) cf80bd1135 is described below commit cf80bd1135bbd9cee7c0ae3e6370f93270cba250 Author: Joris Van den Bossche AuthorDate: Tue Dec 5 18:23:15 2023 +0100 GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845) ### Rationale for this change See https://github.com/apache/arrow/issues/38618#issuecomment-1821252024 and below for the analysis. When deleting the dir contents, we use a GetFileInfo with recursive FileSelector to list all objects to delete, but when doing that the file paths for directories don't end in a trailing `/`, so for deleting explicitly created directories we need to add the `kSep` here as well to properly delete the object. ### Are these changes tested? I tested them manually with an actual S3 bucket. The problem is that MinIO doesn't have the same problem, and so it's not actually tested with the test I added using our MinIO testing setup. ### Are there any user-facing changes? 
Fixes the regression * Closes: #38618 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/filesystem/s3fs.cc | 11 ++- python/pyarrow/tests/test_fs.py | 32 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 511448cb2f..62bec9b23b 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2409,7 +2409,16 @@ class S3FileSystem::Impl : public std::enable_shared_from_this file_paths; for (const auto& file_info : file_infos) { DCHECK_GT(file_info.path().size(), bucket.size()); -file_paths.push_back(file_info.path().substr(bucket.size() + 1)); +auto file_path = file_info.path().substr(bucket.size() + 1); +if (file_info.IsDirectory()) { + // The selector returns FileInfo objects for directories with a + // a path that never ends in a trailing slash, but for AWS the file + // needs to have a trailing slash to recognize it as directory + // (https://github.com/apache/arrow/issues/38618) + DCHECK_OK(internal::AssertNoTrailingSlash(file_path)); + file_path = file_path + kSep; +} +file_paths.push_back(std::move(file_path)); } scheduler->AddSimpleTask( [=, file_paths = std::move(file_paths)] { diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 1002e13471..59c9c44942 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -760,6 +760,38 @@ def test_delete_dir(fs, pathfn): fs.delete_dir(d) +def test_delete_dir_with_explicit_subdir(fs, pathfn): +# GH-38618: regression with AWS failing to delete directories, +# depending on whether they were created explicitly. Note that +# Minio doesn't reproduce the issue, so this test is not a regression +# test in itself. 
+skip_fsspec_s3fs(fs) + +d = pathfn('directory/') +nd = pathfn('directory/nested/') + +# deleting dir with explicit subdir +fs.create_dir(d) +fs.create_dir(nd) +fs.delete_dir(d) +dir_info = fs.get_file_info(d) +assert dir_info.type == FileType.NotFound + +# deleting dir with blob in explicit subdir +d = pathfn('directory2') +nd = pathfn('directory2/nested') +f = pathfn('directory2/nested/target-file') + +fs.create_dir(d) +fs.create_dir(nd) +with fs.open_output_stream(f) as s: +s.write(b'data') + +fs.delete_dir(d) +dir_info = fs.get_file_info(d) +assert dir_info.type == FileType.NotFound + + def test_delete_dir_contents(fs, pathfn): skip_fsspec_s3fs(fs)
(arrow) branch main updated: GH-38950: [Docs] Fix spelling (#38951)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3531396803 GH-38950: [Docs] Fix spelling (#38951) 3531396803 is described below commit 353139680311e809d2413ea46e17e1656069ac5e Author: Josh Soref <2119212+jso...@users.noreply.github.com> AuthorDate: Fri Dec 1 12:33:09 2023 -0500 GH-38950: [Docs] Fix spelling (#38951) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to docs/ ### Are these changes tested? ### Are there any user-facing changes? * Closes: #38950 Lead-authored-by: Josh Soref <2119212+jso...@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- docs/source/_static/theme_overrides.css| 6 ++-- docs/source/conf.py| 2 +- docs/source/cpp/acero/developer_guide.rst | 34 +++--- docs/source/cpp/acero/overview.rst | 4 +-- docs/source/cpp/acero/substrait.rst| 2 +- docs/source/cpp/acero/user_guide.rst | 4 +-- docs/source/cpp/compute.rst| 6 ++-- docs/source/cpp/datatypes.rst | 2 +- .../cpp/examples/compute_and_write_example.rst | 2 +- .../cpp/examples/dataset_skyhook_scan_example.rst | 4 +-- docs/source/cpp/overview.rst | 2 +- docs/source/cpp/tutorials/basic_arrow.rst | 2 +- .../developers/continuous_integration/archery.rst | 2 +- .../developers/continuous_integration/crossbow.rst | 4 +-- .../developers/continuous_integration/docker.rst | 4 +-- .../developers/continuous_integration/overview.rst | 4 +-- docs/source/developers/documentation.rst | 2 +- docs/source/developers/guide/documentation.rst | 2 +- docs/source/developers/guide/resources.rst | 2 +- .../guide/step_by_step/finding_issues.rst | 2 +- .../developers/guide/tutorials/r_tutorial.rst | 2 +- docs/source/developers/java/building.rst | 14 - docs/source/developers/release.rst | 6 ++-- docs/source/developers/reviewing.rst | 4 +-- 
docs/source/format/ADBC.rst| 4 +-- docs/source/format/CDataInterface.rst | 2 +- docs/source/format/CDeviceDataInterface.rst| 8 ++--- docs/source/format/CanonicalExtensions.rst | 2 +- docs/source/format/Columnar.rst| 2 +- docs/source/java/dataset.rst | 4 +-- docs/source/python/api/compute.rst | 2 +- docs/source/python/dataset.rst | 2 +- docs/source/python/getting_involved.rst| 2 +- docs/source/python/integration.rst | 2 +- docs/source/python/integration/python_java.rst | 2 +- docs/source/python/interchange_protocol.rst| 16 +- docs/source/python/memory.rst | 2 +- docs/source/python/parquet.rst | 2 +- 38 files changed, 85 insertions(+), 85 deletions(-) diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index bf84267aea..58f4554d11 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -33,7 +33,7 @@ } } -/* Contibuting landing page overview cards */ +/* Contributing landing page overview cards */ .contrib-card { border-radius: 0; @@ -68,7 +68,7 @@ } /* This is the bootstrap CSS style for "table-striped". 
Since the theme does -not yet provide an easy way to configure this globaly, it easier to simply +not yet provide an easy way to configure this globally, it easier to simply include this snippet here than updating each table in all rst files to add ":class: table-striped" */ @@ -76,7 +76,7 @@ add ":class: table-striped" */ background-color: rgba(0, 0, 0, 0.05); } -/* Iprove the vertical spacing in the C++ API docs +/* Improve the vertical spacing in the C++ API docs (ideally this should be upstreamed to the pydata-sphinx-theme */ dl.cpp dd p { diff --git a/docs/source/conf.py b/docs/source/conf.py index f11d78fe05..cde0c2b31f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -139,7 +139,7 @@ autodoc_default_options = { breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} breathe_default_project = "arrow_cpp" -# Overriden conditionally below +# Overridden conditionally below autodoc_mock_imports = [] # copybutton configuration diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index c893e41ff8..331cd833b5 100644 ---
(arrow) branch main updated: GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 530a63a81b GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029) 530a63a81b is described below commit 530a63a81b11d68bd66dc0e32c82e7e56030d762 Author: Joris Van den Bossche AuthorDate: Fri Dec 1 17:45:59 2023 +0100 GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029) The test requires a downstream fix in dask (because of a valid change in Arrow), until then temporarily skipping this test (see the issue for more details). * Closes: #39028 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/integration_dask.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/scripts/integration_dask.sh b/ci/scripts/integration_dask.sh index f91d21b921..bf306dc652 100755 --- a/ci/scripts/integration_dask.sh +++ b/ci/scripts/integration_dask.sh @@ -32,7 +32,9 @@ python -c "import dask.dataframe" # pytest -sv --pyargs dask.bytes.tests.test_local # The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually -pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" +# - The 'test_categorize_info' test is failing because of change in StringArray's nbytes and +# an upstream fix (https://github.com/apache/arrow/issues/39028) +pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" -k "not test_categorize_info" pytest -v --pyargs dask.dataframe.io.tests.test_orc pytest -v --pyargs dask.dataframe.io.tests.test_parquet \ -m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings"
(arrow) branch main updated: GH-38857: [Python] Fix append mode for cython 2 (#39027)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2bd8e06b48 GH-38857: [Python] Fix append mode for cython 2 (#39027) 2bd8e06b48 is described below commit 2bd8e06b4867acea3dc5479e991998672804e8ea Author: Joris Van den Bossche AuthorDate: Fri Dec 1 14:48:32 2023 +0100 GH-38857: [Python] Fix append mode for cython 2 (#39027) ### Rationale for this change Small fixup of the change in https://github.com/apache/arrow/pull/38820 to fix the build failure on cython 2 (nightly crossbow build) * Closes: #38857 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/io.pxi| 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 59b63b5fb7..b0b89f8614 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1386,7 +1386,8 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CResult[shared_ptr[COutputStream]] Open(const c_string& path) @staticmethod -CResult[shared_ptr[COutputStream]] Open(const c_string& path, c_bool append) +CResult[shared_ptr[COutputStream]] OpenWithAppend" Open"( +const c_string& path, c_bool append) int file_descriptor() diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 3086845efa..6f39166401 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1167,7 +1167,9 @@ cdef class OSFile(NativeFile): cdef _open_writable(self, c_string path, c_bool append=False): with nogil: -self.output_stream = GetResultValue(FileOutputStream.Open(path, append)) +self.output_stream = GetResultValue( +FileOutputStream.OpenWithAppend(path, append) +) self.is_writable = True self._is_appending = append
(arrow) branch main updated: GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2fadab2aa6 GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374) 2fadab2aa6 is described below commit 2fadab2aa65425ec4e392e5cf8fd2082f3685212 Author: Joris Van den Bossche AuthorDate: Fri Dec 1 13:11:35 2023 +0100 GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374) ### Rationale for this change Avoiding a deprecation warning from pandas * Closes: #38342 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas-shim.pxi | 11 --- python/pyarrow/pandas_compat.py | 13 ++--- python/pyarrow/table.pxi| 6 +++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index a0c0cabf6d..273575b779 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -37,7 +37,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype bint has_sparse bint _pd024 -bint _is_v1 +bint _is_v1, _is_ge_v21 def __init__(self): self._tried_importing_pandas = False @@ -74,8 +74,9 @@ cdef class _PandasAPIShim(object): "installed. 
Therefore, pandas-specific integration is not " "used.".format(self._version), stacklevel=2) return -elif self._loose_version < Version('2.0.0'): -self._is_v1 = True + +self._is_v1 = self._loose_version < Version('2.0.0') +self._is_ge_v21 = self._loose_version >= Version('2.1.0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -158,6 +159,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_v1 +def is_ge_v21(self): +self._check_import() +return self._is_ge_v21 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index be29f68a13..80e313be02 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -744,9 +744,11 @@ def make_datetimetz(unit, tz): return _pandas_api.datetimetz_type(unit, tz=tz) -def table_to_blockmanager(options, table, categories=None, - ignore_metadata=False, types_mapper=None): +def table_to_dataframe( +options, table, categories=None, ignore_metadata=False, types_mapper=None +): from pandas.core.internals import BlockManager +from pandas import DataFrame all_columns = [] column_indexes = [] @@ -770,7 +772,12 @@ def table_to_blockmanager(options, table, categories=None, blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes) axes = [columns, index] -return BlockManager(blocks, axes) +mgr = BlockManager(blocks, axes) +if _pandas_api.is_ge_v21(): +df = DataFrame._from_mgr(mgr, mgr.axes) +else: +df = DataFrame(mgr) +return df # Set of the string repr of all numpy dtypes that can be stored in a pandas diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index bbed789553..f93f595090 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4191,12 +4191,12 @@ cdef class Table(_Tabular): def _to_pandas(self, options, categories=None, ignore_metadata=False, types_mapper=None): -from pyarrow.pandas_compat import table_to_blockmanager -mgr = 
table_to_blockmanager( +from pyarrow.pandas_compat import table_to_dataframe +df = table_to_dataframe( options, self, categories, ignore_metadata=ignore_metadata, types_mapper=types_mapper) -return pandas_api.data_frame(mgr) +return df @property def schema(self):