(arrow) branch main updated (01d2fa0d46 -> b51e997df7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 01d2fa0d46 GH-41307: [Java] Use org.apache:apache parent pom version 31 (#41772) add b51e997df7 GH-41960: Expose new S3 option check_directory_existence_before_creation (#41972) No new revisions were added by this update. Summary of changes: python/pyarrow/_s3fs.pyx| 20 python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 5 + 3 files changed, 22 insertions(+), 4 deletions(-)
(arrow) branch main updated (37d0acdccb -> 0b5f0a2af1)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 37d0acdccb GH-41983: [Dev] Run issue labeling bot only when opening an issue (not editing) (#41986) add 0b5f0a2af1 GH-41502: [Python] Fix reading column index with decimal values (#41503) No new revisions were added by this update. Summary of changes: python/pyarrow/pandas_compat.py | 5 + python/pyarrow/tests/test_pandas.py | 11 +++ 2 files changed, 16 insertions(+)
(arrow) branch main updated: GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 255dbf990c GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757) 255dbf990c is described below commit 255dbf990c3d3e5fb1270a2a11efe0af2be195ab Author: Alenka Frim AuthorDate: Fri May 31 10:09:54 2024 +0200 GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757) ### Rationale for this change When constructing a `MapArray` with `FromArrays` one can not supply a `null_bitmap`. ### What changes are included in this PR? Optional `null_bitmap` argument is added to `MapArray::FromArrays`. ### Are these changes tested? TODO (have them locally, need to clean them up and commit. ### Are there any user-facing changes? No. * GitHub Issue: #41684 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_list_test.cc | 17 + cpp/src/arrow/array/array_nested.cc| 45 +++--- cpp/src/arrow/array/array_nested.h | 9 --- python/pyarrow/array.pxi | 11 ++--- python/pyarrow/includes/libarrow.pxd | 8 -- python/pyarrow/tests/test_array.py | 34 + 6 files changed, 102 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index e79ce6fe17..55f91dc341 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1368,6 +1368,23 @@ TEST_F(TestMapArray, FromArrays) { ASSERT_EQ(keys_with_null->length(), tmp_items->length()); ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1, keys_with_null, tmp_items, pool_)); + + // With null_bitmap + ASSERT_OK_AND_ASSIGN(auto map7, MapArray::FromArrays(offsets1, keys, items, pool_, + offsets3->data()->buffers[0])); + ASSERT_OK(map7->Validate()); + MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys, items, 
+ offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected7, *map7); + + // Null bitmap and offset with null + ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_, + offsets3->data()->buffers[0])); + + // Null bitmap and offset with offset + ASSERT_RAISES(NotImplemented, +MapArray::FromArrays(offsets3->Slice(2), keys, items, pool_, + offsets3->data()->buffers[0])); } TEST_F(TestMapArray, FromArraysEquality) { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 67a499c2b8..bb5c6bf018 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -807,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, Result> MapArray::FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, -MemoryPool* pool) { +MemoryPool* pool, const std::shared_ptr& null_bitmap) { using offset_type = typename MapType::offset_type; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -827,6 +827,15 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } + if (null_bitmap != nullptr && offsets->null_count() > 0) { +return Status::Invalid( +"Ambiguous to specify both validity map and offsets with nulls"); + } + + if (null_bitmap != nullptr && offsets->offset() != 0) { +return Status::NotImplemented("Null bitmap with offsets slice not supported."); + } + if (offsets->null_count() > 0) { ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); @@ -836,24 +845,32 @@ Result> MapArray::FromArraysInternal( using OffsetArrayType = typename TypeTraits::ArrayType; const auto& typed_offsets = checked_cast(*offsets); - auto buffers = BufferVector({nullptr, typed_offsets.values()}); + + BufferVector buffers; + int64_t null_count; + if (null_bitmap != nullptr) { +buffers = BufferVector({std::move(null_bitmap), 
typed_offsets.values()}); +null_count = null_bitmap->size(); + } else { +buffers = BufferVector({null_bitmap, typed_offsets.values()}); +null_count = 0; + } return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, -items, /*null_count=*/0, offsets-&g
(arrow) branch main updated: GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 31fe24dd33 GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685) 31fe24dd33 is described below commit 31fe24dd3345d387ba52d46c2915a909a5667813 Author: Joris Van den Bossche AuthorDate: Fri May 31 09:48:54 2024 +0200 GH-41126: [Python] Basic bindings for Device and MemoryManager classes (#41685) ### Rationale for this change Add bindings for the C++ `arrow::Device` and `arrow::MemoryManager` classes. ### What changes are included in this PR? Basic bindings by adding the `pyarrow.Device` and `pyarrow.MemoryManager` classes, and just tested for CPU. What is not included here are additional methods on the `MemoryManager` class (eg to allocate or copy buffers), and this is also not yet tested for CUDA. Planning to do this as follow-ups, and first doing those basic bindings should enable further enhancements to be done in parallel. ### Are these changes tested? Yes, for the CPU device only. 
* GitHub Issue: #41126 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/__init__.py | 3 + python/pyarrow/device.pxi| 162 +++ python/pyarrow/includes/libarrow.pxd | 35 python/pyarrow/io.pxi| 33 +++ python/pyarrow/lib.pxd | 20 + python/pyarrow/lib.pyx | 3 + python/pyarrow/tests/test_device.py | 43 ++ python/pyarrow/tests/test_misc.py| 2 + 8 files changed, 301 insertions(+) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 936f473697..e52e0d242b 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -236,6 +236,9 @@ from pyarrow.lib import (null, bool_, RunEndEncodedScalar, ExtensionScalar) # Buffers, allocation +from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, + default_cpu_memory_manager) + from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, Codec, compress, decompress, allocate_buffer) diff --git a/python/pyarrow/device.pxi b/python/pyarrow/device.pxi new file mode 100644 index 00..6e60347520 --- /dev/null +++ b/python/pyarrow/device.pxi @@ -0,0 +1,162 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + + +cpdef enum DeviceAllocationType: +CPU = CDeviceAllocationType_kCPU +CUDA = CDeviceAllocationType_kCUDA +CUDA_HOST = CDeviceAllocationType_kCUDA_HOST +OPENCL = CDeviceAllocationType_kOPENCL +VULKAN = CDeviceAllocationType_kVULKAN +METAL = CDeviceAllocationType_kMETAL +VPI = CDeviceAllocationType_kVPI +ROCM = CDeviceAllocationType_kROCM +ROCM_HOST = CDeviceAllocationType_kROCM_HOST +EXT_DEV = CDeviceAllocationType_kEXT_DEV +CUDA_MANAGED = CDeviceAllocationType_kCUDA_MANAGED +ONEAPI = CDeviceAllocationType_kONEAPI +WEBGPU = CDeviceAllocationType_kWEBGPU +HEXAGON = CDeviceAllocationType_kHEXAGON + + +cdef object _wrap_device_allocation_type(CDeviceAllocationType device_type): +return DeviceAllocationType( device_type) + + +cdef class Device(_Weakrefable): +""" +Abstract interface for hardware devices + +This object represents a device with access to some memory spaces. +When handling a Buffer or raw memory address, it allows deciding in which +context the raw memory address should be interpreted +(e.g. CPU-accessible memory, or embedded memory on some particular GPU). +""" + +def __init__(self): +raise TypeError("Do not call Device's constructor directly, " +"use the device attribute of the MemoryManager instead.") + +cdef void init(s
(arrow) branch main updated: GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 065a6da852 GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759) 065a6da852 is described below commit 065a6da8520bd65fb4f59b2e3e496fe1124ac685 Author: Antoine Pitrou AuthorDate: Wed May 22 10:37:52 2024 +0200 GH-41748: [Python][Parquet] Update BYTE_STREAM_SPLIT description in write_table() docstring (#41759) ### Rationale for this change In PR #40094 (issue GH-39978), we forgot to update the `write_table` docstring with an accurate description of the supported data types for BYTE_STREAM_SPLIT. ### Are these changes tested? No (only a doc change). ### Are there any user-facing changes? No. * GitHub Issue: #41748 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/parquet/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index f54a203c87..81798b1544 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -797,8 +797,9 @@ use_byte_stream_split : bool or list, default False Specify if the byte_stream_split encoding should be used in general or only for some columns. If both dictionary and byte_stream_stream are enabled, then dictionary is preferred. -The byte_stream_split encoding is valid only for floating-point data types -and should be combined with a compression codec. +The byte_stream_split encoding is valid for integer, floating-point +and fixed-size binary data types (including decimals); it should be +combined with a compression codec so as to achieve size reduction. column_encoding : string or dict, default None Specify the encoding scheme on a per column basis. 
Can only be used when ``use_dictionary`` is set to False, and
(arrow) branch main updated (1f07404dac -> e254c43c09)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 1f07404dac GH-41321: [C++][Parquet] More strict Parquet level checking (#41346) add e254c43c09 GH-41389: [Python] Expose byte_width and bit_width of ExtensionType in terms of the storage type (#41413) No new revisions were added by this update. Summary of changes: python/pyarrow/includes/libarrow.pxd| 2 ++ python/pyarrow/tests/test_extension_type.py | 30 +++-- python/pyarrow/types.pxi| 18 + 3 files changed, 48 insertions(+), 2 deletions(-)
(arrow) branch main updated: GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 14b8ca5317 GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689) 14b8ca5317 is described below commit 14b8ca53171435113a0f0f0c4ff1063d12543bc4 Author: Joris Van den Bossche AuthorDate: Fri May 17 14:35:02 2024 +0200 GH-41688: [Dev] Include all relevant CMakeLists.txt files in cmake-format precommit hook (#41689) ### Rationale for this change Some CMakeLists.txt files are not included in the pre-commit hook (causing failures on CI through archery if you rely on the pre-commit hook locally) ### What changes are included in this PR? Include all CMakeLists.txt files by default anywhere in the repo, and explicitly exclude the ones we don't want (vendored files). In practice, compared to the current set of files covered by the hook, those new files are included in the search: 'cpp/CMakeLists.txt', 'java/CMakeLists.txt', 'matlab/CMakeLists.txt', 'python/CMakeLists.txt' ### Are these changes tested? 
Yes * GitHub Issue: #41688 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .pre-commit-config.yaml | 7 ++- dev/archery/archery/utils/lint.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7dcc1c9816..1e4b91e27e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,17 +116,14 @@ repos: name: CMake Format files: >- ( + ?.*CMakeLists\.txt$| ?^ci/.*/.*\.cmake$| ?^cpp/.*/.*\.cmake\.in$| ?^cpp/.*/.*\.cmake$| - ?^cpp/.*/CMakeLists\.txt$| - ?^go/.*/CMakeLists\.txt$| - ?^java/.*/CMakeLists\.txt$| - ?^matlab/.*/CMakeLists\.txt$| - ?^python/.*/CMakeLists\.txt$| ) exclude: >- ( + ?^ci/conan/all/.*CMakeLists\.txt$| ?^cpp/cmake_modules/FindNumPy\.cmake$| ?^cpp/cmake_modules/FindPythonLibsNew\.cmake$| ?^cpp/cmake_modules/UseCython\.cmake$| diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 108c9ded36..92b7f79fc1 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -157,7 +157,7 @@ def cmake_linter(src, fix=False): 'go/**/CMakeLists.txt', 'java/**/CMakeLists.txt', 'matlab/**/CMakeLists.txt', -'python/CMakeLists.txt', +'python/**/CMakeLists.txt', ], exclude_patterns=[ 'cpp/cmake_modules/FindNumPy.cmake',
(arrow) branch main updated: MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2dbc5e26dc MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500) 2dbc5e26dc is described below commit 2dbc5e26dcbc6826b4eb7a330fa8090836f6b727 Author: William Ayd AuthorDate: Fri May 17 04:24:56 2024 -0400 MINOR: [Python][Docs] Use CMake presets to simplify Python build installation (#41500) ### Rationale for this change This should simplify the number of steps users have to go through to get a working Python installation from source Authored-by: Will Ayd Signed-off-by: Joris Van den Bossche --- docs/source/developers/python.rst | 29 - docs/source/python/data.rst | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index be9fac067c..e84cd25201 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -302,10 +302,24 @@ created above (stored in ``$ARROW_HOME``): .. code-block:: - $ mkdir arrow/cpp/build - $ pushd arrow/cpp/build - $ cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DCMAKE_INSTALL_LIBDIR=lib \ + $ cmake -S arrow/cpp -B arrow/cpp/build \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + --preset ninja-release-python + $ cmake --build arrow/cpp/build --target install + +``ninja-release-python`` is not the only preset available - if you would like a +build with more features like CUDA, Flight and Gandiva support you may opt for +the ``ninja-release-python-maximal`` preset. If you wanted less features, (i.e. +removing ORC and dataset support) you could opt for +``ninja-release-python-minimal``. Changing the word ``release`` to ``debug`` +with any of the aforementioned presets will generate a debug build of Arrow. 
+ +The presets are provided as a convenience, but you may instead opt to +specify the individual components: + +.. code-block:: + $ cmake -S arrow/cpp -B arrow/cpp/build \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DCMAKE_BUILD_TYPE=Debug \ -DARROW_BUILD_TESTS=ON \ -DARROW_COMPUTE=ON \ @@ -321,11 +335,8 @@ created above (stored in ``$ARROW_HOME``): -DARROW_WITH_SNAPPY=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ - -DPARQUET_REQUIRE_ENCRYPTION=ON \ - .. - $ make -j4 - $ make install - $ popd + -DPARQUET_REQUIRE_ENCRYPTION=ON + $ cmake --build arrow/cpp/build --target install -j4 There are a number of optional components that can be switched ON by adding flags with ``ON``: diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index f17475138c..598c8c125f 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -561,7 +561,7 @@ schema without having to get any of the batches.:: It can also be sent between languages using the :ref:`C stream interface `. -Conversion of RecordBatch do Tensor +Conversion of RecordBatch to Tensor --- Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily
(arrow) branch main updated: GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6a9e2d53b5 GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538) 6a9e2d53b5 is described below commit 6a9e2d53b5cdd0f387bfcd44e9549f122fac93e5 Author: Jacob Hayes AuthorDate: Fri May 17 03:07:02 2024 -0400 GH-38575: [Python] Include metadata when creating pa.schema from PyCapsule (#41538) ### Rationale for this change Fixes the dropped `pa.schema` metadata reported in #38575, which was introduced in #37797. ### What changes are included in this PR? Passes through the `metadata` to the short-circuited `Schema` created with `_import_from_c_capsule`. ### Are these changes tested? Yes - added `metadata` to the existing test. ### Are there any user-facing changes? I'm not sure this quite rises to the `(b) a bug that caused incorrect or invalid data to be produced,` condition, but I added that note to be safe since the resulting schema is "incorrect" (and broke some round-trip tests on my end after a pyarrow update): **This PR contains a "Critical Fix".** * GitHub Issue: #38575 Lead-authored-by: Jacob Hayes Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 5 - python/pyarrow/types.pxi | 5 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 4f66a6f416..f7b6040f51 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -1331,10 +1331,13 @@ def test_schema_import_c_schema_interface(): def __arrow_c_schema__(self): return self.schema.__arrow_c_schema__() -schema = pa.schema([pa.field("field_name", pa.int32())]) +schema = pa.schema([pa.field("field_name", pa.int32())], metadata={"a": "b"}) +assert schema.metadata 
== {b"a": b"b"} wrapped_schema = Wrapper(schema) assert pa.schema(wrapped_schema) == schema +assert pa.schema(wrapped_schema).metadata == {b"a": b"b"} +assert pa.schema(wrapped_schema, metadata={"a": "c"}).metadata == {b"a": b"c"} def test_field_import_c_schema_interface(): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 018099ae7e..480f19c81d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -5332,7 +5332,10 @@ def schema(fields, metadata=None): if isinstance(fields, Mapping): fields = fields.items() elif hasattr(fields, "__arrow_c_schema__"): -return Schema._import_from_c_capsule(fields.__arrow_c_schema__()) +result = Schema._import_from_c_capsule(fields.__arrow_c_schema__()) +if metadata is not None: +result = result.with_metadata(metadata) +return result for item in fields: if isinstance(item, tuple):
(arrow-site) branch asf-site updated: MINOR: Update docs/python/install.html with GH-41105 (#521)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/arrow-site.git The following commit(s) were added to refs/heads/asf-site by this push: new 22b975f4ca7 MINOR: Update docs/python/install.html with GH-41105 (#521) 22b975f4ca7 is described below commit 22b975f4ca718883b472a78dc64933b8a7cc3586 Author: Bryce Mecum AuthorDate: Thu May 16 22:57:52 2024 -0800 MINOR: Update docs/python/install.html with GH-41105 (#521) My attempt at updating docs/python/install.html with teh changes in https://github.com/apache/arrow/pull/41135. I generated the docs locally, copied the generated install.html into arrow-site, and then only committed the hunks I know changed. I didn't commit the entire changed file since the diff included many more changes, some of which looked like they'd break the page. --- docs/python/install.html | 115 +++ 1 file changed, 115 insertions(+) diff --git a/docs/python/install.html b/docs/python/install.html index d012eceb315..124d1fdf796 100644 --- a/docs/python/install.html +++ b/docs/python/install.html @@ -1549,6 +1549,13 @@ Linux distributions. We strongly recommend using a 64-bit system. conda install -c conda-forge pyarrow + +Note +While the pyarrow https://conda-forge.org/;>conda-forge package is +the right choice for most users, both a minimal and maximal variant of the +package exist, either of which may be better for your use case. See +Differences between conda-forge packages. + Using Pip# @@ -1597,6 +1604,114 @@ a custom path to the database from Python: + +Differences between conda-forge packages# +On https://conda-forge.org/;>conda-forge, PyArrow is published as three +separate packages, each providing varying levels of functionality. This is in +contrast to PyPi, where only a single PyArrow package is provided. 
+The purpose of this split is to minimize the size of the installed package for +most users (pyarrow), provide a smaller, minimal package for specialized use +cases (pyarrow-core), while still providing a complete package for users who +require it (pyarrow-all). What was historically pyarrow on +https://conda-forge.org/;>conda-forge is now pyarrow-all, though most +users can continue using pyarrow. +The pyarrow-core package includes the following functionality: + +Data Types and In-Memory Data Model +Compute Functions (i.e., pyarrow.compute) +Memory and IO Interfaces +Streaming, Serialization, and IPC (i.e., pyarrow.ipc) +Filesystem Interface (i.e., pyarrow.fs. Note: It’s planned to move cloud fileystems (i.e., S3, GCSFile formats: Arrow/Feather, JSON, CSV, ORC (but not Parquet) + +The pyarrow package adds the following: + +Acero (i.e., pyarrow.acero) +Tabular Datasets (i.e., pyarrow.dataset) +Parquet (i.e., pyarrow.parquet) +Substrait (i.e., pyarrow.substrait) + +Finally, pyarrow-all adds: + +Arrow Flight RPC and Flight SQL (i.e., pyarrow.flight) +Gandiva (i.e., pyarrow.gandiva) + +The following table lists the functionality provided by each package and may be +useful when deciding to use one package over another or when +Creating A Custom Selection. + + +Component +Package +pyarrow-core +pyarrow +pyarrow-all + +Core +pyarrow-core +✓ +✓ +✓ + +Parquet +libparquet + +✓ +✓ + +Dataset +libarrow-dataset + +✓ +✓ + +Acero +libarrow-acero + +✓ +✓ + +Substrait +libarrow-substrait + +✓ +✓ + +Flight +libarrow-flight + + +✓ + +Flight SQL +libarrow-flight-sql + + +✓ + +Gandiva +libarrow-gandiva + + +✓ + + + + +Creating A Custom Selection# +If you know which components you need and want to control what’s installed, you +can create a custom selection of packages to include only the extra features you +need. 
For example, to install pyarrow-core and add support for reading and +writing Parquet, install libparquet alongside pyarrow-core: +conda install -c conda-forge pyarrow-core libparquet + + +Or if you wish to use pyarrow but need support for Flight RPC: +conda install -c conda-forge pyarrow libarrow-flight + + + +
(arrow) branch main updated: GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 1c546fb3c1 GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) 1c546fb3c1 is described below commit 1c546fb3c130fc6a4f3e06ad31dc49d923785104 Author: Joris Van den Bossche AuthorDate: Thu May 16 14:15:57 2024 +0200 GH-41480: [Python] Building PyArrow: enable/disable python components by default based on availability in Arrow C++ (#41494) ### Rationale for this change Currently, when building pyarrow from source, one needs to manually enable the optional components through setting `PYARROW_WITH_...` environment variables. However, we could also make a default choice of components based on which ones where enabled in the Arrow C++ build. ### What changes are included in this PR? Set defaults for the various `PYARROW_BUILD_` based on the `ARROW_` setting. Keep the current `PYARROW_WITH_` environment variables working to allow to override this default. ### Are there any user-facing changes? 
No * GitHub Issue: #41480 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/appveyor-cpp-build.bat | 1 - python/CMakeLists.txt | 115 +-- python/setup.py | 134 +- 3 files changed, 123 insertions(+), 127 deletions(-) diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437..f688fbb63a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357a..07acb9e31a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT 
CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +macro(define_option name description arrow_option) + set("PYARROW_${name}" + "AUTO" + CACHE STRING ${description}) + + if("${PYARROW_${name}}" STREQUAL "AUTO") +# by default, first check if env variable exists, otherwise use Arrow C++ config +set(env_variable "PYARROW_WITH_${name}") +if(DEFINED ENV{${env_variable}}) + if($ENV{${env_variable}}) +set("PYARROW_BUILD_${name}" ON) + else() +set("PYARROW_BUILD_${name}" OFF) + endif() +else() + if(${arrow_option}) +set("PYARROW_BUILD_${name}" ON) + else() +set("PYARROW_BUILD_${name}" OFF) + endif() +endif() + else() +if("${PYARROW_${name}}") + set("PYARROW_BUILD_${name}" ON) +else() + set("PYARROW_BUILD_${name}" OFF) +endif() + endif() +endmacro() + +define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) +define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) +define_option(DATA
(arrow-nanoarrow) branch main updated: fix(python): Add iterator for null/na type (#467)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new 65e90b7f fix(python): Add iterator for null/na type (#467) 65e90b7f is described below commit 65e90b7f340ca40901a30fee577453c08abdba77 Author: Dewey Dunnington AuthorDate: Tue May 14 12:58:41 2024 -0300 fix(python): Add iterator for null/na type (#467) Closes #465 --- python/src/nanoarrow/iterator.py | 6 +- python/tests/test_iterator.py| 5 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py index 2364ea82..3ff1714f 100644 --- a/python/src/nanoarrow/iterator.py +++ b/python/src/nanoarrow/iterator.py @@ -17,7 +17,7 @@ import warnings from functools import cached_property -from itertools import islice +from itertools import islice, repeat from typing import Iterable, Tuple from nanoarrow._lib import CArrayView, CArrowType @@ -482,6 +482,9 @@ class PyIterator(ArrayViewBaseIterator): else: return iter(items) +def _null_iter(self, offset, length): +return repeat(None, length) + class RowTupleIterator(PyIterator): """Iterate over rows of a struct array (stream) where each row is a @@ -545,6 +548,7 @@ def _get_tzinfo(tz_string, strategy=None): _ITEMS_ITER_LOOKUP = { +CArrowType.NA: "_null_iter", CArrowType.BINARY: "_binary_iter", CArrowType.LARGE_BINARY: "_binary_iter", CArrowType.STRING: "_string_iter", diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py index ff0b34e2..fe6e8bbd 100644 --- a/python/tests/test_iterator.py +++ b/python/tests/test_iterator.py @@ -513,3 +513,8 @@ def test_iterator_extension(): with pytest.warns(UnregisteredExtensionWarning): assert list(iter_py(extension_array)) == [1, 2, 3] + + +def test_iterator_null(): +array = na.c_array_from_buffers(na.null(), 3, []) +assert list(iter_py(array)) == 
[None, None, None]
(arrow) branch main updated (fd84ec0b1a -> d7c22601e7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from fd84ec0b1a GH-39129 [Python] pa.array: add check for byte-swapped numpy arrays inside python objects (#41549) add d7c22601e7 GH-41464: [Python] Fix StructArray.sort() for by=None (#41495) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 7 +++ python/pyarrow/tests/test_array.py | 8 2 files changed, 11 insertions(+), 4 deletions(-)
(arrow) branch main updated (fc7c723bab -> fd84ec0b1a)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from fc7c723bab MINOR: [Go] Bump golang.org/x/tools from 0.20.0 to 0.21.0 in /go (#41639) add fd84ec0b1a GH-39129 [Python] pa.array: add check for byte-swapped numpy arrays inside python objects (#41549) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 python/pyarrow/tests/test_array.py | 24 ++ 2 files changed, 28 insertions(+)
(arrow) branch main updated (52321377cc -> b719408f4a)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 52321377cc GH-40997: [C++] Get null_bit_id according to are_cols_in_encoding_order in NullUpdateColumnToRow_avx2 (#40998) add b719408f4a GH-40560: [Python] RunEndEncodedArray.from_arrays: bugfix for Array arguments (#40560) (#41093) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_array.py | 11 +++ 2 files changed, 12 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3c67091f93 GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) 3c67091f93 is described below commit 3c67091f93223f2d12f5a73d3e5bc51e7b389a00 Author: Thomas A Caswell AuthorDate: Thu May 2 08:18:21 2024 -0400 GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) ### Rationale for this change These methods are not actually used and will be removed from Cython in an upcoming release. Closes #41491 ### What changes are included in this PR? ### Are these changes tested? Trust CI ### Are there any user-facing changes? No, this code should never be actually used. * GitHub Issue: #41491 Authored-by: Thomas A Caswell Signed-off-by: Joris Van den Bossche --- python/pyarrow/io.pxi | 47 +-- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7890bf4b2d..9e8026deb4 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1446,27 +1446,6 @@ cdef class Buffer(_Weakrefable): buffer.strides = self.strides buffer.suboffsets = NULL -def __getsegcount__(self, Py_ssize_t *len_out): -if len_out != NULL: -len_out[0] = self.size -return 1 - -def __getreadbuffer__(self, Py_ssize_t idx, void **p): -if idx != 0: -raise SystemError("accessing nonexistent buffer segment") -if p != NULL: -p[0] = self.buffer.get().data() -return self.size - -def __getwritebuffer__(self, Py_ssize_t idx, void **p): -if not self.buffer.get().is_mutable(): -raise SystemError("trying to write an immutable buffer") -if idx != 0: -raise SystemError("accessing nonexistent buffer segment") -if p != NULL: -p[0] = self.buffer.get().data() -return self.size - cdef class ResizableBuffer(Buffer): """ @@ -2142,21 +2121,21 @@ cdef class 
CacheOptions(_Weakrefable): Parameters -- hole_size_limit : int, default 8KiB -The maximum distance in bytes between two consecutive ranges; beyond +The maximum distance in bytes between two consecutive ranges; beyond this value, ranges are not combined. range_size_limit : int, default 32MiB -The maximum size in bytes of a combined range; if combining two -consecutive ranges would produce a range of a size greater than this, +The maximum size in bytes of a combined range; if combining two +consecutive ranges would produce a range of a size greater than this, they are not combined lazy : bool, default True lazy = false: request all byte ranges when PreBuffer or WillNeed is called. -lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader -needs them. -lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the +lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader +needs them. +lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the range that is currently being read. prefetch_limit : int, default 0 -The maximum number of ranges to be prefetched. This is only used for -lazy cache to asynchronously read some ranges after reading the target +The maximum number of ranges to be prefetched. This is only used for +lazy cache to asynchronously read some ranges after reading the target range. """ @@ -2227,19 +2206,19 @@ cdef class CacheOptions(_Weakrefable): """ Create suitable CacheOptions based on provided network metrics. -Typically this will be used with object storage solutions like Amazon S3, +Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. Parameters -- time_to_first_byte_millis : int -Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call -setup latency of a new read request. The value is a positive integer. 
+Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call +setup latency of a new read request. The value is a positive integer. transfer_bandwidth_mib_per_sec : int -Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is
(arrow) branch main updated: GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 250291500b GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461) 250291500b is described below commit 250291500b6a7d5d934901acef708cef2eb1dc08 Author: Rossi Sun AuthorDate: Wed May 1 14:39:35 2024 +0800 GH-41463: [C++] Skip TestConcurrentFillFromScalar for platforms without threading support (#41461) ### Rationale for this change See #41463 and https://github.com/apache/arrow/pull/40237#issuecomment-2084577090 ### What changes are included in this PR? Skip test for platforms that have no threading support. ### Are these changes tested? Change is test. ### Are there any user-facing changes? None. * GitHub Issue: #41463 Authored-by: Ruoxi Sun Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index af64908b59..7e25ad61fa 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -827,6 +827,9 @@ TEST_F(TestArray, TestFillFromScalar) { // GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same // scalar instance. TEST_F(TestArray, TestConcurrentFillFromScalar) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); for (auto seed : {0u, 0xdeadbeef, 42u}) {
(arrow) branch main updated: GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b609de374c GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459) b609de374c is described below commit b609de374c7c00e1537eb8092e1ff2db718d2b61 Author: Joris Van den Bossche AuthorDate: Tue Apr 30 13:42:31 2024 +0200 GH-40342: [Python] Fix pickling of LocalFileSystem for cython 2 (#41459) Small follow-up fix for the failure introduced by https://github.com/apache/arrow/pull/40356 * GitHub Issue: #40342 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0e635b2c8a..dbfb6ed114 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -18,6 +18,7 @@ # cython: language_level = 3 from cpython.datetime cimport datetime, PyDateTime_DateTime +from cython cimport binding from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint @@ -421,6 +422,7 @@ cdef class FileSystem(_Weakrefable): "SubTreeFileSystem") @staticmethod +@binding(True) # Required for cython < 3 def _from_uri(uri): fs, _path = FileSystem.from_uri(uri) return fs
(arrow-nanoarrow) branch main updated: feat(python): add back nanoarrow.array(..) constructor (#441)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new c677d4d3 feat(python): add back nanoarrow.array(..) constructor (#441) c677d4d3 is described below commit c677d4d396e75d362a626db6c56207ef4ee4befa Author: Joris Van den Bossche AuthorDate: Tue Apr 23 21:15:49 2024 +0200 feat(python): add back nanoarrow.array(..) constructor (#441) Closes https://github.com/apache/arrow-nanoarrow/issues/434 - Co-authored-by: Dewey Dunnington --- python/src/nanoarrow/__init__.py | 3 ++- python/src/nanoarrow/array.py| 40 +++- python/tests/test_array.py | 5 + 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py index 5f99dc22..1e220932 100644 --- a/python/src/nanoarrow/__init__.py +++ b/python/src/nanoarrow/__init__.py @@ -73,7 +73,7 @@ from nanoarrow.schema import ( decimal256, struct, ) -from nanoarrow.array import Array +from nanoarrow.array import array, Array from nanoarrow._version import __version__ # noqa: F401 # Helps Sphinx automatically populate an API reference section @@ -125,4 +125,5 @@ __all__ = [ "uint64", "uint8", "Array", +"array", ] diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py index e38dc9c0..d3730e07 100644 --- a/python/src/nanoarrow/array.py +++ b/python/src/nanoarrow/array.py @@ -97,7 +97,7 @@ class Array: The Array is nanoarrow's high-level in-memory array representation whose scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data -interface. See :func:`array` for class details. +interface. 
The :class:`Array` class is nanoarrow's high-level in-memory array representation, encompassing the role of PyArrow's ``Array``, @@ -498,3 +498,41 @@ class Array: """ self._assert_one_chunk("inspect") print(_repr_utils.array_inspect(c_array(self))) + + +def array(obj, schema=None, device=None) -> Array: +""" +Create a nanoarrow.Array from array-like input. + +The :class:`Array` class is nanoarrow's high-level in-memory array +representation whose scope maps to that of a fully-consumed +ArrowArrayStream in the Arrow C Data interface. Note that an +:class:`Array` is not necessarily contiguous in memory (i.e., +it may consist of zero or more ``ArrowArray``s). +See :class:`Array` for class details. + +Parameters +-- +obj : array or array stream-like +An array-like or array stream-like object. This can be any object +supporting the Arrow PyCapsule interface, the Python buffer +protocol, or an iterable of Python objects. +schema : schema-like, optional +An optional schema. This can be a Schema object, or object +implementing the Arrow PyCapsule interface for schemas +(i.e. having the ``__arrow_c_schema__`` protocol method). +device : Device, optional +The device associated with the buffers held by this Array. +Defaults to the CPU device. + +Examples + + +>>> import nanoarrow as na +>>> na.array([1, 2, 3], na.int32()) +nanoarrow.Array[3] +1 +2 +3 +""" +return Array(obj, schema=schema, device=device) diff --git a/python/tests/test_array.py b/python/tests/test_array.py index 553a6350..f99b38f5 100644 --- a/python/tests/test_array.py +++ b/python/tests/test_array.py @@ -38,6 +38,11 @@ def test_array_construct(): iter(array) +def test_array_constructor(): +array = na.array([1, 2, 3], na.int32()) +assert array.schema.type == na.Type.INT32 + + def test_array_empty(): array = na.Array([], na.int32()) assert array.schema.type == na.Type.INT32
(arrow-nanoarrow) branch main updated: feat(python): function to inspect a single-chunk Array (#436)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git The following commit(s) were added to refs/heads/main by this push: new 821b580a feat(python): function to inspect a single-chunk Array (#436) 821b580a is described below commit 821b580a2fd964b1e4536bbaed927e208dcff6cc Author: Joris Van den Bossche AuthorDate: Mon Apr 22 20:02:29 2024 +0200 feat(python): function to inspect a single-chunk Array (#436) --- python/src/nanoarrow/_repr_utils.py | 43 + python/src/nanoarrow/array.py | 8 +++ python/tests/test_array.py | 19 3 files changed, 70 insertions(+) diff --git a/python/src/nanoarrow/_repr_utils.py b/python/src/nanoarrow/_repr_utils.py index 3209a341..bd090af5 100644 --- a/python/src/nanoarrow/_repr_utils.py +++ b/python/src/nanoarrow/_repr_utils.py @@ -248,3 +248,46 @@ def device_repr(device): device_type = f"- device_type: {device.device_type.name} <{device.device_type_id}>" device_id = f"- device_id: {device.device_id}" return "\n".join([title_line, device_type, device_id]) + + +def array_inspect(array, indent=0, max_char_width=80): +array_view = array.view() + +if max_char_width < 20: +max_char_width = 20 + +indent_str = " " * indent +class_label = "ArrowArray" +if array._addr() == 0: +return f"<{class_label} >" +elif not array.is_valid(): +return f"<{class_label} >" + +schema_string = array.schema._to_string( +max_chars=max_char_width - indent - 23, recursive=True +) +lines = [f"<{class_label} {schema_string}>"] +for attr in ("length", "offset", "null_count"): +attr_repr = repr(getattr(array, attr)) +lines.append(f"{indent_str}- {attr}: {attr_repr}") + +lines.append(f"{indent_str}- buffers[{array_view.n_buffers}]:") +for i, buffer in enumerate(array_view.buffers): +buffer_type = array_view.buffer_type(i) +lines.append( +f"{indent_str} - {buffer_type} " +f"<{buffer_view_repr(buffer, max_char_width - indent - 4 - len(buffer))}>" 
+) + +if array.dictionary: +dictionary_repr = array_inspect(array.dictionary, indent=indent + 2) +lines.append(f"{indent_str}- dictionary: {dictionary_repr}") +else: +lines.append(f"{indent_str}- dictionary: NULL") + +lines.append(f"{indent_str}- children[{array.n_children}]:") +for child in array.children: +child_repr = array_inspect(child, indent=indent + 4) +lines.append(f"{indent_str} {repr(child.schema.name)}: {child_repr}") + +return "\n".join(lines) diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py index af2e3cd4..e38dc9c0 100644 --- a/python/src/nanoarrow/array.py +++ b/python/src/nanoarrow/array.py @@ -490,3 +490,11 @@ class Array: def __repr__(self) -> str: return self.to_string() + +def inspect(self): +""" +Print the details of the array (type, length, offset, buffers, +and children arrays). +""" +self._assert_one_chunk("inspect") +print(_repr_utils.array_inspect(c_array(self))) diff --git a/python/tests/test_array.py b/python/tests/test_array.py index ee88d20d..553a6350 100644 --- a/python/tests/test_array.py +++ b/python/tests/test_array.py @@ -280,3 +280,22 @@ def test_array_repr_long(): assert len(repr_lines) == 2 assert repr_lines[1].endswith("...") assert len(repr_lines[1]) == 80 + + +def test_array_inspect(capsys): +array = na.Array(range(10), na.int32()) +array.inspect() +captured = capsys.readouterr() +assert captured.out.startswith("") + +# with children +c_array = na.c_array_from_buffers( +na.struct({f"col{i}": na.int32() for i in range(100)}), +length=1, +buffers=[None], +children=[na.c_array([123456], na.int32())] * 100, +) +array = na.Array(c_array) +array.inspect() +captured = capsys.readouterr() +assert captured.out.startswith("
(arrow-nanoarrow) branch ci-upload-nightly-wheels deleted (was fa35ec5d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git was fa35ec5d Merge remote-tracking branch 'upstream/main' into ci-upload-nightly-wheels The revisions that were on this branch are still contained in other references; therefore, this change does not discard any commits from the repository.
(arrow-nanoarrow) branch main updated (b921dae1 -> db6630b7)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git from b921dae1 Update dist/ for commit 626e219dac259ebf4109c8a4188ddbc6ce93cd4a add db6630b7 ci(python): upload nightly python packages (#429) No new revisions were added by this update. Summary of changes: .github/workflows/python-wheels.yaml | 38 ++-- 1 file changed, 36 insertions(+), 2 deletions(-)
(arrow-nanoarrow) branch ci-upload-nightly-wheels updated (a830d78e -> fa35ec5d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git from a830d78e fix syntax add 8e8e38d3 chore(python): Restructure buffer packing to support nulls and improve performance (#426) add 3a78aa45 fix: Relax comparison strictness such that integration tests pass (#399) add b5d2742e fix: Ensure negative return values from snprintf() are not used as indexes (#418) add 917e8e7d Update dist/ for commit b5d2742e2d0aee71c2ca5a277169e53c335f6c43 add 09481518 feat(python): Create string/binary arrays from iterables (#430) add fa35ec5d Merge remote-tracking branch 'upstream/main' into ci-upload-nightly-wheels No new revisions were added by this update. Summary of changes: dist/nanoarrow.c | 68 +++-- dist/nanoarrow_ipc.c | 30 +++ dist/nanoarrow_testing.hpp | 174 +++-- .../src/nanoarrow/nanoarrow_ipc_decoder.c | 30 +++ python/bootstrap.py| 2 +- python/src/nanoarrow/_lib.pyx | 266 +++- python/src/nanoarrow/c_lib.py | 83 ++- python/tests/test_c_array.py | 113 - python/tests/test_c_buffer.py | 34 ++- python/tests/test_iterator.py | 37 +-- src/nanoarrow/array.c | 26 +- src/nanoarrow/array_test.cc| 35 +++ src/nanoarrow/integration/c_data_integration.cc| 10 + src/nanoarrow/nanoarrow_testing.hpp| 174 +++-- src/nanoarrow/nanoarrow_testing_test.cc| 274 - src/nanoarrow/schema.c | 35 ++- src/nanoarrow/utils.c | 7 + 17 files changed, 1253 insertions(+), 145 deletions(-)
(arrow) branch main updated: GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new eb47fd653f GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897) eb47fd653f is described below commit eb47fd653fbbe03efc18daf5488369cb87752f96 Author: Joris Van den Bossche AuthorDate: Tue Apr 16 09:59:51 2024 +0200 GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897) ### Rationale for this change Avoiding using pandas internals to create Block objects ourselves, using a new API for pandas>=3 * GitHub Issue: #35081 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas-shim.pxi | 7 +++- python/pyarrow/pandas_compat.py | 75 +++-- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 0409e133ad..74f0d981b5 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 -bint _is_v1, _is_ge_v21 +bint _is_v1, _is_ge_v21, _is_ge_v3 def __init__(self): self._lock = Lock() @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') +self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v21 +def is_ge_v3(self): +self._check_import() +return self._is_ge_v3 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5bd0dfcf6b..00fa19604e 
100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_): # Converting pyarrow.Table efficiently to pandas.DataFrame -def _reconstruct_block(item, columns=None, extension_columns=None): +def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True): """ Construct a pandas Block from the `item` dictionary coming from pyarrow's serialization or returned by arrow::python::ConvertTableToPandas. @@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None): block_arr = item.get('block', None) placement = item['placement'] if 'dictionary' in item: -cat = _pandas_api.categorical_type.from_codes( +arr = _pandas_api.categorical_type.from_codes( block_arr, categories=item['dictionary'], ordered=item['ordered']) -block = _int.make_block(cat, placement=placement) elif 'timezone' in item: unit, _ = np.datetime_data(block_arr.dtype) dtype = make_datetimetz(unit, item['timezone']) if _pandas_api.is_ge_v21(): -pd_arr = _pandas_api.pd.array( +arr = _pandas_api.pd.array( block_arr.view("int64"), dtype=dtype, copy=False ) -block = _int.make_block(pd_arr, placement=placement) else: -block = _int.make_block(block_arr, placement=placement, -klass=_int.DatetimeTZBlock, -dtype=dtype) +arr = block_arr +if return_block: +block = _int.make_block(block_arr, placement=placement, +klass=_int.DatetimeTZBlock, +dtype=dtype) +return block elif 'py_array' in item: # create ExtensionBlock arr = item['py_array'] @@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None): if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") -pd_ext_arr = pandas_dtype.__from_arrow__(arr) -block = _int.make_block(pd_ext_arr, placement=placement) +arr = pandas_dtype.__from_arrow__(arr) else: -block = _int.make_block(block_arr, placement=placement) +arr = 
block_arr -return block +if return_block: +return _int.make_block(arr, placement=placement) +else: +return arr, place
(arrow-nanoarrow) branch ci-upload-nightly-wheels created (now a830d78e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch ci-upload-nightly-wheels in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git at a830d78e fix syntax No new revisions were added by this update.
(arrow) branch main updated: GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b842b530d1 GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818) b842b530d1 is described below commit b842b530d14a752697f4283c33f16f2f293713ff Author: Joris Van den Bossche AuthorDate: Mon Apr 15 10:22:15 2024 +0200 GH-38010: [Python] Construct pyarrow.Field and ChunkedArray through Arrow PyCapsule Protocol (#40818) ### Rationale for this change See https://github.com/apache/arrow/issues/38010#issuecomment-2010601912 for more context. Right now for _consuming_ ArrowSchema-compatible objects that implement the PyCapsule interface, we only have the private `_import_from_c_capsule` (on Schema, Field, DataType) and we check for the protocol in the public `pa.schema(..)`. But that means you currently can only consume objects that represent the schema of a batch (struct type), and not schemas of individual arrays. ### What changes are included in this PR? Expand the `pa.field(..)` constructor to accept objects implementing the protocol method. ### Are these changes tested? 
TODO * GitHub Issue: #38010 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/extending_types.rst | 29 ++-- python/pyarrow/table.pxi | 37 ++ python/pyarrow/tests/test_array.py | 2 +- python/pyarrow/tests/test_cffi.py | 12 -- python/pyarrow/tests/test_table.py | 41 ++ python/pyarrow/tests/test_types.py | 22 ++ python/pyarrow/types.pxi | 18 ++- 7 files changed, 151 insertions(+), 10 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index b7261005e6..8df0ef0b1f 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -37,14 +37,14 @@ under the hood, you can implement the following methods on those objects: - ``__arrow_c_schema__`` for schema or type-like objects. - ``__arrow_c_array__`` for arrays and record batches (contiguous tables). -- ``__arrow_c_stream__`` for chunked tables or streams of data. +- ``__arrow_c_stream__`` for chunked arrays, tables and streams of data. Those methods return `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`__ objects, and more details on the exact semantics can be found in the :ref:`specification `. When your data structures have those methods defined, the PyArrow constructors -(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +(see below) will recognize those objects as supporting this protocol, and convert them to PyArrow data structures zero-copy. And the same can be true for any other library supporting this protocol on ingesting data. @@ -53,6 +53,31 @@ support for this protocol by checking for the presence of those methods, and therefore accept any Arrow data (instead of hardcoding support for a specific Arrow producer such as PyArrow). 
+For consuming data through this protocol with PyArrow, the following constructors +can be used to create the various PyArrow objects: + +++---++ +| Result class | PyArrow constructor | Supported protocol | +++===++ +| :class:`Array` | :func:`pyarrow.array` | array | +++---++ +| :class:`ChunkedArray` | :func:`pyarrow.chunked_array` | array, stream | +++---++ +| :class:`RecordBatch` | :func:`pyarrow.record_batch` | array | +++---++ +| :class:`Table` | :func:`pyarrow.table` | array, stream | +++---++ +| :class:`RecordBatchReader` | :meth:`pyarrow.RecordBatchReader.from_stream` | stream | +++---++ +| :class:`Field` | :func:`pyarrow.field` |
(arrow) branch main updated (6e1b62509b -> 831b94a65e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 6e1b62509b GH-40801: [Docs] Clarify device identifier documentation in the Arrow C Device data interface (#41101) add 831b94a65e GH-40866: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add support for row-major (#40867) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc| 79 ++--- cpp/src/arrow/record_batch.h | 4 +- cpp/src/arrow/record_batch_test.cc | 212 ++- python/pyarrow/includes/libarrow.pxd | 3 +- python/pyarrow/table.pxi | 36 -- python/pyarrow/tests/test_table.py | 74 +++- 6 files changed, 326 insertions(+), 82 deletions(-)
(arrow) branch main updated (aeb1618a30 -> 75a100a113)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from aeb1618a30 GH-41020: [C++] Introduce portable compiler assumptions (#41021) add 75a100a113 GH-38768: [Python] Empty slicing an array backwards beyond the start is now empty (#40682) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 29 + python/pyarrow/tests/test_array.py | 1 + 2 files changed, 2 insertions(+), 28 deletions(-)
(arrow) branch main updated: GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 96f686b81b GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) 96f686b81b is described below commit 96f686b81ba148f4d434846f0b9e161c538f131d Author: Alenka Frim AuthorDate: Fri Mar 29 08:30:03 2024 +0100 GH-40061: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add option to cast NULL to NaN (#40803) ### Rationale for this change The conversion from `RecordBatch` to `Tensor` class exists but it doesn't support record batches with validity bitmaps. This PR adds support for an option to convert null values to NaN. ### What changes are included in this PR? This PR adds a `null_to_nan` option in `RecordBatch::ToTensor` so that null values are converted to NaN in the resulting `Tensor`. This for example works: ```python >>> import pyarrow as pa >>> batch = pa.record_batch( ... [ ... pa.array([1, 2, 3, 4, None], type=pa.int32()), ... pa.array([10, 20, 30, 40, None], type=pa.float32()), ... ], names = ["a", "b"] ... 
) >>> batch pyarrow.RecordBatch a: int32 b: float a: [1,2,3,4,null] b: [10,20,30,40,null] >>> batch.to_tensor(null_to_nan=True) type: double shape: (5, 2) strides: (8, 40) >>> batch.to_tensor(null_to_nan=True).to_numpy() array([[ 1., 10.], [ 2., 20.], [ 3., 30.], [ 4., 40.], [nan, nan]]) ``` but default would raise: ```python >>> batch.to_tensor() Traceback (most recent call last): File "", line 1, in File "pyarrow/table.pxi", line 3421, in pyarrow.lib.RecordBatch.to_tensor a: int32 File "pyarrow/error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status return check_status(status) File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status raise convert_status(status) pyarrow.lib.ArrowTypeError: Can only convert a RecordBatch with no nulls. Set null_to_nan to true to convert nulls to nan ``` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40061 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc| 47 -- cpp/src/arrow/record_batch.h | 6 ++- cpp/src/arrow/record_batch_test.cc | 76 +++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/table.pxi | 49 +-- python/pyarrow/tests/test_table.py | 48 ++- 6 files changed, 208 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 0d8bda9b66..6f3b8e75a2 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,6 +18,7 @@ #include "arrow/record_batch.h" #include +#include #include #include #include @@ -261,12 +262,19 @@ struct ConvertColumnsToTensorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - if constexpr (std::is_same_v) { -memcpy(out_values, in_values.data(), in_values.size_bytes()); -out_values += in_values.size(); + if (in_data.null_count == 0) { +if constexpr (std::is_same_v) { + memcpy(out_values, 
in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); +} else { + for (In in_value : in_values) { +*out_values++ = static_cast(in_value); + } +} } else { -for (In in_value : in_values) { - *out_values++ = static_cast(in_value); +for (int64_t i = 0; i < in_data.length; ++i) { + *out_values++ = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); } } return Status::OK(); @@ -286,16 +294,20 @@ inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out) { } } -Result> RecordBatch::ToTensor(MemoryPool* pool) const { +Result> RecordBatch::ToTensor(bool null_to_nan, + MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for RecordBatches without columns/schema is not " "suppo
(arrow) branch main updated: GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ed8c3630db GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ed8c3630db is described below commit ed8c3630dbe2261bed9123a4ccfc7df0e3f031bd Author: Alenka Frim AuthorDate: Fri Mar 29 08:29:28 2024 +0100 GH-40841: [Docs][C++][Python] Add initial documentation for RecordBatch::Tensor conversion (#40842) ### Rationale for this change The work on the conversion from `Table`/`RecordBatch` to `Tensor` is progressing and we have to make sure to add information to the documentation. ### What changes are included in this PR? I propose to add - new page (`converting_recordbatch_to_tensor.rst`) in the `cpp/examples` section, - added section (Conversion of RecordBatch do Tensor) in the `docs/source/python/data.rst` the content above would be updated as the features are added in the future (row-major conversion, `Table::ToTensor`, DLPack support for `Tensor` class, etc.) ### Are these changes tested? It will be tested with the crossbow preview-docs job. ### Are there any user-facing changes? No, just documentation. * GitHub Issue: #40841 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- .../examples/converting_recordbatch_to_tensor.rst | 46 +++ docs/source/cpp/examples/index.rst | 1 + docs/source/python/data.rst| 52 ++ 3 files changed, 99 insertions(+) diff --git a/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst new file mode 100644 index 00..2be27096cf --- /dev/null +++ b/docs/source/cpp/examples/converting_recordbatch_to_tensor.rst @@ -0,0 +1,46 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. 
or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Conversion of ``RecordBatch`` to ``Tensor`` instances += + +Arrow provides a method to convert ``RecordBatch`` objects to a ``Tensor`` +with two dimensions: + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + ASSERT_OK(tensor->Validate()); + +The conversion supports signed and unsigned integer types plus float types. +In case the ``RecordBatch`` has null values the conversion succeeds if +``null_to_nan`` parameter is set to ``true``. In this case all +types will be promoted to a floating-point data type. + +.. code:: + + std::shared_ptr batch; + + ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor->Validate()); + +Currently only column-major conversion is supported. 
diff --git a/docs/source/cpp/examples/index.rst b/docs/source/cpp/examples/index.rst index b886a0d29e..90b00bbdf6 100644 --- a/docs/source/cpp/examples/index.rst +++ b/docs/source/cpp/examples/index.rst @@ -27,3 +27,4 @@ Examples dataset_skyhook_scan_example row_columnar_conversion std::tuple-like ranges to Arrow + Converting RecordBatch to Tensor diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 2cc33561d4..9156157fcd 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -560,3 +560,55 @@ schema without having to get any of the batches.:: x: int64 It can also be sent between languages using the :ref:`C stream interface `. + +Conversion of RecordBatch do Tensor +--- + +Each array of the ``RecordBatch`` has it's own contiguous memory that is not necessarily +adjacent to other arrays. A different memory structure that is used in machine learning +libraries is a two dimensional array (also called a 2-dim tensor or a matrix) which takes +only one contiguous block of memory. + +For
(arrow) branch main updated (aae2557e30 -> a407a6b45e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from aae2557e30 GH-39377: [C++] IO: Reuse same buffer in CompressedInputStream (#39807) add a407a6b45e GH-40698: [C++] Create registry for Devices to map DeviceType to MemoryManager in C Device Data import (#40699) No new revisions were added by this update. Summary of changes: cpp/src/arrow/buffer_test.cc | 13 + cpp/src/arrow/c/bridge.cc| 11 --- cpp/src/arrow/c/bridge.h | 12 cpp/src/arrow/device.cc | 63 cpp/src/arrow/device.h | 28 ++ cpp/src/arrow/gpu/cuda_memory.cc | 19 cpp/src/arrow/gpu/cuda_memory.h | 4 ++- cpp/src/arrow/gpu/cuda_test.cc | 15 ++ 8 files changed, 139 insertions(+), 26 deletions(-)
(arrow) branch main updated (32437a5aeb -> 434f87274e)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 32437a5aeb GH-40205: [Python] ListView arrow-to-pandas conversion (#40482) add 434f87274e GH-40060: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor - add support for different data types (#40359) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc | 91 -- cpp/src/arrow/record_batch_test.cc | 128 + python/pyarrow/table.pxi | 3 + python/pyarrow/tests/test_table.py | 97 +++- 4 files changed, 268 insertions(+), 51 deletions(-)
(arrow) branch main updated (dbff1f4a3e -> 32437a5aeb)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dbff1f4a3e GH-36026: [C++][ORC] Catch all ORC exceptions to avoid crash (#40697) add 32437a5aeb GH-40205: [Python] ListView arrow-to-pandas conversion (#40482) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 44 +--- python/pyarrow/tests/test_pandas.py| 82 ++ 2 files changed, 115 insertions(+), 11 deletions(-)
(arrow) branch main updated (5e1a4fd8a4 -> 7d4d744794)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 5e1a4fd8a4 GH-40767: [C++][Parquet] Simplify PageWriter and ColumnWriter creation (#40768) add 7d4d744794 GH-40720: [Python] Simplify and improve perf of creation of the column names in Table.to_pandas (#40721) No new revisions were added by this update. Summary of changes: python/pyarrow/pandas_compat.py | 67 +++-- 1 file changed, 17 insertions(+), 50 deletions(-)
(arrow) branch main updated: GH-40357: [C++] Add benchmark for ToTensor conversions (#40358)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new fc87fd75d6 GH-40357: [C++] Add benchmark for ToTensor conversions (#40358) fc87fd75d6 is described below commit fc87fd75d6602562e64abf8744890332e35f979e Author: Alenka Frim AuthorDate: Tue Mar 26 08:59:50 2024 +0100 GH-40357: [C++] Add benchmark for ToTensor conversions (#40358) ### Rationale for this change We should add benchmarks to be sure not to cause regressions while working on additional implementations of `RecordBatch::ToTensor` and `Table::ToTensor`. ### What changes are included in this PR? New `cpp/src/arrow/to_tensor_benchmark.cc file`. * GitHub Issue: #40357 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/tensor_benchmark.cc | 68 +++ 2 files changed, 69 insertions(+) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 3d1b621db0..4bf1008af4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1175,6 +1175,7 @@ add_arrow_benchmark(builder_benchmark) add_arrow_benchmark(compare_benchmark) add_arrow_benchmark(memory_pool_benchmark) add_arrow_benchmark(type_benchmark) +add_arrow_benchmark(tensor_benchmark) # # Recurse into sub-directories diff --git a/cpp/src/arrow/tensor_benchmark.cc b/cpp/src/arrow/tensor_benchmark.cc new file mode 100644 index 00..91a9270ef3 --- /dev/null +++ b/cpp/src/arrow/tensor_benchmark.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/benchmark_util.h" + +namespace arrow { + +template +static void BatchToTensorSimple(benchmark::State& state) { + using CType = typename ValueType::c_type; + std::shared_ptr ty = TypeTraits::type_singleton(); + + const int64_t num_cols = state.range(1); + const int64_t num_rows = state.range(0) / num_cols / sizeof(CType); + arrow::random::RandomArrayGenerator gen_{42}; + + std::vector> fields = {}; + std::vector> columns = {}; + + for (int64_t i = 0; i < num_cols; ++i) { +fields.push_back(field("f" + std::to_string(i), ty)); +columns.push_back(gen_.ArrayOf(ty, num_rows)); + } + auto schema = std::make_shared(std::move(fields)); + auto batch = RecordBatch::Make(schema, num_rows, columns); + + for (auto _ : state) { +ASSERT_OK_AND_ASSIGN(auto tensor, batch->ToTensor()); + } + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); + state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); +} + +void SetArgs(benchmark::internal::Benchmark* bench) { + for (int64_t size : {kL1Size, kL2Size}) { +for (int64_t num_columns : {3, 30, 300}) { + bench->Args({size, num_columns}); + bench->ArgNames({"size", "num_columns"}); +} + } +} + 
+BENCHMARK_TEMPLATE(BatchToTensorSimple, Int8Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int16Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int32Type)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BatchToTensorSimple, Int64Type)->Apply(SetArgs); + +} // namespace arrow
(arrow) branch main updated (dada4e1aad -> cc9d52ca1f)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dada4e1aad GH-40659: [Python][C++] Support conversion of pyarrow.RunEndEncodedArray to numpy/pandas (#40661) add cc9d52ca1f GH-36399: [Python] Add missing `shape` property to `RecordBatch` (#40643) No new revisions were added by this update. Summary of changes: python/pyarrow/table.pxi | 42 - python/pyarrow/tests/test_table.py | 189 + 2 files changed, 129 insertions(+), 102 deletions(-)
(arrow) branch main updated (54c4cedd45 -> dada4e1aad)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 54c4cedd45 GH-40328: [C++][Parquet] Allow use of FileDecryptionProperties after the CryptoFactory is destroyed (#40329) add dada4e1aad GH-40659: [Python][C++] Support conversion of pyarrow.RunEndEncodedArray to numpy/pandas (#40661) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 25 ++ python/pyarrow/tests/test_array.py | 22 +++ 2 files changed, 47 insertions(+)
(arrow) branch main updated: GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new e52017a727 GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179) e52017a727 is described below commit e52017a72735d502c3ac3323d9d1fc61a15a6ae0 Author: Alenka Frim AuthorDate: Wed Mar 20 08:59:14 2024 +0100 GH-37328: [Python] Add a function to download and extract timezone database on Windows (#38179) ### Rationale for this change There is a section in the [Arrow C++ documentation with the instructions](https://arrow.apache.org/docs/dev/cpp/build_system.html#runtime-dependencies) on how to download and extract text version of the IANA timezone database and on Windows. We should provide a function in PyArrow that a user would call to download and extract the timezone database from Python. ### What changes are included in this PR? Function `download_tzdata_on_windows()` added to python/pyarrow/util.py that downloads and extracts timezone database to a standard location in `%USERPROFILE%\Downloads\tzdata` on Widnows. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #37328 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/install.rst| 3 ++- python/pyarrow/tests/test_util.py | 22 +- python/pyarrow/util.py| 28 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index 4555977ece..4b966e6d26 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -82,7 +82,8 @@ tzdata on Windows While Arrow uses the OS-provided timezone database on Linux and macOS, it requires a user-provided database on Windows. 
To download and extract the text version of the IANA timezone database follow the instructions in the C++ -:ref:`download-timezone-database`. +:ref:`download-timezone-database` or use pyarrow utility function +`pyarrow.util.download_tzdata_on_windows()` that does the same. By default, the timezone database will be detected at ``%USERPROFILE%\Downloads\tzdata``. If the database has been downloaded in a different location, you will need to set diff --git a/python/pyarrow/tests/test_util.py b/python/pyarrow/tests/test_util.py index 9fccb76112..e584b04111 100644 --- a/python/pyarrow/tests/test_util.py +++ b/python/pyarrow/tests/test_util.py @@ -16,14 +16,17 @@ # under the License. import gc +import os import signal +import shutil import sys import textwrap import weakref import pytest -from pyarrow.util import doc, _break_traceback_cycle_from_frame +from pyarrow.util import (doc, _break_traceback_cycle_from_frame, + download_tzdata_on_windows) from pyarrow.tests.util import disabled_gc @@ -207,3 +210,20 @@ def test_signal_refcycle(): assert wr() is not None _break_traceback_cycle_from_frame(sys._getframe(0)) assert wr() is None + + +@pytest.mark.skipif(sys.platform != "win32", +reason="Timezone database is already provided.") +def test_download_tzdata_on_windows(): +tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata") + +# Download timezone database and remove data in case it already exists +if (os.path.exists(tzdata_path)): +shutil.rmtree(tzdata_path) +download_tzdata_on_windows() + +# Inspect the folder +assert os.path.exists(tzdata_path) +assert os.path.exists(os.path.join(tzdata_path, "windowsZones.xml")) +assert os.path.exists(os.path.join(tzdata_path, "europe")) +assert 'version' in os.listdir(tzdata_path) diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index bb693cd663..89780da10f 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -228,3 +228,31 @@ def _break_traceback_cycle_from_frame(frame): # us visit the outer 
frame). refs = gc.get_referrers(frame) refs = frame = this_frame = None + + +def download_tzdata_on_windows(): +r""" +Download and extract latest IANA timezone database into the +location expected by Arrow which is %USERPROFILE%\Downloads\tzdata. +""" +if sys.platform != 'win32': +raise TypeError(f"Timezone database is already provided by {sys.platform}") + +import tarfile + +tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata") +tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz") +os.makedirs(tzdata_path, exist
(arrow) branch main updated: GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new dd3d3cd1be GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341) dd3d3cd1be is described below commit dd3d3cd1be27da7c872bfced553f25b8a0240021 Author: Alenka Frim AuthorDate: Wed Mar 20 08:44:08 2024 +0100 GH-40273: [Python] Support construction of Run-End Encoded arrays in pa.array(..) (#40341) ### Rationale for this change We want to enable the construction of a Run-End Encoded arrays with `pyarrow.array `constructor ### What changes are included in this PR? Added a check for Run-End Encoded Type in the `pyarrow.array` constructor code. ### Are these changes tested? Yes, added test_run_end_encoded_from_array_with_type. ### Are there any user-facing changes? No. * GitHub Issue: #40273 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 18 +++--- python/pyarrow/tests/test_array.py | 39 ++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index def4c5e9ba..59d2e91ef6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -336,11 +336,23 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if pandas_api.have_pandas: values, type = pandas_api.compat.get_datetimetz_type( values, obj.dtype, type) -result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, - pool) +if type and type.id == _Type_RUN_END_ENCODED: +arr = _ndarray_to_array( +values, mask, type.value_type, c_from_pandas, safe, pool) +result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) +else: +result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, + pool) else: +if type and type.id == 
_Type_RUN_END_ENCODED: +arr = _sequence_to_array( +obj, mask, size, type.value_type, pool, from_pandas) +result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, + memory_pool=memory_pool) # ConvertPySequence does strict conversion if type is explicitly passed -result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) +else: +result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a8cd20720e..999c1af453 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3580,6 +3580,45 @@ def test_run_end_encoded_from_buffers(): 1, offset, children) +def test_run_end_encoded_from_array_with_type(): +run_ends = [1, 3, 6] +values = [1, 2, 3] +ree_type = pa.run_end_encoded(pa.int32(), pa.int64()) +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +arr = [1, 2, 2, 3, 3, 3] +result = pa.array(arr, type=ree_type) +assert result.equals(expected) +result = pa.array(np.array(arr), type=ree_type) +assert result.equals(expected) + +ree_type_2 = pa.run_end_encoded(pa.int16(), pa.float32()) +result = pa.array(arr, type=ree_type_2) +assert not result.equals(expected) +expected_2 = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type_2) +assert result.equals(expected_2) + +run_ends = [1, 3, 5, 6] +values = [1, 2, 3, None] +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +arr = [1, 2, 2, 3, 3, None] +result = pa.array(arr, type=ree_type) +assert result.equals(expected) + +run_ends = [1, 3, 4, 5, 6] +values = [1, 2, None, 3, None] +expected = pa.RunEndEncodedArray.from_arrays(run_ends, values, + ree_type) + +mask = pa.array([False, False, False, True, False, True]) +result = pa.array(arr, type=ree_type, mask=mask) +assert result.equals(expected) + + 
@pytest.mark.parametrize(('list_array_type', 'list_type_factory
(arrow) branch main updated: GH-39958: [Python][CI] Remove upper pin on pytest (#40487)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6d5cfb2b2f GH-39958: [Python][CI] Remove upper pin on pytest (#40487) 6d5cfb2b2f is described below commit 6d5cfb2b2fc3a2cbb0bb7ecc9aff24e2834ade66 Author: Alenka Frim AuthorDate: Tue Mar 19 15:13:16 2024 +0100 GH-39958: [Python][CI] Remove upper pin on pytest (#40487) ### Rationale for this change The latest version of pytest (`8.0.0`) is breaking our CI: - S3 fixture from out test suite fails - `doctest-cython` check fails ### What changes are included in this PR? - added `allow_bucket_creation=True` to the `s3_example_fs` fixture - removed the pin on pytest, except for the doc builds ### Are these changes tested? Yes. ### Are there any user-facing changes? No Closes: - Closes https://github.com/apache/arrow/issues/39958 - Closes https://github.com/apache/arrow/issues/39957 * GitHub Issue: #39958 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_python.txt | 2 +- docker-compose.yml | 10 ++ python/pyarrow/tests/parquet/conftest.py | 1 + python/requirements-test.txt | 2 +- python/requirements-wheel-test.txt | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 19e94d7d3e..4366e30010 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,7 +23,7 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 +pytest pytest-faulthandler s3fs>=2023.10.0 setuptools diff --git a/docker-compose.yml b/docker-compose.yml index eb434b9062..9b0610fe55 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1208,15 +1208,17 @@ services: LANG: "C.UTF-8" BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" - # GH-31506/GH-33609: Remove --disable-warnings once - # 
https://github.com/lgpage/pytest-cython/issues/24 is resolved - # and a new version that includes the fix is released. - PYTEST_ARGS: "--doctest-modules --doctest-cython --disable-warnings" + PYTEST_ARGS: "--doctest-modules --doctest-cython" volumes: *conda-volumes +# pytest is installed with an upper pin of 8.0.0 because +# newer version breaks cython doctesting, see: +# https://github.com/lgpage/pytest-cython/issues/58 +# Remove pip install pytest~=7 when upstream issue is resolved command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && +pip install pytest~=7.4 && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && /arrow/ci/scripts/python_test.sh /arrow"] diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index 461c24af22..767e7f6b69 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -81,6 +81,7 @@ def s3_example_fs(s3_server): host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http_override={}:{}" +"_bucket_creation=True" .format(access_key, secret_key, host, port) ) fs, path = FileSystem.from_uri(uri) diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 2108d70a54..975477c422 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,5 +1,5 @@ cffi hypothesis pandas -pytest<8 +pytest pytz diff --git a/python/requirements-wheel-test.txt b/python/requirements-wheel-test.txt index a1046bc18c..46bedc13ba 100644 --- a/python/requirements-wheel-test.txt +++ b/python/requirements-wheel-test.txt @@ -1,7 +1,7 @@ cffi cython hypothesis -pytest<8 +pytest pytz tzdata; sys_platform == 'win32'
(arrow) branch main updated: GH-34235: [Python] Correct test marker for join_asof tests (#40666)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ed47ad22c8 GH-34235: [Python] Correct test marker for join_asof tests (#40666) ed47ad22c8 is described below commit ed47ad22c8537b32abf27580e75fcf514be11f7e Author: Joris Van den Bossche AuthorDate: Tue Mar 19 13:57:56 2024 +0100 GH-34235: [Python] Correct test marker for join_asof tests (#40666) Small follow-up on https://github.com/apache/arrow/pull/34234 fixing the marker for a newly added test, fixing the minimal builds * GitHub Issue: #34235 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index d0a7ccacac..72e8cb73e1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2901,6 +2901,7 @@ def test_table_join_asof_by_length_mismatch(): ) +@pytest.mark.dataset def test_table_join_asof_by_type_mismatch(): t1 = pa.table({ "colA": [1, 2, 6], @@ -2922,6 +2923,7 @@ def test_table_join_asof_by_type_mismatch(): ) +@pytest.mark.dataset def test_table_join_asof_on_type_mismatch(): t1 = pa.table({ "colA": [1, 2, 6],
(arrow) branch main updated (00a48217e9 -> 681be03cfc)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 00a48217e9 GH-38768: [Python] Slicing an array backwards beyond the start now includes first item. (#39240) add 681be03cfc GH-34235: [Python] Add `join_asof` binding (#34234) No new revisions were added by this update. Summary of changes: cpp/src/arrow/acero/asof_join_node.cc | 2 +- python/pyarrow/_acero.pyx | 79 ++ python/pyarrow/_dataset.pyx| 64 +++ python/pyarrow/acero.py| 87 +++ python/pyarrow/includes/libarrow_acero.pxd | 7 ++ python/pyarrow/table.pxi | 86 +++ python/pyarrow/tests/test_acero.py | 35 +++ python/pyarrow/tests/test_dataset.py | 114 python/pyarrow/tests/test_table.py | 163 + 9 files changed, 636 insertions(+), 1 deletion(-)
(arrow) branch main updated (03c771a626 -> 00a48217e9)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 03c771a626 GH-37989: [Python] Plug reference leaks when creating Arrow array from Python list of dicts (#40412) add 00a48217e9 GH-38768: [Python] Slicing an array backwards beyond the start now includes first item. (#39240) No new revisions were added by this update. Summary of changes: python/pyarrow/array.pxi | 40 +- python/pyarrow/tests/test_array.py | 1 + 2 files changed, 23 insertions(+), 18 deletions(-)
(arrow) branch main updated (7f361fd806 -> 03c771a626)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 7f361fd806 GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) add 03c771a626 GH-37989: [Python] Plug reference leaks when creating Arrow array from Python list of dicts (#40412) No new revisions were added by this update. Summary of changes: python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7f361fd806 GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) 7f361fd806 is described below commit 7f361fd80651010f3dc91ec6302f661a16892291 Author: Judah Rand <17158624+judahr...@users.noreply.github.com> AuthorDate: Fri Mar 15 15:15:38 2024 + GH-30915: [C++][Python] Add missing methods to `RecordBatch` (#39506) ### Rationale for this change These methods are present on `Table` but missing on `RecordBatch`: * `add_column` * `append_column` * `remove_column` * `set_column` * `drop_columns` * `rename_columns` * `cast` We also should probably accept a `dict` as input to `pa.record_batch` like we do for `pa.table`. ### What changes are included in this PR? Add the methods. ### Are these changes tested? Yes. * Parent issue: https://github.com/apache/arrow/issues/36399 * Related: #30559 * Closes #30915 * GitHub Issue: #30915 Lead-authored-by: Judah Rand <17158624+judahr...@users.noreply.github.com> Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/record_batch.cc| 29 +++ cpp/src/arrow/record_batch.h | 7 + cpp/src/arrow/record_batch_test.cc | 26 ++ python/pyarrow/includes/libarrow.pxd | 7 + python/pyarrow/table.pxi | 474 +++ python/pyarrow/tests/test_table.py | 202 +-- 6 files changed, 561 insertions(+), 184 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index d23b2b584b..d52ebe053b 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -395,6 +395,35 @@ Result> RecordBatch::ReplaceSchema( return RecordBatch::Make(std::move(schema), num_rows(), columns()); } +std::vector RecordBatch::ColumnNames() const { + std::vector names(num_columns()); + for (int i = 0; i < num_columns(); ++i) { +names[i] = 
schema()->field(i)->name(); + } + return names; +} + +Result> RecordBatch::RenameColumns( +const std::vector& names) const { + int n = num_columns(); + + if (static_cast(names.size()) != n) { +return Status::Invalid("tried to rename a record batch of ", n, " columns but only ", + names.size(), " names were provided"); + } + + ArrayVector columns(n); + FieldVector fields(n); + + for (int i = 0; i < n; ++i) { +columns[i] = column(i); +fields[i] = schema()->field(i)->WithName(names[i]); + } + + return RecordBatch::Make(::arrow::schema(std::move(fields)), num_rows(), + std::move(columns)); +} + Result> RecordBatch::SelectColumns( const std::vector& indices) const { int n = static_cast(indices.size()); diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 8a2c1ba6d7..16d721caad 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -227,6 +227,13 @@ class ARROW_EXPORT RecordBatch { /// \return PrettyPrint representation suitable for debugging std::string ToString() const; + /// \brief Return names of all columns + std::vector ColumnNames() const; + + /// \brief Rename columns with provided names + Result> RenameColumns( + const std::vector& names) const; + /// \brief Return new record batch with specified columns Result> SelectColumns( const std::vector& indices) const; diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index db68a9a937..45cf7cae65 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -315,6 +315,32 @@ TEST_F(TestRecordBatch, RemoveColumn) { AssertBatchesEqual(*new_batch, *batch4); } +TEST_F(TestRecordBatch, RenameColumns) { + const int length = 10; + + auto field1 = field("f1", int32()); + auto field2 = field("f2", uint8()); + auto field3 = field("f3", int16()); + + auto schema1 = ::arrow::schema({field1, field2, field3}); + + random::RandomArrayGenerator gen(42); + + auto array1 = gen.ArrayOf(int32(), length); + auto array2 = 
gen.ArrayOf(uint8(), length); + auto array3 = gen.ArrayOf(int16(), length); + + auto batch = RecordBatch::Make(schema1, length, {array1, array2, array3}); + EXPECT_THAT(batch->ColumnNames(), testing::ElementsAre("f1", "f2", "f3")); + + ASSERT_OK_AND_ASSIGN(auto renamed, batch->RenameColumns({"zero", "one", "two"})); + EXPECT_THAT(renamed->ColumnNames(), testing::ElementsAre("zero", "one", &quo
(arrow) branch main updated: GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 0402e306a9 GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292) 0402e306a9 is described below commit 0402e306a9d9f57ff22c87bf8689b8e7203483e5 Author: Joris Van den Bossche AuthorDate: Fri Mar 15 15:17:14 2024 +0100 GH-40291: [Python] Accept dict in pyarrow.record_batch() function (#40292) ### Rationale for this change `pa.table(dict)` works, but `pa.record_batch(dict)` is not supported. Let's make this consistent. Also harmonized the documentation for the `data` argument for both functions. * GitHub Issue: #40291 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/table.pxi | 49 +--- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index dfd549befc..9f60150427 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -5109,10 +5109,10 @@ def record_batch(data, names=None, schema=None, metadata=None): Parameters -- -data : pandas.DataFrame, list, Arrow-compatible table -A DataFrame, list of arrays or chunked arrays, or a tabular object -implementing the Arrow PyCapsule Protocol (has an -``__arrow_c_array__`` method). +data : dict, list, pandas.DataFrame, Arrow-compatible table +A mapping of strings to Arrays or Python lists, a list of Arrays, +a pandas DataFame, or any tabular object implementing the +Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` method). names : list, default None Column names if list of arrays passed as data. Mutually exclusive with 'schema' argument. 
@@ -5137,16 +5137,16 @@ def record_batch(data, names=None, schema=None, metadata=None): >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) >>> names = ["n_legs", "animals"] -Creating a RecordBatch from a list of arrays with names: +Construct a RecordBatch from a python dictionary: ->>> pa.record_batch([n_legs, animals], names=names) +>>> pa.record_batch({"n_legs": n_legs, "animals": animals}) pyarrow.RecordBatch n_legs: int64 animals: string n_legs: [2,2,4,4,5,100] animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] ->>> pa.record_batch([n_legs, animals], names=["n_legs", "animals"]).to_pandas() +>>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() n_legsanimals 0 2 Flamingo 1 2 Parrot @@ -5155,6 +5155,16 @@ def record_batch(data, names=None, schema=None, metadata=None): 4 5 Brittle stars 5 100 Centipede +Creating a RecordBatch from a list of arrays with names: + +>>> pa.record_batch([n_legs, animals], names=names) +pyarrow.RecordBatch +n_legs: int64 +animals: string + +n_legs: [2,2,4,4,5,100] +animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + Creating a RecordBatch from a list of arrays with names and metadata: >>> my_metadata={"n_legs": "How many legs does an animal have?"} @@ -5231,6 +5241,11 @@ def record_batch(data, names=None, schema=None, metadata=None): if isinstance(data, (list, tuple)): return RecordBatch.from_arrays(data, names=names, schema=schema, metadata=metadata) +elif isinstance(data, dict): +if names is not None: +raise ValueError( +"The 'names' argument is not valid when passing a dictionary") +return RecordBatch.from_pydict(data, schema=schema, metadata=metadata) elif hasattr(data, "__arrow_c_array__"): if schema is not None: requested_schema = schema.__arrow_c_schema__() @@ -5241,7 +5256,7 @@ def record_batch(data, names=None, schema=None, metadata=None): if schema is not None and batch.schema != schema: # __arrow_c_array__ coerces schema 
with best effort, so we might # need to cast it if the producer wasn't able to cast to exact schema. -batch = Table.from_batches([batch]).cast(schema).to_batches()[0] +batch = batch.cast(schema) return bat
(arrow) branch main updated (a1fd4c4964 -> fd1e9ca81f)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from a1fd4c4964 MINOR: [Go] Bump github.com/andybalholm/brotli from 1.0.5 to 1.1.0 in /go (#40531) add fd1e9ca81f GH-39444: [Python] Fix parquet import in encryption test (#40505) No new revisions were added by this update. Summary of changes: dev/tasks/tasks.yml | 1 + python/pyarrow/tests/test_dataset_encryption.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-)
(arrow) branch main updated (dd6d7288e4 -> 9f6dc1feb5)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dd6d7288e4 GH-39444: [C++][Parquet] Fix crash in Modular Encryption (#39623) add 9f6dc1feb5 GH-39968: [Python][FS][Azure] Minimal Python bindings for `AzureFileSystem` (#40021) No new revisions were added by this update. Summary of changes: ci/docker/alpine-linux-3.16-cpp.dockerfile| 1 + ci/docker/fedora-39-cpp.dockerfile| 1 + ci/docker/linux-apt-docs.dockerfile | 1 + ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + cpp/src/arrow/filesystem/api.h| 5 +- cpp/src/arrow/filesystem/azurefs_test.cc | 2 + cpp/src/arrow/filesystem/type_fwd.h | 7 +- cpp/src/arrow/util/config.h.cmake | 1 + python/CMakeLists.txt | 4 + python/pyarrow/__init__.py| 3 +- python/pyarrow/_azurefs.pyx | 134 ++ python/pyarrow/_fs.pyx| 3 + python/pyarrow/conftest.py| 9 +- python/pyarrow/fs.py | 4 + python/pyarrow/includes/libarrow_fs.pxd | 16 +++ python/pyarrow/tests/conftest.py | 31 ++ python/pyarrow/tests/test_fs.py | 78 ++- python/setup.py | 8 ++ 19 files changed, 303 insertions(+), 7 deletions(-) create mode 100644 python/pyarrow/_azurefs.pyx
(arrow) branch main updated: GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 788200a434 GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429) 788200a434 is described below commit 788200a434462325c9feff4b52203520a90694e4 Author: Joris Van den Bossche AuthorDate: Wed Mar 13 14:20:52 2024 +0100 GH-40428: [Python][CI] Fix dataset partition filter tests with pandas nightly (#40429) ### Rationale for this change From debugging the failure, it seems this is due to pandas changing a filter operation to sometimes preserve a RangeIndex now instead of returning an Integer64Index. And the conversion to Arrow changes based on that (RangeIndex is metadata only by default, integer index becomes a column) Therefore making the tests more robust to ensure there is always at least one non-partition column in the DataFrame, so it doesn't depend on the index whether the result is empty or not. 
* GitHub Issue: #40428 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/parquet/test_dataset.py | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index 30dae05124..47e608a140 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -107,9 +107,9 @@ def test_filters_equivalency(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), -'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 3), -}, columns=['integer', 'string', 'boolean']) +'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), +'values': np.arange(30), +}) _generate_partition_directories(local, base_path, partition_spec, df) @@ -312,9 +312,9 @@ def test_filters_inclusive_set(tempdir): df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), -'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), - 3), -}, columns=['integer', 'string', 'boolean']) +'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), +'values': np.arange(30), +}) _generate_partition_directories(local, base_path, partition_spec, df)
(arrow) branch main updated (acdf2a7f68 -> a421314900)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from acdf2a7f68 GH-40312: [Python] Add ListView documentation to user guide (#40313) add a421314900 GH-40376: [Python] Update for NumPy 2.0 ABI change in PyArray_Descr->elsize (#40418) No new revisions were added by this update. Summary of changes: .env | 2 +- python/CMakeLists.txt | 3 +++ python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 10 ++ python/pyarrow/src/arrow/python/numpy_convert.cc | 6 +++--- python/pyarrow/src/arrow/python/numpy_interop.h| 7 +++ python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 21 - 6 files changed, 32 insertions(+), 17 deletions(-)
(arrow) branch main updated (b202ede131 -> 6121b3fd06)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from b202ede131 GH-40458: [Release][Docs] Changes for version and warning banner should not affect minor releases (#40459) add 6121b3fd06 GH-40485: [Python][CI] Skip failing test_dateutil_tzinfo_to_string (#40486) No new revisions were added by this update. Summary of changes: python/pyarrow/tests/test_types.py | 5 + 1 file changed, 5 insertions(+)
(arrow) branch main updated: GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new e950eb4baa GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378) e950eb4baa is described below commit e950eb4baa73b9ab4e498e71354738c56287c48d Author: Joris Van den Bossche AuthorDate: Thu Mar 7 13:36:53 2024 +0100 GH-40377: [Python][CI] Fix install of nightly dask in integration tests (#40378) ### Rationale for this change Use a proper (non-deprecated) way of installing from git with an "extra", which also fixes the currently failing installation. * GitHub Issue: #40377 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/install_dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 478c1d5997..b89e43cfb3 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -27,7 +27,8 @@ fi dask=$1 if [ "${dask}" = "upstream_devel" ]; then - pip install https://github.com/dask/dask/archive/main.tar.gz#egg=dask[dataframe] + pip install "dask[dataframe] @ git+https://github.com/dask/dask.git; + pip install -U git+https://github.com/dask-contrib/dask-expr.git elif [ "${dask}" = "latest" ]; then pip install dask[dataframe] else
(arrow) branch main updated (ef6ea6beed -> 3d467ac7bf)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from ef6ea6beed GH-40345: [FlightRPC][C++][Java][Go] Add URI scheme to reuse connection (#40084) add 3d467ac7bf GH-20127: [Python][CI] Remove legacy hdfs tests from hdfs and hypothesis setup (#40363) No new revisions were added by this update. Summary of changes: ci/scripts/integration_hdfs.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
(arrow) branch main updated (4ce9a5edd2 -> 3ba6d286ca)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 4ce9a5edd2 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) add 3ba6d286ca GH-40059: [C++][Python] Basic conversion of RecordBatch to Arrow Tensor (#40064) No new revisions were added by this update. Summary of changes: cpp/src/arrow/record_batch.cc| 92 ++ cpp/src/arrow/record_batch.h | 8 ++ cpp/src/arrow/record_batch_test.cc | 229 +++ python/pyarrow/includes/libarrow.pxd | 2 + python/pyarrow/table.pxi | 14 +++ python/pyarrow/tests/test_table.py | 142 ++ 6 files changed, 487 insertions(+)
(arrow) branch main updated: GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 4ce9a5edd2 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) 4ce9a5edd2 is described below commit 4ce9a5edd2710fb8bf0c642fd0e3863b01c2ea20 Author: Antoine Pitrou AuthorDate: Tue Mar 5 08:56:25 2024 +0100 GH-40153: [Python] Make `Tensor.__getbuffer__` work on 32-bit platforms (#40294) ### Rationale for this change `Tensor.__getbuffer__` would silently assume that `Py_ssize_t` is the same width as `int64_t`, which is true only on 64-bit platforms. ### What changes are included in this PR? Create an internal buffer of `Py_ssize_t` values mirroring a Tensor's shape and strides, to avoid relying on the aforementioned assumption. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #40153 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/lib.pxd| 2 ++ python/pyarrow/tensor.pxi | 17 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 48350212c2..b1187a77c2 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -295,6 +295,8 @@ cdef class Tensor(_Weakrefable): cdef readonly: DataType type +bytes _ssize_t_shape +bytes _ssize_t_strides cdef void init(self, const shared_ptr[CTensor]& sp_tensor) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 1afce7f4a1..6fb4fc99d7 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+# Avoid name clash with `pa.struct` function +import struct as _struct + cdef class Tensor(_Weakrefable): """ @@ -40,6 +43,14 @@ cdef class Tensor(_Weakrefable): self.sp_tensor = sp_tensor self.tp = sp_tensor.get() self.type = pyarrow_wrap_data_type(self.tp.type()) +self._ssize_t_shape = self._make_shape_or_strides_buffer(self.shape) +self._ssize_t_strides = self._make_shape_or_strides_buffer(self.strides) + +def _make_shape_or_strides_buffer(self, values): +""" +Make a bytes object holding an array of `values` cast to `Py_ssize_t`. +""" +return _struct.pack(f"{len(values)}n", *values) def __repr__(self): return """ @@ -282,10 +293,8 @@ strides: {0.strides}""".format(self) buffer.readonly = 0 else: buffer.readonly = 1 -# NOTE: This assumes Py_ssize_t == int64_t, and that the shape -# and strides arrays lifetime is tied to the tensor's -buffer.shape = ()[0] -buffer.strides = ()[0] +buffer.shape = cp.PyBytes_AsString(self._ssize_t_shape) +buffer.strides = cp.PyBytes_AsString(self._ssize_t_strides) buffer.suboffsets = NULL
(arrow) branch main updated: GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2b194ad222 GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825) 2b194ad222 is described below commit 2b194ad222f4dc8ecf2eb73539ab8cab5b1fc5e7 Author: Alenka Frim AuthorDate: Mon Mar 4 13:33:18 2024 +0100 GH-20127: [Python] Remove deprecated pyarrow.filesystem legacy implementations (#39825) This PR removes the `pyarrow.filesystem` and `pyarrow.hdfs` filesystems that have been deprecated since 2.0.0. * Closes: #20127 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Antoine Pitrou Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/filesystems_deprecated.rst | 88 docs/source/python/index.rst | 1 - python/CMakeLists.txt | 1 - python/pyarrow/__init__.py | 47 +- python/pyarrow/_hdfsio.pyx | 478 --- python/pyarrow/filesystem.py | 511 - python/pyarrow/fs.py | 25 +- python/pyarrow/hdfs.py | 240 -- python/pyarrow/io.pxi | 13 + python/pyarrow/parquet/core.py | 30 +- python/pyarrow/tests/parquet/test_basic.py | 5 +- python/pyarrow/tests/parquet/test_dataset.py | 137 +++--- .../pyarrow/tests/parquet/test_parquet_writer.py | 43 -- python/pyarrow/tests/test_filesystem.py| 75 --- python/pyarrow/tests/test_hdfs.py | 451 -- python/setup.py| 1 - 16 files changed, 93 insertions(+), 2053 deletions(-) diff --git a/docs/source/python/filesystems_deprecated.rst b/docs/source/python/filesystems_deprecated.rst deleted file mode 100644 index c51245341b..00 --- a/docs/source/python/filesystems_deprecated.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. 
regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Filesystem Interface (legacy) -= - -.. warning:: - This section documents the deprecated filesystem layer. You should - use the :ref:`new filesystem layer ` instead. - -.. _hdfs: - -Hadoop File System (HDFS) -- - -PyArrow comes with bindings to a C++-based interface to the Hadoop File -System. You connect like so: - -.. code-block:: python - - import pyarrow as pa - fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path) - with fs.open(path, 'rb') as f: - # Do something with f - -By default, ``pyarrow.hdfs.HadoopFileSystem`` uses libhdfs, a JNI-based -interface to the Java Hadoop client. This library is loaded **at runtime** -(rather than at link / library load time, since the library may not be in your -LD_LIBRARY_PATH), and relies on some environment variables. - -* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has - `lib/native/libhdfs.so`. - -* ``JAVA_HOME``: the location of your Java SDK installation. - -* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is - installed somewhere other than ``$HADOOP_HOME/lib/native``. - -* ``CLASSPATH``: must contain the Hadoop jars. You can set these using: - -.. 
code-block:: shell - -export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob` - -If ``CLASSPATH`` is not set, then it will be set automatically if the -``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set. - -HDFS API - - -.. currentmodule:: pyarrow - -.. autosummary:: - :toctree: generated/ - - hdfs.connect - HadoopFileSystem.cat - HadoopFileSystem.chmod - HadoopFileSystem.chown - HadoopFileSystem.delete - HadoopFileSystem.df - HadoopFileSystem.disk_usage - HadoopFileSystem.download - HadoopFileSystem.exists - HadoopFileSystem.get_capacity - HadoopFileS
(arrow) branch main updated: GH-39855: [Python] ListView support for pa.array() (#40160)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7c4f4c2bb1 GH-39855: [Python] ListView support for pa.array() (#40160) 7c4f4c2bb1 is described below commit 7c4f4c2bb140fb51a6c26908f2420a972c7f48e0 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Fri Mar 1 02:30:01 2024 -0500 GH-39855: [Python] ListView support for pa.array() (#40160) ### Rationale for this change Add pa.array() instantiation support for ListView and LargeListView formats. ### What changes are included in this PR? * pa.array() supports creating ListView and LargeListView types * ListArray, LargeListArray now have their size initialized before adding elements during python-to-arrow conversion. This allows these types to be convertible to ListViewArray and LargeListViewArray types. ### Are these changes tested? Yes, unit tested. ### Are there any user-facing changes? Yes, new feature added. 
* Closes: #39855 * GitHub Issue: #39855 Authored-by: Dane Pitkin Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 23 +++- python/pyarrow/tests/strategies.py | 4 +- python/pyarrow/tests/test_array.py | 147 +++-- python/pyarrow/tests/test_convert_builtin.py | 44 -- python/pyarrow/tests/test_scalars.py | 43 +++--- 5 files changed, 218 insertions(+), 43 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 3c4d59d659..a0bae2f501 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -581,7 +581,8 @@ struct PyConverterTrait< }; template -struct PyConverterTrait> { +struct PyConverterTrait< +T, enable_if_t::value || is_list_view_type::value>> { using type = PyListConverter; }; @@ -803,7 +804,6 @@ class PyListConverter : public ListConverter { return this->list_builder_->AppendNull(); } -RETURN_NOT_OK(this->list_builder_->Append()); if (PyArray_Check(value)) { RETURN_NOT_OK(AppendNdarray(value)); } else if (PySequence_Check(value)) { @@ -824,6 +824,21 @@ class PyListConverter : public ListConverter { } protected: + // MapType does not support args in the Append() method + Status AppendTo(const MapType*, int64_t size) { return this->list_builder_->Append(); } + + // FixedSizeListType does not support args in the Append() method + Status AppendTo(const FixedSizeListType*, int64_t size) { +return this->list_builder_->Append(); + } + + // ListType requires the size argument in the Append() method + // in order to be convertible to a ListViewType. ListViewType + // requires the size argument in the Append() method always. 
+ Status AppendTo(const BaseListType*, int64_t size) { +return this->list_builder_->Append(true, size); + } + Status ValidateBuilder(const MapType*) { if (this->list_builder_->key_builder()->null_count() > 0) { return Status::Invalid("Invalid Map: key field cannot contain null values"); @@ -836,11 +851,14 @@ class PyListConverter : public ListConverter { Status AppendSequence(PyObject* value) { int64_t size = static_cast(PySequence_Size(value)); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); return this->value_converter_->Extend(value, size); } Status AppendIterable(PyObject* value) { +auto size = static_cast(PyObject_Size(value)); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); PyObject* iterator = PyObject_GetIter(value); OwnedRef iter_ref(iterator); while (PyObject* item = PyIter_Next(iterator)) { @@ -857,6 +875,7 @@ class PyListConverter : public ListConverter { return Status::Invalid("Can only convert 1-dimensional array values"); } const int64_t size = PyArray_SIZE(ndarray); +RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); const auto value_type = this->value_converter_->builder()->type(); diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index bb88a4dcb7..7affe815a2 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -167,7 +167,9 @@ def list_types(item_strategy=primitive_types): pa.list_, item_strategy, st.integers(min_value=0, max_value=16) -) +), +st.builds(pa.list_v
(arrow) branch main updated (99c5412a6a -> d6b9051fa0)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 99c5412a6a GH-39979: [Python] Low-level bindings for exporting/importing the C Device Interface (#39980) add d6b9051fa0 GH-40066: [Python] Support `requested_schema` in `__arrow_c_stream__()` (#40070) No new revisions were added by this update. Summary of changes: python/pyarrow/includes/libarrow_python.pxd | 8 python/pyarrow/ipc.pxi | 39 ++--- python/pyarrow/src/arrow/python/ipc.cc | 66 python/pyarrow/src/arrow/python/ipc.h | 20 + python/pyarrow/table.pxi| 23 ++ python/pyarrow/tests/test_cffi.py | 18 +++- python/pyarrow/tests/test_ipc.py| 68 +++-- python/pyarrow/tests/test_table.py | 32 +- 8 files changed, 261 insertions(+), 13 deletions(-)
(arrow) branch main updated: GH-40266: [Python] Mark ListView as a nested type (#40265)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new d519a4cb05 GH-40266: [Python] Mark ListView as a nested type (#40265) d519a4cb05 is described below commit d519a4cb05773dc6ef36e02c963b5e27c73d06e5 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Wed Feb 28 04:37:34 2024 -0500 GH-40266: [Python] Mark ListView as a nested type (#40265) ### Rationale for this change ListView types are nested, so `is_nested()` should return True. ### What changes are included in this PR? * `pa.types.is_nested(pa.list_view())` returns True ### Are these changes tested? Yes, unit tested. ### Are there any user-facing changes? Yes. * GitHub Issue: #40266 Authored-by: Dane Pitkin Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 2 ++ python/pyarrow/types.py| 1 + 2 files changed, 3 insertions(+) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e048ed6fa5..a79702a8ca 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -216,6 +216,8 @@ def test_is_nested_or_struct(): assert types.is_nested(pa.list_(pa.int32())) assert types.is_nested(pa.list_(pa.int32(), 3)) assert types.is_nested(pa.large_list(pa.int32())) +assert types.is_nested(pa.list_view(pa.int32())) +assert types.is_nested(pa.large_list_view(pa.int32())) assert not types.is_nested(pa.int32()) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 6c262b49cb..66b1ec3395 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -41,6 +41,7 @@ _TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} _NESTED_TYPES = {lib.Type_LIST, lib.Type_FIXED_SIZE_LIST, lib.Type_LARGE_LIST, + lib.Type_LIST_VIEW, 
lib.Type_LARGE_LIST_VIEW, lib.Type_STRUCT, lib.Type_MAP} | _UNION_TYPES
(arrow) branch main updated (3f7b2884dc -> 06d841ee7d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 3f7b2884dc GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) add 06d841ee7d MINOR: [Documentation][C++][Python][R] Clarify docstrings around max_chunksize (#40251) No new revisions were added by this update. Summary of changes: cpp/src/arrow/ipc/writer.h | 4 ++-- cpp/src/arrow/table.h | 4 ++-- python/pyarrow/_flight.pyx | 4 ++-- python/pyarrow/ipc.pxi | 4 ++-- python/pyarrow/table.pxi | 12 ++-- r/R/flight.R | 3 ++- r/man/flight_put.Rd| 3 ++- 7 files changed, 18 insertions(+), 16 deletions(-)
(arrow) branch main updated (06d841ee7d -> c57115de8d)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from 06d841ee7d MINOR: [Documentation][C++][Python][R] Clarify docstrings around max_chunksize (#40251) add c57115de8d GH-40142: [Python] Allow FileInfo instances to be passed to dataset init (#40143) No new revisions were added by this update. Summary of changes: python/pyarrow/_dataset.pyx | 34 +--- python/pyarrow/dataset.py| 16 +++-- python/pyarrow/includes/libarrow_dataset.pxd | 8 +++ python/pyarrow/tests/test_dataset.py | 10 4 files changed, 58 insertions(+), 10 deletions(-)
(arrow) branch main updated: GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3f7b2884dc GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) 3f7b2884dc is described below commit 3f7b2884dccb4c0164092b754a2a76ccbb900154 Author: Hussein Awala AuthorDate: Tue Feb 27 14:28:55 2024 +0100 GH-40171: [Python] Add Type_FIXED_SIZE_LIST to _NESTED_TYPES set (#40172) ### Rationale for this change ### What changes are included in this PR? This PR fixes a minor bug in `types.is_nested` which doesn't consider the `FIXED_SIZE_LIST` type as nested type. ### Are these changes tested? ### Are there any user-facing changes? * Closes: #40171 Authored-by: hussein-awala Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_types.py | 1 + python/pyarrow/types.py| 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 0add578608..e048ed6fa5 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -214,6 +214,7 @@ def test_is_nested_or_struct(): assert types.is_nested(struct_ex) assert types.is_nested(pa.list_(pa.int32())) +assert types.is_nested(pa.list_(pa.int32(), 3)) assert types.is_nested(pa.large_list(pa.int32())) assert not types.is_nested(pa.int32()) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 0f68ca9fe5..6c262b49cb 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -40,8 +40,8 @@ _TEMPORAL_TYPES = ({lib.Type_TIMESTAMP, lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES | _INTERVAL_TYPES) _UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION} -_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, - lib.Type_MAP} | _UNION_TYPES +_NESTED_TYPES = {lib.Type_LIST, lib.Type_FIXED_SIZE_LIST, 
lib.Type_LARGE_LIST, + lib.Type_STRUCT, lib.Type_MAP} | _UNION_TYPES @doc(datatype="null")
(arrow) branch main updated: MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 5f3688351f MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156) 5f3688351f is described below commit 5f3688351f3adfba9a84d9e0bd65b300eabe35d2 Author: Joris Van den Bossche AuthorDate: Tue Feb 27 09:15:55 2024 +0100 MINOR: [Format] Clarify that the buffers for the Binary View layout differ in the C Data Interface (#40156) ### Rationale for this change Attempt to draw more attention to the fact that the buffer listing / number of buffers differ between the main Format spec and the C Data Interface, for the Binary View layout. Triggered by feedback from implementing this in duckdb at https://github.com/duckdb/duckdb/pull/10481#discussion_r1489245865 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/format/CDataInterface.rst | 7 ++- docs/source/format/Columnar.rst | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index ef4bf1cf32..fd9952b037 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -467,7 +467,10 @@ It has the following fields: Mandatory. The number of physical buffers backing this array. The number of buffers is a function of the data type, as described in the - :ref:`Columnar format specification `. + :ref:`Columnar format specification `, except for the + the binary or utf-8 view type, which has one additional buffer compared + to the Columnar format specification (see + :ref:`c-data-interface-binary-view-arrays`). Buffers of children arrays are not included. @@ -552,6 +555,8 @@ parameterized extension types). 
The ``ArrowArray`` structure exported from an extension array simply points to the storage data of the extension array. +.. _c-data-interface-binary-view-arrays: + Binary view arrays -- diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 84f251968f..7b74b972f2 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -409,6 +409,9 @@ All integers (length, buffer index, and offset) are signed. This layout is adapted from TU Munich's `UmbraDB`_. +Note that this layout uses one additional buffer to store the variadic buffer +lengths in the :ref:`Arrow C data interface `. + .. _variable-size-list-layout: Variable-size List Layout
(arrow) branch main updated: GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 8e53451cc4 GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093) 8e53451cc4 is described below commit 8e53451cc48081df20fdf52b82edcc52ea778ec5 Author: Joris Van den Bossche AuthorDate: Thu Feb 22 10:19:17 2024 +0100 GH-40092: [Python] Support Binary/StringView conversion to numpy/pandas (#40093) Last step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the arrow->pandas/numpy conversion code path. * Closes: #40092 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 22 +++--- python/pyarrow/tests/test_pandas.py| 14 ++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index e979342b88..2115cd8015 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -133,6 +133,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { +return PyUnicode_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { @@ -147,6 +154,13 @@ struct WrapBytes { } }; +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { +return PyBytes_FromStringAndSize(data, length); + } +}; + template <> struct WrapBytes { static inline PyObject* Wrap(const char* data, int64_t length) { @@ -1154,7 +1168,8 @@ struct ObjectWriterVisitor { } template - enable_if_t::value || is_fixed_size_binary_type::value, + enable_if_t::value || 
is_binary_view_like_type::value || + is_fixed_size_binary_type::value, Status> Visit(const Type& type) { auto WrapValue = [](const std::string_view& view, PyObject** out) { @@ -1355,8 +1370,7 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value || - std::is_base_of::value, + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", @@ -2086,8 +2100,10 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& break; case Type::STRING:// fall through case Type::LARGE_STRING: // fall through +case Type::STRING_VIEW: // fall through case Type::BINARY:// fall through case Type::LARGE_BINARY: +case Type::BINARY_VIEW: case Type::NA: // fall through case Type::FIXED_SIZE_BINARY:// fall through case Type::STRUCT: // fall through diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 89a241a27e..fdfd123a8c 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1760,6 +1760,20 @@ class TestConvertStringLikeTypes: _check_pandas_roundtrip( df, schema=pa.schema([('a', pa.large_string())])) +def test_binary_view(self): +s = pd.Series([b'123', b'', b'a', None]) +_check_series_roundtrip(s, type_=pa.binary_view()) +df = pd.DataFrame({'a': s}) +_check_pandas_roundtrip( +df, schema=pa.schema([('a', pa.binary_view())])) + +def test_string_view(self): +s = pd.Series(['123', '', 'a', None]) +_check_series_roundtrip(s, type_=pa.string_view()) +df = pd.DataFrame({'a': s}) +_check_pandas_roundtrip( +df, schema=pa.schema([('a', pa.string_view())])) + def test_table_empty_str(self): values = ['', '', '', '', ''] df = pd.DataFrame({'strings': values})
(arrow) branch main updated: GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6a22a1dee7 GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167) 6a22a1dee7 is described below commit 6a22a1dee78b0f7daa7e4d8793d663e29a5712a6 Author: Divyansh200102 <146909065+divyansh200...@users.noreply.github.com> AuthorDate: Wed Feb 21 20:00:24 2024 +0530 GH-39291: [Docs] Remove the "Show source" links from doc pages (#40167) ### Rationale for this change To fix the show source button links to 404 page problem ### What changes are included in this PR? The show source button link will be removed. ### Are these changes tested? Not yet ### Are there any user-facing changes? Yes * Closes: #39291 * GitHub Issue: #39291 Authored-by: Divyansh200102 Signed-off-by: Joris Van den Bossche --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5af7b7955f..c6be6cb94c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -414,7 +414,7 @@ html_baseurl = "https://arrow.apache.org/docs/; # If true, links to the reST sources are added to the pages. # -# html_show_sourcelink = True +html_show_sourcelink = False # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #
(arrow) branch main updated: GH-39999: [Python] Fix tests for pandas with CoW / nightly integration tests (#40000)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 40cb0a22c1 GH-3: [Python] Fix tests for pandas with CoW / nightly integration tests (#4) 40cb0a22c1 is described below commit 40cb0a22c1685a1861652b68b6eb394903cf3cba Author: Joris Van den Bossche AuthorDate: Fri Feb 9 09:04:16 2024 +0100 GH-3: [Python] Fix tests for pandas with CoW / nightly integration tests (#4) ### Rationale for this change Fixing a failing test with pandas nightly because of CoW changes. * Closes: #3 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/test_pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 8106219057..676cc96151 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3650,7 +3650,8 @@ def test_singleton_blocks_zero_copy(): prior_allocation = pa.total_allocated_bytes() result = t.to_pandas() -assert result['f0'].values.flags.writeable +# access private `_values` because the public `values` is made read-only by pandas +assert result['f0']._values.flags.writeable assert pa.total_allocated_bytes() > prior_allocation
(arrow-site) branch main updated: Fix errant line with pantab note (#472)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-site.git The following commit(s) were added to refs/heads/main by this push: new 2cdbe04f91b Fix errant line with pantab note (#472) 2cdbe04f91b is described below commit 2cdbe04f91b956476dc47445b678d155da0eb940 Author: William Ayd AuthorDate: Thu Feb 8 10:11:23 2024 -0500 Fix errant line with pantab note (#472) Follow up to https://github.com/apache/arrow-site/pull/471 this wasn't meant to be there --- powered_by.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/powered_by.md b/powered_by.md index 1ba4278d0ea..edb3ff53f9a 100644 --- a/powered_by.md +++ b/powered_by.md @@ -155,9 +155,8 @@ short description of your use case. supports reading and writing Parquet files using pyarrow. Several pandas core developers are also contributors to Apache Arrow. * **[pantab][52]:** Allows high performance read/writes of popular dataframe libraries - like pandas, polars pyarrow, etc... to/from Tableau's Hyper database. pantab uses nanoarrow + like pandas, polars, pyarrow, etc... to/from Tableau's Hyper database. pantab uses nanoarrow and the Arrow PyCapsule interface to make that exchange process seamless. - core developers are also contributors to Apache Arrow. * **[Parseable][51]:** Log analytics platform built for scale and usability. Ingest logs from anywhere and unify logs with Parseable. Parseable uses Arrow as the intermediary, in-memory data format for log data ingestion. * **[Perspective][23]:** Perspective is a streaming data visualization engine in JavaScript for building real-time & user-configurable analytics entirely in the browser. * **[Petastorm][28]:** Petastorm enables single machine or distributed training
(arrow) branch main updated: GH-39812: [Python] Add bindings for ListView and LargeListView (#39813)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 42e35f101e GH-39812: [Python] Add bindings for ListView and LargeListView (#39813) 42e35f101e is described below commit 42e35f101e87e689dcc48981abf81bc32c41d162 Author: Dane Pitkin <48041712+danepit...@users.noreply.github.com> AuthorDate: Thu Feb 8 09:44:19 2024 -0500 GH-39812: [Python] Add bindings for ListView and LargeListView (#39813) ### Rationale for this change Add bindings to the ListView and LargeListView array formats. ### What changes are included in this PR? * Add initial implementation for ListView and LargeListView * Add basic unit tests ### Are these changes tested? * Basic unit tests only (follow up PRs will be needed to implement full functionality) ### Are there any user-facing changes? Yes, documentation is updated in this PR to include the new PyArrow objects. 
* Closes: #39812 Lead-authored-by: Dane Pitkin Co-authored-by: Dane Pitkin <48041712+danepit...@users.noreply.github.com> Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst| 4 + docs/source/python/api/datatypes.rst | 4 + python/pyarrow/__init__.py | 14 +- python/pyarrow/array.pxi | 574 +++ python/pyarrow/includes/libarrow.pxd | 90 ++ python/pyarrow/lib.pxd | 18 ++ python/pyarrow/lib.pyx | 2 + python/pyarrow/public-api.pxi| 4 + python/pyarrow/scalar.pxi| 10 + python/pyarrow/tests/test_array.py | 71 + python/pyarrow/tests/test_misc.py| 4 + python/pyarrow/tests/test_scalars.py | 8 +- python/pyarrow/tests/test_types.py | 49 +++ python/pyarrow/types.pxi | 171 +++ python/pyarrow/types.py | 10 + 15 files changed, 1027 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcf..e6f6c3dbbd 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21..62bf4b7723 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. 
decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3..2ee97ddb66 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ from pyarrow.lib import (null, bool_, binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ from pyarrow.lib import (null, bool_, field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +203,9 @@ from pyarrow.lib
(arrow) branch main updated: GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7e2fe4fe76 GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853) 7e2fe4fe76 is described below commit 7e2fe4fe7634c359017213b79255c9040786fc06 Author: Joris Van den Bossche AuthorDate: Wed Feb 7 15:21:37 2024 +0100 GH-39852: [Python] Support creating Binary/StringView arrays from python objects (#39853) Next step for Binary/StringView support in Python (https://github.com/apache/arrow/issues/39633), now adding it to the python->arrow conversion code path. * Closes: #39852 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 35 ++ python/pyarrow/tests/test_convert_builtin.py | 19 ++-- python/pyarrow/tests/test_scalars.py | 28 +++-- 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d1d94ac17a..3c4d59d659 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -486,6 +486,10 @@ class PyValue { return view.ParseString(obj); } + static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) { +return view.ParseString(obj); + } + static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { ARROW_RETURN_NOT_OK(view.ParseString(obj)); @@ -499,8 +503,8 @@ class PyValue { } template - static enable_if_string Convert(const T*, const O& options, I obj, - PyBytesView& view) { + static enable_if_t::value || is_string_view_type::value, Status> + Convert(const T*, const O& options, I obj, PyBytesView& view) { if (options.strict) { // Strict conversion, force output to be unicode / utf8 
and validate that // any binary values are utf8 @@ -570,18 +574,12 @@ struct PyConverterTrait; template struct PyConverterTrait< -T, -enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value && !is_binary_view_like_type::value) || -std::is_same::value>> { +T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && +!is_extension_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; -template -struct PyConverterTrait> { - // not implemented -}; - template struct PyConverterTrait> { using type = PyListConverter; @@ -699,11 +697,22 @@ class PyPrimitiveConverter:: PyBytesView view_; }; +template +struct OffsetTypeTrait { + using type = typename T::offset_type; +}; + +template +struct OffsetTypeTrait> { + using type = int64_t; +}; + template -class PyPrimitiveConverter> +class PyPrimitiveConverter< +T, enable_if_t::value || is_binary_view_like_type::value>> : public PrimitiveConverter { public: - using OffsetType = typename T::offset_type; + using OffsetType = typename OffsetTypeTrait::type; Status Append(PyObject* value) override { if (PyValue::IsNull(this->options_, value)) { diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 49c4f1a6e7..55ea28f50f 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -763,6 +763,16 @@ def test_sequence_unicode(): assert arr.to_pylist() == data +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string(), pa.string_view()]) +def test_sequence_unicode_explicit_type(ty): +data = ['foo', 'bar', None, 'mañana'] +arr = pa.array(data, type=ty) +assert len(arr) == 4 +assert arr.null_count == 1 +assert arr.type == ty +assert arr.to_pylist() == data + + def check_array_mixed_unicode_bytes(binary_type, string_type): values = ['qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] @@ -787,6 +797,7 @@ def 
check_array_mixed_unicode_bytes(binary_type, string_type): def test_array_mixed_unicode_bytes(): check_array_mixed_unicode_bytes(pa.binary(), pa.string()) check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) +check_array_mixed_unicode_bytes(pa.binary_view(), pa.string_view()) @pytest.mark.large_memory @@ -818,7 +829,7 @@ def test_large_binary_value(ty): @pytest.mark.large_memory -@pytest.mark.parametrize("ty", [pa.binary(
(arrow) branch main updated (a1c1773b72 -> 4ceb661013)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from a1c1773b72 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) add 4ceb661013 GH-39880: [Python][CI] Pin moto<5 for dask integration tests (#39881) No new revisions were added by this update. Summary of changes: ci/scripts/install_dask.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
(arrow) branch main updated: GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new a1c1773b72 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) a1c1773b72 is described below commit a1c1773b724e4d78faf9a097247c7e976cd2cbfa Author: Joris Van den Bossche AuthorDate: Thu Feb 1 14:53:35 2024 +0100 GH-39555: [Packaging][Python] Enable building pyarrow against numpy 2.0 (#39557) ### Rationale for this change Ensure we can build pyarrow against numpy 2.0 nightly (update pyproject.toml to allow this), and test this by building our nightly wheels with numpy nightly. This also ensures that other projects that use our nightly wheels to test together with numpy nightly can do that (numpy 2.0 changes the ABI, so to run with numpy 2.0, your package needs to be built with numpy 2.x; currently pyarrow installed with our nightly wheel will fail to import when also numpy nightly is installed). See the parent issue https://github.com/apache/arrow/issues/39532 for details, and https://numpy.org/devdocs/dev/depending_on_numpy.html#numpy-2-0-specific-advice for a direct link to the NumPy guidelines on updating build dependencies for NumPy 2.0. 
* Closes: #39555 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- ci/docker/python-wheel-manylinux.dockerfile | 5 +++-- ci/docker/python-wheel-windows-vs2017.dockerfile | 3 ++- ci/scripts/python_wheel_macos_build.sh | 5 - python/pyproject.toml| 7 ++- python/requirements-build.txt| 3 ++- python/requirements-wheel-build.txt | 3 ++- python/setup.py | 2 +- 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 0a50d450c2..a07c727ac7 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -28,7 +28,7 @@ ENV MANYLINUX_VERSION=${manylinux} RUN yum install -y dnf # Install basic dependencies -RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers +RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget # A system Python is required for ninja and vcpkg in this Dockerfile. 
# On manylinux2014 base images, system Python is 2.7.5, while @@ -97,4 +97,5 @@ SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] COPY python/requirements-wheel-build.txt /arrow/python/ -RUN pip install -r /arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN pip install -r /arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple; diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index faf07800c9..067105b3a7 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -88,7 +88,8 @@ RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt arrow/python/ -RUN python -m pip install -r arrow/python/requirements-wheel-build.txt +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release +RUN python -m pip install -r arrow/python/requirements-wheel-build.txt --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple; # ENV CLCACHE_DIR="C:\clcache" # ENV CLCACHE_COMPRESS=1 diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index fd845c512d..8123a9fdf1 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -50,12 +50,15 @@ echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" +# TODO(GH-39848) Remove the `--pre --extra-index-url` for numpy nightly again before the 16.0 release pip install \ --upgrade \ --only-binary=:all: \ --target 
$PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ - -r ${source_dir}/python/requirements-wheel-build.txt + -r ${source_dir}/python/requirements-wheel-build.txt \ + --pre \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-whe
(arrow) branch main updated: GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3d45ac9653 GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819) 3d45ac9653 is described below commit 3d45ac96534fc76b820b488aa02182e6b93a388f Author: y.yoshida5 <39612448+yo1...@users.noreply.github.com> AuthorDate: Thu Feb 1 22:36:59 2024 +0900 GH-39779: [Python] Expose force_virtual_addressing in PyArrow (#39819) ### Rationale for this change / What changes are included in this PR? To expose force_virtual_addressing in PyArrow. ### Are these changes tested? Existing unit tests are not broken, and a new test case have been added. ### Are there any user-facing changes? pyarrow.fs.S3FileSystem: it becomes possible to specify the argument 'force_virtual_addressing'. * Closes: #39779 Authored-by: yo1956 Signed-off-by: Joris Van den Bossche --- python/pyarrow/_s3fs.pyx| 11 ++- python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 4 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 13b8c748cb..f5bab99a49 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -245,6 +245,11 @@ cdef class S3FileSystem(FileSystem): retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) The retry strategy to use with S3; fail after max_attempts. Available strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy. +force_virtual_addressing : bool, default False +Whether to use virtual addressing of buckets. +If true, then virtual addressing is always enabled. +If false, then virtual addressing is only enabled if `endpoint_override` is empty. +This can be used for non-AWS backends that only support virtual hosted-style access. 
Examples @@ -268,7 +273,9 @@ cdef class S3FileSystem(FileSystem): role_arn=None, session_name=None, external_id=None, load_frequency=900, proxy_options=None, allow_bucket_creation=False, allow_bucket_deletion=False, - retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)): + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy( + max_attempts=3), + force_virtual_addressing=False): cdef: optional[CS3Options] options shared_ptr[CS3FileSystem] wrapped @@ -380,6 +387,7 @@ cdef class S3FileSystem(FileSystem): options.value().allow_bucket_creation = allow_bucket_creation options.value().allow_bucket_deletion = allow_bucket_deletion +options.value().force_virtual_addressing = force_virtual_addressing if isinstance(retry_strategy, AwsStandardS3RetryStrategy): options.value().retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy( @@ -447,6 +455,7 @@ cdef class S3FileSystem(FileSystem): opts.proxy_options.username), 'password': frombytes( opts.proxy_options.password)}, +force_virtual_addressing=opts.force_virtual_addressing, ),) ) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index cb30f4e750..7876fb0f96 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -167,6 +167,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool background_writes c_bool allow_bucket_creation c_bool allow_bucket_deletion +c_bool force_virtual_addressing shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ab10addfc3..6ba5137e4f 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1186,6 +1186,10 @@ def test_s3_options(pickle_module): assert pickle_module.loads(pickle_module.dumps(fs2)) == fs2 assert fs2 != fs +fs = S3FileSystem(endpoint_override='localhost:8999', 
force_virtual_addressing=True) +assert isinstance(fs, S3FileSystem) +assert pickle_module.loads(pickle_module.dumps(fs)) == fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError):
(arrow) branch main updated: GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 44d5597a0e GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850) 44d5597a0e is described below commit 44d5597a0e8a4d635f1aec82ba885f61b5c17829 Author: Alenka Frim AuthorDate: Thu Feb 1 14:35:32 2024 +0100 GH-39849: [Python] Remove the use of pytest-lazy-fixture (#39850) ### Rationale for this change Removing the use of `pytest-lazy-fixture` in our test suite as it is unmaintained. Changes in this PR include: - Remove the use of `pytest-lazy-fixture` - Remove marks from fixtures to avoid future error, see ``` PytestRemovedIn9Warning: Marks applied to fixtures have no effect See docs: https://docs.pytest.org/en/stable/deprecations.html#applying-a-mark-to-a-fixture-function ``` - Catch two different warnings in `def test_legacy_int_type()` ### Are these changes tested? The changes affect the tests so they must pass. ### Are there any user-facing changes? No. 
* Closes: #39849 Lead-authored-by: AlenkaF Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_python.txt | 3 +-- dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 1 - python/pyarrow/tests/conftest.py| 7 +++--- python/pyarrow/tests/test_dataset.py| 3 --- python/pyarrow/tests/test_extension_type.py | 5 + python/pyarrow/tests/test_fs.py | 34 ++--- python/pyarrow/tests/test_ipc.py| 6 ++--- python/requirements-test.txt| 1 - python/requirements-wheel-test.txt | 1 - 9 files changed, 25 insertions(+), 36 deletions(-) diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 5fdd21d2bd..59e2def1bf 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -23,9 +23,8 @@ cloudpickle fsspec hypothesis numpy>=1.16.6 -pytest<8 # pytest-lazy-fixture broken on pytest 8.0.0 +pytest<8 pytest-faulthandler -pytest-lazy-fixture s3fs>=2023.10.0 setuptools setuptools_scm<8.0.0 diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml index b8ffbfdb71..367445c595 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml @@ -340,7 +340,6 @@ outputs: # test_cpp_extension_in_python requires a compiler - {{ compiler("cxx") }} # [linux] - pytest -- pytest-lazy-fixture - backports.zoneinfo # [py<39] - boto3 - cffi diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index a5941e8c8d..0da757a4bc 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -24,7 +24,6 @@ import time import urllib.request import pytest -from pytest_lazyfixture import lazy_fixture import hypothesis as h from ..conftest import groups, defaults @@ -259,13 +258,13 @@ def gcs_server(): @pytest.fixture( params=[ -lazy_fixture('builtin_pickle'), -lazy_fixture('cloudpickle') +'builtin_pickle', +'cloudpickle' ], scope='session' ) def pickle_module(request): -return request.param +return 
request.getfixturevalue(request.param) @pytest.fixture(scope='session') diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a4838d63a6..a9054f0b17 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -100,7 +100,6 @@ def assert_dataset_fragment_convenience_methods(dataset): @pytest.fixture -@pytest.mark.parquet def mockfs(): mockfs = fs._MockFileSystem() @@ -221,7 +220,6 @@ def multisourcefs(request): @pytest.fixture -@pytest.mark.parquet def dataset(mockfs): format = ds.ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) @@ -2692,7 +2690,6 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir, pickle_module) @pytest.fixture -@pytest.mark.parquet def s3_example_simple(s3_server): from pyarrow.fs import FileSystem diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a88e20eefe..d8c792ef00 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1485,10 +1485,7 @@ def test_legacy_int_type(): batch = pa.RecordBatch.from_arrays([ext_arr], names=['ext']) buf = ipc_write_batch(batch) -with pytest.warns( -RuntimeWarning, -match="pickle-based deserialization of pyarrow.PyExtensionType " -
(arrow) branch main updated: GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 787afa1594 GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652) 787afa1594 is described below commit 787afa1594586d2d556d21471647f9cd2c55b18f Author: Joris Van den Bossche AuthorDate: Tue Jan 30 12:54:19 2024 +0100 GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652) ### Rationale for this change First step for https://github.com/apache/arrow/issues/39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, such that those can already be represented in pyarrow. (I exposed a variant of StringBuilder as well, just for now to be able to create test data) * Closes: #39651 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/python/api/arrays.rst | 4 ++ docs/source/python/api/datatypes.rst | 4 ++ python/pyarrow/__init__.py | 7 ++-- python/pyarrow/array.pxi | 14 +++ python/pyarrow/builder.pxi | 66 ++ python/pyarrow/includes/libarrow.pxd | 9 python/pyarrow/lib.pxd | 8 python/pyarrow/lib.pyx | 2 + python/pyarrow/scalar.pxi | 10 + python/pyarrow/src/arrow/python/helpers.cc | 2 + python/pyarrow/tests/test_builder.py | 21 +- python/pyarrow/tests/test_misc.py | 4 ++ python/pyarrow/tests/test_scalars.py | 28 - python/pyarrow/tests/test_types.py | 8 python/pyarrow/types.pxi | 32 +++ python/pyarrow/types.py| 10 + 16 files changed, 223 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index 73b5e063ff..b858862dcf 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,6 +63,8 @@ may expose data type-specific methods or properties. 
FixedSizeBinaryArray LargeBinaryArray LargeStringArray + BinaryViewArray, + StringViewArray, Time32Array Time64Array Date32Array @@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties. FixedSizeBinaryScalar LargeBinaryScalar LargeStringScalar + BinaryViewScalar + StringViewScalar Time32Scalar Time64Scalar Date32Scalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 4066ef3142..642c243b21 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas. large_binary large_string large_utf8 + binary_view + string_view decimal128 list_ large_list @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category is_large_binary is_large_unicode is_large_string + is_binary_view + is_string_view is_fixed_size_binary is_map is_dictionary diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9da94885ec..4dbd1258d3 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -163,7 +163,7 @@ from pyarrow.lib import (null, bool_, time32, time64, timestamp, date32, date64, duration, month_day_nano_interval, float16, float32, float64, - binary, string, utf8, + binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, list_, large_list, map_, struct, @@ -205,6 +205,7 @@ from pyarrow.lib import (null, bool_, FixedSizeListArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, @@ -223,8 +224,8 @@ from pyarrow.lib import (null, bool_, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, MonthDayNanoIntervalScalar, - BinaryScalar, LargeBinaryScalar, - StringScalar, LargeStringScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + 
StringScalar, LargeStringScalar, StringViewScalar
(arrow) branch main updated: GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new c6ab28677d GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758) c6ab28677d is described below commit c6ab28677ddf22799f3db277137708ac5b070acd Author: Joris Van den Bossche AuthorDate: Tue Jan 30 09:16:53 2024 +0100 GH-39640: [Docs] Pin pydata-sphinx-theme to 0.14.* (#39758) ### Rationale for this change Fixing the pinning syntax so we get the latest 0.14.x version (which is currently 0.14.4) * Closes: #39640 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/conda_env_sphinx.txt| 2 +- docs/requirements.txt | 2 +- docs/source/python/api/compute.rst | 2 +- docs/source/python/compute.rst | 4 ++-- docs/source/python/pandas.rst | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index d0f494d2e0..0e50875fc1 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme=0.14.1 +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/requirements.txt b/docs/requirements.txt index aee2eb662c..5d6fec7ddf 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme==0.14.1 +pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b879643017..928c607d13 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -590,4 +590,4 @@ User-Defined Functions :toctree: ../generated/ register_scalar_function - ScalarUdfContext + UdfContext diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 
e8a5b613c6..c02059a4f8 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -445,9 +445,9 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of -:class:`pyarrow.compute.ScalarUdfContext`. +:class:`pyarrow.compute.UdfContext`. This context exposes several useful attributes, particularly a -:attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for +:attr:`~pyarrow.compute.UdfContext.memory_pool` to be used for allocations in the context of the user-defined function. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index fda90c4f2a..23a4b73bd0 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -197,7 +197,7 @@ use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow .. ipython:: python - df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)}) + df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) df.dtypes df
(arrow) branch main updated: GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new c67d0260d4 GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760) c67d0260d4 is described below commit c67d0260d4e96472b5cbdff66ca67ead2b9abe4c Author: Alenka Frim AuthorDate: Thu Jan 25 10:21:57 2024 +0100 GH-39732: [Python][CI] Fix test failures with latest/nightly pandas (#39760) This PR rearranges if-else blocks in the `table` function (`table.pxi`) so that pandas dataframe object comes before checking for `__arrow_c_stream__` and `__arrow_c_array__`. * Closes: #39732 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/table.pxi | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d98c93e1c0..3c450d61a7 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -5202,7 +5202,17 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None): raise ValueError( "The 'names' argument is not valid when passing a dictionary") return Table.from_pydict(data, schema=schema, metadata=metadata) +elif _pandas_api.is_data_frame(data): +if names is not None or metadata is not None: +raise ValueError( +"The 'names' and 'metadata' arguments are not valid when " +"passing a pandas DataFrame") +return Table.from_pandas(data, schema=schema, nthreads=nthreads) elif hasattr(data, "__arrow_c_stream__"): +if names is not None or metadata is not None: +raise ValueError( +"The 'names' and 'metadata' arguments are not valid when " +"using Arrow PyCapsule Interface") if schema is not None: requested = schema.__arrow_c_schema__() else: @@ -5216,14 +5226,12 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None): table = table.cast(schema) return table elif hasattr(data, 
"__arrow_c_array__"): -batch = record_batch(data, schema) -return Table.from_batches([batch]) -elif _pandas_api.is_data_frame(data): if names is not None or metadata is not None: raise ValueError( "The 'names' and 'metadata' arguments are not valid when " -"passing a pandas DataFrame") -return Table.from_pandas(data, schema=schema, nthreads=nthreads) +"using Arrow PyCapsule Interface") +batch = record_batch(data, schema) +return Table.from_batches([batch]) else: raise TypeError( "Expected pandas DataFrame, python dictionary or list of arrays")
(arrow) branch main updated: GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 7e9f265878 GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360) 7e9f265878 is described below commit 7e9f2658786b966685ddedf6b90415968f207b75 Author: Rok Mihevc AuthorDate: Tue Jan 23 12:43:05 2024 +0100 GH-38655: [C++] "iso_calendar" kernel returns incorrect results for array length > 32 (#39360) ### Rationale for this change When defining `StructArray`'s field builders for `ISOCalendar` we don't pre-allocate memory and then use unsafe append. This causes the resulting array to be at most 32 rows long. ### What changes are included in this PR? This introduces required memory pre-allocation in the `ISOCalendar` c++ kernel. ### Are these changes tested? This adds a test for the Python wrapper. ### Are there any user-facing changes? Fixes the behavior of `iso_calendar` kernel. 
* Closes: #38655 Lead-authored-by: Rok Mihevc Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc | 2 +- python/pyarrow/tests/test_compute.py | 13 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index a88ce38936..f49e201492 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -1510,7 +1510,7 @@ struct ISOCalendar { for (int i = 0; i < 3; i++) { field_builders.push_back( checked_cast(struct_builder->field_builder(i))); - RETURN_NOT_OK(field_builders[i]->Reserve(1)); + RETURN_NOT_OK(field_builders[i]->Reserve(in.length)); } auto visit_null = [&]() { return struct_builder->AppendNull(); }; std::function visit_value; diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 34d4da580f..4b58dc65ba 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2263,6 +2263,19 @@ def test_extract_datetime_components(): _check_datetime_components(timestamps, timezone) +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_iso_calendar_longer_array(unit): +# https://github.com/apache/arrow/issues/38655 +# ensure correct result for array length > 32 +arr = pa.array([datetime.datetime(2022, 1, 2, 9)]*50, pa.timestamp(unit)) +result = pc.iso_calendar(arr) +expected = pa.StructArray.from_arrays( +[[2021]*50, [52]*50, [7]*50], +names=['iso_year', 'iso_week', 'iso_day_of_week'] +) +assert result.equals(expected) + + @pytest.mark.pandas @pytest.mark.skipif(sys.platform == "win32" and not util.windows_has_tzdata(), reason="Timezone database is not installed on Windows")
(arrow) branch main updated: MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new eed53bbd59 MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757) eed53bbd59 is described below commit eed53bbd59957a80c8f55fe4d265cd2371fbea11 Author: Joris Van den Bossche AuthorDate: Tue Jan 23 12:32:57 2024 +0100 MINOR: [Docs] Fix formatting of note on Device data interface docs (#39757) Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- docs/source/format/CDeviceDataInterface.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst index 76b7132681..b5b7229a67 100644 --- a/docs/source/format/CDeviceDataInterface.rst +++ b/docs/source/format/CDeviceDataInterface.rst @@ -341,8 +341,8 @@ Notes: * \(1) Currently unknown if framework has an event type to support. * \(2) Extension Device has producer defined semantics and thus if - synchronization is needed for an extension device, the producer - should document the type. + synchronization is needed for an extension device, the producer + should document the type. Semantics
(arrow) branch main updated: GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 96645ebc50 GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636) 96645ebc50 is described below commit 96645ebc5037b6b4eab127c274f4871bbef99d77 Author: Antoine Pitrou AuthorDate: Wed Jan 17 11:26:37 2024 +0100 GH-39599: [Python] Avoid leaking references to Numpy dtypes (#39636) ### Rationale for this change `PyArray_DescrFromScalar` returns a new reference, so we should be careful to decref it when we don't use it anymore. ### Are these changes tested? No. ### Are there any user-facing changes? No. * Closes: #39599 Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 3 +- python/pyarrow/includes/libarrow_python.pxd| 2 +- python/pyarrow/src/arrow/python/inference.cc | 5 +- python/pyarrow/src/arrow/python/numpy_convert.cc | 77 ++ python/pyarrow/src/arrow/python/numpy_convert.h| 6 +- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 11 ++-- python/pyarrow/src/arrow/python/python_to_arrow.cc | 6 +- python/pyarrow/types.pxi | 6 +- 8 files changed, 48 insertions(+), 68 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5c2d22aef1..1416f5f434 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -66,8 +66,7 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values, dtype = values.dtype if type is None and dtype != object: -with nogil: -check_status(NumPyDtypeToArrow(dtype, _type)) +c_type = GetResultValue(NumPyDtypeToArrow(dtype)) if type is not None: c_type = type.sp_type diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index e3179062a1..906f0b7d28 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -73,7 
+73,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: object obj, object mask, const PyConversionOptions& options, CMemoryPool* pool) -CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type) +CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype) CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, c_bool from_pandas, diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 9537aec574..10116f9afa 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -468,10 +468,7 @@ class TypeInferrer { if (numpy_dtype_count_ > 0) { // All NumPy scalars and Nones/nulls if (numpy_dtype_count_ + none_count_ == total_count_) { -std::shared_ptr type; -RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), )); -*out = type; -return Status::OK(); +return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out); } // The "bad path": data contains a mix of NumPy scalars and diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 4970680764..dfee88c092 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -59,12 +59,11 @@ NumPyBuffer::~NumPyBuffer() { #define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ case NPY_##NPY_NAME:\ -*out = FACTORY(); \ -break; +return FACTORY(); namespace { -Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { +Result> GetTensorType(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, _Type)) { return Status::TypeError("Did not pass numpy.dtype object"); } @@ -84,11 +83,8 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); -default: { - return Status::NotImplemented("Unsupported numpy type ", descr->type_num); -} } - return 
Status::OK(); + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } Status GetNumPyType(const DataType& type, int* type_num) { @@ -120,15 +116,21 @@ Status GetNumPyType(const DataType& type, int* type_num) { } // namespace -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { +Result> NumPyScalarToArrowDataType(PyObject* scalar) { + PyArray_Descr
(arrow) branch main updated: GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 63b769 GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609) 63b769 is described below commit 63b769f3ad6c724305b4182526307ab025d5 Author: Alenka Frim AuthorDate: Wed Jan 17 11:12:41 2024 +0100 GH-36412: [Python][CI] Fix extra deprecation warnings in the pandas nightly build (#39609) Fixes left deprecation warnings coming from the pandas development version, by updating our test code to avoid the deprecated patterns. * Closes: #36412 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 15 ++ python/pyarrow/tests/parquet/test_datetime.py | 4 +-- python/pyarrow/tests/test_compute.py | 6 ++-- python/pyarrow/tests/test_dataset.py | 6 ++-- python/pyarrow/tests/test_pandas.py | 42 +++ 5 files changed, 35 insertions(+), 38 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 39dee85492..61e6318e29 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -967,20 +967,9 @@ def _extract_index_level(table, result_table, field_name, # The serialized index column was removed by the user return result_table, None, None -pd = _pandas_api.pd - col = table.column(i) -values = col.to_pandas(types_mapper=types_mapper).values - -if hasattr(values, 'flags') and not values.flags.writeable: -# ARROW-1054: in pandas 0.19.2, factorize will reject -# non-writeable arrays when calling MultiIndex.from_arrays -values = values.copy() - -if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None: -index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz) -else: -index_level = pd.Series(values, 
dtype=values.dtype, copy=False) +index_level = col.to_pandas(types_mapper=types_mapper) +index_level.name = None result_table = result_table.remove_column( result_table.schema.get_field_index(field_name) ) diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index 6a9cbd4f73..0896eb37e6 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -116,7 +116,7 @@ def test_coerce_timestamps(tempdir): df_expected = df.copy() for i, x in enumerate(df_expected['datetime64']): if isinstance(x, np.ndarray): -df_expected['datetime64'][i] = x.astype('M8[us]') +df_expected.loc[i, 'datetime64'] = x.astype('M8[us]') tm.assert_frame_equal(df_expected, df_read) @@ -429,7 +429,7 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): # nanosecond timestamps by default n = 9 df = pd.DataFrame({'x': range(n)}, - index=pd.date_range('2017-01-01', freq='1n', periods=n)) + index=pd.date_range('2017-01-01', freq='ns', periods=n)) tb = pa.Table.from_pandas(df) filename = tempdir / 'written.parquet' diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d1eb605c71..34d4da580f 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2360,10 +2360,10 @@ def _check_temporal_rounding(ts, values, unit): unit_shorthand = { "nanosecond": "ns", "microsecond": "us", -"millisecond": "L", +"millisecond": "ms", "second": "s", "minute": "min", -"hour": "H", +"hour": "h", "day": "D" } greater_unit = { @@ -2371,7 +2371,7 @@ def _check_temporal_rounding(ts, values, unit): "microsecond": "ms", "millisecond": "s", "second": "min", -"minute": "H", +"minute": "h", "hour": "d", } ta = pa.array(ts) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ae2146c0bd..d473299f20 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ 
-178,12 +178,14 @@ def multisourcefs(request): # simply split the dataframe into four chunks to construct a data source # from each chunk into its ow
(arrow) branch main updated: GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 72ed58449e GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535) 72ed58449e is described below commit 72ed58449ea71aab1343d9adce19f177f20705cf Author: Joris Van den Bossche AuthorDate: Wed Jan 10 09:13:02 2024 +0100 GH-39533: [Python] NumPy 2.0 compat: remove usage of np.core (#39535) ### Rationale for this change Removing usage of `np.core`, as that is deprecated and will be removed in numpy 2.0. For this specific case, we can just hardcode the list of data types instead of using a numpy api (this list doesn't typically change). * Closes: #39533 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 3757d81a47..39dee85492 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -30,7 +30,6 @@ import re import warnings import numpy as np -from numpy.core.numerictypes import sctypes as _np_sctypes import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes # noqa @@ -789,9 +788,10 @@ def table_to_dataframe( # Set of the string repr of all numpy dtypes that can be stored in a pandas # dataframe (complex not included since not supported by Arrow) _pandas_supported_numpy_types = { -str(np.dtype(typ)) -for typ in (_np_sctypes['int'] + _np_sctypes['uint'] + _np_sctypes['float'] + -['object', 'bool']) +"int8", "int16", "int32", "int64", +"uint8", "uint16", "uint32", "uint64", +"float16", "float32", "float64", +"object", "bool" }
(arrow) branch main updated: GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 32d785ff40 GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538) 32d785ff40 is described below commit 32d785ff405e3cc31866faa38bc2704eb44fda60 Author: Joris Van den Bossche AuthorDate: Wed Jan 10 09:11:11 2024 +0100 GH-39537: [Packaging][Python] Add a numpy<2 pin to the install requirements for the 15.x release branch (#39538) ### Rationale for this change PyArrow wheels for the 15.0.0 release will not be compatible with future numpy 2.0 packages, therefore it is recommended to add this upper pin now for _releases_. We will keep the more flexible pin on the development branch (by reverting this commit on main, but so it can be cherry-picked in the release branch) * Closes: #39537 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index b1c825d84d..51eb40af08 100755 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ class BinaryDistribution(Distribution): install_requires = ( -'numpy >= 1.16.6', +'numpy >= 1.16.6, <2', )
(arrow) branch main updated: GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 48f704e2a3 GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498) 48f704e2a3 is described below commit 48f704e2a316131d180e0c2198c00671756c Author: Alenka Frim AuthorDate: Mon Jan 8 17:38:26 2024 +0100 GH-39437: [CI][Python] Update pandas tests failing on pandas nightly CI build (#39498) Update version checks and assertions of pyarrow array equality for pandas failing tests on the CI: [test-conda-python-3.10-pandas-nightly](https://github.com/ursacomputing/crossbow/actions/runs/7391976015/job/20109720695) * Closes: #39437 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/tests/parquet/test_pandas.py | 10 +++--- python/pyarrow/tests/test_pandas.py | 16 ++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index f194d12876..b5913bf5c6 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -404,6 +404,10 @@ caratcut color clarity depth table price x y z @pytest.mark.pandas def test_backwards_compatible_column_metadata_handling(datadir): +if Version("2.2.0") <= Version(pd.__version__): +# TODO: regression in pandas +# https://github.com/pandas-dev/pandas/issues/56775 +pytest.skip("Regression in pandas 2.2.0") expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -504,9 +508,9 @@ def test_categories_with_string_pyarrow_dtype(tempdir): df2 = df2.astype("category") # categories should be converted to pa.Array -assert pa.array(df1["x"]) == pa.array(df2["x"]) -assert 
pa.array(df1["x"].cat.categories.values) == pa.array( -df2["x"].cat.categories.values) +assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist() +assert pa.array(df1["x"].cat.categories.values).to_pylist() == pa.array( +df2["x"].cat.categories.values).to_pylist() path = str(tempdir / 'cat.parquet') pq.write_table(pa.table(df1), path) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 342beaaeb5..3353bebce7 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -261,6 +261,12 @@ class TestConvertMetadata: with warnings.catch_warnings(): warnings.simplefilter(action="error") +# make_block deprecation in pandas, still under discussion +# https://github.com/pandas-dev/pandas/pull/56422 +# https://github.com/pandas-dev/pandas/issues/40226 +warnings.filterwarnings( +"ignore", "make_block is deprecated", DeprecationWarning +) _check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns(self): @@ -311,6 +317,12 @@ class TestConvertMetadata: with warnings.catch_warnings(): warnings.simplefilter(action="error") +# make_block deprecation in pandas, still under discussion +# https://github.com/pandas-dev/pandas/pull/56422 +# https://github.com/pandas-dev/pandas/issues/40226 +warnings.filterwarnings( +"ignore", "make_block is deprecated", DeprecationWarning +) _check_pandas_roundtrip(df, preserve_index=True) def test_integer_index_column(self): @@ -465,7 +477,7 @@ class TestConvertMetadata: preserve_index=True) def test_binary_column_name(self): -if Version("2.0.0") <= Version(pd.__version__) < Version("2.2.0"): +if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"): # TODO: regression in pandas, hopefully fixed in next version # https://issues.apache.org/jira/browse/ARROW-18394 # https://github.com/pandas-dev/pandas/issues/50127 @@ -3095,7 +3107,7 @@ def _fully_loaded_dataframe_example(): @pytest.mark.parametrize('columns', ([b'foo'], ['foo'])) def 
test_roundtrip_with_bytes_unicode(columns): -if Version("2.0.0") <= Version(pd.__version__) < Version("2.2.0"): +if Version("2.0.0") <= Version(pd.__version__) < Version("2.3.0"): # TODO: regression in pandas, hopefully fixed in next version # https://issues.apache.org/jira/browse/ARROW-18394 # https://github.com/pandas-dev/pandas/issues/50127
(arrow) branch main updated (dc40e5fba1 -> 60b89ff0c9)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a change to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git from dc40e5fba1 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) add 60b89ff0c9 GH-33500: [Python] add `Table.to/from_struct_array` (#38520) No new revisions were added by this update. Summary of changes: python/pyarrow/table.pxi | 54 +++ python/pyarrow/tests/test_table.py | 75 ++ 2 files changed, 129 insertions(+)
(arrow) branch main updated: GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new dc40e5fba1 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) dc40e5fba1 is described below commit dc40e5fba1c9ace6da3de14158bb6195bed6fc58 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 16:49:14 2024 +0100 GH-39217: [Python] RecordBatchReader.from_stream constructor for objects implementing the Arrow PyCapsule protocol (#39218) ### Rationale for this change In contrast to Array, RecordBatch and Schema, for the C Stream (mapping to RecordBatchReader) we haven't an equivalent factory function that can accept any Arrow-compatible object and turn it into a pyarrow object through the PyCapsule Protocol. For that reason, this proposes an explicit constructor class method for this: `RecordBatchReader.from_stream` (this is a quite generic name, so other name suggestions are certainly welcome). ### Are these changes tested? TODO * Closes: #39217 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/ipc.pxi | 43 + python/pyarrow/tests/test_array.py | 4 ++-- python/pyarrow/tests/test_ipc.py | 44 ++ python/pyarrow/tests/test_table.py | 12 +-- 4 files changed, 95 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index ae52f5cf34..da9636dfc8 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -883,6 +883,49 @@ cdef class RecordBatchReader(_Weakrefable): self.reader = c_reader return self +@staticmethod +def from_stream(data, schema=None): +""" +Create RecordBatchReader from a Arrow-compatible stream object. + +This accepts objects implementing the Arrow PyCapsule Protocol for +streams, i.e. objects that have a ``__arrow_c_stream__`` method. 
+ +Parameters +-- +data : Arrow-compatible stream object +Any object that implements the Arrow PyCapsule Protocol for +streams. +schema : Schema, default None +The schema to which the stream should be casted, if supported +by the stream object. + +Returns +--- +RecordBatchReader +""" + +if not hasattr(data, "__arrow_c_stream__"): +raise TypeError( +"Expected an object implementing the Arrow PyCapsule Protocol for " +"streams (i.e. having a `__arrow_c_stream__` method), " +f"got {type(data)!r}." +) + +if schema is not None: +if not hasattr(schema, "__arrow_c_schema__"): +raise TypeError( +"Expected an object implementing the Arrow PyCapsule Protocol for " +"schema (i.e. having a `__arrow_c_schema__` method), " +f"got {type(schema)!r}." +) +requested = schema.__arrow_c_schema__() +else: +requested = None + +capsule = data.__arrow_c_stream__(requested) +return RecordBatchReader._import_from_c_capsule(capsule) + @staticmethod def from_batches(Schema schema not None, batches): """ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d598630dc2..3dcbf399f3 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3351,8 +3351,8 @@ def test_c_array_protocol(): def __init__(self, data): self.data = data -def __arrow_c_array__(self, requested_type=None): -return self.data.__arrow_c_array__(requested_type) +def __arrow_c_array__(self, requested_schema=None): +return self.data.__arrow_c_array__(requested_schema) # Can roundtrip through the C array protocol arr = ArrayWrapper(pa.array([1, 2, 3], type=pa.int64())) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 450d26e3b7..f75ec8158a 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -1194,3 +1194,47 @@ def test_py_record_batch_reader(): with pytest.raises(TypeError): reader = pa.RecordBatchReader.from_batches(None, batches) pass + + +def 
test_record_batch_reader_from_arrow_stream(): + +class StreamWrapper: +def __init__(self, batches): +self.batches = batches + +def __arrow_c_stream__(self, requested
(arrow) branch main updated: GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new ffcfabdb95 GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065) ffcfabdb95 is described below commit ffcfabdb956d72707557a1fcf113c6b7cb118f50 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 16:06:59 2024 +0100 GH-39064: [C++][Parquet] Support row group filtering for nested paths for struct fields (#39065) ### Rationale for this change Currently when filtering with a nested field reference, we were taking the corresponding parquet SchemaField for just the first index of the nested path, i.e. the parent node in the Parquet schema. But logically, filtering on statistics only works for a primitive leaf node. This PR changes that logic to iterate over all indices of the FieldPath, if nested, to ensure we use the actual corresponding child leaf node of the ParquetSchema to get the statistics from. ### Are there any user-facing changes? 
No, only improving performance by doing the filtering at the row group stage, instead of afterwards on the read data * Closes: #39064 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/dataset/file_parquet.cc | 39 ++ cpp/src/arrow/dataset/file_parquet.h | 8 ++ cpp/src/arrow/dataset/file_parquet_test.cc | 6 + python/pyarrow/tests/test_dataset.py | 36 +++ 4 files changed, 79 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 1c2fd2dea6..0ce0850292 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -161,7 +161,8 @@ bool IsNan(const Scalar& value) { } std::optional ColumnChunkStatisticsAsExpression( -const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) { +const FieldRef& field_ref, const SchemaField& schema_field, +const parquet::RowGroupMetaData& metadata) { // For the remaining of this function, failure to extract/parse statistics // are ignored by returning nullptr. The goal is two fold. First // avoid an optimization which breaks the computation. 
Second, allow the @@ -180,7 +181,8 @@ std::optional ColumnChunkStatisticsAsExpression( return std::nullopt; } - return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *statistics); + return ParquetFileFragment::EvaluateStatisticsAsExpression(*field, field_ref, + *statistics); } void AddColumnIndices(const SchemaField& schema_field, @@ -360,8 +362,9 @@ Result IsSupportedParquetFile(const ParquetFileFormat& format, } // namespace std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( -const Field& field, const parquet::Statistics& statistics) { - auto field_expr = compute::field_ref(field.name()); +const Field& field, const FieldRef& field_ref, +const parquet::Statistics& statistics) { + auto field_expr = compute::field_ref(field_ref); // Optimize for corner case where all values are nulls if (statistics.num_values() == 0 && statistics.null_count() > 0) { @@ -418,6 +421,13 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } +std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( +const Field& field, const parquet::Statistics& statistics) { + const auto field_name = field.name(); + return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)), +statistics); +} + ParquetFileFormat::ParquetFileFormat() : FileFormat(std::make_shared()) {} @@ -810,7 +820,7 @@ Status ParquetFileFragment::SetMetadata( manifest_ = std::move(manifest); statistics_expressions_.resize(row_groups_->size(), compute::literal(true)); - statistics_expressions_complete_.resize(physical_schema_->num_fields(), false); + statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false); for (int row_group : *row_groups_) { // Ensure RowGroups are indexing valid RowGroups before augmenting. 
@@ -900,16 +910,25 @@ Result> ParquetFileFragment::TestRowGroups( ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOneOrNone(*physical_schema_)); if (match.empty()) continue; -if (statistics_expressions_complete_[match[0]]) continue; -statistics_expressions_complete_[match[0]] = true; +const SchemaField* schema_field = _->schema_fields[match[0]]; + +for (size_t i = 1; i < match.indices().size(); ++i) { + if (schema
(arrow) branch main updated: GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6ce3c3f884 GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501) 6ce3c3f884 is described below commit 6ce3c3f8840cdd5294f22a6e662b6d2c0ff0a077 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 15:29:04 2024 +0100 GH-39500: [Docs] Pin pydata-sphinx-theme to 0.14 (#39501) ### Rationale for this change The latest pydata-sphinx-theme release 0.15 of a few days ago had some breakages. So let's pin to 0.14.x until 0.15 has stabilized. * Closes: #39500 Lead-authored-by: Joris Van den Bossche Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- ci/conda_env_sphinx.txt | 2 +- docs/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index af1bfe9b78..0e50875fc1 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -20,7 +20,7 @@ breathe doxygen ipython numpydoc -pydata-sphinx-theme +pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton diff --git a/docs/requirements.txt b/docs/requirements.txt index 37a50d51dd..da2327a6df 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,7 +5,7 @@ breathe ipython numpydoc -pydata-sphinx-theme +pydata-sphinx-theme==0.14 sphinx-autobuild sphinx-design sphinx-copybutton
(arrow) branch main updated: GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new a288364d97 GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272) a288364d97 is described below commit a288364d971ab9a6a3f05a903a5df83ebeddf0a0 Author: Alenka Frim AuthorDate: Mon Jan 8 14:26:13 2024 +0100 GH-30117: [C++][Python] Add "Z" to the end of timestamp print string when tz defined (#39272) ### What changes are included in this PR? This PR updates the PrettyPrint for Timestamp type so that "Z" is printed at the end of the output string if the timezone has been defined. This way we add minimum information about the values being stored in UTC. ### Are these changes tested? Yes. ### Are there any user-facing changes? There is a change in how `TimestampArray` prints out the data. With this change "Z" would be added to the end of the string if the timezone is defined. 
* Closes: #30117 Lead-authored-by: AlenkaF Co-authored-by: Alenka Frim Co-authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/pretty_print_test.cc | 6 +++--- cpp/src/arrow/util/formatting.h| 7 ++- cpp/src/arrow/util/formatting_util_test.cc | 28 python/pyarrow/tests/test_types.py | 11 +++ 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 0db6ae4867..5d2256e8c5 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -350,10 +350,10 @@ TEST_F(TestPrettyPrint, DateTimeTypes) { std::vector values = { 0, 1, 2, 678 + 100 * (5 + 60 * (4 + 60 * (3 + 24 * int64_t(1, 4}; static const char* expected = R"expected([ - 1970-01-01 00:00:00.00, - 1970-01-01 00:00:00.01, + 1970-01-01 00:00:00.00Z, + 1970-01-01 00:00:00.01Z, null, - 1970-01-02 03:04:05.000678, + 1970-01-02 03:04:05.000678Z, null ])expected"; CheckPrimitive(timestamp(TimeUnit::MICRO, "Transylvania"), diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 9dcc6463fb..71bae74629 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -470,7 +470,8 @@ class StringFormatter { using value_type = int64_t; explicit StringFormatter(const DataType* type) - : unit_(checked_cast(*type).unit()) {} + : unit_(checked_cast(*type).unit()), +timezone_(checked_cast(*type).timezone()) {} template Return operator()(Duration, value_type value, Appender&& append) { @@ -503,6 +504,9 @@ class StringFormatter { std::array buffer; char* cursor = buffer.data() + buffer_size; +if (timezone_.size() > 0) { + detail::FormatOneChar('Z', ); +} detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), ); detail::FormatOneChar(' ', ); detail::Format_MM_DD(timepoint_days, ); @@ -516,6 +520,7 @@ class StringFormatter { private: TimeUnit::type unit_; + std::string timezone_; }; template diff --git 
a/cpp/src/arrow/util/formatting_util_test.cc b/cpp/src/arrow/util/formatting_util_test.cc index 9afbc91063..13f57a495d 100644 --- a/cpp/src/arrow/util/formatting_util_test.cc +++ b/cpp/src/arrow/util/formatting_util_test.cc @@ -522,6 +522,34 @@ TEST(Formatting, Timestamp) { AssertFormatting(formatter, -2203932304LL * 10LL + 8, "1900-02-28 12:34:56.8"); } + + { +auto timestamp_types = {timestamp(TimeUnit::SECOND, "US/Eastern"), +timestamp(TimeUnit::SECOND, "+01:00")}; +for (auto ty : timestamp_types) { + StringFormatter formatter(ty.get()); + + AssertFormatting(formatter, 0, "1970-01-01 00:00:00Z"); +} + } + + { +auto ty = timestamp(TimeUnit::MILLI, "Pacific/Maruesas"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.000Z"); + } + + { +auto ty = timestamp(TimeUnit::MICRO, "-42:00"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.00Z"); + } + + { +auto ty = timestamp(TimeUnit::NANO, "Mars/Mariner_Valley"); +StringFormatter formatter(ty.get()); +AssertFormatting(formatter, 0, "1970-01-01 00:00:00.0Z"); + } } TEST(Formatting, Interval) { diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 7600f1dd33..c8a52c6b62 100644 --- a/python/pyarrow/te
(arrow) branch main updated: GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6b93c4a0e8 GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321) 6b93c4a0e8 is described below commit 6b93c4a0e8cb5110c6c4d3746f4e8bb0a8b76ec8 Author: Joris Van den Bossche AuthorDate: Mon Jan 8 14:21:10 2024 +0100 GH-38341: [Python] Remove usage of pandas internals DatetimeTZBlock (#38321) ### Rationale for this change This usage probably stems from a long time ago that it was required to specify the Block type, but nowadays it's good enough to just specify the dtype, and thus cutting down on our usage of internal pandas objects. Part of #35081 * Closes: #38341 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas_compat.py | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 80e313be02..3757d81a47 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -717,9 +717,15 @@ def _reconstruct_block(item, columns=None, extension_columns=None): elif 'timezone' in item: unit, _ = np.datetime_data(block_arr.dtype) dtype = make_datetimetz(unit, item['timezone']) -block = _int.make_block(block_arr, placement=placement, -klass=_int.DatetimeTZBlock, -dtype=dtype) +if _pandas_api.is_ge_v21(): +pd_arr = _pandas_api.pd.array( +block_arr.view("int64"), dtype=dtype, copy=False +) +block = _int.make_block(pd_arr, placement=placement) +else: +block = _int.make_block(block_arr, placement=placement, +klass=_int.DatetimeTZBlock, +dtype=dtype) elif 'py_array' in item: # create ExtensionBlock arr = item['py_array']
(arrow) branch main updated: GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2f9f892a00 GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) 2f9f892a00 is described below commit 2f9f892a0075d990a1b42dc97a97d490b6b08345 Author: Joris Van den Bossche AuthorDate: Thu Dec 21 15:53:41 2023 +0100 GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) ### Rationale for this change While the Arrow PyCapsule protocol itself is defined in the specification part of the docs, this PR adds a section about it in the Python user guide as well (referring to the specification for most details), where users might typically look for Python specific docs. * Closes: #39196 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- .../format/CDataInterface/PyCapsuleInterface.rst | 2 ++ docs/source/python/extending_types.rst | 32 ++ 2 files changed, 34 insertions(+) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 0c1a01d7c6..03095aa2e9 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -16,6 +16,8 @@ .. under the License. +.. 
_arrow-pycapsule-interface: + = The Arrow PyCapsule Interface = diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index ee92cebcb5..b7261005e6 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -21,6 +21,38 @@ Extending pyarrow = +Controlling conversion to (Py)Arrow with the PyCapsule Interface + + +The :ref:`Arrow C data interface ` allows moving Arrow data between +different implementations of Arrow. This is a generic, cross-language interface not +specific to Python, but for Python libraries this interface is extended with a Python +specific layer: :ref:`arrow-pycapsule-interface`. + +This Python interface ensures that different libraries that support the C Data interface +can export Arrow data structures in a standard way and recognize each other's objects. + +If you have a Python library providing data structures that hold Arrow-compatible data +under the hood, you can implement the following methods on those objects: + +- ``__arrow_c_schema__`` for schema or type-like objects. +- ``__arrow_c_array__`` for arrays and record batches (contiguous tables). +- ``__arrow_c_stream__`` for chunked tables or streams of data. + +Those methods return `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`__ +objects, and more details on the exact semantics can be found in the +:ref:`specification `. + +When your data structures have those methods defined, the PyArrow constructors +(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +supporting this protocol, and convert them to PyArrow data structures zero-copy. And the +same can be true for any other library supporting this protocol on ingesting data. 
+ +Similarly, if your library has functions that accept user-provided data, you can add +support for this protocol by checking for the presence of those methods, and +therefore accept any Arrow data (instead of hardcoding support for a specific +Arrow producer such as PyArrow). + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol
(arrow) branch main updated: MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new b1fcba1b39 MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274) b1fcba1b39 is described below commit b1fcba1b395e0aedddcdab19958c14809d780d4c Author: Rok Mihevc AuthorDate: Wed Dec 20 11:06:57 2023 +0100 MINOR: [Docs] local_timestamp kernel docs are not linked in python docs (#39274) ### Rationale for this change local_timestamp kernel docs are linked in [cpp](https://arrow.apache.org/docs/cpp/compute.html#timezone-handling) but not in [python docs](https://arrow.apache.org/docs/python/api/compute.html#timezone-handling). ### What changes are included in this PR? This adds a rst link in python docs ### Are these changes tested? No ### Are there any user-facing changes? Change will be visible in the docs Authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- docs/source/python/api/compute.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 4ee364fcf6..b879643017 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -468,6 +468,7 @@ Timezone Handling :toctree: ../generated/ assume_timezone + local_timestamp Associative Transforms --
(arrow) branch main updated: GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 9cb78addf7 GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059) 9cb78addf7 is described below commit 9cb78addf7fcd662de1579db9dff55bd1a420fe4 Author: Alenka Frim AuthorDate: Tue Dec 19 09:45:41 2023 +0100 GH-38683: [Python][Docs] Update docstrings for Time32Type and Time64Type (#39059) ### Rationale for this change `Time32Type` and `Time64Type` unit docs are not correctly documented. ### What changes are included in this PR? Update the docstrings for `Time32Type` and `Time64Type` `unit`. * Closes: #38683 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/types.pxi | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a0ddf09d69..912ee39f7d 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1108,6 +1108,9 @@ cdef class Time32Type(DataType): """ Concrete class for time32 data types. +Supported time unit resolutions are 's' [second] +and 'ms' [millisecond]. + Examples Create an instance of time32 type: @@ -1124,7 +1127,7 @@ cdef class Time32Type(DataType): @property def unit(self): """ -The time unit ('s', 'ms', 'us' or 'ns'). +The time unit ('s' or 'ms'). Examples @@ -1140,6 +1143,9 @@ cdef class Time64Type(DataType): """ Concrete class for time64 data types. +Supported time unit resolutions are 'us' [microsecond] +and 'ns' [nanosecond]. + Examples Create an instance of time64 type: @@ -1156,7 +1162,7 @@ cdef class Time64Type(DataType): @property def unit(self): """ -The time unit ('s', 'ms', 'us' or 'ns'). +The time unit ('us' or 'ns'). Examples
(arrow) branch main updated: GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new f5dd3d4a1c GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276) f5dd3d4a1c is described below commit f5dd3d4a1c0efb7c8587287da0c536988bcd1559 Author: Alenka Frim AuthorDate: Tue Dec 19 09:45:00 2023 +0100 GH-38535: [Python] Fix S3FileSystem equals None segfault (#39276) ### Rationale for this change `S3FileSystem` equals `None` currently causes bus error. ### What changes are included in this PR? Add `not None` to `FileSystem.equals` signature. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #38535 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- python/pyarrow/_fs.pyx | 2 +- python/pyarrow/tests/test_fs.py | 7 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index ef8db31bfc..395f488144 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -505,7 +505,7 @@ cdef class FileSystem(_Weakrefable): cdef inline shared_ptr[CFileSystem] unwrap(self) nogil: return self.wrapped -def equals(self, FileSystem other): +def equals(self, FileSystem other not None): """ Parameters -- diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 59c9c44942..d0fa253e31 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -542,6 +542,13 @@ def test_filesystem_equals(): assert SubTreeFileSystem('/base', fs0) != SubTreeFileSystem('/other', fs0) +def test_filesystem_equals_none(fs): +with pytest.raises(TypeError, match="got NoneType"): +fs.equals(None) + +assert fs is not None + + def test_subtree_filesystem(): localfs = LocalFileSystem()
(arrow) branch main updated: GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3236c129d1 GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627) 3236c129d1 is described below commit 3236c129d1cbe3f73359278d1459a3f20e5c4df0 Author: Thomas Newton AuthorDate: Thu Dec 14 14:12:17 2023 + GH-36441: [Python] Make `CacheOptions` configurable from Python (#36627) ### Rationale for this change Resolves: https://github.com/apache/arrow/issues/36441 ### What changes are included in this PR? - Add python bindings for `CacheOptions` from the C++ side. - Allow setting `cache_options` on `ParquetFragmentScanOptions` from the python side. - Adjust some of the comments on `CacheOptions` ### Are these changes tested? Yes. I added python side tests for these newly available configs similar to other configs. I have not added an integration test that ensures setting the configs on the python side leads to correctly using them on the C++ side. ### Are there any user-facing changes? Yes. The are new configs available on the python side but the defaults are unchanged. I've added/updated docstrings where relevant. 
* Closes: #36441 Lead-authored-by: Thomas Newton Co-authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/io/caching.h | 10 ++- python/pyarrow/__init__.py | 2 +- python/pyarrow/_dataset_parquet.pyx | 21 +- python/pyarrow/_parquet.pxd | 6 +- python/pyarrow/includes/libarrow.pxd | 16 + python/pyarrow/io.pxi| 134 +++ python/pyarrow/lib.pxd | 12 python/pyarrow/tests/test_dataset.py | 28 +--- python/pyarrow/tests/test_io.py | 59 +++ 9 files changed, 271 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/io/caching.h b/cpp/src/arrow/io/caching.h index 9c1b8fe88b..e2b911fafd 100644 --- a/cpp/src/arrow/io/caching.h +++ b/cpp/src/arrow/io/caching.h @@ -42,6 +42,11 @@ struct ARROW_EXPORT CacheOptions { /// size greater than this, they are not combined int64_t range_size_limit; /// \brief A lazy cache does not perform any I/O until requested. + /// lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + /// lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + /// needs them. + /// lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + /// range that is currently being read. bool lazy; /// \brief The maximum number of ranges to be prefetched. This is only used /// for lazy cache to asynchronously read some ranges after reading the target range. @@ -56,9 +61,10 @@ struct ARROW_EXPORT CacheOptions { /// \brief Construct CacheOptions from network storage metrics (e.g. S3). /// /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in - /// milliseconds, also called call setup latency of a new S3 request. + /// milliseconds, also called call setup latency of a new read request. /// The value is a positive integer. - /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec. + /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec + /// (per connection). 
/// The value is a positive integer. /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction /// (per connection) to maximize the net data load. diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index cd66abcb44..9da94885ec 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -243,7 +243,7 @@ from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, # I/O from pyarrow.lib import (NativeFile, PythonFile, - BufferedInputStream, BufferedOutputStream, + BufferedInputStream, BufferedOutputStream, CacheOptions, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, FixedSizeBufferWriter, diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index d458ac4ee7..61e051f56c 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -42,6 +42,7 @@ from pyarrow._dataset cimport ( FileWriteOptions, Fragment, FragmentScanOptions, +CacheOptions, Partitioning, PartitioningFactory, WrittenFile @@ -693,6 +694,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): parallel using a background
(arrow) branch main updated: GH-39096: [Python] Release GIL in `.nbytes` (#39097)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 6e61c5e216 GH-39096: [Python] Release GIL in `.nbytes` (#39097) 6e61c5e216 is described below commit 6e61c5e2163c8509411143752afc7f3bb37184cb Author: Hendrik Makait AuthorDate: Thu Dec 7 14:18:06 2023 +0100 GH-39096: [Python] Release GIL in `.nbytes` (#39097) ### Rationale for this change The `.nbytes` holds the GIL while computing the data size in C++, which has caused performance issues in Dask because threads were blocking each other. See #39096 ### Are these changes tested? I am not sure if additional tests are necessary here. If so, I'm happy to add them but would welcome some pointers. ### Are there any user-facing changes? No * Closes: #39096 Authored-by: Hendrik Makait Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 5 +++-- python/pyarrow/table.pxi | 15 +-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9d62bed51f..789e30d3e9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1206,8 +1206,9 @@ cdef class Array(_PandasConvertible): cdef: CResult[int64_t] c_size_res -c_size_res = ReferencedBufferSize(deref(self.ap)) -size = GetResultValue(c_size_res) +with nogil: +c_size_res = ReferencedBufferSize(deref(self.ap)) +size = GetResultValue(c_size_res) return size def get_total_buffer_size(self): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f93f595090..2f8d1abd1f 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -248,8 +248,9 @@ cdef class ChunkedArray(_PandasConvertible): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.chunked_array)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = 
ReferencedBufferSize(deref(self.chunked_array)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -2386,8 +2387,9 @@ cdef class RecordBatch(_Tabular): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.batch)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = ReferencedBufferSize(deref(self.batch)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self): @@ -4337,8 +4339,9 @@ cdef class Table(_Tabular): cdef: CResult[int64_t] c_res_buffer -c_res_buffer = ReferencedBufferSize(deref(self.table)) -size = GetResultValue(c_res_buffer) +with nogil: +c_res_buffer = ReferencedBufferSize(deref(self.table)) +size = GetResultValue(c_res_buffer) return size def get_total_buffer_size(self):
(arrow) branch main updated: GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new cf80bd1135 GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845) cf80bd1135 is described below commit cf80bd1135bbd9cee7c0ae3e6370f93270cba250 Author: Joris Van den Bossche AuthorDate: Tue Dec 5 18:23:15 2023 +0100 GH-38618: [C++] S3FileSystem: fix regression in deleting explicitly created sub-directories (#38845) ### Rationale for this change See https://github.com/apache/arrow/issues/38618#issuecomment-1821252024 and below for the analysis. When deleting the dir contents, we use a GetFileInfo with recursive FileSelector to list all objects to delete, but when doing that the file paths for directories don't end in a trailing `/`, so for deleting explicitly created directories we need to add the `kSep` here as well to properly delete the object. ### Are these changes tested? I tested them manually with an actual S3 bucket. The problem is that MinIO doesn't have the same problem, and so it's not actually tested with the test I added using our MinIO testing setup. ### Are there any user-facing changes? 
Fixes the regression * Closes: #38618 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/filesystem/s3fs.cc | 11 ++- python/pyarrow/tests/test_fs.py | 32 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 511448cb2f..62bec9b23b 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2409,7 +2409,16 @@ class S3FileSystem::Impl : public std::enable_shared_from_this file_paths; for (const auto& file_info : file_infos) { DCHECK_GT(file_info.path().size(), bucket.size()); -file_paths.push_back(file_info.path().substr(bucket.size() + 1)); +auto file_path = file_info.path().substr(bucket.size() + 1); +if (file_info.IsDirectory()) { + // The selector returns FileInfo objects for directories with a + // a path that never ends in a trailing slash, but for AWS the file + // needs to have a trailing slash to recognize it as directory + // (https://github.com/apache/arrow/issues/38618) + DCHECK_OK(internal::AssertNoTrailingSlash(file_path)); + file_path = file_path + kSep; +} +file_paths.push_back(std::move(file_path)); } scheduler->AddSimpleTask( [=, file_paths = std::move(file_paths)] { diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 1002e13471..59c9c44942 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -760,6 +760,38 @@ def test_delete_dir(fs, pathfn): fs.delete_dir(d) +def test_delete_dir_with_explicit_subdir(fs, pathfn): +# GH-38618: regression with AWS failing to delete directories, +# depending on whether they were created explicitly. Note that +# Minio doesn't reproduce the issue, so this test is not a regression +# test in itself. 
+skip_fsspec_s3fs(fs) + +d = pathfn('directory/') +nd = pathfn('directory/nested/') + +# deleting dir with explicit subdir +fs.create_dir(d) +fs.create_dir(nd) +fs.delete_dir(d) +dir_info = fs.get_file_info(d) +assert dir_info.type == FileType.NotFound + +# deleting dir with blob in explicit subdir +d = pathfn('directory2') +nd = pathfn('directory2/nested') +f = pathfn('directory2/nested/target-file') + +fs.create_dir(d) +fs.create_dir(nd) +with fs.open_output_stream(f) as s: +s.write(b'data') + +fs.delete_dir(d) +dir_info = fs.get_file_info(d) +assert dir_info.type == FileType.NotFound + + def test_delete_dir_contents(fs, pathfn): skip_fsspec_s3fs(fs)
(arrow) branch main updated: GH-38950: [Docs] Fix spelling (#38951)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 3531396803 GH-38950: [Docs] Fix spelling (#38951) 3531396803 is described below commit 353139680311e809d2413ea46e17e1656069ac5e Author: Josh Soref <2119212+jso...@users.noreply.github.com> AuthorDate: Fri Dec 1 12:33:09 2023 -0500 GH-38950: [Docs] Fix spelling (#38951) ### Rationale for this change ### What changes are included in this PR? Spelling fixes to docs/ ### Are these changes tested? ### Are there any user-facing changes? * Closes: #38950 Lead-authored-by: Josh Soref <2119212+jso...@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Joris Van den Bossche --- docs/source/_static/theme_overrides.css| 6 ++-- docs/source/conf.py| 2 +- docs/source/cpp/acero/developer_guide.rst | 34 +++--- docs/source/cpp/acero/overview.rst | 4 +-- docs/source/cpp/acero/substrait.rst| 2 +- docs/source/cpp/acero/user_guide.rst | 4 +-- docs/source/cpp/compute.rst| 6 ++-- docs/source/cpp/datatypes.rst | 2 +- .../cpp/examples/compute_and_write_example.rst | 2 +- .../cpp/examples/dataset_skyhook_scan_example.rst | 4 +-- docs/source/cpp/overview.rst | 2 +- docs/source/cpp/tutorials/basic_arrow.rst | 2 +- .../developers/continuous_integration/archery.rst | 2 +- .../developers/continuous_integration/crossbow.rst | 4 +-- .../developers/continuous_integration/docker.rst | 4 +-- .../developers/continuous_integration/overview.rst | 4 +-- docs/source/developers/documentation.rst | 2 +- docs/source/developers/guide/documentation.rst | 2 +- docs/source/developers/guide/resources.rst | 2 +- .../guide/step_by_step/finding_issues.rst | 2 +- .../developers/guide/tutorials/r_tutorial.rst | 2 +- docs/source/developers/java/building.rst | 14 - docs/source/developers/release.rst | 6 ++-- docs/source/developers/reviewing.rst | 4 +-- 
docs/source/format/ADBC.rst| 4 +-- docs/source/format/CDataInterface.rst | 2 +- docs/source/format/CDeviceDataInterface.rst| 8 ++--- docs/source/format/CanonicalExtensions.rst | 2 +- docs/source/format/Columnar.rst| 2 +- docs/source/java/dataset.rst | 4 +-- docs/source/python/api/compute.rst | 2 +- docs/source/python/dataset.rst | 2 +- docs/source/python/getting_involved.rst| 2 +- docs/source/python/integration.rst | 2 +- docs/source/python/integration/python_java.rst | 2 +- docs/source/python/interchange_protocol.rst| 16 +- docs/source/python/memory.rst | 2 +- docs/source/python/parquet.rst | 2 +- 38 files changed, 85 insertions(+), 85 deletions(-) diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index bf84267aea..58f4554d11 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -33,7 +33,7 @@ } } -/* Contibuting landing page overview cards */ +/* Contributing landing page overview cards */ .contrib-card { border-radius: 0; @@ -68,7 +68,7 @@ } /* This is the bootstrap CSS style for "table-striped". 
Since the theme does -not yet provide an easy way to configure this globaly, it easier to simply +not yet provide an easy way to configure this globally, it easier to simply include this snippet here than updating each table in all rst files to add ":class: table-striped" */ @@ -76,7 +76,7 @@ add ":class: table-striped" */ background-color: rgba(0, 0, 0, 0.05); } -/* Iprove the vertical spacing in the C++ API docs +/* Improve the vertical spacing in the C++ API docs (ideally this should be upstreamed to the pydata-sphinx-theme */ dl.cpp dd p { diff --git a/docs/source/conf.py b/docs/source/conf.py index f11d78fe05..cde0c2b31f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -139,7 +139,7 @@ autodoc_default_options = { breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} breathe_default_project = "arrow_cpp" -# Overriden conditionally below +# Overridden conditionally below autodoc_mock_imports = [] # copybutton configuration diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index c893e41ff8..331cd833b5 100644 ---
(arrow) branch main updated: GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 530a63a81b GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029) 530a63a81b is described below commit 530a63a81b11d68bd66dc0e32c82e7e56030d762 Author: Joris Van den Bossche AuthorDate: Fri Dec 1 17:45:59 2023 +0100 GH-39028: [Python][CI] Fix dask integration build by temporarily skipping test_categorize_info (#39029) The test requires a downstream fix in dask (because of a valid change in Arrow), until then temporarily skipping this test (see the issue for more details). * Closes: #39028 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- ci/scripts/integration_dask.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/scripts/integration_dask.sh b/ci/scripts/integration_dask.sh index f91d21b921..bf306dc652 100755 --- a/ci/scripts/integration_dask.sh +++ b/ci/scripts/integration_dask.sh @@ -32,7 +32,9 @@ python -c "import dask.dataframe" # pytest -sv --pyargs dask.bytes.tests.test_local # The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually -pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" +# - The 'test_categorize_info' test is failing because of change in StringArray's nbytes and +# an upstream fix (https://github.com/apache/arrow/issues/39028) +pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" -k "not test_categorize_info" pytest -v --pyargs dask.dataframe.io.tests.test_orc pytest -v --pyargs dask.dataframe.io.tests.test_parquet \ -m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings"
(arrow) branch main updated: GH-38857: [Python] Fix append mode for cython 2 (#39027)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2bd8e06b48 GH-38857: [Python] Fix append mode for cython 2 (#39027) 2bd8e06b48 is described below commit 2bd8e06b4867acea3dc5479e991998672804e8ea Author: Joris Van den Bossche AuthorDate: Fri Dec 1 14:48:32 2023 +0100 GH-38857: [Python] Fix append mode for cython 2 (#39027) ### Rationale for this change Small fixup of the change in https://github.com/apache/arrow/pull/38820 to fix the build failure on cython 2 (nightly crossbow build) * Closes: #38857 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/includes/libarrow.pxd | 3 ++- python/pyarrow/io.pxi| 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 59b63b5fb7..b0b89f8614 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1386,7 +1386,8 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CResult[shared_ptr[COutputStream]] Open(const c_string& path) @staticmethod -CResult[shared_ptr[COutputStream]] Open(const c_string& path, c_bool append) +CResult[shared_ptr[COutputStream]] OpenWithAppend" Open"( +const c_string& path, c_bool append) int file_descriptor() diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 3086845efa..6f39166401 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1167,7 +1167,9 @@ cdef class OSFile(NativeFile): cdef _open_writable(self, c_string path, c_bool append=False): with nogil: -self.output_stream = GetResultValue(FileOutputStream.Open(path, append)) +self.output_stream = GetResultValue( +FileOutputStream.OpenWithAppend(path, append) +) self.is_writable = True self._is_appending = append
(arrow) branch main updated: GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374)
This is an automated email from the ASF dual-hosted git repository. jorisvandenbossche pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git The following commit(s) were added to refs/heads/main by this push: new 2fadab2aa6 GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374) 2fadab2aa6 is described below commit 2fadab2aa65425ec4e392e5cf8fd2082f3685212 Author: Joris Van den Bossche AuthorDate: Fri Dec 1 13:11:35 2023 +0100 GH-38342: [Python] Update to_pandas to use non-deprecated DataFrame constructor (#38374) ### Rationale for this change Avoiding a deprecation warning from pandas * Closes: #38342 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas-shim.pxi | 11 --- python/pyarrow/pandas_compat.py | 13 ++--- python/pyarrow/table.pxi| 6 +++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index a0c0cabf6d..273575b779 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -37,7 +37,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype bint has_sparse bint _pd024 -bint _is_v1 +bint _is_v1, _is_ge_v21 def __init__(self): self._tried_importing_pandas = False @@ -74,8 +74,9 @@ cdef class _PandasAPIShim(object): "installed. 
Therefore, pandas-specific integration is not " "used.".format(self._version), stacklevel=2) return -elif self._loose_version < Version('2.0.0'): -self._is_v1 = True + +self._is_v1 = self._loose_version < Version('2.0.0') +self._is_ge_v21 = self._loose_version >= Version('2.1.0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -158,6 +159,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_v1 +def is_ge_v21(self): +self._check_import() +return self._is_ge_v21 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index be29f68a13..80e313be02 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -744,9 +744,11 @@ def make_datetimetz(unit, tz): return _pandas_api.datetimetz_type(unit, tz=tz) -def table_to_blockmanager(options, table, categories=None, - ignore_metadata=False, types_mapper=None): +def table_to_dataframe( +options, table, categories=None, ignore_metadata=False, types_mapper=None +): from pandas.core.internals import BlockManager +from pandas import DataFrame all_columns = [] column_indexes = [] @@ -770,7 +772,12 @@ def table_to_blockmanager(options, table, categories=None, blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes) axes = [columns, index] -return BlockManager(blocks, axes) +mgr = BlockManager(blocks, axes) +if _pandas_api.is_ge_v21(): +df = DataFrame._from_mgr(mgr, mgr.axes) +else: +df = DataFrame(mgr) +return df # Set of the string repr of all numpy dtypes that can be stored in a pandas diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index bbed789553..f93f595090 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -4191,12 +4191,12 @@ cdef class Table(_Tabular): def _to_pandas(self, options, categories=None, ignore_metadata=False, types_mapper=None): -from pyarrow.pandas_compat import table_to_blockmanager -mgr = 
table_to_blockmanager( +from pyarrow.pandas_compat import table_to_dataframe +df = table_to_dataframe( options, self, categories, ignore_metadata=ignore_metadata, types_mapper=types_mapper) -return pandas_api.data_frame(mgr) +return df @property def schema(self):