jorisvandenbossche commented on code in PR #43795:
URL: https://github.com/apache/arrow/pull/43795#discussion_r1732739401
##########
cpp/src/arrow/chunked_array.h:
##########
@@ -182,6 +182,9 @@ class ARROW_EXPORT ChunkedArray {
/// \return Status
Status ValidateFull() const;
+ /// \brief Determine if all chunks are located on the CPU
+ bool IsCpu() const;
Review Comment:
In other places where we have something like this, we use the `is_cpu()`
notation. Is it CamelCase here because it is computed rather than being a
cheap property?
##########
python/pyarrow/tests/test_table.py:
##########
@@ -3357,3 +3357,179 @@ def test_invalid_non_join_column():
with pytest.raises(pa.lib.ArrowInvalid) as excinfo:
t2.join(t1, 'id', join_type='inner')
assert exp_error_msg in str(excinfo.value)
+
+
+@pytest.fixture
+def cuda_context():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ return cuda.Context(0)
+
+
+@pytest.fixture
+def schema():
+ return pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())])
+
+
+@pytest.fixture
+def cpu_arrays():
+ return [pa.array([1, 2, 3, 4, 5], pa.int32()),
+ pa.array([-10, -5, 0, None, 10], pa.int32())]
+
+
+@pytest.fixture
+def cuda_arrays(cuda_context, cpu_arrays):
+ return [arr.copy_to(cuda_context.memory_manager) for arr in cpu_arrays]
+
+
+@pytest.fixture
+def cpu_chunked_array(cpu_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays)
+ assert chunked_array.is_cpu() is True
+ return chunked_array
+
+
+@pytest.fixture
+def cuda_chunked_array(cuda_arrays):
+ chunked_array = pa.chunked_array(cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+@pytest.fixture
+def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array,
+ cpu_and_cuda_chunked_array):
+ # type test
+ assert cuda_chunked_array.type == cpu_chunked_array.type
+
+ # length() test
+ assert cuda_chunked_array.length() == cpu_chunked_array.length()
+
+ # str() test
+ assert str(cuda_chunked_array) == str(cpu_chunked_array)
+
+ # repr() test
+ assert str(cuda_chunked_array) in repr(cuda_chunked_array)
+
+ # validate() test
+ cuda_chunked_array.validate()
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.validate(full=True)
+
+ # null_count test
+ assert cuda_chunked_array.null_count == cpu_chunked_array.null_count
+
+ # nbytes() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.nbytes
+
+ # get_total_buffer_size() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.get_total_buffer_size()
+
+ # getitem() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array[0]
+
+ # is_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_null()
+
+ # is_nan() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_nan()
+
+ # is_valid() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_valid()
+
+ # fill_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.fill_null(0)
+
+ # equals() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array == cuda_chunked_array
+
+ # to_pandas() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_pandas()
+
+ # to_numpy() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_numpy()
+
+ # __array__() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.__array__()
+
+ # cast() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.cast()
+
+ # dictionary_encode() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.dictionary_encode()
+
+ # flatten() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.flatten()
+
+ # combine_chunks() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.combine_chunks()
+
+ # unique() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.unique()
+
+ # value_counts() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.value_counts()
+
+ # filter() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.filter([True, False, True, False, True])
+
+ # index() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.index(5)
+
+ # slice() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.slice(2, 2)
+
+ # take() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.take([1])
+
+ # drop_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.drop_null()
+
+ # sort() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.sort()
+
+ # unify_dictionaries() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.unify_dictionaries()
+
+ # num_chunks test
+ assert cuda_chunked_array.num_chunks == cpu_chunked_array.num_chunks
+
+ # chunks test
+ assert len(cuda_chunked_array.chunks) == len(cpu_chunked_array.chunks)
+
+ # to_pylist() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_pylist()
+
+ # __arrow_c_stream__() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.__arrow_c_stream__()
Review Comment:
Also test `__reduce__`?
##########
python/pyarrow/tests/test_table.py:
##########
@@ -3357,3 +3357,179 @@ def test_invalid_non_join_column():
with pytest.raises(pa.lib.ArrowInvalid) as excinfo:
t2.join(t1, 'id', join_type='inner')
assert exp_error_msg in str(excinfo.value)
+
+
+@pytest.fixture
+def cuda_context():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ return cuda.Context(0)
+
+
+@pytest.fixture
+def schema():
+ return pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())])
+
+
+@pytest.fixture
+def cpu_arrays():
+ return [pa.array([1, 2, 3, 4, 5], pa.int32()),
+ pa.array([-10, -5, 0, None, 10], pa.int32())]
+
+
+@pytest.fixture
+def cuda_arrays(cuda_context, cpu_arrays):
+ return [arr.copy_to(cuda_context.memory_manager) for arr in cpu_arrays]
+
+
+@pytest.fixture
+def cpu_chunked_array(cpu_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays)
+ assert chunked_array.is_cpu() is True
+ return chunked_array
+
+
+@pytest.fixture
+def cuda_chunked_array(cuda_arrays):
+ chunked_array = pa.chunked_array(cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+@pytest.fixture
+def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array,
+ cpu_and_cuda_chunked_array):
+ # type test
+ assert cuda_chunked_array.type == cpu_chunked_array.type
+
+ # length() test
+ assert cuda_chunked_array.length() == cpu_chunked_array.length()
+
+ # str() test
+ assert str(cuda_chunked_array) == str(cpu_chunked_array)
+
+ # repr() test
+ assert str(cuda_chunked_array) in repr(cuda_chunked_array)
+
+ # validate() test
+ cuda_chunked_array.validate()
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.validate(full=True)
+
+ # null_count test
+ assert cuda_chunked_array.null_count == cpu_chunked_array.null_count
Review Comment:
`null_count` can potentially trigger a computation, AFAIK (maybe not in the
case of your test data here, but in general). I see that for Array we added an
assert_cpu check for this.
##########
python/pyarrow/table.pxi:
##########
@@ -1407,6 +1435,19 @@ cdef class ChunkedArray(_PandasConvertible):
self.init(c_chunked_array)
return self
+ def is_cpu(self):
Review Comment:
A similar question: all other `is_cpu` cases we have in pyarrow are
properties, while here it is a method. Should we make it a property already,
expecting that we will optimize this later?
Or we could cache this on the Python side, if that makes us more comfortable
making it a property right away.
##########
python/pyarrow/tests/test_table.py:
##########
@@ -3357,3 +3357,179 @@ def test_invalid_non_join_column():
with pytest.raises(pa.lib.ArrowInvalid) as excinfo:
t2.join(t1, 'id', join_type='inner')
assert exp_error_msg in str(excinfo.value)
+
+
+@pytest.fixture
+def cuda_context():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ return cuda.Context(0)
+
+
+@pytest.fixture
+def schema():
+ return pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())])
+
+
+@pytest.fixture
+def cpu_arrays():
+ return [pa.array([1, 2, 3, 4, 5], pa.int32()),
+ pa.array([-10, -5, 0, None, 10], pa.int32())]
+
+
+@pytest.fixture
+def cuda_arrays(cuda_context, cpu_arrays):
+ return [arr.copy_to(cuda_context.memory_manager) for arr in cpu_arrays]
+
+
+@pytest.fixture
+def cpu_chunked_array(cpu_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays)
+ assert chunked_array.is_cpu() is True
+ return chunked_array
+
+
+@pytest.fixture
+def cuda_chunked_array(cuda_arrays):
+ chunked_array = pa.chunked_array(cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+@pytest.fixture
+def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array,
+ cpu_and_cuda_chunked_array):
+ # type test
+ assert cuda_chunked_array.type == cpu_chunked_array.type
+
+ # length() test
+ assert cuda_chunked_array.length() == cpu_chunked_array.length()
+
+ # str() test
+ assert str(cuda_chunked_array) == str(cpu_chunked_array)
+
+ # repr() test
+ assert str(cuda_chunked_array) in repr(cuda_chunked_array)
+
+ # validate() test
+ cuda_chunked_array.validate()
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.validate(full=True)
+
+ # null_count test
+ assert cuda_chunked_array.null_count == cpu_chunked_array.null_count
+
+ # nbytes() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.nbytes
+
+ # get_total_buffer_size() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.get_total_buffer_size()
+
+ # getitem() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array[0]
+
+ # is_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_null()
+
+ # is_nan() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_nan()
+
+ # is_valid() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_valid()
+
+ # fill_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.fill_null(0)
+
+ # equals() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array == cuda_chunked_array
+
+ # to_pandas() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_pandas()
+
+ # to_numpy() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_numpy()
+
+ # __array__() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.__array__()
+
+ # cast() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.cast()
+
+ # dictionary_encode() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.dictionary_encode()
+
+ # flatten() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.flatten()
+
+ # combine_chunks() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.combine_chunks()
+
+ # unique() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.unique()
+
+ # value_counts() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.value_counts()
+
+ # filter() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.filter([True, False, True, False, True])
+
+ # index() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.index(5)
+
+ # slice() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.slice(2, 2)
+
+ # take() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.take([1])
+
+ # drop_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.drop_null()
+
+ # sort() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.sort()
+
+ # unify_dictionaries() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.unify_dictionaries()
+
+ # num_chunks test
+ assert cuda_chunked_array.num_chunks == cpu_chunked_array.num_chunks
+
+ # chunks test
+ assert len(cuda_chunked_array.chunks) == len(cpu_chunked_array.chunks)
Review Comment:
Maybe also test here that you can get a `.chunk(0)` and that the result is
an array with the correct device type.
##########
python/pyarrow/tests/test_table.py:
##########
@@ -3357,3 +3357,179 @@ def test_invalid_non_join_column():
with pytest.raises(pa.lib.ArrowInvalid) as excinfo:
t2.join(t1, 'id', join_type='inner')
assert exp_error_msg in str(excinfo.value)
+
+
+@pytest.fixture
+def cuda_context():
+ cuda = pytest.importorskip("pyarrow.cuda")
+ return cuda.Context(0)
+
+
+@pytest.fixture
+def schema():
+ return pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())])
+
+
+@pytest.fixture
+def cpu_arrays():
+ return [pa.array([1, 2, 3, 4, 5], pa.int32()),
+ pa.array([-10, -5, 0, None, 10], pa.int32())]
+
+
+@pytest.fixture
+def cuda_arrays(cuda_context, cpu_arrays):
+ return [arr.copy_to(cuda_context.memory_manager) for arr in cpu_arrays]
+
+
+@pytest.fixture
+def cpu_chunked_array(cpu_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays)
+ assert chunked_array.is_cpu() is True
+ return chunked_array
+
+
+@pytest.fixture
+def cuda_chunked_array(cuda_arrays):
+ chunked_array = pa.chunked_array(cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+@pytest.fixture
+def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays):
+ chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays)
+ assert chunked_array.is_cpu() is False
+ return chunked_array
+
+
+def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array,
+ cpu_and_cuda_chunked_array):
+ # type test
+ assert cuda_chunked_array.type == cpu_chunked_array.type
+
+ # length() test
+ assert cuda_chunked_array.length() == cpu_chunked_array.length()
+
+ # str() test
+ assert str(cuda_chunked_array) == str(cpu_chunked_array)
+
+ # repr() test
+ assert str(cuda_chunked_array) in repr(cuda_chunked_array)
+
+ # validate() test
+ cuda_chunked_array.validate()
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.validate(full=True)
+
+ # null_count test
+ assert cuda_chunked_array.null_count == cpu_chunked_array.null_count
+
+ # nbytes() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.nbytes
+
+ # get_total_buffer_size() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.get_total_buffer_size()
+
+ # getitem() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array[0]
+
+ # is_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_null()
+
+ # is_nan() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_nan()
+
+ # is_valid() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.is_valid()
+
+ # fill_null() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.fill_null(0)
+
+ # equals() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array == cuda_chunked_array
+
+ # to_pandas() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_pandas()
+
+ # to_numpy() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.to_numpy()
+
+ # __array__() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.__array__()
+
+ # cast() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.cast()
+
+ # dictionary_encode() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.dictionary_encode()
+
+ # flatten() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.flatten()
+
+ # combine_chunks() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.combine_chunks()
+
+ # unique() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.unique()
+
+ # value_counts() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.value_counts()
+
+ # filter() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.filter([True, False, True, False, True])
+
+ # index() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.index(5)
+
+ # slice() test
+ with pytest.raises(NotImplementedError):
+ cuda_chunked_array.slice(2, 2)
Review Comment:
Could slice work for non-cpu? We do allow it for the other objects (array,
record batch).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]