This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 7f959a26 Remove `numpy` dependency (#1270)
7f959a26 is described below
commit 7f959a26327cb893baf307a995b746ed3d77ad08
Author: Fokko Driesprong <[email protected]>
AuthorDate: Wed Oct 30 21:30:14 2024 +0100
Remove `numpy` dependency (#1270)
* Remove numpy as a hard dependency
As of Arrow 18.0.0, numpy is no longer a required dependency:
https://github.com/apache/arrow/pull/44148
I think it would be good to remove it from PyIceberg as well.
* Add link to issue
---
pyiceberg/io/pyarrow.py | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index a7a1f5a6..c6ecd4a6 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -57,7 +57,6 @@ from typing import (
)
from urllib.parse import urlparse
-import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
@@ -812,7 +811,17 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
all_chunks = positional_deletes[0]
else:
all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))
- return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index)
+
+ # Create the full range array with pyarrow
+ full_range = pa.array(range(start_index, end_index))
+ # When available, replace with Arrow generator to improve performance
+ # See https://github.com/apache/iceberg-python/issues/1271 for details
+
+ # Filter out values in all_chunks from full_range
+ result = pc.filter(full_range, pc.invert(pc.is_in(full_range, value_set=all_chunks)))
+
+ # Subtract the start_index from each element in the result
+ return pc.subtract(result, pa.scalar(start_index))
def pyarrow_to_schema(