This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 0cebec48 Replace `numpy` usage and remove from `pyproject.toml` (#1272)
0cebec48 is described below

commit 0cebec48833f75eeca02b1a965112615b1cbc1c8
Author: Kevin Liu <[email protected]>
AuthorDate: Thu Oct 31 10:49:15 2024 -0400

    Replace `numpy` usage and remove from `pyproject.toml` (#1272)
    
    * use random instead of numpy
    
    * remove numpy from pyproject.toml
---
 poetry.lock                                  | 10 ++++----
 pyproject.toml                               | 35 ++++------------------------
 tests/integration/test_writes/test_writes.py | 16 ++++++-------
 3 files changed, 17 insertions(+), 44 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 2c3e02fd..b144fd16 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4519,14 +4519,14 @@ cffi = ["cffi (>=1.11)"]
 [extras]
 adlfs = ["adlfs"]
 daft = ["getdaft"]
-duckdb = ["duckdb", "numpy", "pyarrow"]
+duckdb = ["duckdb", "pyarrow"]
 dynamodb = ["boto3"]
 gcsfs = ["gcsfs"]
 glue = ["boto3", "mypy-boto3-glue"]
 hive = ["thrift"]
-pandas = ["numpy", "pandas", "pyarrow"]
-pyarrow = ["numpy", "pyarrow"]
-ray = ["numpy", "pandas", "pyarrow", "ray", "ray"]
+pandas = ["pandas", "pyarrow"]
+pyarrow = ["pyarrow"]
+ray = ["pandas", "pyarrow", "ray", "ray"]
 s3fs = ["s3fs"]
 snappy = ["python-snappy"]
 sql-postgres = ["psycopg2-binary", "sqlalchemy"]
@@ -4536,4 +4536,4 @@ zstandard = ["zstandard"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9, <3.13, !=3.9.7"
-content-hash = "c8e9ed26f57ff8c43dde985f66cd30694ec0ac032ed9da9cda375fbe05bd3302"
+content-hash = "9ff6b794eee7db5b198ff9df41d3a3f74eed4d620555dc286e62d33a1b1bb3f0"
diff --git a/pyproject.toml b/pyproject.toml
index eb159463..f3a9bdfe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,9 +78,6 @@ gcsfs = { version = ">=2023.1.0,<2024.1.0", optional = true }
 psycopg2-binary = { version = ">=2.9.6", optional = true }
 sqlalchemy = { version = "^2.0.18", optional = true }
 getdaft = { version = ">=0.2.12", optional = true }
-numpy = [
-    { version = "1.26.0", python = ">=3.9,<3.13", optional = true },
-]
 cachetools = "^5.5.0"
 
 [tool.poetry.group.dev.dependencies]
@@ -238,10 +235,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
@@ -394,10 +387,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
@@ -550,10 +539,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
@@ -706,10 +691,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
@@ -862,10 +843,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
@@ -894,10 +871,10 @@ generate-setup-file = false
 script = "build-module.py"
 
 [tool.poetry.extras]
-pyarrow = ["pyarrow", "numpy"]
-pandas = ["pandas", "pyarrow", "numpy"]
-duckdb = ["duckdb", "pyarrow", "numpy"]
-ray = ["ray", "pyarrow", "pandas", "numpy"]
+pyarrow = ["pyarrow"]
+pandas = ["pandas", "pyarrow"]
+duckdb = ["duckdb", "pyarrow"]
+ray = ["ray", "pyarrow", "pandas"]
 daft = ["getdaft"]
 snappy = ["python-snappy"]
 hive = ["thrift"]
@@ -1084,10 +1061,6 @@ ignore_missing_imports = true
 module = "sortedcontainers.*"
 ignore_missing_imports = true
 
-[[tool.mypy.overrides]]
-module = "numpy.*"
-ignore_missing_imports = true
-
 [[tool.mypy.overrides]]
 module = "sqlalchemy.*"
 ignore_missing_imports = true
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index 49c7c2df..01744514 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -17,13 +17,13 @@
 # pylint:disable=redefined-outer-name
 import math
 import os
+import random
 import time
 from datetime import date, datetime, timedelta
 from pathlib import Path
 from typing import Any, Dict
 from urllib.parse import urlparse
 
-import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pyarrow.compute as pc
@@ -1373,14 +1373,14 @@ def test_delete_threshold(session_catalog: Catalog) -> None:
     date_start, date_end = date(2024, 1, 1), date(2024, 2, 1)
 
     # Generate the 'id' column
-    id_column = np.random.randint(id_min, id_max, num_rows)
+    id_column = [random.randint(id_min, id_max) for _ in range(num_rows)]
 
     # Generate the 'created_at' column as dates only
-    date_range = pd.date_range(start=date_start, end=date_end, freq="D")  # Daily frequency for dates
-    created_at_column = np.random.choice(date_range, num_rows)  # Convert to string (YYYY-MM-DD format)
+    date_range = pd.date_range(start=date_start, end=date_end, freq="D").to_list()  # Daily frequency for dates
+    created_at_column = [random.choice(date_range) for _ in range(num_rows)]  # Convert to string (YYYY-MM-DD format)
 
     # Generate the 'relevancy_score' column with a peak around 0.1
-    relevancy_score_column = np.random.beta(a=2, b=20, size=num_rows)  # Adjusting parameters to peak around 0.1
+    relevancy_score_column = [random.betavariate(2, 20) for _ in range(num_rows)]  # Adjusting parameters to peak around 0.1
 
     # Create the dataframe
     df = pd.DataFrame({"id": id_column, "created_at": created_at_column, "relevancy_score": relevancy_score_column})
@@ -1403,12 +1403,12 @@ def test_delete_threshold(session_catalog: Catalog) -> None:
 
 @pytest.mark.integration
 def test_rewrite_manifest_after_partition_evolution(session_catalog: Catalog) -> None:
-    np.random.seed(876)
+    random.seed(876)
     N = 1440
     d = {
         "timestamp": pa.array([datetime(2023, 1, 1, 0, 0, 0) + timedelta(minutes=i) for i in range(N)]),
-        "category": pa.array([np.random.choice(["A", "B", "C"]) for _ in range(N)]),
-        "value": pa.array(np.random.normal(size=N)),
+        "category": pa.array([random.choice(["A", "B", "C"]) for _ in range(N)]),
+        "value": pa.array([random.gauss(0, 1) for _ in range(N)]),
     }
     data = pa.Table.from_pydict(d)
 

Reply via email to