This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 75a9cd298b [python] Use pypaimon-rust 0.2.0 from PyPI instead of building from source (#8024)
75a9cd298b is described below
commit 75a9cd298be186af50d413e0335ebd08ba56ed2e
Author: Jingsong Lee <[email protected]>
AuthorDate: Fri May 29 10:09:28 2026 +0800
[python] Use pypaimon-rust 0.2.0 from PyPI instead of building from source (#8024)
---
.github/workflows/paimon-python-checks.yml | 11 +----------
paimon-python/pypaimon/tests/blob_table_test.py | 21 ++++++++++-----------
2 files changed, 11 insertions(+), 21 deletions(-)
diff --git a/.github/workflows/paimon-python-checks.yml b/.github/workflows/paimon-python-checks.yml
index de00337be3..5aaa2ab72a 100755
--- a/.github/workflows/paimon-python-checks.yml
+++ b/.github/workflows/paimon-python-checks.yml
@@ -145,7 +145,7 @@ jobs:
else
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
- python -m pip install pyroaring readerwriterlock==1.0.9
fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0
fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2
numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0
py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6'
+ python -m pip install pyroaring readerwriterlock==1.0.9
fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0
fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2
numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0
py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6' pypaimon-rust==0.2.0
python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION
}}' -i https://pypi.org/simple/
if python -c "import sys; sys.exit(0 if sys.version_info >= (3,
11) else 1)"; then
python -m pip install vortex-data==0.70.0
@@ -163,15 +163,6 @@ jobs:
maturin build --release
pip install target/wheels/tantivy-*.whl
- - name: Build and install pypaimon-rust from source
- if: matrix.python-version != '3.6.15'
- shell: bash
- run: |
- git clone https://github.com/apache/paimon-rust.git /tmp/paimon-rust
- cd /tmp/paimon-rust/bindings/python
- maturin build --release -o dist
- pip install dist/pypaimon_rust-*.whl
- pip install 'datafusion>=52'
- name: Run lint-python.sh
shell: bash
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index c4e5a4d1bd..7261503450 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -412,7 +412,7 @@ class DataBlobWriterTest(unittest.TestCase):
blob_writer.close()
def test_data_blob_writer_write_large_blob(self):
- """Test DataBlobWriter with very large blob data (50MB per item) in 10
batches."""
+ """Test DataBlobWriter with large blob data (5MB per item) in 10
batches."""
from pypaimon import Schema
# Create schema with blob column
@@ -436,28 +436,27 @@ class DataBlobWriterTest(unittest.TestCase):
write_builder = table.new_batch_write_builder()
blob_writer = write_builder.new_write()
- # Create 50MB blob data per item
- # Using a pattern to make the data more realistic and compressible
- target_size = 50 * 1024 * 1024 # 50MB in bytes
+ # Create 5MB blob data per item
+ target_size = 5 * 1024 * 1024 # 5MB in bytes
blob_pattern = b'LARGE_BLOB_DATA_PATTERN_' + b'X' * 1024 # ~1KB
pattern
pattern_size = len(blob_pattern)
repetitions = target_size // pattern_size
large_blob_data = blob_pattern * repetitions
- # Verify the blob size is approximately 50MB
+ # Verify the blob size is approximately 5MB
blob_size_mb = len(large_blob_data) / (1024 * 1024)
- self.assertGreater(blob_size_mb, 49) # Should be at least 49MB
- self.assertLess(blob_size_mb, 51) # Should be less than 51MB
+ self.assertGreater(blob_size_mb, 4) # Should be at least 4MB
+ self.assertLess(blob_size_mb, 6) # Should be less than 6MB
total_rows = 0
# Write 10 batches, each with 5 rows (50 rows total)
- # Total data volume: 50 rows * 50MB = 2.5GB of blob data
+ # Total data volume: 50 rows * 5MB = 250MB of blob data
for batch_num in range(10):
batch_data = pa.Table.from_pydict({
'id': [batch_num * 5 + i for i in range(5)],
'description': [f'Large blob batch {batch_num}, row {i}' for i
in range(5)],
- 'large_blob': [large_blob_data] * 5 # 5 rows per batch, each
with 50MB blob
+ 'large_blob': [large_blob_data] * 5 # 5 rows per batch, each
with 5MB blob
}, schema=pa_schema)
# Write each batch
@@ -502,9 +501,9 @@ class DataBlobWriterTest(unittest.TestCase):
# Verify total data written (50 rows of normal data + 50 rows of blob
data = 100 total)
self.assertEqual(total_row_count, 50)
- # Verify total file size is substantial (should be much larger than
2.5GB due to overhead)
+ # Verify total file size is substantial (should be at least 200MB)
total_size_mb = total_file_size / (1024 * 1024)
- self.assertGreater(total_size_mb, 2000) # Should be at least 2GB due
to overhead
+ self.assertGreater(total_size_mb, 200)
total_files = sum(len(commit_msg.new_files) for commit_msg in
commit_messages)
print(f"Total data written: {total_size_mb:.2f}MB across {total_files}
files")