This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 75a9cd298b [python] Use pypaimon-rust 0.2.0 from PyPI instead of building from source (#8024)
75a9cd298b is described below

commit 75a9cd298be186af50d413e0335ebd08ba56ed2e
Author: Jingsong Lee <[email protected]>
AuthorDate: Fri May 29 10:09:28 2026 +0800

    [python] Use pypaimon-rust 0.2.0 from PyPI instead of building from source (#8024)
---
 .github/workflows/paimon-python-checks.yml      | 11 +----------
 paimon-python/pypaimon/tests/blob_table_test.py | 21 ++++++++++-----------
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/paimon-python-checks.yml b/.github/workflows/paimon-python-checks.yml
index de00337be3..5aaa2ab72a 100755
--- a/.github/workflows/paimon-python-checks.yml
+++ b/.github/workflows/paimon-python-checks.yml
@@ -145,7 +145,7 @@ jobs:
           else
             python -m pip install --upgrade pip
             pip install torch --index-url https://download.pytorch.org/whl/cpu
-            python -m pip install pyroaring readerwriterlock==1.0.9 
fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 
fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 
numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 
py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6'
+            python -m pip install pyroaring readerwriterlock==1.0.9 
fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 
fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 
numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 
py4j==0.10.9.9 requests parameterized==0.9.0 'daft>=0.7.6' pypaimon-rust==0.2.0
             python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION 
}}' -i https://pypi.org/simple/
             if python -c "import sys; sys.exit(0 if sys.version_info >= (3, 
11) else 1)"; then
               python -m pip install vortex-data==0.70.0
@@ -163,15 +163,6 @@ jobs:
           maturin build --release
           pip install target/wheels/tantivy-*.whl
 
-      - name: Build and install pypaimon-rust from source
-        if: matrix.python-version != '3.6.15'
-        shell: bash
-        run: |
-          git clone https://github.com/apache/paimon-rust.git /tmp/paimon-rust
-          cd /tmp/paimon-rust/bindings/python
-          maturin build --release -o dist
-          pip install dist/pypaimon_rust-*.whl
-          pip install 'datafusion>=52'
 
       - name: Run lint-python.sh
         shell: bash
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index c4e5a4d1bd..7261503450 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -412,7 +412,7 @@ class DataBlobWriterTest(unittest.TestCase):
         blob_writer.close()
 
     def test_data_blob_writer_write_large_blob(self):
-        """Test DataBlobWriter with very large blob data (50MB per item) in 10 
batches."""
+        """Test DataBlobWriter with large blob data (5MB per item) in 10 
batches."""
         from pypaimon import Schema
 
         # Create schema with blob column
@@ -436,28 +436,27 @@ class DataBlobWriterTest(unittest.TestCase):
         write_builder = table.new_batch_write_builder()
         blob_writer = write_builder.new_write()
 
-        # Create 50MB blob data per item
-        # Using a pattern to make the data more realistic and compressible
-        target_size = 50 * 1024 * 1024  # 50MB in bytes
+        # Create 5MB blob data per item
+        target_size = 5 * 1024 * 1024  # 5MB in bytes
         blob_pattern = b'LARGE_BLOB_DATA_PATTERN_' + b'X' * 1024  # ~1KB 
pattern
         pattern_size = len(blob_pattern)
         repetitions = target_size // pattern_size
         large_blob_data = blob_pattern * repetitions
 
-        # Verify the blob size is approximately 50MB
+        # Verify the blob size is approximately 5MB
         blob_size_mb = len(large_blob_data) / (1024 * 1024)
-        self.assertGreater(blob_size_mb, 49)  # Should be at least 49MB
-        self.assertLess(blob_size_mb, 51)  # Should be less than 51MB
+        self.assertGreater(blob_size_mb, 4)  # Should be at least 4MB
+        self.assertLess(blob_size_mb, 6)  # Should be less than 6MB
 
         total_rows = 0
 
         # Write 10 batches, each with 5 rows (50 rows total)
-        # Total data volume: 50 rows * 50MB = 2.5GB of blob data
+        # Total data volume: 50 rows * 5MB = 250MB of blob data
         for batch_num in range(10):
             batch_data = pa.Table.from_pydict({
                 'id': [batch_num * 5 + i for i in range(5)],
                 'description': [f'Large blob batch {batch_num}, row {i}' for i 
in range(5)],
-                'large_blob': [large_blob_data] * 5  # 5 rows per batch, each 
with 50MB blob
+                'large_blob': [large_blob_data] * 5  # 5 rows per batch, each 
with 5MB blob
             }, schema=pa_schema)
 
             # Write each batch
@@ -502,9 +501,9 @@ class DataBlobWriterTest(unittest.TestCase):
         # Verify total data written (50 rows of normal data + 50 rows of blob 
data = 100 total)
         self.assertEqual(total_row_count, 50)
 
-        # Verify total file size is substantial (should be much larger than 
2.5GB due to overhead)
+        # Verify total file size is substantial (should be at least 200MB)
         total_size_mb = total_file_size / (1024 * 1024)
-        self.assertGreater(total_size_mb, 2000)  # Should be at least 2GB due 
to overhead
+        self.assertGreater(total_size_mb, 200)
 
         total_files = sum(len(commit_msg.new_files) for commit_msg in 
commit_messages)
         print(f"Total data written: {total_size_mb:.2f}MB across {total_files} 
files")

Reply via email to