xiedeyantu opened a new issue, #46768: URL: https://github.com/apache/arrow/issues/46768
### Describe the usage question you have. Please include as many useful details as possible.

The Python script is the following:

```python
import json
import os
import time

import numpy as np
from datasets import Dataset
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.gandiva as gandiva


# ==================== 1. Generate JSONL test file ====================
def generate_jsonl(output_path, num_rows=1_000_000):
    """Generate a JSONL file with random data."""
    with open(output_path, 'w') as f:
        for i in range(num_rows):
            record = {
                "id": i,
                "value": int(np.random.randint(0, 100))
            }
            f.write(json.dumps(record) + "\n")


# ==================== 2. Test functions ====================
def test_pyarrow_gandiva(table, batch_size=100_000):
    """Test Gandiva filtering performance."""
    # Merge all small chunks so that max_chunksize takes effect
    table = table.combine_chunks()
    batches = table.to_batches(max_chunksize=batch_size)

    # Build the Gandiva expression: value > 50
    builder = gandiva.TreeExprBuilder()
    value_field = builder.make_field(table.schema.field("value"))
    literal_50 = builder.make_literal(50, pa.int64())
    condition = builder.make_function("greater_than", [value_field, literal_50], pa.bool_())
    gandiva_filter = gandiva.make_filter(table.schema, builder.make_condition(condition))

    # Execute the filter (the JIT compilation above is excluded from the timing)
    start_time = time.time()
    count = 0
    for batch in batches:
        sv = gandiva_filter.evaluate(batch, pa.default_memory_pool())
        count += len(sv.to_array())
    elapsed = time.time() - start_time
    print(f"\ncount: {count:,}")
    return {"method": "Gandiva", "time": elapsed, "count": count}


def test_pyarrow_compute(table):
    """Test PyArrow Compute filtering performance."""
    start_time = time.time()
    mask = pc.greater(table['value'], 50)
    count = pc.sum(mask).as_py()
    elapsed = time.time() - start_time
    return {"method": "PyArrow Compute", "time": elapsed, "count": count}


# ==================== 3. Main execution logic ====================
def main():
    # Total data volume
    total_rows = 10_000_000  # 10 million

    # Generate test data
    jsonl_path = "test_data.jsonl"
    if not os.path.exists(jsonl_path):
        print("Generating test data...")
        generate_jsonl(jsonl_path, num_rows=total_rows)

    # Load data from JSONL
    print("Loading data...")
    dataset = Dataset.from_json(jsonl_path)
    # Fix point: use dataset._data directly to get the underlying PyArrow Table
    table = dataset._data if hasattr(dataset, '_data') else pa.Table.from_batches(dataset.data)
    print(f"Total rows: {table.num_rows:,}")

    # Run tests
    print("\nRunning performance tests...")
    gandiva_result = test_pyarrow_gandiva(table, batch_size=total_rows)
    compute_result = test_pyarrow_compute(table)

    # Print results
    print("\nResults:")
    print(f"{gandiva_result['method']}:")
    print(f"  Filtered rows: {gandiva_result['count']:,}")
    print(f"  Time: {gandiva_result['time']:.4f}s")
    print(f"\n{compute_result['method']}:")
    print(f"  Filtered rows: {compute_result['count']:,}")
    print(f"  Time: {compute_result['time']:.4f}s")

    # Performance comparison (ratio of elapsed times)
    speedup = gandiva_result['time'] / compute_result['time']
    print(f"\nPyArrow Compute is {speedup:.1f}x faster")


if __name__ == "__main__":
    main()
```

No matter how I adjust the batch size, Gandiva is never faster than Compute. Am I doing something wrong? I use Docker to run this test.
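In case it helps others reproduce this quickly, here is a minimal sketch of the same comparison without the `datasets` dependency, building the table directly in memory. It is an illustrative simplification under assumptions (same `pyarrow.gandiva` calls as the script above; the synthetic 10M-row table and 1M chunk size are hypothetical choices, not from the original report):

```python
# Minimal self-contained sketch of the Gandiva-vs-Compute comparison.
# Assumes the same pyarrow / pyarrow.gandiva APIs used in the script above.
import time

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.gandiva as gandiva

# Build a 10M-row table in memory; np.random.randint yields int64 on Linux,
# matching the pa.int64() literal below.
table = pa.table({"value": np.random.randint(0, 100, size=10_000_000)})

# Compile the Gandiva filter (value > 50) once, outside the timed region.
builder = gandiva.TreeExprBuilder()
condition = builder.make_function(
    "greater_than",
    [builder.make_field(table.schema.field("value")),
     builder.make_literal(50, pa.int64())],
    pa.bool_(),
)
gandiva_filter = gandiva.make_filter(table.schema, builder.make_condition(condition))

# Time only Gandiva evaluation: one selection vector per batch.
start = time.time()
count = 0
for batch in table.to_batches(max_chunksize=1_000_000):
    sv = gandiva_filter.evaluate(batch, pa.default_memory_pool())
    count += len(sv.to_array())
print(f"Gandiva: {count:,} rows in {time.time() - start:.4f}s")

# Time the equivalent PyArrow Compute mask-and-count.
start = time.time()
count = pc.sum(pc.greater(table["value"], 50)).as_py()
print(f"Compute: {count:,} rows in {time.time() - start:.4f}s")
```

One thing worth noting about the comparison itself: the two timed regions do different amounts of work. The Gandiva path materializes a selection vector of matching row indices for every batch, while the Compute path only produces a boolean mask and reduces it with `pc.sum`, so the timings are not strictly apples-to-apples.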
The CPU:

```
root@1a5c6ae7847b:/test# lscpu | grep "Model name" || cat /proc/cpuinfo | grep "model name" | head -n 1
Model name: 12th Gen Intel(R) Core(TM) i7-1260P
```

The Docker image is `apache/arrow-dev:amd64-conda-python-3.10`, with the packages installed via:

```
mamba install -c conda-forge numpy datasets libarrow-gandiva
mamba install -c conda-forge pyarrow
```

### Component(s)

Gandiva