zhuqi-lucas commented on code in PR #19042:
URL: https://github.com/apache/datafusion/pull/19042#discussion_r2594658232
##########
benchmarks/bench.sh:
##########
@@ -1197,10 +1206,105 @@ compare_benchmarks() {
}
+# Creates sorted ClickBench data from hits.parquet (full dataset)
+# The data is sorted by EventTime in ascending order
+# Uses datafusion-cli to reduce dependencies
+data_sorted_clickbench() {
+ SORTED_FILE="${DATA_DIR}/hits_sorted.parquet"
+ ORIGINAL_FILE="${DATA_DIR}/hits.parquet"
+
+ # Default memory limit is 12GB, can be overridden with
DATAFUSION_MEMORY_GB env var
+ MEMORY_LIMIT_GB=${DATAFUSION_MEMORY_GB:-12}
+
+ echo "Creating sorted ClickBench dataset from hits.parquet..."
+ echo "Configuration:"
+ echo " Memory limit: ${MEMORY_LIMIT_GB}G"
+ echo " Row group size: 64K rows"
+ echo " Compression: uncompressed"
+
+ if [ ! -f "${ORIGINAL_FILE}" ]; then
+ echo "hits.parquet not found. Running data_clickbench_1 first..."
+ data_clickbench_1
+ fi
+
+ if [ -f "${SORTED_FILE}" ]; then
+ echo "Sorted hits.parquet already exists at ${SORTED_FILE}"
+ return 0
+ fi
+
+ echo "Sorting hits.parquet by EventTime (this may take several minutes)..."
+
+ pushd "${DATAFUSION_DIR}" > /dev/null
+ echo "Building datafusion-cli..."
+ cargo build --release --bin datafusion-cli
+ DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli"
+ popd > /dev/null
+
+ echo "Using datafusion-cli to create sorted parquet file..."
+ "${DATAFUSION_CLI}" << EOF
+-- Memory and performance configuration
+SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G';
+SET datafusion.execution.spill_compression = 'uncompressed';
+SET datafusion.execution.sort_spill_reservation_bytes = 10485760; -- 10MB
+SET datafusion.execution.batch_size = 8192;
+SET datafusion.execution.target_partitions = 1;
Review Comment:
Updated. @alamb, I have now added duration logs in the latest PR. With the
default behavior (12GB memory limit and 1 target partition), it runs quickly on
my local Mac, finishing in less than 5 minutes:
```text
+----------+
| count |
+----------+
| 99997497 |
+----------+
1 row(s) fetched.
Elapsed 278.468 seconds.
\q
End time: 2025-12-06 16:27:54
✓ Successfully created sorted ClickBench dataset
Input: 14095 MB
Output: 36159 MB
Time Statistics:
Total duration: 280 seconds (00:04:40)
Throughput: 50 MB/s
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]