zhuqi-lucas commented on code in PR #19042:
URL: https://github.com/apache/datafusion/pull/19042#discussion_r2594658232


##########
benchmarks/bench.sh:
##########
@@ -1197,10 +1206,105 @@ compare_benchmarks() {
 
 }
 
+# Creates sorted ClickBench data from hits.parquet (full dataset)
+# The data is sorted by EventTime in ascending order
+# Uses datafusion-cli to reduce dependencies
+data_sorted_clickbench() {
+    SORTED_FILE="${DATA_DIR}/hits_sorted.parquet"
+    ORIGINAL_FILE="${DATA_DIR}/hits.parquet"
+
+    # Default memory limit is 12GB, can be overridden with 
DATAFUSION_MEMORY_GB env var
+    MEMORY_LIMIT_GB=${DATAFUSION_MEMORY_GB:-12}
+
+    echo "Creating sorted ClickBench dataset from hits.parquet..."
+    echo "Configuration:"
+    echo "  Memory limit: ${MEMORY_LIMIT_GB}G"
+    echo "  Row group size: 64K rows"
+    echo "  Compression: uncompressed"
+
+    if [ ! -f "${ORIGINAL_FILE}" ]; then
+        echo "hits.parquet not found. Running data_clickbench_1 first..."
+        data_clickbench_1
+    fi
+
+    if [ -f "${SORTED_FILE}" ]; then
+        echo "Sorted hits.parquet already exists at ${SORTED_FILE}"
+        return 0
+    fi
+
+    echo "Sorting hits.parquet by EventTime (this may take several minutes)..."
+
+    pushd "${DATAFUSION_DIR}" > /dev/null
+    echo "Building datafusion-cli..."
+    cargo build --release --bin datafusion-cli
+    DATAFUSION_CLI="${DATAFUSION_DIR}/target/release/datafusion-cli"
+    popd > /dev/null
+
+    echo "Using datafusion-cli to create sorted parquet file..."
+    "${DATAFUSION_CLI}" << EOF
+-- Memory and performance configuration
+SET datafusion.runtime.memory_limit = '${MEMORY_LIMIT_GB}G';
+SET datafusion.execution.spill_compression = 'uncompressed';
+SET datafusion.execution.sort_spill_reservation_bytes = 10485760; -- 10MB
+SET datafusion.execution.batch_size = 8192;
+SET datafusion.execution.target_partitions = 1;

Review Comment:
   Updated, @alamb I add the duration logs in latest PR now for the default 
behavior (12GB memory, and 1 target partition), the time is fast for it for my 
local mac, less than 5mins:
   
   
   ```rust
   +----------+
   | count    |
   +----------+
   | 99997497 |
   +----------+
   1 row(s) fetched.
   Elapsed 278.468 seconds.
   
   \q
   End time: 2025-12-06 16:27:54
   ✓ Successfully created sorted ClickBench dataset
     Input:  14095 MB
     Output: 36159 MB
   
   Time Statistics:
     Total duration: 280 seconds (00:04:40)
     Throughput: 50 MB/s
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to