zhuqi-lucas commented on code in PR #19042:
URL: https://github.com/apache/datafusion/pull/19042#discussion_r2581515353
##########
benchmarks/bench.sh:
##########
@@ -1197,6 +1206,86 @@ compare_benchmarks() {
}
+# Sorted Data Benchmark Functions (Optimized for hits_0.parquet)
+# Add these functions to bench.sh
+
+# Creates sorted ClickBench data from hits_0.parquet (partitioned dataset)
+# The data is sorted by EventTime in ascending order
+# Using hits_0.parquet (~150MB) instead of full hits.parquet (~14GB) for faster testing
+data_sorted_clickbench() {
+ SORTED_FILE="${DATA_DIR}/hits_0_sorted.parquet"
+ ORIGINAL_FILE="${DATA_DIR}/hits_partitioned/hits_0.parquet"
+
+ echo "Creating sorted ClickBench dataset from hits_0.parquet..."
+
+ # Check if partitioned data exists
+ if [ ! -f "${ORIGINAL_FILE}" ]; then
+ echo "hits_partitioned/hits_0.parquet not found. Running data_clickbench_partitioned first..."
+ data_clickbench_partitioned
+ fi
+
+ # Check if sorted file already exists
+ if [ -f "${SORTED_FILE}" ]; then
+ echo "Sorted hits_0.parquet already exists at ${SORTED_FILE}"
+ return 0
+ fi
+
+ echo "Sorting hits_0.parquet by EventTime (this takes ~10 seconds)..."
+
+ # Ensure virtual environment exists and has pyarrow
+ if [ ! -d "$VIRTUAL_ENV" ]; then
+ echo "Creating virtual environment at $VIRTUAL_ENV..."
+ python3 -m venv "$VIRTUAL_ENV"
+ fi
+
+ # Activate virtual environment
+ source "$VIRTUAL_ENV/bin/activate"
+
+ # Check and install pyarrow if needed
+ if ! python3 -c "import pyarrow" 2>/dev/null; then
+ echo "Installing pyarrow (this may take a minute)..."
+ pip install --quiet pyarrow
+ fi
+
+ # Use the standalone Python script to sort
+ python3 "${SCRIPT_DIR}"/sort_clickbench.py "${ORIGINAL_FILE}" "${SORTED_FILE}"
+ local result=$?
+
+ # Deactivate virtual environment
+ deactivate
+
+ if [ $result -eq 0 ]; then
+ echo "✓ Successfully created sorted ClickBench dataset"
+ return 0
+ else
+ echo "✗ Error: Failed to create sorted dataset"
+ return 1
+ fi
+}
+
+# Sorted Data Benchmark Functions for bench.sh
+# Add these functions to your bench.sh script
Review Comment:
Removed it in latest PR.
##########
benchmarks/bench.sh:
##########
@@ -1197,6 +1206,86 @@ compare_benchmarks() {
}
+# Sorted Data Benchmark Functions (Optimized for hits_0.parquet)
+# Add these functions to bench.sh
Review Comment:
Removed it in latest PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]