zhuqi-lucas commented on code in PR #19042:
URL: https://github.com/apache/datafusion/pull/19042#discussion_r2588004724


##########
benchmarks/bench.sh:
##########
@@ -1197,6 +1206,80 @@ compare_benchmarks() {
 
 }
 
+# Creates sorted ClickBench data from hits_0.parquet (partitioned dataset)
+# The data is sorted by EventTime in ascending order
+# Using hits_0.parquet (~150MB) instead of full hits.parquet (~14GB) for 
faster testing
+data_sorted_clickbench() {
+    SORTED_FILE="${DATA_DIR}/hits_0_sorted.parquet"
+    ORIGINAL_FILE="${DATA_DIR}/hits_partitioned/hits_0.parquet"
+
+    echo "Creating sorted ClickBench dataset from hits_0.parquet..."
+
+    # Check if partitioned data exists
+    if [ ! -f "${ORIGINAL_FILE}" ]; then
+        echo "hits_partitioned/hits_0.parquet not found. Running 
data_clickbench_partitioned first..."
+        data_clickbench_partitioned
+    fi
+
+    # Check if sorted file already exists
+    if [ -f "${SORTED_FILE}" ]; then
+        echo "Sorted hits_0.parquet already exists at ${SORTED_FILE}"
+        return 0
+    fi
+
+    echo "Sorting hits_0.parquet by EventTime (this takes ~10 seconds)..."
+
+    # Ensure virtual environment exists and has pyarrow
+    if [ ! -d "$VIRTUAL_ENV" ]; then
+        echo "Creating virtual environment at $VIRTUAL_ENV..."
+        python3 -m venv "$VIRTUAL_ENV"
+    fi
+
+    # Activate virtual environment
+    source "$VIRTUAL_ENV/bin/activate"
+
+    # Check and install pyarrow if needed
+    if ! python3 -c "import pyarrow" 2>/dev/null; then
+        echo "Installing pyarrow (this may take a minute)..."
+        pip install --quiet pyarrow
+    fi
+
+    # Use the standalone Python script to sort
+    python3 "${SCRIPT_DIR}"/sort_clickbench.py "${ORIGINAL_FILE}" 
"${SORTED_FILE}"
+    local result=$?
+
+    # Deactivate virtual environment
+    deactivate

Review Comment:
   Good idea, I will do this!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to