This is an automated email from the ASF dual-hosted git repository.

vterentev pushed a commit to branch fix-vllm-gemma
in repository https://gitbox.apache.org/repos/asf/beam.git

commit 55cd1745c8c27619553f00e44b6b59810ce52901
Author: Vitaly Terentyev <[email protected]>
AuthorDate: Fri Sep 26 18:14:14 2025 +0400

    Fix vllm gemma
---
 ..._Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt |  4 ++--
 .../ml/inference/test_resources/vllm.dockerfile     | 21 +++++++++++++++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
index 6101fe5da45..23af8197d8d 100644
--- a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
+++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
@@ -20,7 +20,7 @@
 --input=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
 --machine_type=n1-standard-8
 --worker_zone=us-central1-b
---disk_size_gb=50
+--disk_size_gb=200
 --input_options={}
 --num_workers=8
 --max_num_workers=25
@@ -33,4 +33,4 @@
 --influx_measurement=gemma_vllm_batch
 --model_gcs_path=gs://apache-beam-ml/models/gemma-2b-it
 --dataflow_service_options=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver
---experiments=use_runner_v2
\ No newline at end of file
+--experiments=use_runner_v2
diff --git a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
index 5727437809c..200497659de 100644
--- a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
+++ b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
@@ -19,9 +19,22 @@
 
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
 
-# 1) Non-interactive + timezone
+# 1) Non-interactive + timezone + Redirect all heavy temp/cache away from /tmp
 ENV DEBIAN_FRONTEND=noninteractive \
-    TZ=Etc/UTC
+    TZ=Etc/UTC \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    TMPDIR=/var/beam_tmp \
+    HF_HOME=/var/beam_hf \
+    HF_HUB_CACHE=/var/beam_hf/hub \
+    TRANSFORMERS_CACHE=/var/beam_hf/hub \
+    VLLM_CACHE_ROOT=/var/beam_hf/vllm \
+    VLLM_RPC_BASE_PATH=/var/beam_tmp \
+    TOKENIZERS_PARALLELISM=false
+
+# Make sure target dirs exist (mounted on worker PD at runtime)
+RUN mkdir -p /var/beam_tmp /var/beam_hf/hub /var/beam_hf/vllm && \
+    chmod -R 777 /var/beam_tmp /var/beam_hf
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -46,7 +59,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3 && \
     python3 -m pip install --upgrade pip setuptools wheel
 
 # 4) Copy the Beam SDK harness (for Dataflow workers)
-COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:2.68.0.dev \
+COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:2.69.0.dev \
      /opt/apache/beam /opt/apache/beam
 
 # 5) Make sure the harness is discovered first
@@ -65,4 +78,4 @@ RUN python3 -m pip install --no-cache-dir \
       triton>=3.1.0
 
 # 8) Use the Beam boot script as entrypoint
-ENTRYPOINT ["/opt/apache/beam/boot"]
\ No newline at end of file
+ENTRYPOINT ["/opt/apache/beam/boot"]

Reply via email to