This is an automated email from the ASF dual-hosted git repository.

vterentev pushed a commit to branch fix-vllm-gemma
in repository https://gitbox.apache.org/repos/asf/beam.git

commit 55cd1745c8c27619553f00e44b6b59810ce52901
Author: Vitaly Terentyev <[email protected]>
AuthorDate: Fri Sep 26 18:14:14 2025 +0400

    Fix vllm gemma
---
 ..._Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt |  4 ++--
 .../ml/inference/test_resources/vllm.dockerfile     | 21 +++++++++++++++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
index 6101fe5da45..23af8197d8d 100644
--- a/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
+++ b/.github/workflows/load-tests-pipeline-options/beam_Inference_Python_Benchmarks_Dataflow_VLLM_Gemma_Batch.txt
@@ -20,7 +20,7 @@
 --input=gs://apache-beam-ml/testing/inputs/sentences_50k.txt
 --machine_type=n1-standard-8
 --worker_zone=us-central1-b
---disk_size_gb=50
+--disk_size_gb=200
 --input_options={}
 --num_workers=8
 --max_num_workers=25
@@ -33,4 +33,4 @@
 --influx_measurement=gemma_vllm_batch
 --model_gcs_path=gs://apache-beam-ml/models/gemma-2b-it
 --dataflow_service_options=worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver
---experiments=use_runner_v2
\ No newline at end of file
+--experiments=use_runner_v2
diff --git a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
index 5727437809c..200497659de 100644
--- a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
+++ b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile
@@ -19,9 +19,22 @@
 
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
 
-# 1) Non-interactive + timezone
+# 1) Non-interactive + timezone + Redirect all heavy temp/cache away from /tmp
 ENV DEBIAN_FRONTEND=noninteractive \
-    TZ=Etc/UTC
+    TZ=Etc/UTC \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    TMPDIR=/var/beam_tmp \
+    HF_HOME=/var/beam_hf \
+    HF_HUB_CACHE=/var/beam_hf/hub \
+    TRANSFORMERS_CACHE=/var/beam_hf/hub \
+    VLLM_CACHE_ROOT=/var/beam_hf/vllm \
+    VLLM_RPC_BASE_PATH=/var/beam_tmp \
+    TOKENIZERS_PARALLELISM=false
+
+# Make sure target dirs exist (mounted on worker PD at runtime)
+RUN mkdir -p /var/beam_tmp /var/beam_hf/hub /var/beam_hf/vllm && \
+    chmod -R 777 /var/beam_tmp /var/beam_hf
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -46,7 +59,7 @@ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3 && \
     python3 -m pip install --upgrade pip setuptools wheel
 
 # 4) Copy the Beam SDK harness (for Dataflow workers)
-COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:2.68.0.dev \
+COPY --from=gcr.io/apache-beam-testing/beam-sdk/beam_python3.10_sdk:2.69.0.dev \
      /opt/apache/beam /opt/apache/beam
 
 # 5) Make sure the harness is discovered first
@@ -65,4 +78,4 @@ RUN python3 -m pip install --no-cache-dir \
     triton>=3.1.0
 
 # 8) Use the Beam boot script as entrypoint
-ENTRYPOINT ["/opt/apache/beam/boot"]
\ No newline at end of file
+ENTRYPOINT ["/opt/apache/beam/boot"]
