This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch optimize-prod-image-building-cache
in repository https://gitbox.apache.org/repos/asf/airflow.git

commit fd33c641c59c9a4c24dc8d343b9442feff71eecf
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sat Nov 4 15:43:12 2023 +0100

    Optimize PROD image caching in CI
    
    Turns out that some of the layers in our PROD image got
    invalidated because AIRFLOW_CONSTRAINTS_MODE used to build the
    cache for PROD image is "constraints" by default, while building
    images in "build-images" workflow for regular PRs and canary
    build uses "constraints-source-providers". The former is fine as
    default for PROD image (as opposed to CI image, we build PROD image
    from released PyPI packages by default) but the latter is "proper"
    for the CI cache, because there, the image is built out of local
    packages prepared from sources.
    
    Turns out that the AIRFLOW_CONSTRAINTS_MODE parameter had a
    profound impact on caching - because it was set before the
    "install_packages_from_branch_tip" step and - in fact - even
    before "install database clients" step, which caused our cache to
    only work for the "base OS dependencies" - installing database
    clients and installing airflow from branch tip (which works great
    for CI image) had always been done in PRs because the layers in
    cache with constraints env invalidated all subsequent layers.
    
    This had no big impact before, when testing usually took much
    longer - but since testing has been vastly improved in #35160,
    PROD image building now continues running even after the tests
    complete and becomes the next frontier of optimization.
    
    This PR optimizes PROD image building in two ways:
    
    * caching is prepared with the "constraints-source-providers"
      constraints mode, same as the regular build
    
    * the AIRFLOW_CONSTRAINTS_MODE and related arguments are moved
      after installing database clients, so that this parameter does
      not impact their caching.
---
 .github/workflows/ci.yml |  1 +
 Dockerfile               | 76 +++++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3f54cc5096..e540b6829e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1886,6 +1886,7 @@ jobs:
           --builder airflow_cache
           --install-packages-from-context
           --run-in-parallel
+          --airflow-constraints-mode constraints-source-providers
           --prepare-buildx-cache
           --platform ${{ matrix.platform }}
         env:
diff --git a/Dockerfile b/Dockerfile
index 5297a4f6b6..a69fef736b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1223,6 +1223,44 @@ ARG INSTALL_MYSQL_CLIENT="true"
 ARG INSTALL_MYSQL_CLIENT_TYPE="mysql"
 ARG INSTALL_MSSQL_CLIENT="true"
 ARG INSTALL_POSTGRES_CLIENT="true"
+ARG AIRFLOW_PIP_VERSION
+
+ENV INSTALL_MYSQL_CLIENT=${INSTALL_MYSQL_CLIENT} \
+    INSTALL_MYSQL_CLIENT_TYPE=${INSTALL_MYSQL_CLIENT_TYPE} \
+    INSTALL_MSSQL_CLIENT=${INSTALL_MSSQL_CLIENT} \
+    INSTALL_POSTGRES_CLIENT=${INSTALL_POSTGRES_CLIENT}
+
+# Only copy mysql/mssql installation scripts for now - so that changing the 
other
+# scripts which are needed much later will not invalidate the docker layer here
+COPY --from=scripts install_mysql.sh install_mssql.sh install_postgres.sh 
/scripts/docker/
+
+# THE 3 LINES ARE ONLY NEEDED IN ORDER TO MAKE PYMSSQL BUILD WORK WITH LATEST 
CYTHON
+# AND SHOULD BE REMOVED WHEN WORKAROUND IN install_mssql.sh IS REMOVED
+ARG AIRFLOW_PIP_VERSION=23.3.1
+ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION}
+COPY --from=scripts common.sh /scripts/docker/
+
+RUN bash /scripts/docker/install_mysql.sh dev && \
+    bash /scripts/docker/install_mssql.sh dev && \
+    bash /scripts/docker/install_postgres.sh dev
+ENV PATH=${PATH}:/opt/mssql-tools/bin
+
+# By default we do not install from docker context files but if we decide to 
install from docker context
+# files, we should override those variables to "docker-context-files"
+ARG DOCKER_CONTEXT_FILES="Dockerfile"
+
+COPY ${DOCKER_CONTEXT_FILES} /docker-context-files
+
+ARG AIRFLOW_HOME
+ARG AIRFLOW_USER_HOME_DIR
+ARG AIRFLOW_UID
+
+RUN adduser --gecos "First Last,RoomNumber,WorkPhone,HomePhone" 
--disabled-password \
+       --quiet "airflow" --uid "${AIRFLOW_UID}" --gid "0" --home 
"${AIRFLOW_USER_HOME_DIR}" && \
+    mkdir -p ${AIRFLOW_HOME} && chown -R "airflow:0" 
"${AIRFLOW_USER_HOME_DIR}" ${AIRFLOW_HOME}
+
+USER airflow
+
 ARG AIRFLOW_REPO=apache/airflow
 ARG AIRFLOW_BRANCH=main
 ARG AIRFLOW_EXTRAS
@@ -1233,7 +1271,7 @@ ARG AIRFLOW_CONSTRAINTS_MODE="constraints"
 ARG AIRFLOW_CONSTRAINTS_REFERENCE=""
 ARG AIRFLOW_CONSTRAINTS_LOCATION=""
 ARG DEFAULT_CONSTRAINTS_BRANCH="constraints-main"
-ARG AIRFLOW_PIP_VERSION
+
 # By default PIP has progress bar but you can disable it.
 ARG PIP_PROGRESS_BAR
 # By default we do not use pre-cached packages, but in CI/Breeze environment 
we override this to speed up
@@ -1262,42 +1300,6 @@ ARG UPGRADE_TO_NEWER_DEPENDENCIES="false"
 ARG AIRFLOW_SOURCES_FROM="Dockerfile"
 ARG AIRFLOW_SOURCES_TO="/Dockerfile"
 
-# By default we do not install from docker context files but if we decide to 
install from docker context
-# files, we should override those variables to "docker-context-files"
-ARG DOCKER_CONTEXT_FILES="Dockerfile"
-
-ARG AIRFLOW_HOME
-ARG AIRFLOW_USER_HOME_DIR
-ARG AIRFLOW_UID
-
-ENV INSTALL_MYSQL_CLIENT=${INSTALL_MYSQL_CLIENT} \
-    INSTALL_MYSQL_CLIENT_TYPE=${INSTALL_MYSQL_CLIENT_TYPE} \
-    INSTALL_MSSQL_CLIENT=${INSTALL_MSSQL_CLIENT} \
-    INSTALL_POSTGRES_CLIENT=${INSTALL_POSTGRES_CLIENT}
-
-# Only copy mysql/mssql installation scripts for now - so that changing the 
other
-# scripts which are needed much later will not invalidate the docker layer here
-COPY --from=scripts install_mysql.sh install_mssql.sh install_postgres.sh 
/scripts/docker/
-
-# THE 3 LINES ARE ONLY NEEDED IN ORDER TO MAKE PYMSSQL BUILD WORK WITH LATEST 
CYTHON
-# AND SHOULD BE REMOVED WHEN WORKAROUND IN install_mssql.sh IS REMOVED
-ARG AIRFLOW_PIP_VERSION=23.3.1
-ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION}
-COPY --from=scripts common.sh /scripts/docker/
-
-
-RUN bash /scripts/docker/install_mysql.sh dev && \
-    bash /scripts/docker/install_mssql.sh dev && \
-    bash /scripts/docker/install_postgres.sh dev
-ENV PATH=${PATH}:/opt/mssql-tools/bin
-
-COPY ${DOCKER_CONTEXT_FILES} /docker-context-files
-
-RUN adduser --gecos "First Last,RoomNumber,WorkPhone,HomePhone" 
--disabled-password \
-       --quiet "airflow" --uid "${AIRFLOW_UID}" --gid "0" --home 
"${AIRFLOW_USER_HOME_DIR}" && \
-    mkdir -p ${AIRFLOW_HOME} && chown -R "airflow:0" 
"${AIRFLOW_USER_HOME_DIR}" ${AIRFLOW_HOME}
-
-USER airflow
 
 RUN if [[ -f /docker-context-files/pip.conf ]]; then \
         mkdir -p ${AIRFLOW_USER_HOME_DIR}/.config/pip; \

Reply via email to