ashb commented on a change in pull request #6266: [AIRFLOW-2439] Production 
Docker image support including refactoring of build scripts - depends on 
[AIRFLOW-5704]
URL: https://github.com/apache/airflow/pull/6266#discussion_r336902442
 
 

 ##########
 File path: Dockerfile
 ##########
 @@ -77,252 +75,300 @@ RUN curl -sL https://deb.nodesource.com/setup_10.x | 
bash - \
            libssl-dev \
            locales  \
            netcat \
-           nodejs \
            rsync \
            sasl2-bin \
            sudo \
+           libmariadb-dev-compat \
     && apt-get autoremove -yqq --purge \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-# Install graphviz - needed to build docs with diagrams
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-           graphviz \
-    && apt-get autoremove -yqq --purge \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install MySQL client from Oracle repositories (Debian installs mariadb)
-RUN KEY="A4A9406876FCBD3C456770C88C718D3B5072E1F5" \
-    && GNUPGHOME="$(mktemp -d)" \
-    && export GNUPGHOME \
-    && for KEYSERVER in $(shuf -e \
-            ha.pool.sks-keyservers.net \
-            hkp://p80.pool.sks-keyservers.net:80 \
-            keyserver.ubuntu.com \
-            hkp://keyserver.ubuntu.com:80 \
-            pgp.mit.edu) ; do \
-          gpg --keyserver "${KEYSERVER}" --recv-keys "${KEY}" && break || true 
; \
-       done \
-    && gpg --export "${KEY}" | apt-key add - \
-    && gpgconf --kill all \
-    rm -rf "${GNUPGHOME}"; \
-    apt-key list > /dev/null \
-    && echo "deb http://repo.mysql.com/apt/debian/ stretch mysql-5.6" | tee -a 
/etc/apt/sources.list.d/mysql.list \
-    && apt-get update \
-    && apt-get install --no-install-recommends -y \
-        libmysqlclient-dev \
-        mysql-client \
-    && apt-get autoremove -yqq --purge \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
-
 RUN adduser airflow \
     && echo "airflow ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/airflow \
     && chmod 0440 /etc/sudoers.d/airflow
 
 
############################################################################################################
-# This is an image with all APT dependencies needed by CI. It is built on top 
of the airlfow APT image
-# Parameters:
-#     airflow-apt-deps - this is the base image for CI deps image.
+# CI airflow image
 
############################################################################################################
-FROM airflow-apt-deps-ci-slim as airflow-apt-deps-ci
+FROM airflow-base as airflow-ci
 
 SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
 
-ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
+# Setting to 1 speeds up building the image. The Cassandra driver without CYTHON saves around 10 minutes
+# But it might not be suitable for the production image
+ENV CASS_DRIVER_NO_CYTHON="1"
+ENV CASS_DRIVER_BUILD_CONCURRENCY=8
+
+ENV JAVA_HOME=/usr/lib/jvm/adoptopenjdk-8-hotspot-amd64/
+
+# By changing the CI build epoch we can force reinstalling apt dependencies for CI
+# It can also be overwritten manually by setting the build variable.
+ARG CI_APT_DEPENDENCIES_EPOCH_NUMBER="1"
+ENV CI_APT_DEPENDENCIES_EPOCH_NUMBER=${CI_APT_DEPENDENCIES_EPOCH_NUMBER}
+
+RUN apt-get update \
+    && apt-get install --no-install-recommends -y \
+         apt-transport-https ca-certificates wget dirmngr gnupg 
software-properties-common curl gnupg2 \
+    && export APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
+    && curl -sL https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public 
| apt-key add - \
+    && curl -sL https://deb.nodesource.com/setup_10.x | bash - \
+    && add-apt-repository --yes 
https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ \
+    && apt-get update \
+    && apt-get install --no-install-recommends -y \
+      gnupg \
+      graphviz \
+      krb5-user \
+      ldap-utils \
+      less \
+      lsb-release \
+      nodejs \
+      net-tools \
+      adoptopenjdk-8-hotspot \
+      openssh-client \
+      openssh-server \
+      postgresql-client \
+      python-selinux \
+      sqlite3 \
+      tmux \
+      unzip \
+      vim \
+    && apt-get autoremove -yqq --purge \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    ;
+
+ENV HADOOP_DISTRO="cdh" HADOOP_MAJOR="5" HADOOP_DISTRO_VERSION="5.11.0" 
HADOOP_VERSION="2.6.0" \
+    HADOOP_HOME="/tmp/hadoop-cdh"
+ENV HIVE_VERSION="1.1.0" HIVE_HOME="/tmp/hive"
+ENV 
HADOOP_URL="https://archive.cloudera.com/${HADOOP_DISTRO}${HADOOP_MAJOR}/${HADOOP_DISTRO}/${HADOOP_MAJOR}/";
+ENV 
MINICLUSTER_BASE="https://github.com/bolkedebruin/minicluster/releases/download/";
 \
+    MINICLUSTER_HOME="/tmp/minicluster" \
+    MINICLUSTER_VER="1.1"
+
+RUN mkdir -pv "${HADOOP_HOME}" \
+    && mkdir -pv "${HIVE_HOME}" \
+    && mkdir -pv "${MINICLUSTER_HOME}" \
+    && mkdir -pv "/user/hive/warehouse" \
+    && chmod -R 777 "${HIVE_HOME}" \
+    &&chmod -R 777 "/user/"
+
+ENV 
HADOOP_DOWNLOAD_URL="${HADOOP_URL}hadoop-${HADOOP_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz"
 \
+    HADOOP_TMP_FILE="/tmp/hadoop.tar.gz"
+
+RUN curl -sL "${HADOOP_DOWNLOAD_URL}" >"${HADOOP_TMP_FILE}" \
+    && tar xzf "${HADOOP_TMP_FILE}" --absolute-names --strip-components 1 -C 
"${HADOOP_HOME}" \
+    && rm "${HADOOP_TMP_FILE}"
+
+ENV 
HIVE_URL="${HADOOP_URL}hive-${HIVE_VERSION}-${HADOOP_DISTRO}${HADOOP_DISTRO_VERSION}.tar.gz"
 \
+    HIVE_TMP_FILE="/tmp/hive.tar.gz"
+
+RUN curl -sL "${HIVE_URL}" >"${HIVE_TMP_FILE}" \
+    && tar xzf "${HIVE_TMP_FILE}" --strip-components 1 -C "${HIVE_HOME}" \
+    && rm "${HIVE_TMP_FILE}"
+
+ENV 
MINICLUSTER_URL="${MINICLUSTER_BASE}${MINICLUSTER_VER}/minicluster-${MINICLUSTER_VER}-SNAPSHOT-bin.zip"
 \
+    MINICLUSTER_TMP_FILE="/tmp/minicluster.zip"
+
+RUN curl -sL "${MINICLUSTER_URL}" > "${MINICLUSTER_TMP_FILE}" \
+    && unzip "${MINICLUSTER_TMP_FILE}" -d "/tmp" \
+    && rm "${MINICLUSTER_TMP_FILE}"
+
+ENV PATH "${PATH}:/tmp/hive/bin"
+
+RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - \
+    && add-apt-repository "deb [arch=amd64] 
https://download.docker.com/linux/debian stretch stable" \
+    && apt-get update \
+    && apt-get -y install --no-install-recommends docker-ce \
+    && apt-get autoremove -yqq --purge \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-ARG APT_DEPS_IMAGE="airflow-apt-deps-ci-slim"
-ENV APT_DEPS_IMAGE=${APT_DEPS_IMAGE}
 ARG KUBERNETES_VERSION="v1.15.0"
 ENV KUBERNETES_VERSION=${KUBERNETES_VERSION}
 ARG KIND_VERSION="v0.5.0"
 ENV KIND_VERSION=${KIND_VERSION}
 
-RUN echo "${APT_DEPS_IMAGE}"
-
-# Note the ifs below might be removed if Buildkit will become usable. It 
should skip building this
-# image automatically if it is not used. For now we still go through all 
layers below but they are empty
-RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-apt-deps-ci" ]]; then \
-        # Note missing man directories on debian-stretch
-        # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199
-        mkdir -pv /usr/share/man/man1 \
-        && mkdir -pv /usr/share/man/man7 \
-        && apt-get update \
-        && apt-get install --no-install-recommends -y \
-          gnupg \
-          apt-transport-https \
-          ca-certificates \
-          software-properties-common \
-          krb5-user \
-          ldap-utils \
-          less \
-          lsb-release \
-          net-tools \
-          openjdk-8-jdk \
-          openssh-client \
-          openssh-server \
-          postgresql-client \
-          python-selinux \
-          sqlite3 \
-          tmux \
-          unzip \
-          vim \
-        && apt-get autoremove -yqq --purge \
-        && apt-get clean \
-        && rm -rf /var/lib/apt/lists/* \
-        ;\
+RUN curl -Lo kubectl \
+  
"https://storage.googleapis.com/kubernetes-release/release/${KUBERNETES_VERSION}/bin/linux/amd64/kubectl";
 \
+  && chmod +x kubectl \
+  && mv kubectl /usr/local/bin/kubectl
+
+RUN curl -Lo kind \
+   
"https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64";
 \
+   && chmod +x kind \
+   && mv kind /usr/local/bin/kind
+
+ARG AIRFLOW_REPO=apache/airflow
+ENV AIRFLOW_REPO=${AIRFLOW_REPO}
+
+ARG AIRFLOW_BRANCH=master
+ENV AIRFLOW_BRANCH=${AIRFLOW_BRANCH}
+
+# Airflow Extras installed
+ARG AIRFLOW_CI_EXTRAS="all,devel"
+ENV AIRFLOW_CI_EXTRAS=${AIRFLOW_CI_EXTRAS}
+
+RUN echo "Installing with extras: ${AIRFLOW_CI_EXTRAS}."
+
+# Increase the value here to force reinstalling pip dependencies from scratch for the CI build
+# It can also be overwritten manually by setting the build variable.
+ARG PIP_DEPENDENCIES_EPOCH_NUMBER="1"
+ENV PIP_DEPENDENCIES_EPOCH_NUMBER=${PIP_DEPENDENCIES_EPOCH_NUMBER}
+
+ENV PATH="/root/.local/bin:/root:${PATH}"
+
+# In case of CI builds we want to pre-install the master version of airflow dependencies so that
+# we do not have to always reinstall them from scratch and lose time doing so.
+# CI build is optimised for build speed
+RUN pip install --user \
+        
"https://github.com/${AIRFLOW_REPO}/archive/${AIRFLOW_BRANCH}.tar.gz#egg=apache-airflow[${AIRFLOW_CI_EXTRAS}]";
 \
+        && pip uninstall --yes apache-airflow snakebite
+
+ARG AIRFLOW_SOURCES=/opt/airflow
+ENV AIRFLOW_SOURCES=${AIRFLOW_SOURCES}
+
+# Copy all www files here so that we can run npm building for production
+COPY airflow/www/ ${AIRFLOW_SOURCES}/airflow/www/
+
+WORKDIR ${AIRFLOW_SOURCES}/airflow/www
+
+ARG DEBUG_FIXING_PERMISSIONS=false
+
+RUN  \
+    if [[ ${DEBUG_FIXING_PERMISSIONS:=} == "true" ]]; then \
+       find . -type d -exec ls -la {} +; \
     fi
 
-# TODO: We should think about removing those and moving them into 
docker-compose dependencies.
-COPY scripts/ci/docker_build/ci_build_install_deps.sh 
/tmp/ci_build_install_deps.sh
+RUN mkdir -p "${AIRFLOW_SOURCES}/airflow/www/static" \
+    && mkdir -p "${AIRFLOW_SOURCES}/docs/build/_html" \
+    && pushd "${AIRFLOW_SOURCES}/airflow/www/static" || exit \
+    && ln -sf ../../../docs/_build/html docs \
+    && popd || exit
 
-# Kubernetes dependencies
-RUN \
-if [[ "${APT_DEPS_IMAGE}" == "airflow-apt-deps-ci" ]]; then \
-    curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - \
-    && add-apt-repository "deb [arch=amd64] 
https://download.docker.com/linux/debian stretch stable" \
-    && apt-get update \
-    && apt-get -y install --no-install-recommends docker-ce \
-    && apt-get autoremove -yqq --purge \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
-    ;\
-fi
-
-RUN \
-if [[ "${APT_DEPS_IMAGE}" == "airflow-apt-deps-ci" ]]; then \
-    curl -Lo kubectl \
-    
"https://storage.googleapis.com/kubernetes-release/release/${KUBERNETES_VERSION}/bin/linux/amd64/kubectl";
 \
-    && chmod +x kubectl \
-    && mv kubectl /usr/local/bin/kubectl \
-    ;\
-fi
-
-RUN \
-if [[ "${APT_DEPS_IMAGE}" == "airflow-apt-deps-ci" ]]; then \
-    curl -Lo kind \
-    
"https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-amd64";
 \
-    && chmod +x kind \
-    && mv kind /usr/local/bin/kind \
-    ;\
-fi
-
-ENV HADOOP_DISTRO=cdh \
-    HADOOP_MAJOR=5 \
-    HADOOP_DISTRO_VERSION=5.11.0 \
-    HADOOP_VERSION=2.6.0 \
-    HIVE_VERSION=1.1.0
-ENV 
HADOOP_URL=https://archive.cloudera.com/${HADOOP_DISTRO}${HADOOP_MAJOR}/${HADOOP_DISTRO}/${HADOOP_MAJOR}/
-ENV HADOOP_HOME=/tmp/hadoop-cdh HIVE_HOME=/tmp/hive
-
-RUN if [[ "${APT_DEPS_IMAGE}" == "airflow-apt-deps-ci" ]]; then 
/tmp/ci_build_install_deps.sh; fi
+RUN npm ci
 
-ENV PATH "${PATH}:/tmp/hive/bin"
+RUN npm run prod
 
-############################################################################################################
-# This is the target image - it installs PIP and NPM dependencies including 
efficient caching
-# mechanisms - it might be used to build the bare airflow build or CI build
-# Parameters:
-#    APT_DEPS_IMAGE - image with APT dependencies. It might either be base 
deps image with airflow
-#                     dependencies or CI deps image that contains also 
CI-required dependencies
-############################################################################################################
-FROM ${APT_DEPS_IMAGE} as main
+WORKDIR ${AIRFLOW_SOURCES}
 
-SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
+# Airflow sources change frequently but dependency configuration won't change 
that often
+# We copy setup.py and other files needed to perform setup of dependencies
+# So in case setup.py changes we can install latest dependencies required.
+COPY setup.py ${AIRFLOW_SOURCES}/setup.py
+COPY setup.cfg ${AIRFLOW_SOURCES}/setup.cfg
 
-RUN echo "Airflow version: ${AIRFLOW_VERSION}"
+COPY airflow/version.py ${AIRFLOW_SOURCES}/airflow/version.py
+COPY airflow/__init__.py ${AIRFLOW_SOURCES}/airflow/__init__.py
+COPY airflow/bin/airflow ${AIRFLOW_SOURCES}/airflow/bin/airflow
 
-ARG AIRFLOW_USER=airflow
-ENV AIRFLOW_USER=${AIRFLOW_USER}
+# The goal of this line is to install the dependencies from the most current 
setup.py from sources
+# This will usually be a small incremental set of packages in the CI-optimized build, so it will be very fast
+# For the production-optimised build it is the first time dependencies are installed, so it will be slower
+RUN pip install --user -e ".[${AIRFLOW_CI_EXTRAS}]" \
+    && pip uninstall --yes apache-airflow
+
+# Cache for this line will be automatically invalidated if any
+# of airflow sources change
+COPY . ${AIRFLOW_SOURCES}/
+
+# Reinstall airflow again - this time with sources, and remove the sources after installation.
+# It is not perfect because the sources are added as a layer, but it is still better.
+RUN pip install --user -e ".[${AIRFLOW_CI_EXTRAS}]"
 
-ARG HOME=/home/airflow
-ENV HOME=${HOME}
+# Additional python deps to install
+ARG ADDITIONAL_PYTHON_DEPS=""
+
+RUN if [[ -n "${ADDITIONAL_PYTHON_DEPS}" ]]; then \
+        pip install --user ${ADDITIONAL_PYTHON_DEPS}; \
+    fi
 
-ARG AIRFLOW_HOME=${HOME}/airflow
+COPY ./scripts/docker/entrypoint.sh /entrypoint.sh
+
+ARG AIRFLOW_HOME=/root/airflow
 ENV AIRFLOW_HOME=${AIRFLOW_HOME}
 
+RUN mkdir -pv "${AIRFLOW_HOME}" \
+    && mkdir -pv "${AIRFLOW_HOME}/dags" \
+    && mkdir -pv "${AIRFLOW_HOME}/logs"
+
+WORKDIR ${AIRFLOW_SOURCES}
+
+COPY .bash_completion run-tests-complete run-tests /root/
+COPY .bash_completion.d/run-tests-complete 
/root/.bash_completion.d/run-tests-complete
+
+ENV 
PYTHONPATH=${PYTHONPATH}:${AIRFLOW_SOURCES}:${AIRFLOW_SOURCES}/tests/test_utils
+
+RUN "${AIRFLOW_SOURCES}/scripts/ci/docker_build/extract_tests.sh"
+
+ENV AIRFLOW_USER=root
+ENV HOME=/root
+
+EXPOSE 8080
+
+ENTRYPOINT ["/root/.local/bin/dumb-init", "--", "/entrypoint.sh"]
+
+CMD ["--help"]
+
+############################################################################################################
+# This is a separate stage for packaging WWW files with npm, so that no node is needed for the production image
+############################################################################################################
+FROM ${NODE_BASE_IMAGE} as airflow-www
+
+SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
+
 ARG AIRFLOW_SOURCES=/opt/airflow
 ENV AIRFLOW_SOURCES=${AIRFLOW_SOURCES}
 
-WORKDIR ${AIRFLOW_SOURCES}
+COPY airflow/www/ ${AIRFLOW_SOURCES}/airflow/www/
 
-RUN mkdir -pv ${AIRFLOW_HOME} \
-    mkdir -pv ${AIRFLOW_HOME}/dags \
-    mkdir -pv ${AIRFLOW_HOME}/logs \
-    && chown -R ${AIRFLOW_USER}.${AIRFLOW_USER} ${AIRFLOW_HOME}
+WORKDIR ${AIRFLOW_SOURCES}/airflow/www
 
-# Increase the value here to force reinstalling Apache Airflow pip dependencies
-ARG PIP_DEPENDENCIES_EPOCH_NUMBER="1"
-ENV PIP_DEPENDENCIES_EPOCH_NUMBER=${PIP_DEPENDENCIES_EPOCH_NUMBER}
+RUN npm ci
 
-# Optimizing installation of Cassandra driver
-# Speeds up building the image - cassandra driver without CYTHON saves around 
10 minutes
-ARG CASS_DRIVER_NO_CYTHON="1"
-# Build cassandra driver on multiple CPUs
-ARG CASS_DRIVER_BUILD_CONCURRENCY="8"
+RUN mkdir -p "${AIRFLOW_SOURCES}/airflow/www/static" \
+    && mkdir -p "${AIRFLOW_SOURCES}/docs/build/_html" \
+    && pushd "${AIRFLOW_SOURCES}/airflow/www/static" || exit \
+    && ln -sf ../../../docs/_build/html docs \
 
 Review comment:
   (not that it hurts, but we don't need to do this for the production image do 
we?)

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to