This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v2-0-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 76415068219c9247845374e3388ea13f70918dad Author: Kamil Breguła <[email protected]> AuthorDate: Sun Mar 21 09:08:10 2021 +0100 Create a documentation package for Docker image (#14846) (cherry picked from commit a18cbc4e91b86c7b32589b3b00fa71cceceb755d) --- docs/apache-airflow/installation.rst | 2 +- docs/apache-airflow/production-deployment.rst | 847 +-------------------- docs/apache-airflow/start/docker.rst | 2 +- docs/build_docs.py | 12 +- docs/conf.py | 3 + docs/docker-stack/build.rst | 511 +++++++++++++ .../docker-images-recipes/gcloud.Dockerfile | 0 .../docker-images-recipes/hadoop.Dockerfile | 0 docs/docker-stack/entrypoint.rst | 201 +++++ docs/docker-stack/img/docker-logo.png | Bin 0 -> 50112 bytes docs/docker-stack/index.rst | 54 ++ docs/docker-stack/recipes.rst | 70 ++ docs/exts/airflow_intersphinx.py | 13 +- .../exts/docs_build/dev_index_template.html.jinja2 | 11 + docs/exts/docs_build/docs_builder.py | 11 +- docs/exts/docs_build/fetch_inventories.py | 51 +- 16 files changed, 915 insertions(+), 873 deletions(-) diff --git a/docs/apache-airflow/installation.rst b/docs/apache-airflow/installation.rst index eac6894..0184216 100644 --- a/docs/apache-airflow/installation.rst +++ b/docs/apache-airflow/installation.rst @@ -27,7 +27,7 @@ installation with other tools as well. .. note:: - Airflow is also distributed as a Docker image (OCI Image). For more information, see: :ref:`docker_image` + Airflow is also distributed as a Docker image (OCI Image). Consider using it to guarantee that software will always run the same no matter where it is deployed. For more information, see: :doc:`docker-stack:index`. Prerequisites ''''''''''''' diff --git a/docs/apache-airflow/production-deployment.rst b/docs/apache-airflow/production-deployment.rst index 4fb693d..ecc6077 100644 --- a/docs/apache-airflow/production-deployment.rst +++ b/docs/apache-airflow/production-deployment.rst @@ -118,852 +118,7 @@ To mitigate these issues, make sure you have a :doc:`health check </logging-moni Production Container Images =========================== -Production-ready reference Image --------------------------------- - -For the ease of deployment in production, the community releases a production-ready reference container -image. - -The docker image provided (as convenience binary package) in the -`Apache Airflow DockerHub <https://hub.docker.com/r/apache/airflow>`_ is a bare image -that has a few external dependencies and extras installed.. - -The Apache Airflow image provided as convenience package is optimized for size, so -it provides just a bare minimal set of the extras and dependencies installed and in most cases -you want to either extend or customize the image. You can see all possible extras in -:doc:`extra-packages-ref`. The set of extras used in Airflow Production image are available in the -`Dockerfile <https://github.com/apache/airflow/blob/2c6c7fdb2308de98e142618836bdf414df9768c8/Dockerfile#L39>`_. - -The production images are build in DockerHub from released version and release candidates. There -are also images published from branches but they are used mainly for development and testing purpose. -See `Airflow Git Branching <https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#airflow-git-branches>`_ -for details. 
- - -Customizing or extending the Production Image ---------------------------------------------- - -Before you dive-deeply in the way how the Airflow Image is build, named and why we are doing it the -way we do, you might want to know very quickly how you can extend or customize the existing image -for Apache Airflow. This chapter gives you a short answer to those questions. - -Airflow Summit 2020's `Production Docker Image <https://youtu.be/wDr3Y7q2XoI>`_ talk provides more -details about the context, architecture and customization/extension methods for the Production Image. - -Extending the image -................... - -Extending the image is easiest if you just need to add some dependencies that do not require -compiling. The compilation framework of Linux (so called ``build-essential``) is pretty big, and -for the production images, size is really important factor to optimize for, so our Production Image -does not contain ``build-essential``. If you need compiler like gcc or g++ or make/cmake etc. - those -are not found in the image and it is recommended that you follow the "customize" route instead. - -How to extend the image - it is something you are most likely familiar with - simply -build a new image using Dockerfile's ``FROM`` directive and add whatever you need. Then you can add your -Debian dependencies with ``apt`` or PyPI dependencies with ``pip install`` or any other stuff you need. - -You should be aware, about a few things: - -* The production image of airflow uses "airflow" user, so if you want to add some of the tools - as ``root`` user, you need to switch to it with ``USER`` directive of the Dockerfile. Also you - should remember about following the - `best practises of Dockerfiles <https://docs.docker.com/develop/develop-images/dockerfile_best-practices/>`_ - to make sure your image is lean and small. - -.. code-block:: dockerfile - - FROM apache/airflow:2.0.0 - USER root - RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - my-awesome-apt-dependency-to-add \ - && apt-get autoremove -yqq --purge \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - USER airflow - - -* PyPI dependencies in Apache Airflow are installed in the user library, of the "airflow" user, so - you need to install them with the ``--user`` flag and WITHOUT switching to airflow user. Note also - that using --no-cache-dir is a good idea that can help to make your image smaller. - -.. code-block:: dockerfile - - FROM apache/airflow:2.0.0 - RUN pip install --no-cache-dir --user my-awesome-pip-dependency-to-add - -* As of 2.0.1 image the ``--user`` flag is turned on by default by setting ``PIP_USER`` environment variable - to ``true``. This can be disabled by un-setting the variable or by setting it to ``false``. - - -* If your apt, or PyPI dependencies require some of the build-essentials, then your best choice is - to follow the "Customize the image" route. However it requires to checkout sources of Apache Airflow, - so you might still want to choose to add build essentials to your image, even if your image will - be significantly bigger. - -.. 
code-block:: dockerfile - - FROM apache/airflow:2.0.0 - USER root - RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - build-essential my-awesome-apt-dependency-to-add \ - && apt-get autoremove -yqq --purge \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - USER airflow - RUN pip install --no-cache-dir --user my-awesome-pip-dependency-to-add - - -* You can also embed your dags in the image by simply adding them with COPY directive of Airflow. - The DAGs in production image are in /opt/airflow/dags folder. - -Customizing the image -..................... - -Customizing the image is an alternative way of adding your own dependencies to the image - better -suited to prepare optimized production images. - -The advantage of this method is that it produces optimized image even if you need some compile-time -dependencies that are not needed in the final image. You need to use Airflow Sources to build such images -from the `official distribution folder of Apache Airflow <https://downloads.apache.org/airflow/>`_ for the -released versions, or checked out from the GitHub project if you happen to do it from git sources. - -The easiest way to build the image is to use ``breeze`` script, but you can also build such customized -image by running appropriately crafted docker build in which you specify all the ``build-args`` -that you need to add to customize it. You can read about all the args and ways you can build the image -in the `<#production-image-build-arguments>`_ chapter below. - -Here just a few examples are presented which should give you general understanding of what you can customize. - -This builds the production image in version 3.7 with additional airflow extras from 2.0.0 PyPI package and -additional apt dev and runtime dependencies. - -.. code-block:: bash - - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" \ - --build-arg ADDITIONAL_AIRFLOW_EXTRAS="jdbc" \ - --build-arg ADDITIONAL_PYTHON_DEPS="pandas" \ - --build-arg ADDITIONAL_DEV_APT_DEPS="gcc g++" \ - --build-arg ADDITIONAL_RUNTIME_APT_DEPS="default-jre-headless" \ - --tag my-image - - -the same image can be built using ``breeze`` (it supports auto-completion of the options): - -.. code-block:: bash - - ./breeze build-image \ - --production-image --python 3.7 --install-airflow-version=2.0.0 \ - --additional-extras=jdbc --additional-python-deps="pandas" \ - --additional-dev-apt-deps="gcc g++" --additional-runtime-apt-deps="default-jre-headless" - - -You can customize more aspects of the image - such as additional commands executed before apt dependencies -are installed, or adding extra sources to install your dependencies from. You can see all the arguments -described below but here is an example of rather complex command to customize the image -based on example in `this comment <https://github.com/apache/airflow/issues/8605#issuecomment-690065621>`_: - -.. code-block:: bash - - docker build . 
-f Dockerfile \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" \ - --build-arg ADDITIONAL_AIRFLOW_EXTRAS="slack" \ - --build-arg ADDITIONAL_PYTHON_DEPS=" \ - apache-airflow-providers-odbc \ - azure-storage-blob \ - sshtunnel \ - google-api-python-client \ - oauth2client \ - beautifulsoup4 \ - dateparser \ - rocketchat_API \ - typeform" \ - --build-arg ADDITIONAL_DEV_APT_DEPS="msodbcsql17 unixodbc-dev g++" \ - --build-arg ADDITIONAL_DEV_APT_COMMAND="curl https://packages.microsoft.com/keys/microsoft.asc | \ - apt-key add --no-tty - && \ - curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list" \ - --build-arg ADDITIONAL_DEV_ENV_VARS="ACCEPT_EULA=Y" \ - --build-arg ADDITIONAL_RUNTIME_APT_COMMAND="curl https://packages.microsoft.com/keys/microsoft.asc | \ - apt-key add --no-tty - && \ - curl https://packages.microsoft.com/config/debian/10/prod.list > /etc/apt/sources.list.d/mssql-release.list" \ - --build-arg ADDITIONAL_RUNTIME_APT_DEPS="msodbcsql17 unixodbc git procps vim" \ - --build-arg ADDITIONAL_RUNTIME_ENV_VARS="ACCEPT_EULA=Y" \ - --tag my-image - -Customizing images in high security restricted environments -........................................................... - -You can also make sure your image is only build using local constraint file and locally downloaded -wheel files. This is often useful in Enterprise environments where the binary files are verified and -vetted by the security teams. - -This builds below builds the production image in version 3.7 with packages and constraints used from the local -``docker-context-files`` rather than installed from PyPI or GitHub. It also disables MySQL client -installation as it is using external installation method. - -Note that as a prerequisite - you need to have downloaded wheel files. In the example below we -first download such constraint file locally and then use ``pip download`` to get the .whl files needed -but in most likely scenario, those wheel files should be copied from an internal repository of such .whl -files. Note that ``AIRFLOW_VERSION_SPECIFICATION`` is only there for reference, the apache airflow .whl file -in the right version is part of the .whl files downloaded. - -Note that 'pip download' will only works on Linux host as some of the packages need to be compiled from -sources and you cannot install them providing ``--platform`` switch. They also need to be downloaded using -the same python version as the target image. - -The ``pip download`` might happen in a separate environment. The files can be committed to a separate -binary repository and vetted/verified by the security team and used subsequently to build images -of Airflow when needed on an air-gaped system. - -Preparing the constraint files and wheel files: - -.. 
code-block:: bash - - rm docker-context-files/*.whl docker-context-files/*.txt - - curl -Lo "docker-context-files/constraints-2-0.txt" \ - https://raw.githubusercontent.com/apache/airflow/constraints-2-0/constraints-3.7.txt - - pip download --dest docker-context-files \ - --constraint docker-context-files/constraints-2-0.txt \ - apache-airflow[async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,mysql,postgres,redis,slack,ssh,statsd,virtualenv]==2.0.0 - -Since apache-airflow .whl packages are treated differently by the docker image, you need to rename the -downloaded apache-airflow* files, for example: - -.. code-block:: bash - - pushd docker-context-files - for file in apache?airflow* - do - mv ${file} _${file} - done - popd - -Building the image: - -.. code-block:: bash - - ./breeze build-image \ - --production-image --python 3.7 --install-airflow-version=2.0.0 \ - --disable-mysql-client-installation --disable-pip-cache --install-from-local-files-when-building \ - --constraints-location="/docker-context-files/constraints-2-0.txt" - -or - -.. code-block:: bash - - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" \ - --build-arg INSTALL_MYSQL_CLIENT="false" \ - --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" \ - --build-arg INSTALL_FROM_DOCKER_CONTEXT_FILES="true" \ - --build-arg AIRFLOW_CONSTRAINTS_LOCATION="/docker-context-files/constraints-2-0.txt" - - -Customizing & extending the image together -.......................................... - -You can combine both - customizing & extending the image. You can build the image first using -``customize`` method (either with docker command or with ``breeze`` and then you can ``extend`` -the resulting image using ``FROM`` any dependencies you want. - -Customizing PYPI installation -............................. - -You can customize PYPI sources used during image build by adding a docker-context-files/.pypirc file -This .pypirc will never be committed to the repository and will not be present in the final production image. -It is added and used only in the build segment of the image so it is never copied to the final image. - -External sources for dependencies ---------------------------------- - -In corporate environments, there is often the need to build your Container images using -other than default sources of dependencies. The docker file uses standard sources (such as -Debian apt repositories or PyPI repository. However, in corporate environments, the dependencies -are often only possible to be installed from internal, vetted repositories that are reviewed and -approved by the internal security teams. In those cases, you might need to use those different -sources. - -This is rather easy if you extend the image - you simply write your extension commands -using the right sources - either by adding/replacing the sources in apt configuration or -specifying the source repository in pip install command. - -It's a bit more involved in the case of customizing the image. We do not have yet (but we are working -on it) a capability of changing the sources via build args. 
However, since the builds use -Dockerfile that is a source file, you can rather easily simply modify the file manually and -specify different sources to be used by either of the commands. - - -Comparing extending and customizing the image ---------------------------------------------- - -Here is the comparison of the two types of building images. - -+----------------------------------------------------+---------------------+-----------------------+ -| | Extending the image | Customizing the image | -+====================================================+=====================+=======================+ -| Produces optimized image | No | Yes | -+----------------------------------------------------+---------------------+-----------------------+ -| Use Airflow Dockerfile sources to build the image | No | Yes | -+----------------------------------------------------+---------------------+-----------------------+ -| Requires Airflow sources | No | Yes | -+----------------------------------------------------+---------------------+-----------------------+ -| You can build it with Breeze | No | Yes | -+----------------------------------------------------+---------------------+-----------------------+ -| Allows to use non-default sources for dependencies | Yes | No [1] | -+----------------------------------------------------+---------------------+-----------------------+ - -[1] When you combine customizing and extending the image, you can use external sources -in the "extend" part. There are plans to add functionality to add external sources -option to image customization. You can also modify Dockerfile manually if you want to -use non-default sources for dependencies. - -Using the production image --------------------------- - -The PROD image entrypoint works as follows: - -* In case the user is not "airflow" (with undefined user id) and the group id of the user is set to 0 (root), - then the user is dynamically added to /etc/passwd at entry using USER_NAME variable to define the user name. - This is in order to accommodate the - `OpenShift Guidelines <https://docs.openshift.com/enterprise/3.0/creating_images/guidelines.html>`_ - -* The ``AIRFLOW_HOME`` is set by default to ``/opt/airflow/`` - this means that DAGs - are in default in the ``/opt/airflow/dags`` folder and logs are in the ``/opt/airflow/logs`` - -* The working directory is ``/opt/airflow`` by default. - -* If ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is passed to the container and it is either mysql or postgres - SQL alchemy connection, then the connection is checked and the script waits until the database is reachable. - If ``AIRFLOW__CORE__SQL_ALCHEMY_CONN_CMD`` variable is passed to the container, it is evaluated as a - command to execute and result of this evaluation is used as ``AIRFLOW__CORE__SQL_ALCHEMY_CONN``. The - ``_CMD`` variable takes precedence over the ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable. - -* If no ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is set then SQLite database is created in - ${AIRFLOW_HOME}/airflow.db and db reset is executed. - -* If first argument equals to "bash" - you are dropped to a bash shell or you can executes bash command - if you specify extra arguments. For example: - -.. code-block:: bash - - docker run -it apache/airflow:master-python3.6 bash -c "ls -la" - total 16 - drwxr-xr-x 4 airflow root 4096 Jun 5 18:12 . - drwxr-xr-x 1 root root 4096 Jun 5 18:12 .. 
- drwxr-xr-x 2 airflow root 4096 Jun 5 18:12 dags - drwxr-xr-x 2 airflow root 4096 Jun 5 18:12 logs - -* If first argument is equal to "python" - you are dropped in python shell or python commands are executed if - you pass extra parameters. For example: - -.. code-block:: bash - - > docker run -it apache/airflow:master-python3.6 python -c "print('test')" - test - -* If first argument equals to "airflow" - the rest of the arguments is treated as an airflow command - to execute. Example: - -.. code-block:: bash - - docker run -it apache/airflow:master-python3.6 airflow webserver - -* If there are any other arguments - they are simply passed to the "airflow" command - -.. code-block:: bash - - > docker run -it apache/airflow:master-python3.6 version - 2.0.0.dev0 - -* If ``AIRFLOW__CELERY__BROKER_URL`` variable is passed and airflow command with - scheduler, worker of flower command is used, then the script checks the broker connection - and waits until the Celery broker database is reachable. - If ``AIRFLOW__CELERY__BROKER_URL_CMD`` variable is passed to the container, it is evaluated as a - command to execute and result of this evaluation is used as ``AIRFLOW__CELERY__BROKER_URL``. The - ``_CMD`` variable takes precedence over the ``AIRFLOW__CELERY__BROKER_URL`` variable. - -Production image build arguments --------------------------------- - -The following build arguments (``--build-arg`` in docker build command) can be used for production images: - -+------------------------------------------+------------------------------------------+------------------------------------------+ -| Build argument | Default value | Description | -+==========================================+==========================================+==========================================+ -| ``PYTHON_BASE_IMAGE`` | ``python:3.6-slim-buster`` | Base python image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``PYTHON_MAJOR_MINOR_VERSION`` | ``3.6`` | major/minor version of Python (should | -| | | match base image). | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_VERSION`` | ``2.0.0.dev0`` | version of Airflow. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_REPO`` | ``apache/airflow`` | the repository from which PIP | -| | | dependencies are pre-installed. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_BRANCH`` | ``master`` | the branch from which PIP dependencies | -| | | are pre-installed initially. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_CONSTRAINTS_LOCATION`` | | If not empty, it will override the | -| | | source of the constraints with the | -| | | specified URL or file. Note that the | -| | | file has to be in docker context so | -| | | it's best to place such file in | -| | | one of the folders included in | -| | | .dockerignore. 
| -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_CONSTRAINTS_REFERENCE`` | ``constraints-master`` | Reference (branch or tag) from GitHub | -| | | where constraints file is taken from | -| | | It can be ``constraints-master`` but | -| | | also can be ``constraints-1-10`` for | -| | | 1.10.* installation. In case of building | -| | | specific version you want to point it | -| | | to specific tag, for example | -| | | ``constraints-1.10.15``. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``INSTALL_PROVIDERS_FROM_SOURCES`` | ``false`` | If set to ``true`` and image is built | -| | | from sources, all provider packages are | -| | | installed from sources rather than from | -| | | packages. It has no effect when | -| | | installing from PyPI or GitHub repo. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_EXTRAS`` | (see Dockerfile) | Default extras with which airflow is | -| | | installed. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``INSTALL_FROM_PYPI`` | ``true`` | If set to true, Airflow is installed | -| | | from PyPI. if you want to install | -| | | Airflow from self-build package | -| | | you can set it to false, put package in | -| | | ``docker-context-files`` and set | -| | | ``INSTALL_FROM_DOCKER_CONTEXT_FILES`` to | -| | | ``true``. For this you have to also keep | -| | | ``AIRFLOW_PRE_CACHED_PIP_PACKAGES`` flag | -| | | set to ``false``. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_PRE_CACHED_PIP_PACKAGES`` | ``false`` | Allows to pre-cache airflow PIP packages | -| | | from the GitHub of Apache Airflow | -| | | This allows to optimize iterations for | -| | | Image builds and speeds up CI builds. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``INSTALL_FROM_DOCKER_CONTEXT_FILES`` | ``false`` | If set to true, Airflow, providers and | -| | | all dependencies are installed from | -| | | from locally built/downloaded | -| | | .whl and .tar.gz files placed in the | -| | | ``docker-context-files``. In certain | -| | | corporate environments, this is required | -| | | to install airflow from such pre-vetted | -| | | packages rather than from PyPI. For this | -| | | to work, also set ``INSTALL_FROM_PYPI``. | -| | | Note that packages starting with | -| | | ``apache?airflow`` glob are treated | -| | | differently than other packages. All | -| | | ``apache?airflow`` packages are | -| | | installed with dependencies limited by | -| | | airflow constraints. All other packages | -| | | are installed without dependencies | -| | | 'as-is'. If you wish to install airflow | -| | | via 'pip download' with all dependencies | -| | | downloaded, you have to rename the | -| | | apache airflow and provider packages to | -| | | not start with ``apache?airflow`` glob. 
| -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``UPGRADE_TO_NEWER_DEPENDENCIES`` | ``false`` | If set to true, the dependencies are | -| | | upgraded to newer versions matching | -| | | setup.py before installation. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``CONTINUE_ON_PIP_CHECK_FAILURE`` | ``false`` | By default the image build fails if pip | -| | | check fails for it. This is good for | -| | | interactive building but on CI the | -| | | image should be built regardless - we | -| | | have a separate step to verify image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_AIRFLOW_EXTRAS`` | | Optional additional extras with which | -| | | airflow is installed. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_PYTHON_DEPS`` | | Optional python packages to extend | -| | | the image with some extra dependencies. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``DEV_APT_COMMAND`` | (see Dockerfile) | Dev apt command executed before dev deps | -| | | are installed in the Build image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_DEV_APT_COMMAND`` | | Additional Dev apt command executed | -| | | before dev dep are installed | -| | | in the Build image. Should start with | -| | | ``&&``. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``DEV_APT_DEPS`` | (see Dockerfile) | Dev APT dependencies installed | -| | | in the Build image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_DEV_APT_DEPS`` | | Additional apt dev dependencies | -| | | installed in the Build image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_DEV_APT_ENV`` | | Additional env variables defined | -| | | when installing dev deps. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``RUNTIME_APT_COMMAND`` | (see Dockerfile) | Runtime apt command executed before deps | -| | | are installed in the Main image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_RUNTIME_APT_COMMAND`` | | Additional Runtime apt command executed | -| | | before runtime dep are installed | -| | | in the Main image. Should start with | -| | | ``&&``. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``RUNTIME_APT_DEPS`` | (see Dockerfile) | Runtime APT dependencies installed | -| | | in the Main image. 
| -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_RUNTIME_APT_DEPS`` | | Additional apt runtime dependencies | -| | | installed in the Main image. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``ADDITIONAL_RUNTIME_APT_ENV`` | | Additional env variables defined | -| | | when installing runtime deps. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_HOME`` | ``/opt/airflow`` | Airflow’s HOME (that’s where logs and | -| | | SQLite databases are stored). | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_UID`` | ``50000`` | Airflow user UID. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_GID`` | ``50000`` | Airflow group GID. Note that most files | -| | | created on behalf of airflow user belong | -| | | to the ``root`` group (0) to keep | -| | | OpenShift Guidelines compatibility. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_USER_HOME_DIR`` | ``/home/airflow`` | Home directory of the Airflow user. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``CASS_DRIVER_BUILD_CONCURRENCY`` | ``8`` | Number of processors to use for | -| | | cassandra PIP install (speeds up | -| | | installing in case cassandra extra is | -| | | used). | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``INSTALL_MYSQL_CLIENT`` | ``true`` | Whether MySQL client should be installed | -| | | The mysql extra is removed from extras | -| | | if the client is not installed. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_PIP_VERSION`` | ``20.2.4`` | PIP version used. | -+------------------------------------------+------------------------------------------+------------------------------------------+ -| ``PIP_PROGRESS_BAR`` | ``on`` | Progress bar for PIP installation | -+------------------------------------------+------------------------------------------+------------------------------------------+ - -There are build arguments that determine the installation mechanism of Apache Airflow for the -production image. There are three types of build: - -* From local sources (by default for example when you use ``docker build .``) -* You can build the image from released PyPI airflow package (used to build the official Docker image) -* You can build the image from any version in GitHub repository(this is used mostly for system testing). - -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| Build argument | Default | What to specify | -+===================================+========================+===================================================================================+ -| ``AIRFLOW_INSTALLATION_METHOD`` | ``apache-airflow`` | Should point to the installation method of Apache Airflow. 
It can be | -| | | ``apache-airflow`` for installation from packages and URL to installation from | -| | | GitHub repository tag or branch or "." to install from sources. | -| | | Note that installing from local sources requires appropriate values of the | -| | | ``AIRFLOW_SOURCES_FROM`` and ``AIRFLOW_SOURCES_TO`` variables as described below. | -| | | Only used when ``INSTALL_FROM_PYPI`` is set to ``true``. | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| ``AIRFLOW_VERSION_SPECIFICATION`` | | Optional - might be used for package installation of different Airflow version | -| | | for example"==2.0.0". For consistency, you should also set``AIRFLOW_VERSION`` | -| | | to the same value AIRFLOW_VERSION is resolved as label in the image created. | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| ``AIRFLOW_CONSTRAINTS_REFERENCE`` | ``constraints-master`` | Reference (branch or tag) from GitHub where constraints file is taken from. | -| | | It can be ``constraints-master`` but also can be``constraints-1-10`` for | -| | | 1.10.* installations. In case of building specific version | -| | | you want to point it to specific tag, for example ``constraints-2.0.0`` | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| ``AIRFLOW_WWW`` | ``www`` | In case of Airflow 2.0 it should be "www", in case of Airflow 1.10 | -| | | series it should be "www_rbac". | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| ``AIRFLOW_SOURCES_FROM`` | ``empty`` | Sources of Airflow. Set it to "." when you install airflow from | -| | | local sources. | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ -| ``AIRFLOW_SOURCES_TO`` | ``/empty`` | Target for Airflow sources. Set to "/opt/airflow" when | -| | | you want to install airflow from local sources. | -+-----------------------------------+------------------------+-----------------------------------------------------------------------------------+ - -This builds production image in version 3.6 with default extras from the local sources (master version -of 2.0 currently): - -.. code-block:: bash - - docker build . - -This builds the production image in version 3.7 with default extras from 2.0.0 tag and -constraints taken from constraints-2-0 branch in GitHub. - -.. code-block:: bash - - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="https://github.com/apache/airflow/archive/2.0.0.tar.gz#egg=apache-airflow" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-0" \ - --build-arg AIRFLOW_BRANCH="v1-10-test" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" - -This builds the production image in version 3.7 with default extras from 2.0.0 PyPI package and -constraints taken from 2.0.0 tag in GitHub and pre-installed pip dependencies from the top -of v1-10-test branch. - -.. code-block:: bash - - docker build . 
\ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_BRANCH="v1-10-test" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2.0.0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" - -This builds the production image in version 3.7 with additional airflow extras from 2.0.0 PyPI package and -additional python dependencies and pre-installed pip dependencies from 2.0.0 tagged constraints. - -.. code-block:: bash - - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_BRANCH="v1-10-test" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2.0.0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" \ - --build-arg ADDITIONAL_AIRFLOW_EXTRAS="mssql,hdfs" \ - --build-arg ADDITIONAL_PYTHON_DEPS="sshtunnel oauth2client" - -This builds the production image in version 3.7 with additional airflow extras from 2.0.0 PyPI package and -additional apt dev and runtime dependencies. - -.. code-block:: bash - - docker build . \ - --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg PYTHON_MAJOR_MINOR_VERSION=3.7 \ - --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.0.0" \ - --build-arg AIRFLOW_VERSION_SPECIFICATION="==2.0.0" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-0" \ - --build-arg AIRFLOW_SOURCES_FROM="empty" \ - --build-arg AIRFLOW_SOURCES_TO="/empty" \ - --build-arg ADDITIONAL_AIRFLOW_EXTRAS="jdbc" \ - --build-arg ADDITIONAL_DEV_APT_DEPS="gcc g++" \ - --build-arg ADDITIONAL_RUNTIME_APT_DEPS="default-jre-headless" - - -Actions executed at image start -------------------------------- - -If you are using the default entrypoint of the production image, -there are a few actions that are automatically performed when the container starts. -In some cases, you can pass environment variables to the image to trigger some of that behaviour. - -The variables that control the "execution" behaviour start with ``_AIRFLOW`` to distinguish them -from the variables used to build the image starting with ``AIRFLOW``. - -Creating system user -.................... - -Airflow image is Open-Shift compatible, which means that you can start it with random user ID and group id 0. -Airflow will automatically create such a user and make it's home directory point to ``/home/airflow``. -You can read more about it in the "Support arbitrary user ids" chapter in the -`Openshift best practices <https://docs.openshift.com/container-platform/4.1/openshift_images/create-images.html#images-create-guide-openshift_create-images>`_. - -Waits for Airflow DB connection -............................... - -In case Postgres or MySQL DB is used, the entrypoint will wait until the airflow DB connection becomes -available. This happens always when you use the default entrypoint. - -The script detects backend type depending on the URL schema and assigns default port numbers if not specified -in the URL. 
Then it loops until the connection to the host/port specified can be established -It tries ``CONNECTION_CHECK_MAX_COUNT`` times and sleeps ``CONNECTION_CHECK_SLEEP_TIME`` between checks -To disable check, set ``CONNECTION_CHECK_MAX_COUNT=0``. - -Supported schemes: - -* ``postgres://`` - default port 5432 -* ``mysql://`` - default port 3306 -* ``sqlite://`` - -In case of SQLite backend, there is no connection to establish and waiting is skipped. - -Upgrading Airflow DB -.................... - -If you set ``_AIRFLOW_DB_UPGRADE`` variable to a non-empty value, the entrypoint will run -the ``airflow db upgrade`` command right after verifying the connection. You can also use this -when you are running airflow with internal SQLite database (default) to upgrade the db and create -admin users at entrypoint, so that you can start the webserver immediately. Note - using SQLite is -intended only for testing purpose, never use SQLite in production as it has severe limitations when it -comes to concurrency. - - -Creating admin user -................... - -The entrypoint can also create webserver user automatically when you enter it. you need to set -``_AIRFLOW_WWW_USER_CREATE`` to a non-empty value in order to do that. This is not intended for -production, it is only useful if you would like to run a quick test with the production image. -You need to pass at least password to create such user via ``_AIRFLOW_WWW_USER_PASSWORD_CMD`` or -``_AIRFLOW_WWW_USER_PASSWORD_CMD`` similarly like for other ``*_CMD`` variables, the content of -the ``*_CMD`` will be evaluated as shell command and it's output will be set ass password. - -User creation will fail if none of the ``PASSWORD`` variables are set - there is no default for -password for security reasons. - -+-----------+--------------------------+----------------------------------------------------------------------+ -| Parameter | Default | Environment variable | -+===========+==========================+======================================================================+ -| username | admin | ``_AIRFLOW_WWW_USER_USERNAME`` | -+-----------+--------------------------+----------------------------------------------------------------------+ -| password | | ``_AIRFLOW_WWW_USER_PASSWORD_CMD`` or ``_AIRFLOW_WWW_USER_PASSWORD`` | -+-----------+--------------------------+----------------------------------------------------------------------+ -| firstname | Airflow | ``_AIRFLOW_WWW_USER_FIRSTNAME`` | -+-----------+--------------------------+----------------------------------------------------------------------+ -| lastname | Admin | ``_AIRFLOW_WWW_USER_LASTNAME`` | -+-----------+--------------------------+----------------------------------------------------------------------+ -| email | [email protected] | ``_AIRFLOW_WWW_USER_EMAIL`` | -+-----------+--------------------------+----------------------------------------------------------------------+ -| role | Admin | ``_AIRFLOW_WWW_USER_ROLE`` | -+-----------+--------------------------+----------------------------------------------------------------------+ - -In case the password is specified, the user will be attempted to be created, but the entrypoint will -not fail if the attempt fails (this accounts for the case that the user is already created). - -You can, for example start the webserver in the production image with initializing the internal SQLite -database and creating an ``admin/admin`` Admin user with the following command: - -.. 
code-block:: bash - - docker run -it -p 8080:8080 \ - --env "_AIRFLOW_DB_UPGRADE=true" \ - --env "_AIRFLOW_WWW_USER_CREATE=true" \ - --env "_AIRFLOW_WWW_USER_PASSWORD=admin" \ - apache/airflow:master-python3.8 webserver - - -.. code-block:: bash - - docker run -it -p 8080:8080 \ - --env "_AIRFLOW_DB_UPGRADE=true" \ - --env "_AIRFLOW_WWW_USER_CREATE=true" \ - --env "_AIRFLOW_WWW_USER_PASSWORD_CMD=echo admin" \ - apache/airflow:master-python3.8 webserver - -The commands above perform initialization of the SQLite database, create admin user with admin password -and Admin role. They also forward local port ``8080`` to the webserver port and finally start the webserver. - - -Waits for celery broker connection -.................................. - -In case Postgres or MySQL DB is used, and one of the ``scheduler``, ``celery``, ``worker``, or ``flower`` -commands are used the entrypoint will wait until the celery broker DB connection is available. - -The script detects backend type depending on the URL schema and assigns default port numbers if not specified -in the URL. Then it loops until connection to the host/port specified can be established -It tries ``CONNECTION_CHECK_MAX_COUNT`` times and sleeps ``CONNECTION_CHECK_SLEEP_TIME`` between checks. -To disable check, set ``CONNECTION_CHECK_MAX_COUNT=0``. - -Supported schemes: - -* ``amqp(s)://`` (rabbitmq) - default port 5672 -* ``redis://`` - default port 6379 -* ``postgres://`` - default port 5432 -* ``mysql://`` - default port 3306 -* ``sqlite://`` - -In case of SQLite backend, there is no connection to establish and waiting is skipped. - - -Recipes -------- - -Users sometimes share interesting ways of using the Docker images. We encourage users to contribute these -recipes to the documentation in case they prove useful to other members of the community by -submitting a pull request. The sections below capture this knowledge. - -Google Cloud SDK installation -............................. - -Some operators, such as :class:`airflow.providers.google.cloud.operators.kubernetes_engine.GKEStartPodOperator`, -:class:`airflow.providers.google.cloud.operators.dataflow.DataflowStartSqlJobOperator`, require -the installation of `Google Cloud SDK <https://cloud.google.com/sdk>`__ (includes ``gcloud``). -You can also run these commands with BashOperator. - -Create a new Dockerfile like the one shown below. - -.. exampleinclude:: /docker-images-recipes/gcloud.Dockerfile - :language: dockerfile - -Then build a new image. - -.. code-block:: bash - - docker build . \ - --build-arg BASE_AIRFLOW_IMAGE="apache/airflow:2.0.0" \ - -t my-airflow-image - - -Apache Hadoop Stack installation -................................ - -Airflow is often used to run tasks on Hadoop cluster. It required Java Runtime Environment (JRE) to run. -Below are the steps to take tools that are frequently used in Hadoop-world: - -- Java Runtime Environment (JRE) -- Apache Hadoop -- Apache Hive -- `Cloud Storage connector for Apache Hadoop <https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage>`__ - - -Create a new Dockerfile like the one shown below. - -.. exampleinclude:: /docker-images-recipes/hadoop.Dockerfile - :language: dockerfile - -Then build a new image. - -.. code-block:: bash - - docker build . 
\ - --build-arg BASE_AIRFLOW_IMAGE="apache/airflow:2.0.0" \ - -t my-airflow-image - -More details about the images ------------------------------ - -You can read more details about the images - the context, their parameters and internal structure in the -`IMAGES.rst <https://github.com/apache/airflow/blob/master/IMAGES.rst>`_ document. +We provide :doc:`a Docker Image (OCI) for Apache Airflow <docker-stack:index>` for use in a containerized environment. Consider using it to guarantee that software will always run the same no matter where it’s deployed. .. _production-deployment:kerberos: diff --git a/docs/apache-airflow/start/docker.rst b/docs/apache-airflow/start/docker.rst index e79cae5..0e2becf 100644 --- a/docs/apache-airflow/start/docker.rst +++ b/docs/apache-airflow/start/docker.rst @@ -195,7 +195,7 @@ To stop and delete containers, delete volumes with database data and download im Notes ===== -By default, the Docker Compose file uses the latest Airflow image (`apache/airflow <https://hub.docker.com/r/apache/airflow>`__). If you need, you can :ref:`customize and extend it <docker_image>`. +By default, the Docker Compose file uses the latest Airflow image (`apache/airflow <https://hub.docker.com/r/apache/airflow>`__). If you need, you can :doc:`customize and extend it <docker-stack:index>`. What's Next? ============ diff --git a/docs/build_docs.py b/docs/build_docs.py index 1080533..4e4786f 100755 --- a/docs/build_docs.py +++ b/docs/build_docs.py @@ -205,19 +205,23 @@ def main(): _promote_new_flags() with with_group("Available packages"): - for pkg in available_packages: + for pkg in sorted(available_packages): print(f" - {pkg}") if package_filters: print("Current package filters: ", package_filters) current_packages = process_package_filters(available_packages, package_filters) + + with with_group("Fetching inventories"): + # Inventories that could not be retrieved should be retrieved first. This may mean this is a + # new package. + priority_packages = fetch_inventories() + current_packages = sorted(current_packages, key=lambda d: -1 if d in priority_packages else 1) + with with_group(f"Documentation will be built for {len(current_packages)} package(s)"): for pkg_no, pkg in enumerate(current_packages, start=1): print(f"{pkg_no}. {pkg}") - with with_group("Fetching inventories"): - fetch_inventories() - all_build_errors: Dict[Optional[str], List[DocBuildError]] = {} all_spelling_errors: Dict[Optional[str], List[SpellingError]] = {} package_build_errors, package_spelling_errors = build_docs_for_packages( diff --git a/docs/conf.py b/docs/conf.py index 2a4ca2b..678f053 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -145,6 +145,9 @@ if PACKAGE_NAME == "apache-airflow-providers": 'providers_packages_ref', ] ) +elif PACKAGE_NAME in ("helm-chart", "docker-stack"): + # No extra extensions + pass else: extensions.append('autoapi.extension') # List of patterns, relative to source directory, that match files and diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst new file mode 100644 index 0000000..a07a837 --- /dev/null +++ b/docs/docker-stack/build.rst @@ -0,0 +1,511 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Building the image +================== + +Before you dive-deeply in the way how the Airflow Image is build, named and why we are doing it the +way we do, you might want to know very quickly how you can extend or customize the existing image +for Apache Airflow. This chapter gives you a short answer to those questions. + +Extending vs. customizing the image +----------------------------------- + +Here is the comparison of the two types of building images. Here is your guide if you want to choose +how you want to build your image. + ++----------------------------------------------------+-----------+-------------+ +| | Extending | Customizing | ++====================================================+===========+=============+ +| Can be built without airflow sources | Yes | No | ++----------------------------------------------------+-----------+-------------+ +| Uses familiar 'FROM ' pattern of image building | Yes | No | ++----------------------------------------------------+-----------+-------------+ +| Requires only basic knowledge about images | Yes | No | ++----------------------------------------------------+-----------+-------------+ +| Builds quickly | Yes | No | ++----------------------------------------------------+-----------+-------------+ +| Produces image heavily optimized for size | No | Yes | ++----------------------------------------------------+-----------+-------------+ +| Can build from custom airflow sources (forks) | No | Yes | ++----------------------------------------------------+-----------+-------------+ +| Can build on air-gaped system | No | Yes | ++----------------------------------------------------+-----------+-------------+ + +TL;DR; If you have a need to build custom image, it is easier to start with "Extending" however if your +dependencies require compilation step or when your require to build the image from security vetted +packages, switching to "Customizing" the image provides much more optimized images. In the example further +where we compare equivalent "Extending" and "Customizing" the image, similar images build by +Extending vs. Customization had shown 1.1GB vs 874MB image sizes respectively - with 20% improvement in +size of the Customized image. + +.. note:: + + You can also combine both - customizing & extending the image in one. You can build your + optimized base image first using ``customization`` method (for example by your admin team) with all + the heavy compilation required dependencies and you can publish it in your registry and let others + ``extend`` your image using ``FROM`` and add their own lightweight dependencies. This reflects well + the split where typically "Casual" users will Extend the image and "Power-users" will customize it. + +Airflow Summit 2020's `Production Docker Image <https://youtu.be/wDr3Y7q2XoI>`_ talk provides more +details about the context, architecture and customization/extension methods for the Production Image. + +Extending the image +------------------- + +Extending the image is easiest if you just need to add some dependencies that do not require +compiling. 
The compilation framework of Linux (so called ``build-essential``) is pretty big, and +for the production images, size is really important factor to optimize for, so our Production Image +does not contain ``build-essential``. If you need compiler like gcc or g++ or make/cmake etc. - those +are not found in the image and it is recommended that you follow the "customize" route instead. + +How to extend the image - it is something you are most likely familiar with - simply +build a new image using Dockerfile's ``FROM`` directive and add whatever you need. Then you can add your +Debian dependencies with ``apt`` or PyPI dependencies with ``pip install`` or any other stuff you need. + +You should be aware, about a few things: + +* The production image of airflow uses "airflow" user, so if you want to add some of the tools + as ``root`` user, you need to switch to it with ``USER`` directive of the Dockerfile and switch back to + ``airflow`` user when you are done. Also you should remember about following the + `best practises of Dockerfiles <https://docs.docker.com/develop/develop-images/dockerfile_best-practices/>`_ + to make sure your image is lean and small. + +* The PyPI dependencies in Apache Airflow are installed in the user library, of the "airflow" user, so + PIP packages are installed to ``~/.local`` folder as if the ``--user`` flag was specified when running PIP. + Note also that using ``--no-cache-dir`` is a good idea that can help to make your image smaller. + +* If your apt, or PyPI dependencies require some of the ``build-essential`` or other packages that need + to compile your python dependencies, then your best choice is to follow the "Customize the image" route, + because you can build a highly-optimized (for size) image this way. However it requires to checkout sources + of Apache Airflow, so you might still want to choose to add ``build-essential`` to your image, + even if your image will be significantly bigger. + +* You can also embed your dags in the image by simply adding them with COPY directive of Airflow. + The DAGs in production image are in ``/opt/airflow/dags`` folder. + +* You can build your image without any need for Airflow sources. It is enough that you place the + ``Dockerfile`` and any files that are referred to (such as Dag files) in a separate directory and run + a command ``docker build . --tag my-image:my-tag`` (where ``my-image`` is the name you want to name it + and ``my-tag`` is the tag you want to tag the image with. + +.. note:: + As of 2.0.1 image the ``--user`` flag is turned on by default by setting ``PIP_USER`` environment variable + to ``true``. This can be disabled by un-setting the variable or by setting it to ``false``. In the + 2.0.0 image you had to add the ``--user`` flag as ``pip install --user`` command. + +Examples of image extending +--------------------------- + +An ``apt`` package example +.......................... + +The following example adds ``vim`` to the airflow image. + +.. exampleinclude:: docker-examples/extending/add-apt-packages/Dockerfile + :language: Dockerfile + :start-after: [START Dockerfile] + :end-before: [END Dockerfile] + +A ``PyPI`` package example +.......................... + +The following example adds ``lxml`` python package from PyPI to the image. + +.. 
+.. exampleinclude:: docker-examples/extending/add-pypi-packages/Dockerfile
+   :language: Dockerfile
+   :start-after: [START Dockerfile]
+   :end-before: [END Dockerfile]
+
+A ``build-essential`` requiring package example
+...............................................
+
+The following example adds the ``mpi4py`` package, which requires both ``build-essential`` and an MPI compiler.
+
+.. exampleinclude:: docker-examples/extending/add-build-essential-extend/Dockerfile
+   :language: Dockerfile
+   :start-after: [START Dockerfile]
+   :end-before: [END Dockerfile]
+
+The size of this image is ~ 1.1 GB when built. As you will see further, you can achieve a 20% reduction in
+the size of the image if you use "Customizing" rather than "Extending" the image.
+
+DAG embedding example
+.....................
+
+The following example adds ``test_dag.py`` to your image in the ``/opt/airflow/dags`` folder.
+
+.. exampleinclude:: docker-examples/extending/embedding-dags/Dockerfile
+   :language: Dockerfile
+   :start-after: [START Dockerfile]
+   :end-before: [END Dockerfile]
+
+
+.. exampleinclude:: docker-examples/extending/embedding-dags/test_dag.py
+   :language: Python
+   :start-after: [START dag]
+   :end-before: [END dag]
+
+Customizing the image
+---------------------
+
+Customizing the image is an optimized way of adding your own dependencies to the image - better
+suited to prepare highly optimized (for size) production images, especially when you have dependencies
+that need to be compiled before installing (such as ``mpi4py``).
+
+It also allows more sophisticated usages, needed by "Power users" - for example using a forked version
+of Airflow, or building the images from security-vetted sources.
+
+The big advantage of this method is that it produces an optimized image even if you need some compile-time
+dependencies that are not needed in the final image.
+
+The disadvantage is that you need to use the Airflow sources to build such images, either from the
+`official distribution repository of Apache Airflow <https://downloads.apache.org/airflow/>`_ for the
+released versions, or from the checked-out sources (using release tags or main branches) in the
+`Airflow GitHub Project <https://github.com/apache/airflow>`_, or from your own fork
+if you happen to maintain your own fork of Airflow.
+
+Another disadvantage is that the pattern of building Docker images with ``--build-arg`` is less familiar
+to developers of such images. However, it is quite well known to "power users". That's why the
+customizing flow is better suited to users who have more familiarity and more custom requirements.
+
+The image also usually takes much longer to build than the equivalent "Extended" image because instead of
+extending the layers that are already coming from the base image, it rebuilds the layers needed
+to add the extra dependencies at the early stages of image building.
+
+When customizing the image you can choose a number of options for how you install Airflow:
+
+  * From the PyPI releases (default)
+  * From custom installation sources - adding additional apt or PyPI repositories or replacing the original ones
+  * From local sources. This is used mostly during development.
+  * From a tag or branch, or a specific commit from a GitHub Airflow repository (or fork). This is particularly
+    useful when you build an image for a custom version of Airflow that you keep in your fork and you do not
+    want to release the custom Airflow version to PyPI.
+  * From locally stored binary packages for Airflow, Airflow Providers and other dependencies.
+    This is particularly useful if you want to build Airflow in a highly secure environment where all such
+    packages must be vetted by your security team and stored in your private artifact registry. This also
+    allows you to build the Airflow image in an air-gapped environment.
+  * Side note: building ``Airflow`` in an ``air-gapped`` environment sounds pretty funny, doesn't it?
+
+You can also add a range of customizations while building the image:
+
+  * the base Python image you use for Airflow
+  * the version of Airflow to install
+  * the extras to install for Airflow (or even removing some default extras)
+  * additional apt/Python dependencies to use while building Airflow (DEV dependencies)
+  * additional apt/Python dependencies to install in the runtime version of Airflow (RUNTIME dependencies)
+  * additional commands and variables to set, if needed, during building or preparing the Airflow runtime
+  * the constraint file to use when installing Airflow
+
+Additional explanation is needed for the last point. Airflow uses constraints to make sure
+that it can be predictably installed, even if some new versions of Airflow dependencies are
+released (or even dependencies of our dependencies!). The Docker image and accompanying scripts
+usually automatically determine the right version of constraints to be used, based on the Airflow
+version installed and the Python version. For example, version 2.0.1 of Airflow installed from PyPI
+uses constraints from the ``constraints-2.0.1`` tag. However, in some cases - for example when installing
+Airflow from GitHub - you have to specify the version of constraints manually, otherwise
+it will default to the latest version of the constraints, which might not be compatible with the
+version of Airflow you use.
+
+You can also download any version of the Airflow constraints, manually set your own versions of
+dependencies in it, and use the constraints file that you prepared yourself.
+
+You can read more about constraints in the documentation of the
+`Installation <http://airflow.apache.org/docs/apache-airflow/stable/installation.html#constraints-files>`_
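For illustration only, downloading a published constraints file so you can adapt it might look like the
sketch below. The URL layout shown here (the ``constraints-2.0.1`` reference with a per-Python-version
``constraints-3.6.txt`` file) is an assumption for the purpose of this sketch - see the installation
documentation linked above for the authoritative description.

.. code-block:: bash

    # Sketch only: fetch the constraints published for Airflow 2.0.1 / Python 3.6
    # so that you can adapt them and pass your own copy to the image build.
    curl -o my-constraints.txt \
        "https://raw.githubusercontent.com/apache/airflow/constraints-2.0.1/constraints-3.6.txt"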
+
+Examples of image customizing
+-----------------------------
+
+.. _image-build-pypi:
+
+
+Building from PyPI packages
+...........................
+
+This is the basic way of building custom images from the Airflow sources.
+
+The following example builds the production image for Python ``3.6`` with the latest PyPI-released Airflow,
+with the default set of Airflow extras and dependencies. The ``2.0.1`` constraints are used automatically.
+
+.. exampleinclude:: docker-examples/customizing/stable-airflow.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+The following example builds the production image for Python ``3.7`` with the default extras from the ``2.0.1``
+PyPI package. The ``2.0.1`` constraints are used automatically.
+
+.. exampleinclude:: docker-examples/customizing/pypi-selected-version.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+The following example builds the production image for Python ``3.8`` with additional Airflow extras
+(``mssql,hdfs``) from the ``2.0.1`` PyPI package, and an additional dependency (``oauth2client``).
+
+.. exampleinclude:: docker-examples/customizing/pypi-extras-and-deps.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+
+The following example adds the ``mpi4py`` package, which requires both ``build-essential`` and an MPI compiler.
+
+.. exampleinclude:: docker-examples/customizing/add-build-essential-custom.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+The above image is equivalent to the "extended" image from the previous chapter, but its size is only
+874 MB. Compared to 1.1 GB for the "extended" image, this is about 230 MB less, so you can achieve a ~20%
+improvement in image size by using "customization" instead of extension. The saving can increase if you
+have more complex dependencies to build.
+
+
+.. _image-build-optimized:
+
+Building optimized images
+.........................
+
+The following example builds the production image for Python ``3.6`` with additional Airflow extras from the
+``2.0.1`` PyPI package, and it includes additional apt dev and runtime dependencies.
+
+The dev dependencies are those that require ``build-essential`` and usually involve recompiling
+some Python dependencies, so those packages might require additional DEV dependencies to be
+present during recompilation. Those packages are not needed at runtime, so we only install them at
+"build" time. They are not installed in the final image, thus producing much smaller images.
+In this case, ``pandas`` requires recompilation, so it also needs ``gcc`` and ``g++`` as dev APT dependencies.
+The ``jre-headless`` package does not require recompiling, so it can be installed as a runtime APT dependency.
+
+.. exampleinclude:: docker-examples/customizing/pypi-dev-runtime-deps.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+.. _image-build-github:
+
+
+Building from GitHub
+....................
+
+This method is usually used for development purposes. But if you have your own fork, you can point
+the build to your forked version of the source code without having to release it to PyPI. It is enough to have
+a branch or tag in your repository and use that tag or branch in the URL that you point the installation to.
+
+In the case of GitHub builds you need to pass the constraints reference manually if you want to use
+specific constraints, otherwise the default ``constraints-master`` is used.
+
+The following example builds the production image for Python ``3.7`` with the default extras from the latest
+master version; constraints are taken from the latest version of the ``constraints-master`` branch in GitHub.
+
+.. exampleinclude:: docker-examples/customizing/github-master.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+The following example builds the production image with the default extras from the
+latest ``v2-0-test`` version; constraints are taken from the latest version of
+the ``constraints-2-0`` branch in GitHub. Note that this command might occasionally fail, as only
+the "released version" constraints (when building a released version) and the "master" constraints
+(when building master) are guaranteed to work.
+
+.. exampleinclude:: docker-examples/customizing/github-v2-0-test.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+You can also specify another repository to build from. If you also want to use a different constraints
+repository, you must specify it with the additional ``CONSTRAINTS_GITHUB_REPOSITORY`` build arg.
+
+The following example builds the production image using the ``potiuk/airflow`` fork of Airflow, and
+constraints are also downloaded from that repository.
+
+.. exampleinclude:: docker-examples/customizing/github-different-repository.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
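For illustration, a build from a fork together with its own constraints repository might look similar to the
sketch below. ``CONSTRAINTS_GITHUB_REPOSITORY`` is taken from the description above; the remaining build-arg
names (``AIRFLOW_INSTALLATION_METHOD``, ``AIRFLOW_CONSTRAINTS_REFERENCE``) are assumptions made for this
sketch - the referenced example scripts and :doc:`build-arg-ref` remain the authoritative source.

.. code-block:: bash

    # Sketch only: install Airflow from a GitHub fork and fetch constraints from the
    # same fork. Verify the build-arg names against build-arg-ref before using this.
    docker build . \
        --build-arg AIRFLOW_INSTALLATION_METHOD="https://github.com/potiuk/airflow/archive/master.tar.gz#egg=apache-airflow" \
        --build-arg CONSTRAINTS_GITHUB_REPOSITORY="potiuk/airflow" \
        --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-master" \
        --tag my-airflow-image:github-master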
+
+.. _image-build-custom:
+
+Using custom installation sources
+.................................
+
+You can customize more aspects of the image - such as additional commands executed before apt dependencies
+are installed, or adding extra sources to install your dependencies from. You can see all the arguments
+described below; further down there is an example of a rather complex command to customize the image,
+based on the example in `this comment <https://github.com/apache/airflow/issues/8605#issuecomment-690065621>`_.
+
+In case you need to use your custom PyPI package indexes, you can also customize the PyPI sources used during
+the image build by adding a ``docker-context-files``/``.pypirc`` file when building the image.
+This ``.pypirc`` will not be committed to the repository (it is added to ``.gitignore``) and it will not be
+present in the final production image. It is added and used only in the build segment of the image.
+Therefore this ``.pypirc`` file can safely contain the list of package indexes you want to use, and the
+usernames and passwords used for authentication. More details about the ``.pypirc`` file can be found in the
+`pypirc specification <https://packaging.python.org/specifications/pypirc/>`_.
+
+Such customizations are independent of the way Airflow is installed.
+
+.. note::
+  Similar results could be achieved by modifying the Dockerfile manually (see below) and injecting the
+  commands needed, but by specifying the customizations via build args, you avoid the need to
+  synchronize your changes with future Airflow Dockerfiles. Those customizations should work with
+  future versions of Airflow's official ``Dockerfile`` with at most minimal modifications of parameter
+  names (if any), so using the build command for your customizations makes your custom image more
+  future-proof.
+
+The following - rather complex - example shows the capabilities of:
+
+  * Adding Airflow extras (``slack``, ``odbc``)
+  * Adding PyPI dependencies (``azure-storage-blob, oauth2client, beautifulsoup4, dateparser, rocketchat_API, typeform``)
+  * Adding custom environment variables while installing ``apt`` dependencies - both DEV and RUNTIME
+    (``ACCEPT_EULA=Y``)
+  * Adding a custom curl command for adding keys and configuring additional apt sources needed to install
+    ``apt`` dependencies (both DEV and RUNTIME)
+  * Adding custom ``apt`` dependencies, both DEV (``msodbcsql17 unixodbc-dev g++``) and RUNTIME
+    (``msodbcsql17 unixodbc git procps vim``)
+
+.. exampleinclude:: docker-examples/customizing/custom-sources.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
+
+.. _image-build-secure-environments:
+
+Build images in security restricted environments
+................................................
+
+You can also make sure your image is built using only a local constraint file and locally downloaded
+wheel files. This is often useful in Enterprise environments where the binary files are verified and
+vetted by the security teams. It is also the most complex way of building the image. You should be an
+expert at building and using Dockerfiles and have specific security needs if you want to follow this route.
+
+The build below builds the production image with packages and constraints taken from the local
+``docker-context-files`` folder rather than installed from PyPI or GitHub. It also disables MySQL client
+installation, as it uses an external installation method.
+
+Note that as a prerequisite, you need to have downloaded the wheel files.
+In the example below, we first download such a constraint file locally and then use ``pip download`` to
+get the ``.whl`` files needed, but in the most likely scenario those wheel files should be copied from an
+internal repository of such ``.whl`` files. Note that ``AIRFLOW_VERSION_SPECIFICATION`` is only there for
+reference; the Apache Airflow ``.whl`` file in the right version is part of the ``.whl`` files downloaded.
+
+Note that ``pip download`` only works on a Linux host, as some of the packages need to be compiled from
+sources and you cannot install them by providing the ``--platform`` switch. They also need to be downloaded
+using the same Python version as the target image.
+
+The ``pip download`` step might happen in a separate environment. The files can be committed to a separate
+binary repository, vetted/verified by the security team, and used subsequently to build images
+of Airflow when needed on an air-gapped system.
+
+Here is an example of preparing the constraint files and wheel files. Note that the ``mysql`` dependency is
+removed, as ``mysqlclient`` is installed from Oracle's ``apt`` repository; if you want to add it, you need
+to provide this library from your own repository when building the Airflow image in an "air-gapped" system.
+
+.. exampleinclude:: docker-examples/restricted/restricted_environments.sh
+   :language: bash
+   :start-after: [START download]
+   :end-before: [END download]
+
+After this step is finished, your ``docker-context-files`` folder will contain all the packages that
+are needed to install Airflow from.
+
+Those downloaded packages and the constraint file can be pre-vetted by your security team before you attempt
+to build the image. You can also store those downloaded binary packages in your private artifact registry,
+which allows for the flow where you download the packages on one machine, submit only new packages for
+security vetting, and use the new packages only once they have been vetted.
+
+On a separate (air-gapped) system, all the PyPI packages can be copied to ``docker-context-files``,
+and you can build the image there using the downloaded packages by passing these build args:
+
+  * ``INSTALL_FROM_DOCKER_CONTEXT_FILES="true"`` - to use packages present in ``docker-context-files``
+  * ``AIRFLOW_PRE_CACHED_PIP_PACKAGES="false"`` - to not pre-cache packages from PyPI when building the image
+  * ``AIRFLOW_CONSTRAINTS_LOCATION=/docker-context-files/YOUR_CONSTRAINT_FILE.txt`` - to use the downloaded
+    constraint file
+  * (Optional) ``INSTALL_MYSQL_CLIENT="false"`` if you do not want to install the ``MySQL``
+    client from the Oracle repositories. In this case also make sure that your
+
+Note that the solution we have for installing Python packages from local packages only solves the problem
+of an "air-gapped" Python installation. The Docker image also downloads ``apt`` dependencies and ``node-modules``.
+Those types of dependencies are, however, more likely to be available in your "air-gapped" system via transparent
+proxies, and the build should automatically reach out to your private registries; in the future the
+solution might be applied to both of those installation steps.
+
+You can also use the techniques described in the previous chapter to make ``docker build`` use your private,
+security-vetted apt sources or private PyPI repositories (via ``.pypirc``).
+
+If you fulfill all the criteria, you can build the image on an air-gapped system by running a command
+similar to the one below:
+
+.. exampleinclude:: docker-examples/restricted/restricted_environments.sh
+   :language: bash
+   :start-after: [START build]
+   :end-before: [END build]
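Putting the build args listed above together, the final build on the air-gapped system might look similar
to the following sketch. The referenced ``restricted_environments.sh`` example remains the authoritative
version; the constraint file name and image tag used here are placeholders.

.. code-block:: bash

    # Sketch only: build from the pre-vetted packages and constraints placed in
    # docker-context-files, without reaching out to PyPI during the build.
    docker build . \
        --build-arg INSTALL_FROM_DOCKER_CONTEXT_FILES="true" \
        --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" \
        --build-arg AIRFLOW_CONSTRAINTS_LOCATION="/docker-context-files/YOUR_CONSTRAINT_FILE.txt" \
        --build-arg INSTALL_MYSQL_CLIENT="false" \
        --tag my-airflow-image:restricted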
+
+Modifying the Dockerfile
+........................
+
+The build arg approach is a convenience method if you do not want to manually modify the ``Dockerfile``.
+Our approach is flexible enough to accommodate most requirements and
+customizations out of the box. When you use it, you do not need to worry about adapting the image every
+time a new version of Airflow is released. However, sometimes it is not enough if you have very
+specific needs and want to build a very custom image. In such a case you can simply modify the
+``Dockerfile`` manually as you see fit and store it in your forked repository. However, you will have to
+make sure to rebase your changes whenever a new version of Airflow is released, because we might modify
+the approach of our Dockerfile builds in the future and you might need to resolve conflicts
+and rebase your changes.
+
+There are a few things to remember when you modify the ``Dockerfile``:
+
+* We are using the widely recommended pattern of ``.dockerignore`` where everything is ignored by default
+  and only the required folders are added back through exclusion (!). This allows the Docker context to stay
+  small, because there are many binary artifacts generated in the sources of Airflow, and if they were added
+  to the context, the time to build the image would increase significantly. If you want any new
+  folders to be available in the image, you must add them here with a leading ``!``.
+
+  .. code-block:: text
+
+      # Ignore everything
+      **
+
+      # Allow only these directories
+      !airflow
+      ...
+
+
+* The ``docker-context-files`` folder is automatically added to the context of the image, so if you want
+  to add individual files, binaries, requirement files etc., you can add them there. The
+  ``docker-context-files`` folder is copied to the ``/docker-context-files`` folder of the build segment of the
+  image, so it is not present in the final image - which keeps the final image smaller if you want
+  to use those files only in the ``build`` segment. If you want the files in your final image (in the main
+  image segment), you must copy them from the directory manually, using the ``COPY`` command.
+
+
+More details
+------------
+
+Build Args reference
+....................
+
+The detailed ``--build-arg`` reference can be found in :doc:`build-arg-ref`.
+
+
+The architecture of the images
+..............................
+
+You can read more details about the images - the context, their parameters and internal structure - in the
+`IMAGES.rst <https://github.com/apache/airflow/blob/master/IMAGES.rst>`_ document.
diff --git a/docs/apache-airflow/docker-images-recipes/gcloud.Dockerfile b/docs/docker-stack/docker-images-recipes/gcloud.Dockerfile
similarity index 100%
rename from docs/apache-airflow/docker-images-recipes/gcloud.Dockerfile
rename to docs/docker-stack/docker-images-recipes/gcloud.Dockerfile
diff --git a/docs/apache-airflow/docker-images-recipes/hadoop.Dockerfile b/docs/docker-stack/docker-images-recipes/hadoop.Dockerfile
similarity index 100%
rename from docs/apache-airflow/docker-images-recipes/hadoop.Dockerfile
rename to docs/docker-stack/docker-images-recipes/hadoop.Dockerfile
diff --git a/docs/docker-stack/entrypoint.rst b/docs/docker-stack/entrypoint.rst
new file mode 100644
index 0000000..a7889c4
--- /dev/null
+++ b/docs/docker-stack/entrypoint.rst
@@ -0,0 +1,201 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Entrypoint
+==========
+
+If you are using the default entrypoint of the production image,
+there are a few actions that are automatically performed when the container starts.
+In some cases, you can pass environment variables to the image to trigger some of that behaviour.
+
+The variables that control the "execution" behaviour start with ``_AIRFLOW`` to distinguish them
+from the variables used to build the image, which start with ``AIRFLOW``.
+
+The image entrypoint works as follows:
+
+* In case the user is not "airflow" (for example when running with an arbitrary user id) and the group id of
+  the user is set to ``0`` (root), then the user is dynamically added to ``/etc/passwd`` at entry, using the
+  ``USER_NAME`` variable to define the user name. This is in order to accommodate the
+  `OpenShift Guidelines <https://docs.openshift.com/enterprise/3.0/creating_images/guidelines.html>`_
+
+* The ``AIRFLOW_HOME`` is set by default to ``/opt/airflow/`` - this means that DAGs
+  are by default in the ``/opt/airflow/dags`` folder and logs are in the ``/opt/airflow/logs`` folder.
+
+* The working directory is ``/opt/airflow`` by default.
+
+* If the ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is passed to the container and it is either a MySQL or
+  Postgres SQLAlchemy connection, then the connection is checked and the script waits until the database is
+  reachable. If the ``AIRFLOW__CORE__SQL_ALCHEMY_CONN_CMD`` variable is passed to the container, it is
+  evaluated as a command to execute and the result of this evaluation is used as
+  ``AIRFLOW__CORE__SQL_ALCHEMY_CONN``. The ``_CMD`` variable takes precedence over the
+  ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable.
+
+* If no ``AIRFLOW__CORE__SQL_ALCHEMY_CONN`` variable is set, then a SQLite database is created in
+  ``${AIRFLOW_HOME}/airflow.db`` and a db reset is executed.
+
+* If the first argument is equal to ``bash`` - you are dropped into a bash shell, or a bash command is
+  executed if you specify extra arguments. For example:
+
+  .. code-block:: bash
+
+    docker run -it apache/airflow:master-python3.6 bash -c "ls -la"
+    total 16
+    drwxr-xr-x 4 airflow root 4096 Jun  5 18:12 .
+    drwxr-xr-x 1 root    root 4096 Jun  5 18:12 ..
+    drwxr-xr-x 2 airflow root 4096 Jun  5 18:12 dags
+    drwxr-xr-x 2 airflow root 4096 Jun  5 18:12 logs
+
+* If the first argument is equal to ``python`` - you are dropped into a Python shell, or Python commands are
+  executed if you pass extra parameters. For example:
+
+  .. code-block:: bash
+
+    > docker run -it apache/airflow:master-python3.6 python -c "print('test')"
+    test
+
+* If the first argument is equal to ``airflow`` - the rest of the arguments are treated as an airflow command
+  to execute. For example:
+
+  .. code-block:: bash
+
+    docker run -it apache/airflow:master-python3.6 airflow webserver
+
+* If there are any other arguments - they are simply passed to the "airflow" command.
+
+  .. code-block:: bash
+
+    > docker run -it apache/airflow:master-python3.6 version
+    2.1.0.dev0
+
+* If the ``AIRFLOW__CELERY__BROKER_URL`` variable is passed and an airflow command with the
+  scheduler, worker or flower command is used, then the script checks the broker connection
+  and waits until the Celery broker database is reachable.
+  If the ``AIRFLOW__CELERY__BROKER_URL_CMD`` variable is passed to the container, it is evaluated as a
+  command to execute and the result of this evaluation is used as ``AIRFLOW__CELERY__BROKER_URL``. The
+  ``_CMD`` variable takes precedence over the ``AIRFLOW__CELERY__BROKER_URL`` variable.
+
+Creating system user
+--------------------
+
+The Airflow image is OpenShift compatible, which means that you can start it with a random user ID and the
+group id ``0`` (``root``). Airflow will automatically create such a user and make its home directory point to
+``/home/airflow``. You can read more about it in the "Support arbitrary user ids" chapter in the
+`OpenShift best practices <https://docs.openshift.com/container-platform/4.1/openshift_images/create-images.html#images-create-guide-openshift_create-images>`_.
+
+Waits for Airflow DB connection
+-------------------------------
+
+In case a Postgres or MySQL DB is used, the entrypoint will wait until the Airflow DB connection becomes
+available. This always happens when you use the default entrypoint.
+
+The script detects the backend type depending on the URL schema and assigns default port numbers if they are
+not specified in the URL. Then it loops until a connection to the specified host/port can be established.
+It tries ``CONNECTION_CHECK_MAX_COUNT`` times and sleeps ``CONNECTION_CHECK_SLEEP_TIME`` between checks.
+To disable the check, set ``CONNECTION_CHECK_MAX_COUNT=0``.
+
+Supported schemes:
+
+* ``postgres://`` - default port 5432
+* ``mysql://``    - default port 3306
+* ``sqlite://``
+
+In the case of the SQLite backend, there is no connection to establish and waiting is skipped.
+
+Upgrading Airflow DB
+--------------------
+
+If you set the ``_AIRFLOW_DB_UPGRADE`` variable to a non-empty value, the entrypoint will run
+the ``airflow db upgrade`` command right after verifying the connection. You can also use this
+when you are running Airflow with the internal SQLite database (default) to upgrade the db and create
+the admin user at entrypoint, so that you can start the webserver immediately. Note: using SQLite is
+intended only for testing purposes; never use SQLite in production, as it has severe limitations when it
+comes to concurrency.
+
+Creating admin user
+-------------------
+
+The entrypoint can also create a webserver user automatically when the container starts. You need to set
+``_AIRFLOW_WWW_USER_CREATE`` to a non-empty value in order to do that. This is not intended for
+production; it is only useful if you would like to run a quick test with the production image.
+You need to pass at least a password to create such a user, via ``_AIRFLOW_WWW_USER_PASSWORD`` or
+``_AIRFLOW_WWW_USER_PASSWORD_CMD``. Similarly to the other ``*_CMD`` variables, the content of
+the ``*_CMD`` variant will be evaluated as a shell command and its output will be used as the password.
+
+User creation will fail if none of the ``PASSWORD`` variables are set - there is no default
+password, for security reasons.
+
++-----------+--------------------------+----------------------------------------------------------------------+
+| Parameter | Default                  | Environment variable                                                 |
++===========+==========================+======================================================================+
+| username  | admin                    | ``_AIRFLOW_WWW_USER_USERNAME``                                       |
++-----------+--------------------------+----------------------------------------------------------------------+
+| password  |                          | ``_AIRFLOW_WWW_USER_PASSWORD_CMD`` or ``_AIRFLOW_WWW_USER_PASSWORD`` |
++-----------+--------------------------+----------------------------------------------------------------------+
+| firstname | Airflow                  | ``_AIRFLOW_WWW_USER_FIRSTNAME``                                      |
++-----------+--------------------------+----------------------------------------------------------------------+
+| lastname  | Admin                    | ``_AIRFLOW_WWW_USER_LASTNAME``                                       |
++-----------+--------------------------+----------------------------------------------------------------------+
+| email     | [email protected]        | ``_AIRFLOW_WWW_USER_EMAIL``                                          |
++-----------+--------------------------+----------------------------------------------------------------------+
+| role      | Admin                    | ``_AIRFLOW_WWW_USER_ROLE``                                           |
++-----------+--------------------------+----------------------------------------------------------------------+
+
+If the password is specified, an attempt will be made to create the user, but the entrypoint will
+not fail if the attempt does not succeed (this accounts for the case where the user has already been created).
+
+You can, for example, start the webserver in the production image, initializing the internal SQLite
+database and creating an ``admin/admin`` Admin user, with the following command:
+
+.. code-block:: bash
+
+  docker run -it -p 8080:8080 \
+    --env "_AIRFLOW_DB_UPGRADE=true" \
+    --env "_AIRFLOW_WWW_USER_CREATE=true" \
+    --env "_AIRFLOW_WWW_USER_PASSWORD=admin" \
+    apache/airflow:master-python3.8 webserver
+
+
+.. code-block:: bash
+
+  docker run -it -p 8080:8080 \
+    --env "_AIRFLOW_DB_UPGRADE=true" \
+    --env "_AIRFLOW_WWW_USER_CREATE=true" \
+    --env "_AIRFLOW_WWW_USER_PASSWORD_CMD=echo admin" \
+    apache/airflow:master-python3.8 webserver
+
+The commands above initialize the SQLite database and create an ``admin`` user with the ``admin`` password
+and the Admin role. They also forward local port ``8080`` to the webserver port and finally start the webserver.
+
+Waits for celery broker connection
+----------------------------------
+
+In case a Postgres or MySQL DB is used, and one of the ``scheduler``, ``celery``, ``worker``, or ``flower``
+commands is used, the entrypoint will wait until the Celery broker DB connection is available.
+
+The script detects the backend type depending on the URL schema and assigns default port numbers if they are
+not specified in the URL. Then it loops until a connection to the specified host/port can be established.
+It tries ``CONNECTION_CHECK_MAX_COUNT`` times and sleeps ``CONNECTION_CHECK_SLEEP_TIME`` between checks.
+To disable the check, set ``CONNECTION_CHECK_MAX_COUNT=0``.
+
+Supported schemes:
+
+* ``amqp(s)://``  (rabbitmq) - default port 5672
+* ``redis://``               - default port 6379
+* ``postgres://``            - default port 5432
+* ``mysql://``               - default port 3306
+* ``sqlite://``
+
+In the case of the SQLite backend, there is no connection to establish and waiting is skipped.
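If readiness is handled outside of the container (for example by an orchestrator's own probes), the waiting
described above can be switched off with the variable mentioned earlier. A minimal sketch, using the image
tag and command from the examples above:

.. code-block:: bash

    # Sketch only: disable the DB/broker connection check performed by the default entrypoint.
    docker run -it -p 8080:8080 \
        --env "CONNECTION_CHECK_MAX_COUNT=0" \
        apache/airflow:master-python3.8 webserver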
diff --git a/docs/docker-stack/img/docker-logo.png b/docs/docker-stack/img/docker-logo.png
new file mode 100644
index 0000000..d83e54a
Binary files /dev/null and b/docs/docker-stack/img/docker-logo.png differ
diff --git a/docs/docker-stack/index.rst b/docs/docker-stack/index.rst
new file mode 100644
index 0000000..29a7daf
--- /dev/null
+++ b/docs/docker-stack/index.rst
@@ -0,0 +1,54 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. image:: /img/docker-logo.png
+   :width: 100
+
+Docker Image for Apache Airflow
+===============================
+
+.. toctree::
+   :hidden:
+
+   Home <self>
+   build
+   entrypoint
+   recipes
+
+.. toctree::
+   :hidden:
+   :caption: References
+
+   build-arg-ref
+
+For the ease of deployment in production, the community releases a production-ready reference container
+image.
+
+The docker image provided (as a convenience binary package) in the
+`apache/airflow DockerHub <https://hub.docker.com/r/apache/airflow>`_ is a bare image
+that has only a few external dependencies and extras installed.
+
+The Apache Airflow image provided as a convenience package is optimized for size, so
+it provides just a bare minimal set of the extras and dependencies installed and in most cases
+you want to either extend or customize the image. You can see all possible extras in
+:doc:`extra-packages-ref`. The set of extras used in the Airflow Production image is available in the
+`Dockerfile <https://github.com/apache/airflow/blob/2c6c7fdb2308de98e142618836bdf414df9768c8/Dockerfile#L39>`_.
+
+The production images are built in DockerHub from released versions and release candidates. There
+are also images published from branches, but they are used mainly for development and testing purposes.
+See `Airflow Git Branching <https://github.com/apache/airflow/blob/master/CONTRIBUTING.rst#airflow-git-branches>`_
+for details.
diff --git a/docs/docker-stack/recipes.rst b/docs/docker-stack/recipes.rst
new file mode 100644
index 0000000..8b89a3e
--- /dev/null
+++ b/docs/docker-stack/recipes.rst
@@ -0,0 +1,70 @@
+ .. Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+ ..   http://www.apache.org/licenses/LICENSE-2.0
+
+ .. Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+Recipes
+=======
+
+Users sometimes share interesting ways of using the Docker images. We encourage users to contribute these
+recipes to the documentation by submitting a pull request, in case they prove useful to other members of
+the community. The sections below capture this knowledge.
+
+Google Cloud SDK installation
+-----------------------------
+
+Some operators, such as :class:`~airflow.providers.google.cloud.operators.kubernetes_engine.GKEStartPodOperator`,
+:class:`~airflow.providers.google.cloud.operators.dataflow.DataflowStartSqlJobOperator`, require
+the installation of the `Google Cloud SDK <https://cloud.google.com/sdk>`__ (which includes ``gcloud``).
+You can also run these commands with the ``BashOperator``.
+
+Create a new Dockerfile like the one shown below.
+
+.. exampleinclude:: /docker-images-recipes/gcloud.Dockerfile
+   :language: dockerfile
+
+Then build a new image.
+
+.. code-block:: bash
+
+  docker build . \
+    --build-arg BASE_AIRFLOW_IMAGE="apache/airflow:2.0.1" \
+    -t my-airflow-image
+
+
+Apache Hadoop Stack installation
+--------------------------------
+
+Airflow is often used to run tasks on a Hadoop cluster. It requires a Java Runtime Environment (JRE) to run.
+Below are the steps to install tools that are frequently used in the Hadoop world:
+
+- Java Runtime Environment (JRE)
+- Apache Hadoop
+- Apache Hive
+- `Cloud Storage connector for Apache Hadoop <https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage>`__
+
+
+Create a new Dockerfile like the one shown below.
+
+.. exampleinclude:: /docker-images-recipes/hadoop.Dockerfile
+   :language: dockerfile
+
+Then build a new image.
+
+.. code-block:: bash
+
+  docker build . \
+    --build-arg BASE_AIRFLOW_IMAGE="apache/airflow:2.0.1" \
+    -t my-airflow-image
diff --git a/docs/exts/airflow_intersphinx.py b/docs/exts/airflow_intersphinx.py
index ee83b8f..750579f 100644
--- a/docs/exts/airflow_intersphinx.py
+++ b/docs/exts/airflow_intersphinx.py
@@ -67,14 +67,15 @@ def _generate_provider_intersphinx_mapping():
         f'/docs/apache-airflow/{current_version}/',
         (doc_inventory if os.path.exists(doc_inventory) else cache_inventory,),
     )
+    for pkg_name in ['apache-airflow-providers', 'docker-stack']:
+        if os.environ.get('AIRFLOW_PACKAGE_NAME') == pkg_name:
+            continue
+        doc_inventory = f'{DOCS_DIR}/_build/docs/{pkg_name}/objects.inv'
+        cache_inventory = f'{DOCS_DIR}/_inventory_cache/{pkg_name}/objects.inv'

-    if os.environ.get('AIRFLOW_PACKAGE_NAME') != 'apache-airflow-providers':
-        doc_inventory = f'{DOCS_DIR}/_build/docs/apache-airflow-providers/objects.inv'
-        cache_inventory = f'{DOCS_DIR}/_inventory_cache/apache-airflow-providers/objects.inv'
-
-        airflow_mapping['apache-airflow-providers'] = (
+        airflow_mapping[pkg_name] = (
             # base URI
-            '/docs/apache-airflow-providers/',
+            f'/docs/{pkg_name}/',
             (doc_inventory if os.path.exists(doc_inventory) else cache_inventory,),
         )
diff --git a/docs/exts/docs_build/dev_index_template.html.jinja2 b/docs/exts/docs_build/dev_index_template.html.jinja2
index 0de5879..b680255 100644
--- a/docs/exts/docs_build/dev_index_template.html.jinja2
+++ b/docs/exts/docs_build/dev_index_template.html.jinja2
@@ -67,6 +67,17 @@
         </ul>
       </div>
     </div>
+    <div class="row">
+      <div class="col-md order-md-1">
+        <img src="/docs/docker-stack/_images/docker-logo.png" alt="Docker - logo" width="100" height="86">
+      </div>
+      <div class="col-md">
+        <h2><a href="/docs/docker-stack/index.html">Docker image</a></h2>
+        <p>
+          It provides an efficient, lightweight, self-contained environment and guarantees that software will always run the
+          same no matter where it is deployed.
+        </p>
+      </div>
+    </div>
   </div>
 </body>
 </html>
diff --git a/docs/exts/docs_build/docs_builder.py b/docs/exts/docs_build/docs_builder.py
index 6874f78..71e4acb 100644
--- a/docs/exts/docs_build/docs_builder.py
+++ b/docs/exts/docs_build/docs_builder.py
@@ -54,9 +54,9 @@ class AirflowDocsBuilder:
     @property
     def is_versioned(self):
         """Is current documentation package versioned?"""
-        # Disable versioning. This documentation does not apply to any issued product and we can update
+        # Disable versioning. This documentation does not apply to any released product and we can update
         # it as needed, i.e. with each new package of providers.
-        return self.package_name != 'apache-airflow-providers'
+        return self.package_name not in ('apache-airflow-providers', 'docker-stack')

     @property
     def _build_dir(self) -> str:
@@ -231,4 +231,9 @@ def get_available_providers_packages():
 def get_available_packages():
     """Get list of all available packages to build."""
     provider_package_names = get_available_providers_packages()
-    return ["apache-airflow", *provider_package_names, "apache-airflow-providers"]
+    return [
+        "apache-airflow",
+        *provider_package_names,
+        "apache-airflow-providers",
+        "docker-stack",
+    ]
diff --git a/docs/exts/docs_build/fetch_inventories.py b/docs/exts/docs_build/fetch_inventories.py
index e9da264..da66d02 100644
--- a/docs/exts/docs_build/fetch_inventories.py
+++ b/docs/exts/docs_build/fetch_inventories.py
@@ -20,10 +20,13 @@ import concurrent.futures
 import datetime
 import os
 import shutil
+from itertools import repeat
+from typing import Iterator, List, Tuple

 import requests
 from requests.adapters import DEFAULT_POOLSIZE

+from airflow.utils.helpers import partition
 from docs.exts.docs_build.docs_builder import (  # pylint: disable=no-name-in-module
     get_available_providers_packages,
 )
@@ -42,17 +45,22 @@ S3_DOC_URL_VERSIONED = S3_DOC_URL + "/docs/{package_name}/latest/objects.inv"
 S3_DOC_URL_NON_VERSIONED = S3_DOC_URL + "/docs/{package_name}/objects.inv"


-def _fetch_file(session: requests.Session, url: str, path: str):
+def _fetch_file(session: requests.Session, package_name: str, url: str, path: str) -> Tuple[str, bool]:
+    """
+    Download a file and return status information as a tuple with the package
+    name and success status (bool value).
+    """
     response = session.get(url, allow_redirects=True, stream=True)
     if not response.ok:
         print(f"Failed to fetch inventory: {url}")
-        return
+        return package_name, False

     os.makedirs(os.path.dirname(path), exist_ok=True)
     with open(path, 'wb') as f:
         response.raw.decode_content = True
         shutil.copyfileobj(response.raw, f)
     print(f"Fetched inventory: {url}")
+    return package_name, True


 def _is_outdated(path: str):
@@ -65,42 +73,61 @@ def _is_outdated(path: str):

 def fetch_inventories():
     """Fetch all inventories for Airflow documentation packages and store in cache."""
     os.makedirs(os.path.dirname(CACHE_DIR), exist_ok=True)
-    to_download = []
+    to_download: List[Tuple[str, str, str]] = []

     for pkg_name in get_available_providers_packages():
         to_download.append(
             (
+                pkg_name,
                 S3_DOC_URL_VERSIONED.format(package_name=pkg_name),
                 f'{CACHE_DIR}/{pkg_name}/objects.inv',
             )
         )
     to_download.append(
         (
+            "apache-airflow",
             S3_DOC_URL_VERSIONED.format(package_name='apache-airflow'),
             f'{CACHE_DIR}/apache-airflow/objects.inv',
         )
     )
-    to_download.append(
-        (
-            S3_DOC_URL_NON_VERSIONED.format(package_name='apache-airflow-providers'),
-            f'{CACHE_DIR}/apache-airflow-providers/objects.inv',
+    for pkg_name in ['apache-airflow-providers', 'docker-stack']:
+        to_download.append(
+            (
+                pkg_name,
+                S3_DOC_URL_NON_VERSIONED.format(package_name=pkg_name),
+                f'{CACHE_DIR}/{pkg_name}/objects.inv',
+            )
         )
-    )
     to_download.extend(
         (
+            pkg_name,
             f"{doc_url}/objects.inv",
             f'{CACHE_DIR}/{pkg_name}/objects.inv',
         )
         for pkg_name, doc_url in THIRD_PARTY_INDEXES.items()
     )
-    to_download = [(url, path) for url, path in to_download if _is_outdated(path)]
+    to_download = [(pkg_name, url, path) for pkg_name, url, path in to_download if _is_outdated(path)]
     if not to_download:
         print("Nothing to do")
-        return
+        return []

     print(f"To download {len(to_download)} inventorie(s)")
     with requests.Session() as session, concurrent.futures.ThreadPoolExecutor(DEFAULT_POOLSIZE) as pool:
-        for url, path in to_download:
-            pool.submit(_fetch_file, session=session, url=url, path=path)
+        download_results: Iterator[Tuple[str, bool]] = pool.map(
+            _fetch_file,
+            repeat(session, len(to_download)),
+            (pkg_name for pkg_name, _, _ in to_download),
+            (url for _, url, _ in to_download),
+            (path for _, _, path in to_download),
+        )
+        failed, success = partition(lambda d: d[1], download_results)
+        failed, success = list(failed), list(success)
+        print(f"Result: {len(success)} success, {len(failed)} failed")
+        if failed:
+            print("Failed packages:")
+            for pkg_no, (pkg_name, _) in enumerate(failed, start=1):
+                print(f"{pkg_no}. {pkg_name}")
+
+    return [pkg_name for pkg_name, status in failed]
