This is an automated email from the ASF dual-hosted git repository.
hansva pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hop.git
The following commit(s) were added to refs/heads/main by this push:
new 1b26d3e5de indentation fix, minor updates #3967 (#3977)
1b26d3e5de is described below
commit 1b26d3e5de504e65b62eed25e8df963d0eb0a6c4
Author: Bart Maertens <[email protected]>
AuthorDate: Mon May 27 08:42:59 2024 +0200
indentation fix, minor updates #3967 (#3977)
---
.../apache-airflow/docker-compose.yaml | 35 +++++--------
.../how-to-guides/run-hop-in-apache-airflow.adoc | 60 +++++++++++-----------
2 files changed, 43 insertions(+), 52 deletions(-)
diff --git
a/docs/hop-user-manual/modules/ROOT/assets/files/how-to-guides/apache-airflow/docker-compose.yaml
b/docs/hop-user-manual/modules/ROOT/assets/files/how-to-guides/apache-airflow/docker-compose.yaml
index e8b8884833..a2984185eb 100644
---
a/docs/hop-user-manual/modules/ROOT/assets/files/how-to-guides/apache-airflow/docker-compose.yaml
+++
b/docs/hop-user-manual/modules/ROOT/assets/files/how-to-guides/apache-airflow/docker-compose.yaml
@@ -24,7 +24,7 @@
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
-# Default: apache/airflow:2.6.0
+# Default: apache/airflow:2.9.1
# AIRFLOW_UID - User ID in Airflow containers
# Default: 50000
# AIRFLOW_PROJ_DIR - Base path to which all the files will be
volumed.
@@ -44,20 +44,17 @@
#
# Feel free to modify this file to suit your needs.
---
-version: '3.8'
x-airflow-common:
&airflow-common
# In order to add custom dependencies or upgrade provider packages you can
use your extended image.
# Comment the image line, place your Dockerfile in the directory where you
placed the docker-compose.yaml
# and uncomment the "build" line below, then run `docker-compose build` to
build the images.
- image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.6.0}
+ image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.9.1}
# build: .
environment:
&airflow-common-env
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN:
postgresql+psycopg2://airflow:airflow@postgres/airflow
- # For backward compatibility, with Airflow <2.3
- AIRFLOW__CORE__SQL_ALCHEMY_CONN:
postgresql+psycopg2://airflow:airflow@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND:
db+postgresql://airflow:airflow@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
@@ -72,11 +69,14 @@ x-airflow-common:
# WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks
# for other purpose (development, test and especially production usage)
build/extend Airflow image.
_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
+ # The following line can be used to set a custom config file, stored in
the local config folder
+ # If you want to use it, uncomment it and replace airflow.cfg with the
name of your config file
+ # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
volumes:
- ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
+ - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
- - /var/run/docker.sock:/var/run/docker.sock
user: "${AIRFLOW_UID:-50000}:0"
depends_on:
&airflow-common-depends-on
@@ -102,7 +102,9 @@ services:
restart: always
redis:
- image: redis:latest
+ # Redis is limited to 7.2-bookworm due to licensing change
+ # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
+ image: redis:7.2-bookworm
expose:
- 6379
healthcheck:
@@ -149,9 +151,10 @@ services:
<<: *airflow-common
command: celery worker
healthcheck:
+ # yamllint disable rule:line-length
test:
- "CMD-SHELL"
- - 'celery --app airflow.executors.celery_executor.app inspect ping -d
"celery@$${HOSTNAME}"'
+ - 'celery --app airflow.providers.celery.executors.celery_executor.app
inspect ping -d "celery@$${HOSTNAME}" || celery --app
airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
interval: 30s
timeout: 10s
retries: 5
@@ -189,20 +192,6 @@ services:
command:
- -c
- |
- function ver() {
- printf "%04d%04d%04d%04d" $${1//./ }
- }
- airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu
airflow airflow version)
- airflow_version_comparable=$$(ver $${airflow_version})
- min_airflow_version=2.2.0
- min_airflow_version_comparable=$$(ver $${min_airflow_version})
- if (( airflow_version_comparable < min_airflow_version_comparable ));
then
- echo
- echo -e "\033[1;31mERROR!!!: Too old Airflow version
$${airflow_version}!\e[0m"
- echo "The minimum Airflow version supported:
$${min_airflow_version}. Only use this or higher!"
- echo
- exit 1
- fi
if [[ -z "${AIRFLOW_UID}" ]]; then
echo
echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
@@ -251,7 +240,7 @@ services:
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
- _AIRFLOW_DB_UPGRADE: 'true'
+ _AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'true'
_AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
_AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
diff --git
a/docs/hop-user-manual/modules/ROOT/pages/how-to-guides/run-hop-in-apache-airflow.adoc
b/docs/hop-user-manual/modules/ROOT/pages/how-to-guides/run-hop-in-apache-airflow.adoc
index b8767de582..6da47397d2 100644
---
a/docs/hop-user-manual/modules/ROOT/pages/how-to-guides/run-hop-in-apache-airflow.adoc
+++
b/docs/hop-user-manual/modules/ROOT/pages/how-to-guides/run-hop-in-apache-airflow.adoc
@@ -39,7 +39,7 @@ The goal of this page is to get a basic Airflow setup running
to demonstrate how
To keep things simple, we'll use Docker Compose to get Apache Airflow up and
running in a matter of minutes. Even though
https://docs.docker.com/compose/[Docker Compose^] has been said to be on the
verge of extinction for quite a while now, it still is a quick and convenient
way to experiment with data platforms that would otherwise be time-consuming
and difficult to set up.
-Apache Airflow provides a
https://airflow.apache.org/docs/apache-airflow/2.6.0/docker-compose.yaml[docker-compose.yaml^]
file. Our goal is to run Apache Hop workflows and pipelines in Apache Airflow,
so we're not interested in the Airflow sample DAGs that come with this
docker-compose file.
+Apache Airflow provides a
https://airflow.apache.org/docs/apache-airflow/2.9.1/docker-compose.yaml[docker-compose.yaml^]
file. Our goal is to run Apache Hop workflows and pipelines in Apache Airflow,
so we're not interested in the Airflow sample DAGs that come with this
docker-compose file.
Change the **AIRFLOW\__CORE__LOAD_EXAMPLES** variable to "false" in the
default file, and add an additional line
**/var/run/docker.sock:/var/run/docker.sock** in the volumes section.
All of this has already been done if you use
https://github.com/apache/hop/tree/master/docs/hop-user-manual/modules/ROOT/assets/files/how-to-guides/apache-airflow/docker-compose.yaml[the
file] in our github repository.
@@ -55,19 +55,19 @@ The various Apache Airflow need a couple of moments to
start. Once you see a cou
[source, bash]
----
-apache-airflow-airflow-triggerer-1 | [2023-05-07 07:50:08 +0000] [24] [INFO]
Booting worker with pid: 24
-apache-airflow-airflow-triggerer-1 | [2023-05-07 07:50:08 +0000] [25] [INFO]
Booting worker with pid: 25
+apache-airflow-airflow-triggerer-1 | [2024-05-07 07:50:08 +0000] [24] [INFO]
Booting worker with pid: 24
+apache-airflow-airflow-triggerer-1 | [2024-05-07 07:50:08 +0000] [25] [INFO]
Booting worker with pid: 25
apache-airflow-airflow-scheduler-1 | ____________ _____________
apache-airflow-airflow-scheduler-1 | ____ |__( )_________ __/__
/________ __
apache-airflow-airflow-scheduler-1 | ____ /| |_ /__ ___/_ /_ __ /_ __
\_ | /| / /
apache-airflow-airflow-scheduler-1 | ___ ___ | / _ / _ __/ _ / / /_/
/_ |/ |/ /
apache-airflow-airflow-scheduler-1 | _/_/ |_/_/ /_/ /_/ /_/
\____/____/|__/
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.601+0000]
{executor_loader.py:114} INFO - Loaded executor: CeleryExecutor
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.652+0000]
{scheduler_job_runner.py:823} INFO - Starting the scheduler
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.653+0000]
{scheduler_job_runner.py:830} INFO - Processing each file at most -1 times
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.657+0000]
{manager.py:165} INFO - Launched DagFileProcessorManager with pid: 34
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.658+0000]
{scheduler_job_runner.py:1576} INFO - Resetting orphaned tasks for active dag
runs
-apache-airflow-airflow-scheduler-1 | [2023-05-07T07:50:08.660+0000]
{settings.py:60} INFO - Configured default timezone Timezone('UTC')
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.601+0000]
{executor_loader.py:114} INFO - Loaded executor: CeleryExecutor
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.652+0000]
{scheduler_job_runner.py:823} INFO - Starting the scheduler
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.653+0000]
{scheduler_job_runner.py:830} INFO - Processing each file at most -1 times
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.657+0000]
{manager.py:165} INFO - Launched DagFileProcessorManager with pid: 34
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.658+0000]
{scheduler_job_runner.py:1576} INFO - Resetting orphaned tasks for active dag
runs
+apache-airflow-airflow-scheduler-1 | [2024-05-07T07:50:08.660+0000]
{settings.py:60} INFO - Configured default timezone Timezone('UTC')
----
Go to http://localhost:8080/home in your browser and log on with username
"airflow" and password "airflow".
@@ -171,7 +171,7 @@ with DAG('sample-pipeline', default_args=default_args,
schedule_interval=None, c
end_dag = DummyOperator(
task_id='end_dag'
)
- hop = DockerOperator(
+ hop = DockerOperator(
task_id='sample-pipeline',
# use the Apache Hop Docker image. Add your tags here in the default
apache/hop: syntax
image='apache/hop',
@@ -201,10 +201,12 @@ All it takes to deploy your dag is to put it in Airflow's
dags folder. Our docke
Save the DAG we just created in your dags folder as apache-hop-dag-simple.py.
After a short wait, your DAG will show up in the list of dags.
-If there are any syntax errors in your DAG, Airflow will let you know. Expand
the error dialog for more details about the error.
-
+If there are any syntax errors in your DAG, Airflow will let you know. Expand
the error dialog for more details about the error, as shown in the image below.
Don't worry, you shouldn't have any errors with the DAG we just created.
+
image:how-to-guides/run-hop-in-apache-airflow/apache-airflow-dag-error.png[Apache
Airflow - DAG error, width="45%"]
+If your DAG is deployed correctly (and it should be), you'll see it show up in the
list of available DAGs.
+
image:how-to-guides/run-hop-in-apache-airflow/apache-airflow-dag-available.png[Apache
Airflow - DAG available, width="75%"]
Click on the **sample-pipeline** DAG to see more details about it. From the
tab list at the top of the page, select "Code" to review the DAG you just
deployed, or "Graph" to see the graph representation of the DAG. This graph is
extremely simple, but we're exploring Apache Airflow, so that's intentional.
@@ -221,12 +223,12 @@
image:how-to-guides/run-hop-in-apache-airflow/apache-airflow-dag-logs.png[Apache
[source, bash]
----
-2023-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2023/05/07 13:54:39 - Ouput.0
- Finished processing (I=0, O=0, R=5, W=5, U=0, E=0)
-[2023-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2023/05/07 13:54:39 -
null-if-basic - Pipeline duration : 0.45 seconds [ 0.450 ]
-[2023-05-07, 13:54:39 UTC] {docker.py:391} INFO - HopRun exit.
-[2023-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2023/05/07 13:54:39 -
null-if-basic - Execution finished on a local pipeline engine with run
configuration 'local'
-[2023-05-07, 13:54:40 UTC] {taskinstance.py:1373} INFO - Marking task as
SUCCESS. dag_id=sample-pipeline, task_id=sample-pipeline,
execution_date=20230507T135409, start_date=20230507T135411,
end_date=20230507T135440
-[2023-05-07, 13:54:40 UTC] {local_task_job_runner.py:232} INFO - Task exited
with return code 0
+[2024-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2024/05/07 13:54:39 - Ouput.0
- Finished processing (I=0, O=0, R=5, W=5, U=0, E=0)
+[2024-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2024/05/07 13:54:39 -
null-if-basic - Pipeline duration : 0.45 seconds [ 0.450 ]
+[2024-05-07, 13:54:39 UTC] {docker.py:391} INFO - HopRun exit.
+[2024-05-07, 13:54:39 UTC] {docker.py:391} INFO - 2024/05/07 13:54:39 -
null-if-basic - Execution finished on a local pipeline engine with run
configuration 'local'
+[2024-05-07, 13:54:40 UTC] {taskinstance.py:1373} INFO - Marking task as
SUCCESS. dag_id=sample-pipeline, task_id=sample-pipeline,
execution_date=20240507T135409, start_date=20240507T135411,
end_date=20240507T135440
+[2024-05-07, 13:54:40 UTC] {local_task_job_runner.py:232} INFO - Task exited
with return code 0
----
When you return to the Airflow home screen, your DAG will now show green
circles for successful runs.
@@ -310,17 +312,17 @@ Your DAG logs will now show the environment variable and
the parameter we used i
[source, bash]
----
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 -
pipeline-with-parameter - Pipeline has allocated 5 threads and 4 rowsets.
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 -
generate 1 row.0 - Starting to run...
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 -
generate 1 row.0 - Finished processing (I=0, O=0, R=0, W=1, U=0, E=0)
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - get
${PRM_EXAMPLE}.0 - field [example] has value [EXAMPLE VALUE]
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - get
${PRM_EXAMPLE}.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - write
parameter to log.0 -
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - get
${ENV_VARIABLE}.0 - field [env_variable] has value [variable value]
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - write
env_variable to log.0 -
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - write
parameter to log.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - get
${ENV_VARIABLE}.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
-[2023-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2023/05/08 08:21:34 - write
env_variable to log.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 -
pipeline-with-parameter - Pipeline has allocated 5 threads and 4 rowsets.
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 -
generate 1 row.0 - Starting to run...
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 -
generate 1 row.0 - Finished processing (I=0, O=0, R=0, W=1, U=0, E=0)
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - get
${PRM_EXAMPLE}.0 - field [example] has value [EXAMPLE VALUE]
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - get
${PRM_EXAMPLE}.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - write
parameter to log.0 -
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - get
${ENV_VARIABLE}.0 - field [env_variable] has value [variable value]
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - write
env_variable to log.0 -
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - write
parameter to log.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - get
${ENV_VARIABLE}.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
+[2024-05-08, 08:21:34 UTC] {docker.py:391} INFO - 2024/05/08 08:21:34 - write
env_variable to log.0 - Finished processing (I=0, O=0, R=1, W=1, U=0, E=0)
----
== Scheduling a DAG in Apache Airflow