robinisme2 opened a new issue, #43295:
URL: https://github.com/apache/airflow/issues/43295
### Official Helm Chart version
1.13.1
### Apache Airflow version
2.8.3
### Kubernetes Version
1.30.4
### Helm Chart configuration
team: 'ds'
environment: 'uat'
ingress:
web:
enabled: true
hosts:
- airflow-ds-pub-uat-misc.uat.sportybet2.com
ingressClassName: "nginx-internal"
flower:
enabled: true
hosts:
- flower-ds-pub-uat-misc.uat.sportybet2.com
ingressClassName: "nginx-internal"
# createUserJob:
# useHelmHooks: false
# applyCustomEnv: false
# migrateDatabaseJob:
# useHelmHooks: false
# applyCustomEnv: false
# jobAnnotations:
# argocd.argoproj.io/hook: Sync
postgresql:
enabled: false
redis:
enabled: false
data:
# redis
brokerUrl: redis://bi-ds-airflow-t1.redis.pub.s.sportybet
metadataConnection:
user: airflow
pass: airflow
protocol: mysql
host: ds-airflow-t1.mysql.pub.s.sportybet
port: 3580
db: airflow
sslmode: disable
executor: 'CeleryExecutor'
images:
airflow:
repository: 942878658013.dkr.ecr.eu-central-1.amazonaws.com/third-party/patched-image
tag: apache-airflow-2.8.4-py3.10-all
flower:
repository: 942878658013.dkr.ecr.eu-central-1.amazonaws.com/third-party/patched-image
tag: apache-airflow-2.8.4-py3.10-all
scheduler:
podDisruptionBudget:
enabled: true
config:
maxUnavailable: null
minAvailable: 1
resources:
limits:
cpu: 2.5
memory: "4Gi"
requests:
cpu: 2
memory: "3Gi"
webserver:
podDisruptionBudget:
enabled: true
config:
maxUnavailable: null
minAvailable: 1
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::942878658013:role/sportybet-pub-uat-misc-ds-airflow-eu-central-1
resources:
limits:
cpu: 2
memory: "6Gi"
requests:
cpu: 1
memory: "4Gi"
triggerer:
persistence:
enabled: false
resources:
limits:
cpu: 2
memory: "3Gi"
requests:
cpu: 1
memory: "2Gi"
hpa:
enabled: true
minReplicaCount: 2
maxReplicaCount: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 80
workers:
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::942878658013:role/sportybet-pub-uat-misc-ds-airflow-eu-central-1
persistence:
enabled: true
size: 10Gi
resources:
limits:
cpu: 2
memory: "8Gi"
requests:
cpu: 2
memory: "6Gi"
hpa:
enabled: true
minReplicaCount: 2
maxReplicaCount: 5
metrics:
- type: ContainerResource
containerResource:
name: cpu
container: worker
target:
type: Utilization
averageUtilization: 60
extraVolumes:
- name: airflow-data
persistentVolumeClaim:
claimName: "nfs-pvc-airflow-ds-data"
extraVolumeMounts:
- name: airflow-data
mountPath: /opt/airflow/data/
readOnly: false
logs:
# Configuration for empty dir volume (if logs.persistence.enabled == false)
# emptyDirConfig:
# sizeLimit: 1Gi
# medium: Memory
persistence:
# Enable persistent volume for storing logs
enabled: true
# Volume size for logs
size: 10Gi
# Annotations for the logs PVC
annotations: {}
# If using a custom storageClass, pass name here
storageClassName:
## the name of an existing PVC to use
existingClaim: nfs-pvc-airflow-ds-logs
flower:
enabled: true
config:
logging:
remote_logging: 'False'
logging_level: 'INFO'
remote_base_log_folder: 's3://sportybet-pub-airflow/uat/ds/logs'  # Specify the S3 bucket used for logging
remote_log_conn_id: 'aws_s3_logs'  # Notice that this name is used in Step 3 for creating connections through the Airflow UI
delete_worker_pods: 'False'
encrypt_s3_logs: 'True'
dags:
persistence:
enabled: false
existingClaim: nfs-pvc-airflow-ds-dags
gitSync:
enabled: true
repo: https://github.com/opennetltd/data_science.git
ref: uat
subPath: ""
credentialsSecret: git-credentials
depth: 1
verbose: 9
statsd:
overrideMappings:
# Airflow StatsD metrics mappings (https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/metrics.html)
# === Counters ===
- match: "(.+)\\.(.+)_start$"
match_metric_type: counter
name: "airflow_agg_job_start"
match_type: regex
labels:
airflow_id: "$1"
job_name: "$2"
- match: "(.+)\\.(.+)_end$"
match_metric_type: counter
name: "airflow_agg_job_end"
match_type: regex
labels:
airflow_id: "$1"
job_name: "$2"
- match: "(.+)\\.operator_failures_(.+)$"
match_metric_type: counter
name: "airflow_agg_operator_failures"
match_type: regex
labels:
airflow_id: "$1"
operator_name: "$2"
- match: "(.+)\\.operator_successes_(.+)$"
match_metric_type: counter
name: "airflow_agg_operator_successes"
match_type: regex
labels:
airflow_id: "$1"
operator_name: "$2"
- match: "*.ti_failures"
match_metric_type: counter
name: "airflow_agg_ti_failures"
labels:
airflow_id: "$1"
- match: "*.ti_successes"
match_metric_type: counter
name: "airflow_agg_ti_successes"
labels:
airflow_id: "$1"
- match: "*.zombies_killed"
match_metric_type: counter
name: "airflow_agg_zombies_killed"
labels:
airflow_id: "$1"
- match: "*.scheduler_heartbeat"
match_metric_type: counter
name: "airflow_agg_scheduler_heartbeat"
labels:
airflow_id: "$1"
- match: "*.dag_processing.processes"
match_metric_type: counter
name: "airflow_agg_dag_processing_processes"
labels:
airflow_id: "$1"
- match: "*.scheduler.tasks.killed_externally"
match_metric_type: counter
name: "airflow_agg_scheduler_tasks_killed_externally"
labels:
airflow_id: "$1"
- match: "*.scheduler.tasks.running"
match_metric_type: counter
name: "airflow_agg_scheduler_tasks_running"
labels:
airflow_id: "$1"
- match: "*.scheduler.tasks.starving"
match_metric_type: counter
name: "airflow_agg_scheduler_tasks_starving"
labels:
airflow_id: "$1"
- match: "*.scheduler.orphaned_tasks.cleared"
match_metric_type: counter
name: "airflow_agg_scheduler_orphaned_tasks_cleared"
labels:
airflow_id: "$1"
- match: "*.scheduler.orphaned_tasks.adopted"
match_metric_type: counter
name: "airflow_agg_scheduler_orphaned_tasks_adopted"
labels:
airflow_id: "$1"
- match: "*.scheduler.critical_section_busy"
match_metric_type: counter
name: "airflow_agg_scheduler_critical_section_busy"
labels:
airflow_id: "$1"
- match: "*.sla_email_notification_failure"
match_metric_type: counter
name: "airflow_agg_sla_email_notification_failure"
labels:
airflow_id: "$1"
- match: "*.ti.start.*.*"
match_metric_type: counter
name: "airflow_agg_ti_start"
labels:
airflow_id: "$1"
dag_id: "$2"
task_id: "$3"
- match: "*.ti.finish.*.*.*"
match_metric_type: counter
name: "airflow_agg_ti_finish"
labels:
airflow_id: "$1"
dag_id: "$2"
task_id: "$3"
state: "$4"
- match: "*.dag.callback_exceptions"
match_metric_type: counter
name: "airflow_agg_dag_callback_exceptions"
labels:
airflow_id: "$1"
- match: "*.celery.task_timeout_error"
match_metric_type: counter
name: "airflow_agg_celery_task_timeout_error"
labels:
airflow_id: "$1"
# === Gauges ===
- match: "*.dagbag_size"
match_metric_type: gauge
name: "airflow_agg_dagbag_size"
labels:
airflow_id: "$1"
- match: "*.dag_processing.import_errors"
match_metric_type: gauge
name: "airflow_agg_dag_processing_import_errors"
labels:
airflow_id: "$1"
- match: "*.dag_processing.total_parse_time"
match_metric_type: gauge
name: "airflow_agg_dag_processing_total_parse_time"
labels:
airflow_id: "$1"
- match: "*.dag_processing.last_runtime.*"
match_metric_type: gauge
name: "airflow_agg_dag_processing_last_runtime"
labels:
airflow_id: "$1"
dag_file: "$2"
- match: "*.dag_processing.last_run.seconds_ago.*"
match_metric_type: gauge
name: "airflow_agg_dag_processing_last_run_seconds"
labels:
airflow_id: "$1"
dag_file: "$2"
- match: "*.dag_processing.processor_timeouts"
match_metric_type: gauge
name: "airflow_agg_dag_processing_processor_timeouts"
labels:
airflow_id: "$1"
- match: "*.executor.open_slots"
match_metric_type: gauge
name: "airflow_agg_executor_open_slots"
labels:
airflow_id: "$1"
- match: "*.executor.queued_tasks"
match_metric_type: gauge
name: "airflow_agg_executor_queued_tasks"
labels:
airflow_id: "$1"
- match: "*.executor.running_tasks"
match_metric_type: gauge
name: "airflow_agg_executor_running_tasks"
labels:
airflow_id: "$1"
- match: "*.pool.open_slots.*"
match_metric_type: gauge
name: "airflow_agg_pool_open_slots"
labels:
airflow_id: "$1"
pool_name: "$2"
- match: "*.pool.queued_slots.*"
match_metric_type: gauge
name: "airflow_agg_pool_queued_slots"
labels:
airflow_id: "$1"
pool_name: "$2"
- match: "*.pool.running_slots.*"
match_metric_type: gauge
name: "airflow_agg_pool_running_slots"
labels:
airflow_id: "$1"
pool_name: "$2"
- match: "*.pool.starving_tasks.*"
match_metric_type: gauge
name: "airflow_agg_pool_starving_tasks"
labels:
airflow_id: "$1"
pool_name: "$2"
- match: "*.smart_sensor_operator.poked_tasks"
match_metric_type: gauge
name: "airflow_agg_smart_sensor_operator_poked_tasks"
labels:
airflow_id: "$1"
- match: "*.smart_sensor_operator.poked_success"
match_metric_type: gauge
name: "airflow_agg_smart_sensor_operator_poked_success"
labels:
airflow_id: "$1"
- match: "*.smart_sensor_operator.poked_exception"
match_metric_type: gauge
name: "airflow_agg_smart_sensor_operator_poked_exception"
labels:
airflow_id: "$1"
- match: "*.smart_sensor_operator.exception_failures"
match_metric_type: gauge
name: "airflow_agg_smart_sensor_operator_exception_failures"
labels:
airflow_id: "$1"
- match: "*.smart_sensor_operator.infra_failures"
match_metric_type: gauge
name: "airflow_agg_smart_sensor_operator_infra_failures"
labels:
airflow_id: "$1"
# === Timers ===
- match: "*.dagrun.dependency-check.*"
match_metric_type: observer
name: "airflow_agg_dagrun_dependency_check"
labels:
airflow_id: "$1"
dag_id: "$2"
- match: "*.dag.*.*.duration"
match_metric_type: observer
name: "airflow_agg_dag_task_duration"
labels:
airflow_id: "$1"
dag_id: "$2"
task_id: "$3"
- match: "*.dag.*.*.*.duration"
match_metric_type: observer
name: "airflow_agg_dag_group_task_duration"
labels:
airflow_id: "$1"
dag_id: "$2"
group_id: "$3"
task_id: "$4"
- match: "*.dag_processing.last_duration.*"
match_metric_type: observer
name: "airflow_agg_dag_processing_duration"
labels:
airflow_id: "$1"
dag_file: "$2"
- match: "*.dagrun.duration.success.*"
match_metric_type: observer
name: "airflow_agg_dagrun_duration_success"
labels:
airflow_id: "$1"
dag_id: "$2"
- match: "*.dagrun.duration.failed.*"
match_metric_type: observer
name: "airflow_agg_dagrun_duration_failed"
labels:
airflow_id: "$1"
dag_id: "$2"
- match: "*.dagrun.schedule_delay.*"
match_metric_type: observer
name: "airflow_agg_dagrun_schedule_delay"
labels:
airflow_id: "$1"
dag_id: "$2"
- match: "*.scheduler.critical_section_duration"
match_metric_type: observer
name: "airflow_agg_scheduler_critical_section_duration"
labels:
airflow_id: "$1"
- match: "*.dagrun.*.first_task_scheduling_delay"
match_metric_type: observer
name: "airflow_agg_dagrun_first_task_scheduling_delay"
labels:
airflow_id: "$1"
dag_id: "$2"
extraEnv: |
- name: AIRFLOW__CORE__LOAD_EXAMPLES
value: 'False'
### Docker Image customizations
No
### What happened
I deployed an Airflow application via the Helm chart on my Kubernetes cluster, and I
found that the worker and webserver pods restart every day at the same time for no
apparent reason. There is nothing unusual in the logs, and it is not a liveness-probe
issue either. Did anyone encounter the same issue? Thanks.
### What you think should happen instead
_No response_
### How to reproduce
It happens every day at the same time.
### Anything else
_No response_
### Are you willing to submit PR?
- [ ] Yes I am willing to submit a PR!
### Code of Conduct
- [X] I agree to follow this project's [Code of
Conduct](https://github.com/apache/airflow/blob/main/CODE_OF_CONDUCT.md)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]