robinisme2 opened a new issue, #43295:
URL: https://github.com/apache/airflow/issues/43295

   ### Official Helm Chart version
   
   1.13.1
   
   ### Apache Airflow version
   
   2.8.3
   
   ### Kubernetes Version
   
   1.30.4
   
   ### Helm Chart configuration
   
   team: 'ds'
   environment: 'uat'
   ingress:
     web:
       enabled: true
       hosts:
         - airflow-ds-pub-uat-misc.uat.sportybet2.com
       ingressClassName: "nginx-internal"
     flower:
       enabled: true
       hosts:
         - flower-ds-pub-uat-misc.uat.sportybet2.com
       ingressClassName: "nginx-internal"
   
   
   # createUserJob:
   #   useHelmHooks: false
   #   applyCustomEnv: false
   # migrateDatabaseJob:
   #   useHelmHooks: false
   #   applyCustomEnv: false
   #   jobAnnotations:
   #     argocd.argoproj.io/hook: Sync
   
   postgresql:
     enabled: false
   
   redis:
     enabled: false
   
   data:
     # redis
     brokerUrl: redis://bi-ds-airflow-t1.redis.pub.s.sportybet
     metadataConnection:
       user: airflow
       pass: airflow
       protocol: mysql
       host: ds-airflow-t1.mysql.pub.s.sportybet
       port: 3580
       db: airflow
       sslmode: disable
   
   executor: 'CeleryExecutor'
   
   images:
     airflow:
        repository: 942878658013.dkr.ecr.eu-central-1.amazonaws.com/third-party/patched-image
       tag: apache-airflow-2.8.4-py3.10-all
     flower:
        repository: 942878658013.dkr.ecr.eu-central-1.amazonaws.com/third-party/patched-image
       tag: apache-airflow-2.8.4-py3.10-all
   
   scheduler:
     podDisruptionBudget:
       enabled: true
       config:
         maxUnavailable: null
         minAvailable: 1
     resources:
       limits:
         cpu: 2.5
         memory: "4Gi"
       requests:
         cpu: 2
         memory: "3Gi"
   
   webserver:
     podDisruptionBudget:
       enabled: true
       config:
         maxUnavailable: null
         minAvailable: 1
     serviceAccount:
       annotations:
          eks.amazonaws.com/role-arn: arn:aws:iam::942878658013:role/sportybet-pub-uat-misc-ds-airflow-eu-central-1
     resources:
       limits:
         cpu: 2
         memory: "6Gi"
       requests:
         cpu: 1
         memory: "4Gi"
   
   triggerer:
     persistence:
       enabled: false
     resources:
       limits:
         cpu: 2
         memory: "3Gi"
       requests:
         cpu: 1
         memory: "2Gi"
     hpa:
       enabled: true
       minReplicaCount: 2
       maxReplicaCount: 5
       metrics:
         - type: Resource
           resource:
             name: cpu
             target:
               type: Utilization
               averageUtilization: 80
   
   workers:
     serviceAccount:
       annotations:
          eks.amazonaws.com/role-arn: arn:aws:iam::942878658013:role/sportybet-pub-uat-misc-ds-airflow-eu-central-1
     persistence:
       enabled: true
       size: 10Gi
     resources:
       limits:
         cpu: 2
         memory: "8Gi"
       requests:
         cpu: 2
         memory: "6Gi"
     hpa:
       enabled: true
       minReplicaCount: 2
       maxReplicaCount: 5
       metrics:
         - type: ContainerResource
           containerResource:
             name: cpu
             container: worker
             target:
               type: Utilization
               averageUtilization: 60
   
     extraVolumes:
       - name: airflow-data
         persistentVolumeClaim:
           claimName: "nfs-pvc-airflow-ds-data"
   
     extraVolumeMounts:
       - name: airflow-data
         mountPath: /opt/airflow/data/
         readOnly: false
   
   logs:
     # Configuration for empty dir volume (if logs.persistence.enabled == false)
     # emptyDirConfig:
     #   sizeLimit: 1Gi
     #   medium: Memory
   
     persistence:
       # Enable persistent volume for storing logs
       enabled: true
       # Volume size for logs
       size: 10Gi
       # Annotations for the logs PVC
       annotations: {}
       # If using a custom storageClass, pass name here
       storageClassName:
       ## the name of an existing PVC to use
       existingClaim: nfs-pvc-airflow-ds-logs
   
   flower:
     enabled: true
   
   config:
     logging:
       remote_logging: 'False'
       logging_level: 'INFO'
        remote_base_log_folder: 's3://sportybet-pub-airflow/uat/ds/logs'  # Specify the S3 bucket used for logging
        remote_log_conn_id: 'aws_s3_logs'  # Note that this name is used in Step 3 for creating connections through the Airflow UI
       delete_worker_pods: 'False'
       encrypt_s3_logs: 'True'
   
   dags:
     persistence:
       enabled: false
       existingClaim: nfs-pvc-airflow-ds-dags
     gitSync:
       enabled: true
       repo: https://github.com/opennetltd/data_science.git
       ref: uat
       subPath: ""
       credentialsSecret: git-credentials
       depth: 1
       verbose: 9
   
   statsd:
     overrideMappings:
        # Airflow StatsD metrics mappings (https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/metrics.html)
       # === Counters ===
       - match: "(.+)\\.(.+)_start$"
         match_metric_type: counter
         name: "airflow_agg_job_start"
         match_type: regex
         labels:
           airflow_id: "$1"
           job_name: "$2"
       - match: "(.+)\\.(.+)_end$"
         match_metric_type: counter
         name: "airflow_agg_job_end"
         match_type: regex
         labels:
           airflow_id: "$1"
           job_name: "$2"
       - match: "(.+)\\.operator_failures_(.+)$"
         match_metric_type: counter
         name: "airflow_agg_operator_failures"
         match_type: regex
         labels:
           airflow_id: "$1"
           operator_name: "$2"
       - match: "(.+)\\.operator_successes_(.+)$"
         match_metric_type: counter
         name: "airflow_agg_operator_successes"
         match_type: regex
         labels:
           airflow_id: "$1"
           operator_name: "$2"
       - match: "*.ti_failures"
         match_metric_type: counter
         name: "airflow_agg_ti_failures"
         labels:
           airflow_id: "$1"
       - match: "*.ti_successes"
         match_metric_type: counter
         name: "airflow_agg_ti_successes"
         labels:
           airflow_id: "$1"
       - match: "*.zombies_killed"
         match_metric_type: counter
         name: "airflow_agg_zombies_killed"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler_heartbeat"
         match_metric_type: counter
         name: "airflow_agg_scheduler_heartbeat"
         labels:
           airflow_id: "$1"
       - match: "*.dag_processing.processes"
         match_metric_type: counter
         name: "airflow_agg_dag_processing_processes"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.tasks.killed_externally"
         match_metric_type: counter
         name: "airflow_agg_scheduler_tasks_killed_externally"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.tasks.running"
         match_metric_type: counter
         name: "airflow_agg_scheduler_tasks_running"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.tasks.starving"
         match_metric_type: counter
         name: "airflow_agg_scheduler_tasks_starving"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.orphaned_tasks.cleared"
         match_metric_type: counter
         name: "airflow_agg_scheduler_orphaned_tasks_cleared"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.orphaned_tasks.adopted"
         match_metric_type: counter
         name: "airflow_agg_scheduler_orphaned_tasks_adopted"
         labels:
           airflow_id: "$1"
       - match: "*.scheduler.critical_section_busy"
         match_metric_type: counter
         name: "airflow_agg_scheduler_critical_section_busy"
         labels:
           airflow_id: "$1"
       - match: "*.sla_email_notification_failure"
         match_metric_type: counter
         name: "airflow_agg_sla_email_notification_failure"
         labels:
           airflow_id: "$1"
       - match: "*.ti.start.*.*"
         match_metric_type: counter
         name: "airflow_agg_ti_start"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
           task_id: "$3"
       - match: "*.ti.finish.*.*.*"
         match_metric_type: counter
         name: "airflow_agg_ti_finish"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
           task_id: "$3"
           state: "$4"
       - match: "*.dag.callback_exceptions"
         match_metric_type: counter
         name: "airflow_agg_dag_callback_exceptions"
         labels:
           airflow_id: "$1"
       - match: "*.celery.task_timeout_error"
         match_metric_type: counter
         name: "airflow_agg_celery_task_timeout_error"
         labels:
           airflow_id: "$1"
   
       # === Gauges ===
       - match: "*.dagbag_size"
         match_metric_type: gauge
         name: "airflow_agg_dagbag_size"
         labels:
           airflow_id: "$1"
       - match: "*.dag_processing.import_errors"
         match_metric_type: gauge
         name: "airflow_agg_dag_processing_import_errors"
         labels:
           airflow_id: "$1"
       - match: "*.dag_processing.total_parse_time"
         match_metric_type: gauge
         name: "airflow_agg_dag_processing_total_parse_time"
         labels:
           airflow_id: "$1"
       - match: "*.dag_processing.last_runtime.*"
         match_metric_type: gauge
         name: "airflow_agg_dag_processing_last_runtime"
         labels:
           airflow_id: "$1"
           dag_file: "$2"
       - match: "*.dag_processing.last_run.seconds_ago.*"
         match_metric_type: gauge
         name: "airflow_agg_dag_processing_last_run_seconds"
         labels:
           airflow_id: "$1"
           dag_file: "$2"
       - match: "*.dag_processing.processor_timeouts"
         match_metric_type: gauge
         name: "airflow_agg_dag_processing_processor_timeouts"
         labels:
           airflow_id: "$1"
       - match: "*.executor.open_slots"
         match_metric_type: gauge
         name: "airflow_agg_executor_open_slots"
         labels:
           airflow_id: "$1"
       - match: "*.executor.queued_tasks"
         match_metric_type: gauge
         name: "airflow_agg_executor_queued_tasks"
         labels:
           airflow_id: "$1"
       - match: "*.executor.running_tasks"
         match_metric_type: gauge
         name: "airflow_agg_executor_running_tasks"
         labels:
           airflow_id: "$1"
       - match: "*.pool.open_slots.*"
         match_metric_type: gauge
         name: "airflow_agg_pool_open_slots"
         labels:
           airflow_id: "$1"
           pool_name: "$2"
       - match: "*.pool.queued_slots.*"
         match_metric_type: gauge
         name: "airflow_agg_pool_queued_slots"
         labels:
           airflow_id: "$1"
           pool_name: "$2"
       - match: "*.pool.running_slots.*"
         match_metric_type: gauge
         name: "airflow_agg_pool_running_slots"
         labels:
           airflow_id: "$1"
           pool_name: "$2"
       - match: "*.pool.starving_tasks.*"
         match_metric_type: gauge
         name: "airflow_agg_pool_starving_tasks"
         labels:
           airflow_id: "$1"
           pool_name: "$2"
       - match: "*.smart_sensor_operator.poked_tasks"
         match_metric_type: gauge
         name: "airflow_agg_smart_sensor_operator_poked_tasks"
         labels:
           airflow_id: "$1"
       - match: "*.smart_sensor_operator.poked_success"
         match_metric_type: gauge
         name: "airflow_agg_smart_sensor_operator_poked_success"
         labels:
           airflow_id: "$1"
       - match: "*.smart_sensor_operator.poked_exception"
         match_metric_type: gauge
         name: "airflow_agg_smart_sensor_operator_poked_exception"
         labels:
           airflow_id: "$1"
       - match: "*.smart_sensor_operator.exception_failures"
         match_metric_type: gauge
         name: "airflow_agg_smart_sensor_operator_exception_failures"
         labels:
           airflow_id: "$1"
       - match: "*.smart_sensor_operator.infra_failures"
         match_metric_type: gauge
         name: "airflow_agg_smart_sensor_operator_infra_failures"
         labels:
           airflow_id: "$1"
   
       # === Timers ===
       - match: "*.dagrun.dependency-check.*"
         match_metric_type: observer
         name: "airflow_agg_dagrun_dependency_check"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
       - match: "*.dag.*.*.duration"
         match_metric_type: observer
         name: "airflow_agg_dag_task_duration"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
           task_id: "$3"
       - match: "*.dag.*.*.*.duration"
         match_metric_type: observer
         name: "airflow_agg_dag_group_task_duration"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
           group_id: "$3"
           task_id: "$4"
       - match: "*.dag_processing.last_duration.*"
         match_metric_type: observer
         name: "airflow_agg_dag_processing_duration"
         labels:
           airflow_id: "$1"
           dag_file: "$2"
       - match: "*.dagrun.duration.success.*"
         match_metric_type: observer
         name: "airflow_agg_dagrun_duration_success"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
       - match: "*.dagrun.duration.failed.*"
         match_metric_type: observer
         name: "airflow_agg_dagrun_duration_failed"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
       - match: "*.dagrun.schedule_delay.*"
         match_metric_type: observer
         name: "airflow_agg_dagrun_schedule_delay"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
       - match: "*.scheduler.critical_section_duration"
         match_metric_type: observer
         name: "airflow_agg_scheduler_critical_section_duration"
         labels:
           airflow_id: "$1"
       - match: "*.dagrun.*.first_task_scheduling_delay"
         match_metric_type: observer
         name: "airflow_agg_dagrun_first_task_scheduling_delay"
         labels:
           airflow_id: "$1"
           dag_id: "$2"
   
   extraEnv: |
     - name: AIRFLOW__CORE__LOAD_EXAMPLES
       value: 'False'
   
   ### Docker Image customizations
   
   No
   
   ### What happened
   
   I created an Airflow application with the Helm chart on my k8s cluster, and I found that the worker and webserver pods restart every day at the same time for no apparent reason. There is nothing special in the logs, and it is not a liveness-probe issue either. Did anyone encounter the same issue? Thanks.
   
   ### What you think should happen instead
   
   _No response_
   
   ### How to reproduce
   
   It happens every day at the same time.
   
   ### Anything else
   
   _No response_
   
   ### Are you willing to submit PR?
   
   - [ ] Yes I am willing to submit a PR!
   
   ### Code of Conduct
   
   - [X] I agree to follow this project's [Code of 
Conduct](https://github.com/apache/airflow/blob/main/CODE_OF_CONDUCT.md)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to