Hello. Here are my alert rules for node_exporter. Note that the node_memory_MemAvailable_bytes metric is absent on old kernels, so you need an additional recording rule to calculate it if it is absent. The noAlarmOn matchers are a convention I use to mute a given alert family on a particular host: put the matching keyword (node, raid, cpu, space, memory, ...) into a comma-separated noAlarmOn label on that target.
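If the metric is absent, you can approximate it with a recording rule. A minimal sketch, assuming current node_exporter metric names; it ignores SReclaimable/Shmem, so it is only a rough estimate, and the "unless" keeps it from duplicating hosts that already export the real metric:

groups:
- name: 'MemAvailable fallback'
  rules:
  # rough approximation of MemAvailable for kernels that do not export it (pre-3.14)
  - record: node_memory_MemAvailable_bytes
    expr: (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) unless node_memory_MemAvailable_bytes

The alert rules themselves: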
groups:
- name: 'Node exporter alerts'
  rules:
  - alert: 'Node_exporter_service'
    # simple variant: expr: up{job="node_exporter"} == 0
    expr: avg_over_time(up{job="node_exporter",noAlarmOn!~"(.*,|^)node(,.*|$)"}[15m]) < 0.8
    for: 15m
    labels:
      severity: 'critical'
    annotations:
      title: "Node Exporter seems down"
      description: 'Node exporter does not return any metrics!'

- name: 'Node exporter Hardware alerts'
  rules:
  - alert: 'MDRAID degraded'
    expr: (node_md_disks{job="node_exporter", noAlarmOn!~"(.*,|^)raid(,.*|$)" } - node_md_disks_active{job="node_exporter"}) != 0
    for: 1m
    labels:
      severity: 'warning'
    annotations:
      title: "MDRAID status"
      description: '{{ if ne ($value | humanize) "0" }} Degraded {{ else }} Healthy {{ end }} RAID array {{ $labels.device }} on {{ $labels.instance }}. {{ if ne ($value | humanize) "0" }} Some disks failed. {{ end }} node_md_disks{job="{{ $labels.job }}",instance="{{ $labels.instance }}"} - node_md_disks_active{job="{{ $labels.job }}",instance="{{ $labels.instance }}"} = {{ $value | humanize }}'

  - alert: "Bond degraded"
    expr: (node_bonding_active{ noAlarmOn!~"(.*,|^)bond(,.*|$)" } - node_bonding_slaves{job="node_exporter"}) != 0
    for: 1m
    labels:
      severity: 'warning'
    annotations:
      title: "Bonding status"
      description: 'Bond {{ $labels.master }} is {{ if ne ($value | humanize) "0" }} degraded {{ else }} healthy {{ end }} on {{ $labels.instance }}. node_bonding_active - node_bonding_slaves = {{ $value | humanize }}'

- name: "Node exporter OS alerts"
  rules:
  - alert: "CPU Load linux"
    expr: node_load15{job="node_exporter", noAlarmOn!~"(.*,|^)cpu(,.*|$)" } / (count without (cpu, mode) (node_cpu_seconds_total{job="node_exporter", mode="system"})) > 1.5
    for: 30m
    labels:
      severity: 'critical'
    annotations:
      title: "Avg load15m > 1.5*cpu"
      description: 'Avg load15m is {{ $value | printf "%2.3f" }} times the CPU count on {{ $labels.ip }} ({{ $labels.host }})'

  - alert: "CPU Load linux"
    expr: node_load15{job="node_exporter", noAlarmOn!~"(.*,|^)cpu(,.*|$)" } / (count without (cpu, mode) (node_cpu_seconds_total{job="node_exporter", mode="system"})) > 1.2
    for: 30m
    labels:
      severity: 'warning'
    annotations:
      title: "Avg load15m > 1.2*cpu"
      description: 'Avg load15m is {{ $value | printf "%2.3f" }} times the CPU count on {{ $labels.ip }} ({{ $labels.host }})'

  - alert: "DiskSpaceUsage"
    expr: (node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)space(,.*|$)" } / node_filesystem_size_bytes{job="node_exporter",fstype=~"(ext.|xfs|zfs)"} * 100) < 7
    for: 1m
    labels:
      severity: 'critical'
    annotations:
      title: "Disk Space Usage"
      description: 'Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} on {{ $labels.ip }} has {{ $value | printf "%2.3f" }}% free space'

  - alert: "DiskSpaceUsage"
    expr: (node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)space(,.*|$)" } / node_filesystem_size_bytes{job="node_exporter",fstype=~"(ext.|xfs|zfs)"} * 100) < 10
    for: 10m
    labels:
      severity: 'warning'
    annotations:
      title: "Disk Space Usage"
      description: 'Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} on {{ $labels.ip }} has {{ $value | printf "%2.3f" }}% free space'

  - alert: "DiskSpaceUsage"
    expr: predict_linear(node_filesystem_free_bytes{job="node_exporter", fstype=~"(ext.|xfs|zfs)", noAlarmOn!~"(.*,|^)predict(,.*|$)", noAlarmOn!~"(.*,|^)space(,.*|$)",owner!="platforms"}[1h], 4*3600) < 0
    for: 5m
    labels:
      severity: 'predict'
    annotations:
      title: "Disk Space Usage on {{ $labels.ip }}: prediction for the next 4 hours"
      description: "Device {{ $labels.device }} mounted on {{ $labels.mountpoint }} on {{ $labels.ip }} is predicted to have {{ $value | humanize1024 }}B of free space in 4 hours."

  - alert: "Inode Usage"
    expr: ( ( node_filesystem_files_free{job="node_exporter", fstype=~"(ext.|xfs)"} / node_filesystem_files{job="node_exporter", fstype=~"(ext.|xfs)", noAlarmOn!~"(.*,|^)space(,.*|$)"} ) * 100 ) <= 20
    for: 15m
    labels:
      severity: 'warning'
    annotations:
      title: "Inode usage"
      description: 'Device {{ $labels.device }} mounted as {{ $labels.mountpoint }} on {{ $labels.ip }} has only {{ $value | printf "%2.3f" }}% of inodes left. {{ if le $value 20.0 }} Remove unused files and investigate the cause of their appearance. Run "find {{ $labels.mountpoint }} -mount" {{end}} '

  - alert: "MemAvailable"
    expr: predict_linear(node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)",owner!~"(test|^)platforms"}[2h], 1*3600) <= 0
    for: 15m
    labels:
      severity: 'predict'
    annotations:
      title: "No memory available predicted within the next hour"
      description: 'On {{ $labels.ip }} MemAvailable is predicted to drop to zero within 1 hour'

  - alert: "MemAvailable"
    expr: (node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)" } / node_memory_MemTotal_bytes{job="node_exporter"} * 100) < 5 and rate(node_vmstat_pgmajfault{job="node_exporter"}[3m]) > 100
    for: 5m
    labels:
      severity: 'critical'
    annotations:
      title: "Memory Available"
      description: "{{ with printf \"node_memory_MemAvailable_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Available,{{end}} {{ with printf \"node_memory_MemTotal_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Total{{end}} ( {{ $value | humanize }}% Available) {{ with printf \"rate(node_vmstat_pgmajfault{instance='%s',job='%s'}[3m])\" .Labels.instance .Labels.job | query }}, scanrate is {{ . | first | value | printf \"%2.4f\" }} pgmajfaults/sec{{end}}"

  - alert: "SwapUsed"
    expr: 100 * ( node_memory_SwapTotal_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)"} - node_memory_SwapFree_bytes{} - node_memory_SwapCached_bytes{} ) / node_memory_SwapTotal_bytes{} > 75
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Swap Used more than 75%"
      description: 'Swap usage is {{ $value | printf "%2.2f" }}%'

  - alert: "DiskIO_read_Time"
    expr: rate(node_disk_read_time_seconds_total{noAlarmOn!~"(.*,|^)diskwait(,.*|$)"}[10m]) / rate(node_disk_reads_completed_total[10m]) * 1000 > 300
    for: 1h
    labels:
      severity: 'warning'
    annotations:
      title: "Disk read wait time is high (>300ms)"
      description: 'Device {{ $labels.device }} on {{ $labels.ip }} has a read wait time of {{ $value | printf "%1.1f" }}ms.'

  - alert: "DiskIO_write_Time"
    expr: rate(node_disk_write_time_seconds_total{noAlarmOn!~"(.*,|^)diskwait(,.*|$)"}[10m]) / rate(node_disk_writes_completed_total[10m]) * 1000 > 300
    for: 1h
    labels:
      severity: 'warning'
    annotations:
      title: "Disk write wait time is high (>300ms)"
      description: 'Device {{ $labels.device }} on {{ $labels.ip }} has a write wait time of {{ $value | printf "%1.1f" }}ms.'

  - alert: "DiskIO%b"
    expr: rate(node_disk_io_time_seconds_total[30m])*100 > 95
    for: 10h
    labels:
      severity: 'warning'
    annotations:
      title: "Disk %busy is high for 10 hours"
      description: 'Device {{ $labels.device }} on {{ $labels.ip }} is {{ $value | printf "%2.3f" }}% busy.'

  - alert: "MemAvailable"
    expr: (node_memory_MemAvailable_bytes{job="node_exporter", noAlarmOn!~"(.*,|^)memory(,.*|$)" } / node_memory_MemTotal_bytes{job="node_exporter"} * 100) < 5
    for: 15m
    labels:
      severity: 'warning'
    annotations:
      title: "Memory Available"
      description: "{{ with printf \"node_memory_MemAvailable_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Available,{{end}} {{ with printf \"node_memory_MemTotal_bytes{instance='%s',job='%s'}\" .Labels.instance .Labels.job | query }}{{ . | first | value | humanize1024 }}Bytes Total{{end}} ( {{ $value | humanize }}% Available) {{ with printf \"rate(node_vmstat_pgmajfault{instance='%s',job='%s'}[3m])\" .Labels.instance .Labels.job | query }}, scanrate is {{ . | first | value | printf \"%2.4f\" }} pgmajfaults/sec{{end}}"

  - alert: "ProcessNearFDLimits"
    expr: process_open_fds{job="node_exporter"} / process_max_fds{job="node_exporter"} > 0.8
    for: 10m
    labels:
      severity: 'warning'
    annotations:
      title: "Process file descriptors near limit"
      description: "On {{ $labels.ip }} process_open_fds / process_max_fds is about {{ $value | humanize }}"

  - alert: "ProcessNearFDLimits"
    expr: process_open_fds{job="node_exporter"} / process_max_fds{job="node_exporter"} > 0.9
    for: 10m
    labels:
      severity: 'critical'
    annotations:
      title: "Process file descriptors near limit"
      description: "On {{ $labels.ip }} process_open_fds / process_max_fds is about {{ $value | humanize }}"

  - alert: "Uptime"
    expr: time() - node_boot_time_seconds{job="node_exporter"} < 3600
    for: 1m
    labels:
      severity: warning
    annotations:
      title: "System rebooted"
      description: "Host {{ $labels.instance }} uptime is less than 1 hour. \n VALUE = {{ $value | humanizeDuration }}"

  - alert: "NetworkTransmitDrops"
    expr: 100 * irate(node_network_transmit_drop_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) / irate(node_network_transmit_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      title: "Network Transmit Drops"
      description: 'Host {{ $labels.instance }} drops {{ $value | printf "%2.6f" }}% of transmitted packets on NIC {{ $labels.device }}'

  - alert: "NetworkReceiveErrs"
    expr: 100 * irate(node_network_receive_errs_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) / irate(node_network_receive_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      title: "Network Receive Errs"
      description: 'Host {{ $labels.instance }} has errors in {{ $value | printf "%2.6f" }}% of received packets on NIC {{ $labels.device }}'

  - alert: "NetworkTransmitColls"
    expr: 100 * irate(node_network_transmit_colls_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) / (irate(node_network_transmit_colls_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m]) + irate(node_network_transmit_packets_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[30m])) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      title: "Network Transmit Collisions"
      description: 'Host {{ $labels.instance }} has collisions in {{ $value | printf "%2.6f" }}% of transmitted packets on NIC {{ $labels.device }}'

  - alert: "Localtime"
    expr: abs(node_time_seconds{job="node_exporter"} - timestamp(node_time_seconds{job="node_exporter"})) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      title: "Time is wrong"
      description: "Local time on {{ $labels.instance }} differs from Prometheus time by more than 10 seconds (value {{ $value | humanize }} sec). Check time sync."
On Monday, December 14, 2020 at 19:56:24 UTC+3, snehasha...@gmail.com wrote:
> Could any one please share Prometheus queries to monitor
>
> 1. Virtual memory: Paging and swapping
> 2. Memory Available and Swap Used %
> 3. Disk Performance %
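To answer the three points directly, these are the kinds of ad-hoc queries the rules above are built from. A rough sketch; it assumes a recent node_exporter with the default vmstat collector enabled, so check which metrics your version actually exports:

# 1. Virtual memory: paging and swapping activity per second
rate(node_vmstat_pgpgin[5m])
rate(node_vmstat_pgpgout[5m])
rate(node_vmstat_pswpin[5m])
rate(node_vmstat_pswpout[5m])
rate(node_vmstat_pgmajfault[5m])

# 2. Memory available % and swap used %
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100
100 * (node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes - node_memory_SwapCached_bytes) / node_memory_SwapTotal_bytes

# 3. Disk performance: %busy per device (analogous to iostat %util)
rate(node_disk_io_time_seconds_total[5m]) * 100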