Filippo Giunchedi has uploaded a new change for review. https://gerrit.wikimedia.org/r/280652
Change subject: prometheus: add server support ...................................................................... prometheus: add server support a prometheus::server class is provided that will install and setup a prometheus server. additionally, an example of labs deployment is provided with role::prometheus::labs_project, currently running at https://prometheus-staging.wmflabs.org it will autodiscover and scrape/poll metrics from all labs instances running prometheus's node_exporter in the same project. (see also I6832dcf31) Bug: T126785 Change-Id: Ia2a8f204ea0e3ac865a6ab8bd7b4af6c7915bcef --- A modules/prometheus/files/etc/prometheus/alerts_default.conf A modules/prometheus/files/usr/local/bin/prometheus-labs-targets A modules/prometheus/manifests/server.pp A modules/prometheus/templates/etc/default/prometheus.erb A modules/prometheus/templates/etc/prometheus/prometheus.yml.erb A modules/prometheus/templates/initscripts/prometheus.systemd.erb A modules/role/manifests/prometheus/labs_project.pp 7 files changed, 247 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/52/280652/1 diff --git a/modules/prometheus/files/etc/prometheus/alerts_default.conf b/modules/prometheus/files/etc/prometheus/alerts_default.conf new file mode 100644 index 0000000..f8426d4 --- /dev/null +++ b/modules/prometheus/files/etc/prometheus/alerts_default.conf @@ -0,0 +1,23 @@ +# default alerting rules + +ALERT InstanceDown + IF up == 0 + FOR 3m + LABELS { + severity="warn" + } + ANNOTATIONS { + SUMMARY = "Instance {{$labels.instance}} down", + DESCRIPTION = "{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes.", + } + +ALERT PrometheusReloadFailed + IF prometheus_config_last_reload_successful == 0 + FOR 1h + LABELS { + severity="warn" + } + ANNOTATIONS { + SUMMARY = "Prometheus {{$labels.instance}} config reload fail", + DESCRIPTION = "Prometheus server at {{$labels.instance}} of job {{$labels.job}} has 
failed to reload its configuration", + } diff --git a/modules/prometheus/files/usr/local/bin/prometheus-labs-targets b/modules/prometheus/files/usr/local/bin/prometheus-labs-targets new file mode 100755 index 0000000..cd25fb0 --- /dev/null +++ b/modules/prometheus/files/usr/local/bin/prometheus-labs-targets @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +# Generate prometheus targets for a given project from wikitech's nova +# instances list. + +import argparse +import codecs +import json +import logging +import sys +import urllib.parse +import urllib.request +import yaml + + +def project_instances(project): + req = urllib.request.urlopen( + 'https://wikitech.wikimedia.org/w/api.php?' + + urllib.parse.urlencode({ + 'action': 'query', + 'list': 'novainstances', + 'niregion': 'eqiad', + 'format': 'json', + 'niproject': project, + }) + ) + + reader = codecs.getreader('utf-8') + data = json.load(reader(req)) + for instance in data['query']['novainstances']: + yield instance + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--project', dest='project') + parser.add_argument('--port', dest='port', default='9100') + args = parser.parse_args() + + if args.project is None: + try: + with open('/etc/wmflabs-project') as f: + args.project = f.read().strip() + except IOError as e: + parser.error('unable to detect project from /etc/wmflabs-project: %r' % e) + return 1 + + scrape_configs = [] + targets = {'targets': []} + for instance in project_instances(args.project): + targets['targets'].append("%s:%s" % (instance['name'], args.port)) + targets['targets'] = sorted(targets['targets']) + scrape_configs.append(targets) + print(yaml.dump(scrape_configs, default_flow_style=False)) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/modules/prometheus/manifests/server.pp b/modules/prometheus/manifests/server.pp new file mode 100644 index 0000000..7d8bf4a --- /dev/null +++ b/modules/prometheus/manifests/server.pp @@ -0,0 +1,117 @@ +# == Class: 
prometheus::server +# +# The prometheus server takes care of 'scraping' (polling) a list of 'targets' +# via HTTP using one of +# https://prometheus.io/docs/instrumenting/exposition_formats/ and making the +# scraped metrics available for querying. Metrics will be stored locally in +# $storage_path for $storage_retention time. +# +# The shipped configuration below includes prometheus server scraping itself +# for metrics on localhost:9090. + +class prometheus::server ( + $scrape_interval = '60s', + $storage_path = '/srv/prometheus', + $storage_retention = '4320h0m0s', + $global_config_extra = {}, + $scrape_configs_extra = [], + $rule_files_extra = [], +) { + if ! os_version('debian >= jessie') { + fail('only Debian jessie supported') + } + + require_package('prometheus') + + $global_config_default = { + 'scrape_interval' => $scrape_interval, + } + $global_config = merge($global_config_default, $global_config_extra) + + $scrape_configs_default = [ + { + 'job_name' => 'prometheus', + 'target_groups' => [ + { 'targets' => [ 'localhost:9090' ] }, + ] + }, + { + 'job_name' => 'node', + 'file_sd_configs' => [ + { 'names' => [ '/etc/prometheus/targets/node_*.yml' ] }, + ] + }, + ] + $scrape_configs = concat($scrape_configs_default, $scrape_configs_extra) + + $rule_files_default = [ + '/etc/prometheus/rules/rules_*.conf', + '/etc/prometheus/rules/alerts_*.conf', + ] + $rule_files = concat($rule_files_default, $rule_files_extra) + + file { '/etc/prometheus/rules/alerts_default.conf': + ensure => file, + mode => 0444, + owner => 'root', + source => 'puppet:///modules/prometheus/etc/prometheus/alerts_default.conf', + notify => Exec['prometheus-reload'], + require => File['/etc/prometheus/rules'], + } + + file { '/etc/prometheus/prometheus.yml': + ensure => present, + mode => '0444', + owner => 'root', + group => 'root', + notify => Exec['prometheus-reload'], + content => template('prometheus/etc/prometheus/prometheus.yml.erb'), + } + + file { '/etc/default/prometheus': + 
ensure => present, + mode => '0444', + owner => 'root', + group => 'root', + notify => Service['prometheus'], + content => template('prometheus/etc/default/prometheus.erb'), + } + + file { $storage_path: + ensure => directory, + mode => 0750, + owner => 'prometheus', + group => 'prometheus', + } + + file { '/etc/prometheus/rules': + ensure => directory, + mode => 0755, + owner => 'root', + group => 'root', + } + + # output all nova instances for the current labs project as prometheus + # 'targets' + file { '/usr/local/bin/prometheus-labs-targets': + ensure => file, + mode => 0555, + owner => 'root', + group => 'root', + source => 'puppet:///modules/prometheus/usr/local/bin/prometheus-labs-targets', + } + + exec { 'prometheus-reload': + command => '/bin/systemctl reload prometheus', + refreshonly => true, + } + + base::service_unit { 'prometheus': + ensure => present, + systemd => true, + service_params => { + enable => true, + hasrestart => true, + }, + } +} diff --git a/modules/prometheus/templates/etc/default/prometheus.erb b/modules/prometheus/templates/etc/default/prometheus.erb new file mode 100644 index 0000000..f7f34c0 --- /dev/null +++ b/modules/prometheus/templates/etc/default/prometheus.erb @@ -0,0 +1,13 @@ +ARGS="-storage.local.path <%= @storage_path %> -storage.local.retention <%= @storage_retention %>" + +# if using sysv init, temporary fix for https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=817403 +do_reload() +{ + log_daemon_msg "Reloading $DESC configuration files" "$NAME" + $HELPER $HELPER_ARGS --running || return 1 + helper_pid=$(cat $PIDFILE) + [ -z "$helper_pid" ] && return 1 + start-stop-daemon --stop --signal 1 --quiet \ + --ppid "$helper_pid" --exec "$DAEMON" + log_end_msg $? 
+} diff --git a/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb b/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb new file mode 100644 index 0000000..92c2aed --- /dev/null +++ b/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb @@ -0,0 +1,5 @@ +<% require 'yaml' %> +<%= { 'global' => @global_config, + 'rule_files' => @rule_files, + 'scrape_configs' => @scrape_configs + }.to_yaml %> diff --git a/modules/prometheus/templates/initscripts/prometheus.systemd.erb b/modules/prometheus/templates/initscripts/prometheus.systemd.erb new file mode 100644 index 0000000..ad394d4 --- /dev/null +++ b/modules/prometheus/templates/initscripts/prometheus.systemd.erb @@ -0,0 +1,11 @@ +[Unit] +Description=prometheus server + +[Service] +User=prometheus +Group=prometheus +EnvironmentFile=-/etc/default/prometheus +ExecStart=/usr/bin/prometheus $ARGS +Restart=on-failure +RestartSec=10s +ExecReload=/bin/kill -HUP $MAINPID diff --git a/modules/role/manifests/prometheus/labs_project.pp b/modules/role/manifests/prometheus/labs_project.pp new file mode 100644 index 0000000..febe583 --- /dev/null +++ b/modules/role/manifests/prometheus/labs_project.pp @@ -0,0 +1,21 @@ +# == Class: role::prometheus::labs_project +# +# This class provides a prometheus server to do node (host) monitoring for all +# instances of the labs project it is running in. +# Instance autodiscovery is accomplished by querying wikitech HTTP API for +# instances list and writing a list of <instance>:9100 'targets' for prometheus +# to pick up. See also prometheus::node_exporter.
+ +class role::prometheus::labs_project { + include prometheus::server + + $targets_file = '/etc/prometheus/targets/node_project.yml' + + cron { 'prometheus_labs_project_targets': + ensure => present, + command => "/usr/local/bin/prometheus-labs-targets > ${targets_file}.$$ && mv ${targets_file}.$$ ${targets_file}", + minute => '*/10', + hour => '*', + require => Class['prometheus::server'], + } +} -- To view, visit https://gerrit.wikimedia.org/r/280652 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia2a8f204ea0e3ac865a6ab8bd7b4af6c7915bcef Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Filippo Giunchedi <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
