Filippo Giunchedi has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/280652

Change subject: prometheus: add server support
......................................................................

prometheus: add server support

a prometheus::server class is provided that will install and set up a prometheus
server.

additionally, an example of labs deployment is provided with
role::prometheus::labs_project, currently running at
https://prometheus-staging.wmflabs.org. It will autodiscover and scrape/poll
metrics from all labs instances running prometheus's node_exporter in the same
project. (see also I6832dcf31)

Bug: T126785
Change-Id: Ia2a8f204ea0e3ac865a6ab8bd7b4af6c7915bcef
---
A modules/prometheus/files/etc/prometheus/alerts_default.conf
A modules/prometheus/files/usr/local/bin/prometheus-labs-targets
A modules/prometheus/manifests/server.pp
A modules/prometheus/templates/etc/default/prometheus.erb
A modules/prometheus/templates/etc/prometheus/prometheus.yml.erb
A modules/prometheus/templates/initscripts/prometheus.systemd.erb
A modules/role/manifests/prometheus/labs_project.pp
7 files changed, 247 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/52/280652/1

diff --git a/modules/prometheus/files/etc/prometheus/alerts_default.conf b/modules/prometheus/files/etc/prometheus/alerts_default.conf
new file mode 100644
index 0000000..f8426d4
--- /dev/null
+++ b/modules/prometheus/files/etc/prometheus/alerts_default.conf
@@ -0,0 +1,23 @@
+# default alerting rules
+
+ALERT InstanceDown
+  IF up == 0
+  FOR 3m
+  LABELS {
+    severity="warn"
+  }
+  ANNOTATIONS {
+    SUMMARY = "Instance {{$labels.instance}} down",
+    DESCRIPTION = "{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes.",
+  }
+
+ALERT PrometheusReloadFailed
+  IF prometheus_config_last_reload_successful == 0
+  FOR 1h
+  LABELS {
+    severity="warn"
+  }
+  ANNOTATIONS {
+    SUMMARY = "Prometheus {{$labels.instance}} config reload fail",
+    DESCRIPTION = "Prometheus server at {{$labels.instance}} of job {{$labels.job}} has failed to reload its configuration",
+  }
diff --git a/modules/prometheus/files/usr/local/bin/prometheus-labs-targets b/modules/prometheus/files/usr/local/bin/prometheus-labs-targets
new file mode 100755
index 0000000..cd25fb0
--- /dev/null
+++ b/modules/prometheus/files/usr/local/bin/prometheus-labs-targets
@@ -0,0 +1,57 @@
+#!/usr/bin/python3
+# Generate prometheus targets for a given project from wikitech's nova
+# instances list.
+
+import argparse
+import codecs
+import json
+import logging
+import sys
+import urllib.parse
+import urllib.request
+import yaml
+
+
+def project_instances(project):
+    """Yield the nova instances of a labs project.
+
+    Queries wikitech's 'novainstances' API (eqiad region) for `project` and
+    yields each entry of the JSON response's query.novainstances list.
+    """
+    query = urllib.parse.urlencode({
+        'action': 'query', 'list': 'novainstances', 'niregion': 'eqiad',
+        'format': 'json', 'niproject': project,
+    })
+    url = 'https://wikitech.wikimedia.org/w/api.php?' + query
+    # close the HTTP response as soon as the JSON body has been parsed
+    with urllib.request.urlopen(url) as req:
+        data = json.load(codecs.getreader('utf-8')(req))
+    for instance in data['query']['novainstances']:
+        yield instance
+
+
+def main():
+    """Print prometheus targets ('<name>:<port>') for a labs project as YAML.
+
+    The project comes from --project or /etc/wmflabs-project (else exit 2).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--project', dest='project')
+    parser.add_argument('--port', dest='port', default='9100')
+    args = parser.parse_args()
+
+    if args.project is None:
+        try:
+            with open('/etc/wmflabs-project') as f:
+                args.project = f.read().strip()
+        except IOError as e:
+            parser.error('unable to detect project from '
+                         '/etc/wmflabs-project: %r' % e)
+
+    targets = sorted("%s:%s" % (instance['name'], args.port)
+                     for instance in project_instances(args.project))
+    print(yaml.dump([{'targets': targets}], default_flow_style=False))
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/modules/prometheus/manifests/server.pp b/modules/prometheus/manifests/server.pp
new file mode 100644
index 0000000..7d8bf4a
--- /dev/null
+++ b/modules/prometheus/manifests/server.pp
@@ -0,0 +1,117 @@
+# == Class: prometheus::server
+#
+# The prometheus server takes care of 'scraping' (polling) a list of 'targets'
+# via HTTP using one of
+# https://prometheus.io/docs/instrumenting/exposition_formats/ and making the
+# scraped metrics available for querying. Metrics will be stored locally in
+# $storage_path for $storage_retention time.
+#
+# The shipped configuration below includes prometheus server scraping itself
+# for metrics on localhost:9090.
+
+class prometheus::server (
+    $scrape_interval = '60s',
+    $storage_path = '/srv/prometheus',
+    $storage_retention = '4320h0m0s',
+    $global_config_extra = {},
+    $scrape_configs_extra = [],
+    $rule_files_extra = [],
+) {
+    if ! os_version('debian >= jessie') {
+        fail('only Debian jessie supported')
+    }
+
+    require_package('prometheus')
+
+    $global_config_default = {
+      'scrape_interval' => $scrape_interval,
+    }
+    $global_config = merge($global_config_default, $global_config_extra)
+
+    $scrape_configs_default = [
+      {
+        'job_name'      => 'prometheus',
+        'target_groups' => [
+           { 'targets'  => [ 'localhost:9090' ] },
+        ]
+      },
+      {
+        'job_name'      => 'node',
+        'file_sd_configs' => [
+           { 'names'  => [ '/etc/prometheus/targets/node_*.yml' ] },
+        ]
+      },
+    ]
+    $scrape_configs = concat($scrape_configs_default, $scrape_configs_extra)
+
+    $rule_files_default = [
+      '/etc/prometheus/rules/rules_*.conf',
+      '/etc/prometheus/rules/alerts_*.conf',
+    ]
+    $rule_files = concat($rule_files_default, $rule_files_extra)
+
+    file { '/etc/prometheus/rules/alerts_default.conf':
+        ensure  => file,
+        mode    => '0444',
+        owner   => 'root',
+        source  => 'puppet:///modules/prometheus/etc/prometheus/alerts_default.conf',
+        notify  => Exec['prometheus-reload'],
+        require => File['/etc/prometheus/rules'],
+    }
+
+    file { '/etc/prometheus/prometheus.yml':
+        ensure  => present,
+        mode    => '0444',
+        owner   => 'root',
+        group   => 'root',
+        notify  => Exec['prometheus-reload'],
+        content => template('prometheus/etc/prometheus/prometheus.yml.erb'),
+    }
+
+    file { '/etc/default/prometheus':
+        ensure  => present,
+        mode    => '0444',
+        owner   => 'root',
+        group   => 'root',
+        notify  => Service['prometheus'],
+        content => template('prometheus/etc/default/prometheus.erb'),
+    }
+
+    file { $storage_path:
+        ensure => directory,
+        mode   => '0750',
+        owner  => 'prometheus',
+        group  => 'prometheus',
+    }
+
+    file { '/etc/prometheus/rules':
+        ensure => directory,
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+    }
+
+    # output all nova instances for the current labs project as prometheus
+    # 'targets'
+    file { '/usr/local/bin/prometheus-labs-targets':
+        ensure => file,
+        mode   => '0555',
+        owner  => 'root',
+        group  => 'root',
+        source => 'puppet:///modules/prometheus/usr/local/bin/prometheus-labs-targets',
+    }
+
+    exec { 'prometheus-reload':
+        command     => '/bin/systemctl reload prometheus',
+        refreshonly => true,
+    }
+
+    base::service_unit { 'prometheus':
+        ensure         => present,
+        systemd        => true,
+        service_params => {
+            enable     => true,
+            hasrestart => true,
+        },
+    }
+}
diff --git a/modules/prometheus/templates/etc/default/prometheus.erb b/modules/prometheus/templates/etc/default/prometheus.erb
new file mode 100644
index 0000000..f7f34c0
--- /dev/null
+++ b/modules/prometheus/templates/etc/default/prometheus.erb
@@ -0,0 +1,13 @@
+ARGS="-storage.local.path <%= @storage_path %> -storage.local.retention <%= @storage_retention %>"
+
+# if using sysv init, temporary fix for https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=817403
+do_reload()
+{
+    log_daemon_msg "Reloading $DESC configuration files" "$NAME"
+    $HELPER $HELPER_ARGS --running || return 1
+    helper_pid=$(cat $PIDFILE)
+    [ -z "$helper_pid" ] && return 1
+    start-stop-daemon --stop --signal 1 --quiet \
+        --ppid "$helper_pid" --exec "$DAEMON"
+    log_end_msg $?
+}
diff --git a/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb b/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb
new file mode 100644
index 0000000..92c2aed
--- /dev/null
+++ b/modules/prometheus/templates/etc/prometheus/prometheus.yml.erb
@@ -0,0 +1,5 @@
+<% require 'yaml' %>
+<%= { 'global'         => @global_config,
+      'rule_files'     => @rule_files,
+      'scrape_configs' => @scrape_configs
+    }.to_yaml %>
diff --git a/modules/prometheus/templates/initscripts/prometheus.systemd.erb b/modules/prometheus/templates/initscripts/prometheus.systemd.erb
new file mode 100644
index 0000000..ad394d4
--- /dev/null
+++ b/modules/prometheus/templates/initscripts/prometheus.systemd.erb
@@ -0,0 +1,11 @@
+[Unit]
+Description=prometheus server
+
+[Service]
+User=prometheus
+Group=prometheus
+EnvironmentFile=-/etc/default/prometheus
+ExecStart=/usr/bin/prometheus $ARGS
+Restart=on-failure
+RestartSec=10s
+ExecReload=/bin/kill -HUP $MAINPID
diff --git a/modules/role/manifests/prometheus/labs_project.pp b/modules/role/manifests/prometheus/labs_project.pp
new file mode 100644
index 0000000..febe583
--- /dev/null
+++ b/modules/role/manifests/prometheus/labs_project.pp
@@ -0,0 +1,21 @@
+# == Class: role::prometheus::labs_project
+#
+# This class provides a prometheus server to do node (host) monitoring for all
+# instances of the labs projects it is running in.
+# Instance autodiscovery is accomplished by querying wikitech HTTP API for
+# instances list and writing a list of <instance>:9100 'targets' for prometheus
+# to pick up. See also prometheus::node_exporter.
+
+class role::prometheus::labs_project {
+  include prometheus::server
+
+  $targets_file = '/etc/prometheus/targets/node_project.yml'
+
+  cron { 'prometheus_labs_project_targets':
+    ensure  => present,
+    command => "/usr/local/bin/prometheus-labs-targets > ${targets_file}.$$ && mv ${targets_file}.$$ ${targets_file}",
+    minute  => '*/10',
+    hour    => '*',
+    require => Class['prometheus::server'],
+  }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/280652
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia2a8f204ea0e3ac865a6ab8bd7b4af6c7915bcef
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to