Milimetric has uploaded a new change for review.
https://gerrit.wikimedia.org/r/231574
Change subject: [WIP] Add an Analytics specific instance of RESTBase
......................................................................
[WIP] Add an Analytics specific instance of RESTBase
Very much a work in progress. I am not at all sure how hiera works or
whether I've copy/pasted properly here. I'll work on this, hopefully with
some help.
Bug: T107056
Change-Id: I29c872ed6a811cf1fd1ca9e5242bf513cba401ba
---
A hieradata/role/common/cassandra-analytics.yaml
A hieradata/role/common/restbase-analytics.yaml
A manifests/role/cassandra-analytics.pp
A manifests/role/restbase-analytics.pp
M manifests/site.pp
A modules/restbase/templates/config.analytics.yaml.erb
6 files changed, 357 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/74/231574/1
diff --git a/hieradata/role/common/cassandra-analytics.yaml
b/hieradata/role/common/cassandra-analytics.yaml
new file mode 100644
index 0000000..3780567
--- /dev/null
+++ b/hieradata/role/common/cassandra-analytics.yaml
@@ -0,0 +1,20 @@
+cassandra::metrics::graphite_host: 'graphite-in.eqiad.wmnet'
+cassandra::start_rpc: 'false'
+
+# TODO: set up a cluster variable similar to MySQL clusters to share
+# cassandra cluster configs between cassandra & clients
+
+cassandra::seeds:
+ - restbase-analytics1001.eqiad.wmnet
+ - restbase-analytics1002.eqiad.wmnet
+cassandra::max_heap_size: 16g
+# 1/4 heap size, no more than 100m/thread
+cassandra::heap_newsize: 2048m
+cassandra::compaction_throughput_mb_per_sec: 60
+cassandra::concurrent_compactors: 10
+cassandra::concurrent_writes: 18
+cassandra::concurrent_reads: 18
+
+cassandra::dc: "%{::site}"
+cassandra::cluster_name: "%{::site}"
+
diff --git a/hieradata/role/common/restbase-analytics.yaml
b/hieradata/role/common/restbase-analytics.yaml
new file mode 100644
index 0000000..1839610
--- /dev/null
+++ b/hieradata/role/common/restbase-analytics.yaml
@@ -0,0 +1,21 @@
+#
+# RESTBase
+#
+cluster: restbase-analytics
+restbase::seeds:
+ - restbase-analytics1001.eqiad.wmnet
+ - restbase-analytics1002.eqiad.wmnet
+restbase::config_template: restbase/config.analytics.yaml.erb
+restbase::logstash_host: logstash1001.eqiad.wmnet
+restbase::cassandra_defaultConsistency: localQuorum
+restbase::cassandra_localDc: "%{::site}"
+restbase::statsd_host: statsd.eqiad.wmnet
+restbase::parsoid_uri: undef
+restbase::graphoid_host_port: undef
+
+# TODO: I don't understand this
+#lvs::realserver::realserver_ips:
+ #- '10.2.2.17' # restbase.svc.eqiad.wmnet
+
+admin::groups:
+ - restbase-roots
diff --git a/manifests/role/cassandra-analytics.pp
b/manifests/role/cassandra-analytics.pp
new file mode 100644
index 0000000..7f583ba
--- /dev/null
+++ b/manifests/role/cassandra-analytics.pp
@@ -0,0 +1,54 @@
+# == Class role::cassandra-analytics
+#
+class role::cassandra-analytics {
+ # Parameters to be set by Hiera
+ class { '::cassandra': }
+ class { '::cassandra::metrics': }
+ class { '::cassandra::logging': }
+
+ # temporary collector, T78514
+ diamond::collector { 'CassandraCollector':
+ ensure => absent,
+ }
+
+ system::role { 'role::cassandra-analytics':
+ description => 'Analytics Cassandra server',
+ }
+
+ # Emit an Icinga alert unless there is exactly one Java process belonging
+ # to user 'cassandra' and with 'CassandraDaemon' in its argument list.
+ nrpe::monitor_service { 'cassandra-analytics':
+ description => 'Analytics Cassandra database',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -u
cassandra -C java -a CassandraDaemon',
+ }
+
+ # CQL query interface monitoring (T93886)
+ monitoring::service { 'cassandra-analytics-cql':
+ description => 'Analytics Cassandra CQL query interface',
+ check_command => 'check_tcp!9042',
+ contact_group => 'admins,team-analytics',
+ }
+
+ $cassandra_hosts = hiera('cassandra::seeds')
+ $cassandra_hosts_ferm = join($cassandra_hosts, ' ')
+
+ # Cassandra intra-node messaging
+ ferm::service { 'cassandra-analytics-intra-node':
+ proto => 'tcp',
+ port => '7000',
+ srange => "@resolve(($cassandra_hosts_ferm))",
+ }
+ # Cassandra JMX/RMI
+ ferm::service { 'cassandra-analytics-jmx-rmi':
+ proto => 'tcp',
+ port => '7199',
+ srange => "@resolve(($cassandra_hosts_ferm))",
+ }
+ # Cassandra CQL query interface
+ ferm::service { 'cassandra-analytics-cql':
+ proto => 'tcp',
+ port => '9042',
+ srange => "@resolve(($cassandra_hosts_ferm))",
+ }
+
+}
diff --git a/manifests/role/restbase-analytics.pp
b/manifests/role/restbase-analytics.pp
new file mode 100644
index 0000000..4a26f6b
--- /dev/null
+++ b/manifests/role/restbase-analytics.pp
@@ -0,0 +1,124 @@
+# == Class role::restbase-analytics
+#
+
+@monitoring::group { 'restbase_analytics_eqiad': description => 'Analytics
Restbase eqiad' }
+@monitoring::group { 'restbase_analytics_codfw': description => 'Analytics
Restbase codfw' }
+
+# Config should be pulled from hiera
+class role::restbase-analytics {
+ system::role { 'restbase-analytics': description => "Analytics Restbase
${::realm}" }
+
+ include ::restbase
+ include ::restbase::monitoring
+
+ include lvs::realserver
+
+
+ ferm::service {'restbase_web':
+ proto => 'tcp',
+ port => '7231',
+ }
+
+}
+
+class role::restbase::alerts {
+ monitoring::graphite_threshold { 'restbase_analytics_request_5xx_rate':
+ description => 'Analytics RESTBase req/s returning 5xx
http://grafana.wikimedia.org/#/dashboard/db/restbase',
+ metric =>
'transformNull(restbase.v1_page_html_-title-_-revision--_tid-.GET.5xx.sample_rate,
0)',
+ from => '10min',
+ warning => '1', # 1 5xx/s
+ critical => '3', # 3 5xx/s
+ percentage => '20',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_html_storage_hit_latency':
+ description => 'Analytics RESTBase HTML storage load mean latency ms
http://grafana.wikimedia.org/#/dashboard/db/restbase',
+ metric =>
'movingMedian(restbase.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.mean,
15)',
+ from => '10min',
+ warning => '25', # 25ms
+ critical => '50', # 50ms
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_html_storage_hit_latency_99p':
+ description => 'Analytics RESTBase HTML storage load 99p latency ms
http://grafana.wikimedia.org/#/dashboard/db/restbase',
+ metric =>
'movingMedian(restbase.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.p99,
15)',
+ from => '10min',
+ warning => '1500', # 1.5s
+ critical => '3000', # 3s
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_storage_exceptions':
+ description => 'Analytics RESTBase Cassandra highest storage
exceptions
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-storage',
+ metric =>
'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.Exceptions.count),
1)',
+ from => '10min',
+ warning => '5',
+ critical => '10',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_total_hints':
+ description => 'Analytics RESTBase Cassandra highest total hints
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-storage',
+ metric =>
'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.TotalHints.count),
1)',
+ from => '10min',
+ warning => '600',
+ critical => '1000',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_pending_compactions':
+ description => 'Analytics RESTBase Cassandra highest pending
compactions
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-compaction',
+ metric =>
'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.Compaction.PendingTasks.value,
1)',
+ from => '60min',
+ warning => '100',
+ critical => '400',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_sstables_per_read':
+ description => 'Analytics RESTBase Cassandra highest SSTables
per-read
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-cf-sstables-per-read',
+ metric =>
'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.SSTablesPerReadHistogram.99percentile,
1)',
+ from => '10min',
+ warning => '6',
+ critical => '10',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_tombstones_scanned':
+ description => 'Analytics RESTBase Cassandra highest tombstones
scanned
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-cf-tombstones-scanned',
+ metric =>
'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.TombstoneScannedHistogram.99percentile,
1)',
+ from => '10min',
+ warning => '1000',
+ critical => '1500',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_pending_internal':
+ description => 'Analytics RESTBase Cassandra highest pending
internal thread pool tasks
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-thread-pools',
+ metric =>
'highestMax(exclude(cassandra.restbase10*.org.apache.cassandra.metrics.ThreadPools.internal.*.PendingTasks.value,
"CompactionExecutor"), 1)',
+ from => '10min',
+ warning => '500',
+ critical => '1000',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+
+ monitoring::graphite_threshold {
'restbase_analytics_cassandra_highest_dropped_messages':
+ description => 'Analytics RESTBase Cassandra highest dropped message
rate
http://grafana.wikimedia.org/#/dashboard/db/restbase-cassandra-dropped-messages',
+ metric =>
'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.DroppedMessage.*.Dropped.1MinuteRate,
1)',
+ from => '10min',
+ warning => '50',
+ critical => '100',
+ percentage => '50',
+ contact_group => 'team-analytics',
+ }
+}
diff --git a/manifests/site.pp b/manifests/site.pp
index db124ec..644a4f4 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -2157,6 +2157,13 @@
include standard
}
+# analytics restbase eqiad cluster
+node /^restbase-analytics100[1-2]\.eqiad\.wmnet$/ {
+ role restbase-analytics, cassandra-analytics
+ include base::firewall
+ include standard
+}
+
# network insights (netflow/pmacct, etc.)
node 'rhenium.wikimedia.org' {
role pmacct
diff --git a/modules/restbase/templates/config.analytics.yaml.erb
b/modules/restbase/templates/config.analytics.yaml.erb
new file mode 100644
index 0000000..768fd9e
--- /dev/null
+++ b/modules/restbase/templates/config.analytics.yaml.erb
@@ -0,0 +1,131 @@
+# Analytics RESTBase config
+
+info:
+ name: restbase-analytics
+
+
+
+# Swagger spec templates, referenced using yaml references in the spec section
+# below.
+templates:
+
+ wmf-content-1.0.0: &wp/content/1.0.0
+ swagger: '2.0'
+ # swagger options, overriding the shared ones from the merged specs (?)
+ info:
+ version: 1.0.0-beta
+ title: Wikimedia REST API
+ description: >
+ This API aims to provide straightforward and low-latency access to
+ Wikimedia Analytics data. It is currently in beta testing, so
+ things aren't completely locked down yet. Each entry point has
+ explicit stability markers to inform you about development status
+ and change policy, according to [our API version
+ policy](https://www.mediawiki.org/wiki/API_versioning).
+
+ ### High-volume access
+ - As a general rule, don't perform more than 200 requests/s to
+ this API.
+ - Set a unique `User-Agent` header that allows us to contact you
+ quickly. Email addresses or URLs of contact pages work well.
+ - Consider using our [HTML
+ dumps](https://phabricator.wikimedia.org/T17017) once they
+ become available.
+
+ termsOfService: https://wikimediafoundation.org/wiki/Terms_of_Use
+ contact:
+ name: the Wikimedia Services team
+ url: http://mediawiki.org/wiki/RESTBase
+ license:
+ name: Apache2
+ url: http://www.apache.org/licenses/LICENSE-2.0
+
+ # Override the base path for host-based (proxied) requests. In our case,
+ # we proxy https://{domain}/api/rest_v1/ to the API.
+ x-host-basePath: /api/rest_v1
+
+ security:
+ # ACLs for public *.wikipedia.org wikis
+ - mediaWikiAuth:
+ - user:read
+ x-subspecs:
+ - analytics/pageviews
+
+ wmf-sys-1.0.0: &wp/sys/1.0.0
+ info:
+ title: Default MediaWiki sys API module
+ version: 1.0.0
+ paths:
+ /{module:table}:
+ x-modules:
+ # There can be multiple modules too per stanza, as long as the
+ # exported symbols don't conflict. The operationIds from the spec
+ # will be resolved against all of the modules.
+ - name: restbase-mod-table-cassandra
+ version: 1.0.0
+ type: npm
+ options: # Passed to the module constructor
+ conf:
+ hosts: [<%= Array(@seeds).join(',') %>]
+ keyspace: system
+ localDc: <%= @cassandra_localDc %>
+ username: <%= @cassandra_user %>
+ password: <%= @cassandra_password %>
+ defaultConsistency: <%= @cassandra_defaultConsistency %>
+ storage_groups:
+ - name: phase0.group.local
+ domains:
+ - /^(?:test.*\.wiki.*\.org|www.mediawiki.org)$/
+ - name: wikipedia.group.local
+ domains: /\.wikipedia.org$/
+ # Catch-all group
+ - name: default.group.local
+ domains: /./
+
+ wp-default-1.0.0: &wp/default/1.0.0
+ x-subspecs:
+ - paths:
+ /{api:v1}:
+ x-subspec: *wp/content/1.0.0
+ - paths:
+ /{api:sys}:
+ x-subspec: *wp/sys/1.0.0
+
+
+# Swagger spec root.
+spec: &spec
+ title: "The Analytics RESTBase root"
+ paths:
+ # list taken from Parsoid's beta config
+ /{domain:analytics.wikimedia.org}: *wp/default/1.0.0
+
+
+# The main service setup. Each worker can offer one or more services.
+services:
+ - name: restbase-analytics
+ module: ./restbase/lib/server
+ conf:
+ port: <%= @port %>
+ spec: *spec
+ salt: <%= @salt_key %>
+ default_page_size: <%= @page_size %>
+
+# Log error messages and gracefully restart a worker if v8 reports using more
+# heap than this limit (note: this is v8 heap, not RSS).
+worker_heap_limit_mb: 300
+
+logging:
+ name: restbase-analytics
+ level: <%= @logging_level %>
+ streams:
+ # XXX: Use gelf-stream -> logstash
+ - type: gelf
+ host: <%= @logstash_host %>
+ port: <%= @logstash_port %>
+
+# StatsD metrics collection
+metrics:
+ name: restbase-analytics
+ type: statsd # default, but let's be explicit
+ host: <%= @statsd_host %>
+ port: <%= @statsd_port %>
--
To view, visit https://gerrit.wikimedia.org/r/231574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I29c872ed6a811cf1fd1ca9e5242bf513cba401ba
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits