BBlack has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/339669 )
Change subject: varnish: per-app routing [WIP, 3/4] ...................................................................... varnish: per-app routing [WIP, 3/4] Change-Id: Ic32b015d68df4a14e43cfbd577e6050d695ecddb --- M hieradata/labs.yaml M hieradata/role/common/cache/maps.yaml M hieradata/role/common/cache/misc.yaml M hieradata/role/common/cache/text.yaml M hieradata/role/common/cache/upload.yaml M modules/role/manifests/cache/maps.pp M modules/role/manifests/cache/misc.pp M modules/role/manifests/cache/text.pp M modules/role/manifests/cache/upload.pp M modules/varnish/templates/vcl/wikimedia-backend.vcl.erb M modules/varnish/templates/vcl/wikimedia-common.inc.vcl.erb 11 files changed, 204 insertions(+), 143 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/69/339669/1 diff --git a/hieradata/labs.yaml b/hieradata/labs.yaml index 7063f40..a500017 100644 --- a/hieradata/labs.yaml +++ b/hieradata/labs.yaml @@ -43,7 +43,7 @@ # Cache-layer stuff cache::route_table: - eqiad: 'direct' + eqiad: 'eqiad' cache::text::nodes: eqiad: - 'deployment-cache-text04.deployment-prep.eqiad.wmflabs' diff --git a/hieradata/role/common/cache/maps.yaml b/hieradata/role/common/cache/maps.yaml index eb51d7d..835d136 100644 --- a/hieradata/role/common/cache/maps.yaml +++ b/hieradata/role/common/cache/maps.yaml @@ -2,26 +2,15 @@ cache::cluster: maps cache::tune_for_media: true # The contents of this hash control our DC->DC routing for varnish backend -# daemons. There should be a key for every cache datacenter, and the -# values can be another datacenter or 'direct', which means contact the -# applayer directly. -# -# Currently, the possible datacenter-name values for non-direct are limited -# to 'eqiad', and 'codfw' because they're the only ones defined in -# 'backend_caches' in role::cache::instances. -# -# Note that it is possible to create permanent or temporary-race-condition -# loops by making poor changes to this hash! 
Keeping in mind that the -# state here will be applied to cache daemons asynchronously, the general -# rules of safety would be: -# 1. Obviously, a commit should not create actual loops which do not -# eventually resolve to 'direct' -# 2. A single commit->deploy should only change one single value at a time -# unless you're absolutely certain of what you're doing, as changing -# multiple values could cause a race-condition of intermediate states. +# daemons. There should be a key for every cache datacenter. The values must +# be a core datacenter (eqiad or codfw), or at least must lead indirectly to +# a core datacenter when traversing the table recursively. A loop between +# the two core datacenters is expected and normal here. The only reason to +# edit this is to remove a datacenter from active service (due to fault or +# maintenance) by routing around it from the edge sites. # cache::route_table: - eqiad: 'direct' + eqiad: 'codfw' codfw: 'eqiad' ulsfo: 'codfw' esams: 'eqiad' diff --git a/hieradata/role/common/cache/misc.yaml b/hieradata/role/common/cache/misc.yaml index 842e8aa..c0b1568 100644 --- a/hieradata/role/common/cache/misc.yaml +++ b/hieradata/role/common/cache/misc.yaml @@ -1,28 +1,17 @@ cluster: cache_misc cache::cluster: misc +# note this only affects tlsproxy now, should be moved to param there... +cache::websocket_support: true # The contents of this hash control our DC->DC routing for varnish backend -# daemons. There should be a key for every cache datacenter, and the -# values can be another datacenter or 'direct', which means contact the -# applayer directly. -# -# Currently, the possible datacenter-name values for non-direct are limited -# to 'eqiad', and 'codfw' because they're the only ones defined in -# 'backend_caches' in role::cache::instances. -# -# Note that it is possible to create permanent or temporary-race-condition -# loops by making poor changes to this hash! 
Keeping in mind that the -# state here will be applied to cache daemons asynchronously, the general -# rules of safety would be: -# 1. Obviously, a commit should not create actual loops which do not -# eventually resolve to 'direct' -# 2. A single commit->deploy should only change one single value at a time -# unless you're absolutely certain of what you're doing, as changing -# multiple values could cause a race-condition of intermediate states. +# daemons. There should be a key for every cache datacenter. The values must +# be a core datacenter (eqiad or codfw), or at least must lead indirectly to +# a core datacenter when traversing the table recursively. A loop between +# the two core datacenters is expected and normal here. The only reason to +# edit this is to remove a datacenter from active service (due to fault or +# maintenance) by routing around it from the edge sites. # cache::route_table: - eqiad: 'direct' + eqiad: 'codfw' codfw: 'eqiad' ulsfo: 'codfw' esams: 'eqiad' -# note this only affects tlsproxy now, should be moved to param there... -cache::websocket_support: true diff --git a/hieradata/role/common/cache/text.yaml b/hieradata/role/common/cache/text.yaml index ae2fe39..82a93b2 100644 --- a/hieradata/role/common/cache/text.yaml +++ b/hieradata/role/common/cache/text.yaml @@ -3,26 +3,15 @@ admin::groups: - perf-roots # The contents of this hash control our DC->DC routing for varnish backend -# daemons. There should be a key for every cache datacenter, and the -# values can be another datacenter or 'direct', which means contact the -# applayer directly. -# -# Currently, the possible datacenter-name values for non-direct are limited -# to 'eqiad', and 'codfw' because they're the only ones defined in -# 'backend_caches' in role::cache::instances. -# -# Note that it is possible to create permanent or temporary-race-condition -# loops by making poor changes to this hash! 
Keeping in mind that the -# state here will be applied to cache daemons asynchronously, the general -# rules of safety would be: -# 1. Obviously, a commit should not create actual loops which do not -# eventually resolve to 'direct' -# 2. A single commit->deploy should only change one single value at a time -# unless you're absolutely certain of what you're doing, as changing -# multiple values could cause a race-condition of intermediate states. +# daemons. There should be a key for every cache datacenter. The values must +# be a core datacenter (eqiad or codfw), or at least must lead indirectly to +# a core datacenter when traversing the table recursively. A loop between +# the two core datacenters is expected and normal here. The only reason to +# edit this is to remove a datacenter from active service (due to fault or +# maintenance) by routing around it from the edge sites. # cache::route_table: - eqiad: 'direct' + eqiad: 'codfw' codfw: 'eqiad' ulsfo: 'codfw' esams: 'eqiad' diff --git a/hieradata/role/common/cache/upload.yaml b/hieradata/role/common/cache/upload.yaml index 6c83be9..d01ba4a 100644 --- a/hieradata/role/common/cache/upload.yaml +++ b/hieradata/role/common/cache/upload.yaml @@ -4,26 +4,15 @@ - perf-roots cache::tune_for_media: true # The contents of this hash control our DC->DC routing for varnish backend -# daemons. There should be a key for every cache datacenter, and the -# values can be another datacenter or 'direct', which means contact the -# applayer directly. -# -# Currently, the possible datacenter-name values for non-direct are limited -# to 'eqiad', and 'codfw' because they're the only ones defined in -# 'backend_caches' in role::cache::instances. -# -# Note that it is possible to create permanent or temporary-race-condition -# loops by making poor changes to this hash! Keeping in mind that the -# state here will be applied to cache daemons asynchronously, the general -# rules of safety would be: -# 1. 
Obviously, a commit should not create actual loops which do not -# eventually resolve to 'direct' -# 2. A single commit->deploy should only change one single value at a time -# unless you're absolutely certain of what you're doing, as changing -# multiple values could cause a race-condition of intermediate states. +# daemons. There should be a key for every cache datacenter. The values must +# be a core datacenter (eqiad or codfw), or at least must lead indirectly to +# a core datacenter when traversing the table recursively. A loop between +# the two core datacenters is expected and normal here. The only reason to +# edit this is to remove a datacenter from active service (due to fault or +# maintenance) by routing around it from the edge sites. # cache::route_table: - eqiad: 'direct' + eqiad: 'codfw' codfw: 'eqiad' ulsfo: 'codfw' esams: 'eqiad' diff --git a/modules/role/manifests/cache/maps.pp b/modules/role/manifests/cache/maps.pp index f49b4bb..e1b5380 100644 --- a/modules/role/manifests/cache/maps.pp +++ b/modules/role/manifests/cache/maps.pp @@ -38,7 +38,9 @@ $app_directors = { 'kartotherian' => { - 'backend' => 'kartotherian.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'kartotherian.svc.eqiad.wmnet', + }, }, } diff --git a/modules/role/manifests/cache/misc.pp b/modules/role/manifests/cache/misc.pp index ade32d2..1773690 100644 --- a/modules/role/manifests/cache/misc.pp +++ b/modules/role/manifests/cache/misc.pp @@ -40,104 +40,164 @@ # $app_directors = { 'analytics1027' => { # Hue (Hadoop GUI) - 'backend' => 'analytics1027.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'analytics1027.eqiad.wmnet', + }, 'be_opts' => { 'port' => 8888 }, }, 'bromine' => { # ganeti VM for misc. 
static HTML sites - 'backend' => 'bromine.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'bromine.eqiad.wmnet', + }, }, 'bohrium' => { - 'backend' => 'bohrium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'bohrium.eqiad.wmnet', + }, 'probe' => { 'url' => '/piwik.php', 'timeout' => '3s', }, }, 'californium' => { - 'backend' => 'californium.wikimedia.org', + 'backends' => { + 'eqiad' => 'californium.wikimedia.org', + }, }, 'darmstadtium' => { - 'backend' => 'darmstadtium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'darmstadtium.eqiad.wmnet', + }, 'be_opts' => {'port' => 81, 'max_connections' => 5}, }, 'labtestweb2001' => { - 'backend' => 'labtestweb2001.wikimedia.org', + 'backends' => { + 'eqiad' => 'labtestweb2001.wikimedia.org', + }, }, 'labtestspice' => { - 'backend' => 'labtestcontrol2001.wikimedia.org', + 'backends' => { + 'eqiad' => 'labtestcontrol2001.wikimedia.org', + }, 'be_opts' => { 'port' => 6082 }, }, 'labspice' => { - 'backend' => 'labcontrol1001.wikimedia.org', + 'backends' => { + 'eqiad' => 'labcontrol1001.wikimedia.org', + }, 'be_opts' => { 'port' => 6082 }, }, 'etherpad1001' => { - 'backend' => 'etherpad1001.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'etherpad1001.eqiad.wmnet', + }, 'be_opts' => { 'port' => 9001 }, }, 'eventstreams' => { - 'backend' => 'eventstreams.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'eventstreams.svc.eqiad.wmnet', + }, 'be_opts' => { 'port' => 8092 }, }, 'contint1001' => { # CI server - 'backend' => 'contint1001.wikimedia.org', + 'backends' => { + 'eqiad' => 'contint1001.wikimedia.org', + }, }, 'graphite1001' => { - 'backend' => 'graphite1001.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'graphite1001.eqiad.wmnet', + }, }, 'graphite2001' => { - 'backend' => 'graphite2001.codfw.wmnet', + 'backends' => { + 'eqiad' => 'graphite2001.codfw.wmnet', + }, }, 'iridium' => { # main phab - 'backend' => 'iridium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'iridium.eqiad.wmnet', + }, }, 'krypton' => { # ganeti VM for misc. 
PHP apps - 'backend' => 'krypton.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'krypton.eqiad.wmnet', + }, }, 'labmon1001' => { - 'backend' => 'labmon1001.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'labmon1001.eqiad.wmnet', + }, }, 'netmon1001' => { # servermon - 'backend' => 'netmon1001.wikimedia.org', + 'backends' => { + 'eqiad' => 'netmon1001.wikimedia.org', + }, }, 'noc' => { # noc.wikimedia.org and dbtree.wikimedia.org - 'backend' => 'terbium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'terbium.eqiad.wmnet', + }, }, 'planet1001' => { - 'backend' => 'planet1001.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'planet1001.eqiad.wmnet', + }, }, 'pybal_config' => { - 'backend' => 'puppetmaster1001.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'puppetmaster1001.eqiad.wmnet', + }, }, 'rcstream' => { - 'backend' => 'rcs1001.eqiad.wmnet', - # 'backend' => 'rcs1002.eqiad.wmnet', # manual backup option if 1001 fails + 'backends' => { + 'eqiad' => 'rcs1001.eqiad.wmnet', + }, + # 'backends' => { + 'eqiad' => 'rcs1002.eqiad.wmnet', + }, # manual backup option if 1001 fails 'be_opts' => { max_connections => 1000 }, }, 'ruthenium' => { # parsoid rt test server - 'backend' => 'ruthenium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'ruthenium.eqiad.wmnet', + }, 'be_opts' => { 'port' => 8001 }, }, 'rutherfordium' => { # people.wikimedia.org - 'backend' => 'rutherfordium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'rutherfordium.eqiad.wmnet', + }, }, 'thorium' => { # metrics and metrics-api - 'backend' => 'thorium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'thorium.eqiad.wmnet', + }, }, 'ununpentium' => { # rt.wikimedia.org - 'backend' => 'ununpentium.wikimedia.org', + 'backends' => { + 'eqiad' => 'ununpentium.wikimedia.org', + }, }, 'mendelevium' => { # OTRS - 'backend' => 'mendelevium.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'mendelevium.eqiad.wmnet', + }, }, 'logstash_director' => { - 'backend' => 'kibana.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 
'kibana.svc.eqiad.wmnet', + }, }, 'wdqs_director' => { - 'backend' => 'wdqs.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'wdqs.svc.eqiad.wmnet', + }, }, 'ores' => { - 'backend' => 'ores.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'ores.svc.eqiad.wmnet', + }, 'be_opts' => { 'port' => 8081 }, }, } diff --git a/modules/role/manifests/cache/text.pp b/modules/role/manifests/cache/text.pp index 8b71c85..ef10d58 100644 --- a/modules/role/manifests/cache/text.pp +++ b/modules/role/manifests/cache/text.pp @@ -46,32 +46,48 @@ $app_directors = { 'appservers' => { - 'backend' => 'appservers.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'appservers.svc.eqiad.wmnet', + }, }, 'api' => { - 'backend' => 'api.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'api.svc.eqiad.wmnet', + }, }, 'rendering' => { - 'backend' => 'rendering.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'rendering.svc.eqiad.wmnet', + }, }, 'security_audit' => { - 'backend' => 'appservers.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'appservers.svc.eqiad.wmnet', + }, }, 'appservers_debug' => { - # 'backend' => 'hassium.eqiad.wmnet', - 'backend' => 'hassaleh.codfw.wmnet', + 'backends' => { + 'eqiad' => 'hassium.eqiad.wmnet', + # 'codfw' => 'hassaleh.codfw.wmnet', + }, 'be_opts' => { 'max_connections' => 20 }, }, 'restbase_backend' => { - 'backend' => 'restbase.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'restbase.svc.eqiad.wmnet', + }, 'be_opts' => { 'port' => 7231, 'max_connections' => 5000 }, }, 'cxserver_backend' => { # LEGACY: should be removed eventually - 'backend' => 'cxserver.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'cxserver.svc.eqiad.wmnet', + }, 'be_opts' => { 'port' => 8080 }, }, 'citoid_backend' => { # LEGACY: should be removed eventually - 'backend' => 'citoid.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'citoid.svc.eqiad.wmnet', + }, 'be_opts' => { 'port' => 1970 }, }, } diff --git a/modules/role/manifests/cache/upload.pp b/modules/role/manifests/cache/upload.pp 
index 95ed0c9..734c01b 100644 --- a/modules/role/manifests/cache/upload.pp +++ b/modules/role/manifests/cache/upload.pp @@ -42,10 +42,14 @@ $app_directors = { 'swift' => { - 'backend' => 'ms-fe.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'ms-fe.svc.eqiad.wmnet', + }, }, 'swift_thumbs' => { - 'backend' => 'ms-fe-thumbs.svc.eqiad.wmnet', + 'backends' => { + 'eqiad' => 'ms-fe-thumbs.svc.eqiad.wmnet', + }, }, } diff --git a/modules/varnish/templates/vcl/wikimedia-backend.vcl.erb b/modules/varnish/templates/vcl/wikimedia-backend.vcl.erb index 71b16cd..a65ecee 100644 --- a/modules/varnish/templates/vcl/wikimedia-backend.vcl.erb +++ b/modules/varnish/templates/vcl/wikimedia-backend.vcl.erb @@ -10,13 +10,36 @@ call wm_common_directors_init; } -sub set_backend_app__ { +sub set_backend__ { <% def set_director(dirname, debug_dir) - if debug_dir.nil? - return "set req.backend_hint = #{dirname}.backend();" + dirs = @app_directors[dirname]['backends'] + if dirs.key?(@site) + next_dir = dirname else - return "if (req.http.X-Wikimedia-Debug) { set req.backend_hint = #{debug_dir}.backend(); } else { set req.backend_hint = #{dirname}.backend(); }" + next_dir = "cache_#{@cache_route}" + end + if dirs.empty? + dir_act = "XXX no backends defined, return an error!" + else + dir_act = "set req.backend_hint = #{next_dir}.backend();" + end + end + if debug_dir.nil? + return dir_act + else + debug_dirs = @app_directors[debug_dir]['backends'] + if debug_dirs.key?(@site) + next_debug_dir = debug_dir + else + next_debug_dir = "cache_#{@cache_route}" + end + if debug_dirs.empty? + debug_act = "XXX no backends defined, return an error!" 
+ else + debug_act = "set req.backend_hint = #{next_debug_dir}.backend();" + end + return "if (req.http.X-Wikimedia-Debug) { #{debug_act} } else { #{dir_act} }" end end @@ -51,15 +74,6 @@ set_backend = if_stmts.join(' els') %> <%= set_backend %> -} - -sub set_backend__ { -<% if @cache_route == 'direct' -%> - // tier-one caches must select an applayer backend - call set_backend_app__; -<% else -%> - set req.backend_hint = cache_<%= @cache_route %>.backend(); -<% end -%> } sub vcl_recv { diff --git a/modules/varnish/templates/vcl/wikimedia-common.inc.vcl.erb b/modules/varnish/templates/vcl/wikimedia-common.inc.vcl.erb index 50ea0d2..fa8bae6 100644 --- a/modules/varnish/templates/vcl/wikimedia-common.inc.vcl.erb +++ b/modules/varnish/templates/vcl/wikimedia-common.inc.vcl.erb @@ -91,7 +91,12 @@ # } # @app_directors = { # not always defined (e.g. frontends, for now) # 'foo' => { -# 'backend' => "foo.svc.eqiad.wmnet", # required service hostname +# 'backends' => { +# # If more than one DC listed here, service is active:active +# # If zero DCs listed here, service is effectively disabled +# 'eqiad' => 'foo.svc.eqiad.wmnet', +# 'codfw' => 'foo.svc.codfw.wmnet', +# }, # 'be_opts' => { # optional overrides of app_def_be_opts # 'port' = 80, # 'connect_timeout' = '2s', @@ -147,10 +152,11 @@ @app_directors.keys.sort.each do |director_name| director = @app_directors[director_name] be_opts = @app_def_be_opts.merge(director['be_opts'] || {}) - backend = director['backend'] - next if apphost_seen.key?(backend) - apphost_seen[backend] = 1 - name = 'be_' + backend.gsub(/[-.]/, '_') + if director['backends'].key?(@site) + backend = director['backends'][@site] + next if apphost_seen.key?(backend) + apphost_seen[backend] = 1 + name = 'be_' + backend.gsub(/[-.]/, '_') -%> backend <%= name %> { @@ -167,6 +173,7 @@ <%- end -%> } +<% end # @site if-condition -%> <% end # app_directors loop -%> <% end # !varnish_testing -%> @@ -198,18 +205,20 @@ <% end #dynamic_backend_caches -%> -<% 
@app_directors.keys.sort.each do |director_name| -director = @app_directors[director_name] -backend = director['backend'] -if @varnish_testing - name = "vtc_backend"; -else - name = 'be_' + backend.gsub(/[-.]/, '_') -end +<% +@app_directors.keys.sort.each do |director_name| + if @app_directors[director_name]['backends'].key?(@site) + backend = @app_directors[director_name]['backends'][@site] + if @varnish_testing + name = "vtc_backend"; + else + name = 'be_' + backend.gsub(/[-.]/, '_') + end -%> new <%= director_name %> = directors.random(); <%= director_name %>.add_backend(<%= name %>, 100); +<% end # @site if-condition -%> <% end # app_directors loop -%> } # end wm_common_directors_init -- To view, visit https://gerrit.wikimedia.org/r/339669 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic32b015d68df4a14e43cfbd577e6050d695ecddb Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: BBlack <bbl...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits