Colin Watson has proposed merging ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.
Commit message: Add region to builder metrics Requested reviews: Launchpad code reviewers (launchpad-reviewers) For more details, see: https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/444189 Launchpad builders don't really have a built-in notion of "region" (i.e. a collection of builders that are on common infrastructure and that are all managed as a group). However, in practice, we always create builders with names that allow us to infer a region from their names: for instance, `lcy02-amd64-{001..120}` currently form a region. Add the region to builder metrics where appropriate, so that we can write alerts of the form "no builder in a given region has been in the idle or building state for the last hour", which would indicate a problem with that region's builder reset handling. -- Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py index 174411c..1ba8b7b 100644 --- a/lib/lp/buildmaster/interactor.py +++ b/lib/lp/buildmaster/interactor.py @@ -375,6 +375,7 @@ BuilderVitals = namedtuple( "clean_status", "active", "failure_count", + "region", ), ) @@ -400,6 +401,7 @@ def extract_vitals_from_db(builder, build_queue=_BQ_UNSPECIFIED): builder.clean_status, builder.active, builder.failure_count, + builder.region, ) diff --git a/lib/lp/buildmaster/interfaces/builder.py b/lib/lp/buildmaster/interfaces/builder.py index 9225325..afca0b3 100644 --- a/lib/lp/buildmaster/interfaces/builder.py +++ b/lib/lp/buildmaster/interfaces/builder.py @@ -311,6 +311,16 @@ class IBuilderView(IHasBuildRecords, IHasOwner): ) ) + region = TextLine( + title=_("Region"), + required=True, + description=_( + "The builder's region. This is derived purely based on its name " + "(e.g. lcy02-amd64-001 is in the 'lcy02-amd64' region), and is " + "used for metrics." + ), + ) + def gotFailure(): """Increment failure_count on the builder.""" diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py index 67d564a..17c90a6 100644 --- a/lib/lp/buildmaster/manager.py +++ b/lib/lp/buildmaster/manager.py @@ -401,6 +401,7 @@ def recover_failure(logger, vitals, builder, retry, exception): labels = { "build": True, "arch": job.specific_build.processor.name, + "region": builder.region, "builder_name": builder.name, "virtualized": str(builder.virtualized), "job_type": job.specific_build.job_type.name, @@ -596,6 +597,7 @@ class WorkerScanner: { "build": True, "arch": builder.current_build.processor.name, + "region": builder.region, } ) else: diff --git a/lib/lp/buildmaster/model/builder.py b/lib/lp/buildmaster/model/builder.py index 9de4908..d03f10a 100644 --- a/lib/lp/buildmaster/model/builder.py +++ b/lib/lp/buildmaster/model/builder.py @@ -7,6 +7,7 @@ __all__ = [ "BuilderSet", ] +import re from datetime import timezone from storm.expr import Coalesce, Count, Sum @@ -43,6 +44,8 @@ from lp.services.propertycache import cachedproperty, get_property_cache from lp.soyuz.interfaces.binarypackagebuild import IBinaryPackageBuildSet from lp.soyuz.interfaces.buildrecords import IHasBuildRecords +region_re = re.compile(r"(^[a-z0-9][a-z0-9+.-]+)-\d+$") + @implementer(IBuilder, IHasBuildRecords) class Builder(StormBase): @@ -234,6 +237,11 @@ class Builder(StormBase): self, status=build_state, user=user ) + @property + def region(self): + region_match = region_re.match(self.name) + return region_match.group(1) if region_match is not None else "" + class BuilderProcessor(StormBase): __storm_table__ = "BuilderProcessor" diff --git a/lib/lp/buildmaster/model/buildfarmjob.py b/lib/lp/buildmaster/model/buildfarmjob.py index 5549891..e69651f 100644 --- a/lib/lp/buildmaster/model/buildfarmjob.py +++ b/lib/lp/buildmaster/model/buildfarmjob.py @@ -197,6 +197,7 @@ class BuildFarmJobMixin: { "builder_name": self.builder.name, "virtualized": str(self.builder.virtualized), + "region": self.builder.region, } ) labels.update(extra) diff --git a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py index f4218df..450e375 100644 --- a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py +++ b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py @@ -197,6 +197,7 @@ class BuildFarmJobBehaviourBase: labels={ "job_type": job_type_name, "builder_name": self._builder.name, + "region": self._builder.region, }, ) diff --git a/lib/lp/buildmaster/tests/mock_workers.py b/lib/lp/buildmaster/tests/mock_workers.py index 1840d4b..a880a9a 100644 --- a/lib/lp/buildmaster/tests/mock_workers.py +++ b/lib/lp/buildmaster/tests/mock_workers.py @@ -38,6 +38,7 @@ from txfixtures.tachandler import TacTestFixture from lp.buildmaster.enums import BuilderCleanStatus, BuilderResetProtocol from lp.buildmaster.interactor import BuilderWorker from lp.buildmaster.interfaces.builder import CannotFetchFile +from lp.buildmaster.model.builder import region_re from lp.services.config import config from lp.services.daemons.tachandler import twistd_script from lp.services.webapp import urlappend @@ -95,6 +96,11 @@ class MockBuilder: self.builderok = False self.failnotes = reason + @property + def region(self): + region_match = region_re.match(self.name) + return region_match.group(1) if region_match is not None else "" + # XXX: It would be *really* nice to run some set of tests against the real # BuilderWorker and this one to prevent interface skew. diff --git a/lib/lp/buildmaster/tests/test_builder.py b/lib/lp/buildmaster/tests/test_builder.py index 39da0c8..f122c43 100644 --- a/lib/lp/buildmaster/tests/test_builder.py +++ b/lib/lp/buildmaster/tests/test_builder.py @@ -83,6 +83,14 @@ class TestBuilder(TestCaseWithFactory): self.assertEqual(proc, builder.processor) self.assertEqual([proc], builder.processors) + def test_region_empty(self): + builder = self.factory.makeBuilder(name="some-builder-name") + self.assertEqual("", builder.region) + + def test_region(self): + builder = self.factory.makeBuilder(name="some-region-001") + self.assertEqual("some-region", builder.region) + # XXX cjwatson 2020-05-18: All these tests would now make more sense in # lp.buildmaster.tests.test_buildqueue, and should be moved there when diff --git a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py index 924b131..a164b11 100644 --- a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py +++ b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py @@ -326,11 +326,11 @@ class TestDispatchBuildToWorker(StatsMixin, TestCase): yield behaviour.dispatchBuildToWorker(logger) self.assertEqual(1, self.stats_client.incr.call_count) self.assertEqual( - self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name=mock-builder,env=test," - "job_type=UNKNOWN", + "job_type=UNKNOWN,region=", ), + self.stats_client.incr.call_args_list[0][0], ) diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py index dfd7530..a0edb84 100644 --- a/lib/lp/buildmaster/tests/test_manager.py +++ b/lib/lp/buildmaster/tests/test_manager.py @@ -1417,17 +1417,20 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory): [ mock.call( "builders.job_reset,arch={},build=True,builder_name={}," - "env=test,job_type=RECIPEBRANCHBUILD," + "env=test,job_type=RECIPEBRANCHBUILD,region={}," "virtualized=True".format( build.processor.name, self.builder.name, + self.builder.region, ) ), mock.call( "build.reset,arch={},builder_name={},env=test," - "job_type=RECIPEBRANCHBUILD,virtualized=True".format( + "job_type=RECIPEBRANCHBUILD,region={}," + "virtualized=True".format( build.processor.name, self.builder.name, + self.builder.region, ) ), ] @@ -1481,15 +1484,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory): mock.call( "builders.job_cancelled,arch={},build=True," "builder_name={},env=test,job_type=RECIPEBRANCHBUILD," - "virtualized=True".format( + "region=builder-name,virtualized=True".format( naked_build.processor.name, self.builder.name, ) ), mock.call( "build.finished,arch={},builder_name={},env=test," - "job_type=RECIPEBRANCHBUILD,status=CANCELLED," - "virtualized=True".format( + "job_type=RECIPEBRANCHBUILD,region=builder-name," + "status=CANCELLED,virtualized=True".format( naked_build.processor.name, self.builder.name, ) @@ -1514,15 +1517,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory): [ mock.call( "build.finished,arch={},builder_name={},env=test," - "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD," - "virtualized=True".format( + "job_type=RECIPEBRANCHBUILD,region=builder-name," + "status=FAILEDTOBUILD,virtualized=True".format( naked_build.processor.name, self.builder.name, ) ), mock.call( "builders.job_failed,arch={},build=True,builder_name={}," - "env=test,job_type=RECIPEBRANCHBUILD," + "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name," "virtualized=True".format( naked_build.processor.name, self.builder.name, @@ -1556,15 +1559,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory): [ mock.call( "build.finished,arch={},builder_name={},env=test," - "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT," - "virtualized=True".format( + "job_type=RECIPEBRANCHBUILD,region=builder-name," + "status=FULLYBUILT,virtualized=True".format( naked_build.processor.name, self.builder.name, ) ), mock.call( "builders.job_failed,arch={},build=True,builder_name={}," - "env=test,job_type=RECIPEBRANCHBUILD," + "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name," "virtualized=True".format( naked_build.processor.name, self.builder.name, @@ -1592,15 +1595,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory): mock.call( "builders.job_cancelled,arch={},build=True," "builder_name={},env=test,job_type=RECIPEBRANCHBUILD," - "virtualized=True".format( + "region=builder-name,virtualized=True".format( naked_build.processor.name, self.builder.name, ) ), mock.call( "build.finished,arch={},builder_name={},env=test," - "job_type=RECIPEBRANCHBUILD,status=CANCELLED," - "virtualized=True".format( + "job_type=RECIPEBRANCHBUILD,region=builder-name," + "status=CANCELLED,virtualized=True".format( naked_build.processor.name, self.builder.name, ) diff --git a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py index 10e676d..2851a9f 100644 --- a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py +++ b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py @@ -516,7 +516,9 @@ class TestAsyncCharmRecipeBuildBehaviour( self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=CHARMRECIPEBUILD".format(builder.name), + "job_type=CHARMRECIPEBUILD,region={}".format( + builder.name, builder.region + ), ), ) diff --git a/lib/lp/code/model/tests/test_cibuildbehaviour.py b/lib/lp/code/model/tests/test_cibuildbehaviour.py index 4d6221f..578c38e 100644 --- a/lib/lp/code/model/tests/test_cibuildbehaviour.py +++ b/lib/lp/code/model/tests/test_cibuildbehaviour.py @@ -710,7 +710,9 @@ class TestAsyncCIBuildBehaviour(StatsMixin, TestCIBuildBehaviourBase): self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=CIBUILD".format(builder.name), + "job_type=CIBUILD,region={}".format( + builder.name, builder.region + ), ), ) diff --git a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py index 8e2c214..90ba1b7 100644 --- a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py +++ b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py @@ -782,7 +782,9 @@ class TestAsyncOCIRecipeBuildBehaviour( self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=OCIRECIPEBUILD".format(builder.name), + "job_type=OCIRECIPEBUILD,region={}".format( + builder.name, builder.region + ), ), ) diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py index bd8cd3b..b4666cd 100644 --- a/lib/lp/services/statsd/numbercruncher.py +++ b/lib/lp/services/statsd/numbercruncher.py @@ -111,7 +111,7 @@ class NumberCruncher(service.Service): continue for processor_name in builder.processor_names: counts = counts_by_processor.setdefault( - (processor_name, builder.virtualized), + (processor_name, builder.virtualized, builder.region), {"cleaning": 0, "idle": 0, "disabled": 0, "building": 0}, ) if not builder.builderok: @@ -127,7 +127,11 @@ class NumberCruncher(service.Service): builder.failure_count, labels={"builder_name": builder.name}, ) - for (processor, virtualized), counts in counts_by_processor.items(): + for ( + processor, + virtualized, + region, + ), counts in counts_by_processor.items(): for count_name, count_value in counts.items(): self._sendGauge( "builders", @@ -135,6 +139,7 @@ class NumberCruncher(service.Service): labels={ "status": count_name, "arch": processor, + "region": region, "virtualized": virtualized, }, ) diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py index a7d4dd7..9db157f 100644 --- a/lib/lp/services/statsd/tests/test_numbercruncher.py +++ b/lib/lp/services/statsd/tests/test_numbercruncher.py @@ -59,8 +59,8 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory): ] expected_gauges.extend( [ - "builders,arch=386,env=test,status=%s,virtualized=True" - % status + "builders,arch=386,env=test,region=builder-name,status=%s," + "virtualized=True" % status for status in ("building", "cleaning", "disabled", "idle") ] ) @@ -74,16 +74,21 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory): def test_multiple_processor_counts(self): builders = [ self.factory.makeBuilder( + name=self.factory.getUniqueUnicode(region), processors=[ getUtility(IProcessorSet).getByName(processor_name) ], virtualized=virtualized, ) - for processor_name, virtualized in ( - ("386", True), - ("386", False), - ("amd64", True), - ("amd64", False), + for processor_name, virtualized, region in ( + ("386", True, "test1"), + ("386", True, "test2"), + ("386", False, "test1"), + ("386", False, "test2"), + ("amd64", True, "test1"), + ("amd64", True, "test2"), + ("amd64", False, "test1"), + ("amd64", False, "test2"), ) ] for builder in builders: @@ -101,10 +106,11 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory): ] expected_gauges.extend( [ - "builders,arch=%s,env=test,status=%s,virtualized=%s" - % (arch, status, virtualized) + "builders,arch=%s,env=test,region=%s,status=%s,virtualized=%s" + % (arch, region, status, virtualized) for arch in ("386", "amd64") for virtualized in (True, False) + for region in ("test1", "test2") for status in ("building", "cleaning", "disabled", "idle") ] ) @@ -182,7 +188,7 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory): ) expected_gauges.update( { - "builders,arch=amd64,env=test,status=%s," + "builders,arch=amd64,env=test,region=builder-name,status=%s," "virtualized=True" % status: count for status, count in ( ("building", 2), diff --git a/lib/lp/snappy/tests/test_snapbuildbehaviour.py b/lib/lp/snappy/tests/test_snapbuildbehaviour.py index ef4e269..28ffc4b 100644 --- a/lib/lp/snappy/tests/test_snapbuildbehaviour.py +++ b/lib/lp/snappy/tests/test_snapbuildbehaviour.py @@ -1208,7 +1208,9 @@ class TestAsyncSnapBuildBehaviour(StatsMixin, TestSnapBuildBehaviourBase): self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=SNAPBUILD".format(builder.name), + "job_type=SNAPBUILD,region={}".format( + builder.name, builder.region + ), ), ) diff --git a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py index 073b387..ee8278e 100644 --- a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py +++ b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py @@ -235,7 +235,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory): self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=PACKAGEBUILD".format(builder.name), + "job_type=PACKAGEBUILD,region={}".format( + builder.name, builder.region + ), ), ) @@ -318,7 +320,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory): self.stats_client.incr.call_args_list[0][0], ( "build.count,builder_name={},env=test," - "job_type=PACKAGEBUILD".format(builder.name), + "job_type=PACKAGEBUILD,region={}".format( + builder.name, builder.region + ), ), )
_______________________________________________ Mailing list: https://launchpad.net/~launchpad-reviewers Post to : launchpad-reviewers@lists.launchpad.net Unsubscribe : https://launchpad.net/~launchpad-reviewers More help : https://help.launchpad.net/ListHelp