Add periodic CPython garbage collector statistics logging to aid in tracking down a memory leak and as a general health beacon for the long-running process.
The interval at which stats are logged is configurable. Signed-off-by: Gregory Smith <[email protected]> --- autotest/global_config.ini 2009-12-18 02:24:44.000000000 -0800 +++ autotest/global_config.ini 2009-12-18 02:24:44.000000000 -0800 @@ -70,6 +70,7 @@ secs_to_wait_for_atomic_group_hosts: 600 pidfile_timeout_mins: 300 max_pidfile_refreshes: 2000 +gc_stats_interval_mins: 360 [HOSTS] wait_up_processes: --- /dev/null 2009-12-17 12:29:38.000000000 -0800 +++ autotest/scheduler/gc_stats.py 2009-12-18 02:24:44.000000000 -0800 @@ -0,0 +1,89 @@ +# Compute and gather statistics about garbage collection in this process. +# This module depends on the CPython gc module and garbage collection behavior. + +import gc, logging, pprint + + +verbose = False + + +# A mapping from type objects to a count of instances of those types in the +# garbage collectors all objects list on the previous call to +# _log_garbage_collector_stats(). +_previous_obj_type_map = {} + + +# A set of object ids for everything in the all objects list on the +# previous call to _log_garbage_collector_stats(). +_previous_obj_ids = set() + + +def _log_garbage_collector_stats(minimum_count=10): + """ + Log statistics about how many of what type of Python object exist in this + process. + + @param minimum_count: The minimum number of instances of a type for it + to be considered worthy of logging. + """ + global _previous_obj_type_map + global _previous_obj_ids + + # We get all objects -before- creating any new objects within this function. + # to avoid having our own local instances in the list. 
+ all_objects = gc.get_objects() + obj = None + new_objects = [] + try: + obj_type_map = {} + object_ids = set() + for obj in all_objects: + obj_type = type(obj) + obj_type_map.setdefault(obj_type, 0) + obj_type_map[obj_type] += 1 + object_ids.add(id(obj)) + whats_new_big_str = '' + if verbose and _previous_obj_ids: + new_object_ids = object_ids - _previous_obj_ids + for obj in all_objects: + if id(obj) in new_object_ids: + new_objects.append(obj) + whats_new_big_str = pprint.pformat(new_objects, indent=1) + finally: + # Never keep references to stuff returned by gc.get_objects() around + # or it'll just make the future cyclic gc runs more difficult. + del all_objects + del obj + del new_objects + + + delta = {} + for obj_type, count in obj_type_map.iteritems(): + if obj_type not in _previous_obj_type_map: + delta[obj_type] = count + elif _previous_obj_type_map[obj_type] != count: + delta[obj_type] = count - _previous_obj_type_map[obj_type] + + sorted_stats = reversed(sorted( + (count, obj_type) for obj_type, count in obj_type_map.iteritems())) + sorted_delta = reversed(sorted( + (count, obj_type) for obj_type, count in delta.iteritems())) + + logging.debug('Garbage collector object type counts:') + for count, obj_type in sorted_stats: + if count >= minimum_count: + logging.debug(' %d\t%s', count, obj_type) + + logging.info('Change in object counts since previous GC stats:') + for change, obj_type in sorted_delta: + if obj_type_map[obj_type] > minimum_count: + logging.info(' %+d\t%s\tto %d', change, obj_type, + obj_type_map[obj_type]) + + if verbose and whats_new_big_str: + logging.debug('Pretty printed representation of the new objects:') + logging.debug(whats_new_big_str) + + _previous_obj_type_map = obj_type_map + if verbose: + _previous_obj_ids = object_ids --- /dev/null 2009-12-17 12:29:38.000000000 -0800 +++ autotest/scheduler/gc_stats_unittest.py 2009-12-18 02:24:44.000000000 -0800 @@ -0,0 +1,37 @@ +#!/usr/bin/python + +import gc +import logging + +import 
common +from autotest_lib.client.common_lib.test_utils import mock +from autotest_lib.client.common_lib.test_utils import unittest +from autotest_lib.scheduler import gc_stats + + +class TestGcStats(unittest.TestCase): + def setUp(self): + self.god = mock.mock_god() + + + def tearDown(self): + self.god.unstub_all() + + + def test_log_garbage_collector_stats(self): + # Call this for code coverage. + # Prevent log spam from this test but do test that the log + # message formats correctly. + def _mock_logging_func(message, *args): + if args: + message %= args + self.god.stub_with(logging, 'debug', _mock_logging_func) + self.god.stub_with(logging, 'info', _mock_logging_func) + gc_stats._log_garbage_collector_stats() + # Add a new dict, exercise the delta counting & printing code. + y = {} + gc_stats._log_garbage_collector_stats(1) + + +if __name__ == '__main__': + unittest.main() --- autotest/scheduler/monitor_db.py 2009-12-18 02:24:44.000000000 -0800 +++ autotest/scheduler/monitor_db.py 2009-12-18 02:24:44.000000000 -0800 @@ -7,7 +7,7 @@ import datetime, errno, optparse, os, pwd, Queue, re, shutil, signal import smtplib, socket, stat, subprocess, sys, tempfile, time, traceback -import itertools, logging, weakref +import itertools, logging, weakref, gc import common import MySQLdb from autotest_lib.scheduler import scheduler_logging_config @@ -19,6 +19,7 @@ from autotest_lib.scheduler import drone_manager, drones, email_manager from autotest_lib.scheduler import monitor_db_cleanup from autotest_lib.scheduler import status_server, scheduler_config +from autotest_lib.scheduler import gc_stats BABYSITTER_PID_FILE_PREFIX = 'monitor_db_babysitter' PID_FILE_PREFIX = 'monitor_db' @@ -631,6 +632,12 @@ self._24hr_upkeep = monitor_db_cleanup.TwentyFourHourUpkeep(_db) self._host_agents = {} self._queue_entry_agents = {} + self._tick_count = 0 + self._last_garbage_stats_time = time.time() + self._seconds_between_garbage_stats = 60 * ( + 
global_config.global_config.get_config_value( + scheduler_config.CONFIG_SECTION, + 'gc_stats_interval_mins', type=int, default=6*60)) def initialize(self, recover_hosts=True): @@ -645,6 +652,7 @@ def tick(self): + self._garbage_collection() _drone_manager.refresh() self._run_cleanup() self._find_aborting() @@ -656,6 +664,7 @@ self._handle_agents() _drone_manager.execute_actions() email_manager.manager.send_queued_emails() + self._tick_count += 1 def _run_cleanup(self): @@ -663,6 +672,21 @@ self._24hr_upkeep.run_cleanup_maybe() + def _garbage_collection(self): + threshold_time = time.time() - self._seconds_between_garbage_stats + if threshold_time < self._last_garbage_stats_time: + # Don't generate these reports very often. + return + + self._last_garbage_stats_time = time.time() + # Force a full level 0 collection (because we can, it doesn't hurt + # at this interval). + gc.collect() + logging.info('Logging garbage collector stats on tick %d.', + self._tick_count) + gc_stats._log_garbage_collector_stats() + + def _register_agent_for_ids(self, agent_dict, object_ids, agent): for object_id in object_ids: agent_dict.setdefault(object_id, set()).add(agent) --- autotest/scheduler/monitor_db_unittest.py 2009-12-18 02:24:44.000000000 -0800 +++ autotest/scheduler/monitor_db_unittest.py 2009-12-18 02:24:44.000000000 -0800 @@ -1,18 +1,19 @@ #!/usr/bin/python -import unittest, time, subprocess, os, StringIO, tempfile, datetime, shutil -import logging +import time, subprocess, os, StringIO, tempfile, datetime, shutil +import gc, logging import common import MySQLdb from autotest_lib.frontend import setup_django_environment from autotest_lib.frontend.afe import frontend_test_utils from autotest_lib.client.common_lib import global_config, host_protections from autotest_lib.client.common_lib.test_utils import mock +from autotest_lib.client.common_lib.test_utils import unittest from autotest_lib.database import database_connection, migrate from autotest_lib.frontend import 
thread_local from autotest_lib.frontend.afe import models from autotest_lib.scheduler import monitor_db, drone_manager, email_manager -from autotest_lib.scheduler import scheduler_config +from autotest_lib.scheduler import scheduler_config, gc_stats _DEBUG = False @@ -771,6 +772,22 @@ self._check_for_extra_schedulings() + def test_garbage_collection(self): + self.god.stub_with(self._dispatcher, '_seconds_between_garbage_stats', + 999999) + self.god.stub_function(gc, 'collect') + self.god.stub_function(gc_stats, '_log_garbage_collector_stats') + gc.collect.expect_call().and_return(0) + gc_stats._log_garbage_collector_stats.expect_call() + # Force a garbage collection run + self._dispatcher._last_garbage_stats_time = 0 + self._dispatcher._garbage_collection() + # The previous call should have reset the time, it won't do anything + # the second time. If it does, we'll get an unexpected call. + self._dispatcher._garbage_collection() + + + class DispatcherThrottlingTest(BaseSchedulerTest): """ Test that the dispatcher throttles: _______________________________________________ Autotest mailing list [email protected] http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
