Title: [opsview] [13233] add deadlock detection with automatic retries
Revision
13233
Author
aburzynski
Date
2013-08-14 17:25:59 +0100 (Wed, 14 Aug 2013)

Log Message

add deadlock detection with automatic retries

Modified Paths


Modified: trunk/opsview-core/bin/ndoutils_configdumpend
===================================================================
--- trunk/opsview-core/bin/ndoutils_configdumpend	2013-08-14 11:08:21 UTC (rev 13232)
+++ trunk/opsview-core/bin/ndoutils_configdumpend	2013-08-14 16:25:59 UTC (rev 13233)
@@ -42,8 +42,9 @@
 $0 = "ndoutils_configdumpend";
 my $logger = Log::Log4perl->get_logger($0);
 
+my $MAX_RETRIES = 5;
+
 my $dbh = Runtime->db_Main;
-$dbh->{HandleError} = sub { $logger->fatal(shift) };
 $logger->info( "Start" );
 
 my $opsview_db = Opsview::Config->db;
@@ -61,8 +62,9 @@
         $new_table = $org_table . '_tmp';
     }
     $logger->debug( 'Copying ' . $org_table . ' to ' . $new_table );
-    $dbh->do( 'DROP TABLE IF EXISTS ' . $new_table );
-    $dbh->do( $create . ' TABLE ' . $new_table . ' LIKE ' . $org_table );
+    deadlock_protected( 'DROP TABLE IF EXISTS ' . $new_table );
+    deadlock_protected(
+        $create . ' TABLE ' . $new_table . ' LIKE ' . $org_table );
     return $new_table;
 }
 
@@ -71,37 +73,71 @@
     ( my $new_table = $org_table ) =~ s/_tmp$//;
     my $old_table = $new_table . '_old';
     $logger->debug( 'Renaming ' . $org_table . ' to ' . $new_table );
-    $dbh->do( 'RENAME TABLE '
+    deadlock_protected( 'RENAME TABLE '
           . $new_table . ' TO '
           . $old_table . ', '
           . $org_table . ' TO '
           . $new_table );
     $logger->debug( 'Dropping ' . $old_table );
-    $dbh->do( 'DROP TABLE ' . $old_table );
+    deadlock_protected( 'DROP TABLE ' . $old_table );
 }
 
+sub deadlock_protected {
+    my @args = @_;
+    my $try  = 0;
+    SQL_DO: eval {
+        $try++;
+        $logger->warn("Restarting query due to deadlock, try: $try.\n")
+          if $try > 1;
+        if ( @args == 1 && ref $args[0] eq 'CODE' ) {
+            $args[0]->();
+        }
+        else {
+            $dbh->do(@args);
+        }
+    };
+    if ( my $e = $@ ) {
+        chomp $e;
+        if ( $e =~ /Deadlock.*transaction/ ) {
+            my $wait_for = $MAX_RETRIES - $try + 1;
+            $logger->warn( "Deadlock detected, retry in ${wait_for}s.\n$e\n" );
+            sleep $wait_for;
+            goto SQL_DO unless $try >= $MAX_RETRIES;
+        }
+        else {
+            $logger->fatal($e);
+        }
+    }
+}
+
 my $table;
 my $temp_table;
 
 # Matches contacts between runtime and opsview
-$table = copy_table( 'opsview_contacts' );
-$dbh->do( "
-INSERT INTO $table
-SELECT c.id, o.object_id, c.name
-FROM
- $opsview_db.contacts c,
- nagios_objects o
-WHERE
- o.objecttype_id = 10
- AND binary c.name = o.name1
-" );
-rename_table($table);
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_contacts');
+        $dbh->do( "
+    INSERT INTO $table
+    SELECT c.id, o.object_id, c.name
+    FROM
+     $opsview_db.contacts c,
+     nagios_objects o
+    WHERE
+     o.objecttype_id = 10
+     AND binary c.name = o.name1
+    " );
+        rename_table($table);
+    }
+);
 
 # Matches contacts with services based on contactgroups
 # Note: if a contact is marked against a service manually, this will not get picked up
 # Note: duplicate rows are ignored because of the intermediate table
-$table = copy_table( 'opsview_contact_services' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_contact_services');
+        $dbh->do( "
 INSERT INTO $table 
 SELECT DISTINCT c.id, s.service_object_id
 FROM
@@ -117,24 +153,29 @@
  AND scg.service_id = s.service_id
  AND s.config_type = 1
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Copies hostgroup table from opsview
-$table = copy_table( 'opsview_hostgroups' );
-$dbh->do(
-    "INSERT INTO $table SELECT id, parentid, name, lft, rgt, matpath, matpathid FROM $opsview_db.hostgroups"
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_hostgroups');
+        $dbh->do(
+            "INSERT INTO $table SELECT id, parentid, name, lft, rgt, matpath, matpathid FROM $opsview_db.hostgroups"
+        );
+        rename_table($table);
+    }
 );
-rename_table($table);
 
-$dbh->do(
-    "CREATE TEMPORARY TABLE temp_hosts (host_object_id int, hostname varchar(64))"
+deadlock_protected(
+    "CREATE TEMPORARY TABLE temp_hosts SELECT object_id AS host_object_id, name1 AS hostname FROM nagios_objects WHERE objecttype_id=1 AND is_active=1"
 );
-$dbh->do(
-    "INSERT INTO temp_hosts SELECT object_id, name1 FROM nagios_objects WHERE objecttype_id=1 AND is_active=1"
-);
 
-$table = copy_table( 'opsview_hostgroup_hosts' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_hostgroup_hosts');
+        $dbh->do( "
 INSERT INTO $table
 SELECT parent.id, o.host_object_id 
 FROM $opsview_db.hosts oh, opsview_hostgroups node, opsview_hostgroups parent, temp_hosts o 
@@ -143,10 +184,14 @@
  AND node.lft BETWEEN parent.lft AND parent.rgt 
  AND o.hostname = oh.name
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
-$table = copy_table( 'opsview_monitoringservers' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_monitoringservers' );
+        $dbh->do( "
 INSERT INTO $table
 SELECT monitoringservers.id, monitoringservers.name, monitoringservers.activated, monitoringservers.passive,
  GROUP_CONCAT(hosts.name)
@@ -162,10 +207,14 @@
  monitoringservers.host = hosts.id
 GROUP BY monitoringservers.id
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
-$table = copy_table( 'opsview_monitoringclusternodes' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_monitoringclusternodes');
+        $dbh->do( "
 INSERT INTO $table
 SELECT monitoringclusternodes.id, oh.name, oh.ip
 FROM
@@ -174,10 +223,15 @@
 WHERE
  monitoringclusternodes.host = oh.id
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 my $opsview_hosts_new = copy_table( 'opsview_hosts' );
-$dbh->do( "
+
+deadlock_protected(
+    sub {
+        $dbh->do( "
 INSERT INTO $opsview_hosts_new 
 SELECT temp_hosts.host_object_id, oh.id, temp_hosts.hostname, oh.ip, oh.alias, icons.filename, oh.hostgroup, oh.monitored_by, h2mcn.primary_node, h2mcn.secondary_node, 0, 0, 0
 FROM $opsview_db.hosts oh
@@ -187,21 +241,32 @@
 WHERE temp_hosts.hostname = oh.name
 AND oh.icon = icons.name
 " );
-$dbh->do( "DROP TABLE temp_hosts" );
+    }
+);
 
+deadlock_protected( "DROP TABLE temp_hosts" );
+
 # Update num_interfaces with number of active interfaces set for this host
 $logger->debug( 'Updating num_interfaces' );
-$dbh->do( "
+
+deadlock_protected(
+    sub {
+        $dbh->do( "
 UPDATE
  $opsview_hosts_new hosts,
  (SELECT hostid, COUNT(*) AS c FROM $opsview_db.hostsnmpinterfaces WHERE active=1 GROUP BY hostid) tt
 SET hosts.num_interfaces=tt.c
 WHERE tt.hostid=hosts.opsview_host_id
 " );
+    }
+);
 
 # Matches hosts with services, based on ndoutils' object ids
-$table = copy_table( 'opsview_host_services' );
-$dbh->do( "
+
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_host_services' );
+        $dbh->do( "
 INSERT INTO $table 
 SELECT s.host_object_id, o.name1, o.object_id, o.name2, (s.notes_url != ''), sc.markdown_filter, sc.id, sc.servicegroup, hosts.icon_filename 
 FROM  nagios_objects o
@@ -214,9 +279,11 @@
 WHERE 
  o.objecttype_id=2 AND s.config_type = 1
 " );
+    }
+);
 
 # Above misses out multi-servicechecks. Have to do in two steps otherwise the LIKE brings in other servicechecks
-$dbh->do( "
+deadlock_protected( "
 UPDATE $table
 LEFT JOIN $opsview_db.servicechecks sc
  ON servicename LIKE CONCAT(sc.name,': %')
@@ -227,7 +294,7 @@
 
 # Update opsview_hosts' num_services. We don't rename this until after we populated opsview_host_services
 $logger->debug( 'Updating num_services' );
-$dbh->do( "
+deadlock_protected( "
 UPDATE 
  $opsview_hosts_new hosts,
  (SELECT host_object_id AS id, COUNT(*) AS c FROM $table GROUP BY id) tt
@@ -239,8 +306,10 @@
 rename_table($opsview_hosts_new);
 rename_table($table);
 
-$table = copy_table( "opsview_servicechecks" );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table( "opsview_servicechecks" );
+        $dbh->do( "
 INSERT INTO $table
 SELECT DISTINCT(sc.id),sc.name,sc.description,(sc.attribute IS NOT NULL),(sc.checktype=1 || sc.checktype=5),sc.markdown_filter,sc.cascaded_from,sc.servicegroup
 FROM 
@@ -249,10 +318,14 @@
 WHERE
  opsview_host_services.servicecheck_id = sc.id
  " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
-$table = copy_table( 'opsview_servicegroups' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_servicegroups' );
+        $dbh->do( "
 INSERT INTO $table
 SELECT DISTINCT(sg.id),sg.name
 FROM 
@@ -261,11 +334,15 @@
 WHERE 
  opsview_servicechecks.servicegroup_id = sg.id
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Create lookup table so that can link opsview_hostgroup_hosts to nagios_statehistory
-$table = copy_table( 'opsview_host_objects' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_host_objects' );
+        $dbh->do( "
 INSERT INTO $table 
 (
 SELECT host_object_id, hostname, service_object_id, servicename, perfdata_available, markdown_filter, servicecheck_id, servicegroup_id FROM opsview_host_services
@@ -275,11 +352,15 @@
 SELECT id, name, id, NULL, 0, 0, 0, 0 FROM opsview_hosts
 )
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Make a hosts only table
-$table = copy_table( 'opsview_contact_hosts' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_contact_hosts');
+        $dbh->do( "
 INSERT INTO $table
 SELECT DISTINCT opsview_contact_services.contactid, opsview_host_services.host_object_id 
 FROM
@@ -288,26 +369,34 @@
 WHERE
  opsview_host_services.service_object_id = opsview_contact_services.service_object_id
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Make a generic objects table (primarily for joining to nagios_statechanges)
-$table = copy_table( 'opsview_contact_objects' );
-$logger->debug( 'Running first insert' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_contact_objects');
+        $logger->debug('Running first insert');
+        $dbh->do( "
 INSERT INTO $table
 SELECT contactid, service_object_id
 FROM opsview_contact_services
 " );
-$logger->debug( 'Running second insert' );
-$dbh->do( "
+        $logger->debug('Running second insert');
+        $dbh->do( "
 INSERT INTO $table
 SELECT contactid, host_object_id
 FROM opsview_contact_hosts
 " );
-rename_table($table);
+        rename_table($table);
+    }
+);
 
-$temp_table = copy_table( 'opsview_viewports', 'temporary' );
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $temp_table = copy_table( 'opsview_viewports', 'temporary' );
+        $dbh->do( "
 INSERT INTO $temp_table 
 SELECT 
  kh.keywordid, 
@@ -333,7 +422,7 @@
  AND k.all_hosts = 0
  AND k.all_servicechecks = 0
 " );
-$dbh->do( "
+        $dbh->do( "
 INSERT INTO $temp_table 
 SELECT 
  k.id,
@@ -353,7 +442,7 @@
  AND ohs.hostname = h.name 
  AND k.all_servicechecks = 1
 " );
-$dbh->do( "
+        $dbh->do( "
 INSERT INTO $temp_table 
 SELECT 
  k.id,
@@ -373,89 +462,102 @@
  AND (ohs.servicename = sc.name || ohs.servicename like concat(sc.name,': %') )
  AND k.all_hosts = 1
 " );
-$table = copy_table( 'opsview_viewports' );
+    }
+);
 
-# Inserts the services
-$dbh->do( "
+deadlock_protected(
+    sub {
+        $table = copy_table('opsview_viewports');
+
+        # Inserts the services
+        $dbh->do( "
 INSERT INTO $table
 SELECT DISTINCT viewportid, keyword, hostname, servicename, host_object_id, object_id 
 FROM $temp_table
 " );
 
-# Adds the hosts
-$dbh->do( "
+        # Adds the hosts
+        $dbh->do( "
 INSERT INTO $table
 SELECT DISTINCT viewportid, keyword, hostname, NULL, host_object_id, host_object_id
 FROM $temp_table
 " );
 
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Make a table with all known hostnames, servicenames and metricnames
-$table = copy_table( 'opsview_performance_metrics' );
-my @perfmetric_values;
-my $do_large_perfmetric_insert = sub {
-    my $force = shift;
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_performance_metrics' );
+        my @perfmetric_values;
+        my $do_large_perfmetric_insert = sub {
+            my $force = shift;
 
-    # Choose 1000 as a general optimisation
-    if (   ( ( my $vals = scalar @perfmetric_values / 4 ) >= 1000 )
-        || ( $force && @perfmetric_values ) )
-    {
-        $dbh->do(
-            "INSERT INTO $table (hostname, servicename, metricname, uom) VALUES "
-              . ( join ',', ("(?,?,?,?)") x $vals ),
-            {}, @perfmetric_values
-        );
-        @perfmetric_values = ();
-    }
-};
-
-# Read all hostnames, servicenames, metricnames from var/rrd directory
-my $rootdir = $ENV{OPSVIEW_TEST_ROOTDIR} || Opsview::Config->root_dir;
-my $rrddir = "$rootdir/var/rrd";
-opendir( RRDDIR, "$rrddir" );
-foreach my $hostname ( sort grep !/^\./, readdir RRDDIR ) {
-    opendir( HOST, "$rrddir/$hostname" );
-    foreach my $servicename ( sort grep !/^\./, readdir HOST ) {
-        opendir( SERVICE, "$rrddir/$hostname/$servicename" );
-        foreach my $metric ( sort grep !/^\./, readdir SERVICE ) {
-            my $uom = "";
-            if (   -e "$rrddir/$hostname/$servicename/$metric/uom"
-                && -s "$rrddir/$hostname/$servicename/$metric/uom" )
+            # Choose 1000 as a general optimisation
+            if (   ( ( my $vals = scalar @perfmetric_values / 4 ) >= 1000 )
+                || ( $force && @perfmetric_values ) )
             {
-                open UOM, "$rrddir/$hostname/$servicename/$metric/uom";
-                $uom = <UOM>;
-                chomp $uom;
-                close UOM;
+                $dbh->do(
+                    "INSERT INTO $table (hostname, servicename, metricname, uom) VALUES "
+                      . ( join ',', ("(?,?,?,?)") x $vals ),
+                    {}, @perfmetric_values
+                );
+                @perfmetric_values = ();
             }
-            push @perfmetric_values, urldecode($hostname),
-              urldecode($servicename), urldecode($metric), $uom;
+        };
+
+        # Read all hostnames, servicenames, metricnames from var/rrd directory
+        my $rootdir = $ENV{OPSVIEW_TEST_ROOTDIR} || Opsview::Config->root_dir;
+        my $rrddir = "$rootdir/var/rrd";
+        opendir( RRDDIR, "$rrddir" );
+        foreach my $hostname ( sort grep !/^\./, readdir RRDDIR ) {
+            opendir( HOST, "$rrddir/$hostname" );
+            foreach my $servicename ( sort grep !/^\./, readdir HOST ) {
+                opendir( SERVICE, "$rrddir/$hostname/$servicename" );
+                foreach my $metric ( sort grep !/^\./, readdir SERVICE ) {
+                    my $uom = "";
+                    if (   -e "$rrddir/$hostname/$servicename/$metric/uom"
+                        && -s "$rrddir/$hostname/$servicename/$metric/uom" )
+                    {
+                        open UOM, "$rrddir/$hostname/$servicename/$metric/uom";
+                        $uom = <UOM>;
+                        chomp $uom;
+                        close UOM;
+                    }
+                    push @perfmetric_values, urldecode($hostname),
+                      urldecode($servicename), urldecode($metric), $uom;
+                }
+            }
+            $do_large_perfmetric_insert->();
         }
-    }
-    $do_large_perfmetric_insert->();
-}
-$do_large_perfmetric_insert->(1);
+        $do_large_perfmetric_insert->(1);
 
-# Set the service_object_id
-$dbh->do(
-    "UPDATE $table, opsview_host_services 
+        # Set the service_object_id
+        $dbh->do(
+            "UPDATE $table, opsview_host_services 
 SET $table.service_object_id=opsview_host_services.service_object_id 
 WHERE 
  opsview_host_services.hostname = $table.hostname
  AND opsview_host_services.servicename = $table.servicename
 "
-);
+        );
 
-# These are rrds that exist, but the services are not defined in Opsview/Nagios
-$dbh->do( "DELETE FROM $table WHERE service_object_id=0" );
+        # These are rrds that exist, but the services are not defined in Opsview/Nagios
+        $dbh->do("DELETE FROM $table WHERE service_object_id=0");
 
-rename_table($table);
+        rename_table($table);
+    }
+);
 
 # Network topology map
-$table = copy_table( 'opsview_topology_map' );
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_topology_map' );
 
-$dbh->do(
-    qq{
+        $dbh->do(
+            qq{
 INSERT INTO $table ( monitored_by, object_id, host_id, opsview_host_id,
     name, parent_id, parent_object_id, parent_name, child_id, child_object_id, child_name )
 SELECT * FROM (
@@ -483,11 +585,11 @@
 ) order by opsview_host_id, IF(ISNULL(parent_id),1,0), parent_id, IF(ISNULL(child_id),1,0), child_id
 ) _
     }
-);
+        );
 
-# Update num_interfaces with number of active interfaces set for this host
-$logger->debug( 'Updating num_children' );
-$dbh->do( "
+        # Update num_interfaces with number of active interfaces set for this host
+        $logger->debug( 'Updating num_children' );
+        $dbh->do( "
 UPDATE
  opsview_hosts hosts,
  (SELECT object_id, COUNT(*) AS c FROM $table WHERE child_id IS NOT NULL GROUP BY object_id) tt
@@ -495,19 +597,22 @@
 WHERE tt.object_id=hosts.id
 " );
 
-rename_table($table);
+        rename_table($table);
+    }
+);
 
-$table = copy_table( 'opsview_hosts_matpaths' );
-{
+deadlock_protected(
+    sub {
+        $table = copy_table( 'opsview_hosts_matpaths' );
 
-    my $all_host_ids = $dbh->selectcol_arrayref(
-        q{
+        my $all_host_ids = $dbh->selectcol_arrayref(
+            q{
             SELECT id FROM opsview_hosts
         }
-    );
+        );
 
-    my $find_parent_sth = $dbh->prepare_cached(
-        q{
+        my $sth_find_parent = $dbh->prepare_cached(
+            q{
             SELECT parent_object_id
             FROM opsview_topology_map
             WHERE object_id = ?
@@ -515,59 +620,60 @@
                 AND parent_object_ID IS NOT NULL
             GROUP BY parent_object_id
         }
-    );
+        );
 
-    my $set_host_matpath = $dbh->prepare_cached(
-        qq{
+        my $sth_host_matpath = $dbh->prepare_cached(
+            qq{
             INSERT INTO $table (object_id, matpath, depth) VALUES(?, ?, ?)
         }
-    );
+        );
 
-    my $find_parent;
-    $find_parent = sub {
-        my $id = shift;
+        my $find_parent;
+        $find_parent = sub {
+            my $id = shift;
 
-        return unless $id;
+            return unless $id;
 
-        my $parents = $dbh->selectcol_arrayref( $find_parent_sth, {}, $id );
+            my $parents = $dbh->selectcol_arrayref( $sth_find_parent, {}, $id );
 
-        return @$parents;
-    };
+            return @$parents;
+        };
 
-    my $set_host_matpath = sub {
-        my $object_id = shift;
+        my $set_host_matpath = sub {
+            my $object_id = shift;
 
-        my $make_tree;
-        $make_tree = sub {
-            my ( $id, $depth, @tree ) = @_;
+            my $make_tree;
+            $make_tree = sub {
+                my ( $id, $depth, @tree ) = @_;
 
-            return unless $id;
+                return unless $id;
 
-            my @parents = $find_parent->($id);
+                my @parents = $find_parent->($id);
 
-            if (@parents) {
-                for my $parent (@parents) {
-                    $make_tree->( $parent, $depth + 1, @tree, $id );
+                if (@parents) {
+                    for my $parent (@parents) {
+                        $make_tree->( $parent, $depth + 1, @tree, $id );
+                    }
                 }
-            }
-            else { # reached root
-                push @tree, $id;
-                my $matpath = join( ',', reverse @tree ) . ",";
-                $set_host_matpath->execute( $object_id, $matpath, $depth );
-            }
+                else { # reached root
+                    push @tree, $id;
+                    my $matpath = join( ',', reverse @tree ) . ",";
+                    $sth_host_matpath->execute( $object_id, $matpath, $depth );
+                }
+            };
+            $make_tree->( $object_id, 1 );
         };
-        $make_tree->( $object_id, 1 );
-    };
-    for my $host_id (@$all_host_ids) {
-        $set_host_matpath->($host_id);
+        for my $host_id (@$all_host_ids) {
+            $set_host_matpath->($host_id);
+        }
+        rename_table($table);
     }
-}
-rename_table($table);
+);
 
 # Clear out downtime that should have finished.  Nagios should pass an event
 # to the db to say the downtime is complete but some circumstances such as
 # db being unavailable can prevent this
-$dbh->do(
+deadlock_protected(
     'delete from nagios_scheduleddowntime where scheduled_end_time < NOW()'
 );
 

_______________________________________________
Opsview-checkins mailing list
Opsview-checkins@lists.opsview.org
http://lists.opsview.org/lists/listinfo/opsview-checkins

Reply via email to