On 09/01/2009 06:43 AM, Bruce Simpson wrote:
Ben,

Thanks for this change. As of today, I've applied a very small portion
of it, by introducing debug_msg() calls into the path(s) where you've
added XLOG warnings.

I'm merging with upstream....

Why did you remove the part where I also updated the error_msg?  That gives
the caller some idea why it failed.

I'm fine with getting rid of the XLOG warnings, as that was mostly for
my own debugging needs.

The pop_xrl changes fix real bugs with the state machine (it could
get hung on certain error conditions, at least).

I'm attaching a patch of all my changes for the pim/ directory in
case you want to apply them, it includes:

*  Fix xrl task state machine dead-lock due to un-balanced pop/send_xrl calls.
*  Improve error messages
*  Don't panic on network device removal.

Thanks,
Ben

--
Ben Greear <[email protected]>
Candela Technologies Inc  http://www.candelatech.com

diff --git a/pim/pim_node.cc b/pim/pim_node.cc
index 5890d86..db3eac2 100644
--- a/pim/pim_node.cc
+++ b/pim/pim_node.cc
@@ -1159,7 +1159,7 @@ PimNode::enable_vif(const string& vif_name, string& 
error_msg)
 {
     PimVif *pim_vif = vif_find_by_name(vif_name);
     if (pim_vif == NULL) {
-       error_msg = c_format("Cannot enable vif %s: no such vif",
+       error_msg = c_format("PimNode:  Cannot enable vif %s: no such vif",
                             vif_name.c_str());
        XLOG_ERROR("%s", error_msg.c_str());
        return (XORP_ERROR);
@@ -1187,7 +1187,9 @@ PimNode::disable_vif(const string& vif_name, string& 
error_msg)
        error_msg = c_format("Cannot disable vif %s: no such vif",
                             vif_name.c_str());
        XLOG_ERROR("%s", error_msg.c_str());
-       return (XORP_ERROR);
+       // It's as disabled as it's going to get..don't fail the commit.
+       error_msg = "";
+       return XORP_OK;
     }
     
     pim_vif->disable();
@@ -1518,8 +1520,9 @@ PimNode::pim_send(const string& if_name,
                  string& error_msg)
 {
     if (! (is_up() || is_pending_down())) {
-       error_msg = c_format("MLD/IGMP node is not UP");
+       error_msg = c_format("PimNode::pim_send MLD/IGMP node is not UP");
+       XLOG_ERROR("%s", error_msg.c_str());
        return (XORP_ERROR);
     }
     
diff --git a/pim/pim_vif.cc b/pim/pim_vif.cc
index 499f79a..73cf2ff 100644
--- a/pim/pim_vif.cc
+++ b/pim/pim_vif.cc
@@ -578,6 +578,7 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
     bool ip_internet_control = true;   // XXX: might be overwritten below
 
     if (! (is_up() || is_pending_down())) {
+       error_msg += "Interface: " + name() + " is down or pending down when 
trying pim_send\n";
        debug_msg("Vif %s is currently down\n", name().c_str());
        return (XORP_ERROR);
     }
@@ -593,8 +594,9 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
        case PIM_ASSERT:
        case PIM_GRAFT:
        case PIM_GRAFT_ACK:
+           error_msg += "Invalid message_type, is_pim_register == true\n";
            debug_msg("Invalid message type %d on register vif\n",
-               message_type);
+                     message_type);
            return (XORP_ERROR);        // Those messages are not allowed
        case PIM_REGISTER:
        case PIM_REGISTER_STOP:
@@ -734,12 +736,12 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
     BUFFER_COPYPUT_INET_CKSUM(cksum, buffer, 2);       // XXX: the checksum
 
     XLOG_TRACE(pim_node().is_log_trace(),
-              "TX %s from %s to %s on vif %s",
-              PIMTYPE2ASCII(message_type),
-              cstring(src),
-              cstring(dst),
-              name().c_str());
-    
+              "pim_send: TX %s from %s to %s on vif %s",
+              PIMTYPE2ASCII(message_type),
+              cstring(src),
+              cstring(dst),
+              name().c_str());    
+
     //
     // Send the message
     //
diff --git a/pim/xrl_pim_node.cc b/pim/xrl_pim_node.cc
index cc113a3..e0c3278 100644
--- a/pim/xrl_pim_node.cc
+++ b/pim/xrl_pim_node.cc
@@ -487,8 +487,6 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
                PimNode::decr_shutdown_requests_n();    // XXX: for the MFEA
            }
        }
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -518,8 +516,6 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
            if (entry->target_name() == _mfea_target) {
                _is_mfea_registered = false;
            }
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -543,8 +539,11 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 //
@@ -1130,8 +1129,6 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
        else {
            PimNode::decr_shutdown_requests_n(); // XXX: for FEA-non-receiver
        }
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1156,8 +1153,6 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n();  // XXX: for FEA-non-receiver
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1181,8 +1176,10 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1319,16 +1316,15 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
            PimNode::decr_startup_requests_n();  // XXX: for MFEA-protocol
        else
            PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
        //
        // If a command failed because the other side rejected it, this is
        // fatal.
+       // Shouldn't be fatal, network device can suddenly dissappear, for 
instance.
        //
-       XLOG_FATAL("Cannot %s protocol with the MFEA: %s",
+       XLOG_ERROR("Cannot %s protocol with the MFEA: %s",
                   entry->operation_name(), xrl_error.str().c_str());
        break;
 
@@ -1345,8 +1341,6 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1370,8 +1364,10 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1514,8 +1510,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
            PimNode::decr_startup_requests_n();         // XXX: for FEA-join
        else
            PimNode::decr_shutdown_requests_n();        // XXX: for FEA-leave
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1523,7 +1517,7 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
        // If a command failed because the other side rejected it, this is
        // fatal.
        //
-       XLOG_FATAL("Cannot %s a multicast group with the FEA: %s",
+       XLOG_WARNING("Cannot %s a multicast group with the FEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
        break;
@@ -1541,8 +1535,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n();        // XXX: for FEA-leave
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1571,8 +1563,10 @@ 
XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
                   entry->vif_name().c_str(),
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1713,8 +1707,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1725,8 +1717,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
        XLOG_ERROR("Cannot %s a multicast forwarding entry with the MFEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -1762,8 +1752,10 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
                   "Will try again.",
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1989,16 +1981,12 @@ 
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
        XLOG_ERROR("Cannot %s dataflow monitor entry with the MFEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -2034,8 +2022,10 @@ 
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
                   entry->operation_name(),
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -2343,8 +2333,10 @@ XrlPimNode::send_protocol_message()
 {
     bool success = true;
 
-    if (! _is_finder_alive)
+    if (! _is_finder_alive) {
+       XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT 
alive!\n");
        return;         // The Finder is dead
+    }
 
     XLOG_ASSERT(! _xrl_tasks_queue.empty());
     XrlTaskBase* xrl_task_base = _xrl_tasks_queue.front();
@@ -2353,10 +2345,15 @@ XrlPimNode::send_protocol_message()
     entry = dynamic_cast<SendProtocolMessage*>(xrl_task_base);
     XLOG_ASSERT(entry != NULL);
 
+    //XLOG_ERROR("XrlPimNode::send_protocol_message interface/vif %s/%s. ",
+    //       entry->if_name().c_str(),
+    //       entry->vif_name().c_str());
+
     //
     // Check whether we have already registered with the FEA
     //
     if (! _is_fea_registered) {
+       XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT 
registered!\n");
        retry_xrl_task();
        return;
     }
@@ -2440,8 +2437,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -2459,8 +2454,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Cannot send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -2474,8 +2467,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Cannot send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case BAD_ARGS:
@@ -2499,11 +2490,13 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Failed to send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
     }
+
+    // In all cases, do the next task.
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 //
_______________________________________________
Xorp-hackers mailing list
[email protected]
http://mailman.ICSI.Berkeley.EDU/mailman/listinfo/xorp-hackers

Reply via email to