On 09/01/2009 06:43 AM, Bruce Simpson wrote:
Ben,
Thanks for this change. As of today, I've applied a very small portion
of it, by introducing debug_msg() calls into the path(s) where you've
added XLOG warnings.
I'm merging with upstream...
Why did you remove the part where I also updated the error_msg? That gives
the caller some idea why it failed.
I'm fine with getting rid of the XLOG warnings, as that was mostly for
my own debugging needs.
The pop_xrl changes fix real bugs with the state machine (it could
get hung on certain error conditions, at least).
I'm attaching a patch of all my changes for the pim/ directory in
case you want to apply them, it includes:
* Fix XRL task state machine deadlock due to unbalanced pop/send_xrl calls.
* Improve error messages
* Don't panic on network device removal.
Thanks,
Ben
--
Ben Greear <[email protected]>
Candela Technologies Inc http://www.candelatech.com
diff --git a/pim/pim_node.cc b/pim/pim_node.cc
index 5890d86..db3eac2 100644
--- a/pim/pim_node.cc
+++ b/pim/pim_node.cc
@@ -1159,7 +1159,7 @@ PimNode::enable_vif(const string& vif_name, string&
error_msg)
{
PimVif *pim_vif = vif_find_by_name(vif_name);
if (pim_vif == NULL) {
- error_msg = c_format("Cannot enable vif %s: no such vif",
+ error_msg = c_format("PimNode: Cannot enable vif %s: no such vif",
vif_name.c_str());
XLOG_ERROR("%s", error_msg.c_str());
return (XORP_ERROR);
@@ -1187,7 +1187,9 @@ PimNode::disable_vif(const string& vif_name, string&
error_msg)
error_msg = c_format("Cannot disable vif %s: no such vif",
vif_name.c_str());
XLOG_ERROR("%s", error_msg.c_str());
- return (XORP_ERROR);
+ // It's as disabled as it's going to get..don't fail the commit.
+ error_msg = "";
+ return XORP_OK;
}
pim_vif->disable();
@@ -1518,8 +1520,9 @@ PimNode::pim_send(const string& if_name,
string& error_msg)
{
if (! (is_up() || is_pending_down())) {
- error_msg = c_format("MLD/IGMP node is not UP");
+ error_msg = c_format("PimNode::pim_send MLD/IGMP node is not UP");
+ XLOG_ERROR("%s", error_msg.c_str());
return (XORP_ERROR);
}
diff --git a/pim/pim_vif.cc b/pim/pim_vif.cc
index 499f79a..73cf2ff 100644
--- a/pim/pim_vif.cc
+++ b/pim/pim_vif.cc
@@ -578,6 +578,7 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
bool ip_internet_control = true; // XXX: might be overwritten below
if (! (is_up() || is_pending_down())) {
+ error_msg += "Interface: " + name() + " is down or pending down when
trying pim_send\n";
debug_msg("Vif %s is currently down\n", name().c_str());
return (XORP_ERROR);
}
@@ -593,8 +594,9 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
case PIM_ASSERT:
case PIM_GRAFT:
case PIM_GRAFT_ACK:
+ error_msg += "Invalid message_type, is_pim_register == true\n";
debug_msg("Invalid message type %d on register vif\n",
- message_type);
+ message_type);
return (XORP_ERROR); // Those messages are not allowed
case PIM_REGISTER:
case PIM_REGISTER_STOP:
@@ -734,12 +736,12 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
BUFFER_COPYPUT_INET_CKSUM(cksum, buffer, 2); // XXX: the checksum
XLOG_TRACE(pim_node().is_log_trace(),
- "TX %s from %s to %s on vif %s",
- PIMTYPE2ASCII(message_type),
- cstring(src),
- cstring(dst),
- name().c_str());
-
+ "pim_send: TX %s from %s to %s on vif %s",
+ PIMTYPE2ASCII(message_type),
+ cstring(src),
+ cstring(dst),
+ name().c_str());
+
//
// Send the message
//
diff --git a/pim/xrl_pim_node.cc b/pim/xrl_pim_node.cc
index cc113a3..e0c3278 100644
--- a/pim/xrl_pim_node.cc
+++ b/pim/xrl_pim_node.cc
@@ -487,8 +487,6 @@
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
PimNode::decr_shutdown_requests_n(); // XXX: for the MFEA
}
}
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
@@ -518,8 +516,6 @@
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
if (entry->target_name() == _mfea_target) {
_is_mfea_registered = false;
}
- pop_xrl_task();
- send_xrl_task();
}
break;
@@ -543,8 +539,11 @@
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
"Will try again.",
entry->operation_name(), xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+
+ pop_xrl_task();
+ send_xrl_task();
}
//
@@ -1130,8 +1129,6 @@
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
else {
PimNode::decr_shutdown_requests_n(); // XXX: for FEA-non-receiver
}
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
@@ -1156,8 +1153,6 @@
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
} else {
PimNode::decr_shutdown_requests_n(); // XXX: for FEA-non-receiver
- pop_xrl_task();
- send_xrl_task();
}
break;
@@ -1181,8 +1176,10 @@
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
"Will try again.",
entry->operation_name(), xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+ pop_xrl_task();
+ send_xrl_task();
}
int
@@ -1319,16 +1316,15 @@
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
PimNode::decr_startup_requests_n(); // XXX: for MFEA-protocol
else
PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
//
// If a command failed because the other side rejected it, this is
// fatal.
+ // Shouldn't be fatal, network device can suddenly dissappear, for
instance.
//
- XLOG_FATAL("Cannot %s protocol with the MFEA: %s",
+ XLOG_ERROR("Cannot %s protocol with the MFEA: %s",
entry->operation_name(), xrl_error.str().c_str());
break;
@@ -1345,8 +1341,6 @@
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
} else {
PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
- pop_xrl_task();
- send_xrl_task();
}
break;
@@ -1370,8 +1364,10 @@
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
"Will try again.",
entry->operation_name(), xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+ pop_xrl_task();
+ send_xrl_task();
}
int
@@ -1514,8 +1510,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
PimNode::decr_startup_requests_n(); // XXX: for FEA-join
else
PimNode::decr_shutdown_requests_n(); // XXX: for FEA-leave
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
@@ -1523,7 +1517,7 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
// If a command failed because the other side rejected it, this is
// fatal.
//
- XLOG_FATAL("Cannot %s a multicast group with the FEA: %s",
+ XLOG_WARNING("Cannot %s a multicast group with the FEA: %s",
entry->operation_name(),
xrl_error.str().c_str());
break;
@@ -1541,8 +1535,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
} else {
PimNode::decr_shutdown_requests_n(); // XXX: for FEA-leave
- pop_xrl_task();
- send_xrl_task();
}
break;
@@ -1571,8 +1563,10 @@
XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
entry->vif_name().c_str(),
xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+ pop_xrl_task();
+ send_xrl_task();
}
int
@@ -1713,8 +1707,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
//
// If success, then schedule the next task
//
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
@@ -1725,8 +1717,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
XLOG_ERROR("Cannot %s a multicast forwarding entry with the MFEA: %s",
entry->operation_name(),
xrl_error.str().c_str());
- pop_xrl_task();
- send_xrl_task();
break;
case NO_FINDER:
@@ -1762,8 +1752,10 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
"Will try again.",
xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+ pop_xrl_task();
+ send_xrl_task();
}
int
@@ -1989,16 +1981,12 @@
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
//
// If success, then schedule the next task
//
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
XLOG_ERROR("Cannot %s dataflow monitor entry with the MFEA: %s",
entry->operation_name(),
xrl_error.str().c_str());
- pop_xrl_task();
- send_xrl_task();
break;
case NO_FINDER:
@@ -2034,8 +2022,10 @@
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
entry->operation_name(),
xrl_error.str().c_str());
retry_xrl_task();
- break;
+ return;
}
+ pop_xrl_task();
+ send_xrl_task();
}
int
@@ -2343,8 +2333,10 @@ XrlPimNode::send_protocol_message()
{
bool success = true;
- if (! _is_finder_alive)
+ if (! _is_finder_alive) {
+ XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT
alive!\n");
return; // The Finder is dead
+ }
XLOG_ASSERT(! _xrl_tasks_queue.empty());
XrlTaskBase* xrl_task_base = _xrl_tasks_queue.front();
@@ -2353,10 +2345,15 @@ XrlPimNode::send_protocol_message()
entry = dynamic_cast<SendProtocolMessage*>(xrl_task_base);
XLOG_ASSERT(entry != NULL);
+ //XLOG_ERROR("XrlPimNode::send_protocol_message interface/vif %s/%s. ",
+ // entry->if_name().c_str(),
+ // entry->vif_name().c_str());
+
//
// Check whether we have already registered with the FEA
//
if (! _is_fea_registered) {
+ XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT
registered!\n");
retry_xrl_task();
return;
}
@@ -2440,8 +2437,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const
XrlError& xrl_error)
//
// If success, then schedule the next task
//
- pop_xrl_task();
- send_xrl_task();
break;
case COMMAND_FAILED:
@@ -2459,8 +2454,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const
XrlError& xrl_error)
//
XLOG_ERROR("Cannot send a protocol message: %s",
xrl_error.str().c_str());
- pop_xrl_task();
- send_xrl_task();
break;
case NO_FINDER:
@@ -2474,8 +2467,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const
XrlError& xrl_error)
//
XLOG_ERROR("Cannot send a protocol message: %s",
xrl_error.str().c_str());
- pop_xrl_task();
- send_xrl_task();
break;
case BAD_ARGS:
@@ -2499,11 +2490,13 @@ XrlPimNode::fea_client_send_protocol_message_cb(const
XrlError& xrl_error)
//
XLOG_ERROR("Failed to send a protocol message: %s",
xrl_error.str().c_str());
- pop_xrl_task();
- send_xrl_task();
break;
}
+
+ // In all cases, do the next task.
+ pop_xrl_task();
+ send_xrl_task();
}
//
_______________________________________________
Xorp-hackers mailing list
[email protected]
http://mailman.ICSI.Berkeley.EDU/mailman/listinfo/xorp-hackers