On some error conditions related to interface removal, the PIM callbacks would
not handle the next task, and so nothing would ever look at the task queue
again, effectively hanging the multicast routing daemon.

The attached patch fixes it.  I'll open a bugzilla entry as well.

Thanks,
Ben

--
Ben Greear <[email protected]>
Candela Technologies Inc  http://www.candelatech.com

diff --git a/fea/xrl_fea_target.cc b/fea/xrl_fea_target.cc
index dc7f663..310ba54 100644
--- a/fea/xrl_fea_target.cc
+++ b/fea/xrl_fea_target.cc
@@ -3514,6 +3514,8 @@ XrlFeaTarget::raw_packet4_0_1_send(
                            payload, error_msg)
        != XORP_OK) {
        assert(error_msg.size()); // don't allow empty error messages.
+       //XLOG_ERROR("Failed raw_packet4_0_1_send, iface: %s/%s  error_msg: 
%s\n",
+       //         if_name.c_str(), vif_name.c_str(), error_msg.c_str());
        return XrlCmdError::COMMAND_FAILED(error_msg);
     }
 
diff --git a/pim/pim_node.cc b/pim/pim_node.cc
index 3d1d233..765b5e8 100644
--- a/pim/pim_node.cc
+++ b/pim/pim_node.cc
@@ -1521,7 +1521,8 @@ PimNode::pim_send(const string& if_name,
                  string& error_msg)
 {
     if (! (is_up() || is_pending_down())) {
-       error_msg = c_format("MLD/IGMP node is not UP");
+       error_msg = c_format("PimNode::pim_send MLD/IGMP node is not UP");
+       XLOG_ERROR("%s", error_msg.c_str());
        return (XORP_ERROR);
     }
     
diff --git a/pim/pim_proto_hello.cc b/pim/pim_proto_hello.cc
index f5bbd19..537b545 100644
--- a/pim/pim_proto_hello.cc
+++ b/pim/pim_proto_hello.cc
@@ -119,6 +119,10 @@ PimVif::pim_hello_recv(PimNbr *pim_nbr,
     list<IPvX> secondary_addr_list;
     list<IPvX>::iterator addr_list_iter;
     int                rcvd_family;
+
+    //XLOG_WARNING("pim_hello_recv:  RX %s from %s to %s: ",
+    //  PIMTYPE2ASCII(PIM_HELLO),
+    //  cstring(src), cstring(dst));
     
     //
     // Parse the message
@@ -669,8 +673,8 @@ PimVif::pim_hello_send(string& error_msg)
        }
     }
     
-    return (pim_send(primary_addr(), IPvX::PIM_ROUTERS(family()),
-                    PIM_HELLO, buffer, error_msg));
+    return pim_send(primary_addr(), IPvX::PIM_ROUTERS(family()),
+                   PIM_HELLO, buffer, error_msg);
     
  invalid_addr_family_error:
     XLOG_UNREACHABLE();
diff --git a/pim/pim_vif.cc b/pim/pim_vif.cc
index 0719452..7737bdb 100644
--- a/pim/pim_vif.cc
+++ b/pim/pim_vif.cc
@@ -577,8 +577,11 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
     int ttl = MINTTL;
     bool ip_internet_control = true;   // XXX: might be overwritten below
 
-    if (! (is_up() || is_pending_down()))
+    if (! (is_up() || is_pending_down())) {
+       error_msg += "Interface: " + name() + " is down or pending down when 
trying pim_send\n";
+       XLOG_WARNING("%s", error_msg.c_str());
        return (XORP_ERROR);
+    }
 
     //
     // Some of the messages should never be send via the PIM Register vif
@@ -591,6 +594,7 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
        case PIM_ASSERT:
        case PIM_GRAFT:
        case PIM_GRAFT_ACK:
+           error_msg += "Invalid message_type, is_pim_register == true\n";
            return (XORP_ERROR);        // Those messages are not allowed
        case PIM_REGISTER:
        case PIM_REGISTER_STOP:
@@ -730,12 +734,12 @@ PimVif::pim_send(const IPvX& src, const IPvX& dst,
     BUFFER_COPYPUT_INET_CKSUM(cksum, buffer, 2);       // XXX: the checksum
 
     XLOG_TRACE(pim_node().is_log_trace(),
-              "pim_send: TX %s from %s to %s on vif %s",
-              PIMTYPE2ASCII(message_type),
-              cstring(src),
-              cstring(dst),
-              name().c_str());
-    
+              "pim_send: TX %s from %s to %s on vif %s",
+              PIMTYPE2ASCII(message_type),
+              cstring(src),
+              cstring(dst),
+              name().c_str());    
+
     //
     // Send the message
     //
diff --git a/pim/xrl_pim_node.cc b/pim/xrl_pim_node.cc
index ff8f584..daf789e 100644
--- a/pim/xrl_pim_node.cc
+++ b/pim/xrl_pim_node.cc
@@ -487,8 +487,6 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
                PimNode::decr_shutdown_requests_n();    // XXX: for the MFEA
            }
        }
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -518,8 +516,6 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
            if (entry->target_name() == _mfea_target) {
                _is_mfea_registered = false;
            }
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -543,8 +539,11 @@ 
XrlPimNode::finder_send_register_unregister_interest_cb(const XrlError& xrl_erro
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 //
@@ -1130,8 +1129,6 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
        else {
            PimNode::decr_shutdown_requests_n(); // XXX: for FEA-non-receiver
        }
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1156,8 +1153,6 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n();  // XXX: for FEA-non-receiver
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1181,8 +1176,10 @@ 
XrlPimNode::fea_client_send_register_unregister_receiver_cb(
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1319,8 +1316,6 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
            PimNode::decr_startup_requests_n();  // XXX: for MFEA-protocol
        else
            PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1346,8 +1341,6 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n(); // XXX: for MFEA-non-protocol
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1371,8 +1364,10 @@ 
XrlPimNode::mfea_client_send_register_unregister_protocol_cb(
                   "Will try again.",
                   entry->operation_name(), xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1515,8 +1510,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
            PimNode::decr_startup_requests_n();         // XXX: for FEA-join
        else
            PimNode::decr_shutdown_requests_n();        // XXX: for FEA-leave
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1524,7 +1517,7 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
        // If a command failed because the other side rejected it, this is
        // fatal.
        //
-       XLOG_FATAL("Cannot %s a multicast group with the FEA: %s",
+       XLOG_WARNING("Cannot %s a multicast group with the FEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
        break;
@@ -1542,8 +1535,6 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
            XLOG_ERROR("XRL communication error: %s", xrl_error.str().c_str());
        } else {
            PimNode::decr_shutdown_requests_n();        // XXX: for FEA-leave
-           pop_xrl_task();
-           send_xrl_task();
        }
        break;
 
@@ -1572,8 +1563,10 @@ 
XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
                   entry->vif_name().c_str(),
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1714,8 +1707,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -1726,8 +1717,6 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
        XLOG_ERROR("Cannot %s a multicast forwarding entry with the MFEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -1763,8 +1752,10 @@ XrlPimNode::mfea_client_send_add_delete_mfc_cb(
                   "Will try again.",
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -1990,16 +1981,12 @@ 
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
        XLOG_ERROR("Cannot %s dataflow monitor entry with the MFEA: %s",
                   entry->operation_name(),
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -2035,8 +2022,10 @@ 
XrlPimNode::mfea_client_send_add_delete_dataflow_monitor_cb(
                   entry->operation_name(),
                   xrl_error.str().c_str());
        retry_xrl_task();
-       break;
+       return;
     }
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 int
@@ -2344,8 +2333,10 @@ XrlPimNode::send_protocol_message()
 {
     bool success = true;
 
-    if (! _is_finder_alive)
+    if (! _is_finder_alive) {
+       XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT 
alive!\n");
        return;         // The Finder is dead
+    }
 
     XLOG_ASSERT(! _xrl_tasks_queue.empty());
     XrlTaskBase* xrl_task_base = _xrl_tasks_queue.front();
@@ -2354,10 +2345,15 @@ XrlPimNode::send_protocol_message()
     entry = dynamic_cast<SendProtocolMessage*>(xrl_task_base);
     XLOG_ASSERT(entry != NULL);
 
+    //XLOG_ERROR("XrlPimNode::send_protocol_message interface/vif %s/%s. ",
+    //       entry->if_name().c_str(),
+    //       entry->vif_name().c_str());
+
     //
     // Check whether we have already registered with the FEA
     //
     if (! _is_fea_registered) {
+       XLOG_ERROR("ERROR: XrlPimNode::send_protocol_message, finder is NOT 
registered!\n");
        retry_xrl_task();
        return;
     }
@@ -2441,8 +2437,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        // If success, then schedule the next task
        //
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case COMMAND_FAILED:
@@ -2460,8 +2454,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Cannot send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case NO_FINDER:
@@ -2475,8 +2467,6 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Cannot send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
 
     case BAD_ARGS:
@@ -2500,10 +2490,12 @@ XrlPimNode::fea_client_send_protocol_message_cb(const 
XrlError& xrl_error)
        //
        XLOG_ERROR("Failed to send a protocol message: %s",
                   xrl_error.str().c_str());
-       pop_xrl_task();
-       send_xrl_task();
        break;
     }
+
+    // In all cases, do the next task.
+    pop_xrl_task();
+    send_xrl_task();
 }
 
 //
@@ -3646,8 +3638,8 @@ XrlPimNode::mld6igmp_client_0_1_delete_membership4(
 
     if (PimNode::delete_membership(vif_index, IPvX(source), IPvX(group))
        != XORP_OK) {
-       error_msg = c_format("Failed to delete membership for (%s, %s)",
-                            cstring(source), cstring(group));
+       error_msg = c_format("Failed to delete membership for (src: %s, group: 
%s  vif_idx: %i)",
+                            cstring(source), cstring(group), vif_index);
        return XrlCmdError::COMMAND_FAILED(error_msg);
     }
     
_______________________________________________
Xorp-hackers mailing list
[email protected]
http://mailman.ICSI.Berkeley.EDU/mailman/listinfo/xorp-hackers

Reply via email to