>>
>> +/*
>> + * OVS Shared Memory structure
>> + *
>> + * The information in the shared memory block will be read by collectd.
>> + * */
>> +struct dpdk_keepalive_shm {
>> +    /* IPC semaphore. Posted when a core dies */
>> +    sem_t core_died;
>> +
>> +    /*
>> +     * Relayed status of each core.
>> +     * UNUSED[0], ALIVE[1], DEAD[2], GONE[3], MISSING[4], DOZING[5],
>SLEEP[6]
>> +     **/
>> +    enum rte_keepalive_state core_state[RTE_KEEPALIVE_MAXCORES];
>
>What is 'DOZING'?  What is 'MISSING'?  Where is a definition of these states
>and what they mean?  What is DEAD&GONE?

State 'DOZING' means the core is going idle,
           'MISSING' indicates the first heartbeat miss,
           'DEAD' indicates two heartbeat misses,
           'GONE' means it missed three or more heartbeats and is completely
'buried'

Note that these states are defined in DPDK Keepalive library. [rte_keepalive.h]

>
>> +    /* Last seen timestamp of the core */
>> +    uint64_t core_last_seen_times[RTE_KEEPALIVE_MAXCORES];
>> +
>> +    /* Store pmd thread tid */
>> +    pid_t thread_id[RTE_KEEPALIVE_MAXCORES];
>> +};
>> +
>> +static struct dpdk_keepalive_shm *ka_shm;
>>  static int netdev_dpdk_class_init(void);  static int
>> netdev_dpdk_vhost_class_init(void);
>>
>> @@ -586,6 +613,202 @@ netdev_dpdk_mempool_configure(struct
>netdev_dpdk *dev)
>>      return 0;
>>  }
>>
>> +void
>> +dpdk_ka_get_tid(unsigned core_idx)
>> +{
>> +    uint32_t tid = rte_sys_gettid();
>> +
>> +    if (dpdk_is_ka_enabled() && ka_shm) {
>> +            ka_shm->thread_id[core_idx] = tid;
>> +    }
>> +}
>> +
>> +/* Callback function invoked on heartbeat miss.  Verify if it is
>> +genuine
>> + * heartbeat miss or a false positive and log the message accordingly.
>> + */
>> +static void
>> +dpdk_failcore_cb(void *ptr_data, const int core_id) {
>> +    struct dpdk_keepalive_shm *ka_shm = (struct dpdk_keepalive_shm
>> +*)ptr_data;
>> +
>> +    if (ka_shm) {
>> +        int pstate;
>> +        uint32_t tid = ka_shm->thread_id[core_id];
>> +        int err = get_process_status(tid, &pstate);
>> +
>> +        if (!err) {
>> +            switch (pstate) {
>> +
>> +            case ACTIVE_STATE:
>> +                VLOG_INFO_RL(&rl,"False positive, pmd tid[%"PRIu32"] 
>> alive\n",
>> +                                  tid);
>
>Can we get false positives?  Doesn't that diminish the usefulness?

You made a good point! Thanks for asking this.
On false positives, this can happen in a few scenarios and depends on the load
and port distribution among PMD threads. This was also observed when the timer
intervals aren't appropriately tuned in OvS and the collectd service.

For example, if a single PMD thread is handling multiple PHY ports and vhostuser
ports, it can have a long list of ports to be polled and at times can miss
successive heartbeats. This may happen because the PMD thread is spending more
time processing the packets and hasn't marked itself alive for a while. If the
timer intervals are aggressive, the chances of it missing the heartbeats are
higher. This led me to add helper functions to check if the PMD thread is
active and subsequently log it appropriately.

This doesn't diminish the usefulness of the feature, as it all boils down to
setting sensible timeout values in the ovsdb and collectd services. I would
guess millisecond granularity isn't needed by every user, and the case I
explained above is more relevant for customers looking at <=5ms granularity.

>
>> +                break;
>> +            case STOPPED_STATE:
>> +            case TRACED_STATE:
>> +            case DEFUNC_STATE:
>> +            case UNINTERRUPTIBLE_SLEEP_STATE:
>> +                VLOG_WARN_RL(&rl,
>> +                    "PMD tid[%"PRIu32"] on core[%d] is unresponsive\n",
>> +                    tid, core_id);
>> +                break;
>> +            default:
>> +                VLOG_DBG("%s: The process state: %d\n", __FUNCTION__,
>pstate);
>> +                OVS_NOT_REACHED();
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +/* Notify the external monitoring application for change in core state.
>> + *
>> + * On a consecutive heartbeat miss the core is considered dead and
>> +the status
>> + * is relayed to monitoring framework by unlocking the semaphore.
>> + */
>> +static void
>> +dpdk_ka_relay_core_state(void *ptr_data, const int core_id,
>> +       const enum rte_keepalive_state core_state, uint64_t
>> +last_alive) {
>> +    struct dpdk_keepalive_shm *ka_shm = (struct dpdk_keepalive_shm
>*)ptr_data;
>> +    int count;
>> +
>> +    if (!ka_shm) {
>> +        VLOG_ERR("KeepAlive: Invalid shared memory block\n");
>> +        return;
>> +    }
>> +
>> +    VLOG_DBG_RL(&rl,
>> +               "TS(%lu):CORE%d, old state:%d, current_state:%d\n",
>> +               (unsigned 
>> long)time(NULL),core_id,ka_shm->core_state[core_id],
>> +               core_state);
>> +    
>> +    switch (core_state) {
>> +    case RTE_KA_STATE_ALIVE:
>> +    case RTE_KA_STATE_MISSING:
>> +        ka_shm->core_state[core_id] = RTE_KA_STATE_ALIVE;
>> +        ka_shm->core_last_seen_times[core_id] = last_alive;
>> +        break;
>> +    case RTE_KA_STATE_DOZING:
>> +    case RTE_KA_STATE_SLEEP:
>> +    case RTE_KA_STATE_DEAD:
>> +    case RTE_KA_STATE_GONE:
>> +        ka_shm->core_state[core_id] = core_state;
>> +        ka_shm->core_last_seen_times[core_id] = last_alive;
>> +        break;
>> +    case RTE_KA_STATE_UNUSED:
>> +        ka_shm->core_state[core_id] = RTE_KA_STATE_UNUSED;
>> +        break;
>> +    }
>> +
>> +    if (OVS_UNLIKELY(core_state == RTE_KA_STATE_DEAD)) {
>> +        /* To handle inactive collectd, increment the semaphore
>> +         * if count is '0'. */
>
>This whole mechanism seems very error prone.  Is it possible to hang a thread
>with the subsequent sem_post?

The relay function is called by the 'ovs_keepalive' thread. I didn't completely
understand your concerns here. I would be happy to verify if you have any
scenarios in mind that you think pose problems.

Please note that this feature is meant to be used by the ceilometer service. I
would like to test the scenarios keeping OpenStack in mind rather than
simulating lab cases.

- Bhanuprakash.
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to