Re: [ovs-dev] [PATCH 1/5] dpif-netdev: Rework rxq scheduling code.

Kevin Traynor Thu, 01 Jul 2021 16:15:26 -0700

On 23/06/2021 14:04, David Marchand wrote:
> On Fri, Jun 4, 2021 at 11:19 PM Kevin Traynor <[email protected]> wrote:
>>
>> This reworks the current rxq scheduling code to break it into more
>> generic and reusable pieces.
>>
>> The behaviour does not change from a user perspective, except the logs
>> are updated to be more consistent.
>>
>> From an implementation view, there are some changes with mind to adding
>> functionality and reuse in later patches.
>>
>> The high level reusable functions added in this patch are:
>> - Generate a list of current numas and pmds
>> - Perform rxq scheduling into the list
>> - Effect the rxq scheduling assignments so they are used
>>
>> The rxq scheduling is updated to handle both pinned and non-pinned rxqs
>> in the same call.
> 
> As a global comment, I prefer consistency for function names.
> I put suggestions inline, you can take these as simple nits.
>


went with the suggestions, thanks for them.

> 
> This patch prepares arrival of new scheduling algorithm, but uses a
> single boolean flag.
> I would avoid this temp boolean and introduce the enum (used later in
> this series).
> 

Added enum (without SCHED_GROUP for now).

> 
>>
>> Signed-off-by: Kevin Traynor <[email protected]>
>> ---
>>  lib/dpif-netdev.c | 538 ++++++++++++++++++++++++++++++++++++++--------
>>  tests/pmd.at      |   2 +-
>>  2 files changed, 446 insertions(+), 94 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 650e67ab3..57d23e112 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -5006,4 +5006,211 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
>>  }
>>
>> +struct sched_numa_list {
>> +    struct hmap numas;  /* Contains 'struct sched_numa'. */
>> +};
>> +
>> +/* Meta data for out-of-place pmd rxq assignments. */
>> +struct sched_pmd {
>> +    /* Associated PMD thread. */
>> +    struct dp_netdev_pmd_thread *pmd;
>> +    uint64_t pmd_proc_cycles;
>> +    struct dp_netdev_rxq **rxqs;
>> +    unsigned n_rxq;
>> +    bool isolated;
>> +};
> 
> sched_pmd objects are associated in a unique fashion to a sched_numa object.
> Having a back pointer to sched_numa in the sched_pmd object removes
> the need for sched_numa_list_find_numa().
> 

done

> 
>> +
>> +struct sched_numa {
>> +    struct hmap_node node;
>> +    int numa_id;
>> +    /* PMDs on numa node. */
>> +    struct sched_pmd *pmds;
>> +    /* Num of PMDs on numa node. */
>> +    unsigned n_pmds;
>> +    /* Num of isolated PMDs on numa node. */
>> +    unsigned n_iso;
>> +    int rr_cur_index;
>> +    bool rr_idx_inc;
>> +};
>> +
>> +static size_t
>> +sched_numa_list_count(struct sched_numa_list *numa_list)
>> +{
>> +    return hmap_count(&numa_list->numas);
>> +}
>> +
>> +static struct sched_numa *
>> +sched_numa_list_next(struct sched_numa_list *numa_list,
>> +                     const struct sched_numa *numa)
>> +{
>> +    struct hmap_node *node = NULL;
>> +
>> +    if (numa) {
>> +        node = hmap_next(&numa_list->numas, &numa->node);
>> +    }
>> +    if (!node) {
>> +        node = hmap_first(&numa_list->numas);
>> +    }
>> +
>> +    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
>> +}
>> +
>> +static struct sched_numa *
>> +sched_numa_list_lookup(struct sched_numa_list * numa_list, int numa_id)
> 
> Nit: no space *numa_list
> 

fixed

> 
>> +{
>> +    struct sched_numa *numa;
>> +
>> +    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
>> +                             &numa_list->numas) {
>> +        if (numa->numa_id == numa_id) {
>> +            return numa;
>> +        }
>> +    }
>> +    return NULL;
>> +}
>> +
>> +/* Populate numas and pmds on those numas */
> 
> Nit: missing a '.'.
> 

fixed

> 
>> +static void
>> +sched_numa_list_populate(struct sched_numa_list *numa_list,
>> +                         struct dp_netdev *dp)
>> +{
>> +    struct dp_netdev_pmd_thread *pmd;
> 
> Nit: missing a blank line after var definition.
> 

fixed

> 
>> +    hmap_init(&numa_list->numas);
>> +
>> +    /* For each pmd on this datapath. */
>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> +        struct sched_numa *numa;
>> +        struct sched_pmd *sched_pmd;
>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
>> +            continue;
>> +        }
>> +
>> +        /* Get the numa of the PMD. */
>> +        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
>> +        /* Create a new numa node for it if not already created */
> 
> Nit: missing a '.'.
> 

fixed

> 
>> +        if (!numa) {
>> +            numa = xzalloc(sizeof *numa);
>> +            numa->numa_id = pmd->numa_id;
>> +            hmap_insert(&numa_list->numas, &numa->node,
>> +                        hash_int(pmd->numa_id, 0));
>> +        }
>> +
>> +        /* Create a sched_pmd on this numa for the pmd. */
>> +        numa->n_pmds++;
>> +        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof 
>> *numa->pmds);
>> +        sched_pmd = &numa->pmds[numa->n_pmds - 1];
>> +        memset(sched_pmd ,0, sizeof *sched_pmd);
> 
> Nit: should be sched_pmd, 0,
> 

fixed

> 
>> +        sched_pmd->pmd = pmd;
>> +        /* At least one pmd is present so initialize curr_idx and idx_inc. 
>> */
>> +        numa->rr_cur_index = 0;
>> +        numa->rr_idx_inc = true;
>> +    }
>> +}
>> +
>> +static void
>> +sched_numa_list_free_entries(struct sched_numa_list *numa_list)
>> +{
>> +    struct sched_numa *numa;
>> +
>> +    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
>> +        for (unsigned i = 0; i < numa->n_pmds; i++) {
>> +            struct sched_pmd *sched_pmd;
>> +
>> +            sched_pmd = &numa->pmds[i];
>> +            sched_pmd->n_rxq = 0;
>> +            free(sched_pmd->rxqs);
>> +        }
>> +        numa->n_pmds = 0;
>> +        free(numa->pmds);
>> +    }
> 
> Are we leaking numa objects?
> 

i shouldn't have hinted about it :-) yes, fixed now.

> 
>> +    hmap_destroy(&numa_list->numas);
>> +}
>> +
>> +static struct sched_pmd *
>> +find_sched_pmd_by_pmd(struct sched_numa_list *numa_list,
>> +                      struct dp_netdev_pmd_thread *pmd)
> 
> Nit: it returns a sched_pmd object, I would go with sched_pmd_find_by_pmd().
> 
> 
>> +{
>> +    struct sched_numa *numa;
>> +
>> +    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
>> +        for (unsigned i = 0; i < numa->n_pmds; i++) {
>> +            struct sched_pmd *sched_pmd;
>> +
>> +            sched_pmd = &numa->pmds[i];
>> +            if (pmd == sched_pmd->pmd) {
>> +                return sched_pmd;
>> +            }
>> +        }
>> +    }
>> +    return NULL;
>> +}
>> +
>> +static struct sched_numa *
>> +sched_numa_list_find_numa(struct sched_numa_list *numa_list,
>> +                           struct sched_pmd *sched_pmd)
> 
> As commented around the sched_pmd struct definition, this function can
> be removed if sched_pmd has a reference to sched_numa.
> 

yep, removed it.

> 
>> +{
>> +    struct sched_numa *numa;
>> +
>> +    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
>> +        for (unsigned i = 0; i < numa->n_pmds; i++) {
>> +            struct sched_pmd *numa_sched_pmd;
>> +
>> +            numa_sched_pmd = &numa->pmds[i];
>> +            if (numa_sched_pmd == sched_pmd) {
>> +                return numa;
>> +            }
>> +        }
>> +    }
>> +    return NULL;
>> +}
>> +
>> +static void
>> +sched_add_rxq_to_sched_pmd(struct sched_pmd *sched_pmd,
>> +                           struct dp_netdev_rxq *rxq, uint64_t cycles)
> 
> Nit: name could be sched_pmd_add_rxq().
> 
> 
>> +{
>> +    /* As sched_pmd is allocated outside this fn. better to not assume
>> +     * rxq is initialized to NULL. */
>> +    if (sched_pmd->n_rxq == 0) {
>> +        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
>> +    } else {
>> +        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
>> +                                                    sizeof 
>> *sched_pmd->rxqs);
>> +    }
>> +
>> +    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
>> +    sched_pmd->pmd_proc_cycles += cycles;
>> +}
>> +
>> +static void
>> +sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
>> +{
>> +    struct sched_numa *numa;
>> +
>> +    /* For each numa */
>> +    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
>> +        /* For each pmd */
>> +        for (int i = 0; i < numa->n_pmds; i++) {
>> +            struct sched_pmd *sched_pmd;
>> +
>> +            sched_pmd = &numa->pmds[i];
>> +            sched_pmd->pmd->isolated = sched_pmd->isolated;
>> +            /* For each rxq. */
>> +            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
>> +                /* Store the new pmd from the out of place sched_numa_list
>> +                 * struct to the dp_netdev_rxq struct */
>> +                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
>> +            }
>> +        }
>> +    }
>> +}
>> +
>> +static unsigned
>> +sched_get_numa_pmd_noniso(struct sched_numa *numa)
> 
> Nit: we are not getting a pmd from this function, 
> sched_numa_noniso_pmd_count()?
> 
> 
>> +{
>> +    if (numa->n_pmds > numa->n_iso) {
>> +        return numa->n_pmds - numa->n_iso;
>> +    }
>> +    return 0;
>> +}
>> +
>>  /* Sort Rx Queues by the processing cycles they are consuming. */
>>  static int
>> @@ -5037,22 +5244,106 @@ compare_rxq_cycles(const void *a, const void *b)
>>  }
>>
>> -/* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
>> - * queues and marks the pmds as isolated.  Otherwise, assign non isolated
>> - * pmds to unpinned queues.
>> +/*
>> + * Returns the next pmd from the numa node.
>>   *
>> - * The function doesn't touch the pmd threads, it just stores the assignment
>> - * in the 'pmd' member of each rxq. */
>> + * If 'updown' is 'true' it will alternate between selecting the next pmd in
>> + * either an up or down walk, switching between up/down when the first or 
>> last
>> + * core is reached. e.g. 1,2,3,3,2,1,1,2...
>> + *
>> + * If 'updown' is 'false' it will select the next pmd wrapping around when
>> + * last core reached. e.g. 1,2,3,1,2,3,1,2...
>> + */
>> +static struct sched_pmd *
>> +get_rr_pmd(struct sched_numa *numa, bool updown)
> 
> Nit: sched_pmd_next_rr().
> 
> 
>> +{
>> +    int numa_idx = numa->rr_cur_index;
>> +
>> +    if (numa->rr_idx_inc == true) {
>> +        /* Incrementing through list of pmds. */
>> +        if (numa->rr_cur_index == numa->n_pmds - 1) {
>> +            /* Reached the last pmd. */
>> +            if (updown) {
>> +                numa->rr_idx_inc = false;
>> +            } else {
>> +                numa->rr_cur_index = 0;
>> +            }
>> +        } else {
>> +            numa->rr_cur_index++;
>> +        }
>> +    } else {
>> +        /* Decrementing through list of pmds. */
>> +        if (numa->rr_cur_index == 0) {
>> +            /* Reached the first pmd. */
>> +            numa->rr_idx_inc = true;
>> +        } else {
>> +            numa->rr_cur_index--;
>> +        }
>> +    }
>> +    return &numa->pmds[numa_idx];
>> +}
>> +
>> +static struct sched_pmd *
>> +get_available_rr_pmd(struct sched_numa *numa, bool updown)
> 
> Nit: sched_pmd_next_noniso_rr().
> 
> 
>> +{
>> +    struct sched_pmd *sched_pmd = NULL;
>> +
>> +    /* get_rr_pmd() may return duplicate PMDs before all PMDs have been
>> +     * returned depending on updown. Extend the number of call to ensure all
>> +     * PMDs can be checked. */
>> +    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
>> +        sched_pmd = get_rr_pmd(numa, updown);
>> +        if (!sched_pmd->isolated) {
>> +            break;
>> +        }
>> +        sched_pmd = NULL;
>> +    }
>> +    return sched_pmd;
>> +}
>> +
>> +static struct sched_pmd *
>> +get_next_pmd(struct sched_numa *numa, bool algo)
> 
> Nit: sched_pmd_next().
> 
> 
>> +{
>> +    return get_available_rr_pmd(numa, algo);
>> +}
>> +
>> +static const char *
>> +get_assignment_type_string(bool algo)
>> +{
>> +    if (algo == false) {
>> +        return "roundrobin";
>> +    }
>> +    return "cycles";
>> +}
>> +
>> +#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + 40)
> 
> I am not sure where this 40 comes from.
> Is this "big enough in most cases"?
> 

Yes, it was just to allow for the message part and not have it so fine
that a small edit would break it. There will be some variance in the
size as it is with INT_STRLEN. I can do this to try and show what the 40
is for:

#define MAX_RXQ_CYC_MSG 40
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_MSG)

> 
>> +
>> +static bool
>> +get_rxq_cyc_log(char *a, bool algo, uint64_t cycles)
>> +{
>> +    int ret = 0;
>> +
>> +    if (algo) {
>> +        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
>> +                 " (measured processing cycles %"PRIu64").",
> 
> The final "." must be removed since it is not consistent with the !algo case.
> 
> 
>> +                 cycles);
>> +    }
>> +    return ret > 0;
>> +}
> 
> We can do a[0] = '\0' when !algo and on error and return 'a' unconditionnally.
> This way, callers don't need to check this function succeeded.
> 

That is nice, it avoids the ternary check in the VLOG.

> 
> 
>> +
>>  static void
>> -rxq_scheduling(struct dp_netdev *dp, bool pinned) 
>> OVS_REQUIRES(dp->port_mutex)
>> +sched_numa_list_schedule(struct sched_numa_list *numa_list,
>> +                         struct dp_netdev *dp,
>> +                         bool algo,
>> +                         enum vlog_level level)
>> +    OVS_REQUIRES(dp->port_mutex)
>>  {
>>      struct dp_netdev_port *port;
>> -    struct rr_numa_list rr;
>> -    struct rr_numa *non_local_numa = NULL;
>>      struct dp_netdev_rxq ** rxqs = NULL;
> 
> **rxqs
> 

fixed

>> -    int n_rxqs = 0;
>> -    struct rr_numa *numa = NULL;
>> -    int numa_id;
>> -    bool assign_cyc = dp->pmd_rxq_assign_cyc;
>> +    struct sched_numa *last_cross_numa;
>> +    unsigned n_rxqs = 0;
>> +    bool start_logged = false;
>> +    size_t n_numa;
>>
>> +    /* For each port. */
>>      HMAP_FOR_EACH (port, node, &dp->ports) {
>>          if (!netdev_is_pmd(port->netdev)) {
>> @@ -5060,48 +5351,68 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
>> OVS_REQUIRES(dp->port_mutex)
>>          }
>>
>> +        /* For each rxq on the port. */
>>          for (int qid = 0; qid < port->n_rxq; qid++) {
>> -            struct dp_netdev_rxq *q = &port->rxqs[qid];
>> +            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
>>
>> -            if (pinned && q->core_id != OVS_CORE_UNSPEC) {
>> -                struct dp_netdev_pmd_thread *pmd;
>> +            rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
>> +            rxqs[n_rxqs++] = rxq;
> 
> We might as well only store unpinned queues in rxqs (maybe rename this
> var) in this first loop... (see comment in second loop)
> 

I wasn't convinced when first reading, but by the time i got to the
later patch I had been converted to this way of thinking :-)

> 
>>
>> -                pmd = dp_netdev_get_pmd(dp, q->core_id);
>> -                if (!pmd) {
>> -                    VLOG_WARN("There is no PMD thread on core %d. Queue "
>> -                              "%d on port \'%s\' will not be polled.",
>> -                              q->core_id, qid, 
>> netdev_get_name(port->netdev));
>> -                } else {
>> -                    q->pmd = pmd;
>> -                    pmd->isolated = true;
>> -                    VLOG_INFO("Core %d on numa node %d assigned port \'%s\' 
>> "
>> -                              "rx queue %d.", pmd->core_id, pmd->numa_id,
>> -                              netdev_rxq_get_name(q->rx),
>> -                              netdev_rxq_get_queue_id(q->rx));
>> -                    dp_netdev_pmd_unref(pmd);
>> -                }
>> -            } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
>> +            if (algo == true) {
>>                  uint64_t cycle_hist = 0;
>>
>> -                if (n_rxqs == 0) {
>> -                    rxqs = xmalloc(sizeof *rxqs);
>> -                } else {
>> -                    rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
>> +                /* Sum the queue intervals and store the cycle history. */
>> +                for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>> +                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
>>                  }
>> +                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
>> +                                         cycle_hist);
>> +            }
>>
>> -                if (assign_cyc) {
>> -                    /* Sum the queue intervals and store the cycle history. 
>> */
>> -                    for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>> -                        cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
>> -                    }
>> -                    dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
>> -                                             cycle_hist);
>> +            /* Check if this rxq is pinned. */
>> +            if (rxq->core_id != OVS_CORE_UNSPEC) {
>> +                struct sched_pmd *sched_pmd = NULL;
> 
> No need to initialize to NULL.
> 

right, removed it

> 
>> +                struct dp_netdev_pmd_thread *pmd;
>> +                struct sched_numa *numa;
>> +                uint64_t proc_cycles;
>> +                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
>> +
>> +
>> +                /* This rxq should be pinned, pin it now. */
>> +                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
>> +                sched_pmd = find_sched_pmd_by_pmd(numa_list, pmd);
>> +                if (!sched_pmd) {
>> +                    /* Cannot find the PMD.  Cannot pin this rxq. */
>> +                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
>> +                            "Core %2u cannot be pinned with "
>> +                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
>> +                            "enable a pmd on core %u.",
>> +                            rxq->core_id,
>> +                            netdev_rxq_get_name(rxq->rx),
>> +                            netdev_rxq_get_queue_id(rxq->rx),
>> +                            rxq->core_id);
>> +                    continue;
>> +                }
>> +                /* Mark PMD as isolated if not done already. */
>> +                if (sched_pmd->isolated == false) {
>> +                    sched_pmd->isolated = true;
>> +                    numa = sched_numa_list_find_numa(numa_list,
>> +                                                     sched_pmd);
>> +                    numa->n_iso++;
>>                  }
>> -                /* Store the queue. */
>> -                rxqs[n_rxqs++] = q;
>> +                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
>> +                                                       
>> RXQ_CYCLES_PROC_HIST);
>> +                VLOG(level, "Core %2u on numa node %d is pinned with "
>> +                            "port \'%s\' rx queue %d.%s",
> 
> rx queue %d.%s -> rx queue %d%s. (once get_rxq_cyc_log() does not put
> a final '.').
> 

done

> 
> 
>> +                            sched_pmd->pmd->core_id, 
>> sched_pmd->pmd->numa_id,
>> +                            netdev_rxq_get_name(rxq->rx),
>> +                            netdev_rxq_get_queue_id(rxq->rx),
>> +                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)
>> +                               ? rxq_cyc_log : "");
>> +                sched_add_rxq_to_sched_pmd(sched_pmd, rxq, proc_cycles);
>>              }
>>          }
>>      }
>>
>> -    if (n_rxqs > 1 && assign_cyc) {
>> +    if (n_rxqs > 1 && algo) {
>>          /* Sort the queues in order of the processing cycles
>>           * they consumed during their last pmd interval. */
>> @@ -5109,54 +5420,100 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
>> OVS_REQUIRES(dp->port_mutex)
>>      }
>>
>> -    rr_numa_list_populate(dp, &rr);
>> -    /* Assign the sorted queues to pmds in round robin. */
>> -    for (int i = 0; i < n_rxqs; i++) {
>> -        numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
>> -        numa = rr_numa_list_lookup(&rr, numa_id);
>> -        if (!numa) {
>> -            /* There are no pmds on the queue's local NUMA node.
>> -               Round robin on the NUMA nodes that do have pmds. */
>> -            non_local_numa = rr_numa_list_next(&rr, non_local_numa);
>> -            if (!non_local_numa) {
>> -                VLOG_ERR("There is no available (non-isolated) pmd "
>> -                         "thread for port \'%s\' queue %d. This queue "
>> -                         "will not be polled. Is pmd-cpu-mask set to "
>> -                         "zero? Or are all PMDs isolated to other "
>> -                         "queues?", netdev_rxq_get_name(rxqs[i]->rx),
>> -                         netdev_rxq_get_queue_id(rxqs[i]->rx));
>> -                continue;
>> +    last_cross_numa = NULL;
>> +    n_numa = sched_numa_list_count(numa_list);
>> +    for (unsigned i = 0; i < n_rxqs; i++) {
>> +        struct dp_netdev_rxq *rxq = rxqs[i];
>> +        struct sched_pmd *sched_pmd;
>> +        struct sched_numa *numa;
>> +        int numa_id;
>> +        uint64_t proc_cycles;
>> +        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
>> +
>> +        if (rxq->core_id != OVS_CORE_UNSPEC) {
>> +            continue;
>> +        }
> 
> (cont of comment on rxqs array)... this way there is no need for a "is
> rxq pinned?" check here.
> 

removed

> 
>> +
>> +        if (start_logged == false && level != VLL_DBG) {
>> +            VLOG(level, "Performing pmd to rx queue assignment using %s "
>> +                        "algorithm.", get_assignment_type_string(algo));
>> +            start_logged = true;
>> +        }
>> +
>> +        /* Store the cycles for this rxq as we will log these later. */
>> +        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
>> +        /* Select the numa that should be used for this rxq. */
>> +        numa_id = netdev_get_numa_id(rxq->port->netdev);
>> +        numa = sched_numa_list_lookup(numa_list, numa_id);
>> +
>> +        /* Ensure that there is at least one non-isolated pmd on that numa. 
>> */
>> +        if (numa && !sched_get_numa_pmd_noniso(numa)) {
>> +            numa = NULL;
>> +        }
> 
> I think this block above can be covered with the check below.
> 

yes, it was leftover from an earlier version. Harmless but redundent.
Removed it.

> 
>> +
>> +        if (!numa || !sched_get_numa_pmd_noniso(numa)) {
>> +            /* Find any numa with available pmds. */
>> +            for (int k = 0; k < n_numa; k++) {
> 
> Super nit: previous index is 'i', I would expect 'j'.
> 

ok, but if I ever get i and j mixed up, I will blame you.

> 
>> +                numa = sched_numa_list_next(numa_list, last_cross_numa);
>> +                if (sched_get_numa_pmd_noniso(numa)) {
>> +                    break;
>> +                }
>> +                last_cross_numa = numa;
>> +                numa = NULL;
>>              }
>> -            rxqs[i]->pmd = rr_numa_get_pmd(non_local_numa, assign_cyc);
>> -            VLOG_WARN("There's no available (non-isolated) pmd thread "
>> -                      "on numa node %d. Queue %d on port \'%s\' will "
>> -                      "be assigned to the pmd on core %d "
>> -                      "(numa node %d). Expect reduced performance.",
>> -                      numa_id, netdev_rxq_get_queue_id(rxqs[i]->rx),
>> -                      netdev_rxq_get_name(rxqs[i]->rx),
>> -                      rxqs[i]->pmd->core_id, rxqs[i]->pmd->numa_id);
>> -        } else {
>> -            rxqs[i]->pmd = rr_numa_get_pmd(numa, assign_cyc);
>> -            if (assign_cyc) {
>> -                VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
>> -                          "rx queue %d "
>> -                          "(measured processing cycles %"PRIu64").",
>> -                          rxqs[i]->pmd->core_id, numa_id,
>> -                          netdev_rxq_get_name(rxqs[i]->rx),
>> -                          netdev_rxq_get_queue_id(rxqs[i]->rx),
>> -                          dp_netdev_rxq_get_cycles(rxqs[i],
>> -                                                   RXQ_CYCLES_PROC_HIST));
>> -            } else {
>> -                VLOG_INFO("Core %d on numa node %d assigned port \'%s\' "
>> -                          "rx queue %d.", rxqs[i]->pmd->core_id, numa_id,
>> -                          netdev_rxq_get_name(rxqs[i]->rx),
>> -                          netdev_rxq_get_queue_id(rxqs[i]->rx));
>> +        }
>> +        if (numa && numa->numa_id != numa_id) {
>> +            VLOG(level, "There's no available (non-isolated) pmd thread "
>> +                        "on numa node %d. Port \'%s\' rx queue %d will "
>> +                        "be assigned to a pmd on numa node %d. "
>> +                        "This may lead to reduced performance.",
>> +                        numa_id, netdev_rxq_get_name(rxq->rx),
>> +                        netdev_rxq_get_queue_id(rxq->rx), numa->numa_id);
>> +        }
>> +

fyi, i changed the logic around checking (numa) etc in v2 to simplify a bit

>> +        sched_pmd = NULL;
>> +        if (numa) {
>> +            /* Select the PMD that should be used for this rxq. */
>> +            sched_pmd = get_next_pmd(numa, algo);
>> +            if (sched_pmd) {
>> +                VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
>> +                            "rx queue %d.%s",
> 
> %d%s.
> 

done

> 
>> +                            sched_pmd->pmd->core_id, 
>> sched_pmd->pmd->numa_id,
>> +                            netdev_rxq_get_name(rxq->rx),
>> +                            netdev_rxq_get_queue_id(rxq->rx),
>> +                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)
>> +                                ? rxq_cyc_log : "");
>> +                sched_add_rxq_to_sched_pmd(sched_pmd, rxq, proc_cycles);
>>              }
>>          }
>> +        if (!sched_pmd) {
>> +            VLOG(level == VLL_DBG ? level : VLL_WARN,
>> +                    "No non-isolated pmd on any numa available for "
>> +                    "port \'%s\' rx queue %d.%s "
> 
> %d%s.
> 

done

> 
> 
>> +                    "This rx queue will not be polled.",
>> +                    netdev_rxq_get_name(rxq->rx),
>> +                    netdev_rxq_get_queue_id(rxq->rx),
>> +                    get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles)
>> +                        ? rxq_cyc_log :  "");
>> +        }
>>      }
>> -
>> -    rr_numa_list_destroy(&rr);
>>      free(rxqs);
>>  }
>>
>> +static void
>> +rxq_scheduling(struct dp_netdev *dp) OVS_REQUIRES(dp->port_mutex)
>> +{
>> +    struct sched_numa_list *numa_list;
>> +    bool algo = dp->pmd_rxq_assign_cyc;
>> +
>> +    numa_list = xzalloc(sizeof *numa_list);
>> +
>> +    sched_numa_list_populate(numa_list, dp);
>> +    sched_numa_list_schedule(numa_list, dp, algo, VLL_INFO);
>> +    sched_numa_list_put_in_place(numa_list);
>> +
>> +    sched_numa_list_free_entries(numa_list);
>> +    free(numa_list);
>> +}
>> +
>>  static void
>>  reload_affected_pmds(struct dp_netdev *dp)
>> @@ -5406,10 +5763,5 @@ reconfigure_datapath(struct dp_netdev *dp)
>>          }
>>      }
>> -
>> -    /* Add pinned queues and mark pmd threads isolated. */
>> -    rxq_scheduling(dp, true);
>> -
>> -    /* Add non-pinned queues. */
>> -    rxq_scheduling(dp, false);
>> +    rxq_scheduling(dp);
>>
>>      /* Step 5: Remove queues not compliant with new scheduling. */
>> diff --git a/tests/pmd.at b/tests/pmd.at
>> index cc5371d5a..78105bf45 100644
>> --- a/tests/pmd.at
>> +++ b/tests/pmd.at
>> @@ -580,5 +580,5 @@ p1 3 0 2
>>  ])
>>
>> -OVS_VSWITCHD_STOP(["/dpif_netdev|WARN|There is no PMD thread on core/d"])
>> +OVS_VSWITCHD_STOP(["/cannot be pinned with port/d"])
>>  AT_CLEANUP
>>
>> --
>> 2.31.1
>>
> 
> 
> 

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH 1/5] dpif-netdev: Rework rxq scheduling code.

Reply via email to