On 12.01.2018 19:42, Jan Scheurich wrote: > Hi Ilya, > > I understand that for the purpose of the present patch this is good enough to > count rx and tx packets together, but the PMD performance metrics patch will > depend on counting them separately. It would have been nice if you had simply > included the changes I proposed. Now I will have to revert some of your > changes to make the tx_packets available to pmd_perf_end_iteration(). But so > be it.
Hello, Jan. I tried to find anything about tx_packets in your v5 of the supervision patch-set and the rebased RFC series on the mailing list but failed. At first glance, these series just ignore the tx packets and the cycles spent for sending them. Am I missing something? Or did you not send these changes to the mailing list? In general, I think it's better if you make these changes yourself, because you know your code better. Best regards, Ilya Maximets. > > Tested-by: Jan Scheurich <[email protected]> > Acked-by: Jan Scheurich <[email protected]> > > Regards, Jan > >> -----Original Message----- >> From: Ilya Maximets [mailto:[email protected]] >> Sent: Friday, 12 January, 2018 12:17 >> To: [email protected] >> Cc: Heetae Ahn <[email protected]>; Bhanuprakash Bodireddy >> <[email protected]>; Antonio Fischetti >> <[email protected]>; Eelco Chaudron <[email protected]>; Ciara >> Loftus <[email protected]>; Kevin Traynor >> <[email protected]>; Jan Scheurich <[email protected]>; Billy >> O'Mahony <[email protected]>; Ian Stokes >> <[email protected]>; Ilya Maximets <[email protected]> >> Subject: [PATCH v10 3/5] dpif-netdev: Time based output batching. >> >> This allows to collect packets from more than one RX burst >> and send them together with a configurable intervals. >> >> 'other_config:tx-flush-interval' can be used to configure >> time that a packet can wait in output batch for sending. >> >> 'tx-flush-interval' has microsecond resolution. 
>> >> Signed-off-by: Ilya Maximets <[email protected]> >> --- >> lib/dpif-netdev.c | 108 >> ++++++++++++++++++++++++++++++++++++++++----------- >> vswitchd/vswitch.xml | 16 ++++++++ >> 2 files changed, 102 insertions(+), 22 deletions(-) >> >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >> index 6909a03..f0ba2b2 100644 >> --- a/lib/dpif-netdev.c >> +++ b/lib/dpif-netdev.c >> @@ -87,6 +87,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev); >> #define MAX_RECIRC_DEPTH 6 >> DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) >> >> +/* Use instant packet send by default. */ >> +#define DEFAULT_TX_FLUSH_INTERVAL 0 >> + >> /* Configuration parameters. */ >> enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */ >> enum { MAX_METERS = 65536 }; /* Maximum number of meters. */ >> @@ -273,6 +276,9 @@ struct dp_netdev { >> struct hmap ports; >> struct seq *port_seq; /* Incremented whenever a port changes. */ >> >> + /* The time that a packet can wait in output batch for sending. */ >> + atomic_uint32_t tx_flush_interval; >> + >> /* Meters. */ >> struct ovs_mutex meter_locks[N_METER_LOCKS]; >> struct dp_meter *meters[MAX_METERS]; /* Meter bands. */ >> @@ -500,6 +506,7 @@ struct tx_port { >> int qid; >> long long last_used; >> struct hmap_node node; >> + long long flush_time; >> struct dp_packet_batch output_pkts; >> struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST]; >> }; >> @@ -581,6 +588,9 @@ struct dp_netdev_pmd_thread { >> * than 'cmap_count(dp->poll_threads)'. */ >> uint32_t static_tx_qid; >> >> + /* Number of filled output batches. */ >> + int n_output_batches; >> + >> struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and >> 'tx_ports'. */ >> /* List of rx queues to poll. 
*/ >> struct hmap poll_list OVS_GUARDED; >> @@ -670,8 +680,9 @@ static void dp_netdev_add_rxq_to_pmd(struct >> dp_netdev_pmd_thread *pmd, >> static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd, >> struct rxq_poll *poll) >> OVS_REQUIRES(pmd->port_mutex); >> -static void >> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd); >> +static int >> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, >> + bool force); >> >> static void reconfigure_datapath(struct dp_netdev *dp) >> OVS_REQUIRES(dp->port_mutex); >> @@ -1245,6 +1256,7 @@ create_dp_netdev(const char *name, const struct >> dpif_class *class, >> conntrack_init(&dp->conntrack); >> >> atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); >> + atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL); >> >> cmap_init(&dp->poll_threads); >> >> @@ -2911,7 +2923,7 @@ dpif_netdev_execute(struct dpif *dpif, struct >> dpif_execute *execute) >> dp_packet_batch_init_packet(&pp, execute->packet); >> dp_netdev_execute_actions(pmd, &pp, false, execute->flow, >> execute->actions, execute->actions_len); >> - dp_netdev_pmd_flush_output_packets(pmd); >> + dp_netdev_pmd_flush_output_packets(pmd, true); >> >> if (pmd->core_id == NON_PMD_CORE_ID) { >> ovs_mutex_unlock(&dp->non_pmd_mutex); >> @@ -2960,6 +2972,16 @@ dpif_netdev_set_config(struct dpif *dpif, const >> struct smap *other_config) >> smap_get_ullong(other_config, "emc-insert-inv-prob", >> DEFAULT_EM_FLOW_INSERT_INV_PROB); >> uint32_t insert_min, cur_min; >> + uint32_t tx_flush_interval, cur_tx_flush_interval; >> + >> + tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", >> + DEFAULT_TX_FLUSH_INTERVAL); >> + atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval); >> + if (tx_flush_interval != cur_tx_flush_interval) { >> + atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval); >> + VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us", >> + 
tx_flush_interval); >> + } >> >> if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) { >> free(dp->pmd_cmask); >> @@ -3154,7 +3176,7 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq >> *rx, unsigned idx) >> return processing_cycles; >> } >> >> -static void >> +static int >> dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd, >> struct tx_port *p) >> { >> @@ -3164,6 +3186,7 @@ dp_netdev_pmd_flush_output_on_port(struct >> dp_netdev_pmd_thread *pmd, >> bool dynamic_txqs; >> struct cycle_timer timer; >> uint64_t cycles; >> + uint32_t tx_flush_interval; >> >> cycle_timer_start(&pmd->perf_stats, &timer); >> >> @@ -3180,6 +3203,13 @@ dp_netdev_pmd_flush_output_on_port(struct >> dp_netdev_pmd_thread *pmd, >> netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs); >> dp_packet_batch_init(&p->output_pkts); >> >> + /* Update time of the next flush. */ >> + atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval); >> + p->flush_time = pmd->ctx.now + tx_flush_interval; >> + >> + ovs_assert(pmd->n_output_batches > 0); >> + pmd->n_output_batches--; >> + >> pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, >> output_cnt); >> pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1); >> >> @@ -3191,18 +3221,28 @@ dp_netdev_pmd_flush_output_on_port(struct >> dp_netdev_pmd_thread *pmd, >> RXQ_CYCLES_PROC_CURR, cycles); >> } >> } >> + >> + return output_cnt; >> } >> >> -static void >> -dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd) >> +static int >> +dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd, >> + bool force) >> { >> struct tx_port *p; >> + int output_cnt = 0; >> + >> + if (!pmd->n_output_batches) { >> + return 0; >> + } >> >> HMAP_FOR_EACH (p, node, &pmd->send_port_cache) { >> - if (!dp_packet_batch_is_empty(&p->output_pkts)) { >> - dp_netdev_pmd_flush_output_on_port(pmd, p); >> + if (!dp_packet_batch_is_empty(&p->output_pkts) >> + && (force || pmd->ctx.now >= 
p->flush_time)) { >> + output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p); >> } >> } >> + return output_cnt; >> } >> >> static int >> @@ -3213,7 +3253,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread >> *pmd, >> struct dp_packet_batch batch; >> struct cycle_timer timer; >> int error; >> - int batch_cnt = 0; >> + int batch_cnt = 0, output_cnt = 0; >> uint64_t cycles; >> >> /* Measure duration for polling and processing rx burst. */ >> @@ -3235,7 +3275,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread >> *pmd, >> cycles = cycle_timer_stop(&pmd->perf_stats, &timer); >> dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles); >> >> - dp_netdev_pmd_flush_output_packets(pmd); >> + output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false); >> } else { >> /* Discard cycles. */ >> cycle_timer_stop(&pmd->perf_stats, &timer); >> @@ -3249,7 +3289,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread >> *pmd, >> >> pmd->ctx.last_rxq = NULL; >> >> - return batch_cnt; >> + return batch_cnt + output_cnt; >> } >> >> static struct tx_port * >> @@ -3872,6 +3912,7 @@ dpif_netdev_run(struct dpif *dpif) >> struct dp_netdev *dp = get_dp_netdev(dpif); >> struct dp_netdev_pmd_thread *non_pmd; >> uint64_t new_tnl_seq; >> + bool need_to_flush = true; >> >> ovs_mutex_lock(&dp->port_mutex); >> non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); >> @@ -3882,13 +3923,22 @@ dpif_netdev_run(struct dpif *dpif) >> int i; >> >> for (i = 0; i < port->n_rxq; i++) { >> - dp_netdev_process_rxq_port(non_pmd, >> - &port->rxqs[i], >> - port->port_no); >> + if (dp_netdev_process_rxq_port(non_pmd, >> + &port->rxqs[i], >> + port->port_no)) { >> + need_to_flush = false; >> + } >> } >> } >> } >> - pmd_thread_ctx_time_update(non_pmd); >> + if (need_to_flush) { >> + /* We didn't receive anything in the process loop. >> + * Check if we need to send something. >> + * There was no time updates on current iteration. 
*/ >> + pmd_thread_ctx_time_update(non_pmd); >> + dp_netdev_pmd_flush_output_packets(non_pmd, false); >> + } >> + >> dpif_netdev_xps_revalidate_pmd(non_pmd, false); >> ovs_mutex_unlock(&dp->non_pmd_mutex); >> >> @@ -3939,6 +3989,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd) >> { >> struct tx_port *tx_port_cached; >> >> + /* Flush all the queued packets. */ >> + dp_netdev_pmd_flush_output_packets(pmd, true); >> /* Free all used tx queue ids. */ >> dpif_netdev_xps_revalidate_pmd(pmd, true); >> >> @@ -4069,6 +4121,7 @@ reload: >> cycles_counter_update(s); >> for (;;) { >> uint64_t iter_packets = 0; >> + >> pmd_perf_start_iteration(s); >> for (i = 0; i < poll_cnt; i++) { >> process_packets = >> @@ -4077,15 +4130,20 @@ reload: >> iter_packets += process_packets; >> } >> >> + if (!iter_packets) { >> + /* We didn't receive anything in the process loop. >> + * Check if we need to send something. >> + * There was no time updates on current iteration. */ >> + pmd_thread_ctx_time_update(pmd); >> + iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false); >> + } >> + >> if (lc++ > 1024) { >> bool reload; >> >> lc = 0; >> >> coverage_try_clear(); >> - /* It's possible that the time was not updated on current >> - * iteration, if there were no received packets. 
*/ >> - pmd_thread_ctx_time_update(pmd); >> dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt); >> if (!ovsrcu_try_quiesce()) { >> emc_cache_slow_sweep(&pmd->flow_cache); >> @@ -4524,6 +4582,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >> *pmd, struct dp_netdev *dp, >> pmd->core_id = core_id; >> pmd->numa_id = numa_id; >> pmd->need_reload = false; >> + pmd->n_output_batches = 0; >> >> ovs_refcount_init(&pmd->ref_cnt); >> latch_init(&pmd->exit_latch); >> @@ -4713,6 +4772,7 @@ dp_netdev_add_port_tx_to_pmd(struct >> dp_netdev_pmd_thread *pmd, >> >> tx->port = port; >> tx->qid = -1; >> + tx->flush_time = 0LL; >> dp_packet_batch_init(&tx->output_pkts); >> >> hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no)); >> @@ -5407,12 +5467,16 @@ dp_execute_cb(void *aux_, struct dp_packet_batch >> *packets_, >> dp_netdev_pmd_flush_output_on_port(pmd, p); >> } >> #endif >> - if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts) >> - + dp_packet_batch_size(packets_) > >> NETDEV_MAX_BURST)) { >> - /* Some packets was generated while input batch processing. >> - * Flush here to avoid overflow. */ >> + if (dp_packet_batch_size(&p->output_pkts) >> + + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) { >> + /* Flush here to avoid overflow. 
*/ >> dp_netdev_pmd_flush_output_on_port(pmd, p); >> } >> + >> + if (dp_packet_batch_is_empty(&p->output_pkts)) { >> + pmd->n_output_batches++; >> + } >> + >> DP_PACKET_BATCH_FOR_EACH (packet, packets_) { >> p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] = >> >> pmd->ctx.last_rxq; >> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml >> index 58c0ebd..61fb7b1 100644 >> --- a/vswitchd/vswitch.xml >> +++ b/vswitchd/vswitch.xml >> @@ -359,6 +359,22 @@ >> </p> >> </column> >> >> + <column name="other_config" key="tx-flush-interval" >> + type='{"type": "integer", >> + "minInteger": 0, "maxInteger": 1000000}'> >> + <p> >> + Specifies the time in microseconds that a packet can wait in >> output >> + batch for sending i.e. amount of time that packet can spend in an >> + intermediate output queue before sending to netdev. >> + This option can be used to configure balance between throughput >> + and latency. Lower values decreases latency while higher values >> + may be useful to achieve higher performance. >> + </p> >> + <p> >> + Defaults to 0 i.e. instant packet sending (latency optimized). >> + </p> >> + </column> >> + >> <column name="other_config" key="n-handler-threads" >> type='{"type": "integer", "minInteger": 1}'> >> <p> >> -- >> 2.7.4 > > > > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
