Hi,

After rebasing, the performance of branch master boosted in my env
from 12Mpps to 13Mpps. However, this specific patch brings down
to 12Mpps. I am using dpif_scalar and generic lookup (no AVX512).

(I don't think this report should block the patch because the
counter are interesting and the analysis below doesn't point
directly to the proposed changes.)

This is a diff using all patches applied versus this patch reverted:
    21.44%     +6.08%  ovs-vswitchd        [.] miniflow_extract
     8.94%     -1.92%  libc-2.28.so        [.] __memcmp_avx2_movbe
    14.62%     +1.44%  ovs-vswitchd        [.] dp_netdev_input__
     2.80%     -1.08%  ovs-vswitchd        [.] 
dp_netdev_pmd_flush_output_on_port
     3.44%     -0.91%  ovs-vswitchd        [.] netdev_send

This is the code side by side, patch applied on the right side:
(sorry, long lines)

Percent│      *datap = data + size;                             ▒│Percent│      
*datap = data + size;                             ▒
  1.37 │        mov    %r14,0x30(%rsp)                          ▒│  1.32 │      
  mov    %r14,0x30(%rsp)                          ▒
       │      *sizep -= size;                                   ▒│       │      
*sizep -= size;                                   ▒
  0.24 │        mov    %rax,0x38(%rsp)                          ▒│  0.94 │      
  mov    %rax,0x38(%rsp)                          ▒
       │      parse_ethertype():                                ▒│       │      
parse_ethertype():                                ▒
       │      proto = *(ovs_be16 *) data_pull(datap, sizep, size▒│       │      
proto = *(ovs_be16 *) data_pull(datap, sizep, size▒
  0.64 │        movzwl (%rdx),%r10d                             ▒│  0.26 │      
  movzwl (%rdx),%r10d                             ▒
       │      __bswap_16():                                     ▒│       │      
__bswap_16():                                     ▒
       │                                                        ▒│       │      
                                                  ▒
       │      static __inline __uint16_t                        ◆│       │      
static __inline __uint16_t                        ◆
       │      __bswap_16 (__uint16_t __bsx)                     ▒│       │      
__bswap_16 (__uint16_t __bsx)                     ▒
       │      {                                                 ▒│       │      
{                                                 ▒
       │      #if __GNUC_PREREQ (4, 8)                          ▒│       │      
#if __GNUC_PREREQ (4, 8)                          ▒
       │      return __builtin_bswap16 (__bsx);                 ▒│       │      
return __builtin_bswap16 (__bsx);                 ▒
  0.07 │        mov    %r10d,%ecx                               ▒│  0.06 │      
  mov    %r10d,%ecx                               ▒
  1.23 │        rol    $0x8,%cx                                 ▒│  0.81 │      
  rol    $0x8,%cx                                 ▒
       │      parse_ethertype():                                ▒│       │      
parse_ethertype():                                ▒
       │      if (OVS_LIKELY(ntohs(proto) >= ETH_TYPE_MIN)) {   ▒│       │      
if (OVS_LIKELY(ntohs(proto) >= ETH_TYPE_MIN)) {   ▒
  0.19 │        cmp    $0x5ff,%cx                               ▒│  0.77 │      
  cmp    $0x5ff,%cx                               ▒
       │      ↓ jbe    a08                                      ▒│       │      
↓ jbe    a08                                      ▒
       │      return llc->snap.snap_type;                       ▒│       │      
return llc->snap.snap_type;                       ▒
  0.40 │ 1f8:   lea    -0x4788(%r10),%r8d                       ▒│  0.13 │ 1f8: 
  lea    -0x4788(%r10),%r8d                       ▒
  0.07 │        and    $0xfeff,%r8w                             ▒│  0.11 │      
  and    $0xfeff,%r8w                             ▒
       │      miniflow_extract():                               ▒│       │      
miniflow_extract():                               ▒
       │      union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS]; ▒│       │      
union flow_vlan_hdr vlans[FLOW_MAX_VLAN_HEADERS]; ▒
       │      size_t num_vlans = parse_vlan(&data, &size, vlans)▒│       │      
size_t num_vlans = parse_vlan(&data, &size, vlans)▒
       │                                                        ▒│       │      
                                                  ▒
       │      dl_type = parse_ethertype(&data, &size);          ▒│       │      
dl_type = parse_ethertype(&data, &size);          ▒
       │      miniflow_push_be16(mf, dl_type, dl_type);         ▒│       │      
miniflow_push_be16(mf, dl_type, dl_type);         ▒
       │      miniflow_pad_to_64(mf, dl_type);                  ▒│       │      
miniflow_pad_to_64(mf, dl_type);                  ▒
  1.37 │ 205:   xor    %r15d,%r15d                              ▒│  0.81 │ 205: 
  xor    %r15d,%r15d                              ▒
       │      miniflow_push_be16(mf, dl_type, dl_type);         ▒│       │      
miniflow_push_be16(mf, dl_type, dl_type);         ▒
  0.19 │        mov    %r10w,0xc(%r13)                          ▒│  1.91 │      
  mov    %r10w,0xc(%r13)                          ▒
       │      miniflow_pad_to_64(mf, dl_type);                  ▒│       │      
miniflow_pad_to_64(mf, dl_type);                  ▒
  0.42 │        add    $0x10,%r13                               ▒│  0.32 │      
  add    $0x10,%r13                               ▒
  0.26 │        mov    %r15w,-0x2(%r13)                         ▒│  5.62 │      
  mov    %r15w,-0x2(%r13)                         ▒
       │      if (num_vlans > 0) {                              ▒│       │      
if (num_vlans > 0) {                              ▒
       │        test   %rbp,%rbp                                ▒│       │      
  test   %rbp,%rbp                                ▒
  1.42 │      ↓ je     2b5                                      ▒│  0.11 │      
↓ je     2b5                                      ▒
       │      miniflow_push_words_32(mf, vlans, vlans, num_vlans▒│       │      
miniflow_push_words_32(mf, vlans, vlans, num_vlans▒
       │        lea    0x1(%rbp),%rcx                           ▒│       │      
  lea    0x1(%rbp),%rcx                           ▒
       │        mov    %r13,%rdi                                ▒│       │      
  mov    %r13,%rdi                                ▒
       │        lea    0x40(%rsp),%rsi                          ▒│       │      
  lea    0x40(%rsp),%rsi                          ▒
       │        mov    %r9,0x28(%rsp)                           ▒│       │      
  mov    %r9,0x28(%rsp)                           ▒
       │        shr    %rcx                                     ▒│       │      
  shr    %rcx                                     ▒
       │      flowmap_set():                                    ▒│       │      
flowmap_set():                                    ▒
       │      map_t n_bits_mask = (MAP_1 << n_bits) - 1;        ▒│       │      
map_t n_bits_mask = (MAP_1 << n_bits) - 1;        ▒
       │        mov    $0x1,%r15d                               ▒│       │      
  mov    $0x1,%r15d                               ▒
       │        mov    %r8d,0x24(%rsp)                          ▒│       │      
  mov    %r8d,0x24(%rsp)                          ▒
       │      miniflow_extract():                               ▒│       │      
miniflow_extract():                               ▒
       │        lea    0x0(,%rbp,4),%rdx                        ▒│       │      
  lea    0x0(,%rbp,4),%rdx                        ▒
       │      flowmap_set():                                    ▒│       │      
flowmap_set():                                    ▒
       │        shl    %cl,%r15                                 ▒│       │      
  shl    %cl,%r15                                 ▒
       │        mov    %r10d,0x20(%rsp)                         ▒│       │      
  mov    %r10d,0x20(%rsp)                         ▒
       │        sub    $0x1,%r15                                ▒│       │      
  sub    $0x1,%r15                                ▒
       │        mov    %r11,0x18(%rsp)                          ▒│       │      
  mov    %r11,0x18(%rsp)                          ▒


I don't see any relevant optimization difference in the code
above, but the "mov %r15w,-0x2(%r13)" on the right side accounts
for almost all the difference, though on the left side it seems
a bit more spread.

I applied the patch below and it helped to get to 12.7Mpps, so
almost at the same levels. I wonder if you see the same result.

diff --git a/lib/flow.c b/lib/flow.c
index 729d59b1b..4572e356b 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -746,6 +746,9 @@ miniflow_extract(struct dp_packet *packet, struct miniflow 
*dst)
     uint8_t *ct_nw_proto_p = NULL;
     ovs_be16 ct_tp_src = 0, ct_tp_dst = 0;
 
+    /* dltype will be updated later. */
+    OVS_PREFETCH_WRITE(miniflow_pointer(mf, dl_type));
+
     /* Metadata. */
     if (flow_tnl_dst_is_set(&md->tunnel)) {
         miniflow_push_words(mf, tunnel, &md->tunnel,


fbl

On Thu, Jul 08, 2021 at 03:02:36PM +0100, Cian Ferriter wrote:
> It is possible for packets traversing the userspace datapath to match a
> flow before hitting on EMC by using a mark ID provided by a NIC. Add a
> PMD statistic for this hit.
> 
> Signed-off-by: Cian Ferriter <[email protected]>
> Acked-by: Flavio Leitner <[email protected]>
> 
> ---
> 
> Cc: Gaetan Rivet <[email protected]>
> Cc: Sriharsha Basavapatna <[email protected]>
> 
> v14:
> - Added Flavio's Acked-by tag.
> 
> v13:
> - Minor refactoring to address review comments.
> - Update manpages to reflect the new format of the pmd-perf-show
>   command.
> ---
>  NEWS                        | 2 ++
>  lib/dpif-netdev-avx512.c    | 3 +++
>  lib/dpif-netdev-perf.c      | 3 +++
>  lib/dpif-netdev-perf.h      | 1 +
>  lib/dpif-netdev-unixctl.man | 1 +
>  lib/dpif-netdev.c           | 9 +++++++--
>  tests/pmd.at                | 6 ++++--
>  7 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index 9e34027bf..d39b0ddf9 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -21,6 +21,8 @@ Post-v2.15.0
>       * Add avx512 implementation of dpif which can process non recirculated
>         packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
>       * Add commands to get and set the dpif implementations.
> +     * Add a partial HWOL PMD statistic counting hits similar to existing
> +       EMC/SMC/DPCLS stats.
>     - ovs-ctl:
>       * New option '--no-record-hostname' to disable hostname configuration
>         in ovsdb on startup.
> diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> index 1ae66ca6c..6f9aa8284 100644
> --- a/lib/dpif-netdev-avx512.c
> +++ b/lib/dpif-netdev-avx512.c
> @@ -127,6 +127,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>  
>      uint32_t emc_hits = 0;
>      uint32_t smc_hits = 0;
> +    uint32_t phwol_hits = 0;
>  
>      /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. 
> */
>      uint32_t hwol_emc_smc_hitmask = 0;
> @@ -178,6 +179,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>                  rules[i] = &f->cr;
>                  pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
>                  pkt_meta[i].bytes = dp_packet_size(packet);
> +                phwol_hits++;
>                  hwol_emc_smc_hitmask |= (1 << i);
>                  continue;
>              }
> @@ -286,6 +288,7 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>  
>      /* At this point we don't return error anymore, so commit stats here. */
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, 
> phwol_hits);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
> diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
> index 9560e7c3c..7103a2d4d 100644
> --- a/lib/dpif-netdev-perf.c
> +++ b/lib/dpif-netdev-perf.c
> @@ -246,6 +246,7 @@ pmd_perf_format_overall_stats(struct ds *str, struct 
> pmd_perf_stats *s,
>          ds_put_format(str,
>              "  Rx packets:        %12"PRIu64"  (%.0f Kpps, %.0f 
> cycles/pkt)\n"
>              "  Datapath passes:   %12"PRIu64"  (%.2f passes/pkt)\n"
> +            "  - PHWOL hits:      %12"PRIu64"  (%5.1f %%)\n"
>              "  - EMC hits:        %12"PRIu64"  (%5.1f %%)\n"
>              "  - SMC hits:        %12"PRIu64"  (%5.1f %%)\n"
>              "  - Megaflow hits:   %12"PRIu64"  (%5.1f %%, %.2f "
> @@ -255,6 +256,8 @@ pmd_perf_format_overall_stats(struct ds *str, struct 
> pmd_perf_stats *s,
>              rx_packets, (rx_packets / duration) / 1000,
>              1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets,
>              passes, rx_packets ? 1.0 * passes / rx_packets : 0,
> +            stats[PMD_STAT_PHWOL_HIT],
> +            100.0 * stats[PMD_STAT_PHWOL_HIT] / passes,
>              stats[PMD_STAT_EXACT_HIT],
>              100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
>              stats[PMD_STAT_SMC_HIT],
> diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
> index 72645b6b3..8b1a52387 100644
> --- a/lib/dpif-netdev-perf.h
> +++ b/lib/dpif-netdev-perf.h
> @@ -56,6 +56,7 @@ extern "C" {
>  /* Set of counter types maintained in pmd_perf_stats. */
>  
>  enum pmd_stat_type {
> +    PMD_STAT_PHWOL_HIT,     /* Packets that had a partial HWOL hit (phwol). 
> */
>      PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
>      PMD_STAT_SMC_HIT,       /* Packets that had a sig match hit (SMC). */
>      PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
> diff --git a/lib/dpif-netdev-unixctl.man b/lib/dpif-netdev-unixctl.man
> index 5f9256215..83ce4f1c5 100644
> --- a/lib/dpif-netdev-unixctl.man
> +++ b/lib/dpif-netdev-unixctl.man
> @@ -135,6 +135,7 @@ pmd thread numa_id 0 core_id 1:
>    - busy iterations:        86009  ( 84.1 % of used cycles)
>    Rx packets:             2399607  (2381 Kpps, 848 cycles/pkt)
>    Datapath passes:        3599415  (1.50 passes/pkt)
> +  - PHWOL hits:                 0  (  0.0 %)
>    - EMC hits:              336472  (  9.3 %)
>    - SMC hits:                   0  ( 0.0 %)
>    - Megaflow hits:        3262943  ( 90.7 %, 1.00 subtbl lookups/hit)
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 8dfbdef15..7d36b71fb 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -646,6 +646,7 @@ pmd_info_show_stats(struct ds *reply,
>                    "  packets received: %"PRIu64"\n"
>                    "  packet recirculations: %"PRIu64"\n"
>                    "  avg. datapath passes per packet: %.02f\n"
> +                  "  phwol hits: %"PRIu64"\n"
>                    "  emc hits: %"PRIu64"\n"
>                    "  smc hits: %"PRIu64"\n"
>                    "  megaflow hits: %"PRIu64"\n"
> @@ -654,7 +655,8 @@ pmd_info_show_stats(struct ds *reply,
>                    "  miss with failed upcall: %"PRIu64"\n"
>                    "  avg. packets per output batch: %.02f\n",
>                    total_packets, stats[PMD_STAT_RECIRC],
> -                  passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
> +                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
> +                  stats[PMD_STAT_EXACT_HIT],
>                    stats[PMD_STAT_SMC_HIT],
>                    stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
>                    stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
> @@ -1683,6 +1685,7 @@ dpif_netdev_get_stats(const struct dpif *dpif, struct 
> dpif_dp_stats *stats)
>      CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>          stats->n_flows += cmap_count(&pmd->flow_table);
>          pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
> +        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
>          stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
>          stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
>          stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
> @@ -6805,7 +6808,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
>                 bool md_is_valid, odp_port_t port_no)
>  {
>      struct netdev_flow_key *key = &keys[0];
> -    size_t n_missed = 0, n_emc_hit = 0;
> +    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
>      struct dfc_cache *cache = &pmd->flow_cache;
>      struct dp_packet *packet;
>      const size_t cnt = dp_packet_batch_size(packets_);
> @@ -6850,6 +6853,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
>              }
>              if (OVS_LIKELY(flow)) {
>                  tcp_flags = parse_tcp_flags(packet);
> +                n_phwol_hit++;
>                  if (OVS_LIKELY(batch_enable)) {
>                      dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
>                                              n_batches);
> @@ -6912,6 +6916,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
>      /* Count of packets which are not flow batched. */
>      *n_flows = map_cnt;
>  
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, 
> n_phwol_hit);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
>  
>      if (!smc_enable_db) {
> diff --git a/tests/pmd.at b/tests/pmd.at
> index 9c5824c55..46d9ede5e 100644
> --- a/tests/pmd.at
> +++ b/tests/pmd.at
> @@ -202,11 +202,12 @@ dummy@ovs-dummy: hit:0 missed:0
>      p0 7/1: (dummy-pmd: configured_rx_queues=4, 
> configured_tx_queues=<cleared>, requested_rx_queues=4, 
> requested_tx_queues=<cleared>)
>  ])
>  
> -AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN 
> | sed '/cycles/d' | grep pmd -A 9], [0], [dnl
> +AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN 
> | sed '/cycles/d' | grep pmd -A 10], [0], [dnl
>  pmd thread numa_id <cleared> core_id <cleared>:
>    packets received: 0
>    packet recirculations: 0
>    avg. datapath passes per packet: 0.00
> +  phwol hits: 0
>    emc hits: 0
>    smc hits: 0
>    megaflow hits: 0
> @@ -233,11 +234,12 @@ AT_CHECK([cat ovs-vswitchd.log | filter_flow_install | 
> strip_xout], [0], [dnl
>  
> recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(frag=no),
>  actions: <del>
>  ])
>  
> -AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN 
> | sed '/cycles/d' | grep pmd -A 9], [0], [dnl
> +AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN 
> | sed '/cycles/d' | grep pmd -A 10], [0], [dnl
>  pmd thread numa_id <cleared> core_id <cleared>:
>    packets received: 20
>    packet recirculations: 0
>    avg. datapath passes per packet: 1.00
> +  phwol hits: 0
>    emc hits: 19
>    smc hits: 0
>    megaflow hits: 0
> -- 
> 2.32.0
> 
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

-- 
fbl
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to