Re: [ovs-dev] [PATCH v2 0/2] nsh: rework NSH netlink keys and actions

2017-08-24 Thread Yang, Yi
On Thu, Aug 24, 2017 at 04:24:07PM -0700, Ben Pfaff wrote:
> > 
> > Ok, I'll send patch 1 without the datapath action name change and the
> > netlink change, but it isn't a good result if people see encap_nsh and
> > decap_nsh in 2.8 and then see them changed to push_nsh and pop_nsh in
> > 2.9 and later.
> 
> It doesn't look to me like these names are changing for OpenFlow
> actions.  Datapath actions are an implementation detail and keeping them
> 100% stable is not a priority.
> 
> > > Making 2.8 compliant with the NSH draft makes sense.
> > 
> > I have finished dec_nsh_ttl action patch, does it make sense for 2.8?
> 
> You can add that as a patch 3 in the next version and we'll take a look.
> 
> We want to release 2.8 next week.

I have sent them out as v4 and included patch 3, which just adds the new
action dec_nsh_ttl; please review v4.

https://mail.openvswitch.org/pipermail/ovs-dev/2017-August/337877.html

> 
> Thanks,
> 
> Ben.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v4 3/3] nsh: add dec_nsh_ttl action

2017-08-24 Thread Yi Yang
The IETF NSH spec defines a TTL field in the NSH header. It is a 6-bit
field ranging from 0 to 63 and should be decremented by 1 at every hop.
If the TTL is 0, or reaches 0 after the decrement, the packet should be
dropped and a packet-in message sent to the main controller.

Signed-off-by: Yi Yang 
---
 include/openvswitch/ofp-actions.h |  1 +
 lib/ofp-actions.c | 49 +++
 ofproto/ofproto-dpif-xlate.c  | 31 +
 tests/nsh.at  | 23 +-
 utilities/ovs-ofctl.8.in  | 13 ++-
 5 files changed, 105 insertions(+), 12 deletions(-)

diff --git a/include/openvswitch/ofp-actions.h b/include/openvswitch/ofp-actions.h
index ad8e1be..1296a9c 100644
--- a/include/openvswitch/ofp-actions.h
+++ b/include/openvswitch/ofp-actions.h
@@ -93,6 +93,7 @@ struct vl_mff_map;
 OFPACT(DEC_MPLS_TTL,ofpact_null,ofpact, "dec_mpls_ttl") \
 OFPACT(PUSH_MPLS,   ofpact_push_mpls,   ofpact, "push_mpls")\
 OFPACT(POP_MPLS,ofpact_pop_mpls,ofpact, "pop_mpls") \
+OFPACT(DEC_NSH_TTL, ofpact_null,ofpact, "dec_nsh_ttl")  \
 \
 /* Generic encap & decap */ \
 OFPACT(ENCAP,   ofpact_encap,   props, "encap") \
diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c
index 71eb70c..1a92b95 100644
--- a/lib/ofp-actions.c
+++ b/lib/ofp-actions.c
@@ -348,6 +348,9 @@ enum ofp_raw_action_type {
 /* NX1.3+(47): struct nx_action_decap, ... */
 NXAST_RAW_DECAP,
 
+/* NX1.3+(48): void. */
+NXAST_RAW_DEC_NSH_TTL,
+
 /* ## -- ## */
 /* ## Debugging actions. ## */
 /* ## -- ## */
@@ -480,6 +483,7 @@ ofpact_next_flattened(const struct ofpact *ofpact)
 case OFPACT_NAT:
 case OFPACT_ENCAP:
 case OFPACT_DECAP:
+case OFPACT_DEC_NSH_TTL:
 return ofpact_next(ofpact);
 
 case OFPACT_CLONE:
@@ -4330,6 +4334,39 @@ format_DECAP(const struct ofpact_decap *a,
 ds_put_format(s, "%s)%s", colors.paren, colors.end);
 }
 
+/* Action dec_nsh_ttl */
+
+static enum ofperr
+decode_NXAST_RAW_DEC_NSH_TTL(struct ofpbuf *out)
+{
+ofpact_put_DEC_NSH_TTL(out);
+return 0;
+}
+
+static void
+encode_DEC_NSH_TTL(const struct ofpact_null *null OVS_UNUSED,
+enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out)
+{
+put_NXAST_DEC_NSH_TTL(out);
+}
+
+static char * OVS_WARN_UNUSED_RESULT
+parse_DEC_NSH_TTL(char *arg OVS_UNUSED,
+   const struct ofputil_port_map *port_map OVS_UNUSED,
+   struct ofpbuf *ofpacts,
+   enum ofputil_protocol *usable_protocols OVS_UNUSED)
+{
+ofpact_put_DEC_NSH_TTL(ofpacts);
+return NULL;
+}
+
+static void
+format_DEC_NSH_TTL(const struct ofpact_null *a OVS_UNUSED,
+const struct ofputil_port_map *port_map OVS_UNUSED, struct ds *s)
+{
+ds_put_format(s, "%sdec_nsh_ttl%s", colors.special, colors.end);
+}
+
 
 /* Action structures for NXAST_RESUBMIT, NXAST_RESUBMIT_TABLE, and
  * NXAST_RESUBMIT_TABLE_CT.
@@ -7114,6 +7151,7 @@ ofpact_is_set_or_move_action(const struct ofpact *a)
 case OFPACT_SET_VLAN_VID:
 case OFPACT_ENCAP:
 case OFPACT_DECAP:
+case OFPACT_DEC_NSH_TTL:
 return true;
 case OFPACT_BUNDLE:
 case OFPACT_CLEAR_ACTIONS:
@@ -7191,6 +7229,7 @@ ofpact_is_allowed_in_actions_set(const struct ofpact *a)
 case OFPACT_STRIP_VLAN:
 case OFPACT_ENCAP:
 case OFPACT_DECAP:
+case OFPACT_DEC_NSH_TTL:
 return true;
 
 /* In general these actions are excluded because they are not part of
@@ -7304,6 +7343,7 @@ ofpacts_execute_action_set(struct ofpbuf *action_list,
 ofpacts_copy_last(action_list, action_set, OFPACT_PUSH_VLAN);
 ofpacts_copy_last(action_list, action_set, OFPACT_DEC_TTL);
 ofpacts_copy_last(action_list, action_set, OFPACT_DEC_MPLS_TTL);
+ofpacts_copy_last(action_list, action_set, OFPACT_DEC_NSH_TTL);
 ofpacts_copy_all(action_list, action_set, ofpact_is_set_or_move_action);
 ofpacts_copy_last(action_list, action_set, OFPACT_SET_QUEUE);
 
@@ -7445,6 +7485,7 @@ ovs_instruction_type_from_ofpact_type(enum ofpact_type type)
 case OFPACT_NAT:
 case OFPACT_ENCAP:
 case OFPACT_DECAP:
+case OFPACT_DEC_NSH_TTL:
 default:
 return OVSINST_OFPIT11_APPLY_ACTIONS;
 }
@@ -8131,6 +8172,13 @@ ofpact_check__(enum ofputil_protocol *usable_protocols, struct ofpact *a,
 }
 return 0;
 
+case OFPACT_DEC_NSH_TTL:
+if ((flow->packet_type != htonl(PT_NSH)) &&
+(flow->dl_type != htons(ETH_TYPE_NSH))) {
+inconsistent_match(usable_protocols);
+}
+return 0;
+
 default:
 OVS_NOT_REACHED();
 }
@@ -8626,6 +8674,7 @@ ofpact_outputs_to_port(const struct ofpact *ofpact, ofp_port_t port)
 case OFPACT_NAT:
 case OFPACT_ENCAP:
 

[ovs-dev] [PATCH v4 2/3] nsh: add new flow key 'ttl'

2017-08-24 Thread Yi Yang
The IETF NSH draft will be approved by the end of August. The NSH header
format has been finalized and won't be changed anymore, so we need to
follow this final spec to implement NSH.

The kernel datapath also needs finalized uAPIs; they can't be changed
once they are merged.

This patch adds the new NSH key 'ttl'; the bit layouts of the flags and
mdtype fields are also changed to follow the final spec.

A new action dec_nsh_ttl will be added in a following patch.

Signed-off-by: Yi Yang 
---
 datapath/linux/compat/include/linux/openvswitch.h |  17 +-
 include/openvswitch/flow.h|   6 +-
 include/openvswitch/meta-flow.h   |  31 +-
 include/openvswitch/nsh.h | 326 --
 include/openvswitch/packets.h |  18 +-
 lib/flow.c|  64 ++---
 lib/flow.h|   2 +-
 lib/match.c   |  12 +-
 lib/meta-flow.c   |  56 +++-
 lib/meta-flow.xml |   6 +-
 lib/nx-match.c|  16 +-
 lib/odp-execute.c |  29 +-
 lib/odp-util.c| 152 +-
 lib/packets.c |   5 +-
 ofproto/ofproto-dpif-xlate.c  |  11 +-
 tests/nsh.at  |  46 +--
 16 files changed, 555 insertions(+), 242 deletions(-)

diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
index bc6c94b..32804db 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -492,15 +492,6 @@ struct ovs_key_ct_labels {
};
 };
 
-struct ovs_key_nsh {
-__u8 flags;
-__u8 mdtype;
-__u8 np;
-__u8 pad;
-__be32 path_hdr;
-__be32 c[4];
-};
-
 /* OVS_KEY_ATTR_CT_STATE flags */
 #define OVS_CS_F_NEW   0x01 /* Beginning of a new connection. */
 #define OVS_CS_F_ESTABLISHED   0x02 /* Part of an existing connection. */
@@ -793,10 +784,10 @@ struct ovs_action_push_eth {
struct ovs_key_ethernet addresses;
 };
 
-#define OVS_ENCAP_NSH_MAX_MD_LEN 16
 /*
  * struct ovs_action_encap_nsh - %OVS_ACTION_ATTR_ENCAP_NSH
  * @flags: NSH header flags.
+ * @ttl: NSH header TTL.
  * @mdtype: NSH metadata type.
  * @mdlen: Length of NSH metadata in bytes.
  * @np: NSH next_protocol: Inner packet type.
@@ -805,11 +796,13 @@ struct ovs_action_push_eth {
  */
 struct ovs_action_encap_nsh {
 uint8_t flags;
+uint8_t ttl;
 uint8_t mdtype;
-uint8_t mdlen;
 uint8_t np;
 __be32 path_hdr;
-uint8_t metadata[OVS_ENCAP_NSH_MAX_MD_LEN];
+uint8_t mdlen;
+uint8_t pad[7]; /* Aligned to 64 bit boundary for metadata */
+uint8_t metadata[];
 };
 
 /**
diff --git a/include/openvswitch/flow.h b/include/openvswitch/flow.h
index a658a58..cd61fff 100644
--- a/include/openvswitch/flow.h
+++ b/include/openvswitch/flow.h
@@ -146,7 +146,7 @@ struct flow {
 struct eth_addr arp_tha;/* ARP/ND target hardware address. */
 ovs_be16 tcp_flags; /* TCP flags. With L3 to avoid matching L4. */
 ovs_be16 pad2;  /* Pad to 64 bits. */
-struct flow_nsh nsh;/* Network Service Header keys */
+struct ovs_key_nsh nsh; /* Network Service Header keys */
 
 /* L4 (64-bit aligned) */
 ovs_be16 tp_src;/* TCP/UDP/SCTP source port/ICMP type. */
@@ -159,13 +159,13 @@ struct flow {
 };
 BUILD_ASSERT_DECL(sizeof(struct flow) % sizeof(uint64_t) == 0);
 BUILD_ASSERT_DECL(sizeof(struct flow_tnl) % sizeof(uint64_t) == 0);
-BUILD_ASSERT_DECL(sizeof(struct flow_nsh) % sizeof(uint64_t) == 0);
+BUILD_ASSERT_DECL(sizeof(struct ovs_key_nsh) % sizeof(uint64_t) == 0);
 
 #define FLOW_U64S (sizeof(struct flow) / sizeof(uint64_t))
 
 /* Remember to update FLOW_WC_SEQ when changing 'struct flow'. */
 BUILD_ASSERT_DECL(offsetof(struct flow, igmp_group_ip4) + sizeof(uint32_t)
-  == sizeof(struct flow_tnl) + sizeof(struct flow_nsh) + 300
+  == sizeof(struct flow_tnl) + sizeof(struct ovs_key_nsh) + 300
   && FLOW_WC_SEQ == 40);
 
 /* Incremental points at which flow classification may be performed in
diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h
index 436501f..14e6b59 100644
--- a/include/openvswitch/meta-flow.h
+++ b/include/openvswitch/meta-flow.h
@@ -1757,6 +1757,21 @@ enum OVS_PACKED_ENUM mf_field_id {
  */
 MFF_NSH_FLAGS,
 
+/* "nsh_ttl".
+ *
+ * TTL field in NSH base header.
+ *
+ * Type: u8.
+ * Maskable: no.
+ * Formatting: decimal.
+ * Prerequisites: NSH.
+ * Access: read/write.
+ * NXM: none.
+ * OXM: NXOXM_NSH_TTL(2) since OF1.3 and v2.8.
+ */
+MFF_NSH_TTL,
+
+
 /* "nsh_mdtype".
  *
  * 

[ovs-dev] [PATCH v4 1/3] nsh: fix an implicit bug in nsh_hdr_len

2017-08-24 Thread Yi Yang
Operator '*' binds more tightly than operator '>>', but the expression
needs the shift to be applied before the multiplication; this patch
fixes the issue.

Signed-off-by: Yi Yang 
---
 include/openvswitch/nsh.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/openvswitch/nsh.h b/include/openvswitch/nsh.h
index f4ccadc..a3611d0 100644
--- a/include/openvswitch/nsh.h
+++ b/include/openvswitch/nsh.h
@@ -113,7 +113,7 @@ struct nsh_hdr {
 static inline uint16_t
 nsh_hdr_len(const struct nsh_hdr *nsh)
 {
-return 4 * (ntohs(nsh->ver_flags_len) & NSH_LEN_MASK) >> NSH_LEN_SHIFT;
+return ((ntohs(nsh->ver_flags_len) & NSH_LEN_MASK) >> NSH_LEN_SHIFT) << 2;
 }
 
 static inline struct nsh_md1_ctx *
-- 
2.1.0



[ovs-dev] [PATCH v4 0/3] nsh: add new nsh key ttl and action dec_nsh_ttl

2017-08-24 Thread Yi Yang
v3->v4
  - Add new action dec_nsh_ttl
  - Remove encap_nsh and decap_nsh changes
  - Remove netlink rework to adapt to OVS 2.8
  - Dynamically allocate struct ovs_action_encap_nsh and put
appropriate size for ENCAP_NSH netlink message.

v2->v3
  - Address several review comments from Jan Scheurich

v1->v2
  - Rework per kernel datapath review comments
  - Add new NSH key ttl
  - Add many helpers in nsh.h and replace much code
    with these helpers
  - nsh.h includes the latest NSH spec
  - The bit layouts of flags and mdtype have changed

This patch adds the new NSH key 'ttl' and a new action
dec_nsh_ttl to follow the latest IETF NSH draft:

https://datatracker.ietf.org/doc/draft-ietf-sfc-nsh/

I have double-confirmed with one of its authors that this is the final
version, which will be approved as an IETF RFC; the NSH header format
won't be changed anymore.

Yi Yang (3):
  nsh: fix an implicit bug in nsh_hdr_len
  nsh: add new flow key 'ttl'
  nsh: add dec_nsh_ttl action

 datapath/linux/compat/include/linux/openvswitch.h |  17 +-
 include/openvswitch/flow.h|   6 +-
 include/openvswitch/meta-flow.h   |  31 +-
 include/openvswitch/nsh.h | 326 --
 include/openvswitch/ofp-actions.h |   1 +
 include/openvswitch/packets.h |  18 +-
 lib/flow.c|  64 ++---
 lib/flow.h|   2 +-
 lib/match.c   |  12 +-
 lib/meta-flow.c   |  56 +++-
 lib/meta-flow.xml |   6 +-
 lib/nx-match.c|  16 +-
 lib/odp-execute.c |  29 +-
 lib/odp-util.c| 152 +-
 lib/ofp-actions.c |  49 
 lib/packets.c |   5 +-
 ofproto/ofproto-dpif-xlate.c  |  42 ++-
 tests/nsh.at  |  61 ++--
 utilities/ovs-ofctl.8.in  |  13 +-
 19 files changed, 656 insertions(+), 250 deletions(-)

-- 
2.1.0



[ovs-dev] A marketing-oriented website with built-in customer service, for only 160 yuan!

2017-08-24 Thread cdlyim
I have run a business for more than 20 years. Now I am in my fifties, yet I
know less and less about where to go next!

Official websites, 1688, WeChat public accounts, Weibo... they all dazzled me.
A post-90s colleague said: just do them all! So we did!

Slowly I realized none of it seemed to have much value!
Barely any orders on 1688! No followers on the public account! No attention
on Weibo! The official website is still the most reliable; at least it
brought in customers.

Click here to download the app and create a marketing website for free.

I don't know how the rest of you do it.
In any case, I think the official website is important, and a
marketing-oriented one even more so. Best of all if it has customer service
too! Haha, I am too greedy!

Click here to download the app and create a marketing website for free.

Don't want to receive this kind of email again? Click to unsubscribe.


Re: [ovs-dev] [PATCH] Fix: vhost user port status

2017-08-24 Thread 王志克
Hi Darrell,

Thanks for your comment. I just created a new patch accordingly.

Yes, I am testing a vhost user client port.
After ovs-vswitchd restarted, I did nothing; I just used the command
"ovs-ofctl show br" to get the port status, and found the status was
"LINK DOWN". With the patch, the status is 0 as expected.

Br,
Wang Zhike

-Original Message-
From: Darrell Ball [mailto:db...@vmware.com] 
Sent: Friday, August 25, 2017 9:35 AM
To: 王志克; d...@openvswitch.org
Subject: Re: [ovs-dev] [PATCH] Fix: vhost user port status

Hi Lawrence 

I am not very particular about the title prefix, but could we use something more
technically specific than ‘Fix’; maybe netdev-dpdk  ?
Title should also end with a period.

On 8/23/17, 9:13 PM, "ovs-dev-boun...@openvswitch.org on behalf of wangzhike"  wrote:

After ovs-vswitchd reboots, the vhost user status is displayed as
LINK DOWN even though the traffic is OK.

The problem is that the port may be updated while vhost_reconfigured
is false. Then vhost_reconfigured is updated to true. As a result,
the vhost user status stays LINK DOWN.

[Darrell]
Just so I understand, are you actually using vhost-user or really 
vhost-user-client ports ?
JTBS, can you describe what you did after vswitchd restart ?


Signed-off-by: wangzhike 
---
 lib/netdev-dpdk.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7..e90fd0e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -3227,7 +3227,11 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
 }
 
 if (netdev_dpdk_get_vid(dev) >= 0) {
-dev->vhost_reconfigured = true;
+if (dev->vhost_reconfigured == false) {
+dev->vhost_reconfigured = true;
+/* change vhost_reconfigured may affect carrier status */

[Darrell]
Maybe we can say ?
/* Carrier status may need updating. */
Would this be ok ?
BTW, comments should end with a period.

+netdev_change_seq_changed(&dev->up);
+}
 }
 
 return 0;
-- 
1.8.3.1



[ovs-dev] [PATCH] netdev-dpdk: update vhost user client port status.

2017-08-24 Thread wangzhike
After ovs-vswitchd reboots, the vhost user client port status is
displayed as LINK DOWN even though the traffic is OK.

The problem is that the port may be updated while vhost_reconfigured
is false. Then vhost_reconfigured is updated to true. As a result,
the vhost user status stays LINK DOWN.

Signed-off-by: wangzhike 
---
 lib/netdev-dpdk.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7..80415ef 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -3227,7 +3227,11 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev)
 }
 
 if (netdev_dpdk_get_vid(dev) >= 0) {
-dev->vhost_reconfigured = true;
+if (dev->vhost_reconfigured == false) {
+dev->vhost_reconfigured = true;
+/* Carrier status may need updating. */
+netdev_change_seq_changed(&dev->up);
+}
 }
 
 return 0;
-- 
1.8.3.1



[ovs-dev] [PATCH] netdev-dpdk: vhost get stats fix

2017-08-24 Thread wangzhike
1. "+=" should be "="
2. tx_errors is a generic param, and should be 0 since vhost does not
   create such error.
   Or some app, like libvirt will complain for failure to find this key.

Signed-off-by: wangzhike 
---
 lib/netdev-dpdk.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e90fd0e..1c50aa3 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2016,14 +2016,15 @@ netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
 
  rte_spinlock_lock(&dev->stats_lock);
 /* Supported Stats */
-stats->rx_packets += dev->stats.rx_packets;
-stats->tx_packets += dev->stats.tx_packets;
+stats->rx_packets = dev->stats.rx_packets;
+stats->tx_packets = dev->stats.tx_packets;
 stats->rx_dropped = dev->stats.rx_dropped;
-stats->tx_dropped += dev->stats.tx_dropped;
+stats->tx_dropped = dev->stats.tx_dropped;
 stats->multicast = dev->stats.multicast;
 stats->rx_bytes = dev->stats.rx_bytes;
 stats->tx_bytes = dev->stats.tx_bytes;
 stats->rx_errors = dev->stats.rx_errors;
+stats->tx_errors = 0;
 stats->rx_length_errors = dev->stats.rx_length_errors;
 
 stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets;
-- 
1.8.3.1



Re: [ovs-dev] [ovs-discuss] OVS+DPDK QoS rate limit issue

2017-08-24 Thread Darrell Ball
Hi Ian

Did you have any suggestions for this fix or is it ok as is ?

Thanks Darrell



On 8/24/17, 7:16 PM, "ovs-dev-boun...@openvswitch.org on behalf of 王志克"  wrote:

Hi Lance,

Your patch works. Thanks.

BR,
Wang Zhike

-Original Message-
From: Lance Richardson [mailto:lrich...@redhat.com]
Sent: Thursday, August 24, 2017 8:10 PM
To: 王志克
Cc: ovs-dev@openvswitch.org; ovs-disc...@openvswitch.org
Subject: Re: [ovs-discuss] OVS+DPDK QoS rate limit issue

> From: "王志克" 
> To: ovs-dev@openvswitch.org, ovs-disc...@openvswitch.org
> Sent: Wednesday, August 23, 2017 11:41:05 PM
> Subject: [ovs-discuss] OVS+DPDK QoS rate limit issue
>
> Hi All,
>
> I am using OVS 2.7.0 and DPDK 16.11, and testing the rate limit function.
>
> I found that if the policing_rate is set very large, say 5Gbps, the rate is
> limited dramatically to a very low value, like 800Mbps.
>
> The command is as below:
>
> ovs-vsctl set interface port-7zel2so9sg ingress_policing_rate=500
> ingress_policing_burst=50
>
> If we set the rate lower than 4Gbps, the rate is limited correctly.
>
> Test setup:
>
> Sender (DPDK pktGen) sends out about 10Gbps of UDP packets, with IP size
> about 1420 bytes.
>
> The rate limit is set on the VM vhost-user-client port.
>
> Any idea about this issue? Is it a known issue?

It seems 32-bit arithmetic is being used when converting the rate from
kilobits per second to bytes per second. Could you give this patch a try?

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7e2..d6ed2c7b0 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2229,8 +2229,8 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
     rte_spinlock_init(&policer->policer_lock);
 
     /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
-    rate_bytes = rate * 1000/8;
-    burst_bytes = burst * 1000/8;
+    rate_bytes = rate * 1000ULL/8;
+    burst_bytes = burst * 1000ULL/8;
 
     policer->app_srtcm_params.cir = rate_bytes;
     policer->app_srtcm_params.cbs = burst_bytes;

Regards,

   Lance Richardson

> Br,
>
> Wang Zhike
>
> ___
> discuss mailing list
> disc...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-discuss




Re: [ovs-dev] [ovs-discuss] OVS+DPDK QoS rate limit issue

2017-08-24 Thread 王志克
Hi Lance,

Your patch works. Thanks.

BR,
Wang Zhike

-Original Message-
From: Lance Richardson [mailto:lrich...@redhat.com] 
Sent: Thursday, August 24, 2017 8:10 PM
To: 王志克
Cc: ovs-dev@openvswitch.org; ovs-disc...@openvswitch.org
Subject: Re: [ovs-discuss] OVS+DPDK QoS rate limit issue


> From: "王志克" 
> To: ovs-dev@openvswitch.org, ovs-disc...@openvswitch.org
> Sent: Wednesday, August 23, 2017 11:41:05 PM
> Subject: [ovs-discuss] OVS+DPDK QoS rate limit issue
> 
> 
> 
> Hi All,
> 
> 
> 
> I am using OVS 2.7.0 and DPDK 16.11, and testing the rate limit function.
> 
> 
> 
> I found that if the policing_rate is set very large, say 5Gbps, the rate is
> limited dramatically to a very low value, like 800Mbps.
> 
> The command is as below:
> 
> ovs-vsctl set interface port-7zel2so9sg ingress_policing_rate=500
> ingress_policing_burst=50
> 
> 
> 
> If we set the rate lower than 4Gbps, the rate is limited correctly.
> 
> 
> 
> Test setup:
> 
> Sender (DPDK pktGen) sends out about 10Gbps of UDP packets, with IP size
> about 1420 bytes.
> 
> The rate limit is set on VM vhost-user-client port.
> 
> 
> 
> Any idea about this issue? Is it a known issue?
> 
> 

It seems 32-bit arithmetic is being used when converting the rate from
kilobits per second to bytes per second. Could you give this patch a try?

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7e2..d6ed2c7b0 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2229,8 +2229,8 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
     rte_spinlock_init(&policer->policer_lock);
 
     /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
-    rate_bytes = rate * 1000/8;
-    burst_bytes = burst * 1000/8;
+    rate_bytes = rate * 1000ULL/8;
+    burst_bytes = burst * 1000ULL/8;
 
     policer->app_srtcm_params.cir = rate_bytes;
     policer->app_srtcm_params.cbs = burst_bytes;

Regards,

   Lance Richardson

> 
> Br,
> 
> Wang Zhike
> 
> ___
> discuss mailing list
> disc...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-discuss
> 


[ovs-dev] Reply: Re: Reply: Re: [PATCH v2] ovn: Support for taas(tap-as-a-service) function

2017-08-24 Thread wang . qianyu
Hi Zhenyu,
Thanks for your opinion.
The mirror flag does not always exist, so I do not think adding a new
geneve option is a good idea.

Thanks. 





Gao Zhenyu 
2017/08/25 09:34
 
To: wang.qia...@zte.com.cn
Cc: Russell Bryant , ovs dev , zhou.huij...@zte.com.cn, xurong00037997
Subject: Re: [ovs-dev] Reply: Re: [PATCH v2] ovn: Support for
taas(tap-as-a-service) function


Although adding a new geneve option is more complicated, I think it is
still worth having.
Once the destination chassis finds that geneve option, it can tag the
mirror flag on the packet, and it makes the whole process look the same
whether or not the source and destination are on the same chassis.

Thanks
Zhenyu Gao

2017-08-25 9:15 GMT+08:00 :
Hi Russell,

Thanks for your review.

When the mirror destination is on another chassis, the mirror flag which
is used to mark the packet needs to be transmitted to the destination
chassis.

We could use the inport, a geneve option, or a new type of out port to
indicate that the packet is a mirrored packet.

If we use inport to indicate the flag, we may need to use inport as the
match field in the egress pipeline, which I think conflicts with the
egress pipeline.

If we use a geneve option to deliver the mirror flag, it may be more
complicated. So I added a new type of port as the destination of the
mirror flow. The port types mirror and taas correspond to the
configurations of tap-flow and tap-service.

Thanks.





Russell Bryant 
2017/08/25 04:44

To: wang.qia...@zte.com.cn
Cc: ovs dev , zhou.huij...@zte.com.cn, xurong00037997
Subject: Re: [ovs-dev] [PATCH v2] ovn: Support for
taas(tap-as-a-service) function


Sorry for the delay in getting back to this ...

On Tue, Aug 15, 2017 at 4:28 AM,   wrote:
> Taas was designed to provide tenants and service providers a means of
> monitoring the traffic flowing in their Neutron provisioned virtual
> networks. It is useful for network trouble-shooting, security and
> analytics. The taas presentations could be found from
>
https://github.com/openstack/tap-as-a-service/blob/master/doc/source/presentations.rst


> , and the api reference could be found from
>
https://github.com/openstack/tap-as-a-service/blob/master/API_REFERENCE.rst


>
> To support the taas function, this patch adds two types of
> logical_switch_port, "mirror" and "taas". A port with type "mirror" is
> used as the inport for monitored flows in the logical_switch, and a port
> with type "taas" is used as the outport for monitored flows in the
> logical_switch.
>
> The ovn-controller will relate the ports in tap_service and tap_flow to
> the mirror port and taas port.
>
> Signed-off-by: wang qianyu 

> diff --git a/ovn/ovn-nb.xml b/ovn/ovn-nb.xml
> index 31303a8..5fdd045 100644
> --- a/ovn/ovn-nb.xml
> +++ b/ovn/ovn-nb.xml
> @@ -301,6 +301,20 @@
>
>  A port to a logical switch on a VTEP gateway.
>
> +
> +  mirror
> +  
> +A port indicate the inport of mirrored flows. The user need
> to
> +create this port in the logical_switch. This port should
one
> to
> +one correspondence with the tap_flows
> +  
> +
> +  taas
> +  
> +A port indicate the outport of mirrored flows. The user
need
> to
> +create this port in logical_switch. This port should one to
> +one correspondence with the the tap_service.
> +  
>  
>
>  
> @@ -445,6 +459,61 @@
>interface, in bits.
>  
>
> +
> +  
> +
> +  These options apply when  is
> +  mirror.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> that indicates where the
> +  cloned flows come from.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> with type taas.
> +
> +
> +
> + 
> +This option indicates which direction (from-port/to-port/all) of
> +packet will be cloned to the taas-port. The directions are defined
> +as follows:
> +  
> +  
> +from-port
> +
> +  The packets from this port will be cloned to specified
> mirror
> +  port.
> +
> +to-port
> +
> +  The packets to this port will be cloned to specified
mirror
> +  port.
> +
> +both
> +
> +  The packets both from and to this port will be cloned to
> +  specified mirror port.
> +
> +  
> +
> +  
> +
> +  
> +
> +  These options apply when  is
> taas.
> +
> +
> +

[ovs-dev] Reply: Re: [PATCH v2] ovn: Support for taas(tap-as-a-service) function

2017-08-24 Thread wang . qianyu
Hi Russell,

Thanks for your review.

When the mirror destination is on another chassis, the mirror flag which
is used to mark the packet needs to be transmitted to the destination
chassis.

We could use the inport, a geneve option, or a new type of out port to
indicate that the packet is a mirrored packet.

If we use inport to indicate the flag, we may need to use inport as the
match field in the egress pipeline, which I think conflicts with the
egress pipeline.

If we use a geneve option to deliver the mirror flag, it may be more
complicated. So I added a new type of port as the destination of the
mirror flow. The port types mirror and taas correspond to the
configurations of tap-flow and tap-service.

Thanks.





Russell Bryant 
2017/08/25 04:44
 
To: wang.qia...@zte.com.cn
Cc: ovs dev , zhou.huij...@zte.com.cn, xurong00037997
Subject: Re: [ovs-dev] [PATCH v2] ovn: Support for
taas(tap-as-a-service) function


Sorry for the delay in getting back to this ...

On Tue, Aug 15, 2017 at 4:28 AM,   wrote:
> Taas was designed to provide tenants and service providers a means of
> monitoring the traffic flowing in their Neutron provisioned virtual
> networks. It is useful for network trouble-shooting, security and
> analytics. The taas presentations could be found from
> 
https://github.com/openstack/tap-as-a-service/blob/master/doc/source/presentations.rst

> , and the api reference could be found from
> 
https://github.com/openstack/tap-as-a-service/blob/master/API_REFERENCE.rst

>
> To support the taas function, this patch adds two types of
> logical_switch_port, "mirror" and "taas". A port with type "mirror" is
> used as the inport for monitored flows in the logical_switch, and a port
> with type "taas" is used as the outport for monitored flows in the
> logical_switch.
>
> The ovn-controller will relate the ports in tap_service and tap_flow to
> the mirror port and taas port.
>
> Signed-off-by: wang qianyu 

> diff --git a/ovn/ovn-nb.xml b/ovn/ovn-nb.xml
> index 31303a8..5fdd045 100644
> --- a/ovn/ovn-nb.xml
> +++ b/ovn/ovn-nb.xml
> @@ -301,6 +301,20 @@
>
>  A port to a logical switch on a VTEP gateway.
>
> +
> +  mirror
> +  
> +A port indicate the inport of mirrored flows. The user need
> to
> +create this port in the logical_switch. This port should 
one
> to
> +one correspondence with the tap_flows
> +  
> +
> +  taas
> +  
> +A port indicate the outport of mirrored flows. The user 
need
> to
> +create this port in logical_switch. This port should one to
> +one correspondence with the the tap_service.
> +  
>  
>
>  
> @@ -445,6 +459,61 @@
>interface, in bits.
>  
>
> +
> +  
> +
> +  These options apply when  is
> +  mirror.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> that indicates where the
> +  cloned flows come from.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> with type taas.
> +
> +
> +
> + 
> +This option indicates which direction (from-port/to-port/all) of
> +packet will be cloned to the taas-port. The directions are defined
> +as follows:
> +  
> +  
> +from-port
> +
> +  The packets from this port will be cloned to specified
> mirror
> +  port.
> +
> +to-port
> +
> +  The packets to this port will be cloned to specified 
mirror
> +  port.
> +
> +both
> +
> +  The packets both from and to this port will be cloned to
> +  specified mirror port.
> +
> +  
> +
> +  
> +
> +  
> +
> +  These options apply when  is
> taas.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> that indicates where the
> +  cloned flows come to.
> +
> +  
>  
>
>  

I'm having a hard time understanding this schema.  Could you expand on
why both a "mirror" and a "taas" port type are needed?

I was hoping for only a single new port type, "mirror" for example,
with options to specify what port it is receiving a mirror of traffic
for.

Does something like that not express everything needed here?

-- 
Russell Bryant




Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.

2017-08-24 Thread Shashank Ram




> From: ovs-dev-boun...@openvswitch.org  on behalf of Ben Pfaff 
> Sent: Wednesday, August 2, 2017 11:35 AM
> To: Alin Serdean
> Cc: d...@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.
>
> On Fri, Jul 14, 2017 at 04:40:51AM +, Alin Serdean wrote:
> > The following patches are fixes found with the WDK 8.1 and 10
> > static code analysis.
>
> I applied most of this series to master.  The patches that I didn't
> apply are the ones where Shashank had some requests.  (Alin, you should
> be able to find the remaining patches by simply rebasing the series
> against current master--the patches that were applied will disappear on
> rebase.)
>
> Thank you very much!
> ___
>
> Hi Ben, all the 40 patches in this series have been applied to
> master. Some of the patches have issues such as memory leaks as I
> pointed out in my review. Were these applied by mistake?

If that's what happened, then yes it was a mistake.  But I don't think
it's true.  I applied the following 33 commits on August 2, omitting the
other 7:

1e30f5fa96d8 datapath-windows: Fix shared variables which use Interlocked 
functions
17cf43bce5b3 datapath-windows: Add annotation for OvsIpFragmentEntryCleaner
553966a7942c datapath-windows: Add annotation for OvsCtRelatedEntryCleaner
7ce89fa2d4a6 datapath-windows: Treat TCP_HDR_LEN static analysis warnings
3630a2f32122 datapath-windows: Check return status when using APIs
1c5875f7d9d7 datapath-windows: fix excessive stack usage in iphelper
f0aeea81bd27 datapath-windows: Add dummy parameter for NotifyRouteChange2
20804d501994 datapath-windows: Suppress warning in jhash
445995a8e270 datapath-windows: Fix possible NULL deference in OvsFullCopyNBL
0d4374aa82de datapath-windows: Add assert in OvsPartialCopyNBL
b27b4e5f05a6 datapath-windows: Use annotations instead for macros
c5a12e53f86f datapath-windows: Fix misspelling in comment.
f0187396b0a7 datapath-windows: Add annotations for OvsAcquirePidHashLock
cfc854f0bad5 datapath-windows: Add annotations for OvsReleasePidHashLock
7dcfa473bfb6 datapath-windows: Add annotations for OvsReleaseEventQueueLock
f05dc2c26e8c datapath-windows: Add function annotations for 
OvsAcquireEventQueueLock
7fba23033f21 datapath-windows: Add function annotations for OvsCancelIrpDatapath
8ba92ebee99d datapath-windows: Add function annotations for 
OvsTunnelFilterCancelIrp
5f0d140d127d datapath-windows: Add function annotations for OvsCancelIrp
5635920bb4ea datapath-windows: Add function annotations for OvsReleaseDatapath
41d18c494a7b datapath-windows: Add function annotations for 
OvsAcquireDatapathWrite
3e6d3fe13bdd datapath-windows: Add function annotations for 
OvsAcquireDatapathRead
9330795b9790 datapath-windows: Remove function declarations from Tunnel.c
3cbbcb1ba611 datapath-windows: Add annotations for OvsReleaseCtrlLock
9f6573fe07db datapath-windows: Add annotations for OvsAcquireCtrlLock
9264367cd549 datapath-windows: Add an assert in recirculation
5a3cb5148f1e datapath-windows: Fix possible NULL dereference in BufferMgmt
1919e76d6b8e datapath-windows: Suppress PAGED_CODE warnings
b0586ec79b48 datapath-windows: Add asserts to Stt
d82c1ff1dbd7 datapath-windows: Fix code alignment in Stt
4f967565aed2 datapath-windows: interfaceName overflow in IpHelper
043b1405b50a datapath-windows: Remove annotations in Switch.c
fe09b60d4923 datapath-windows: Use non-executable memory when allocating memory

Can you be more specific about the problems?

>> You are right Ben. I misread the patches applied vs the ones "accepted" on 
>> patchworks.

Thanks,
Shashank


Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.

2017-08-24 Thread Ben Pfaff
On Thu, Aug 24, 2017 at 11:15:35PM +, Shashank Ram wrote:
> 
> 
> 
> From: ovs-dev-boun...@openvswitch.org  on 
> behalf of Ben Pfaff 
> Sent: Wednesday, August 2, 2017 11:35 AM
> To: Alin Serdean
> Cc: d...@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.
> 
> On Fri, Jul 14, 2017 at 04:40:51AM +, Alin Serdean wrote:
> > The following patches are fixes found with the WDK 8.1 and 10
> > static code analysis.
> 
> I applied most of this series to master.  The patches that I didn't
> apply are the ones where Shashank had some requests.  (Alin, you should
> be able to find the remaining patches by simply rebasing the series
> against current master--the patches that were applied will disappear on
> rebase.)
> 
> Thank you very much!
> ___
> 
> Hi Ben, all the 40 patches in this series have been applied to
> master. Some of the patches have issues such as memory leaks as I
> pointed out in my review. Were these applied by mistake?

If that's what happened, then yes it was a mistake.  But I don't think
it's true.  I applied the following 33 commits on August 2, omitting the
other 7:

1e30f5fa96d8 datapath-windows: Fix shared variables which use Interlocked 
functions
17cf43bce5b3 datapath-windows: Add annotation for OvsIpFragmentEntryCleaner
553966a7942c datapath-windows: Add annotation for OvsCtRelatedEntryCleaner
7ce89fa2d4a6 datapath-windows: Treat TCP_HDR_LEN static analysis warnings
3630a2f32122 datapath-windows: Check return status when using APIs
1c5875f7d9d7 datapath-windows: fix excessive stack usage in iphelper
f0aeea81bd27 datapath-windows: Add dummy parameter for NotifyRouteChange2
20804d501994 datapath-windows: Suppress warning in jhash
445995a8e270 datapath-windows: Fix possible NULL deference in OvsFullCopyNBL
0d4374aa82de datapath-windows: Add assert in OvsPartialCopyNBL
b27b4e5f05a6 datapath-windows: Use annotations instead for macros
c5a12e53f86f datapath-windows: Fix misspelling in comment.
f0187396b0a7 datapath-windows: Add annotations for OvsAcquirePidHashLock
cfc854f0bad5 datapath-windows: Add annotations for OvsReleasePidHashLock
7dcfa473bfb6 datapath-windows: Add annotations for OvsReleaseEventQueueLock
f05dc2c26e8c datapath-windows: Add function annotations for 
OvsAcquireEventQueueLock
7fba23033f21 datapath-windows: Add function annotations for OvsCancelIrpDatapath
8ba92ebee99d datapath-windows: Add function annotations for 
OvsTunnelFilterCancelIrp
5f0d140d127d datapath-windows: Add function annotations for OvsCancelIrp
5635920bb4ea datapath-windows: Add function annotations for OvsReleaseDatapath
41d18c494a7b datapath-windows: Add function annotations for 
OvsAcquireDatapathWrite
3e6d3fe13bdd datapath-windows: Add function annotations for 
OvsAcquireDatapathRead
9330795b9790 datapath-windows: Remove function declarations from Tunnel.c
3cbbcb1ba611 datapath-windows: Add annotations for OvsReleaseCtrlLock
9f6573fe07db datapath-windows: Add annotations for OvsAcquireCtrlLock
9264367cd549 datapath-windows: Add an assert in recirculation
5a3cb5148f1e datapath-windows: Fix possible NULL dereference in BufferMgmt
1919e76d6b8e datapath-windows: Suppress PAGED_CODE warnings
b0586ec79b48 datapath-windows: Add asserts to Stt
d82c1ff1dbd7 datapath-windows: Fix code alignment in Stt
4f967565aed2 datapath-windows: interfaceName overflow in IpHelper
043b1405b50a datapath-windows: Remove annotations in Switch.c
fe09b60d4923 datapath-windows: Use non-executable memory when allocating memory

Can you be more specific about the problems?

Thanks,

Ben.


[ovs-dev] [PATCH v6 6/6] dpif-netdev: Add ovs-appctl dpif-netdev/pmd-rxq-rebalance.

2017-08-24 Thread Kevin Traynor
The processing cycles consumed by rxqs are used to improve the
balance of how rxqs are assigned to pmds. Currently some
reconfiguration is needed to perform a reassignment.

Add an ovs-appctl command to perform a new assignment in order
to balance based on the latest rxq processing cycle information.

Note: Jan requested this for testing purposes.

Suggested-by: Jan Scheurich 
Signed-off-by: Kevin Traynor 
---
 Documentation/howto/dpdk.rst |  5 -
 lib/dpif-netdev.c| 35 +++
 vswitchd/ovs-vswitchd.8.in   |  2 ++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index bac51de..d123819 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -140,5 +140,8 @@ Core 7: Q4 (70%) | Q5 (10%)
 core 8: Q3 (60%) | Q0 (30%)
 
-Rxq to pmds assignment takes place whenever there are configuration changes.
+Rxq to pmds assignment takes place whenever there are configuration changes
+or can be triggered by using::
+
+$ ovs-appctl dpif-netdev/pmd-rxq-rebalance
 
 QoS
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0023f19..ddb038a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -726,4 +726,6 @@ static inline bool emc_entry_alive(struct emc_entry *ce);
 static void emc_clear_entry(struct emc_entry *ce);
 
+static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
+
 static void
 emc_cache_init(struct emc_cache *flow_cache)
@@ -1019,4 +1021,34 @@ sorted_poll_thread_list(struct dp_netdev *dp,
 
 static void
+dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
+  const char *argv[], void *aux OVS_UNUSED)
+{
+struct ds reply = DS_EMPTY_INITIALIZER;
+struct dp_netdev *dp = NULL;
+
+ovs_mutex_lock(&dp_netdev_mutex);
+
+if (argc == 2) {
+dp = shash_find_data(&dp_netdevs, argv[1]);
+} else if (shash_count(&dp_netdevs) == 1) {
+/* There's only one datapath */
+dp = shash_first(&dp_netdevs)->data;
+}
+
+if (!dp) {
+ovs_mutex_unlock(&dp_netdev_mutex);
+unixctl_command_reply_error(conn,
+"please specify an existing datapath");
+return;
+}
+
+dp_netdev_request_reconfigure(dp);
+ovs_mutex_unlock(&dp_netdev_mutex);
+ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
+unixctl_command_reply(conn, ds_cstr(&reply));
+ds_destroy(&reply);
+}
+
+static void
 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
  void *aux)
@@ -1096,4 +1128,7 @@ dpif_netdev_init(void)
  0, 1, dpif_netdev_pmd_info,
  (void *)_aux);
+unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
+ 0, 1, dpif_netdev_pmd_rebalance,
+ NULL);
 return 0;
 }
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
index dfd209e..c18baf6 100644
--- a/vswitchd/ovs-vswitchd.8.in
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -281,4 +281,6 @@ bridge statistics, only the values shown by the above 
command.
 For each pmd thread of the datapath \fIdp\fR shows list of queue-ids with
 port names, which this thread polls.
+.IP "\fBdpif-netdev/pmd-rxq-rebalance\fR [\fIdp\fR]"
+Reassigns rxqs to pmds in the datapath \fIdp\fR based on their current usage.
 .
 .so ofproto/ofproto-dpif-unixctl.man
-- 
1.8.3.1



[ovs-dev] [PATCH v6 5/6] dpif-netdev: Change pmd selection order.

2017-08-24 Thread Kevin Traynor
Up to this point rxqs are sorted by the processing cycles they
consumed and assigned to pmds in a round-robin manner.

Ian pointed out that on wrap around the most loaded pmd will be
the next one to be assigned an additional rxq and that it would be
better to reverse the pmd order when wraparound occurs.

In other words, change from assigning by rr to assigning in a forward
and reverse cycle through pmds.

Also, now that the algorithm has been finalized, document an example.

Suggested-by: Ian Stokes 
Signed-off-by: Kevin Traynor 
---
 Documentation/howto/dpdk.rst | 16 
 lib/dpif-netdev.c| 27 ++-
 tests/pmd.at |  2 +-
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index a67f3a1..bac51de 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -124,4 +124,20 @@ will be used where known to assign rxqs to pmd based on a 
round robin of the
 sorted rxqs.
 
+For example, in the case where there are 5 rxqs and 3 cores (e.g. 3,7,8)
+available, and the measured usage of core cycles per rxq over the last
+interval is seen to be:
+
+- Queue #0: 30%
+- Queue #1: 80%
+- Queue #3: 60%
+- Queue #4: 70%
+- Queue #5: 10%
+
+The rxqs will be assigned to cores 3,7,8 in the following order:
+
+Core 3: Q1 (80%) |
+Core 7: Q4 (70%) | Q5 (10%)
+core 8: Q3 (60%) | Q0 (30%)
+
 Rxq to pmds assignment takes place whenever there are configuration changes.
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4b1cb85..0023f19 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3288,4 +3288,5 @@ struct rr_numa {
 
 int cur_index;
+bool idx_inc;
 };
 
@@ -3344,11 +3345,35 @@ rr_numa_list_populate(struct dp_netdev *dp, struct 
rr_numa_list *rr)
 numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
 numa->pmds[numa->n_pmds - 1] = pmd;
+/* At least one pmd so initialise curr_idx and idx_inc. */
+numa->cur_index = 0;
+numa->idx_inc = true;
 }
 }
 
+/* Returns the next pmd from the numa node in
+ * incrementing or decrementing order. */
 static struct dp_netdev_pmd_thread *
 rr_numa_get_pmd(struct rr_numa *numa)
 {
-return numa->pmds[numa->cur_index++ % numa->n_pmds];
+int numa_idx = numa->cur_index;
+
+if (numa->idx_inc == true) {
+/* Incrementing through list of pmds. */
+if (numa->cur_index == numa->n_pmds-1) {
+/* Reached the last pmd. */
+numa->idx_inc = false;
+} else {
+numa->cur_index++;
+}
+} else {
+/* Decrementing through list of pmds. */
+if (numa->cur_index == 0) {
+/* Reached the first pmd. */
+numa->idx_inc = true;
+} else {
+numa->cur_index--;
+}
+}
+return numa->pmds[numa_idx];
 }
 
diff --git a/tests/pmd.at b/tests/pmd.at
index b6732ea..e39a23a 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -54,5 +54,5 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
 
 m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id 
\)[[0-9]]*:/\1\2:/"])
-m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id 
\)[[0-9]]*:/\1\2:/;s/\(queue-id: \)0 2 4 
6/\1/;s/\(queue-id: \)1 3 5 7/\1/"])
+m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id 
\)[[0-9]]*:/\1\2:/;s/\(queue-id: \)1 2 5 
6/\1/;s/\(queue-id: \)0 3 4 7/\1/"])
 m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
 
-- 
1.8.3.1



[ovs-dev] [PATCH v6 4/6] dpif-netdev: Change rxq_scheduling to use rxq processing cycles.

2017-08-24 Thread Kevin Traynor
Previously rxqs were assigned to pmds by round robin in
port/queue order.

Now that we have the processing cycles used for existing rxqs,
use that information to try and produce a better-balanced
distribution of rxqs across pmds, i.e. given multiple pmds, the
rxqs which have consumed the largest amount of processing cycles
will be placed on different pmds.

The rxqs are sorted by their processing cycles and assigned (in
sorted order) round robin across pmds.

Signed-off-by: Kevin Traynor 
---
 Documentation/howto/dpdk.rst |   7 +++
 lib/dpif-netdev.c| 123 ---
 2 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d7f6610..a67f3a1 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -119,4 +119,11 @@ After that PMD threads on cores where RX queues was pinned 
will become
   thread.
 
+If pmd-rxq-affinity is not set for rxqs, they will be assigned to pmds (cores)
+automatically. The processing cycles that have been stored for each rxq
+will be used where known to assign rxqs to pmd based on a round robin of the
+sorted rxqs.
+
+Rxq to pmds assignment takes place whenever there are configuration changes.
+
 QoS
 ---
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index cdf2662..4b1cb85 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -715,4 +715,6 @@ static void
 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
unsigned long long cycles);
+static uint64_t
+dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
@@ -3169,4 +3171,12 @@ dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
 }
 
+static uint64_t
+dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
+{
+unsigned long long processing_cycles;
+atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
+return processing_cycles;
+}
+
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
@@ -3355,8 +3365,37 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
 }
 
+/* Sort Rx Queues by the processing cycles they are consuming. */
+static int
+rxq_cycle_sort(const void *a, const void *b)
+{
+struct dp_netdev_rxq * qa;
+struct dp_netdev_rxq * qb;
+uint64_t total_qa, total_qb;
+unsigned i;
+
+qa = *(struct dp_netdev_rxq **) a;
+qb = *(struct dp_netdev_rxq **) b;
+
+total_qa = total_qb = 0;
+for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+total_qa += dp_netdev_rxq_get_intrvl_cycles(qa, i);
+total_qb += dp_netdev_rxq_get_intrvl_cycles(qb, i);
+}
+dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa);
+dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb);
+
+if (total_qa >= total_qb) {
+return -1;
+}
+return 1;
+}
+
 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
  * queues and marks the pmds as isolated.  Otherwise, assign non isolated
  * pmds to unpinned queues.
  *
+ * If 'pinned' is false queues will be sorted by processing cycles they are
+ * consuming and then assigned to pmds in round robin order.
+ *
  * The function doesn't touch the pmd threads, it just stores the assignment
  * in the 'pmd' member of each rxq. */
@@ -3367,18 +3406,14 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 struct rr_numa_list rr;
 struct rr_numa *non_local_numa = NULL;
-
-rr_numa_list_populate(dp, &rr);
+struct dp_netdev_rxq ** rxqs = NULL;
+int i, n_rxqs = 0;
+struct rr_numa *numa = NULL;
+int numa_id;
 
 HMAP_FOR_EACH (port, node, >ports) {
-struct rr_numa *numa;
-int numa_id;
-
 if (!netdev_is_pmd(port->netdev)) {
 continue;
 }
 
-numa_id = netdev_get_numa_id(port->netdev);
-numa = rr_numa_list_lookup(&rr, numa_id);
-
 for (int qid = 0; qid < port->n_rxq; qid++) {
 struct dp_netdev_rxq *q = >rxqs[qid];
@@ -3398,34 +3433,60 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) 
OVS_REQUIRES(dp->port_mutex)
 }
 } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
-if (!numa) {
-/* There are no pmds on the queue's local NUMA node.
-   Round-robin on the NUMA nodes that do have pmds. */
-non_local_numa = rr_numa_list_next(&rr, non_local_numa);
-if (!non_local_numa) {
-VLOG_ERR("There is no available (non-isolated) pmd "
- "thread for port \'%s\' queue %d. This queue "
- "will not be polled. Is pmd-cpu-mask set to "
- "zero? Or are all PMDs isolated to other "
- "queues?", 

[ovs-dev] [PATCH v6 3/6] dpif-netdev: Count the rxq processing cycles for an rxq.

2017-08-24 Thread Kevin Traynor
Count the cycles used for processing an rxq during the
pmd rxq interval. As this is an in-flight counter and
pmds run independently, also store the total cycles used
during the last full interval.

Signed-off-by: Kevin Traynor 
---
 lib/dpif-netdev.c | 75 ---
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8731435..cdf2662 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -182,4 +182,8 @@ struct emc_cache {
 #define DPCLS_OPTIMIZATION_INTERVAL 1000
 
+/* Time in ms of the interval in which rxq processing cycles used in
+ * rxq to pmd assignments is measured and stored. */
+#define PMD_RXQ_INTERVAL_LEN 1
+
 /* Number of intervals for which cycles are stored
  * and used during rxq to pmd assignment. */
@@ -570,4 +574,7 @@ struct dp_netdev_pmd_thread {
 /* Periodically sort subtable vectors according to hit frequencies */
 long long int next_optimization;
+/* End of the next time interval for which processing cycles
+   are stored for each polled rxq. */
+long long int rxq_interval;
 
 /* Statistics. */
@@ -696,5 +703,16 @@ static void pmd_load_cached_ports(struct 
dp_netdev_pmd_thread *pmd)
 OVS_REQUIRES(pmd->port_mutex);
 static inline void
-dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
+dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
+   struct polled_queue *poll_list, int poll_cnt);
+static void
+dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+ enum rxq_cycles_counter_type type,
+ unsigned long long cycles);
+static uint64_t
+dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
+ enum rxq_cycles_counter_type type);
+static void
+dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
+   unsigned long long cycles);
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
@@ -3126,4 +3144,29 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread 
*pmd,
 }
 
+static void
+dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+ enum rxq_cycles_counter_type type,
+ unsigned long long cycles)
+{
+   atomic_store_relaxed(&rx->cycles[type], cycles);
+}
+
+static uint64_t
+dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
+ enum rxq_cycles_counter_type type)
+{
+unsigned long long processing_cycles;
+atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
+return processing_cycles;
+}
+
+static void
+dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
+unsigned long long cycles)
+{
+   atomic_store_relaxed(&rx->cycles_intrvl[rx->intrvl_idx++
+   % PMD_RXQ_INTERVAL_MAX], cycles);
+}
+
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
@@ -3179,4 +3222,5 @@ port_reconfigure(struct dp_netdev_port *port)
 port->rxqs[i].rx = NULL;
 }
+unsigned last_nrxq = port->n_rxq;
 port->n_rxq = 0;
 
@@ -3199,4 +3243,12 @@ port_reconfigure(struct dp_netdev_port *port)
 for (i = 0; i < netdev_n_rxq(netdev); i++) {
 port->rxqs[i].port = port;
+if (i >= last_nrxq) {
+/* Only reset cycle stats for new queues */
+dp_netdev_rxq_set_cycles(>rxqs[i], RXQ_CYCLES_PROC_CURR, 0);
+dp_netdev_rxq_set_cycles(>rxqs[i], RXQ_CYCLES_PROC_HIST, 0);
+for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
+dp_netdev_rxq_set_intrvl_cycles(>rxqs[i], 0);
+}
+}
 err = netdev_rxq_open(netdev, >rxqs[i].rx, i);
 if (err) {
@@ -3879,5 +3931,5 @@ reload:
 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
poll_list[i].port_no);
-cycles_count_intermediate(pmd, NULL,
+cycles_count_intermediate(pmd, poll_list[i].rxq,
   process_packets ? PMD_CYCLES_PROCESSING
   : PMD_CYCLES_IDLE);
@@ -3890,5 +3942,5 @@ reload:
 
 coverage_try_clear();
-dp_netdev_pmd_try_optimize(pmd);
+dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
 if (!ovsrcu_try_quiesce()) {
 emc_cache_slow_sweep(>flow_cache);
@@ -4334,4 +4386,5 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, 
struct dp_netdev *dp,
 cmap_init(>classifiers);
 pmd->next_optimization = time_msec() + DPCLS_OPTIMIZATION_INTERVAL;
+pmd->rxq_interval = time_msec() + PMD_RXQ_INTERVAL_LEN;
 hmap_init(>poll_list);
 hmap_init(>tx_ports);
@@ -5771,9 +5824,23 @@ dpcls_sort_subtable_vector(struct dpcls *cls)
 
 static inline void
-dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd)

[ovs-dev] [PATCH v6 2/6] dpif-netdev: Add rxq processing cycle counters.

2017-08-24 Thread Kevin Traynor
Add counters to dp_netdev_rxq which will later be used for storing the
processing cycles of an rxq. Processing cycles will be stored in reference
to a defined time interval. We will store the cycles of the current in-progress
interval, a number of completed intervals and the sum of the completed
intervals.

cycles_count_intermediate was used to count cycles for a pmd. With some small
additions we can also use it to count the cycles used for processing an rxq.

Signed-off-by: Kevin Traynor 
---
 lib/dpif-netdev.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f35c079..8731435 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -182,4 +182,8 @@ struct emc_cache {
 #define DPCLS_OPTIMIZATION_INTERVAL 1000
 
+/* Number of intervals for which cycles are stored
+ * and used during rxq to pmd assignment. */
+#define PMD_RXQ_INTERVAL_MAX 6
+
 struct dpcls {
 struct cmap_node node;  /* Within dp_netdev_pmd_thread.classifiers */
@@ -340,4 +344,13 @@ enum pmd_cycles_counter_type {
 };
 
+enum rxq_cycles_counter_type {
+RXQ_CYCLES_PROC_CURR,   /* Cycles spent successfully polling and
+   processing packets during the current
+   interval. */
+RXQ_CYCLES_PROC_HIST,   /* Total cycles of all intervals that are used
+   during rxq to pmd assignment. */
+RXQ_N_CYCLES
+};
+
 #define XPS_TIMEOUT_MS 500LL
 
@@ -351,4 +364,11 @@ struct dp_netdev_rxq {
   particular core. */
 struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
+
+/* Counters of cycles spent successfully polling and processing pkts. */
+atomic_ullong cycles[RXQ_N_CYCLES];
+/* We store PMD_RXQ_INTERVAL_MAX intervals of data for an rxq and then
+   sum them to yield the cycles used for an rxq. */
+atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
+unsigned intrvl_idx;   /* Write index for 'cycles_intrvl'. */
 };
 
@@ -677,5 +697,4 @@ static void pmd_load_cached_ports(struct 
dp_netdev_pmd_thread *pmd)
 static inline void
 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
-
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
@@ -3092,4 +3111,5 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
 static inline void
 cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
+  struct dp_netdev_rxq *rxq,
   enum pmd_cycles_counter_type type)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -3100,4 +3120,8 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread 
*pmd,
 
 non_atomic_ullong_add(&pmd->cycles.n[type], interval);
+if (rxq && (type == PMD_CYCLES_PROCESSING)) {
+/* Add to the amount of current processing cycles. */
+non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
+}
 }
 
@@ -3668,5 +3692,5 @@ dpif_netdev_run(struct dpif *dpif)
port->rxqs[i].rx,
port->port_no);
-cycles_count_intermediate(non_pmd, process_packets ?
+cycles_count_intermediate(non_pmd, NULL, process_packets ?
PMD_CYCLES_PROCESSING
  : PMD_CYCLES_IDLE);
@@ -3855,5 +3879,5 @@ reload:
 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
poll_list[i].port_no);
-cycles_count_intermediate(pmd,
+cycles_count_intermediate(pmd, NULL,
   process_packets ? PMD_CYCLES_PROCESSING
   : PMD_CYCLES_IDLE);
-- 
1.8.3.1



Re: [ovs-dev] [PATCH v5 5/6] dpif-netdev: Change pmd selection order.

2017-08-24 Thread Kevin Traynor
On 08/24/2017 09:51 AM, Darrell Ball wrote:
> 
> 
> On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:
> 
> Up to his point rxqs are sorted by processing cycles they
> consumed and assigned to pmds in a round robin manner.
> 
> Ian pointed out that on wrap around the most loaded pmd will be
> the next one to be assigned an additional rxq and that it would be
> better to reverse the pmd order when wraparound occurs.
> 
> In other words, change from assigning by rr to assigning in a forward
> and reverse cycle through pmds.
> 
> Also, now that the algothim has finalised, document an example.
> 
> Typo: algothim

changed, and US'ified finalised

> 
> Suggested-by: Ian Stokes 
> Signed-off-by: Kevin Traynor 
> ---
>  Documentation/howto/dpdk.rst | 16 
>  lib/dpif-netdev.c| 21 -
>  tests/pmd.at |  2 +-
>  3 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/howto/dpdk.rst 
> b/Documentation/howto/dpdk.rst
> index 44737e4..493e215 100644
> --- a/Documentation/howto/dpdk.rst
> +++ b/Documentation/howto/dpdk.rst
> @@ -124,4 +124,20 @@ will be used where known to assign rxqs with the 
> highest consumption of
>  processing cycles to different pmds.
>  
> +For example, in the case where here there are 5 rxqs and 3 cores 
> (e.g. 3,7,8)
> +available, and the measured usage of core cycles per rxq over the 
> last
> +interval is seen to be:
> +
> +- Queue #0: 30%
> +- Queue #1: 80%
> +- Queue #3: 60%
> +- Queue #4: 70%
> +- Queue #5: 10%
> +
> +The rxqs will be assigned to cores 3,7,8 in the following order:
> +
> +Core 3: Q1 (80%) |
> +Core 7: Q4 (70%) | Q5 (10%)
> +core 8: Q0 (60%) | Q0 (30%)
> 
> [Darrell]
> There is a typo in the example:
> +core 8: Q0 (60%) …..
> should be
> +core 8: Q3 (60%) …..
> 

oh, that's annoying :( thanks for spotting it. fixed it now.

> 
> 
> +
>  Rxq to pmds assignment takes place whenever there are configuration 
> changes.
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index afbf591..6cc0a1e 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -3285,4 +3285,5 @@ struct rr_numa {
>  
>  int cur_index;
> +bool idx_inc;
>  };
>  
> @@ -3341,4 +3342,7 @@ rr_numa_list_populate(struct dp_netdev *dp, 
> struct rr_numa_list *rr)
>  numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof 
> *numa->pmds);
>  numa->pmds[numa->n_pmds - 1] = pmd;
> +/* At least one pmd so initialise curr_idx and idx_inc. */
> +numa->cur_index = 0;
> +numa->idx_inc = true;
>  }
>  }
> @@ -3347,5 +3351,20 @@ static struct dp_netdev_pmd_thread *
>  rr_numa_get_pmd(struct rr_numa *numa)
>  {
> -return numa->pmds[numa->cur_index++ % numa->n_pmds];
> +int numa_idx = numa->cur_index;
> +
> +if (numa->idx_inc == true) {
> +if (numa->cur_index == numa->n_pmds-1) {
> +numa->idx_inc = false;
> +} else {
> +numa->cur_index++;
> +}
> +} else {
> 
> Maybe one sentence comment for others benefit?
> 

yeah, good point. I added a few comments to make it clear what's
happening here.

> +if (numa->cur_index == 0) {
> +numa->idx_inc = true;
> +} else {
> +numa->cur_index--;
> +}
> +}
> +return numa->pmds[numa_idx];
>  }
>  
> diff --git a/tests/pmd.at b/tests/pmd.at
> index b6732ea..e39a23a 100644
> --- a/tests/pmd.at
> +++ b/tests/pmd.at
> @@ -54,5 +54,5 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
>  
>  m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( 
> core_id \)[[0-9]]*:/\1\2:/"])
> -m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( 
> core_id \)[[0-9]]*:/\1\2:/;s/\(queue-id: \)0 2 4 
> 6/\1/;s/\(queue-id: \)1 3 5 7/\1/"])
> +m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( 
> core_id \)[[0-9]]*:/\1\2:/;s/\(queue-id: \)1 2 5 
> 6/\1/;s/\(queue-id: \)0 3 4 7/\1/"])
>  m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
>  
> -- 
> 1.8.3.1
> 
> 
> 
> 
> 
> 
> 
> ___
> dev mailing 

Re: [ovs-dev] [PATCH v5 4/6] dpif-netdev: Change rxq_scheduling to use rxq processing cycles.

2017-08-24 Thread Kevin Traynor
On 08/24/2017 10:06 AM, Darrell Ball wrote:
> 
> 
> On 8/24/17, 1:47 AM, "Darrell Ball"  wrote:
> 
> There is a minor checkpatch warning
> 
> 
> == Checking 
> "/home/dball/Desktop/patches/ovs-dev-v5-4-6-dpif-netdev-Change-rxq_scheduling-to-use-rxq-processing-cycles..patch"
>  ==
> WARNING: Line lacks whitespace around operator
> #170 FILE: lib/dpif-netdev.c:3456:
>Round-robin on the NUMA nodes that do have pmds. */
> 
> Lines checked: 204, Warnings: 1, Errors: 0
> 
> [Darrell] 
> Maybe try
> * Round-robin on the NUMA nodes that do have pmds. */
> 
> 
> I marked it below
> 
> Ignore what I marked for the warning inline
> 

I noticed the checkpatch warning too. It's a false positive for the
"Round-robin" in a comment block from a previous commit that I moved to
later in the function. I didn't want to confuse by fixing inline with
the other changes but now you've highlighted it I might as well change it.

> 
> On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:
> 
> Previously rxqs were assigned to pmds by round robin in
> port/queue order.
> 
> Now that we have the processing cycles used for existing rxqs,
> use that information to try and produced a better balanced
> distribution of rxqs across pmds. i.e. given multiple pmds, the
> rxqs which have consumed the largest amount of processing cycles
> will be placed on different pmds.
> 
> The rxqs are sorted by their processing cycles and assigned (in
> sorted order) round robin across pmds.
> 
> Signed-off-by: Kevin Traynor 
> ---
>  Documentation/howto/dpdk.rst |   7 +++
>  lib/dpif-netdev.c| 123 
> ---
>  2 files changed, 99 insertions(+), 31 deletions(-)
> 
> diff --git a/Documentation/howto/dpdk.rst 
> b/Documentation/howto/dpdk.rst
> index d7f6610..44737e4 100644
> --- a/Documentation/howto/dpdk.rst
> +++ b/Documentation/howto/dpdk.rst
> @@ -119,4 +119,11 @@ After that PMD threads on cores where RX 
> queues was pinned will become
>thread.
>  
> +If pmd-rxq-affinity is not set for rxqs, they will be assigned 
> to pmds (cores)
> +automatically. The processing cycles that have been required for 
> each rxq
> 

I noticed this also
s/required/stored

> +will be used where known to assign rxqs with the highest 
> consumption of
> +processing cycles to different pmds.
> 
> ‘will be used where known to assign rxqs to pmds based on round robin of 
> the sorted rxqs’ ?
> 

changed

> 
> +
> +Rxq to pmds assignment takes place whenever there are 
> configuration changes.
> +
>  QoS
>  ---
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5670c55..afbf591 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -712,4 +712,6 @@ static void
>  dp_netdev_rxq_set_interval(struct dp_netdev_rxq *rx,
> unsigned long long cycles);
> +static uint64_t
> +dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned 
> idx);
>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread 
> *pmd,
> @@ -3166,4 +3168,12 @@ dp_netdev_rxq_set_interval(struct 
> dp_netdev_rxq *rx,
>  }
>  
> +static uint64_t
> +dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned 
> idx)
> +{
> +unsigned long long tmp;
> +atomic_read_relaxed(&rx->cycles_intrvl[idx], &tmp);
> +return tmp;
> 
> Could we use something like ‘intrv_processing_cycles’ instead of ‘tmp’
> Also _get_intrv_cycles ?; same forward comment I mentioned in patch 3
> 

removed tmp and changed to processing_cycles. made _get_intrvl_cycles to
match the _set_intrvl_cycles in 3/6

> 
> +}
> +
>  static int
>  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> @@ -3352,8 +3362,37 @@ rr_numa_list_destroy(struct rr_numa_list 
> *rr)
>  }
>  
> +/* Sort Rx Queues by the processing cycles they are consuming. */
> +static int
> +rxq_cycle_sort(const void *a, const void *b)
> +{
> +struct dp_netdev_rxq * qa;
> +struct dp_netdev_rxq * qb;
> +uint64_t total_qa, total_qb;
> +unsigned i;
>
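The rxq_cycle_sort() comparator is cut off in the archived quote above. As a rough, self-contained sketch of the approach the patch describes — sum each queue's stored interval cycles and sort descending so the busiest queues are assigned first — something like the following would work (struct and helper names are simplified stand-ins, not the actual OVS definitions):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define N_INTERVALS 6            /* stands in for PMD_RXQ_INTERVAL_MAX */

/* Simplified stand-in for struct dp_netdev_rxq: just the cycle history. */
struct rxq {
    uint64_t cycles_intrvl[N_INTERVALS];
};

/* Sum the stored intervals: the queue's recent processing cost. */
static uint64_t
rxq_total_cycles(const struct rxq *q)
{
    uint64_t total = 0;
    for (unsigned i = 0; i < N_INTERVALS; i++) {
        total += q->cycles_intrvl[i];
    }
    return total;
}

/* qsort() comparator over an array of rxq pointers: busiest queue first. */
static int
rxq_cycle_sort(const void *a, const void *b)
{
    const struct rxq *qa = *(const struct rxq *const *) a;
    const struct rxq *qb = *(const struct rxq *const *) b;
    uint64_t ta = rxq_total_cycles(qa);
    uint64_t tb = rxq_total_cycles(qb);

    return (ta < tb) - (ta > tb);   /* descending order */
}
```

After sorting, the queues can then be handed out round-robin across the pmds, so the heaviest consumers land on different cores.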

Re: [ovs-dev] [PATCH v5 3/6] dpif-netdev: Count the rxq processing cycles for an rxq.

2017-08-24 Thread Kevin Traynor
On 08/24/2017 09:44 AM, Darrell Ball wrote:
> 
> On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:
> 
> Count the cycles used for processing an rxq during the
> pmd rxq interval. As this is an in flight counter and
> pmds run independently, also store the total cycles used
> during the last full interval.
> 
> Signed-off-by: Kevin Traynor 
> ---
>  lib/dpif-netdev.c | 78 
> +++
>  1 file changed, 73 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 51d4050..5670c55 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -182,4 +182,8 @@ struct emc_cache {
>  #define DPCLS_OPTIMIZATION_INTERVAL 1000
>  
> +/* Time in ms of the interval in which rxq processing cycles used in
> + * rxq to pmd assignments is measured and stored. */
> +#define PMD_RXQ_INTERVAL_LEN 1
> +
>  /* Number of intervals for which cycles are stored
>   * and used during rxq to pmd assignment. */
> @@ -568,4 +572,6 @@ struct dp_netdev_pmd_thread {
>  /* Periodically sort subtable vectors according to hit 
> frequencies */
>  long long int next_optimization;
> +/* Periodically store the processing cycles used for each rxq. */
> 
> The above comment seems a bit deceiving; should it say something like ‘the 
> end of the next time interval
> for which processing cycles are stored for each rxq’ 
> 

you're right, changed it

> +long long int rxq_interval;
> 
> 
>  /* Statistics. */
> @@ -694,5 +700,16 @@ static void pmd_load_cached_ports(struct 
> dp_netdev_pmd_thread *pmd)
>  OVS_REQUIRES(pmd->port_mutex);
>  static inline void
> -dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
> +dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
> +   struct polled_queue *poll_list, int 
> poll_cnt);
> +static void
> +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> + enum rxq_cycles_counter_type type,
> + unsigned long long cycles);
> +static uint64_t
> +dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
> + enum rxq_cycles_counter_type type);
> +static void
> +dp_netdev_rxq_set_interval(struct dp_netdev_rxq *rx,
> +   unsigned long long cycles);
>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread 
> *pmd,
> @@ -3124,4 +3141,29 @@ cycles_count_intermediate(struct 
> dp_netdev_pmd_thread *pmd,
>  }
>  
> +static void
> +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> + enum rxq_cycles_counter_type type,
> + unsigned long long cycles)
> +{
> +   atomic_store_relaxed(&rx->cycles[type], cycles);
> +}
> +
> +static uint64_t
> +dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
> + enum rxq_cycles_counter_type type)
> +{
> +unsigned long long tmp;
> +atomic_read_relaxed(&rx->cycles[type], &tmp);
> +return tmp;
> 
> Could we use a name like ‘processing_cycles’ or something other than ‘tmp’ ?
> 

sure, changed

> 
> +}
> +
> +static void
> +dp_netdev_rxq_set_interval(struct dp_netdev_rxq *rx,
> +   unsigned long long cycles)
> 
> 
> _set_interval seems a bit vague ?; well at least to me ?
> _set_intrv_cycles ?
> same for _get_interval for patch 4 ?

completely agree, adding _cycles makes it clearer. changed to
_set_intrvl_cycles

> 
> 
> +{
> +   atomic_store_relaxed(&rx->cycles_intrvl[rx->intrvl_idx++
> +   % PMD_RXQ_INTERVAL_MAX], 
> cycles);
> +}
> +
>  static int
>  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> @@ -3168,5 +3210,5 @@ port_reconfigure(struct dp_netdev_port *port)
>  {
>  struct netdev *netdev = port->netdev;
> -int i, err;
> +int i, err, last_nrxq, j;
> 
> move ‘j’ down to the for loop and use unsigned

changed

> move  last_nrxq down to where it is first used

changed

>  
>  port->need_reconfigure = false;
> @@ -3177,4 +3219,5 @@ port_reconfigure(struct dp_netdev_port *port)
>  port->rxqs[i].rx = NULL;
>  }
> +last_nrxq = port->n_rxq;
> 
> uint32_t last_nrxq = port->n_rxq;
> 

Made this change, but used unsigned to be 
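For reference, the modulo-wrapped write index being discussed in this thread (rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX) is a small circular buffer of per-interval cycle counts. A single-threaded sketch of the same idea follows — the real code wraps the stores and loads in atomic_store_relaxed()/atomic_read_relaxed(), since the pmd thread writes while the main thread reads:

```c
#include <assert.h>
#include <stdint.h>

#define PMD_RXQ_INTERVAL_MAX 6

/* Simplified, non-atomic sketch of the per-rxq interval ring. */
struct rxq_hist {
    uint64_t cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
    unsigned intrvl_idx;              /* write index, wraps via modulo */
};

/* Store the cycles measured over one interval; the oldest slot is
 * silently overwritten once the ring is full. */
static void
rxq_store_interval(struct rxq_hist *h, uint64_t cycles)
{
    h->cycles_intrvl[h->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX] = cycles;
}

/* Sum all stored intervals: the figure used for rxq-to-pmd assignment. */
static uint64_t
rxq_sum_intervals(const struct rxq_hist *h)
{
    uint64_t total = 0;
    for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
        total += h->cycles_intrvl[i];
    }
    return total;
}
```

Keeping several intervals rather than one smooths out short bursts when the queues are later sorted and assigned.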

Re: [ovs-dev] [PATCH v5 2/6] dpif-netdev: Add rxq processing cycle counters.

2017-08-24 Thread Kevin Traynor
Darrell, Thank you for the very thorough review. All look fine. Brief
reply to comments inline.

thanks,
Kevin.

On 08/24/2017 09:34 AM, Darrell Ball wrote:
> 
> 
> On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:
> 
> Add counters to dp_netdev_rxq which will later be used for storing the
> processing cycles of an rxq. Processing cycles will be stored in 
> reference
> to a defined time interval. We will store the cycles of the current 
> in progress
> interval, a number of completed intervals and the sum of the completed
> intervals.
> 
> cycles_count_intermediate was used to count cycles for a pmd. With 
> some small
> additions we can also use it to count the cycles used for processing 
> an rxq.
> 
> Signed-off-by: Kevin Traynor 
> ---
>  lib/dpif-netdev.c | 28 +---
>  1 file changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index f35c079..51d4050 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -182,4 +182,8 @@ struct emc_cache {
>  #define DPCLS_OPTIMIZATION_INTERVAL 1000
>  
> +/* Number of intervals for which cycles are stored
> + * and used during rxq to pmd assignment. */
> +#define PMD_RXQ_INTERVAL_MAX 6
> +
>  struct dpcls {
>  struct cmap_node node;  /* Within 
> dp_netdev_pmd_thread.classifiers */
> @@ -340,4 +344,13 @@ enum pmd_cycles_counter_type {
>  };
>  
> +enum rxq_cycles_counter_type {
> +RXQ_CYCLES_PROC_CURR,   /* Cycles spent successfully polling 
> and
> +   processing packets during the 
> current
> +   interval. */
> +RXQ_CYCLES_PROC_HIST,   /* Total cycles of all intervals 
> that are used
> +   during rxq to pmd assignment. */
> +RXQ_N_CYCLES
> +};
> +
>  #define XPS_TIMEOUT_MS 500LL
>  
> @@ -351,4 +364,9 @@ struct dp_netdev_rxq {
>particular core. */
>  struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this 
> queue. */
> +
> +/* Counters of cycles spent polling and processing packets. */
> 
> Do we need to specify ‘successfully’ polling to avoid confusion ?
> 

Make sense, added

> +atomic_ullong cycles[RXQ_N_CYCLES];
> 
> Do you think below deserves a separate description – 'we store the last 
> PMD_RXQ_INTERVAL_MAX intervals and sum them up to 
> yield the cycles used for a given rxq…' ?
> 

Yes, I added in the comment

> +atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
> +unsigned intrvl_idx;   /* Write index for 
> 'cycles_intrvl'. */
>  };
>  
> @@ -677,5 +695,4 @@ static void pmd_load_cached_ports(struct 
> dp_netdev_pmd_thread *pmd)
>  static inline void
>  dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
> -
>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread 
> *pmd,
> @@ -3092,4 +3109,5 @@ cycles_count_end(struct dp_netdev_pmd_thread 
> *pmd,
>  static inline void
>  cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
> +  struct dp_netdev_rxq *rxq,
>enum pmd_cycles_counter_type type)
>  OVS_NO_THREAD_SAFETY_ANALYSIS
> @@ -3100,4 +3118,8 @@ cycles_count_intermediate(struct 
> dp_netdev_pmd_thread *pmd,
>  
>  non_atomic_ullong_add(&pmd->cycles.n[type], interval);
> +if (rxq && (type == PMD_CYCLES_PROCESSING)) {
> +/* Add to the amount of current processing cycles. */
> +non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], 
> interval);
> +}
>  }
>  
> @@ -3668,5 +3690,5 @@ dpif_netdev_run(struct dpif *dpif)
> port->rxqs[i].rx,
> port->port_no);
> -cycles_count_intermediate(non_pmd, 
> process_packets ?
> +cycles_count_intermediate(non_pmd, NULL, 
> process_packets ?
> 
> PMD_CYCLES_PROCESSING
>   : 
> PMD_CYCLES_IDLE);
> @@ -3855,5 +3877,5 @@ reload:
>  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
> poll_list[i].port_no);
>  
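To make the control flow under review easier to follow, here is a stripped-down sketch of how cycles_count_intermediate() attributes the elapsed cycles both to the pmd's counter and, when a queue was being processed, to that rxq. The fake_tsc variable stands in for the real timestamp-counter read, and the struct layouts are simplified assumptions, not the actual OVS definitions:

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

enum pmd_cycles_counter_type {
    PMD_CYCLES_IDLE,          /* cycles spent with nothing to process */
    PMD_CYCLES_PROCESSING,    /* cycles spent processing packets */
    PMD_N_CYCLES
};

struct pmd { uint64_t cycles[PMD_N_CYCLES]; uint64_t last_tsc; };
struct rxq { uint64_t proc_curr; };

/* Hypothetical cycle source; the real code reads the TSC. */
static uint64_t fake_tsc;

/* Charge the cycles since the last checkpoint to the given counter,
 * and additionally to the rxq when one was being processed. */
static void
cycles_count_intermediate(struct pmd *pmd, struct rxq *rxq,
                          enum pmd_cycles_counter_type type)
{
    uint64_t now = fake_tsc;
    uint64_t interval = now - pmd->last_tsc;

    pmd->last_tsc = now;
    pmd->cycles[type] += interval;
    if (rxq && type == PMD_CYCLES_PROCESSING) {
        /* Attribute the cycles to the rxq that was just polled. */
        rxq->proc_curr += interval;
    }
}
```

The rxq argument is NULL on the non-pmd path, which is why the patch passes NULL there and the per-rxq counter is only touched for PMD_CYCLES_PROCESSING.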

Re: [ovs-dev] [PATCH v2 0/2] nsh: rework NSH netlink keys and actions

2017-08-24 Thread Ben Pfaff
On Fri, Aug 25, 2017 at 06:43:49AM +0800, Yang, Yi wrote:
> On Fri, Aug 25, 2017 at 01:39:29AM +0800, Ben Pfaff wrote:
> > > do the below changes:
> > > 
> > > 1. change encap_nsh and decap_nsh to push_nsh and pop_nsh
> > > 2. Use nested OVS_KEY_ATTR_NSH to handle push_nsh.
> > > 
> > > Patch 1 does precisely this.
> > 
> > Since this is targeted at 2.8, I'm only planning to take actual bug
> > fixes, or changes that affect Open vSwitch external interfaces where we
> > historically maintain a high degree of backward compatibility.
> > 
> > In patch 1, it looks like the change to nsh_hdr_len() is a bug fix.  At
> > a glance, I am not sure whether any of the other changes in patch 1 are
> > bug fixes.
> > 
> > Also in patch 1, renaming and Netlink restructuring isn't a bug fix,
> > doesn't appear to affect, say, OpenFlow action names, and it doesn't
> > affect any ABIs, so it's not appropriate for 2.8.
> 
> Ok, I'll send patch 1 without the datapath actions name change and netlink
> change, but it isn't a good result if people see encap_nsh and decap_nsh
> in 2.8 and then see them changed to push_nsh and pop_nsh in 2.9 and
> later.

It doesn't look to me like these names are changing for OpenFlow
actions.  Datapath actions are an implementation detail and keeping them
100% stable is not a priority.

> > Making 2.8 compliant with the NSH draft makes sense.
> 
> I have finished dec_nsh_ttl action patch, does it make sense for 2.8?

You can add that as a patch 3 in the next version and we'll take a look.

We want to release 2.8 next week.

Thanks,

Ben.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.

2017-08-24 Thread Shashank Ram



From: ovs-dev-boun...@openvswitch.org  on 
behalf of Ben Pfaff 
Sent: Wednesday, August 2, 2017 11:35 AM
To: Alin Serdean
Cc: d...@openvswitch.org
Subject: Re: [ovs-dev] [PATCH 00/40] Fix static code analysis warnings.

On Fri, Jul 14, 2017 at 04:40:51AM +, Alin Serdean wrote:
> The following patches are fixes found with the WDK 8.1 and 10
> static code analysis.

I applied most of this series to master.  The patches that I didn't
apply are the ones where Shashank had some requests.  (Alin, you should
be able to find the remaining patches by simply rebasing the series
against current master--the patches that were applied will disappear on
rebase.)

Thank you very much!
___

Hi Ben, all the 40 patches in this series have been applied to master. Some of 
the patches have issues such as memory leaks as I pointed out in my review. 
Were these applied by mistake?

Thanks,
Shashank
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 0/2] nsh: rework NSH netlink keys and actions

2017-08-24 Thread Yang, Yi
On Fri, Aug 25, 2017 at 01:39:29AM +0800, Ben Pfaff wrote:
> > do the below changes:
> > 
> > 1. change encap_nsh and decap_nsh to push_nsh and pop_nsh
> > 2. Use nested OVS_KEY_ATTR_NSH to handle push_nsh.
> > 
> > Patch 1 does precisely this.
> 
> Since this is targeted at 2.8, I'm only planning to take actual bug
> fixes, or changes that affect Open vSwitch external interfaces where we
> historically maintain a high degree of backward compatibility.
> 
> In patch 1, it looks like the change to nsh_hdr_len() is a bug fix.  At
> a glance, I am not sure whether any of the other changes in patch 1 are
> bug fixes.
> 
> Also in patch 1, renaming and Netlink restructuring isn't a bug fix,
> doesn't appear to affect, say, OpenFlow action names, and it doesn't
> affect any ABIs, so it's not appropriate for 2.8.

Ok, I'll send patch 1 without the datapath actions name change and netlink
change, but it isn't a good result if people see encap_nsh and decap_nsh
in 2.8 and then see them changed to push_nsh and pop_nsh in 2.9 and
later.

> 
> Making 2.8 compliant with the NSH draft makes sense.

I have finished dec_nsh_ttl action patch, does it make sense for 2.8?

> 
> Can you edit this series so that patch 1 is just bug fixes, and then
> re-send it?
> 
> Thanks,
> 
> Ben.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2] ovn: Support for taas(tap-as-a-service) function

2017-08-24 Thread Russell Bryant
Sorry for the delay in getting back to this ...

On Tue, Aug 15, 2017 at 4:28 AM,   wrote:
> Taas was designed to provide tenants and service providers a means of
> monitoring the traffic flowing in their Neutron provisioned virtual
> networks. It is useful for network trouble-shooting, security and
> analytics. The taas presentations can be found at
> https://github.com/openstack/tap-as-a-service/blob/master/doc/source/presentations.rst
> and the API reference at
> https://github.com/openstack/tap-as-a-service/blob/master/API_REFERENCE.rst
>
> To support the taas function, this patch adds two types of
> logical_switch_port, "mirror" and "taas". A port with type "mirror" is used
> as the inport for mirrored flows in the logical_switch, and a port with
> type "taas" is used as the outport for mirrored flows in the logical_switch.
>
> The ovn-controller will relate the ports in tap_service and tap_flow to
> the taas port and mirror port, respectively.
>
> Signed-off-by: wang qianyu 

> diff --git a/ovn/ovn-nb.xml b/ovn/ovn-nb.xml
> index 31303a8..5fdd045 100644
> --- a/ovn/ovn-nb.xml
> +++ b/ovn/ovn-nb.xml
> @@ -301,6 +301,20 @@
>
>  A port to a logical switch on a VTEP gateway.
>
> +
> +  mirror
> +  
> +A port that indicates the inport of mirrored flows. The user
> +needs to create this port in the logical_switch. This port
> +should have a one-to-one correspondence with the tap_flows.
> +  
> +
> +  taas
> +  
> +A port that indicates the outport of mirrored flows. The user
> +needs to create this port in the logical_switch. This port should
> +have a one-to-one correspondence with the tap_service.
> +  
>  
>
>  
> @@ -445,6 +459,61 @@
>interface, in bits.
>  
>
> +
> +  
> +
> +  These options apply when  is
> +  mirror.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> that indicates where the
> +  cloned flows come from.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> with type taas.
> +
> +
> +
> + 
> +This option indicates which direction (from-port/to-port/both)
> +of packet will be cloned to the taas-port. The directions are
> +defined as follows:
> +  
> +  
> +from-port
> +
> +  The packets from this port will be cloned to specified
> mirror
> +  port.
> +
> +to-port
> +
> +  The packets to this port will be cloned to specified mirror
> +  port.
> +
> +both
> +
> +  The packets both from and to this port will be cloned to
> +  specified mirror port.
> +
> +  
> +
> +  
> +
> +  
> +
> +  These options apply when  is
> taas.
> +
> +
> +
> +  Required.  The  of the  +  table="Logical_switch_Port"/> that indicates where the
> +  cloned flows come to.
> +
> +  
>  
>
>  

I'm having a hard time understanding this schema.  Could you expand on
why both a "mirror" and "taas" port type was needed?

I was hoping for only a single new port type, "mirror" for example,
with options to specify what port it is receiving a mirror of traffic
for.

Does something like that not express everything needed here?

-- 
Russell Bryant
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] conntrack: Fix ct-clean thread crash bug

2017-08-24 Thread Darrell Ball


On 8/24/17, 11:03 AM, "Darrell Ball"  wrote:

Thanks for testing.
I’ll look at it in detail and get back to you today.

Darrell

On 8/24/17, 3:36 AM, "ovs-dev-boun...@openvswitch.org on behalf of 
huanglili"  wrote:

From: Lili Huang 

Conn should be removed from the list before freed.

This crash will be triggered when a established flow do ct(nat)
again, like
"ip,actions=ct(table=1)
 table=1,in_port=1,ip,actions=ct(commit,nat(dst=5.5.5.5)),2
 table=1,in_port=2,ip,ct_state=+est,actions=1
 table=1,in_port=1,ip,ct_state=+est,actions=2"

Signed-off-by: Lili Huang 
---
 lib/conntrack.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 1c0e023..dd73e1a 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -779,6 +779,8 @@ conn_not_found(struct conntrack *ct, struct 
dp_packet *pkt,
ct, nc, conn_for_un_nat_copy);
 
 if (!nat_res) {
+ovs_list_remove(&nc->exp_node);
+ctx->conn = NULL;
 goto nat_res_exhaustion;
 }

Hi Lili

Does the below alternative work for you ?

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 1c0e023..4918aaf 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -805,6 +805,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
  * against with firewall rules or a separate firewall.
  * Also using zone partitioning can limit DoS impact. */
 nat_res_exhaustion:
+ovs_list_remove(&nc->exp_node);
 delete_conn(nc);
 /* conn_for_un_nat_copy is a local variable in process_one; this
  * memset() serves to document that conn_for_un_nat_copy is from

Thank you 
Darrell

 
-- 
1.8.3.1


___
dev mailing list
d...@openvswitch.org

https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev=DwICAg=uilaK90D4TOVoH58JNXRgQ=BVhFA09CGX7JQ5Ih-uZnsw=SFTrBlBXqEC4qsGMh3ikI8j9tYy0Wbb5Vt9FXlMqNDI=d9q2ZnEz8iZ2cJuZ5tAFpEMdeP45pFBeL_FmBSjyCv4=
 




___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
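The crash fixed in this thread is the classic intrusive-list hazard: the conn is freed while still linked into the expiry list, so the next ct-clean sweep follows a dangling pointer. A minimal sketch, modeled loosely on OVS's struct ovs_list rather than the actual conntrack code, of why the node must be unlinked before the free:

```c
#include <assert.h>
#include <stdlib.h>

/* Minimal intrusive doubly-linked list in the style of OVS's ovs_list. */
struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *head) { head->prev = head->next = head; }

static void list_insert_tail(struct list_node *head, struct list_node *n)
{
    n->prev = head->prev; n->next = head;
    head->prev->next = n; head->prev = n;
}

static void list_remove(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

static size_t list_len(const struct list_node *head)
{
    size_t n = 0;
    for (const struct list_node *p = head->next; p != head; p = p->next) n++;
    return n;
}

/* Simplified connection: lives on an expiry list via an embedded node,
 * like conn->exp_node in lib/conntrack.c. */
struct conn { struct list_node exp_node; };

/* Error-path teardown: unlink before free. Freeing without the unlink
 * leaves the list pointing at freed memory, which is the reported crash. */
static void conn_destroy(struct conn *c)
{
    list_remove(&c->exp_node);
    free(c);
}
```

Both the submitted fix and Darrell's alternative follow this rule; they only differ in whether the unlink happens at the failure site or at the nat_res_exhaustion label.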


Re: [ovs-dev] [PATCH v2] ovn: Check for known logical switch port types.

2017-08-24 Thread Russell Bryant
On Thu, Aug 24, 2017 at 2:13 PM, Mark Michelson  wrote:
> OVN is lenient with the types of logical switch ports. Maybe too
> lenient. This patch attempts to solve this problem on two fronts:
>
> 1) In ovn-nbctl, if you attempt to set the port type to an unknown
> type, the command will not end up setting the type.
> 2) In northd, when copying the port type from the northbound database to
> the corresponding port-binding in the southbound database, a warning
> will be issued if the port is of an unknown type.
>
> Signed-off-by: Mark Michelson 
> ---
>  ovn/lib/ovn-util.c| 30 +++
>  ovn/lib/ovn-util.h|  2 ++
>  ovn/northd/ovn-northd.c   |  4 +++
>  ovn/utilities/ovn-nbctl.c |  7 -
>  tests/ovn-nbctl.at| 73 
> +++
>  5 files changed, 115 insertions(+), 1 deletion(-)
>

Did you look into whether it was possible to use an enum in the db
schema?  In particular, what would happen on an upgrade of an existing
database?  and what happens if the existing db has an invalid type in
it already?

I just want to make sure we clearly rule that out first.

> diff --git a/ovn/lib/ovn-util.c b/ovn/lib/ovn-util.c
> index 037d0798a..883ce4ccb 100644
> --- a/ovn/lib/ovn-util.c
> +++ b/ovn/lib/ovn-util.c
> @@ -299,3 +299,33 @@ default_sb_db(void)
>  }
>  return def;
>  }
> +
> +/* l3gateway, chassisredirect, and patch
> + * are not in this list since they are
> + * only set in the SB DB by northd
> + */
> +const char *OVN_NB_LSP_TYPES[] = {

I would add "static" here.

> +"router",
> +"localnet",
> +"localport",
> +"l2gateway",
> +"vtep",
> +};
> +
> +bool
> +ovn_is_known_nb_lsp_type(const char *type)
> +{
> +int i;
> +
> +if (!type || !type[0]) {
> +return true;
> +}
> +
> +for (i = 0; i < ARRAY_SIZE(OVN_NB_LSP_TYPES); ++i) {
> +if (!strcmp(OVN_NB_LSP_TYPES[i], type)) {
> +return true;
> +}
> +}
> +
> +return false;
> +}
> diff --git a/ovn/lib/ovn-util.h b/ovn/lib/ovn-util.h
> index b3d2125a3..9b456426d 100644
> --- a/ovn/lib/ovn-util.h
> +++ b/ovn/lib/ovn-util.h
> @@ -67,4 +67,6 @@ char *alloc_nat_zone_key(const struct uuid *key, const char 
> *type);
>  const char *default_nb_db(void);
>  const char *default_sb_db(void);
>
> +bool ovn_is_known_nb_lsp_type(const char *type);
> +
>  #endif
> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
> index 49e4ac338..177df5a1e 100644
> --- a/ovn/northd/ovn-northd.c
> +++ b/ovn/northd/ovn-northd.c
> @@ -1962,6 +1962,10 @@ ovn_port_update_sbrec(struct northd_context *ctx,
>  }
>  sbrec_port_binding_set_options(op->sb, &options);
>  smap_destroy(&options);
> +if (!ovn_is_known_nb_lsp_type(op->nbsp->type)) {
> +VLOG_WARN("Unknown port type '%s' set on logical switch 
> '%s'. "
> +  "Using anyway.", op->nbsp->type, op->nbsp->name);
> +}
>  sbrec_port_binding_set_type(op->sb, op->nbsp->type);
>  } else {
>  const char *chassis = NULL;

I tend to think we should ignore the port as if it didn't exist in
this case since it's an invalid configuration.

-- 
Russell Bryant
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2] ovn: Fix BFD error config on gateway

2017-08-24 Thread Russell Bryant
Thanks for the patch!  I applied this to master and branch-2.8.


On Wed, Aug 23, 2017 at 5:35 AM, Anil Venkata  wrote:
> Reviewed change and it looks fine. Also tested the test case.
>
> On Sun, Aug 20, 2017 at 8:07 PM, Zhenyu Gao  wrote:
>>
>> The bfd_calculate_chassis function calculates a gateway's peer datapaths
>> to figure out which tunnels' BFD should be enabled from the current
>> chassis.
>> The existing algorithm only calculates peer datapaths one hop away, but
>> multiple logical switches and E/W routers could be in the path, making
>> several hops which were not considered in the calculation.
>> It may disable BFD on some gw's tunnel ports. Then a port on a remote ovs
>> cannot send packets out because it believes all remote gateways are down.
>>
>> This patch goes through the whole graph and visits all datapath ports
>> which have a connection with gateways.
>>
>> Signed-off-by: Zhenyu Gao 
>> ---
>>
>>  v1->v2: Address Miguel's comments and add ovn test
>>
>>  ovn/controller/bfd.c | 102 +-
>>  tests/ovn.at | 203
>> ++-
>>  2 files changed, 286 insertions(+), 19 deletions(-)
>>
>> diff --git a/ovn/controller/bfd.c b/ovn/controller/bfd.c
>> index d1448b1..dccfc98 100644
>> --- a/ovn/controller/bfd.c
>> +++ b/ovn/controller/bfd.c
>> @@ -100,6 +100,88 @@ bfd_calculate_active_tunnels(const struct
>> ovsrec_bridge *br_int,
>>  }
>>  }
>>
>> +struct local_datapath_node {
>> +struct ovs_list node;
>> +struct local_datapath *dp;
>> +};
>> +
>> +static void
>> +bfd_travel_gw_related_chassis(struct local_datapath *dp,
>> +  const struct hmap *local_datapaths,
>> +  struct ovsdb_idl_index_cursor *cursor,
>> +  struct sbrec_port_binding *lpval,
>> +  struct sset *bfd_chassis)
>> +{
>> +struct ovs_list dp_list;
>> +const struct sbrec_port_binding *pb;
>> +struct sset visited_dp = SSET_INITIALIZER(&visited_dp);
>> +const char *dp_key;
>> +struct local_datapath_node *dp_binding;
>> +
>> +if (!(dp_key = smap_get(&dp->datapath->external_ids,
>> "logical-router")) &&
>> +!(dp_key = smap_get(&dp->datapath->external_ids,
>> "logical-switch"))) {
>> +VLOG_INFO("datapath has no uuid, cannot travel graph");
>> +return;
>> +}
>> +
>> +sset_add(&visited_dp, dp_key);
>> +
>> +ovs_list_init(&dp_list);
>> +dp_binding = xmalloc(sizeof *dp_binding);
>> +dp_binding->dp = dp;
>> +ovs_list_push_back(&dp_list, &dp_binding->node);
>> +
>> +/*
>> + * Go through whole graph to figure out all chassis which may deliver
>> + * packets to gateway. */
>> +do {
>> +dp_binding = CONTAINER_OF(ovs_list_pop_front(&dp_list),
>> +  struct local_datapath_node, node);
>> +dp = dp_binding->dp;
>> +free(dp_binding);
>> +for (size_t i = 0; i < dp->n_peer_dps; i++) {
>> +const struct sbrec_datapath_binding *pdp = dp->peer_dps[i];
>> +if (!pdp) {
>> +continue;
>> +}
>> +
>> +if (!(dp_key = smap_get(&pdp->external_ids,
>> "logical-router")) &&
>> +!(dp_key = smap_get(&pdp->external_ids,
>> "logical-switch"))) {
>> +continue;
>> +}
>> +
>> +if (sset_contains(&visited_dp, dp_key)) {
>> +continue;
>> +}
>> +
>> +sset_add(&visited_dp, dp_key);
>> +
>> +struct hmap_node *node =
>> hmap_first_with_hash(local_datapaths,
>> +
>> pdp->tunnel_key);
>> +if (!node) {
>> +continue;
>> +}
>> +
>> +dp_binding = xmalloc(sizeof *dp_binding);
>> +dp_binding->dp = CONTAINER_OF(node, struct local_datapath,
>> +  hmap_node);
>> +ovs_list_push_back(&dp_list, &dp_binding->node);
>> +
>> +sbrec_port_binding_index_set_datapath(lpval, pdp);
>> +SBREC_PORT_BINDING_FOR_EACH_EQUAL (pb, cursor, lpval) {
>> +if (pb->chassis) {
>> +const char *chassis_name = pb->chassis->name;
>> +if (chassis_name) {
>> +sset_add(bfd_chassis, chassis_name);
>> +}
>> +}
>> +}
>> +}
>> +} while (!ovs_list_is_empty(&dp_list));
>> +
>> +sset_destroy(&visited_dp);
>> +}
>> +
>>  static void
>>  bfd_calculate_chassis(struct controller_ctx *ctx,
>>const struct sbrec_chassis *our_chassis,
>> @@ -155,24 +237,8 @@ bfd_calculate_chassis(struct controller_ctx *ctx,
>>  }
>>  }
>>  if (our_chassis_is_gw_for_dp) {
>> -for (size_t i = 0; i < dp->n_peer_dps; i++) {
>> -const struct sbrec_datapath_binding *pdp =
>> dp->peer_dps[i];
>> -if 

Re: [ovs-dev] [PATCH] conntrack: Fix ct-clean thread crash bug

2017-08-24 Thread Darrell Ball
Thanks for testing.
I’ll look at it in detail and get back to you today.

Darrell

On 8/24/17, 3:36 AM, "ovs-dev-boun...@openvswitch.org on behalf of huanglili" 
 wrote:

From: Lili Huang 

Conn should be removed from the list before freed.

This crash will be triggered when a established flow do ct(nat)
again, like
"ip,actions=ct(table=1)
 table=1,in_port=1,ip,actions=ct(commit,nat(dst=5.5.5.5)),2
 table=1,in_port=2,ip,ct_state=+est,actions=1
 table=1,in_port=1,ip,ct_state=+est,actions=2"

Signed-off-by: Lili Huang 
---
 lib/conntrack.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 1c0e023..dd73e1a 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -779,6 +779,8 @@ conn_not_found(struct conntrack *ct, struct dp_packet 
*pkt,
ct, nc, conn_for_un_nat_copy);
 
 if (!nat_res) {
+ovs_list_remove(&nc->exp_node);
+ctx->conn = NULL;
 goto nat_res_exhaustion;
 }
 
-- 
1.8.3.1


___
dev mailing list
d...@openvswitch.org

https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev=DwICAg=uilaK90D4TOVoH58JNXRgQ=BVhFA09CGX7JQ5Ih-uZnsw=SFTrBlBXqEC4qsGMh3ikI8j9tYy0Wbb5Vt9FXlMqNDI=d9q2ZnEz8iZ2cJuZ5tAFpEMdeP45pFBeL_FmBSjyCv4=
 


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] OVN: IPv6 Periodic Router Advertisement Plan

2017-08-24 Thread Ben Pfaff
On Tue, Aug 22, 2017 at 09:43:29PM +, Mark Michelson wrote:
> For a while now, in a "learn the OVN project from scratch" project, I've
> been investigating what it would take in order to allow for OVN to send
> periodic router advertisements (RAs) as part of a greater goal of rounding
> out IPv6 support. I've had some discussions about individual parts of this
> with various people, and now I want to come forward with an actual proposal
> of how I envision accomplishing this.

Thanks.  I read through this plan a couple of times.  I don't know IPv6
well, but it sounds like a solid approach from an OVN point of view.  I
don't have any comments or suggestions at this time, and I want to
encourage you to keep going.

Thanks,

Ben.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 1/4] netdev: Add set_ingress_sched to netdev api

2017-08-24 Thread Kavanagh, Mark B


>From: O Mahony, Billy
>Sent: Wednesday, August 16, 2017 5:16 PM
>To: Kavanagh, Mark B ; d...@openvswitch.org
>Subject: RE: [ovs-dev] [PATCH 1/4] netdev: Add set_ingress_sched to netdev api
>
>Hi Mark,

Hey Billy,

Apologies for the delayed response - comments inline as usual.

Thanks,
Mark

>
>> -Original Message-
>> From: O Mahony, Billy
>> Sent: Wednesday, August 16, 2017 4:53 PM
>> To: Kavanagh, Mark B ; d...@openvswitch.org
>> Subject: RE: [ovs-dev] [PATCH 1/4] netdev: Add set_ingress_sched to netdev
>> api
>>
>> Hi Mark,
>>
>> I'm continuing with rework/rebase. Some comments below.
>>
>> > -Original Message-
>> > From: Kavanagh, Mark B
>> > Sent: Friday, August 4, 2017 3:49 PM
>> > To: O Mahony, Billy ; d...@openvswitch.org
>> > Subject: RE: [ovs-dev] [PATCH 1/4] netdev: Add set_ingress_sched to
>> > netdev api
>> >
>> > >From: ovs-dev-boun...@openvswitch.org
>> > >[mailto:ovs-dev-boun...@openvswitch.org]
>> > >On Behalf Of Billy O'Mahony
>> > >Sent: Thursday, July 20, 2017 5:11 PM
>> > >To: d...@openvswitch.org
>> > >Subject: [ovs-dev] [PATCH 1/4] netdev: Add set_ingress_sched to
>> > >netdev api
>> > >
>> > >Passes ingress_sched config item from other_config column of
>> > >Interface table to the netdev.
>> >
>> >
>> > Hi Billy,
>> >
>> > Thanks for the patch - some review comments inline.
>> >
>> > Cheers,
>> > Mark
>> >
>> > >
>> > >Signed-off-by: Billy O'Mahony 
>> > >---
>> > > lib/netdev-bsd.c  |  1 +
>> > > lib/netdev-dpdk.c | 19 +++
>> > > lib/netdev-dummy.c|  1 +
>> > > lib/netdev-linux.c|  1 +
>> > > lib/netdev-provider.h | 10 ++
>> > > lib/netdev-vport.c|  1 +
>> > > lib/netdev.c  | 22 ++
>> > > lib/netdev.h  |  1 +
>> > > vswitchd/bridge.c |  2 ++
>> > > 9 files changed, 58 insertions(+)
>> > >
>> > >diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index
>> > >6cc83d3..eadf7bf
>> > >100644
>> > >--- a/lib/netdev-bsd.c
>> > >+++ b/lib/netdev-bsd.c
>> > >@@ -1509,6 +1509,7 @@ netdev_bsd_update_flags(struct netdev
>> > *netdev_,
>> > >enum netdev_flags off,
>> > > netdev_bsd_get_etheraddr,\
>> > > netdev_bsd_get_mtu,  \
>> > > NULL, /* set_mtu */  \
>> > >+NULL, /* set_ingress_sched */\
>> > > netdev_bsd_get_ifindex,  \
>> > > netdev_bsd_get_carrier,  \
>> > > NULL, /* get_carrier_resets */   \
>> > >diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> > >ea17b97..e74c50f 100644
>> > >--- a/lib/netdev-dpdk.c
>> > >+++ b/lib/netdev-dpdk.c
>> > >@@ -369,6 +369,9 @@ struct netdev_dpdk {
>> > > /* If true, device was attached by rte_eth_dev_attach(). */
>> > > bool attached;
>> > >
>> > >+/* Ingress Scheduling config */
>> > >+char *ingress_sched_str;
>> >
>> > Would ingress_sched_cfg be more apt?
>> [[BO'M]] I find it useful to have the hint that this is a (human readable)
>string
>> as opposed to a struct.

Gotcha

>> >
>> > >+
>> > > /* In dpdk_list. */
>> > > struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
>> > >
>> > >@@ -1018,6 +1021,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
>> > > }
>> > >
>> > > free(dev->devargs);
>> > >+free(dev->ingress_sched_str);
>> >
>> > There is a bug here.
>> >
>> > In the case that a user doesn't set an ingress scheduling policy,
>> > netdev_dpdk's ingress_sched_str will not have been set. However, since
>> > it is not initialized/set to the NULL string anywhere in the code, it
>> > could potentially point to a random area of memory. Upon destruction
>> > of the port, the call to free(dev->ingress_sched_str) will free said
>> > memory, causing undesired behavior for any application/process using it.
>> >
>> [[BO'M]] I'm happy to put in a check here - generally in OVS I see that
>> checks for things that ought never to happen are not made. But
>> maybe that is just in cycle-critical packet handling code paths. The
>> ingress_sched_str ptr is set to NULL in common_construct() (that may be in
>> one of the other patches) so it will either be NULL or set to a malloc'd
>> location and should not be an issue. But TBH I'm happier with a check in
>> front of code like this too.

Yes, it's initialized to NULL in a later patch alright; best to introduce the 
check nonetheless (safety first and all that).

>> >
>> > > common_destruct(dev);
>> > >
>> > > ovs_mutex_unlock(&dpdk_mutex);
>> > >@@ -1941,6 +1945,20 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu)
>> > >}
>> > >
>> > > static int
>> > >+netdev_dpdk_set_ingress_sched(struct netdev *netdev,
>> > >+  const char *ingress_sched_str) {
>> > >+struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> > >+
>> > >+ 

[ovs-dev] [PATCH V2 6/6] datapath: fix skb_panic due to the incorrect actions attrlen

2017-08-24 Thread Greg Rose
Upstream commit:
commit 494bea39f3201776cdfddc232705f54a0bd210c4
Author: Liping Zhang 
Date:   Wed Aug 16 13:30:07 2017 +0800

openvswitch: fix skb_panic due to the incorrect actions attrlen

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
conversions, so its true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:8148fabf len:1749 put:157 head:
  881300f39000 data:881300f39000 tail:0x6d5 end:0x6c0 dev:
  [ cut here ]
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   
   [] skb_put+0x43/0x44
   [] skb_zerocopy+0x6c/0x1f4
   [] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [] output_userspace+0x132/0x158 [openvswitch]
   [] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [] ovs_execute_actions+0x74/0x106 [openvswitch]
   [] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [] ? key_extract+0x63c/0x8d5 [openvswitch]
   [] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much smaller than the orig_len:
  crash> struct sw_flow_actions 0x8812f539d000
  struct sw_flow_actions {
rcu = {
  next = 0x8812f5398800,
  func = 0xe3b00035db32
},
orig_len = 1384,
actions_len = 592,
actions = 0x8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relatively old kernel, but
the same risk still exists on the mainline, since we use the wrong
actions_len from the beginning.
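The sizing arithmetic behind the oops can be sketched with the numbers from the crash dump below. This is a simplified model using hypothetical helper names, not the kernel's `skbuff`/netlink API; `nla_total_size()` here just mirrors the usual header-plus-aligned-payload calculation:

```c
#include <assert.h>
#include <stddef.h>

#define NLA_HDRLEN 4u
#define NLA_ALIGN(len) (((len) + 3u) & ~3u)

size_t nla_total_size(size_t payload)
{
    return NLA_HDRLEN + NLA_ALIGN(payload);
}

/* Room reserved for OVS_PACKET_ATTR_ACTIONS by the buggy sizing: it uses
 * only the kernel-side actions_len. */
size_t reserved_buggy(size_t actions_len, size_t orig_len)
{
    (void)orig_len;
    return nla_total_size(actions_len);
}

/* The fix reserves room for orig_len, which bounds what the dump path can
 * actually emit after converting the actions for userspace. */
size_t reserved_fixed(size_t actions_len, size_t orig_len)
{
    (void)actions_len;
    return nla_total_size(orig_len);
}
```

With the values from the crash dump (actions_len = 592, orig_len = 1384), the buggy path reserves 596 bytes while the dump may write up to 1388, so `skb_put()` runs past the tail and triggers `skb_over_panic`.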

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee 
Signed-off-by: Liping Zhang 
Acked-by: Pravin B Shelar 
Signed-off-by: David S. Miller 

Fixes: 0e469d3b380c ("datapath: Include datapath actions with sampled-packet 
upcall to userspace.")
Signed-off-by: Greg Rose 
---
 datapath/actions.c  | 1 +
 datapath/datapath.c | 7 ---
 datapath/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/datapath/actions.c b/datapath/actions.c
index 59d91b2..ad18c2c 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -1348,6 +1348,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
goto out;
}
 
+   OVS_CB(skb)->acts_origlen = acts->orig_len;
err = do_execute_actions(dp, skb, key,
 acts->actions, acts->actions_len);
 
diff --git a/datapath/datapath.c b/datapath/datapath.c
index b565fc5..1780819 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -388,7 +388,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
- unsigned int hdrlen)
+ unsigned int hdrlen, int actions_attrlen)
 {
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -405,7 +405,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
/* OVS_PACKET_ATTR_ACTIONS */
if (upcall_info->actions_len)
-   size += nla_total_size(upcall_info->actions_len);
+   size += nla_total_size(actions_attrlen);
 
/* OVS_PACKET_ATTR_MRU */
if (upcall_info->mru)
@@ -472,7 +472,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
else
hlen = skb->len;
 
-   len = upcall_msg_size(upcall_info, hlen - cutlen);
+   len = upcall_msg_size(upcall_info, hlen - cutlen,
+ OVS_CB(skb)->acts_origlen);
user_skb = genlmsg_new(len, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
diff --git a/datapath/datapath.h b/datapath/datapath.h
index f20deed..70ad0ac 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -100,11 +100,13 @@ struct datapath {
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
  * fragmented.
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  */
 struct ovs_skb_cb {
struct vport *input_vport;
u16 

[ovs-dev] [PATCH V2 5/6] datapath: Remove unnecessary newlines from OVS_NLERR uses

2017-08-24 Thread Greg Rose
Upstream commit:
commit 0ed80da518a1f27562a013f106505e495e891fe4
Author: Joe Perches 
Date:   Fri Aug 11 04:26:26 2017 -0700

openvswitch: Remove unnecessary newlines from OVS_NLERR uses

OVS_NLERR already adds a newline so these just add blank
lines to the logging.

Signed-off-by: Joe Perches 
Acked-by: Joe Stringer 
Signed-off-by: David S. Miller 

Signed-off-by: Greg Rose 
---
 datapath/conntrack.c| 14 +-
 datapath/flow_netlink.c |  2 +-
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/datapath/conntrack.c b/datapath/conntrack.c
index b645ab1..d517a87 100644
--- a/datapath/conntrack.c
+++ b/datapath/conntrack.c
@@ -1221,15 +1221,13 @@ static int parse_nat(const struct nlattr *attr,
int type = nla_type(a);
 
if (type > OVS_NAT_ATTR_MAX) {
-   OVS_NLERR(log,
- "Unknown NAT attribute (type=%d, max=%d).\n",
+   OVS_NLERR(log, "Unknown NAT attribute (type=%d, max=%d)",
  type, OVS_NAT_ATTR_MAX);
return -EINVAL;
}
 
if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
-   OVS_NLERR(log,
- "NAT attribute type %d has unexpected length (%d != %d).\n",
+   OVS_NLERR(log, "NAT attribute type %d has unexpected length (%d != %d)",
  type, nla_len(a),
  ovs_nat_attr_lens[type][ip_vers]);
return -EINVAL;
@@ -1239,9 +1237,7 @@ static int parse_nat(const struct nlattr *attr,
case OVS_NAT_ATTR_SRC:
case OVS_NAT_ATTR_DST:
if (info->nat) {
-   OVS_NLERR(log,
- "Only one type of NAT may be specified.\n"
- );
+   OVS_NLERR(log, "Only one type of NAT may be specified");
return -ERANGE;
}
info->nat |= OVS_CT_NAT;
@@ -1291,13 +1287,13 @@ static int parse_nat(const struct nlattr *attr,
break;
 
default:
-   OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+   OVS_NLERR(log, "Unknown nat attribute (%d)", type);
return -EINVAL;
}
}
 
if (rem > 0) {
-   OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+   OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem);
return -EINVAL;
}
if (!info->nat) {
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index 9b48612..df9d88e 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -1257,7 +1257,7 @@ static int ovs_key_from_nlattrs(struct net *net, struct 
sw_flow_match *match,
}
 
if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF0)) {
-   OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x).\n",
+   OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x)",
  ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
return -EINVAL;
}
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH V2 4/6] datapath: Optimize operations for OvS flow_stats.

2017-08-24 Thread Greg Rose
Upstream commit:
commit c4b2bf6b4a35348fe6d1eb06928eb68d7b9d99a9
Author: Tonghao Zhang 
Date:   Mon Jul 17 23:28:06 2017 -0700

openvswitch: Optimize operations for OvS flow_stats.

When calling flow_free() to free a flow, we call cpumask_next() many
times (with cpu_possible_mask, e.g. 128 times by default). That takes
up CPU time if we call flow_free() frequently, e.g. when we send all
packets to userspace via upcall and OvS sends them back via netlink to
ovs_packet_cmd_execute() (which calls flow_free()).

The test topo is shown below. VM01 sends TCP packets to VM02,
and OvS forwards the packets. When testing, we use perf to report the
system performance.

VM01 --- OvS-VM --- VM02

Without this patch, perf-top shows the following; flow_free() is at
3.02% CPU usage.

4.23%  [kernel][k] _raw_spin_unlock_irqrestore
3.62%  [kernel][k] __do_softirq
3.16%  [kernel][k] __memcpy
3.02%  [kernel][k] flow_free
2.42%  libc-2.17.so[.] __memcpy_ssse3_back
2.18%  [kernel][k] copy_user_generic_unrolled
2.17%  [kernel][k] find_next_bit

With this patch applied, perf-top shows the following; flow_free() is
no longer on the list.

4.11%  [kernel][k] _raw_spin_unlock_irqrestore
3.79%  [kernel][k] __do_softirq
3.46%  [kernel][k] __memcpy
2.73%  libc-2.17.so[.] __memcpy_ssse3_back
2.25%  [kernel][k] copy_user_generic_unrolled
1.89%  libc-2.17.so[.] _int_malloc
1.53%  ovs-vswitchd[.] xlate_actions

With this patch, the TCP throughput (we don't use Megaflow Cache
+ Microflow Cache) between VMs goes from 1.18Gbps up to 1.30Gbps
(maybe a ~10% performance improvement).

This patch adds a cpumask struct; the cpu_used_mask stores the cpu_ids
that the flow has used. We then only check the flow_stats on the cpus
that were used, and it is unnecessary to check all possible cpus when
getting, cleaning, and updating the flow_stats. Adding the cpu_used_mask
to the sw_flow struct doesn't increase the cacheline number.
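The cpu_used_mask idea can be sketched with a plain 64-bit bitmask for up to 64 CPUs. This is an illustrative model only — the kernel uses `struct cpumask` and `cpumask_next()`, and the real stats live in RCU-protected per-CPU slots — but the saving is the same: readers visit only slots whose bit is set instead of every possible CPU:

```c
#include <assert.h>
#include <stdint.h>

enum { MAX_CPUS = 64 };   /* sketch limit; cpu_possible_mask can be larger */

typedef struct {
    uint64_t used_mask;                         /* bit per CPU that touched us */
    unsigned long long packet_count[MAX_CPUS];  /* stand-in for per-CPU stats */
} flow_stats_sketch;

/* Hot path: mark this CPU as used and bump its private counter. */
void stats_update(flow_stats_sketch *fs, unsigned cpu)
{
    fs->used_mask |= 1ULL << cpu;
    fs->packet_count[cpu]++;
}

/* Reader: sum stats visiting only CPUs whose bit is set.  'visited' reports
 * how many slots were actually examined, making the saving observable. */
unsigned long long stats_get(const flow_stats_sketch *fs, unsigned *visited)
{
    unsigned long long total = 0;
    *visited = 0;
    for (unsigned cpu = 0; cpu < MAX_CPUS; cpu++) {
        if (fs->used_mask & (1ULL << cpu)) {
            total += fs->packet_count[cpu];
            (*visited)++;
        }
    }
    return total;
}
```

On a 128-CPU box where a flow is only ever touched by two cores, the get/clear paths inspect two slots instead of 128, which is where the perf-top improvement above comes from.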

Signed-off-by: Tonghao Zhang 
Acked-by: Pravin B Shelar 
Signed-off-by: David S. Miller 

Signed-off-by: Greg Rose 
---
 datapath/flow.c   | 7 ---
 datapath/flow.h   | 2 ++
 datapath/flow_table.c | 4 +++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index 30e4d21..5da7e3e 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -71,7 +71,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int cpu = smp_processor_id();
+   unsigned int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
stats = rcu_dereference(flow->stats[cpu]);
@@ -116,6 +116,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 
rcu_assign_pointer(flow->stats[cpu],
   new_stats);
+   cpumask_set_cpu(cpu, &flow->cpu_used_mask);
goto unlock;
}
}
@@ -143,7 +144,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
/* We open code this to make sure cpu 0 is always considered */
-   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
@@ -167,7 +168,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
int cpu;
 
/* We open code this to make sure cpu 0 is always considered */
-   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+   for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
if (stats) {
diff --git a/datapath/flow.h b/datapath/flow.h
index 07af912..0796b09 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -218,6 +219,7 @@ struct sw_flow {
 */
struct sw_flow_key key;
struct sw_flow_id id;
+   struct cpumask cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct flow_stats __rcu *stats[];

[ovs-dev] [PATCH V2 3/6] datapath: Optimize updating for OvS flow_stats.

2017-08-24 Thread Greg Rose
Upstream commit:
commit c57c054eb5b1ccf230c49f736f7a018fcbc3e952
Author: Tonghao Zhang 
Date:   Mon Jul 17 23:28:05 2017 -0700

openvswitch: Optimize updating for OvS flow_stats.

In ovs_flow_stats_update(), we only use the node
var to alloc the flow_stats struct. But this is not a
common case, so it is unnecessary to call numa_node_id()
every time. This patch is not a bugfix, but there may be
a small performance increase.

Signed-off-by: Tonghao Zhang 
Signed-off-by: David S. Miller 

Signed-off-by: Greg Rose 
---
 datapath/flow.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index 599b4e5..30e4d21 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -71,7 +71,6 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int node = numa_node_id();
int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
@@ -107,7 +106,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
   __GFP_THISNODE |
   __GFP_NOWARN |
  __GFP_NOMEMALLOC,
- node);
+ numa_node_id());
if (likely(new_stats)) {
new_stats->used = jiffies;
new_stats->packet_count = 1;
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH V2 2/6] datapath: Remove all references to SKB_GSO_UDP.

2017-08-24 Thread Greg Rose
Upstream commit:
commit 880388aa3c07fdea4f9b85e35641753017b1852f
Author: David S. Miller 
Date:   Mon Jul 3 07:29:12 2017 -0700

net: Remove all references to SKB_GSO_UDP.

Such packets are no longer possible.

Signed-off-by: David S. Miller 

SKB_GSO_UDP is removed in the upstream kernel.  Use the HAVE_SKB_GSO_UDP
define from acinclude to detect if SKB_GSO_UDP exists and, if so, apply
the openvswitch section of this upstream patch.

Signed-off-by: Greg Rose 
---
 datapath/flow.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/datapath/flow.c b/datapath/flow.c
index c4f63b0..599b4e5 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -589,8 +589,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
key->ip.frag = OVS_FRAG_TYPE_LATER;
return 0;
}
+#ifdef HAVE_SKB_GSO_UDP
if (nh->frag_off & htons(IP_MF) ||
skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+#else
+   if (nh->frag_off & htons(IP_MF))
+#endif
key->ip.frag = OVS_FRAG_TYPE_FIRST;
else
key->ip.frag = OVS_FRAG_TYPE_NONE;
@@ -707,9 +711,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
if (key->ip.frag == OVS_FRAG_TYPE_LATER)
return 0;
+#ifdef HAVE_SKB_GSO_UDP
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
key->ip.frag = OVS_FRAG_TYPE_FIRST;
 
+#endif
/* Transport layer. */
if (key->ip.proto == NEXTHDR_TCP) {
if (tcphdr_ok(skb)) {
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH V2 1/6] acinclude: Check for SKB_GSO_UDP

2017-08-24 Thread Greg Rose
Removed in kernel 4.13

Signed-off-by: Greg Rose 
---
 acinclude.m4 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/acinclude.m4 b/acinclude.m4
index 74cc046..9bd1c49 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -763,6 +763,9 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
 [OVS_DEFINE([HAVE_VXLAN_DEV_CFG])])
   OVS_GREP_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_helper.h],
   [nf_conntrack_helper_put])
+  OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h],[SKB_GSO_UDP],
+  [OVS_DEFINE([HAVE_SKB_GSO_UDP])])
+
 
   if cmp -s datapath/linux/kcompat.h.new \
 datapath/linux/kcompat.h >/dev/null 2>&1; then
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

2017-08-24 Thread Gao Zhenyu
Thanks for the comments!

Yes, the best way is to calculate the cksum only if the destination needs it.
But in the current ovs-dpdk process, it is hard to tell whether a whole
batch of packets needs cksum or not when delivering to the destination.
Checking packets one by one (checking PKT_TX_L4_MASK and whether an L4
header is present) would introduce a regression in some use cases. (In a
previous email, Ciara ran a test on my first patch and saw about a 4%
regression in a pure packet-forwarding test.)

About offloading to a physical NIC, I did some testing on it and it
doesn't show a significant improvement, but it disables dpdk tx
vectorization (which may not be good for small packets). I prefer to
implement software cksum first and consider hardware offloading later.

The VM I use for testing is CentOS 7, kernel version
3.10.0-514.16.1.el7.x86_64. Supporting cksum has an additional benefit:
vhost-net can enable NETIF_F_SG (the scatter-gather feature).

2017-08-24 17:07 GMT+08:00 O Mahony, Billy :

> Hi Gao,
>
> Thanks for working on this. Lack of checksum offload is big difference
> between ovs and ovs-dpdk when using linux stack in the guest.
>
> The thing that struck me was that rather than immediately calculating the
> L4 checksum in the host on vhost rx that the calculation should be delayed
> until it's known to be absolutely required to be done on the host. If the
> packet is for another VM a checksum is not required as the bits are not
> going over a physical medium. And if the packets is destined for a NIC then
> the checksum can be offloaded if the NIC supports it.
>
> I'm not sure why doing the L4 sum in the guest should give a performance
> gain. The processing still has to be done. Maybe the guest code was
> compiled for an older architecture and is not using as efficient a set of
> instructions?
>
> In any case the best advantage of having dpdk virtio device  support
> offload is if it can further offload to a NIC or avoid cksum entirely if
> the packet is destined for a local VM.
>
> Thanks,
> Billy.
>
>
> > -Original Message-
> > From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-
> > boun...@openvswitch.org] On Behalf Of Gao Zhenyu
> > Sent: Wednesday, August 23, 2017 4:12 PM
> > To: Loftus, Ciara 
> > Cc: d...@openvswitch.org; us...@dpdk.org
> > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > cksum in ovs-dpdk side
> >
> > Yes, maintaining only one implementation is resonable.
> > However making ovs-dpdk to support vhost tx-cksum first is doable as
> well.
> > We can have it in ovs, and replace it with new DPDK API once ovs update
> its
> > dpdk version which contains the tx-cksum implementation.
> >
> >
> > Thanks
> > Zhenyu Gao
> >
> > 2017-08-23 21:59 GMT+08:00 Loftus, Ciara :
> >
> > > >
> > > > Hi Ciara
> > > >
> > > > You had a general concern below; can we conclude on that before
> > > > going further ?
> > > >
> > > > Thanks Darrell
> > > >
> > > > “
> > > > > On another note I have a general concern. I understand similar
> > > functionality
> > > > > is present in the DPDK vhost sample app. I wonder if it would be
> > > feasible
> > > > for
> > > > > this to be implemented in the DPDK vhost library and leveraged
> > > > > here,
> > > > rather
> > > > > than having two implementations in two separate code bases.
> > >
> > > This is something I'd like to see, although I wouldn't block on this
> > > patch waiting for it.
> > > Maybe we can have the initial implementation as it is (if it proves
> > > beneficial), then move to a common DPDK API if/when it becomes
> > available.
> > >
> > > I've cc'ed DPDK users list hoping for some input. To summarise:
> > > From my understanding, the DPDK vhost sample application calculates TX
> > > checksum for packets received from vHost ports with invalid/0
> checksums:
> > > http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910
> > > The patch being discussed in this thread (also here:
> > > https://patchwork.ozlabs.org/patch/802070/) it seems does something
> > > very similar.
> > > Wondering on the feasibility of putting this functionality in a
> > > rte_vhost library call such that we don't have two separate
> > implementations?
> > >
> > > Thanks,
> > > Ciara
> > >
> > > > >
> > > > > I have some other comments inline.
> > > > >
> > > > > Thanks,
> > > > > Ciara
> > > > “
> > > >
> > > >
> > > >
> > > > From: Gao Zhenyu 
> > > > Date: Wednesday, August 16, 2017 at 6:38 AM
> > > > To: "Loftus, Ciara" 
> > > > Cc: "b...@ovn.org" , "Chandran, Sugesh"
> > > > , "ktray...@redhat.com"
> > > > , Darrell Ball ,
> > > > "d...@openvswitch.org" 
> > > > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > > > cksum in ovs-dpdk side
> > > >
> > > > Hi Loftus,
> > > >I had submitted a new version, please see
> > > > 

Re: [ovs-dev] [PATCH] controller: Remote connection option to OpenFlow switch.

2017-08-24 Thread Lance Richardson
> From: "JaiSingh Rana" 
> To: "Jai Singh Rana" , d...@openvswitch.org
> Sent: Thursday, August 24, 2017 9:50:51 AM
> Subject: Re: [ovs-dev] [PATCH] controller: Remote connection option to
> OpenFlow switch.
> 
> Hi,
> 
> 
> Any feedback on this patch. Appreciate any suggestions for moving this
> forward.
> 

It has already been pointed out, but this patch makes a fundamental
assumption that ovn-controller and ovs-vswitchd can run on different hosts.
This isn't a good assumption, for example, ovn-controller implements QoS
features by configuring qdiscs on kernel netdevices corresponding to
interfaces on the ovs integration bridge.

Regards,

   Lance Richardson

> 
> Thanks,
> 
> Jai
> 
> 
> 
> From: Jai Singh Rana 
> Sent: 07 August 2017 20:48
> To: d...@openvswitch.org
> Cc: jai.r...@gmail.com; Rana, JaiSingh
> Subject: [PATCH] controller: Remote connection option to OpenFlow switch.
> 
> Currently ovn-controller uses default unix domain socket in Open
> vSwitch's "run" directory to connect to OpenFlow switch. If
> ovn-controller/ovsdb-server and vswitchd are running on different
> systems, remote connection method needs to be provided.
> 
> Added configuration option to override default connection by using OVSDB
> external-ids "ovn-ofswitch-remote". Using this external-id, desired tcp
> or unix connection to OpenFlow switch can be specified.
> 
> Tested this by using tcp/unix method configured through external-id
> "ovn-ofswitch-remote" and confirmed connection as flows getting updated
> in Open vSwitch.
> 
> Signed-off-by: Jai Singh Rana 
> ---
>  ovn/controller/ofctrl.c | 13 +++--
>  ovn/controller/ofctrl.h |  4 ++--
>  ovn/controller/ovn-controller.8.xml |  9 +
>  ovn/controller/ovn-controller.c | 37
>  ++---
>  ovn/controller/pinctrl.c|  5 +++--
>  ovn/controller/pinctrl.h|  3 ++-
>  6 files changed, 57 insertions(+), 14 deletions(-)
> 
> diff --git a/ovn/controller/ofctrl.c b/ovn/controller/ofctrl.c
> index fc88a41..c2ea1a5 100644
> --- a/ovn/controller/ofctrl.c
> +++ b/ovn/controller/ofctrl.c
> @@ -451,14 +451,15 @@ recv_S_UPDATE_FLOWS(const struct ofp_header *oh, enum
> ofptype type,
>  }
>  }
> 
> -/* Runs the OpenFlow state machine against 'br_int', which is local to the
> - * hypervisor on which we are running.  Attempts to negotiate a Geneve
> option
> - * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE.  If successful,
> - * returns the MFF_* field ID for the option, otherwise returns 0. */
> +/* Runs the OpenFlow state machine against 'ovn_ofswitch_remote', which can
> + * be local or remote to hypervisor on which we are running. Attempts to
> + * negotiate a Geneve option field for class OVN_GENEVE_CLASS,
> + * type OVN_GENEVE_TYPE. If successful, returns the MFF_* field ID for the
> + * option, otherwise returns 0. */
>  enum mf_field_id
> -ofctrl_run(const struct ovsrec_bridge *br_int, struct shash *pending_ct_zones)
> +ofctrl_run(struct shash *pending_ct_zones, char *ovn_ofswitch_remote)
>  {
> -char *target = xasprintf("unix:%s/%s.mgmt", ovs_rundir(), br_int->name);
> +char *target = xstrdup(ovn_ofswitch_remote);
>  if (strcmp(target, rconn_get_target(swconn))) {
>  VLOG_INFO("%s: connecting to switch", target);
>  rconn_connect(swconn, target, target);
> diff --git a/ovn/controller/ofctrl.h b/ovn/controller/ofctrl.h
> index d83f6ae..a23ed4c 100644
> --- a/ovn/controller/ofctrl.h
> +++ b/ovn/controller/ofctrl.h
> @@ -32,8 +32,8 @@ struct shash;
> 
>  /* Interface for OVN main loop. */
>  void ofctrl_init(struct group_table *group_table);
> -enum mf_field_id ofctrl_run(const struct ovsrec_bridge *br_int,
> -struct shash *pending_ct_zones);
> +enum mf_field_id ofctrl_run(struct shash *pending_ct_zones,
> +char *ovn_ofswitch_remote);
>  bool ofctrl_can_put(void);
>  void ofctrl_put(struct hmap *flow_table, struct shash *pending_ct_zones,
>  int64_t nb_cfg);
> diff --git a/ovn/controller/ovn-controller.8.xml
> b/ovn/controller/ovn-controller.8.xml
> index 5641abc..4cd68c4 100644
> --- a/ovn/controller/ovn-controller.8.xml
> +++ b/ovn/controller/ovn-controller.8.xml
> @@ -151,6 +151,15 @@
>  network interface card, enabling encapsulation checksum may incur
>  performance loss. In such cases, encapsulation checksums can be
>  disabled.
>
> +
> +  external_ids:ovn-ofswitch-remote
> +  
> +The OpenFlow connection method that this system can connect to reach
> +Open vSwitch using the unix or tcp forms
> +documented above for the ovs-database. If not configured,
> +default local unix domain socket file in the local Open vSwitch's
> +"run" directory is used.
> +  
>  

[ovs-dev] [PATCH] netdev-dpdk: use 64-bit arithmetic when converting rates

2017-08-24 Thread Lance Richardson
Force 64-bit arithmetic to be used when converting uint32_t rate
and burst parameters from kilobits per second to bytes per second,
avoiding incorrect behavior for rates exceeding UINT_MAX bits
per second.

Reported-by: "王志克" 
Fixes: 9509913aa722 ("netdev-dpdk.c: Add ingress-policing functionality.")
Signed-off-by: Lance Richardson 
---
Note: Found by inspection, untested.
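The overflow can be demonstrated in isolation. The sketch below mirrors just the conversion arithmetic (the parameter types match `netdev_dpdk_policer_construct`, but the helper names are invented for illustration): in the buggy form, `rate * 1000` is computed in 32-bit unsigned arithmetic and wraps for rates above ~4.29 Gbps before the result is widened.

```c
#include <assert.h>
#include <stdint.h>

/* 'rate_kbps' is kilobits per second, as in the OVS ingress-policing config. */
uint64_t rate_to_bytes_buggy(uint32_t rate_kbps)
{
    return rate_kbps * 1000 / 8;      /* multiply happens in 32 bits: wraps */
}

uint64_t rate_to_bytes_fixed(uint32_t rate_kbps)
{
    return rate_kbps * 1000ULL / 8;   /* promoted to 64 bits before multiply */
}
```

For a 5 Gbps policer (rate = 5000000 kbps), the fixed version yields 625,000,000 B/s, while the buggy version wraps to 88,129,088 B/s (~705 Mbps) — consistent with the "roughly 800 Mbps" behavior reported in the ingress-policing thread below. Rates at or below ~4.29 Gbps stay under UINT32_MAX after the multiply, which is why lower settings behaved correctly.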

 lib/netdev-dpdk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7e2..4e8aaa3d8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2229,8 +2229,8 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
 rte_spinlock_init(&policer->policer_lock);
 
 /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
-rate_bytes = rate * 1000/8;
-burst_bytes = burst * 1000/8;
+rate_bytes = rate * 1000ULL / 8;
+burst_bytes = burst * 1000ULL / 8;
 
 policer->app_srtcm_params.cir = rate_bytes;
 policer->app_srtcm_params.cbs = burst_bytes;
-- 
2.13.4

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [RFC PATCH 2/2] sflow: export extended NAT data

2017-08-24 Thread Michal Weglicki
With addition of sFlow output sampling, it is possible to export additional
data, which isn't available during input sampling. This patch introduces
support for exporting extend NAT data.

Signed-off-by: Przemyslaw Szczerbik 
---
 ofproto/ofproto-dpif-sflow.c | 37 +
 1 file changed, 37 insertions(+)

diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c
index b92f681..bd02bdd 100644
--- a/ofproto/ofproto-dpif-sflow.c
+++ b/ofproto/ofproto-dpif-sflow.c
@@ -878,6 +878,31 @@ dpif_sflow_tunnel_v4(uint8_t tunnel_ipproto,
 ipv4->dst_port = (OVS_FORCE uint16_t) tunnel->tp_dst;
 }
 
+static int
+dpif_sflow_ex_nat(const struct flow *flow, SFLExtended_nat *nat)
+
+{
+int ret = 0;
+uint16_t dl_type = ntohs(flow->dl_type);
+
+if (dl_type == ETH_TYPE_IP) {
+nat->src.type = SFLADDRESSTYPE_IP_V4;
+nat->dst.type = SFLADDRESSTYPE_IP_V4;
+nat->src.address.ip_v4.addr = flow->ct_nw_src;
+nat->dst.address.ip_v4.addr = flow->ct_nw_dst;
+} else if (dl_type == ETH_TYPE_IPV6) {
+nat->src.type = SFLADDRESSTYPE_IP_V6;
+nat->dst.type = SFLADDRESSTYPE_IP_V6;
+memcpy(nat->src.address.ip_v6.addr, flow->ct_ipv6_src.s6_addr, 16);
+memcpy(nat->dst.address.ip_v6.addr, flow->ct_ipv6_dst.s6_addr, 16);
+} else {
+/*XXX: Do we need to handle other cases? */
+ret = -1;
+}
+
+return ret;
+}
+
 static void
 dpif_sflow_push_mpls_lse(struct dpif_sflow_actions *sflow_actions,
  ovs_be32 lse)
@@ -1285,6 +1310,7 @@ dpif_sflow_received(struct dpif_sflow *ds, const struct dp_packet *packet,
 SFLFlow_sample_element hdrElem;
 SFLSampled_header *header;
 SFLFlow_sample_element switchElem;
+SFLFlow_sample_element natElem;
 uint8_t tnlInProto, tnlOutProto;
 SFLFlow_sample_element tnlInElem, tnlOutElem;
 SFLFlow_sample_element vniInElem, vniOutElem;
@@ -1343,6 +1369,14 @@ dpif_sflow_received(struct dpif_sflow *ds, const struct dp_packet *packet,
 
 fs.output = cookie->sflow.output;
 
+/* Add extended NAT element if src or dst address was mangled by NAT */
+bool add_ex_nat = flow->ct_state & OVS_CS_F_NAT_MASK;
+if (add_ex_nat) {
+memset(&natElem, 0, sizeof(natElem));
+natElem.tag = SFLFLOW_EX_NAT;
+add_ex_nat = !dpif_sflow_ex_nat(flow, &natElem);
+}
+
 /* Input tunnel. */
 if (flow->tunnel.ip_dst) {
 memset(&tnlInElem, 0, sizeof(tnlInElem));
@@ -1407,6 +1441,9 @@ dpif_sflow_received(struct dpif_sflow *ds, const struct dp_packet *packet,
 /* Submit the flow sample to be encoded into the next datagram. */
 SFLADD_ELEMENT(&fs, &hdrElem);
 SFLADD_ELEMENT(&fs, &switchElem);
+if (add_ex_nat) {
+    SFLADD_ELEMENT(&fs, &natElem);
+}
 sfl_sampler_writeFlowSample(sampler, &fs);
 
 out:
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [ovs-discuss] OVS+DPDK QoS rate limit issue

2017-08-24 Thread Lance Richardson
> From: "王志克" 
> To: ovs-dev@openvswitch.org, ovs-disc...@openvswitch.org
> Sent: Wednesday, August 23, 2017 11:41:05 PM
> Subject: [ovs-discuss] OVS+DPDK QoS rate limit issue
> 
> 
> 
> Hi All,
> 
> 
> 
> I am using OVS2.7.0 and DPDK 16.11, and testing rate limit function.
> 
> 
> 
> I found that if the policing_rate is set very large, say 5Gbps, the rate is
> limited dramatically to very low value, like 800Mbps.
> 
> The command is as below:
> 
> ovs-vsctl set interface port-7zel2so9sg ingress_policing_rate=500
> ingress_policing_burst=50
> 
> 
> 
> If we set the rate lower than 4Gbps, the rate is limited correctly.
> 
> 
> 
> Test setup:
> 
> Sender (DPDK pktGen) sends out about 10Gbps of UDP packets, with IP size
> about 1420 bytes.
> 
> The rate limit is set on VM vhost-user-client port.
> 
> 
> 
> Any idea about this issue? Is that known issue?
> 
> 

It seems 32-bit arithmetic is being used when converting the rate from
kilobits per second to bytes per second. Could you give this patch a try?

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1aaf6f7e2..d6ed2c7b0 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2229,8 +2229,8 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
     rte_spinlock_init(&policer->policer_lock);
 
     /* rte_meter requires bytes so convert kbits rate and burst to bytes. */
-    rate_bytes = rate * 1000/8;
-    burst_bytes = burst * 1000/8;
+    rate_bytes = rate * 1000ULL/8;
+    burst_bytes = burst * 1000ULL/8;
 
     policer->app_srtcm_params.cir = rate_bytes;
     policer->app_srtcm_params.cbs = burst_bytes;

Regards,

   Lance Richardson

> 
> Br,
> 
> Wang Zhike
> 
> ___
> discuss mailing list
> disc...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-discuss
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] conntrack: Fix ct-clean thread crash bug

2017-08-24 Thread huanglili
From: Lili Huang 

The conn should be removed from the list before being freed.

This crash is triggered when an established flow does ct(nat)
again, e.g.:
"ip,actions=ct(table=1)
 table=1,in_port=1,ip,actions=ct(commit,nat(dst=5.5.5.5)),2
 table=1,in_port=2,ip,ct_state=+est,actions=1
 table=1,in_port=1,ip,ct_state=+est,actions=2"

Signed-off-by: Lili Huang 
---
 lib/conntrack.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 1c0e023..dd73e1a 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -779,6 +779,8 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
ct, nc, conn_for_un_nat_copy);
 
 if (!nat_res) {
+ovs_list_remove(&nc->exp_node);
+ctx->conn = NULL;
 goto nat_res_exhaustion;
 }
 
-- 
1.8.3.1




Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

2017-08-24 Thread O Mahony, Billy
Hi Gao,

Thanks for working on this. The lack of checksum offload is a big difference
between ovs and ovs-dpdk when using the Linux stack in the guest.
 
What struck me was that rather than immediately calculating the L4 checksum
in the host on vhost rx, the calculation should be delayed until it's known
to be absolutely required to be done on the host. If the packet is for
another VM, a checksum is not required, as the bits are not going over a
physical medium. And if the packet is destined for a NIC, then the checksum
can be offloaded if the NIC supports it.

I'm not sure why doing the L4 sum in the guest should give a performance gain. 
The processing still has to be done. Maybe the guest code was compiled for an 
older architecture and is not using as efficient a set of instructions?

In any case, the biggest advantage of having the dpdk virtio device support
offload is that it can further offload to a NIC, or avoid the cksum entirely
if the packet is destined for a local VM.

Thanks,
Billy. 


> -Original Message-
> From: ovs-dev-boun...@openvswitch.org [mailto:ovs-dev-
> boun...@openvswitch.org] On Behalf Of Gao Zhenyu
> Sent: Wednesday, August 23, 2017 4:12 PM
> To: Loftus, Ciara 
> Cc: d...@openvswitch.org; us...@dpdk.org
> Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> cksum in ovs-dpdk side
> 
> Yes, maintaining only one implementation is reasonable.
> However, making ovs-dpdk support vhost tx-cksum first is doable as well.
> We can have it in ovs, and replace it with the new DPDK API once ovs updates
> its dpdk version to one which contains the tx-cksum implementation.
> 
> 
> Thanks
> Zhenyu Gao
> 
> 2017-08-23 21:59 GMT+08:00 Loftus, Ciara :
> 
> > >
> > > Hi Ciara
> > >
> > > You had a general concern below; can we conclude on that before
> > > going further ?
> > >
> > > Thanks Darrell
> > >
> > > “
> > > > On another note I have a general concern. I understand similar
> > functionality
> > > > is present in the DPDK vhost sample app. I wonder if it would be
> > feasible
> > > for
> > > > this to be implemented in the DPDK vhost library and leveraged
> > > > here,
> > > rather
> > > > than having two implementations in two separate code bases.
> >
> > This is something I'd like to see, although I wouldn't block on this
> > patch waiting for it.
> > Maybe we can have the initial implementation as it is (if it proves
> > beneficial), then move to a common DPDK API if/when it becomes
> available.
> >
> > I've cc'ed DPDK users list hoping for some input. To summarise:
> > From my understanding, the DPDK vhost sample application calculates TX
> > checksum for packets received from vHost ports with invalid/0 checksums:
> > http://dpdk.org/browse/dpdk/tree/examples/vhost/main.c#n910
> > The patch being discussed in this thread (also here:
> > https://patchwork.ozlabs.org/patch/802070/) it seems does something
> > very similar.
> > Wondering on the feasibility of putting this functionality in a
> > rte_vhost library call such that we don't have two separate
> implementations?
> >
> > Thanks,
> > Ciara
> >
> > > >
> > > > I have some other comments inline.
> > > >
> > > > Thanks,
> > > > Ciara
> > > “
> > >
> > >
> > >
> > > From: Gao Zhenyu 
> > > Date: Wednesday, August 16, 2017 at 6:38 AM
> > > To: "Loftus, Ciara" 
> > > Cc: "b...@ovn.org" , "Chandran, Sugesh"
> > > , "ktray...@redhat.com"
> > > , Darrell Ball ,
> > > "d...@openvswitch.org" 
> > > Subject: Re: [ovs-dev] [PATCH v1] netdev-dpdk: Implement TCP/UDP TX
> > > cksum in ovs-dpdk side
> > >
> > > Hi Loftus,
> > >I had submitted a new version, please see
> > > https://patchwork.ozlabs.org/patch/802070/
> > >It move the cksum to vhost receive side.
> > > Thanks
> > > Zhenyu Gao
> > >
> > > 2017-08-10 12:35 GMT+08:00 Gao Zhenyu :
> > > I see; for flows in a phy-phy setup, the cksum should not be calculated.
> > > I will revise my patch to do the cksum for vhost ports only. I will
> > > send a new patch next week.
> > >
> > > Thanks
> > > Zhenyu Gao
> > >
> > > 2017-08-08 17:53 GMT+08:00 Loftus, Ciara :
> > > >
> > > > Hi Loftus,
> > > >
> > > > Thanks for testing and the comments!
> > > > Can you show more details about your phy-vm-phy,phy-phy setup and
> > > > testing steps? Then I can reproduce it to see if I can solve this
> > > > pps
> > problem.
> > >
> > > You're welcome. I forgot to mention my tests were with 64B packets.
> > >
> > > For phy-phy the setup is a single host with 2 dpdk physical ports
> > > and 1
> > flow
> > > rule port1 -> port2.
> > > See figure 3 here:
> > > https://tools.ietf.org/html/draft-ietf-bmwg-vswitch-
> > > opnfv-04#section-4
> > >
> > > For the phy-vm-phy the setup is a single host with 2 dpdk physical
> > > ports
> > and 2
> > > 

Re: [ovs-dev] [PATCH v5 4/6] dpif-netdev: Change rxq_scheduling to use rxq processing cycles.

2017-08-24 Thread Darrell Ball


On 8/24/17, 1:47 AM, "Darrell Ball"  wrote:

There is a minor checkpatch warning


== Checking "/home/dball/Desktop/patches/ovs-dev-v5-4-6-dpif-netdev-Change-rxq_scheduling-to-use-rxq-processing-cycles..patch" ==
WARNING: Line lacks whitespace around operator
#170 FILE: lib/dpif-netdev.c:3456:
   Round-robin on the NUMA nodes that do have pmds. */

Lines checked: 204, Warnings: 1, Errors: 0

[Darrell] 
Maybe try
* Round-robin on the NUMA nodes that do have pmds. */


I marked it below

Ignore what I marked for the warning inline


On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:

Previously rxqs were assigned to pmds by round robin in
port/queue order.

Now that we have the processing cycles used for existing rxqs,
use that information to try to produce a better-balanced
distribution of rxqs across pmds, i.e. given multiple pmds, the
rxqs which have consumed the largest amount of processing cycles
will be placed on different pmds.

The rxqs are sorted by their processing cycles and assigned (in
sorted order) round robin across pmds.

Signed-off-by: Kevin Traynor 
---
 Documentation/howto/dpdk.rst |   7 +++
 lib/dpif-netdev.c| 123 
---
 2 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d7f6610..44737e4 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -119,4 +119,11 @@ After that PMD threads on cores where RX queues was pinned will become
   thread.
 
+If pmd-rxq-affinity is not set for rxqs, they will be assigned to pmds (cores)
+automatically. The processing cycles that have been required for each rxq
+will be used where known to assign rxqs with the highest consumption of
+processing cycles to different pmds.

‘will be used where known to assign rxqs to pmds based on round robin of 
the sorted rxqs’ ?


+
+Rxq to pmds assignment takes place whenever there are 
configuration changes.
+
 QoS
 ---
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5670c55..afbf591 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -712,4 +712,6 @@ static void
 dp_netdev_rxq_set_interval(struct dp_netdev_rxq *rx,
unsigned long long cycles);
+static uint64_t
+dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned idx);
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread 
*pmd,
@@ -3166,4 +3168,12 @@ dp_netdev_rxq_set_interval(struct 
dp_netdev_rxq *rx,
 }
 
+static uint64_t
+dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned idx)
+{
+unsigned long long tmp;
+atomic_read_relaxed(&rx->cycles_intrvl[idx], &tmp);
+return tmp;

Could we use something like ‘intrv_processing_cycles’ instead of ‘tmp’
Also _get_intrv_cycles ?; same forward comment I mentioned in patch 3


+}
+
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
@@ -3352,8 +3362,37 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
 }
 
+/* Sort Rx Queues by the processing cycles they are consuming. */
+static int
+rxq_cycle_sort(const void *a, const void *b)
+{
+struct dp_netdev_rxq * qa;
+struct dp_netdev_rxq * qb;
+uint64_t total_qa, total_qb;
+unsigned i;
+
+qa = *(struct dp_netdev_rxq **) a;
+qb = *(struct dp_netdev_rxq **) b;
+
+total_qa = total_qb = 0;
+for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+total_qa += dp_netdev_rxq_get_interval(qa, i);
+total_qb += dp_netdev_rxq_get_interval(qb, i);
+}
+dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa);
+dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb);
+
+if (total_qa >= total_qb) {
+return -1;
+}
+return 1;
+}
+
 /* Assign 

Re: [ovs-dev] [PATCH v5 4/6] dpif-netdev: Change rxq_scheduling to use rxq processing cycles.

2017-08-24 Thread Darrell Ball
There is a minor checkpatch warning


== Checking "/home/dball/Desktop/patches/ovs-dev-v5-4-6-dpif-netdev-Change-rxq_scheduling-to-use-rxq-processing-cycles..patch" ==
WARNING: Line lacks whitespace around operator
#170 FILE: lib/dpif-netdev.c:3456:
   Round-robin on the NUMA nodes that do have pmds. */

Lines checked: 204, Warnings: 1, Errors: 0

I marked it below


On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:

Previously rxqs were assigned to pmds by round robin in
port/queue order.

Now that we have the processing cycles used for existing rxqs,
use that information to try to produce a better-balanced
distribution of rxqs across pmds, i.e. given multiple pmds, the
rxqs which have consumed the largest amount of processing cycles
will be placed on different pmds.

The rxqs are sorted by their processing cycles and assigned (in
sorted order) round robin across pmds.

Signed-off-by: Kevin Traynor 
---
 Documentation/howto/dpdk.rst |   7 +++
 lib/dpif-netdev.c| 123 
---
 2 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index d7f6610..44737e4 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -119,4 +119,11 @@ After that PMD threads on cores where RX queues was pinned will become
   thread.
 
+If pmd-rxq-affinity is not set for rxqs, they will be assigned to pmds (cores)
+automatically. The processing cycles that have been required for each rxq
+will be used where known to assign rxqs with the highest consumption of
+processing cycles to different pmds.

‘will be used where known to assign rxqs to pmds based on round robin of the 
sorted rxqs’ ?


+
+Rxq to pmds assignment takes place whenever there are configuration 
changes.
+
 QoS
 ---
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5670c55..afbf591 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -712,4 +712,6 @@ static void
 dp_netdev_rxq_set_interval(struct dp_netdev_rxq *rx,
unsigned long long cycles);
+static uint64_t
+dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned idx);
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
@@ -3166,4 +3168,12 @@ dp_netdev_rxq_set_interval(struct dp_netdev_rxq 
*rx,
 }
 
+static uint64_t
+dp_netdev_rxq_get_interval(struct dp_netdev_rxq *rx, unsigned idx)
+{
+unsigned long long tmp;
+atomic_read_relaxed(&rx->cycles_intrvl[idx], &tmp);
+return tmp;

Could we use something like ‘intrv_processing_cycles’ instead of ‘tmp’
Also _get_intrv_cycles ?; same forward comment I mentioned in patch 3


+}
+
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
@@ -3352,8 +3362,37 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
 }
 
+/* Sort Rx Queues by the processing cycles they are consuming. */
+static int
+rxq_cycle_sort(const void *a, const void *b)
+{
+struct dp_netdev_rxq * qa;
+struct dp_netdev_rxq * qb;
+uint64_t total_qa, total_qb;
+unsigned i;
+
+qa = *(struct dp_netdev_rxq **) a;
+qb = *(struct dp_netdev_rxq **) b;
+
+total_qa = total_qb = 0;
+for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+total_qa += dp_netdev_rxq_get_interval(qa, i);
+total_qb += dp_netdev_rxq_get_interval(qb, i);
+}
+dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa);
+dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb);
+
+if (total_qa >= total_qb) {
+return -1;
+}
+return 1;
+}
+
 /* Assign pmds to queues.  If 'pinned' is true, assign pmds to pinned
  * queues and marks the pmds as isolated.  Otherwise, assign non 
isolated
  * pmds to unpinned queues.
  *
+ * If 'pinned' is false queues will be sorted by processing cycles 
they are
+ * consuming and then assigned to pmds in round robin order.
+ *
  * The function doesn't touch the pmd threads, it just stores the 
assignment
  * in the 'pmd' member of each rxq. */
@@ -3364,18 +3403,14 @@ rxq_scheduling(struct dp_netdev *dp, bool 
pinned) OVS_REQUIRES(dp->port_mutex)
 struct rr_numa_list 

Re: [ovs-dev] [PATCH v5 2/6] dpif-netdev: Add rxq processing cycle counters.

2017-08-24 Thread Darrell Ball


On 8/23/17, 6:34 AM, "Kevin Traynor"  wrote:

Add counters to dp_netdev_rxq which will later be used for storing the
processing cycles of an rxq. Processing cycles will be stored in reference
to a defined time interval. We will store the cycles of the current
in-progress interval, a number of completed intervals, and the sum of the
completed intervals.

cycles_count_intermediate was used to count cycles for a pmd. With some
small additions we can also use it to count the cycles used for processing
an rxq.

Signed-off-by: Kevin Traynor 
---
 lib/dpif-netdev.c | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f35c079..51d4050 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -182,4 +182,8 @@ struct emc_cache {
 #define DPCLS_OPTIMIZATION_INTERVAL 1000
 
+/* Number of intervals for which cycles are stored
+ * and used during rxq to pmd assignment. */
+#define PMD_RXQ_INTERVAL_MAX 6
+
 struct dpcls {
 struct cmap_node node;  /* Within 
dp_netdev_pmd_thread.classifiers */
@@ -340,4 +344,13 @@ enum pmd_cycles_counter_type {
 };
 
+enum rxq_cycles_counter_type {
+RXQ_CYCLES_PROC_CURR,   /* Cycles spent successfully polling 
and
+   processing packets during the 
current
+   interval. */
+RXQ_CYCLES_PROC_HIST,   /* Total cycles of all intervals that 
are used
+   during rxq to pmd assignment. */
+RXQ_N_CYCLES
+};
+
 #define XPS_TIMEOUT_MS 500LL
 
@@ -351,4 +364,9 @@ struct dp_netdev_rxq {
   particular core. */
 struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this 
queue. */
+
+/* Counters of cycles spent polling and processing packets. */

Do we need to specify ‘successfully’ polling to avoid confusion ?

+atomic_ullong cycles[RXQ_N_CYCLES];

Do you think the below deserves a separate description – ‘we store the last
PMD_RXQ_INTERVAL_MAX intervals and sum them up to yield the cycles used for
a given rxq…’ ?

+atomic_ullong cycles_intrvl[PMD_RXQ_INTERVAL_MAX];
+unsigned intrvl_idx;   /* Write index for 
'cycles_intrvl'. */
 };
 
@@ -677,5 +695,4 @@ static void pmd_load_cached_ports(struct 
dp_netdev_pmd_thread *pmd)
 static inline void
 dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd);
-
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
@@ -3092,4 +3109,5 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
 static inline void
 cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
+  struct dp_netdev_rxq *rxq,
   enum pmd_cycles_counter_type type)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -3100,4 +3118,8 @@ cycles_count_intermediate(struct 
dp_netdev_pmd_thread *pmd,
 
     non_atomic_ullong_add(&pmd->cycles.n[type], interval);
+if (rxq && (type == PMD_CYCLES_PROCESSING)) {
+/* Add to the amount of current processing cycles. */
+non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
+}
 }
 
@@ -3668,5 +3690,5 @@ dpif_netdev_run(struct dpif *dpif)
port->rxqs[i].rx,
port->port_no);
-cycles_count_intermediate(non_pmd, process_packets 
?
+cycles_count_intermediate(non_pmd, NULL, 
process_packets ?

PMD_CYCLES_PROCESSING
  : 
PMD_CYCLES_IDLE);
@@ -3855,5 +3877,5 @@ reload:
 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
poll_list[i].port_no);
-cycles_count_intermediate(pmd,
+cycles_count_intermediate(pmd, NULL,
   process_packets ? 
PMD_CYCLES_PROCESSING
   : 
PMD_CYCLES_IDLE);
-- 
1.8.3.1










Re: [ovs-dev] [PATCH 3/4] tc: Add header rewrite using tc pedit action

2017-08-24 Thread Paul Blakey



On 23/08/2017 03:20, Joe Stringer wrote:

On 22 August 2017 at 13:31, Paul Blakey  wrote:



On 22/08/2017 23:07, Paul Blakey wrote:




On 21/08/2017 20:45, Joe Stringer wrote:


On 20 August 2017 at 01:50, Paul Blakey  wrote:




On 18/08/2017 00:52, Joe Stringer wrote:



On 17 August 2017 at 02:18, Paul Blakey  wrote:





On 15/08/2017 20:04, Joe Stringer wrote:




On 15 August 2017 at 01:19, Paul Blakey  wrote:






On 15/08/2017 00:56, Joe Stringer wrote:





On 8 August 2017 at 07:21, Roi Dayan  wrote:





From: Paul Blakey 

@@ -117,6 +118,17 @@ struct tc_flower {
  uint64_t lastused;

  struct {
+bool rewrite;
+uint8_t pad1[3];
+struct tc_flower_key key;
+uint8_t pad2[3];
+struct tc_flower_key mask;
+uint8_t pad3[3];
+} rewrite;





Now that I get why the pads are here.. ;)

Is there an existing macro we can use to ensure that these pad out to
32-bit boundaries?



I'm not sure if that's possible; the padding is a minimum of an extra
24 bits, so a write at any address below it can't overflow the struct.
The compiler might add some extra padding, but that shouldn't matter.




My thought was that the main concern here is to align the fields to
32-bit boundaries, so if it already does this then I wouldn't expect
to need any padding at all? For instance, on my system with these
patches I see with pahole that 'struct tc_flower_key' has size 84, and
the 'pad2' and 'pad3' appear to be unnecessary:

   struct {
           _Bool                rewrite;            /*   216     1 */
           uint8_t              pad1[0];            /*   217     0 */
           struct tc_flower_key key;                /*   220    84 */
           /* --- cacheline 1 boundary (64 bytes) was 21 bytes ago --- */
           uint8_t              pad2[0];            /*   304     0 */
           struct tc_flower_key mask;               /*   308    84 */
           /* --- cacheline 2 boundary (128 bytes) was 41 bytes ago --- */
           uint8_t              pad3[0];            /*   392     0 */
   } rewrite;                                       /*   216   180 */

However, if in future someone adds a 1-byte member to tc_flower_key
then I'm not sure that this alignment will hold. On my system, it
seems like that would end up padding tc_flower_key out to 88B so these
extra padding would still be unnecessary (although pahole says that it
has a brain fart, so I'm not sure exactly how much confidence I should
have in this).

What I was thinking about was whether we could use something like
PADDED_MEMBERS() in this case.



Are you talking about alignment with regard to performance?
Because the padding is there to guard against overflowing.
If someone adds a new member smaller than 32 bits to the end of struct
key/mask, setting it via a pointer might overflow the struct. I don't
think PADDED_MEMBERS will work for this type of padding.

We do mask the write to the write size, and it should still be in memory
owned by struct flower, and since the compiler can't reorder the struct
we actually only need the last padding to cover the above case.

Maybe we can add alignment when we measure the performance of the code?



Ah. I wasn't concerned about performance, I was just wondering if this
forces us to add a few extra unnecessary bytes to 'rewrite' regardless
of the size of 'struct tc_flower'. For instance, if 'struct tc_flower'
is 63B long because someone adds a small field to the end of the
structure, then I'd expect to need only one extra byte of padding. If
it were a multiple of 32 bits, I wouldn't expect any need for padding
at all.



I see. You're right, but keep in mind that this struct is on the stack, so
a couple of bytes shouldn't matter much. I'm not sure how to define a macro
that adds padding based on the last member in the struct unless I define
it like PADDED_MEMBERS but with the last member isolated from the rest
(e.g. PAD_TO_LAST(unit, members, last_member)) so I can test its size.
A build-time assert would be the same.

Since we mask the writes/reads, maybe we can assert that reading 32 bits
(or actually 24 bits) past the struct rewrite stays within struct flower
bounds. Basically, that no one moved it to the end of struct flower.

pseudo code: BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) +
sizeof(uint32_t) < sizeof(struct flower))



BUILD_DECL_ASSERT(member_offsetof(flower.rewrite) + sizeof(flower.rewrite) +
sizeof(uint32_t) -1 < sizeof(struct flower))


Could we just check that (member_offsetof(flower.rewrite) +
sizeof(flower.rewrite)) % sizeof(uint32_t) == 0 ?

Also for the flower.rewrite.key and flower.rewrite.mask ?



Maybe I'm missing something but that doesn't check that we wouldn't 
spill over to outside of flower if someone defines rewrite (moves the 
rewrite struct)