[net-next v5 4/4] openvswitch: Add meter action support

2017-11-10 Thread Andy Zhou
Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 3 +++
 net/openvswitch/actions.c| 6 ++
 net/openvswitch/datapath.h   | 1 +
 net/openvswitch/flow_netlink.c   | 6 ++
 4 files changed, 16 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d60b9a4cf3d1..4265d7f9e1f2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -838,6 +838,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
  * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -870,6 +872,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
OVS_ACTION_ATTR_PUSH_NSH, /* Nested OVS_NSH_KEY_ATTR_*. */
OVS_ACTION_ATTR_POP_NSH,  /* No argument. */
+   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 9a6a6d51e421..30a5df27116e 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1330,6 +1330,12 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_NSH:
err = pop_nsh(skb, key);
break;
+
+   case OVS_ACTION_ATTR_METER:
+   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+   consume_skb(skb);
+   return 0;
+   }
}
 
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d2997b42460..523d65526766 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 4201f9293af3..bb4dae198c78 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -90,6 +90,7 @@ static bool actions_may_change_flow(const struct nlattr 
*actions)
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
+   case OVS_ACTION_ATTR_METER:
default:
return true;
}
@@ -2844,6 +2845,7 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
[OVS_ACTION_ATTR_POP_ETH] = 0,
[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
[OVS_ACTION_ATTR_POP_NSH] = 0,
+   [OVS_ACTION_ATTR_METER] = sizeof(u32),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3029,6 +3031,10 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
break;
}
 
+   case OVS_ACTION_ATTR_METER:
+   /* Non-existent meters are simply ignored.  */
+   break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
-- 
1.8.3.1



[net-next v5 3/4] openvswitch: Add meter infrastructure

2017-11-10 Thread Andy Zhou
OVS kernel datapath so far does not support Openflow meter action.
This is the first stab at adding kernel datapath meter support.
This implementation supports only drop band type.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/Makefile   |   1 +
 net/openvswitch/datapath.c |  14 +-
 net/openvswitch/datapath.h |   3 +
 net/openvswitch/meter.c| 604 +
 net/openvswitch/meter.h|  54 
 5 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 299f4476cf44..41109c326f3a 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -12,6 +12,7 @@ openvswitch-y := \
flow.o \
flow_netlink.o \
flow_table.o \
+   meter.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6e098035bb8f..0dab33fb9844 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
 #include "flow.h"
 #include "flow_table.h"
 #include "flow_netlink.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
ovs_flow_tbl_destroy(>table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
+   ovs_meters_exit(dp);
kfree(dp);
 }
 
@@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(>ports[i]);
 
+   err = ovs_meters_init(dp);
+   if (err)
+   goto err_destroy_ports_array;
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
@@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
 
-   goto err_destroy_ports_array;
+   goto err_destroy_meters;
}
 
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_notify(_datapath_genl_family, reply, info);
return 0;
 
-err_destroy_ports_array:
+err_destroy_meters:
ovs_unlock();
+   ovs_meters_exit(dp);
+err_destroy_ports_array:
kfree(dp->ports);
 err_destroy_percpu:
free_percpu(dp->stats_percpu);
@@ -2265,6 +2273,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = 
{
_vport_genl_family,
_flow_genl_family,
_packet_genl_family,
+   _meter_genl_family,
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2445,3 +2454,4 @@ static void dp_cleanup(void)
 MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 954c4ed465a5..5d2997b42460 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -92,6 +92,9 @@ struct datapath {
u32 user_features;
 
u32 max_headroom;
+
+   /* Switch meters. */
+   struct hlist_head *meters;
 };
 
 /**
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
new file mode 100644
index ..2a5ba356c472
--- /dev/null
+++ b/net/openvswitch/meter.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2017 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "datapath.h"
+#include "meter.h"
+
+#define METER_HASH_BUCKETS 1024
+
+static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
+   [OVS_METER_ATTR_ID] = { .type = NLA_U32, },
+   [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
+   [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED },
+   [OVS_METER_ATTR_USED] = { .type = NLA_U64 },
+   [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 },
+   [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
+   [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, },
+   [OVS_BAND_ATTR_RATE] = { .type = 

[net-next v5 2/4] openvswitch: export get_dp() API.

2017-11-10 Thread Andy Zhou
Later patches will invoke get_dp() outside of datapath.c. Export it.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/datapath.c | 29 -
 net/openvswitch/datapath.h | 31 +++
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d38ac044cee..6e098035bb8f 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *,
  const struct dp_upcall_info *,
  uint32_t cutlen);
 
-/* Must be called with rcu_read_lock. */
-static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
-{
-   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
-
-   if (dev) {
-   struct vport *vport = ovs_internal_dev_get_vport(dev);
-   if (vport)
-   return vport->dp;
-   }
-
-   return NULL;
-}
-
-/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
- * returned dp pointer valid.
- */
-static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
-{
-   struct datapath *dp;
-
-   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
-   rcu_read_lock();
-   dp = get_dp_rcu(net, dp_ifindex);
-   rcu_read_unlock();
-
-   return dp;
-}
-
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4a104ef9e12c..954c4ed465a5 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
 #define DP_VPORT_HASH_BUCKETS  1024
@@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct 
datapath *dp, int port_n
return ovs_lookup_vport(dp, port_no);
 }
 
+/* Must be called with rcu_read_lock. */
+static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+   if (dev) {
+   struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+   if (vport)
+   return vport->dp;
+   }
+
+   return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+   struct datapath *dp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+   rcu_read_lock();
+   dp = get_dp_rcu(net, dp_ifindex);
+   rcu_read_unlock();
+
+   return dp;
+}
+
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-- 
1.8.3.1



[net-next v5 0/4] Openvswitch meter action

2017-11-10 Thread Andy Zhou
This patch series is the first attempt to add openvswitch
meter support. We have previously experimented with adding
metering support in nftables. However 1) It was not clear
how to expose a named nftables object cleanly, and 2)
the logic that implements metering is quite small, < 100 lines
of code.

With those two observations, it seems cleaner to add meter
support in the openvswitch module directly.

---

v1(RFC)->v2:  remove unused code improve locking
  and other review comments
v2 -> v3: rebase
v3 -> v4: fix undefined "__udivdi3" references on 32 bit builds.
  use div_u64() instead.
v4 -> v5: rebase

Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1

*** BLURB HERE ***

Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1



[net-next v5 1/4] openvswitch: Add meter netlink definitions

2017-11-10 Thread Andy Zhou
Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 51 
 1 file changed, 51 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index ec75a685f1dd..d60b9a4cf3d1 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -883,4 +883,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+   OVS_METER_CMD_UNSPEC,
+   OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */
+   OVS_METER_CMD_SET,  /* Add or modify a meter. */
+   OVS_METER_CMD_DEL,  /* Delete a meter. */
+   OVS_METER_CMD_GET   /* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+   OVS_METER_ATTR_UNSPEC,
+   OVS_METER_ATTR_ID,  /* u32 meter ID within datapath. */
+   OVS_METER_ATTR_KBPS,/* No argument. If set, units in kilobits
+* per second. Otherwise, units in
+* packets per second.
+*/
+   OVS_METER_ATTR_STATS,   /* struct ovs_flow_stats for the meter. */
+   OVS_METER_ATTR_BANDS,   /* Nested attributes for meter bands. */
+   OVS_METER_ATTR_USED,/* u64 msecs last used in monotonic time. */
+   OVS_METER_ATTR_CLEAR,   /* Flag to clear stats, used. */
+   OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+   OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+   OVS_METER_ATTR_PAD,
+   __OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+   OVS_BAND_ATTR_UNSPEC,
+   OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */
+   OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */
+   OVS_BAND_ATTR_BURST,/* u32 burst size in meter units. */
+   OVS_BAND_ATTR_STATS,/* struct ovs_flow_stats for the band. */
+   __OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+   OVS_METER_BAND_TYPE_UNSPEC,
+   OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+   __OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
1.8.3.1



Re: [net-next v4 4/4] openvswitch: Add meter action support

2017-11-10 Thread Andy Zhou
On Thu, Nov 9, 2017 at 12:50 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Nov 9, 2017 at 11:00 AM, Andy Zhou <az...@ovn.org> wrote:
>> Implements OVS kernel meter action support.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
> I could not apply this patch on latest net-next.
I will rebase and repost.


[net-next v4 4/4] openvswitch: Add meter action support

2017-11-08 Thread Andy Zhou
Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 3 +++
 net/openvswitch/actions.c| 6 ++
 net/openvswitch/datapath.h   | 1 +
 net/openvswitch/flow_netlink.c   | 6 ++
 4 files changed, 16 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 368ce7d3ecdc..d6a8f3eaf433 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -811,6 +811,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -841,6 +843,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
+   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a551232daf61..df342fe66492 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1214,6 +1214,12 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
+
+   case OVS_ACTION_ATTR_METER:
+   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+   consume_skb(skb);
+   return 0;
+   }
}
 
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d2997b42460..523d65526766 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dc0d79092e74..6789bb1ddf0e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -87,6 +87,7 @@ static bool actions_may_change_flow(const struct nlattr 
*actions)
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
+   case OVS_ACTION_ATTR_METER:
default:
return true;
}
@@ -2533,6 +2534,7 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct 
ovs_action_trunc),
[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct 
ovs_action_push_eth),
[OVS_ACTION_ATTR_POP_ETH] = 0,
+   [OVS_ACTION_ATTR_METER] = sizeof(u32),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2690,6 +2692,10 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
mac_proto = MAC_PROTO_ETHERNET;
break;
 
+   case OVS_ACTION_ATTR_METER:
+   /* Non-existent meters are simply ignored.  */
+   break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
-- 
1.8.3.1



[net-next v4 3/4] openvswitch: Add meter infrastructure

2017-11-08 Thread Andy Zhou
OVS kernel datapath so far does not support Openflow meter action.
This is the first stab at adding kernel datapath meter support.
This implementation supports only drop band type.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/Makefile   |   1 +
 net/openvswitch/datapath.c |  14 +-
 net/openvswitch/datapath.h |   3 +
 net/openvswitch/meter.c| 604 +
 net/openvswitch/meter.h|  54 
 5 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 299f4476cf44..41109c326f3a 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -12,6 +12,7 @@ openvswitch-y := \
flow.o \
flow_netlink.o \
flow_table.o \
+   meter.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6e098035bb8f..0dab33fb9844 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
 #include "flow.h"
 #include "flow_table.h"
 #include "flow_netlink.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
ovs_flow_tbl_destroy(>table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
+   ovs_meters_exit(dp);
kfree(dp);
 }
 
@@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(>ports[i]);
 
+   err = ovs_meters_init(dp);
+   if (err)
+   goto err_destroy_ports_array;
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
@@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
 
-   goto err_destroy_ports_array;
+   goto err_destroy_meters;
}
 
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_notify(_datapath_genl_family, reply, info);
return 0;
 
-err_destroy_ports_array:
+err_destroy_meters:
ovs_unlock();
+   ovs_meters_exit(dp);
+err_destroy_ports_array:
kfree(dp->ports);
 err_destroy_percpu:
free_percpu(dp->stats_percpu);
@@ -2265,6 +2273,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = 
{
_vport_genl_family,
_flow_genl_family,
_packet_genl_family,
+   _meter_genl_family,
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2445,3 +2454,4 @@ static void dp_cleanup(void)
 MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 954c4ed465a5..5d2997b42460 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -92,6 +92,9 @@ struct datapath {
u32 user_features;
 
u32 max_headroom;
+
+   /* Switch meters. */
+   struct hlist_head *meters;
 };
 
 /**
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
new file mode 100644
index ..2a5ba356c472
--- /dev/null
+++ b/net/openvswitch/meter.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2017 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "datapath.h"
+#include "meter.h"
+
+#define METER_HASH_BUCKETS 1024
+
+static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
+   [OVS_METER_ATTR_ID] = { .type = NLA_U32, },
+   [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
+   [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED },
+   [OVS_METER_ATTR_USED] = { .type = NLA_U64 },
+   [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 },
+   [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
+   [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, },
+   [OVS_BAND_ATTR_RATE] = { .type = 

[net-next v4 2/4] openvswitch: export get_dp() API.

2017-11-08 Thread Andy Zhou
Later patches will invoke get_dp() outside of datapath.c. Export it.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/datapath.c | 29 -
 net/openvswitch/datapath.h | 31 +++
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d38ac044cee..6e098035bb8f 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *,
  const struct dp_upcall_info *,
  uint32_t cutlen);
 
-/* Must be called with rcu_read_lock. */
-static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
-{
-   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
-
-   if (dev) {
-   struct vport *vport = ovs_internal_dev_get_vport(dev);
-   if (vport)
-   return vport->dp;
-   }
-
-   return NULL;
-}
-
-/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
- * returned dp pointer valid.
- */
-static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
-{
-   struct datapath *dp;
-
-   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
-   rcu_read_lock();
-   dp = get_dp_rcu(net, dp_ifindex);
-   rcu_read_unlock();
-
-   return dp;
-}
-
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4a104ef9e12c..954c4ed465a5 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
 #define DP_VPORT_HASH_BUCKETS  1024
@@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct 
datapath *dp, int port_n
return ovs_lookup_vport(dp, port_no);
 }
 
+/* Must be called with rcu_read_lock. */
+static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+   if (dev) {
+   struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+   if (vport)
+   return vport->dp;
+   }
+
+   return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+   struct datapath *dp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+   rcu_read_lock();
+   dp = get_dp_rcu(net, dp_ifindex);
+   rcu_read_unlock();
+
+   return dp;
+}
+
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-- 
1.8.3.1



[net-next v4 1/4] openvswitch: Add meter netlink definitions

2017-11-08 Thread Andy Zhou
Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 51 
 1 file changed, 51 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 501e4c4e2a03..368ce7d3ecdc 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -854,4 +854,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+   OVS_METER_CMD_UNSPEC,
+   OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */
+   OVS_METER_CMD_SET,  /* Add or modify a meter. */
+   OVS_METER_CMD_DEL,  /* Delete a meter. */
+   OVS_METER_CMD_GET   /* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+   OVS_METER_ATTR_UNSPEC,
+   OVS_METER_ATTR_ID,  /* u32 meter ID within datapath. */
+   OVS_METER_ATTR_KBPS,/* No argument. If set, units in kilobits
+* per second. Otherwise, units in
+* packets per second.
+*/
+   OVS_METER_ATTR_STATS,   /* struct ovs_flow_stats for the meter. */
+   OVS_METER_ATTR_BANDS,   /* Nested attributes for meter bands. */
+   OVS_METER_ATTR_USED,/* u64 msecs last used in monotonic time. */
+   OVS_METER_ATTR_CLEAR,   /* Flag to clear stats, used. */
+   OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+   OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+   OVS_METER_ATTR_PAD,
+   __OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+   OVS_BAND_ATTR_UNSPEC,
+   OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */
+   OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */
+   OVS_BAND_ATTR_BURST,/* u32 burst size in meter units. */
+   OVS_BAND_ATTR_STATS,/* struct ovs_flow_stats for the band. */
+   __OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+   OVS_METER_BAND_TYPE_UNSPEC,
+   OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+   __OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
1.8.3.1



[net-next v3 0/4] Openvswitch meter action

2017-11-08 Thread Andy Zhou
This patch series is the first attempt to add openvswitch
meter support. We have previously experimented with adding
metering support in nftables. However 1) It was not clear
how to expose a named nftables object cleanly, and 2)
the logic that implements metering is quite small, < 100 lines
of code.

With those two observations, it seems cleaner to add meter
support in the openvswitch module directly.

---

v1(RFC)->v2:  remove unused code improve locking
  and other review comments
v2 -> v3: rebase
v3 -> v4: fix undefined "__udivdi3" references on 32 bit builds.
  use div_u64() instead.


Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1

*** BLURB HERE ***

Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1



[net-next v3 2/4] openvswitch: export get_dp() API.

2017-11-06 Thread Andy Zhou
Later patches will invoke get_dp() outside of datapath.c. Export it.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/datapath.c | 29 -
 net/openvswitch/datapath.h | 31 +++
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4d38ac044cee..6e098035bb8f 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *,
  const struct dp_upcall_info *,
  uint32_t cutlen);
 
-/* Must be called with rcu_read_lock. */
-static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
-{
-   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
-
-   if (dev) {
-   struct vport *vport = ovs_internal_dev_get_vport(dev);
-   if (vport)
-   return vport->dp;
-   }
-
-   return NULL;
-}
-
-/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
- * returned dp pointer valid.
- */
-static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
-{
-   struct datapath *dp;
-
-   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
-   rcu_read_lock();
-   dp = get_dp_rcu(net, dp_ifindex);
-   rcu_read_unlock();
-
-   return dp;
-}
-
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4a104ef9e12c..954c4ed465a5 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
 #define DP_VPORT_HASH_BUCKETS  1024
@@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct 
datapath *dp, int port_n
return ovs_lookup_vport(dp, port_no);
 }
 
+/* Must be called with rcu_read_lock. */
+static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+   if (dev) {
+   struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+   if (vport)
+   return vport->dp;
+   }
+
+   return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+   struct datapath *dp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+   rcu_read_lock();
+   dp = get_dp_rcu(net, dp_ifindex);
+   rcu_read_unlock();
+
+   return dp;
+}
+
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-- 
1.8.3.1



[net-next v3 1/4] openvswitch: Add meter netlink definitions

2017-11-06 Thread Andy Zhou
Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 51 
 1 file changed, 51 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 501e4c4e2a03..368ce7d3ecdc 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -854,4 +854,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+   OVS_METER_CMD_UNSPEC,
+   OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */
+   OVS_METER_CMD_SET,  /* Add or modify a meter. */
+   OVS_METER_CMD_DEL,  /* Delete a meter. */
+   OVS_METER_CMD_GET   /* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+   OVS_METER_ATTR_UNSPEC,
+   OVS_METER_ATTR_ID,  /* u32 meter ID within datapath. */
+   OVS_METER_ATTR_KBPS,/* No argument. If set, units in kilobits
+* per second. Otherwise, units in
+* packets per second.
+*/
+   OVS_METER_ATTR_STATS,   /* struct ovs_flow_stats for the meter. */
+   OVS_METER_ATTR_BANDS,   /* Nested attributes for meter bands. */
+   OVS_METER_ATTR_USED,/* u64 msecs last used in monotonic time. */
+   OVS_METER_ATTR_CLEAR,   /* Flag to clear stats, used. */
+   OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+   OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+   OVS_METER_ATTR_PAD,
+   __OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+   OVS_BAND_ATTR_UNSPEC,
+   OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */
+   OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */
+   OVS_BAND_ATTR_BURST,/* u32 burst size in meter units. */
+   OVS_BAND_ATTR_STATS,/* struct ovs_flow_stats for the band. */
+   __OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+   OVS_METER_BAND_TYPE_UNSPEC,
+   OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+   __OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
1.8.3.1



[net-next v3 0/4] Openvswitch meter action

2017-11-06 Thread Andy Zhou
This patch series is the first attempt to add openvswitch
meter support. We have previously experimented with adding
metering support in nftables. However 1) It was not clear
how to expose a named nftables object cleanly, and 2)
the logic that implements metering is quite small, < 100 lines
of code.

With those two observations, it seems cleaner to add meter
support in the openvswitch module directly.

---

v1(RFC)->v2:  remove unused code improve locking
  and other review comments
v2 -> v3: rebase


Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1



[net-next v3 3/4] openvswitch: Add meter infrastructure

2017-11-06 Thread Andy Zhou
OVS kernel datapath so far does not support Openflow meter action.
This is the first stab at adding kernel datapath meter support.
This implementation supports only drop band type.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/Makefile   |   1 +
 net/openvswitch/datapath.c |  14 +-
 net/openvswitch/datapath.h |   3 +
 net/openvswitch/meter.c| 604 +
 net/openvswitch/meter.h|  54 
 5 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 299f4476cf44..41109c326f3a 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -12,6 +12,7 @@ openvswitch-y := \
flow.o \
flow_netlink.o \
flow_table.o \
+   meter.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 6e098035bb8f..0dab33fb9844 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
 #include "flow.h"
 #include "flow_table.h"
 #include "flow_netlink.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
ovs_flow_tbl_destroy(>table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
+   ovs_meters_exit(dp);
kfree(dp);
 }
 
@@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(>ports[i]);
 
+   err = ovs_meters_init(dp);
+   if (err)
+   goto err_destroy_ports_array;
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
@@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
 
-   goto err_destroy_ports_array;
+   goto err_destroy_meters;
}
 
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_notify(_datapath_genl_family, reply, info);
return 0;
 
-err_destroy_ports_array:
+err_destroy_meters:
ovs_unlock();
+   ovs_meters_exit(dp);
+err_destroy_ports_array:
kfree(dp->ports);
 err_destroy_percpu:
free_percpu(dp->stats_percpu);
@@ -2265,6 +2273,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = 
{
_vport_genl_family,
_flow_genl_family,
_packet_genl_family,
+   _meter_genl_family,
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2445,3 +2454,4 @@ static void dp_cleanup(void)
 MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 954c4ed465a5..5d2997b42460 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -92,6 +92,9 @@ struct datapath {
u32 user_features;
 
u32 max_headroom;
+
+   /* Switch meters. */
+   struct hlist_head *meters;
 };
 
 /**
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
new file mode 100644
index ..ff57c1cb5f6a
--- /dev/null
+++ b/net/openvswitch/meter.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2017 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "datapath.h"
+#include "meter.h"
+
+#define METER_HASH_BUCKETS 1024
+
+static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
+   [OVS_METER_ATTR_ID] = { .type = NLA_U32, },
+   [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
+   [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED },
+   [OVS_METER_ATTR_USED] = { .type = NLA_U64 },
+   [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 },
+   [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
+   [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, },
+   [OVS_BAND_ATTR_RATE] = { .type = 

[net-next v3 4/4] openvswitch: Add meter action support

2017-11-06 Thread Andy Zhou
Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 3 +++
 net/openvswitch/actions.c| 6 ++
 net/openvswitch/datapath.h   | 1 +
 net/openvswitch/flow_netlink.c   | 6 ++
 4 files changed, 16 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 368ce7d3ecdc..d6a8f3eaf433 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -811,6 +811,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -841,6 +843,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
+   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a551232daf61..df342fe66492 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1214,6 +1214,12 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
+
+   case OVS_ACTION_ATTR_METER:
+   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+   consume_skb(skb);
+   return 0;
+   }
}
 
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d2997b42460..523d65526766 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dc0d79092e74..6789bb1ddf0e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -87,6 +87,7 @@ static bool actions_may_change_flow(const struct nlattr 
*actions)
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
+   case OVS_ACTION_ATTR_METER:
default:
return true;
}
@@ -2533,6 +2534,7 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct 
ovs_action_trunc),
[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct 
ovs_action_push_eth),
[OVS_ACTION_ATTR_POP_ETH] = 0,
+   [OVS_ACTION_ATTR_METER] = sizeof(u32),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2690,6 +2692,10 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
mac_proto = MAC_PROTO_ETHERNET;
break;
 
+   case OVS_ACTION_ATTR_METER:
+   /* Non-existent meters are simply ignored.  */
+   break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
-- 
1.8.3.1



Re: [net-next v2 3/4] openvswitch: Add meter infrastructure

2017-11-02 Thread Andy Zhou
On Thu, Nov 2, 2017 at 5:07 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Nov 2, 2017 at 3:07 AM, Andy Zhou <az...@ovn.org> wrote:
>> On Fri, Oct 20, 2017 at 8:32 PM, Pravin Shelar <pshe...@ovn.org> wrote:
>>> On Thu, Oct 19, 2017 at 5:58 PM, Andy Zhou <az...@ovn.org> wrote:
>>>>
>>>> On Thu, Oct 19, 2017 at 02:47 Pravin Shelar <pshe...@ovn.org> wrote:
>>>>>
>>>>> On Tue, Oct 17, 2017 at 12:36 AM, Andy Zhou <az...@ovn.org> wrote:
>>>>> > OVS kernel datapath so far does not support Openflow meter action.
>>>>> > This is the first stab at adding kernel datapath meter support.
>>>>> > This implementation supports only drop band type.
>>>>> >
>>>>> > Signed-off-by: Andy Zhou <az...@ovn.org>
>>>>> > ---
>>>>> >  net/openvswitch/Makefile   |   1 +
>>>>> >  net/openvswitch/datapath.c |  14 +-
>>>>> >  net/openvswitch/datapath.h |   3 +
>>>>> >  net/openvswitch/meter.c| 604
>>>>> > +
>>>>> >  net/openvswitch/meter.h|  54 
>>>>> >  5 files changed, 674 insertions(+), 2 deletions(-)
>>>>> >  create mode 100644 net/openvswitch/meter.c
>>>>> >  create mode 100644 net/openvswitch/meter.h
>>>>> >
>>>>> This patch mostly looks good. I have one comment below.
>>>>>
>>>>> > +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info
>>>>> > *info)
>>>>> > +{
>>>>> > +   struct nlattr **a = info->attrs;
>>>>> > +   struct dp_meter *meter, *old_meter;
>>>>> > +   struct sk_buff *reply;
>>>>> > +   struct ovs_header *ovs_reply_header;
>>>>> > +   struct ovs_header *ovs_header = info->userhdr;
>>>>> > +   struct datapath *dp;
>>>>> > +   int err;
>>>>> > +   u32 meter_id;
>>>>> > +   bool failed;
>>>>> > +
>>>>> > +   meter = dp_meter_create(a);
>>>>> > +   if (IS_ERR_OR_NULL(meter))
>>>>> > +   return PTR_ERR(meter);
>>>>> > +
>>>>> > +   reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
>>>>> > + _reply_header);
>>>>> > +   if (IS_ERR(reply)) {
>>>>> > +   err = PTR_ERR(reply);
>>>>> > +   goto exit_free_meter;
>>>>> > +   }
>>>>> > +
>>>>> > +   ovs_lock();
>>>>> > +   dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
>>>>> > +   if (!dp) {
>>>>> > +   err = -ENODEV;
>>>>> > +   goto exit_unlock;
>>>>> > +   }
>>>>> > +
>>>>> > +   if (!a[OVS_METER_ATTR_ID]) {
>>>>> > +   err = -ENODEV;
>>>>> > +   goto exit_unlock;
>>>>> > +   }
>>>>> > +
>>>>> > +   meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
>>>>> > +
>>>>> > +   /* Cannot fail after this. */
>>>>> > +   old_meter = lookup_meter(dp, meter_id);
>>>>> I do not see RCU read lock taken here. This is not correctness issue
>>>>> but it could cause RCU checker to spit out warning message. You could
>>>>> do same trick that is done in get_dp() to avoid this issue.
>>>>
>>>> O.K.
>>>>>
>>>>>
>>>>>
>>>>> Can you also test the code with rcu sparse check config option enabled?
>>>>
>>>>
>>>> Do you mean to sparse compile with CONFIG_PROVE_LOCKING and
>>>> CONFIG_DENUG_OBJECTS_RCU_HEAD?
>>>
>>> You could use all following options simultaneously:
>>> CONFIG_PREEMPT
>>> CONFIG_DEBUG_PREEMPT
>>> CONFIG_DEBUG_SPINLOCK
>>> CONFIG_DEBUG_ATOMIC_SLEEP
>>> CONFIG_PROVE_RCU
>>> CONFIG_DEBUG_OBJECTS_RCU_HEAD
>>
>> Thanks, I turned on those flags but did not get any error message. Do you
>> mind share the RCU checker message?
>
> There would be assert failure and stack trace. so it would be pretty
> obvious in kernel log messages.
> Let me know if you do not see any stack trace while running meter
> create, delete and execute.

No I did not see them.


Re: [net-next v2 3/4] openvswitch: Add meter infrastructure

2017-11-02 Thread Andy Zhou
On Fri, Oct 20, 2017 at 8:32 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Oct 19, 2017 at 5:58 PM, Andy Zhou <az...@ovn.org> wrote:
>>
>> On Thu, Oct 19, 2017 at 02:47 Pravin Shelar <pshe...@ovn.org> wrote:
>>>
>>> On Tue, Oct 17, 2017 at 12:36 AM, Andy Zhou <az...@ovn.org> wrote:
>>> > OVS kernel datapath so far does not support Openflow meter action.
>>> > This is the first stab at adding kernel datapath meter support.
>>> > This implementation supports only drop band type.
>>> >
>>> > Signed-off-by: Andy Zhou <az...@ovn.org>
>>> > ---
>>> >  net/openvswitch/Makefile   |   1 +
>>> >  net/openvswitch/datapath.c |  14 +-
>>> >  net/openvswitch/datapath.h |   3 +
>>> >  net/openvswitch/meter.c| 604
>>> > +
>>> >  net/openvswitch/meter.h|  54 
>>> >  5 files changed, 674 insertions(+), 2 deletions(-)
>>> >  create mode 100644 net/openvswitch/meter.c
>>> >  create mode 100644 net/openvswitch/meter.h
>>> >
>>> This patch mostly looks good. I have one comment below.
>>>
>>> > +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info
>>> > *info)
>>> > +{
>>> > +   struct nlattr **a = info->attrs;
>>> > +   struct dp_meter *meter, *old_meter;
>>> > +   struct sk_buff *reply;
>>> > +   struct ovs_header *ovs_reply_header;
>>> > +   struct ovs_header *ovs_header = info->userhdr;
>>> > +   struct datapath *dp;
>>> > +   int err;
>>> > +   u32 meter_id;
>>> > +   bool failed;
>>> > +
>>> > +   meter = dp_meter_create(a);
>>> > +   if (IS_ERR_OR_NULL(meter))
>>> > +   return PTR_ERR(meter);
>>> > +
>>> > +   reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
>>> > + _reply_header);
>>> > +   if (IS_ERR(reply)) {
>>> > +   err = PTR_ERR(reply);
>>> > +   goto exit_free_meter;
>>> > +   }
>>> > +
>>> > +   ovs_lock();
>>> > +   dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
>>> > +   if (!dp) {
>>> > +   err = -ENODEV;
>>> > +   goto exit_unlock;
>>> > +   }
>>> > +
>>> > +   if (!a[OVS_METER_ATTR_ID]) {
>>> > +   err = -ENODEV;
>>> > +   goto exit_unlock;
>>> > +   }
>>> > +
>>> > +   meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
>>> > +
>>> > +   /* Cannot fail after this. */
>>> > +   old_meter = lookup_meter(dp, meter_id);
>>> I do not see RCU read lock taken here. This is not correctness issue
>>> but it could cause RCU checker to spit out warning message. You could
>>> do same trick that is done in get_dp() to avoid this issue.
>>
>> O.K.
>>>
>>>
>>>
>>> Can you also test the code with rcu sparse check config option enabled?
>>
>>
>> Do you mean to sparse compile with CONFIG_PROVE_LOCKING and
>> CONFIG_DENUG_OBJECTS_RCU_HEAD?
>
> You could use all following options simultaneously:
> CONFIG_PREEMPT
> CONFIG_DEBUG_PREEMPT
> CONFIG_DEBUG_SPINLOCK
> CONFIG_DEBUG_ATOMIC_SLEEP
> CONFIG_PROVE_RCU
> CONFIG_DEBUG_OBJECTS_RCU_HEAD

Thanks, I turned on those flags but did not get any error message. Do you
mind share the RCU checker message?


Re: [net-next RFC 3/4] openvswitch: Add meter infrastructure

2017-10-17 Thread Andy Zhou
On Mon, Oct 16, 2017 at 10:49 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Mon, Oct 16, 2017 at 12:05 AM, Andy Zhou <az...@ovn.org> wrote:
>> On Fri, Oct 13, 2017 at 5:12 PM, Pravin Shelar <pshe...@ovn.org> wrote:
>>> On Thu, Oct 12, 2017 at 3:38 PM, Andy Zhou <az...@ovn.org> wrote:
>>>> OVS kernel datapath so far does not support Openflow meter action.
>>>> This is the first stab at adding kernel datapath meter support.
>>>> This implementation supports only drop band type.
>>>>
>>>> Signed-off-by: Andy Zhou <az...@ovn.org>
>>>> ---
>>>>  net/openvswitch/Makefile   |   1 +
>>>>  net/openvswitch/datapath.c |  14 +-
>>>>  net/openvswitch/datapath.h |   3 +
>>>>  net/openvswitch/meter.c| 611 
>>>> +
>>>>  net/openvswitch/meter.h|  54 
>>>>  5 files changed, 681 insertions(+), 2 deletions(-)
>>>>  create mode 100644 net/openvswitch/meter.c
>>>>  create mode 100644 net/openvswitch/meter.h
>>>>
>>> ...
>>>
>>>> diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
>>>> new file mode 100644
>>>> index ..f24ebb5f7af4
>>>> --- /dev/null
>>>> +++ b/net/openvswitch/meter.c
>>>
>>> 
>>> 
>>>> +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info 
>>>> *info)
>>>> +{
>>>> +   struct datapath *dp;
>>>> +   struct ovs_header *ovs_header = info->userhdr;
>>>> +   struct sk_buff *reply;
>>>> +   struct ovs_header *ovs_reply_header;
>>>> +   struct nlattr *nla, *band_nla;
>>>> +   int err;
>>>> +
>>>> +   /* Check that the datapath exists */
>>>> +   ovs_lock();
>>>> +   dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
>>>> +   ovs_unlock();
>>>> +   if (!dp)
>>>> +   return -ENODEV;
>>>> +
>>> why dp check is required for this API?
>> Is it possible for another core delete the dp, before ovs_lock()
>> returns? Then, in theory, get_dp() can
>> return NULL, no?
>
> I do not see dp used anywhere in function. so the question was why is
> dp lookup done here?

Got it. I misunderstood. You are right, we don't need the dp pointer.
Just posted v2.

>
>>>
>>>> +   reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES,
>>>> + _reply_header);
>>>> +   if (!reply)
>>>> +   return PTR_ERR(reply);
>>>> +
>>>> +   if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) ||
>>>> +   nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS))
>>>> +   goto nla_put_failure;
>>>> +
>>>> +   nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS);
>>>> +   if (!nla)
>>>> +   goto nla_put_failure;
>>>> +
>>>> +   band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC);
>>>> +   if (!band_nla)
>>>> +   goto nla_put_failure;
>>>> +   /* Currently only DROP band type is supported. */
>>>> +   if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, 
>>>> OVS_METER_BAND_TYPE_DROP))
>>>> +   goto nla_put_failure;
>>>> +   nla_nest_end(reply, band_nla);
>>>> +   nla_nest_end(reply, nla);
>>>> +
>>>> +   genlmsg_end(reply, ovs_reply_header);
>>>> +   return genlmsg_reply(reply, info);
>>>> +
>>>> +nla_put_failure:
>>>> +   nlmsg_free(reply);
>>>> +   err = -EMSGSIZE;
>>>> +   return err;
>>>> +}
>>>> +
>>> 
>>>
>>>> +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
>>>> +{
>>>> +   struct nlattr **a = info->attrs;
>>>> +   struct dp_meter *meter, *old_meter;
>>>> +   struct sk_buff *reply;
>>>> +   struct ovs_header *ovs_reply_header;
>>>> +   struct ovs_header *ovs_header = info->userhdr;
>>>> +   struct datapath *dp;
>>>> +   int err;
>>>> +   u32 meter_id;
>>>> +   bool failed;
>>>> +
>>>> +   meter

[net-next v2 4/4] openvswitch: Add meter action support

2017-10-17 Thread Andy Zhou
Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 3 +++
 net/openvswitch/actions.c| 6 ++
 net/openvswitch/datapath.h   | 1 +
 net/openvswitch/flow_netlink.c   | 6 ++
 4 files changed, 16 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d413a2398214..de2807a9cc60 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -808,6 +808,8 @@ struct ovs_action_push_eth {
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
  * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -838,6 +840,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
+   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a551232daf61..df342fe66492 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1214,6 +1214,12 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
+
+   case OVS_ACTION_ATTR_METER:
+   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+   consume_skb(skb);
+   return 0;
+   }
}
 
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index d1ffa1d9fe57..cda40c6af40a 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dc0d79092e74..6789bb1ddf0e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -87,6 +87,7 @@ static bool actions_may_change_flow(const struct nlattr 
*actions)
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
+   case OVS_ACTION_ATTR_METER:
default:
return true;
}
@@ -2533,6 +2534,7 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct 
ovs_action_trunc),
[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct 
ovs_action_push_eth),
[OVS_ACTION_ATTR_POP_ETH] = 0,
+   [OVS_ACTION_ATTR_METER] = sizeof(u32),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2690,6 +2692,10 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
mac_proto = MAC_PROTO_ETHERNET;
break;
 
+   case OVS_ACTION_ATTR_METER:
+   /* Non-existent meters are simply ignored.  */
+   break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
-- 
1.8.3.1



[net-next v2 2/4] openvswitch: export get_dp() API.

2017-10-17 Thread Andy Zhou
Later patches will invoke get_dp() outside of datapath.c. Export it.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/datapath.c | 29 -
 net/openvswitch/datapath.h | 31 +++
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c3aec6227c91..ac7154018676 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *,
  const struct dp_upcall_info *,
  uint32_t cutlen);
 
-/* Must be called with rcu_read_lock. */
-static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
-{
-   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
-
-   if (dev) {
-   struct vport *vport = ovs_internal_dev_get_vport(dev);
-   if (vport)
-   return vport->dp;
-   }
-
-   return NULL;
-}
-
-/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
- * returned dp pointer valid.
- */
-static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
-{
-   struct datapath *dp;
-
-   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
-   rcu_read_lock();
-   dp = get_dp_rcu(net, dp_ifindex);
-   rcu_read_unlock();
-
-   return dp;
-}
-
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 480600649d0b..ad14b571219d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
 #define DP_VPORT_HASH_BUCKETS  1024
@@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct 
datapath *dp, int port_n
return ovs_lookup_vport(dp, port_no);
 }
 
+/* Must be called with rcu_read_lock. */
+static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+   if (dev) {
+   struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+   if (vport)
+   return vport->dp;
+   }
+
+   return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+   struct datapath *dp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+   rcu_read_lock();
+   dp = get_dp_rcu(net, dp_ifindex);
+   rcu_read_unlock();
+
+   return dp;
+}
+
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-- 
1.8.3.1



[net-next v2 1/4] openvswitch: Add meter netlink definitions

2017-10-17 Thread Andy Zhou
Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 51 
 1 file changed, 51 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 0cd6f8833147..d413a2398214 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -851,4 +851,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+   OVS_METER_CMD_UNSPEC,
+   OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */
+   OVS_METER_CMD_SET,  /* Add or modify a meter. */
+   OVS_METER_CMD_DEL,  /* Delete a meter. */
+   OVS_METER_CMD_GET   /* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+   OVS_METER_ATTR_UNSPEC,
+   OVS_METER_ATTR_ID,  /* u32 meter ID within datapath. */
+   OVS_METER_ATTR_KBPS,/* No argument. If set, units in kilobits
+* per second. Otherwise, units in
+* packets per second.
+*/
+   OVS_METER_ATTR_STATS,   /* struct ovs_flow_stats for the meter. */
+   OVS_METER_ATTR_BANDS,   /* Nested attributes for meter bands. */
+   OVS_METER_ATTR_USED,/* u64 msecs last used in monotonic time. */
+   OVS_METER_ATTR_CLEAR,   /* Flag to clear stats, used. */
+   OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+   OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+   OVS_METER_ATTR_PAD,
+   __OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+   OVS_BAND_ATTR_UNSPEC,
+   OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */
+   OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */
+   OVS_BAND_ATTR_BURST,/* u32 burst size in meter units. */
+   OVS_BAND_ATTR_STATS,/* struct ovs_flow_stats for the band. */
+   __OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+   OVS_METER_BAND_TYPE_UNSPEC,
+   OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+   __OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
1.8.3.1



[net-next v2 3/4] openvswitch: Add meter infrastructure

2017-10-17 Thread Andy Zhou
OVS kernel datapath so far does not support Openflow meter action.
This is the first stab at adding kernel datapath meter support.
This implementation supports only drop band type.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/Makefile   |   1 +
 net/openvswitch/datapath.c |  14 +-
 net/openvswitch/datapath.h |   3 +
 net/openvswitch/meter.c| 604 +
 net/openvswitch/meter.h|  54 
 5 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 60f809085b92..658383fbdf53 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -11,6 +11,7 @@ openvswitch-y := \
flow.o \
flow_netlink.o \
flow_table.o \
+   meter.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ac7154018676..eef8d3ea3aae 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
 #include "flow.h"
 #include "flow_table.h"
 #include "flow_netlink.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
ovs_flow_tbl_destroy(>table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
+   ovs_meters_exit(dp);
kfree(dp);
 }
 
@@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(>ports[i]);
 
+   err = ovs_meters_init(dp);
+   if (err)
+   goto err_destroy_ports_array;
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
@@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
 
-   goto err_destroy_ports_array;
+   goto err_destroy_meters;
}
 
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_notify(_datapath_genl_family, reply, info);
return 0;
 
-err_destroy_ports_array:
+err_destroy_meters:
ovs_unlock();
+   ovs_meters_exit(dp);
+err_destroy_ports_array:
kfree(dp->ports);
 err_destroy_percpu:
free_percpu(dp->stats_percpu);
@@ -2244,6 +2252,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = 
{
_vport_genl_family,
_flow_genl_family,
_packet_genl_family,
+   _meter_genl_family,
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2424,3 +2433,4 @@ static void dp_cleanup(void)
 MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ad14b571219d..d1ffa1d9fe57 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -92,6 +92,9 @@ struct datapath {
u32 user_features;
 
u32 max_headroom;
+
+   /* Switch meters. */
+   struct hlist_head *meters;
 };
 
 /**
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
new file mode 100644
index ..ff57c1cb5f6a
--- /dev/null
+++ b/net/openvswitch/meter.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2017 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "datapath.h"
+#include "meter.h"
+
+#define METER_HASH_BUCKETS 1024
+
+static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
+   [OVS_METER_ATTR_ID] = { .type = NLA_U32, },
+   [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
+   [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED },
+   [OVS_METER_ATTR_USED] = { .type = NLA_U64 },
+   [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 },
+   [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
+   [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, },
+   [OVS_BAND_ATTR_RATE] = { .type = 

[net-next v2 0/4] Openvswitch meter action

2017-10-17 Thread Andy Zhou
This patch series is the first attempt to add openvswitch
meter support. We have previously experimented with adding
metering support in nftables. However 1) It was not clear
how to expose a named nftables object cleanly, and 2)
the logic that implements metering is quite small, < 100 lines
of code.

With those two observations, it seems cleaner to add meter
support in the openvswitch module directly.

---
v1(RFC)->v2:  remove unused code
  improve locking
  and other review comments

Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  54 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|   6 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 604 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 772 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1



Re: [net-next RFC 4/4] openvswitch: Add meter action support

2017-10-16 Thread Andy Zhou
On Fri, Oct 13, 2017 at 5:13 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Oct 12, 2017 at 3:38 PM, Andy Zhou <az...@ovn.org> wrote:
>> Implements OVS kernel meter action support.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  include/uapi/linux/openvswitch.h |  1 +
>>  net/openvswitch/actions.c| 12 
>>  net/openvswitch/datapath.h   |  1 +
>>  net/openvswitch/flow_netlink.c   |  6 ++
>>  4 files changed, 20 insertions(+)
>>
>> diff --git a/include/uapi/linux/openvswitch.h 
>> b/include/uapi/linux/openvswitch.h
>> index 325049a129e4..11fe1a06cdd6 100644
>> --- a/include/uapi/linux/openvswitch.h
>> +++ b/include/uapi/linux/openvswitch.h
>> @@ -835,6 +835,7 @@ enum ovs_action_attr {
>> OVS_ACTION_ATTR_TRUNC,/* u32 struct ovs_action_trunc. */
>> OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
>> OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
>> +   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
>>
>> __OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
>>* from userspace. */
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index a54a556fcdb5..4eb160ac5a27 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -1210,6 +1210,12 @@ static int do_execute_actions(struct datapath *dp, 
>> struct sk_buff *skb,
>> case OVS_ACTION_ATTR_POP_ETH:
>> err = pop_eth(skb, key);
>> break;
>> +
>> +   case OVS_ACTION_ATTR_METER:
>> +   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) 
>> {
>> +   consume_skb(skb);
>> +   return 0;
>> +   }
>> }
>>
>> if (unlikely(err)) {
>> @@ -1341,6 +1347,12 @@ int ovs_execute_actions(struct datapath *dp, struct 
>> sk_buff *skb,
>> err = do_execute_actions(dp, skb, key,
>>  acts->actions, acts->actions_len);
>>
>> +   /* OVS action has dropped the packet, do not expose it
>> +* to the user.
>> +*/
>> +   if (err == -ENODATA)
>> +   err = 0;
>> +
> I am not sure who is returning this error code?
Ah, this hunk was left over from experimenting with per band actions
list. Will remove. Thanks for catching this!


Re: [net-next RFC 3/4] openvswitch: Add meter infrastructure

2017-10-16 Thread Andy Zhou
On Fri, Oct 13, 2017 at 5:12 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Oct 12, 2017 at 3:38 PM, Andy Zhou <az...@ovn.org> wrote:
>> OVS kernel datapath so far does not support Openflow meter action.
>> This is the first stab at adding kernel datapath meter support.
>> This implementation supports only drop band type.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  net/openvswitch/Makefile   |   1 +
>>  net/openvswitch/datapath.c |  14 +-
>>  net/openvswitch/datapath.h |   3 +
>>  net/openvswitch/meter.c| 611 
>> +
>>  net/openvswitch/meter.h|  54 
>>  5 files changed, 681 insertions(+), 2 deletions(-)
>>  create mode 100644 net/openvswitch/meter.c
>>  create mode 100644 net/openvswitch/meter.h
>>
> ...
>
>> diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
>> new file mode 100644
>> index ..f24ebb5f7af4
>> --- /dev/null
>> +++ b/net/openvswitch/meter.c
>
> 
> 
>> +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info 
>> *info)
>> +{
>> +   struct datapath *dp;
>> +   struct ovs_header *ovs_header = info->userhdr;
>> +   struct sk_buff *reply;
>> +   struct ovs_header *ovs_reply_header;
>> +   struct nlattr *nla, *band_nla;
>> +   int err;
>> +
>> +   /* Check that the datapath exists */
>> +   ovs_lock();
>> +   dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
>> +   ovs_unlock();
>> +   if (!dp)
>> +   return -ENODEV;
>> +
> why dp check is required for this API?
Is it possible for another core delete the dp, before ovs_lock()
returns? Then, in theory, get_dp() can
return NULL, no?
>
>> +   reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES,
>> + _reply_header);
>> +   if (!reply)
>> +   return PTR_ERR(reply);
>> +
>> +   if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) ||
>> +   nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS))
>> +   goto nla_put_failure;
>> +
>> +   nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS);
>> +   if (!nla)
>> +   goto nla_put_failure;
>> +
>> +   band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC);
>> +   if (!band_nla)
>> +   goto nla_put_failure;
>> +   /* Currently only DROP band type is supported. */
>> +   if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP))
>> +   goto nla_put_failure;
>> +   nla_nest_end(reply, band_nla);
>> +   nla_nest_end(reply, nla);
>> +
>> +   genlmsg_end(reply, ovs_reply_header);
>> +   return genlmsg_reply(reply, info);
>> +
>> +nla_put_failure:
>> +   nlmsg_free(reply);
>> +   err = -EMSGSIZE;
>> +   return err;
>> +}
>> +
> 
>
>> +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
>> +{
>> +   struct nlattr **a = info->attrs;
>> +   struct dp_meter *meter, *old_meter;
>> +   struct sk_buff *reply;
>> +   struct ovs_header *ovs_reply_header;
>> +   struct ovs_header *ovs_header = info->userhdr;
>> +   struct datapath *dp;
>> +   int err;
>> +   u32 meter_id;
>> +   bool failed;
>> +
>> +   meter = dp_meter_create(a);
>> +   if (IS_ERR_OR_NULL(meter))
>> +   return PTR_ERR(meter);
>> +
>> +   reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
>> + _reply_header);
>> +   if (IS_ERR(reply)) {
>> +   err = PTR_ERR(reply);
>> +   goto exit_free_meter;
>> +   }
>> +
>> +   ovs_lock();
>> +   dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
>> +   if (!dp) {
>> +   err = -ENODEV;
>> +   goto exit_unlock;
>> +   }
>> +
>> +   if (!a[OVS_METER_ATTR_ID]) {
>> +   err = -ENODEV;
>> +   goto exit_unlock;
>> +   }
>> +
>> +   meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
>> +
>> +   /* Cannot fail after this. */
>> +   old_meter = lookup_meter(dp, meter_id);
>> +   attach_meter(dp, meter);
>> +   ovs_unlock();
>> +
> After the unlock, it is not safe to keep the r

[net-next RFC 0/4] Openvswitch meter action

2017-10-12 Thread Andy Zhou
This patch series is the first attempt to add openvswitch
meter support. We have previously experimented with adding
metering support in nftables. However 1) It was not clear
how to expose a named nftables object cleanly, and 2)
the logic that implements metering is quite small, < 100 lines
of code.

With those two observations, it seems cleaner to add meter
support in the openvswitch module directly.


Andy Zhou (4):
  openvswitch: Add meter netlink definitions
  openvswitch: export get_dp() API.
  openvswitch: Add meter infrastructure
  openvswitch: Add meter action support

 include/uapi/linux/openvswitch.h |  52 
 net/openvswitch/Makefile |   1 +
 net/openvswitch/actions.c|  12 +
 net/openvswitch/datapath.c   |  43 +--
 net/openvswitch/datapath.h   |  35 +++
 net/openvswitch/flow_netlink.c   |   6 +
 net/openvswitch/meter.c  | 611 +++
 net/openvswitch/meter.h  |  54 
 8 files changed, 783 insertions(+), 31 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

-- 
1.8.3.1



[net-next RFC 2/4] openvswitch: export get_dp() API.

2017-10-12 Thread Andy Zhou
Later patches will invoke get_dp() outside of datapath.c. Export it.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/datapath.c | 29 -
 net/openvswitch/datapath.h | 31 +++
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c3aec6227c91..ac7154018676 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -142,35 +142,6 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *,
  const struct dp_upcall_info *,
  uint32_t cutlen);
 
-/* Must be called with rcu_read_lock. */
-static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
-{
-   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
-
-   if (dev) {
-   struct vport *vport = ovs_internal_dev_get_vport(dev);
-   if (vport)
-   return vport->dp;
-   }
-
-   return NULL;
-}
-
-/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
- * returned dp pointer valid.
- */
-static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
-{
-   struct datapath *dp;
-
-   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
-   rcu_read_lock();
-   dp = get_dp_rcu(net, dp_ifindex);
-   rcu_read_unlock();
-
-   return dp;
-}
-
 /* Must be called with rcu_read_lock or ovs_mutex. */
 const char *ovs_dp_name(const struct datapath *dp)
 {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 480600649d0b..ad14b571219d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
 #define DP_VPORT_HASH_BUCKETS  1024
@@ -190,6 +191,36 @@ static inline struct vport *ovs_vport_ovsl(const struct 
datapath *dp, int port_n
return ovs_lookup_vport(dp, port_no);
 }
 
+/* Must be called with rcu_read_lock. */
+static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+   struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+   if (dev) {
+   struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+   if (vport)
+   return vport->dp;
+   }
+
+   return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+   struct datapath *dp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+   rcu_read_lock();
+   dp = get_dp_rcu(net, dp_ifindex);
+   rcu_read_unlock();
+
+   return dp;
+}
+
 extern struct notifier_block ovs_dp_device_notifier;
 extern struct genl_family dp_vport_genl_family;
 
-- 
1.8.3.1



[net-next RFC 3/4] openvswitch: Add meter infrastructure

2017-10-12 Thread Andy Zhou
OVS kernel datapath so far does not support Openflow meter action.
This is the first stab at adding kernel datapath meter support.
This implementation supports only drop band type.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/Makefile   |   1 +
 net/openvswitch/datapath.c |  14 +-
 net/openvswitch/datapath.h |   3 +
 net/openvswitch/meter.c| 611 +
 net/openvswitch/meter.h|  54 
 5 files changed, 681 insertions(+), 2 deletions(-)
 create mode 100644 net/openvswitch/meter.c
 create mode 100644 net/openvswitch/meter.h

diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 60f809085b92..658383fbdf53 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -11,6 +11,7 @@ openvswitch-y := \
flow.o \
flow_netlink.o \
flow_table.o \
+   meter.o \
vport.o \
vport-internal_dev.o \
vport-netdev.o
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ac7154018676..eef8d3ea3aae 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
 #include "flow.h"
 #include "flow_table.h"
 #include "flow_netlink.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 #include "vport-netdev.h"
 
@@ -174,6 +175,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
ovs_flow_tbl_destroy(>table);
free_percpu(dp->stats_percpu);
kfree(dp->ports);
+   ovs_meters_exit(dp);
kfree(dp);
 }
 
@@ -1572,6 +1574,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(>ports[i]);
 
+   err = ovs_meters_init(dp);
+   if (err)
+   goto err_destroy_ports_array;
+
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
@@ -1600,7 +1606,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
 
-   goto err_destroy_ports_array;
+   goto err_destroy_meters;
}
 
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
@@ -1615,8 +1621,10 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
ovs_notify(_datapath_genl_family, reply, info);
return 0;
 
-err_destroy_ports_array:
+err_destroy_meters:
ovs_unlock();
+   ovs_meters_exit(dp);
+err_destroy_ports_array:
kfree(dp->ports);
 err_destroy_percpu:
free_percpu(dp->stats_percpu);
@@ -2244,6 +2252,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = 
{
_vport_genl_family,
_flow_genl_family,
_packet_genl_family,
+   _meter_genl_family,
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2424,3 +2433,4 @@ static void dp_cleanup(void)
 MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index ad14b571219d..d1ffa1d9fe57 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -92,6 +92,9 @@ struct datapath {
u32 user_features;
 
u32 max_headroom;
+
+   /* Switch meters. */
+   struct hlist_head *meters;
 };
 
 /**
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
new file mode 100644
index ..f24ebb5f7af4
--- /dev/null
+++ b/net/openvswitch/meter.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2017 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "datapath.h"
+#include "meter.h"
+
+#define METER_HASH_BUCKETS 1024
+
+static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = {
+   [OVS_METER_ATTR_ID] = { .type = NLA_U32, },
+   [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) },
+   [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED },
+   [OVS_METER_ATTR_USED] = { .type = NLA_U64 },
+   [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG },
+   [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 },
+   [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = {
+   [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, },
+   [OVS_BAND_ATTR_RATE] = { .type = 

[net-next RFC 1/4] openvswitch: Add meter netlink definitions

2017-10-12 Thread Andy Zhou
Meter has its own netlink family. Define netlink messages and attributes
for communicating with the user space programs.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h | 51 
 1 file changed, 51 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4cab82e..325049a129e4 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -848,4 +848,55 @@ enum ovs_action_attr {
 
 #define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)
 
+/* Meters. */
+#define OVS_METER_FAMILY  "ovs_meter"
+#define OVS_METER_MCGROUP "ovs_meter"
+#define OVS_METER_VERSION 0x1
+
+enum ovs_meter_cmd {
+   OVS_METER_CMD_UNSPEC,
+   OVS_METER_CMD_FEATURES, /* Get features supported by the datapath. */
+   OVS_METER_CMD_SET,  /* Add or modify a meter. */
+   OVS_METER_CMD_DEL,  /* Delete a meter. */
+   OVS_METER_CMD_GET   /* Get meter stats. */
+};
+
+enum ovs_meter_attr {
+   OVS_METER_ATTR_UNSPEC,
+   OVS_METER_ATTR_ID,  /* u32 meter ID within datapath. */
+   OVS_METER_ATTR_KBPS,/* No argument. If set, units in kilobits
+* per second. Otherwise, units in
+* packets per second.
+*/
+   OVS_METER_ATTR_STATS,   /* struct ovs_flow_stats for the meter. */
+   OVS_METER_ATTR_BANDS,   /* Nested attributes for meter bands. */
+   OVS_METER_ATTR_USED,/* u64 msecs last used in monotonic time. */
+   OVS_METER_ATTR_CLEAR,   /* Flag to clear stats, used. */
+   OVS_METER_ATTR_MAX_METERS, /* u32 number of meters supported. */
+   OVS_METER_ATTR_MAX_BANDS,  /* u32 max number of bands per meter. */
+   OVS_METER_ATTR_PAD,
+   __OVS_METER_ATTR_MAX
+};
+
+#define OVS_METER_ATTR_MAX (__OVS_METER_ATTR_MAX - 1)
+
+enum ovs_band_attr {
+   OVS_BAND_ATTR_UNSPEC,
+   OVS_BAND_ATTR_TYPE, /* u32 OVS_METER_BAND_TYPE_* constant. */
+   OVS_BAND_ATTR_RATE, /* u32 band rate in meter units (see above). */
+   OVS_BAND_ATTR_BURST,/* u32 burst size in meter units. */
+   OVS_BAND_ATTR_STATS,/* struct ovs_flow_stats for the band. */
+   __OVS_BAND_ATTR_MAX
+};
+
+#define OVS_BAND_ATTR_MAX (__OVS_BAND_ATTR_MAX - 1)
+
+enum ovs_meter_band_type {
+   OVS_METER_BAND_TYPE_UNSPEC,
+   OVS_METER_BAND_TYPE_DROP,   /* Drop exceeding packets. */
+   __OVS_METER_BAND_TYPE_MAX
+};
+
+#define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
1.8.3.1



[net-next RFC 4/4] openvswitch: Add meter action support

2017-10-12 Thread Andy Zhou
Implements OVS kernel meter action support.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/actions.c| 12 
 net/openvswitch/datapath.h   |  1 +
 net/openvswitch/flow_netlink.c   |  6 ++
 4 files changed, 20 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 325049a129e4..11fe1a06cdd6 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -835,6 +835,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_TRUNC,/* u32 struct ovs_action_trunc. */
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
+   OVS_ACTION_ATTR_METER,/* u32 meter ID. */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a54a556fcdb5..4eb160ac5a27 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1210,6 +1210,12 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
+
+   case OVS_ACTION_ATTR_METER:
+   if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) {
+   consume_skb(skb);
+   return 0;
+   }
}
 
if (unlikely(err)) {
@@ -1341,6 +1347,12 @@ int ovs_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
err = do_execute_actions(dp, skb, key,
 acts->actions, acts->actions_len);
 
+   /* OVS action has dropped the packet, do not expose it
+* to the user.
+*/
+   if (err == -ENODATA)
+   err = 0;
+
if (level == 1)
process_deferred_actions(dp);
 
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index d1ffa1d9fe57..cda40c6af40a 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -30,6 +30,7 @@
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_table.h"
+#include "meter.h"
 #include "vport-internal_dev.h"
 
 #define DP_MAX_PORTS   USHRT_MAX
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427ce6d1..39b548431f68 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -85,6 +85,7 @@ static bool actions_may_change_flow(const struct nlattr 
*actions)
case OVS_ACTION_ATTR_SAMPLE:
case OVS_ACTION_ATTR_SET:
case OVS_ACTION_ATTR_SET_MASKED:
+   case OVS_ACTION_ATTR_METER:
default:
return true;
}
@@ -2482,6 +2483,7 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
[OVS_ACTION_ATTR_TRUNC] = sizeof(struct 
ovs_action_trunc),
[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct 
ovs_action_push_eth),
[OVS_ACTION_ATTR_POP_ETH] = 0,
+   [OVS_ACTION_ATTR_METER] = sizeof(u32),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -2636,6 +2638,10 @@ static int __ovs_nla_copy_actions(struct net *net, const 
struct nlattr *attr,
mac_proto = MAC_PROTO_ETHERNET;
break;
 
+   case OVS_ACTION_ATTR_METER:
+   /* Non-existent meters are simply ignored.  */
+   break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
-- 
1.8.3.1



[net-next sample action optimization v4 1/4] openvswitch: Deferred fifo API change.

2017-03-20 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



[net-next sample action optimization v4 0/4]

2017-03-20 Thread Andy Zhou
The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.

---
v3->v4: Enhance patch 4.
Fix two bugs pointed out by Pravin,
Remove 'is_sample' variable.

v2->v3: Enhance patch 4, Rafctor to move more common logic to clone_execute().

v1->v2: Address Pravin's comment, Refactor recirc and sample
to share more common code

Andy Zhou (4):
openvswitch: Deferred fifo API change.
openvswitch: Refactor recirc key allocation.
openvswitch: Optimize sample action for the clone use cases
Openvswitch: Refactor sample and recirc actions implementation

 include/uapi/linux/openvswitch.h |  15 +++
 net/openvswitch/actions.c| 271 ++-
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +---
 4 files changed, 263 insertions(+), 166 deletions(-)

-- 
1.8.3.1



[net-next sample action optimization v4 4/4] Openvswitch: Refactor sample and recirc actions implementation

2017-03-20 Thread Andy Zhou
Added clone_execute() that both the sample and the recirc
action implementation can use.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 176 --
 1 file changed, 93 insertions(+), 83 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 3529f7b..e461067 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -44,10 +44,6 @@
 #include "conntrack.h"
 #include "vport.h"
 
-static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key,
- const struct nlattr *attr, int len);
-
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
@@ -166,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key 
*key)
return !(key->mac_proto & SW_FLOW_KEY_INVALID);
 }
 
+static int clone_execute(struct datapath *dp, struct sk_buff *skb,
+struct sw_flow_key *key,
+u32 recirc_id,
+const struct nlattr *actions, int len,
+bool last, bool clone_flow_key);
+
 static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
 __be16 ethertype)
 {
@@ -938,10 +940,9 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
 {
struct nlattr *actions;
struct nlattr *sample_arg;
-   struct sw_flow_key *orig_key = key;
int rem = nla_len(attr);
-   int err = 0;
const struct sample_arg *arg;
+   bool clone_flow_key;
 
/* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
sample_arg = nla_data(attr);
@@ -955,43 +956,9 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
return 0;
}
 
-   /* Unless the last action, sample works on the clone of SKB.  */
-   skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
-   if (!skb) {
-   /* Out of memory, skip this sample action.
-*/
-   return 0;
-   }
-
-   /* In case the sample actions won't change 'key',
-* it can be used directly to execute sample actions.
-* Otherwise, allocate a new key from the
-* next recursion level of 'flow_keys'. If
-* successful, execute the sample actions without
-* deferring.
-*
-* Defer the sample actions if the recursion
-* limit has been reached.
-*/
-   if (!arg->exec) {
-   __this_cpu_inc(exec_actions_level);
-   key = clone_key(key);
-   }
-
-   if (key) {
-   err = do_execute_actions(dp, skb, key, actions, rem);
-   } else if (!add_deferred_actions(skb, orig_key, actions, rem)) {
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop sample 
action\n",
-   ovs_dp_name(dp));
-   kfree_skb(skb);
-   }
-
-   if (!arg->exec)
-   __this_cpu_dec(exec_actions_level);
-
-   return err;
+   clone_flow_key = !arg->exec;
+   return clone_execute(dp, skb, key, 0, actions, rem, last,
+clone_flow_key);
 }
 
 static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
@@ -1102,10 +1069,9 @@ static int execute_masked_set_action(struct sk_buff *skb,
 
 static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key,
- const struct nlattr *a, int rem)
+ const struct nlattr *a, bool last)
 {
-   struct sw_flow_key *recirc_key;
-   struct deferred_action *da;
+   u32 recirc_id;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1116,40 +1082,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
}
BUG_ON(!is_flow_key_valid(key));
 
-   if (!nla_is_last(a, rem)) {
-   /* Recirc action is the not the last action
-* of the action list, need to clone the skb.
-*/
-   skb = skb_clone(skb, GFP_ATOMIC);
-
-   /* Skip the recirc action when out of memory, but
-* continue on with the rest of the action list.
-*/
-   if (!skb)
-   return 0;
-   }
-
-   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
-* recirc immediately, otherwise, defer it for later execution.
-*/
-   recirc_key = clone_key(key);
-   if (recirc_key) {
-   recirc_key->recirc_id = nla_get_u32(a);
-   ovs_dp_process_packet(skb, recirc_key);
-   } else {
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   recirc_key = 

[net-next sample action optimization v4 2/4] openvswitch: Refactor recirc key allocation.

2017-03-20 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Pravin B Shelar <pshe...@ovn.org>
---
 net/openvswitch/actions.c | 66 ---
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..8c9c60c 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -83,14 +83,31 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
+struct action_flow_keys {
struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Return NULL if out of key spaces.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1107,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1132,26 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.  */
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1341,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1353,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[net-next sample action optimization v4 3/4] openvswitch: Optimize sample action for the clone use cases

2017-03-20 Thread Andy Zhou
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.

The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.

One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.

Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |  15 +
 net/openvswitch/actions.c| 107 ++---
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +++
 4 files changed, 167 insertions(+), 98 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7f41f7d..66d1c3c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -578,10 +578,25 @@ enum ovs_sample_attr {
OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
__OVS_SAMPLE_ATTR_MAX,
+
+#ifdef __KERNEL__
+   OVS_SAMPLE_ATTR_ARG  /* struct sample_arg  */
+#endif
 };
 
 #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)
 
+#ifdef __KERNEL__
+struct sample_arg {
+   bool exec;   /* When true, actions in sample will not
+ * change flow keys. False otherwise.
+ */
+   u32  probability;/* Same value as
+ * 'OVS_SAMPLE_ATTR_PROBABILITY'.
+ */
+};
+#endif
+
 /**
  * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
  * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 8c9c60c..3529f7b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -928,73 +928,70 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, , cutlen);
 }
 
+/* When 'last' is true, sample() should always consume the 'skb'.
+ * Otherwise, sample() should keep 'skb' intact regardless what
+ * actions are executed within sample().
+ */
 static int sample(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ bool last)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sw_flow_key *orig_key = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
+   /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
-
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
+   if ((arg->probability != U32_MAX) &&
+   (!arg->probability || prandom_u32() > arg->probability)) {
+   if (last)
+   consume_skb(skb);
+   return 0;
}
 
-   rem = nla_len(acts_list);
-   a = nla_data(

Re: [net-next sample action optimization v3 4/4] Openvswitch: Refactor sample and recirc actions implementation

2017-03-20 Thread Andy Zhou
On Sat, Mar 18, 2017 at 12:22 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Thu, Mar 16, 2017 at 3:48 PM, Andy Zhou <az...@ovn.org> wrote:
>> Added clone_execute() that both the sample and the recirc
>> action implementation can use.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  net/openvswitch/actions.c | 175 
>> --
>>  1 file changed, 92 insertions(+), 83 deletions(-)
>>
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index 3529f7b..e38fa7f 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -44,10 +44,6 @@
>>  #include "conntrack.h"
>>  #include "vport.h"
>>
>> -static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> - struct sw_flow_key *key,
>> - const struct nlattr *attr, int len);
>> -
>>  struct deferred_action {
>> struct sk_buff *skb;
>> const struct nlattr *actions;
>> @@ -166,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key 
>> *key)
>> return !(key->mac_proto & SW_FLOW_KEY_INVALID);
>>  }
>>
>> +static int clone_execute(struct datapath *dp, struct sk_buff *skb,
>> +struct sw_flow_key *key,
>> +u32 recirc_id,
>> +const struct nlattr *actions, int len,
>> +bool last, bool clone_flow_key);
>> +
> With this function the diff stat looks much better.
>
> ...
> ...
>> +/* Execute the actions on the clone of the packet. The effect of the
>> + * execution does not affect the original 'skb' nor the original 'key'.
>> + *
>> + * The execution may be deferred in case the actions can not be executed
>> + * immediately.
>> + */
>> +static int clone_execute(struct datapath *dp, struct sk_buff *skb,
>> +struct sw_flow_key *key, u32 recirc_id,
>> +const struct nlattr *actions, int len,
>> +bool last, bool clone_flow_key)
>> +{
>> +   bool is_sample = actions;
> Standard practice is use !! to convert pointer to boolean.
> I think this function does not need to know about sample action. So we
> can rename the boolean to have_actions or something similar.
O.K.  We can just check for actions pointer.

However, it will be obvious we are actually checking for sample or recirc
action. I will add some comments.

>
>> +   struct deferred_action *da;
>> +   struct sw_flow_key *clone;
>> +   int err = 0;
>> +
>> +   skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
>> +   if (!skb) {
>> +   /* Out of memory, skip this action.
>> +*/
>> +   return 0;
>> +   }
>> +
>> +   /* In case the sample actions won't change the 'key',
>> +* current key can be used directly to execute sample actions.
>> +* Otherwise, allocate a new key from the
>> +* next recursion level of 'flow_keys'. If
>> +* successful, execute the sample actions without
>> +* deferring.
>> +*/
>> +   if (is_sample && clone_flow_key)
>> +   __this_cpu_inc(exec_actions_level);
>> +
> There is no need to increment actions level up here. it is only
> required for do_execute_actions(). ovs_dp_process_packet() already
> does it.

Right, that's why it is only done for 'is_sample'. There is a bug though,
the 'inc' needs to be done after clone.  I will rearrange the code to move 'inc'
only above the do_execute_actions().
>
>
>> +   clone = clone_flow_key ? clone_key(key) : key;
>> +   if (clone) {
>> +   if (is_sample) {
>> +   err = do_execute_actions(dp, skb, clone,
>> +actions, len);
>> +   } else {
>> +   clone->recirc_id = recirc_id;
>> +   ovs_dp_process_packet(skb, clone);
>> +   }
>> +   }
> wont this execute action twice, once here and again in deferred actions list?

Right, the return is missing for the if (clone) case.  I will post v4
soon. Thanks for the review
and comments.
>
>> +
>> +   if (is_sample && clone_flow_key)
>> +   __this_cpu_dec(exec_actions_level);
>> +
>> +   /* Out of 'flow_keys' space. Defer them */
>> +   da = add_defer

[net-next sample action optimization v3 2/4] openvswitch: Refactor recirc key allocation.

2017-03-16 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Pravin B Shelar <pshe...@ovn.org>
---
 net/openvswitch/actions.c | 66 ---
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..8c9c60c 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -83,14 +83,31 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
+struct action_flow_keys {
struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Return NULL if out of key spaces.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1107,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1132,26 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.  */
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1341,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1353,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[net-next sample action optimization v3 1/4] openvswitch: Deferred fifo API change.

2017-03-16 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



[net-next sample action optimization v3 0/4]

2017-03-16 Thread Andy Zhou
The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.

---
v1->v2: Address pravin's comment, Refactor recirc and sample
to share more common code

v2->v3: Enhace patch 4, add more loigc to the common code

Andy Zhou (4):
  openvswitch: Deferred fifo API change.
  openvswitch: Refactor recirc key allocation.
  openvswitch: Optimize sample action for the clone use cases
  Openvswitch: Refactor sample and recirc actions implementation

 include/uapi/linux/openvswitch.h |  15 +++
 net/openvswitch/actions.c| 270 ++-
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +---
 4 files changed, 262 insertions(+), 166 deletions(-)

-- 
1.8.3.1



[net-next sample action optimization v3 3/4] openvswitch: Optimize sample action for the clone use cases

2017-03-16 Thread Andy Zhou
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.

The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.

One related optimization attempts to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.

Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |  15 +
 net/openvswitch/actions.c| 107 ++---
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +++
 4 files changed, 167 insertions(+), 98 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7f41f7d..66d1c3c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -578,10 +578,25 @@ enum ovs_sample_attr {
OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
__OVS_SAMPLE_ATTR_MAX,
+
+#ifdef __KERNEL__
+   OVS_SAMPLE_ATTR_ARG  /* struct sample_arg  */
+#endif
 };
 
 #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)
 
+#ifdef __KERNEL__
+struct sample_arg {
+   bool exec;   /* When true, actions in sample will not
+ * change flow keys. False otherwise.
+ */
+   u32  probability;/* Same value as
+ * 'OVS_SAMPLE_ATTR_PROBABILITY'.
+ */
+};
+#endif
+
 /**
  * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
  * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 8c9c60c..3529f7b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -928,73 +928,70 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, , cutlen);
 }
 
+/* When 'last' is true, sample() should always consume the 'skb'.
+ * Otherwise, sample() should keep 'skb' intact regardless what
+ * actions are executed within sample().
+ */
 static int sample(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ bool last)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sw_flow_key *orig_key = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
+   /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
-
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
+   if ((arg->probability != U32_MAX) &&
+   (!arg->probability || prandom_u32() > arg->probability)) {
+   if (last)
+   consume_skb(skb);
+   return 0;
}
 
-   rem = nla_len(acts_list);
-   a = nla_data(

[net-next sample action optimization v3 4/4] Openvswitch: Refactor sample and recirc actions implementation

2017-03-16 Thread Andy Zhou
Added clone_execute() that both the sample and the recirc
action implementation can use.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 175 --
 1 file changed, 92 insertions(+), 83 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 3529f7b..e38fa7f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -44,10 +44,6 @@
 #include "conntrack.h"
 #include "vport.h"
 
-static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key,
- const struct nlattr *attr, int len);
-
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
@@ -166,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key 
*key)
return !(key->mac_proto & SW_FLOW_KEY_INVALID);
 }
 
+static int clone_execute(struct datapath *dp, struct sk_buff *skb,
+struct sw_flow_key *key,
+u32 recirc_id,
+const struct nlattr *actions, int len,
+bool last, bool clone_flow_key);
+
 static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
 __be16 ethertype)
 {
@@ -938,10 +940,9 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
 {
struct nlattr *actions;
struct nlattr *sample_arg;
-   struct sw_flow_key *orig_key = key;
int rem = nla_len(attr);
-   int err = 0;
const struct sample_arg *arg;
+   bool clone_flow_key;
 
/* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
sample_arg = nla_data(attr);
@@ -955,43 +956,9 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
return 0;
}
 
-   /* Unless the last action, sample works on the clone of SKB.  */
-   skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
-   if (!skb) {
-   /* Out of memory, skip this sample action.
-*/
-   return 0;
-   }
-
-   /* In case the sample actions won't change 'key',
-* it can be used directly to execute sample actions.
-* Otherwise, allocate a new key from the
-* next recursion level of 'flow_keys'. If
-* successful, execute the sample actions without
-* deferring.
-*
-* Defer the sample actions if the recursion
-* limit has been reached.
-*/
-   if (!arg->exec) {
-   __this_cpu_inc(exec_actions_level);
-   key = clone_key(key);
-   }
-
-   if (key) {
-   err = do_execute_actions(dp, skb, key, actions, rem);
-   } else if (!add_deferred_actions(skb, orig_key, actions, rem)) {
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop sample 
action\n",
-   ovs_dp_name(dp));
-   kfree_skb(skb);
-   }
-
-   if (!arg->exec)
-   __this_cpu_dec(exec_actions_level);
-
-   return err;
+   clone_flow_key = !arg->exec;
+   return clone_execute(dp, skb, key, 0, actions, rem, last,
+clone_flow_key);
 }
 
 static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
@@ -1102,10 +1069,9 @@ static int execute_masked_set_action(struct sk_buff *skb,
 
 static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key,
- const struct nlattr *a, int rem)
+ const struct nlattr *a, bool last)
 {
-   struct sw_flow_key *recirc_key;
-   struct deferred_action *da;
+   u32 recirc_id;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1116,40 +1082,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
}
BUG_ON(!is_flow_key_valid(key));
 
-   if (!nla_is_last(a, rem)) {
-   /* Recirc action is the not the last action
-* of the action list, need to clone the skb.
-*/
-   skb = skb_clone(skb, GFP_ATOMIC);
-
-   /* Skip the recirc action when out of memory, but
-* continue on with the rest of the action list.
-*/
-   if (!skb)
-   return 0;
-   }
-
-   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
-* recirc immediately, otherwise, defer it for later execution.
-*/
-   recirc_key = clone_key(key);
-   if (recirc_key) {
-   recirc_key->recirc_id = nla_get_u32(a);
-   ovs_dp_process_packet(skb, recirc_key);
-   } else {
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   recirc_key = 

Re: [net-next sample action optimization v2 1/4] openvswitch: Deferred fifo API change.

2017-03-16 Thread Andy Zhou
On Thu, Mar 16, 2017 at 10:28 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Tue, Mar 14, 2017 at 4:08 PM, Andy Zhou <az...@ovn.org> wrote:
>> add_deferred_actions() API currently requires actions to be passed in
>> as a fully encoded netlink message. So far both 'sample' and 'recirc'
>> actions happens to carry actions as fully encoded netlink messages.
>> However, this requirement is more restrictive than necessary, future
>> patch will need to pass in action lists that are not fully encoded
>> by themselves.
>
> It is not obvious why this change is required?
> can you explain it.

The original 'attr' requires a nested netlink message for the callee
to get the size of
the actions list.  In the sample case, since we rewrite sample into an
internal format that
does not have actions encoded as a nested netlink, we will need to
pass both pointer
to the first action, and the size of the actions list.


Re: [net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-14 Thread Andy Zhou
> Actions parameter which hints if it is recirc or sample. We can add
> recic-id param and set it if it is recic case.
> will this work?
>
Just posted v2.  I added a patch in the end to implement this
refactoring as suggested.


[net-next sample action optimization v2 0/4]

2017-03-14 Thread Andy Zhou
The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.

Andy Zhou (4):
  openvswitch: Deferred fifo API change.
  openvswitch: Refactor recirc key allocation.
  openvswitch: Optimize sample action for the clone use cases
  Openvswitch: Refactor sample and recirc actions implementation

 include/uapi/linux/openvswitch.h |  13 +++
 net/openvswitch/actions.c| 234 +++
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +++
 4 files changed, 249 insertions(+), 141 deletions(-)

-- 
1.8.3.1



[net-next sample action optimization v2 3/4] openvswitch: Optimize sample action for the clone use cases

2017-03-14 Thread Andy Zhou
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.

The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.

One related optimization attemps to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.

Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |  13 
 net/openvswitch/actions.c| 108 +++---
 net/openvswitch/datapath.h   |   2 -
 net/openvswitch/flow_netlink.c   | 141 +++
 4 files changed, 166 insertions(+), 98 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7f41f7d..0dfe69b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -578,10 +578,23 @@ enum ovs_sample_attr {
OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
__OVS_SAMPLE_ATTR_MAX,
+
+#ifdef __KERNEL__
+   OVS_SAMPLE_ATTR_ARG  /* struct sample_arg  */
+#endif
 };
 
 #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)
 
+#ifdef __KERNEL__
+struct sample_arg {
+   bool exec;   /* When true, actions in sample will not
+   change flow keys. False otherwise. */
+   u32  probability;/* Same value as
+   'OVS_SAMPLE_ATTR_PROBABILITY'. */
+};
+#endif
+
 /**
  * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
  * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 8c9c60c..1638370 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -928,73 +928,71 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, , cutlen);
 }
 
+
+/* When 'last' is true, sample() should always consume the 'skb'.
+ * Otherwise, sample() should keep 'skb' intact regardless what
+ * actions are executed within sample().
+ */
 static int sample(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ bool last)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sw_flow_key *orig_key = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
+   /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
-
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
+   if ((arg->probability != U32_MAX) &&
+   (!arg->probability || prandom_u32() > arg->probability)) {
+   if (last)
+   consume_skb(skb);
+   return 0;
}
 
-   rem = nla_len(acts_list);
-   a = nla_data(acts_list);
-
-   /* Actions list is empty, do nothing */
-   if (u

[net-next sample action optimization v2 4/4] Openvswitch: Refactor sample and recirc actions implementation

2017-03-14 Thread Andy Zhou
Added execute_or_defer_actions() that both sample and recirc
action's implementation can use.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 96 +--
 1 file changed, 59 insertions(+), 37 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1638370..fd7d903 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -44,10 +44,6 @@
 #include "conntrack.h"
 #include "vport.h"
 
-static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key,
- const struct nlattr *attr, int len);
-
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
@@ -166,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key 
*key)
return !(key->mac_proto & SW_FLOW_KEY_INVALID);
 }
 
+static int execute_or_defer_actions(struct datapath *dp, struct sk_buff *skb,
+   struct sw_flow_key *clone,
+   struct sw_flow_key *key,
+   u32 *recirc_id,
+   const struct nlattr *actions, int len);
+
 static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
 __be16 ethertype)
 {
@@ -941,7 +943,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
struct nlattr *sample_arg;
struct sw_flow_key *orig_key = key;
int rem = nla_len(attr);
-   int err = 0;
+   int err;
const struct sample_arg *arg;
 
/* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
@@ -979,15 +981,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
key = clone_key(key);
}
 
-   if (key) {
-   err = do_execute_actions(dp, skb, key, actions, rem);
-   } else if (!add_deferred_actions(skb, orig_key, actions, rem)) {
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop sample 
action\n",
-   ovs_dp_name(dp));
-   kfree_skb(skb);
-   }
+   err = execute_or_defer_actions(dp, skb, key, orig_key, NULL,
+  actions, rem);
 
if (!arg->exec)
__this_cpu_dec(exec_actions_level);
@@ -1105,8 +1100,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
-   struct sw_flow_key *recirc_key;
-   struct deferred_action *da;
+   u32 recirc_id;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1130,27 +1124,9 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
-* recirc immediately, otherwise, defer it for later execution.
-*/
-   recirc_key = clone_key(key);
-   if (recirc_key) {
-   recirc_key->recirc_id = nla_get_u32(a);
-   ovs_dp_process_packet(skb, recirc_key);
-   } else {
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   recirc_key = >pkt_key;
-   recirc_key->recirc_id = nla_get_u32(a);
-   } else {
-   /* Log an error in case action fifo is full.  */
-   kfree_skb(skb);
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
-   ovs_dp_name(dp));
-   }
-   }
-   return 0;
+   recirc_id = nla_get_u32(a);
+   return execute_or_defer_actions(dp, skb, clone_key(key),
+   key, _id, NULL, 0);
 }
 
 /* Execute a list of actions against 'skb'. */
@@ -1286,6 +1262,52 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
return 0;
 }
 
+static int execute_or_defer_actions(struct datapath *dp, struct sk_buff *skb,
+   struct sw_flow_key *clone,
+   struct sw_flow_key *key,
+   u32 *recirc_id,
+   const struct nlattr *actions, int len)
+{
+   struct deferred_action *da;
+
+   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   if (clone) {
+   if (recirc_id) {
+   clone->recirc_id = *recirc_id;
+   ovs_dp_process_packet(skb, clone);
+

[net-next sample action optimization v2 2/4] openvswitch: Refactor recirc key allocation.

2017-03-14 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 66 ---
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..8c9c60c 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -83,14 +83,31 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
+struct action_flow_keys {
struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Return NULL if out of key spaces.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1107,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1132,26 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If within the limit of 'OVS_DEFERRED_ACTION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.  */
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1341,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1353,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[net-next sample action optimization v2 1/4] openvswitch: Deferred fifo API change.

2017-03-14 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



Re: [net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-13 Thread Andy Zhou
>>> -   skb = skb_clone(skb, GFP_ATOMIC);
>>> -   if (!skb)
>>> -   /* Skip the sample action when out of memory. */
>>> -   return 0;
>>> +   if (key) {
>>> +   err = do_execute_actions(dp, skb, key, actions, rem);
>>> +   } else if (!add_deferred_actions(skb, orig, actions, rem)) {
>>>
>> We can refactor this code to avoid duplicate code all actions
>> implementation. This way there could be single function dealing with
>> both defered action and key fifo arrays.
>
> O.K. I will make the change in the next version.

After looking more at it, the sample action and recirc action are different
enough that I don't see clean ways refactor them. For example, recirc
needs to deal with setting recirc_id.  recirc calls
ovs_dp_process_packet, and sample calls do_execute_action.

>> I am not sure if we can put sample or recirc in "may not change flow"
>> actions list. Consider following set of actions:
>> sample(sample(set-IP)),userpsace(),...
>> In this case the userspace action could recieve skb with inconsistent
>> flow due to preceding of set action nested in the sample action.
>
> Good catch.  Will fix.

What's the objection on recirc action? It always works on clone key.


Re: [net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-13 Thread Andy Zhou
Thanks for the review. Please see comments inline.

On Mon, Mar 13, 2017 at 12:08 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Fri, Mar 10, 2017 at 4:51 PM, Andy Zhou <az...@ovn.org> wrote:
>> With the introduction of open flow 'clone' action, the OVS user space
>> can now translate the 'clone' action into kernel datapath 'sample'
>> action, with 100% probability, to ensure that the clone semantics,
>> which is that the packet seen by the clone action is the same as the
>> packet seen by the action after clone, is faithfully carried out
>> in the datapath.
>>
>> While the sample action in the datpath has the matching semantics,
>> its implementation is only optimized for its original use.
>> Specifically, there are two limitation: First, there is a 3 level of
>> nesting restriction, enforced at the flow downloading time. This
>> limit turns out to be too restrictive for the 'clone' use case.
>> Second, the implementation avoid recursive call only if the sample
>> action list has a single userspace action.
>>
>> The main optimization implemented in this series removes the static
>> nesting limit check, instead, implement the run time recursion limit
>> check, and recursion avoidance similar to that of the 'recirc' action.
>> This optimization solve both #1 and #2 issues above.
>>
>> One related optimization attemps to avoid copying flow key as
>> long as the actions enclosed does not change the flow key. The
>> detection is performed only once at the flow downloading time.
>>
>> Another related optimization is to rewrite the action list
>> at flow downloading time in order to save the fast path from parsing
>> the sample action list in its original form repeatedly.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  include/uapi/linux/openvswitch.h |  13 
>>  net/openvswitch/actions.c| 111 ++
>>  net/openvswitch/datapath.h   |   1 +
>>  net/openvswitch/flow_netlink.c   | 126 
>> ---
>>  4 files changed, 162 insertions(+), 89 deletions(-)
>>
> 
> ...
>>  static int sample(struct datapath *dp, struct sk_buff *skb,
>>   struct sw_flow_key *key, const struct nlattr *attr,
>> - const struct nlattr *actions, int actions_len)
>> + bool last)
>>  {
>> -   const struct nlattr *acts_list = NULL;
>> -   const struct nlattr *a;
>> -   int rem;
>> -   u32 cutlen = 0;
>> -
>> -   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
>> -a = nla_next(a, )) {
>> -   u32 probability;
>> +   struct nlattr *actions;
>> +   struct nlattr *sample_arg;
>> +   struct sk_buff *clone_skb;
>> +   struct sw_flow_key *orig = key;
>> +   int rem = nla_len(attr);
>> +   int err = 0;
>> +   const struct sample_arg *arg;
>>
>> -   switch (nla_type(a)) {
>> -   case OVS_SAMPLE_ATTR_PROBABILITY:
>> -   probability = nla_get_u32(a);
>> -   if (!probability || prandom_u32() > probability)
>> -   return 0;
>> -   break;
>> +   /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
>> +   sample_arg = nla_data(attr);
>> +   arg = nla_data(sample_arg);
>> +   actions = nla_next(sample_arg, );
>>
>> -   case OVS_SAMPLE_ATTR_ACTIONS:
>> -   acts_list = a;
>> -   break;
>> -   }
>> +   if ((arg->probability != U32_MAX) &&
>> +   (!arg->probability || prandom_u32() > arg->probability)) {
>> +   if (last)
>> +   consume_skb(skb);
> To simplify let the existing code in do_execute_action() handle
> freeing skb in this case.

In the 'last' case, this function always consumes skb. In case the
probability test passes, this is done by not cloning the skb
before passing the skb to 'do_execute_actions'.  In case the
probability test does not pass, the skb has to be explicitly freed.
Currently, the caller does not know which case it is.  Are you
suggesting that we change the function signature to pass
this information back to the caller? Or something else?
>
>> +   return 0;
>> }
>>
>> -   rem = nla_len(acts_list);
>> -   a = nla_data(acts_list);
>> -
>> -   /* Actions list is empty, do nothing

[net-next sample action optimization 2/3] openvswitch: Refactor recirc key allocation.

2017-03-10 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 71 ---
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..a622c19 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -75,7 +75,7 @@ struct ovs_frag_data {
 
 #define DEFERRED_ACTION_FIFO_SIZE 10
 #define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+#define OVS_DEFERRED_ACITON_THRESHOLD (OVS_RECURSION_LIMIT - 2)
 struct action_fifo {
int head;
int tail;
@@ -83,14 +83,31 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
-   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+struct action_flow_keys {
+   struct sw_flow_key key[OVS_DEFERRED_ACITON_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Return NULL if out of key spaces.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_DEFERRED_ACITON_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1107,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1132,27 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If we are within the limit of 'OVS_DEFERRED_ACITON_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.
+*/
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1342,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1354,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[net-next sample action optimization 0/3]

2017-03-10 Thread Andy Zhou
The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.

Andy Zhou (3):
  openvswitch: Deferred fifo API change.
  openvswitch: Refactor recirc key allocation.
  openvswitch: Optimize sample action for the clone use cases

 include/uapi/linux/openvswitch.h |  13 +++
 net/openvswitch/actions.c| 194 ++-
 net/openvswitch/datapath.h   |   1 +
 net/openvswitch/flow_netlink.c   | 126 +
 4 files changed, 213 insertions(+), 121 deletions(-)

-- 
1.8.3.1



[net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-10 Thread Andy Zhou
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.

The main optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.

One related optimization attemps to avoid copying flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.

Another related optimization is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |  13 
 net/openvswitch/actions.c| 111 ++
 net/openvswitch/datapath.h   |   1 +
 net/openvswitch/flow_netlink.c   | 126 ---
 4 files changed, 162 insertions(+), 89 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 7f41f7d..0dfe69b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -578,10 +578,23 @@ enum ovs_sample_attr {
OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */
OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */
__OVS_SAMPLE_ATTR_MAX,
+
+#ifdef __KERNEL__
+   OVS_SAMPLE_ATTR_ARG  /* struct sample_arg  */
+#endif
 };
 
 #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)
 
+#ifdef __KERNEL__
+struct sample_arg {
+   bool exec;   /* When true, actions in sample will not
+   change flow keys. False otherwise. */
+   u32  probability;/* Same value as
+   'OVS_SAMPLE_ATTR_PROBABILITY'. */
+};
+#endif
+
 /**
  * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
  * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index a622c19..8428438 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -928,73 +928,72 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, , cutlen);
 }
 
+
+/* When 'last' is true, sample() should always consume the 'skb'.
+ * Otherwise, sample() should keep 'skb' intact regardless what
+ * actions are executed within sample().
+ */
 static int sample(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ bool last)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
-
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sk_buff *clone_skb;
+   struct sw_flow_key *orig = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
+   /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
+   if ((arg->probability != U32_MAX) &&
+   (!arg->probability || prandom_u32() > arg->probability)) {
+   if (last)
+   consume_skb(skb);
+   return 0;
}
 
-   rem = nla_len(acts_list);
-   a = nla_data(acts_list);
-
-   /* A

[net-next sample action optimization 1/3] openvswitch: Deferred fifo API change.

2017-03-10 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
Acked-by: Joe Stringer <j...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



Re: [RFC net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-10 Thread Andy Zhou
On Thu, Mar 9, 2017 at 11:46 AM, Joe Stringer <j...@ovn.org> wrote:
> On 7 March 2017 at 16:15, Andy Zhou <az...@ovn.org> wrote:
>> With the introduction of open flow 'clone' action, the OVS user space
>> can now translate the 'clone' action into kernel datapath 'sample'
>> action, with 100% probability, to ensure that the clone semantics,
>> which is that the packet seen by the clone action is the same as the
>> packet seen by the action after clone, is faithfully carried out
>> in the datapath.
>>
>> While the sample action in the datpath has the matching semantics,
>> its implementation is only optimized for its original use.
>> Specifically, there are two limitation: First, there is a 3 level of
>> nesting restriction, enforced at the flow downloading time. This
>> limit turns out to be too restrictive for the 'clone' use case.
>> Second, the implementation avoid recursive call only if the sample
>> action list has a single userspace action.
>>
>> The optimization implemented in this series removes the static
>> nesting limit check, instead, implement the run time recursion limit
>> check, and recursion avoidance similar to that of the 'recirc' action.
>> This optimization solve both #1 and #2 issues above.
>>
>> Another optimization implemented is to avoid coping flow key as
>
> *copying
>
>> long as the actions enclosed does not change the flow key. The
>> detection is performed only once at the flow downloading time.
>>
>> The third optimization implemented is to rewrite the action list
>> at flow downloading time in order to save the fast path from parsing
>> the sample action list in its original form repeatedly.
>
> Whenever there is an enumeration (1, 2, 3; ..another..., third thing
> implemented) in a commit message, I have to ask whether each "another
> change..." should be a separate patch. It certainly makes it easier to
> review.
>
They are all part of the same implementation. Spliting them probably won't
make much sense. I think I will drop #2 and #3 in the commit message since
#1 is the main optimization.

> I ran this through the OVS kernel tests and it's working correctly
> from that point of view, but I didn't get a chance to dig in and
> ensure for example, runtime behaviour of several nested
> sample(actions(sample(actions(sample(actions(output)) handles
> reasonably when it runs out of stack and deferred actions space. At a
> high level though, this seems pretty straightforward.
>
> Several comments below, thanks.
>
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  net/openvswitch/actions.c  | 106 ++--
>>  net/openvswitch/datapath.h |   7 +++
>>  net/openvswitch/flow_netlink.c | 118 
>> -
>>  3 files changed, 140 insertions(+), 91 deletions(-)
>>
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index 259aea9..2e8c372 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -930,71 +930,52 @@ static int output_userspace(struct datapath *dp, 
>> struct sk_buff *skb,
>>  }
>>
>>  static int sample(struct datapath *dp, struct sk_buff *skb,
>> - struct sw_flow_key *key, const struct nlattr *attr,
>> - const struct nlattr *actions, int actions_len)
>> + struct sw_flow_key *key, const struct nlattr *attr)
>>  {
>> -   const struct nlattr *acts_list = NULL;
>> -   const struct nlattr *a;
>> -   int rem;
>> -   u32 cutlen = 0;
>> -
>> -   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
>> -a = nla_next(a, )) {
>> -   u32 probability;
>> -
>> -   switch (nla_type(a)) {
>> -   case OVS_SAMPLE_ATTR_PROBABILITY:
>> -   probability = nla_get_u32(a);
>> -   if (!probability || prandom_u32() > probability)
>> -   return 0;
>> -   break;
>> -
>> -   case OVS_SAMPLE_ATTR_ACTIONS:
>> -   acts_list = a;
>> -   break;
>> -   }
>> -   }
>> +   struct nlattr *actions;
>> +   struct nlattr *sample_arg;
>> +   struct sw_flow_key *orig = key;
>> +   int rem = nla_len(attr);
>> +   int err = 0;
>> +   const struct sample_arg *arg;
>>
>> -   rem = nla_len(acts_list);
>> -   a = nla_data

Re: [RFC net-next sample action optimization 2/3] openvswitch: Refactor recirc key allocation.

2017-03-10 Thread Andy Zhou
On Thu, Mar 9, 2017 at 11:11 AM, Joe Stringer <j...@ovn.org> wrote:
> On 7 March 2017 at 16:15, Andy Zhou <az...@ovn.org> wrote:
>> The logic of allocating and copy key for each 'exec_actions_level'
>> was specific to execute_recirc(). However, future patches will reuse
>> as well.  Refactor the logic into its own function clone_key().
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>
> 
>
>> @@ -83,14 +83,32 @@ struct action_fifo {
>> struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
>>  };
>>
>> -struct recirc_keys {
>> -   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
>> +struct action_flow_keys {
>> +   struct sw_flow_key key[OVS_ACTION_RECURSION_THRESHOLD];
>>  };
>
> I thought the old struct name was clearer on how it would be used -
> for when actions are deferred.

O.K. I will revert it.
>
>>
>>  static struct action_fifo __percpu *action_fifos;
>> -static struct recirc_keys __percpu *recirc_keys;
>> +static struct action_flow_keys __percpu *flow_keys;
>>  static DEFINE_PER_CPU(int, exec_actions_level);
>>
>> +/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
>> + * space. Since the storage is pre-allocated, the caller does not
>> + * need to check for NULL return pointer.
>> + */
>
> Hmm? if level > OVS_ACTION_RECURSION_THRESHOLD, this function returns NULL.
Thanks for catching this. I will update the comment.


[RFC net-next sample action optimization 3/3] openvswitch: Optimize sample action for the clone use cases

2017-03-07 Thread Andy Zhou
With the introduction of open flow 'clone' action, the OVS user space
can now translate the 'clone' action into kernel datapath 'sample'
action, with 100% probability, to ensure that the clone semantics,
which is that the packet seen by the clone action is the same as the
packet seen by the action after clone, is faithfully carried out
in the datapath.

While the sample action in the datpath has the matching semantics,
its implementation is only optimized for its original use.
Specifically, there are two limitation: First, there is a 3 level of
nesting restriction, enforced at the flow downloading time. This
limit turns out to be too restrictive for the 'clone' use case.
Second, the implementation avoid recursive call only if the sample
action list has a single userspace action.

The optimization implemented in this series removes the static
nesting limit check, instead, implement the run time recursion limit
check, and recursion avoidance similar to that of the 'recirc' action.
This optimization solve both #1 and #2 issues above.

Another optimization implemented is to avoid coping flow key as
long as the actions enclosed does not change the flow key. The
detection is performed only once at the flow downloading time.

The third optimization implemented is to rewrite the action list
at flow downloading time in order to save the fast path from parsing
the sample action list in its original form repeatedly.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c  | 106 ++--
 net/openvswitch/datapath.h |   7 +++
 net/openvswitch/flow_netlink.c | 118 -
 3 files changed, 140 insertions(+), 91 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 259aea9..2e8c372 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -930,71 +930,52 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
 }
 
 static int sample(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ struct sw_flow_key *key, const struct nlattr *attr)
 {
-   const struct nlattr *acts_list = NULL;
-   const struct nlattr *a;
-   int rem;
-   u32 cutlen = 0;
-
-   for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
-a = nla_next(a, )) {
-   u32 probability;
-
-   switch (nla_type(a)) {
-   case OVS_SAMPLE_ATTR_PROBABILITY:
-   probability = nla_get_u32(a);
-   if (!probability || prandom_u32() > probability)
-   return 0;
-   break;
-
-   case OVS_SAMPLE_ATTR_ACTIONS:
-   acts_list = a;
-   break;
-   }
-   }
+   struct nlattr *actions;
+   struct nlattr *sample_arg;
+   struct sw_flow_key *orig = key;
+   int rem = nla_len(attr);
+   int err = 0;
+   const struct sample_arg *arg;
 
-   rem = nla_len(acts_list);
-   a = nla_data(acts_list);
+   /* The first action is always 'OVS_SAMPLE_ATTR_AUX'. */
+   sample_arg = nla_data(attr);
+   arg = nla_data(sample_arg);
+   actions = nla_next(sample_arg, );
 
-   /* Actions list is empty, do nothing */
-   if (unlikely(!rem))
-   return 0;
+   if (arg->probability != U32_MAX)
+   if (!arg->probability || prandom_u32() > arg->probability)
+   return 0;
 
-   /* The only known usage of sample action is having a single user-space
-* action, or having a truncate action followed by a single user-space
-* action. Treat this usage as a special case.
-* The output_userspace() should clone the skb to be sent to the
-* user space. This skb will be consumed by its caller.
+   /* In case the sample actions won't change 'key',
+* we can use key for the clone execution.
+* Otherwise, try to allocate a key from the
+* next recursion level of 'flow_keys'. If
+* successful, we can still execute the clone
+* actions  without deferring.
+*
+* Defer the clone action if the action recursion
+* limit has been reached.
 */
-   if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) {
-   struct ovs_action_trunc *trunc = nla_data(a);
-
-   if (skb->len > trunc->max_len)
-   cutlen = skb->len - trunc->max_len;
-
-   a = nla_next(a, );
+   if (!arg->exec) {
+   __this_cpu_inc(exec_actions_level);
+   key = clone_key(key);
}
 
-   if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
-  nla_is_last(a, rem)))
- 

[RFC net-next sample action optimization 2/3] openvswitch: Refactor recirc key allocation.

2017-03-07 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 72 +--
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 75182e9..259aea9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -75,7 +75,7 @@ struct ovs_frag_data {
 
 #define DEFERRED_ACTION_FIFO_SIZE 10
 #define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+#define OVS_ACTION_RECURSION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
 struct action_fifo {
int head;
int tail;
@@ -83,14 +83,32 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
-   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+struct action_flow_keys {
+   struct sw_flow_key key[OVS_ACTION_RECURSION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Since the storage is pre-allocated, the caller does not
+ * need to check for NULL return pointer.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_ACTION_RECURSION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1090,8 +1108,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1115,29 +1133,27 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If we are within the limit of 'OVS_ACTION_RECURSION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.
+*/
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1327,8 +1343,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1339,5 +1355,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[RFC net-next sample action optimization 0/3]

2017-03-07 Thread Andy Zhou
The sample action can be used for translating Openflow 'clone' action.
However its implementation has not been sufficiently optimized for this
use case. This series attempts to close the gap.

Patch 3 commit message has more details on the specific optimizations
implemented.


Andy Zhou (3):
  openvswitch: deferred fifo api change
  openvswitch: Refactor recirc key allocation.
  openvswitch: Optimize sample action for the clone use cases

 net/openvswitch/actions.c  | 190 ++---
 net/openvswitch/datapath.h |   7 ++
 net/openvswitch/flow_netlink.c | 118 +
 3 files changed, 192 insertions(+), 123 deletions(-)

-- 
1.8.3.1



[RFC net-next sample action optimization 1/3] openvswitch: deferred fifo api change

2017-03-07 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c82301c..75182e9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -966,7 +969,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1123,7 +1127,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1278,10 +1282,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



Re: [net-next ovs clone action 3/3] openvswitch: kernel datapath clone action

2017-02-01 Thread Andy Zhou
On Wed, Feb 1, 2017 at 6:30 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Tue, Jan 31, 2017 at 8:47 AM, Andy Zhou <az...@ovn.org> wrote:
>> Add 'clone' kernel datapath support. In case the actions within clone
>> do not modify the current flow, the actions are executed without
>> making a copy of current key before execution. This analysis is
>> done once per flow installation.
>>
>> On the other hand, in case the actions within clone may modify
>> current flow key, a key has to be copied. In case the percpu
>> 'flow_keys' is available for the next 'exec_actions_level', the clone
>> actions will be executed without using the deferred fifo. Otherwise,
>> deferred fifo is used this clone action.
>>
>
> I think we can use sample action to clone packet and apply
> optimization to sample action. That will allow greater use of existing
> action without additional complexity.
>
O.K. I will rework the patch series to apply the optimizations to the sample
action.  Thanks for the review and comment.


[net-next ovs clone action 3/3] openvswitch: kernel datapath clone action

2017-01-31 Thread Andy Zhou
Add 'clone' kernel datapath support. In case the actions within clone
do not modify the current flow, the actions are executed without
making a copy of current key before execution. This analysis is
done once per flow installation.

On the other hand, in case the actions within clone may modify
current flow key, a key has to be copied. In case the percpu
'flow_keys' is available for the next 'exec_actions_level', the clone
actions will be executed without using the deferred fifo. Otherwise,
deferred fifo is used this clone action.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/uapi/linux/openvswitch.h |   1 +
 net/openvswitch/actions.c|  66 ++
 net/openvswitch/datapath.h   |   3 +
 net/openvswitch/flow_netlink.c   | 117 ++-
 4 files changed, 186 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 375d812..910969d 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -780,6 +780,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_TRUNC,/* u32 struct ovs_action_trunc. */
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH,  /* No argument. */
+   OVS_ACTION_ATTR_CLONE,/* Nested OVS_ACTION_ATTR_*.  */
 
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 73bd4ad..b75388f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1156,6 +1156,55 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
 }
 
+static int execute_clone(struct datapath *dp, struct sk_buff *skb,
+struct sw_flow_key *key, const struct nlattr *a)
+{
+   struct nlattr *actions;
+   struct sw_flow_key *orig = key;
+   int rem;
+   int err = 0;
+   bool exec = false;
+
+   actions = nla_data(a);
+   rem = nla_len(a);
+   if (nla_type(a) == OVS_CLONE_ATTR_EXEC) {
+   exec = true;
+   actions = nla_next(actions, );
+   }
+
+   /* In case the clone actions won't change 'key',
+* we can use key for the clone execution.
+* Otherwise, try to allocate a key from the
+* next recursion level of 'flow_keys'. If
+* successful, we can still execute the clone
+* actions  without deferring.
+*
+* Defer the clone action if the action recursion
+* limit has been reached.
+*/
+   if (!exec) {
+   __this_cpu_inc(exec_actions_level);
+   key = clone_key(key);
+   }
+
+   if (key) {
+   err = do_execute_actions(dp, skb, key, actions, rem);
+   } else {
+   struct deferred_action *da;
+
+   da = add_deferred_actions(skb, orig, actions, rem);
+
+   if (!da && net_ratelimit())
+   pr_warn("%s: deferred action limit reached, drop clone 
action\n",
+   ovs_dp_name(dp));
+   }
+
+   if (!exec)
+   __this_cpu_dec(exec_actions_level);
+
+   return err;
+}
+
 /* Execute a list of actions against 'skb'. */
 static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
  struct sw_flow_key *key,
@@ -1271,6 +1320,23 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
+
+   case OVS_ACTION_ATTR_CLONE: {
+   bool last = nla_is_last(a, rem);
+   struct sk_buff *clone_skb;
+
+   clone_skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
+
+   if (!clone_skb)
+   /* Out of memory, skip this clone action.
+*/
+   break;
+
+   err = execute_clone(dp, clone_skb, key, a);
+   if (last)
+   return err;
+   break;
+   }
}
 
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 1c6e937..2ea9f30 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -220,4 +220,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff 
*skb,
if (logging_allowed && net_ratelimit()) \
pr_info("netlink: " fmt "\n", ##__VA_ARGS__);   \
 } while (0)
+
+#define OVS_CLONE_ATTR_EXEC  (OVS_ACTION_ATTR_MAX + 1)
+
 #endif /* datapath.h */
diff --git a/net/openvswitch/flow_netlink.c b/net/open

[net-next ovs clone action 2/3] openvswitch: Refactor recirc key allocation.

2017-01-31 Thread Andy Zhou
The logic of allocating and copy key for each 'exec_actions_level'
was specific to execute_recirc(). However, future patches will reuse
as well.  Refactor the logic into its own function clone_key().

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 72 +--
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 6f025cd..73bd4ad 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -75,7 +75,7 @@ struct ovs_frag_data {
 
 #define DEFERRED_ACTION_FIFO_SIZE 10
 #define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
+#define OVS_ACTION_RECURSION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
 struct action_fifo {
int head;
int tail;
@@ -83,14 +83,32 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
-struct recirc_keys {
-   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+struct action_flow_keys {
+   struct sw_flow_key key[OVS_ACTION_RECURSION_THRESHOLD];
 };
 
 static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Since the storage is pre-allocated, the caller does not
+ * need to check for NULL return pointer.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+   struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+   int level = this_cpu_read(exec_actions_level);
+   struct sw_flow_key *key = NULL;
+
+   if (level <= OVS_ACTION_RECURSION_THRESHOLD) {
+   key = >key[level - 1];
+   *key = *key_;
+   }
+
+   return key;
+}
+
 static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
@@ -1089,8 +1107,8 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *a, int rem)
 {
+   struct sw_flow_key *recirc_key;
struct deferred_action *da;
-   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1114,29 +1132,27 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   level = this_cpu_read(exec_actions_level);
-   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
-   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
-   struct sw_flow_key *recirc_key = >key[level - 1];
-
-   *recirc_key = *key;
+   /* If we are within the limit of 'OVS_ACTION_RECURSION_THRESHOLD',
+* recirc immediately, otherwise, defer it for later execution.
+*/
+   recirc_key = clone_key(key);
+   if (recirc_key) {
recirc_key->recirc_id = nla_get_u32(a);
ovs_dp_process_packet(skb, recirc_key);
-
-   return 0;
-   }
-
-   da = add_deferred_actions(skb, key, NULL, 0);
-   if (da) {
-   da->pkt_key.recirc_id = nla_get_u32(a);
} else {
-   kfree_skb(skb);
-
-   if (net_ratelimit())
-   pr_warn("%s: deferred action limit reached, drop recirc 
action\n",
-   ovs_dp_name(dp));
+   da = add_deferred_actions(skb, key, NULL, 0);
+   if (da) {
+   recirc_key = >pkt_key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   } else {
+   /* Log an error in case action fifo is full.
+*/
+   kfree_skb(skb);
+   if (net_ratelimit())
+   pr_warn("%s: deferred action limit reached, 
drop recirc action\n",
+   ovs_dp_name(dp));
+   }
}
-
return 0;
 }
 
@@ -1326,8 +1342,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
-   recirc_keys = alloc_percpu(struct recirc_keys);
-   if (!recirc_keys) {
+   flow_keys = alloc_percpu(struct action_flow_keys);
+   if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1338,5 +1354,5 @@ int action_fifos_init(void)
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
-   free_percpu(recirc_keys);
+   free_percpu(flow_keys);
 }
-- 
1.8.3.1



[net-next ovs clone action 1/3] openvswitch: deferred fifo api change

2017-01-31 Thread Andy Zhou
add_deferred_actions() API currently requires actions to be passed in
as a fully encoded netlink message. So far both 'sample' and 'recirc'
actions happens to carry actions as fully encoded netlink messages.
However, this requirement is more restrictive than necessary, future
patch will need to pass in action lists that are not fully encoded
by themselves.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index efa9a88..6f025cd 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -51,6 +51,7 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+   int actions_len;
 
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -119,8 +120,9 @@ static struct deferred_action *action_fifo_put(struct 
action_fifo *fifo)
 
 /* Return true if fifo is not full */
 static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
-   const struct sw_flow_key 
*key,
-   const struct nlattr *attr)
+   const struct sw_flow_key *key,
+   const struct nlattr *actions,
+   const int actions_len)
 {
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +131,8 @@ static struct deferred_action *add_deferred_actions(struct 
sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
-   da->actions = attr;
+   da->actions = actions;
+   da->actions_len = actions_len;
da->pkt_key = *key;
}
 
@@ -967,7 +970,8 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
/* Skip the sample action when out of memory. */
return 0;
 
-   if (!add_deferred_actions(skb, key, a)) {
+   if (!add_deferred_actions(skb, key, nla_data(acts_list),
+ nla_len(acts_list))) {
if (net_ratelimit())
pr_warn("%s: deferred actions limit reached, dropping 
sample action\n",
ovs_dp_name(dp));
@@ -1122,7 +1126,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
-   da = add_deferred_actions(skb, key, NULL);
+   da = add_deferred_actions(skb, key, NULL, 0);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
} else {
@@ -1277,10 +1281,10 @@ static void process_deferred_actions(struct datapath 
*dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = >pkt_key;
const struct nlattr *actions = da->actions;
+   int actions_len = da->actions_len;
 
if (actions)
-   do_execute_actions(dp, skb, key, actions,
-  nla_len(actions));
+   do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
-- 
1.8.3.1



Re: [userspace meter v3 1/5] netdev-dummy: Add --len option for netdev-dummy/receive command

2017-01-30 Thread Andy Zhou
Please discard this series. These are ovs user space changes, not
kernel changes. They are sent in error. Sorry.

On Mon, Jan 30, 2017 at 10:39 PM, Andy Zhou <az...@ovn.org> wrote:
> Currently, there is no way to specify the packet size when injecting
> a packet via "netdev-dummy/receive" with a flow specification. Thus
> far, packet size is not important for testing OVS features, but it
> becomes useful in writing unit tests for the meter implementation
> in a later patch.
>
> Signed-off-by: Andy Zhou <az...@ovn.org>
> ---
>  lib/netdev-dummy.c | 38 --
>  1 file changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index e6e36cd..10df0a7 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2010, 2011, 2012, 2013, 2015, 2016 Nicira, Inc.
> + * Copyright (c) 2010, 2011, 2012, 2013, 2015, 2016, 2017 Nicira, Inc.
>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -1433,7 +1433,15 @@ pkt_list_delete(struct ovs_list *l)
>  }
>
>  static struct dp_packet *
> -eth_from_packet_or_flow(const char *s)
> +eth_from_packet(const char *s)
> +{
> +struct dp_packet *packet;
> +eth_from_hex(s, );
> +return packet;
> +}
> +
> +static struct dp_packet *
> +eth_from_flow(const char *s)
>  {
>  enum odp_key_fitness fitness;
>  struct dp_packet *packet;
> @@ -1441,10 +1449,6 @@ eth_from_packet_or_flow(const char *s)
>  struct flow flow;
>  int error;
>
> -if (!eth_from_hex(s, )) {
> -return packet;
> -}
> -
>  /* Convert string to datapath key.
>   *
>   * It would actually be nicer to parse an OpenFlow-like flow key here, 
> but
> @@ -1540,10 +1544,24 @@ netdev_dummy_receive(struct unixctl_conn *conn,
>  for (i = k; i < argc; i++) {
>  struct dp_packet *packet;
>
> -packet = eth_from_packet_or_flow(argv[i]);
> +/* Try to parse 'argv[i]' as packet in hex. */
> +packet = eth_from_packet(argv[i]);
> +
>  if (!packet) {
> -unixctl_command_reply_error(conn, "bad packet syntax");
> -goto exit;
> +/* Try parse 'argv[i]' as odp flow. */
> +packet = eth_from_flow(argv[i]);
> +
> +if (!packet) {
> +unixctl_command_reply_error(conn, "bad packet or flow 
> syntax");
> +goto exit;
> +}
> +
> +/* Parse optional --len argument immediately follow a 'flow'.  */
> +if (argc >= i + 2 && !strcmp(argv[i + 1], "--len")) {
> +int packet_size = strtol(argv[i + 2], NULL, 10);
> +dp_packet_set_size(packet, packet_size);
> +i+=2;
> +}
>  }
>
>  netdev_dummy_queue_packet(dummy_dev, packet, rx_qid);
> @@ -1757,7 +1775,7 @@ void
>  netdev_dummy_register(enum dummy_level level)
>  {
>  unixctl_command_register("netdev-dummy/receive",
> - "name [--qid queue_id] packet|flow...",
> + "name [--qid queue_id] packet|flow [--len 
> packet len]..",
>   2, INT_MAX, netdev_dummy_receive, NULL);
>  unixctl_command_register("netdev-dummy/set-admin-state",
>   "[netdev] up|down", 1, 2,
> --
> 1.9.1
>


Re: [userspace meter v3 0/5] Userspace meter

2017-01-30 Thread Andy Zhou
Please discard this series. These are ovs user space changes, not
kernel changes. They are sent in error. Sorry.


On Mon, Jan 30, 2017 at 10:39 PM, Andy Zhou <az...@ovn.org> wrote:
> Repost user space meter support. This is based Jarno's original work
> at: https://mail.openvswitch.org/pipermail/ovs-dev/2015-November/306304.html.
> With some enhancements, and rebased to current master.
>
> ---
> v1-v2: rebase and repost.
> v2-v3: simplify patch 2/5.
>
> Andy Zhou (2):
>   netdev-dummy: Add --len option for netdev-dummy/receive command
>   dp-packet: Enhance packet batch APIs.
>
> Jarno Rajahalme (3):
>   dpif: Meter framework.
>   ofproto: Meter translation.
>   dpif-netdev: Simple DROP meter implementation.
>
>  datapath/linux/compat/include/linux/openvswitch.h |   4 +-
>  include/openvswitch/ofp-actions.h |   1 +
>  lib/dp-packet.h   | 140 +--
>  lib/dpif-netdev.c | 433 
> --
>  lib/dpif-netlink.c|  46 ++-
>  lib/dpif-provider.h   |  29 ++
>  lib/dpif.c| 128 ++-
>  lib/dpif.h|  13 +-
>  lib/netdev-dummy.c|  48 ++-
>  lib/netdev-linux.c|   3 +-
>  lib/netdev.c  |  24 +-
>  lib/odp-execute.c |  58 +--
>  lib/odp-util.c|  14 +
>  lib/ofp-actions.c |   1 +
>  ofproto/ofproto-dpif-sflow.c  |   1 +
>  ofproto/ofproto-dpif-xlate.c  |  13 +-
>  ofproto/ofproto-dpif.c|  60 ++-
>  ofproto/ofproto-provider.h|  13 +-
>  ofproto/ofproto.c |  51 +--
>  tests/dpif-netdev.at  | 106 ++
>  tests/test-conntrack.c|  56 ++-
>  21 files changed, 1036 insertions(+), 206 deletions(-)
>
> --
> 1.9.1
>


[userspace meter v3 4/5] ofproto: Meter translation.

2017-01-30 Thread Andy Zhou
From: Jarno Rajahalme <ja...@ovn.org>

Translate OpenFlow METER instructions to datapath meter actions.

Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
Signed-off-by: Andy Zhou <az...@ovn.org>
---
 include/openvswitch/ofp-actions.h |  1 +
 lib/dpif.c| 40 +---
 lib/ofp-actions.c |  1 +
 ofproto/ofproto-dpif-xlate.c  | 11 -
 ofproto/ofproto.c | 49 ++-
 5 files changed, 72 insertions(+), 30 deletions(-)

diff --git a/include/openvswitch/ofp-actions.h 
b/include/openvswitch/ofp-actions.h
index 8ca787a..e269901 100644
--- a/include/openvswitch/ofp-actions.h
+++ b/include/openvswitch/ofp-actions.h
@@ -532,6 +532,7 @@ struct ofpact_metadata {
 struct ofpact_meter {
 struct ofpact ofpact;
 uint32_t meter_id;
+uint32_t provider_meter_id;
 };
 
 /* OFPACT_WRITE_ACTIONS, OFPACT_CLONE.
diff --git a/lib/dpif.c b/lib/dpif.c
index 4e9476c..cc49d94 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1104,6 +1104,7 @@ struct dpif_execute_helper_aux {
 struct dpif *dpif;
 const struct flow *flow;
 int error;
+const struct nlattr *meter_action; /* Non-NULL, if have a meter action. */
 };
 
 /* This is called for actions that need the context of the datapath to be
@@ -1119,6 +1120,13 @@ dpif_execute_helper_cb(void *aux_, struct 
dp_packet_batch *packets_,
 ovs_assert(packets_->count == 1);
 
 switch ((enum ovs_action_attr)type) {
+case OVS_ACTION_ATTR_METER:
+/* Maintain a pointer to the first meter action seen. */
+if (!aux->meter_action) {
+aux->meter_action = action;
+}
+   break;
+
 case OVS_ACTION_ATTR_CT:
 case OVS_ACTION_ATTR_OUTPUT:
 case OVS_ACTION_ATTR_TUNNEL_PUSH:
@@ -1129,15 +1137,29 @@ dpif_execute_helper_cb(void *aux_, struct 
dp_packet_batch *packets_,
 struct ofpbuf execute_actions;
 uint64_t stub[256 / 8];
 struct pkt_metadata *md = >md;
-bool dst_set;
 
-dst_set = flow_tnl_dst_is_set(>tunnel);
-if (dst_set) {
+if (flow_tnl_dst_is_set(>tunnel) || aux->meter_action) {
+ofpbuf_use_stub(_actions, stub, sizeof stub);
+
+if (aux->meter_action) {
+const struct nlattr *a = aux->meter_action;
+
+do {
+ofpbuf_put(_actions, a, NLA_ALIGN(a->nla_len));
+/* Find next meter action before 'action', if any. */
+do {
+a = nl_attr_next(a);
+} while (a != action &&
+ nl_attr_type(a) != OVS_ACTION_ATTR_METER);
+} while (a != action);
+}
+
 /* The Linux kernel datapath throws away the tunnel information
  * that we supply as metadata.  We have to use a "set" action to
  * supply it. */
-ofpbuf_use_stub(_actions, stub, sizeof stub);
-odp_put_tunnel_action(>tunnel, _actions);
+if (md->tunnel.ip_dst) {
+odp_put_tunnel_action(>tunnel, _actions);
+}
 ofpbuf_put(_actions, action, NLA_ALIGN(action->nla_len));
 
 execute.actions = execute_actions.data;
@@ -1170,14 +1192,16 @@ dpif_execute_helper_cb(void *aux_, struct 
dp_packet_batch *packets_,
 
 dp_packet_delete(clone);
 
-if (dst_set) {
+if (flow_tnl_dst_is_set(>tunnel) || aux->meter_action) {
 ofpbuf_uninit(_actions);
+
+/* Do not re-use the same meters for later output actions. */
+aux->meter_action = NULL;
 }
 break;
 }
 
 case OVS_ACTION_ATTR_HASH:
-case OVS_ACTION_ATTR_METER:
 case OVS_ACTION_ATTR_PUSH_VLAN:
 case OVS_ACTION_ATTR_POP_VLAN:
 case OVS_ACTION_ATTR_PUSH_MPLS:
@@ -1201,7 +1225,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch 
*packets_,
 static int
 dpif_execute_with_help(struct dpif *dpif, struct dpif_execute *execute)
 {
-struct dpif_execute_helper_aux aux = {dpif, execute->flow, 0};
+struct dpif_execute_helper_aux aux = {dpif, execute->flow, 0, NULL};
 struct dp_packet_batch pb;
 
 COVERAGE_INC(dpif_execute_with_help);
diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c
index cf1ad0f..733b2c5 100644
--- a/lib/ofp-actions.c
+++ b/lib/ofp-actions.c
@@ -6868,6 +6868,7 @@ ofpacts_pull_openflow_instructions(struct ofpbuf 
*openflow,
 
 om = ofpact_put_METER(ofpacts);
 om->meter_id = ntohl(oim->meter_id);
+om->provider_meter_id = UINT32_MAX; /* No provider meter ID. */
 }
 if (insts[OVSINST_OFPIT11_APPLY_ACTIONS]) {
 const struct ofp_action_header *actions;
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 48b27a6..166e236 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++

[userspace meter v3 3/5] dpif: Meter framework.

2017-01-30 Thread Andy Zhou
From: Jarno Rajahalme <ja...@ovn.org>

Add DPIF-level infrastructure for meters.  Allow meter_set to modify
the meter configuration (e.g. set the burst size if unspecified).

Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
Signed-off-by: Andy Zhou <az...@ovn.org>
---
 datapath/linux/compat/include/linux/openvswitch.h |  4 +-
 lib/dpif-netdev.c | 45 
 lib/dpif-netlink.c| 46 +++-
 lib/dpif-provider.h   | 29 
 lib/dpif.c| 88 +++
 lib/dpif.h| 13 +++-
 lib/odp-execute.c |  3 +
 lib/odp-util.c| 14 
 ofproto/ofproto-dpif-sflow.c  |  1 +
 ofproto/ofproto-dpif.c| 60 ++--
 ofproto/ofproto-provider.h| 13 ++--
 ofproto/ofproto.c |  2 +-
 12 files changed, 304 insertions(+), 14 deletions(-)

diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
b/datapath/linux/compat/include/linux/openvswitch.h
index 425d3a4..b121391 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -787,13 +787,14 @@ enum ovs_nat_attr {
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
  * type may not be changed.
  *
- *
  * @OVS_ACTION_ATTR_SET_TO_MASKED: Kernel internal masked set action translated
  * from the @OVS_ACTION_ATTR_SET.
  * @OVS_ACTION_ATTR_TUNNEL_PUSH: Push tunnel header described by struct
  * ovs_action_push_tnl.
  * @OVS_ACTION_ATTR_TUNNEL_POP: Lookup tunnel port by port-no passed and pop
  * tunnel header.
+ * @OVS_ACTION_ATTR_METER: Run packet through a meter, which may drop the
+ * packet, or modify the packet (e.g., change the DSCP field).
  */
 
 enum ovs_action_attr {
@@ -819,6 +820,7 @@ enum ovs_action_attr {
OVS_ACTION_ATTR_TUNNEL_PUSH,   /* struct ovs_action_push_tnl*/
OVS_ACTION_ATTR_TUNNEL_POP,/* u32 port number. */
OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*.  */
+   OVS_ACTION_ATTR_METER, /* u32 meter number. */
 #endif
__OVS_ACTION_ATTR_MAX,/* Nothing past this will be accepted
   * from userspace. */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 719a518..f81b464 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3597,6 +3597,46 @@ dp_netdev_disable_upcall(struct dp_netdev *dp)
 fat_rwlock_wrlock(>upcall_rwlock);
 }
 
+
+/* Meters */
+static void
+dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
+   struct ofputil_meter_features *features)
+{
+features->max_meters = 0;
+features->band_types = 0;
+features->capabilities = 0;
+features->max_bands = 0;
+features->max_color = 0;
+}
+
+static int
+dpif_netdev_meter_set(struct dpif *dpif OVS_UNUSED,
+  ofproto_meter_id *meter_id OVS_UNUSED,
+  struct ofputil_meter_config *config OVS_UNUSED)
+{
+return EFBIG; /* meter_id out of range */
+}
+
+static int
+dpif_netdev_meter_get(const struct dpif *dpif OVS_UNUSED,
+  ofproto_meter_id meter_id OVS_UNUSED,
+  struct ofputil_meter_stats *stats OVS_UNUSED,
+  uint16_t n_bands OVS_UNUSED)
+{
+return EFBIG; /* meter_id out of range */
+}
+
+static int
+dpif_netdev_meter_del(struct dpif *dpif OVS_UNUSED,
+  ofproto_meter_id meter_id OVS_UNUSED,
+  struct ofputil_meter_stats *stats OVS_UNUSED,
+  uint16_t n_bands OVS_UNUSED)
+{
+return EFBIG; /* meter_id out of range */
+}
+
+
 static void
 dpif_netdev_disable_upcall(struct dpif *dpif)
 OVS_NO_THREAD_SAFETY_ANALYSIS
@@ -4721,6 +4761,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch 
*packets_,
 break;
 }
 
+case OVS_ACTION_ATTR_METER:
 case OVS_ACTION_ATTR_PUSH_VLAN:
 case OVS_ACTION_ATTR_POP_VLAN:
 case OVS_ACTION_ATTR_PUSH_MPLS:
@@ -4858,6 +4899,10 @@ const struct dpif_class dpif_netdev_class = {
 dpif_netdev_ct_dump_next,
 dpif_netdev_ct_dump_done,
 dpif_netdev_ct_flush,
+dpif_netdev_meter_get_features,
+dpif_netdev_meter_set,
+dpif_netdev_meter_get,
+dpif_netdev_meter_del,
 };
 
 static void
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index c8b0e37..8a48227 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -2356,6 +2356,46 @@ dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, 
const uint16_t *zone)
 }
 }
 
+
+/* Meters */
+static void
+dpif_netlink_meter_get_features(const struct dpif * dpif OVS_UNUSED,
+struct ofputil_meter_features *featur

[userspace meter v3 5/5] dpif-netdev: Simple DROP meter implementation.

2017-01-30 Thread Andy Zhou
From: Jarno Rajahalme <ja...@ovn.org>

Meters may be used by any flow, so some kind of locking must be used.
In this version we have an adaptive mutex for each meter, which may
not be optimal for DPDK.  However, this should serve as a basis for
further improvement.

A batch of packets is first tried as a whole, and only if some of the
meter bands are hit, we need to process the packets individually.

Signed-off-by: Jarno Rajahalme <ja...@ovn.org>
Signed-off-by: Andy Zhou <az...@ovn.org>
---
 lib/dpif-netdev.c| 362 ---
 tests/dpif-netdev.at | 106 +++
 2 files changed, 450 insertions(+), 18 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f81b464..96c283a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -86,6 +86,8 @@ DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
+enum { MAX_METERS = 65536 };/* Maximum number of meters. */
+enum { MAX_BANDS = 8 }; /* Maximum number of bands / meter. */
 
 /* Protects against changes to 'dp_netdevs'. */
 static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
@@ -193,6 +195,31 @@ static bool dpcls_lookup(struct dpcls *cls,
  struct dpcls_rule **rules, size_t cnt,
  int *num_lookups_p);
 
+/* Set of supported meter flags */
+#define DP_SUPPORTED_METER_FLAGS_MASK \
+(OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
+
+/* Set of supported meter band types */
+#define DP_SUPPORTED_METER_BAND_TYPES   \
+( 1 << OFPMBT13_DROP )
+
+struct dp_meter_band {
+struct ofputil_meter_band up; /* type, prec_level, pad, rate, burst_size */
+uint32_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */
+uint64_t packet_count;
+uint64_t byte_count;
+};
+
+struct dp_meter {
+uint16_t flags;
+uint16_t n_bands;
+uint32_t max_delta_t;
+uint64_t used;
+uint64_t packet_count;
+uint64_t byte_count;
+struct dp_meter_band bands[];
+};
+
 /* Datapath based on the network device interface from netdev.h.
  *
  *
@@ -223,6 +250,11 @@ struct dp_netdev {
 struct hmap ports;
 struct seq *port_seq;   /* Incremented whenever a port changes. */
 
+/* Meters. */
+struct ovs_mutex meter_locks[MAX_METERS];
+struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
+uint32_t meter_free; /* Next free meter. */
+
 /* Protects access to ofproto-dpif-upcall interface during revalidator
  * thread synchronization. */
 struct fat_rwlock upcall_rwlock;
@@ -1059,6 +1091,10 @@ create_dp_netdev(const char *name, const struct 
dpif_class *class,
 dp->reconfigure_seq = seq_create();
 dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
 
+for (int i = 0; i < MAX_METERS; ++i) {
+ovs_mutex_init_adaptive(>meter_locks[i]);
+}
+
 /* Disable upcalls by default. */
 dp_netdev_disable_upcall(dp);
 dp->upcall_aux = NULL;
@@ -1136,6 +1172,16 @@ dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
 fat_rwlock_destroy(>upcall_rwlock);
 }
 
+static void
+dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
+OVS_REQUIRES(dp->meter_locks[meter_id])
+{
+if (dp->meters[meter_id]) {
+free(dp->meters[meter_id]);
+dp->meters[meter_id] = NULL;
+}
+}
+
 /* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
  * through the 'dp_netdevs' shash while freeing 'dp'. */
 static void
@@ -1151,6 +1197,7 @@ dp_netdev_free(struct dp_netdev *dp)
 do_del_port(dp, port);
 }
 ovs_mutex_unlock(>port_mutex);
+
 dp_netdev_destroy_all_pmds(dp, true);
 cmap_destroy(>poll_threads);
 
@@ -1169,6 +1216,13 @@ dp_netdev_free(struct dp_netdev *dp)
 /* Upcalls must be disabled at this point */
 dp_netdev_destroy_upcall_lock(dp);
 
+for (int i = 0; i < MAX_METERS; ++i) {
+ovs_mutex_lock(>meter_locks[i]);
+dp_delete_meter(dp, i);
+ovs_mutex_unlock(>meter_locks[i]);
+ovs_mutex_destroy(>meter_locks[i]);
+}
+
 free(dp->pmd_cmask);
 free(CONST_CAST(char *, dp->name));
 free(dp);
@@ -3603,37 +3657,304 @@ static void
 dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
struct ofputil_meter_features *features)
 {
-features->max_meters = 0;
-features->band_types = 0;
-features->capabilities = 0;
-features->max_bands = 0;
+features->max_meters = MAX_METERS;
+features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
+features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
+features->max_bands = MAX_BANDS;
 features->max_color = 0;
 }
 
+/* Returns false when packet needs to be dropped. */
+static voi

[userspace meter v3 2/5] dp-packet: Enhance packet batch APIs.

2017-01-30 Thread Andy Zhou
One common use case of 'struct dp_packet_batch' is to process all
packets in the batch in order. Add an iterator for this use case
to simplify the logic of calling sites,

Another common use case is to drop packets in the batch, by reading
all packets, but writing back pointers of fewer packets. Add macros
to support this use case.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 lib/dp-packet.h  | 140 +++
 lib/dpif-netdev.c|  62 +--
 lib/dpif.c   |   2 +-
 lib/netdev-dummy.c   |  10 ++--
 lib/netdev-linux.c   |   3 +-
 lib/netdev.c |  24 
 lib/odp-execute.c|  55 -
 ofproto/ofproto-dpif-xlate.c |   2 +-
 tests/test-conntrack.c   |  56 +++--
 9 files changed, 201 insertions(+), 153 deletions(-)

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index cf7d247..d0c14a5 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -632,79 +632,143 @@ reset_dp_packet_checksum_ol_flags(struct dp_packet *p)
 enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
 
 struct dp_packet_batch {
-int count;
+size_t count;
 bool trunc; /* true if the batch needs truncate. */
 struct dp_packet *packets[NETDEV_MAX_BURST];
 };
 
-static inline void dp_packet_batch_init(struct dp_packet_batch *b)
+static inline void
+dp_packet_batch_init(struct dp_packet_batch *batch)
 {
-b->count = 0;
-b->trunc = false;
+batch->count = 0;
+batch->trunc = false;
 }
 
 static inline void
-dp_packet_batch_clone(struct dp_packet_batch *dst,
-  struct dp_packet_batch *src)
+dp_packet_batch_add__(struct dp_packet_batch *batch,
+  struct dp_packet *packet, size_t limit)
 {
-int i;
-
-for (i = 0; i < src->count; i++) {
-dst->packets[i] = dp_packet_clone(src->packets[i]);
+if (batch->count < limit) {
+batch->packets[batch->count++] = packet;
+} else {
+dp_packet_delete(packet);
 }
-dst->count = src->count;
-dst->trunc = src->trunc;
 }
 
 static inline void
-packet_batch_init_packet(struct dp_packet_batch *b, struct dp_packet *p)
+dp_packet_batch_add(struct dp_packet_batch *batch, struct dp_packet *packet)
+{
+dp_packet_batch_add__(batch, packet, NETDEV_MAX_BURST);
+}
+
+static inline size_t
+dp_packet_batch_size(const struct dp_packet_batch *batch)
+{
+return batch->count;
+}
+
+/*
+ * Clear 'batch' for refill. Use dp_packet_batch_refill() to add
+ * packets back into the 'batch'.
+ *
+ * Return the original size of the 'batch'.  */
+static inline void
+dp_packet_batch_refill_init(struct dp_packet_batch *batch)
 {
-b->count = 1;
-b->trunc = false;
-b->packets[0] = p;
+batch->count = 0;
+};
+
+static inline void
+dp_packet_batch_refill(struct dp_packet_batch *batch,
+   struct dp_packet *packet, size_t idx)
+{
+dp_packet_batch_add__(batch, packet, MIN(NETDEV_MAX_BURST, idx + 1));
+}
+
+static inline void
+dp_packet_batch_init_packet(struct dp_packet_batch *batch, struct dp_packet *p)
+{
+dp_packet_batch_init(batch);
+batch->count = 1;
+batch->packets[0] = p;
+}
+
+static inline bool
+dp_packet_batch_is_empty(const struct dp_packet_batch *batch)
+{
+return !dp_packet_batch_size(batch);
+}
+
+#define DP_PACKET_BATCH_FOR_EACH(PACKET, BATCH)\
+for (size_t i = 0; i < dp_packet_batch_size(BATCH); i++)  \
+if ((PACKET = BATCH->packets[i]) != NULL)
+
+/* Use this macro for cases where some packets in the 'BATCH' may be
+ * dropped after going through each packet in the 'BATCH'.
+ *
+ * For packets to stay in the 'BATCH', they need to be refilled back
+ * into the 'BATCH' by calling dp_packet_batch_refill(). Caller owns
+ * the packets that are not refilled.
+ *
+ * Caller needs to supply 'SIZE', that stores the current number of
+ * packets in 'BATCH'. It is best to declare this variable with
+ * the 'const' modifier since it should not be modified by
+ * the iterator.  */
+#define DP_PACKET_BATCH_REFILL_FOR_EACH(IDX, SIZE, PACKET, BATCH)   \
+for (dp_packet_batch_refill_init(BATCH), IDX=0; IDX < SIZE; IDX++)  \
+ if ((PACKET = BATCH->packets[IDX]) != NULL)
+
+static inline void
+dp_packet_batch_clone(struct dp_packet_batch *dst,
+  struct dp_packet_batch *src)
+{
+struct dp_packet *packet;
+
+dp_packet_batch_init(dst);
+DP_PACKET_BATCH_FOR_EACH (packet, src) {
+dp_packet_batch_add(dst, dp_packet_clone(packet));
+}
 }
 
 static inline void
 dp_packet_delete_batch(struct dp_packet_batch *batch, bool may_steal)
 {
 if (may_steal) {
-int i;
+struct dp_packet *packet;
 
-for (i = 0; i < batch->count; i++) {
-dp_packet_delete(batch->packets[i]);
+DP_PACKET_BATCH_FOR_EACH (packet

[userspace meter v3 1/5] netdev-dummy: Add --len option for netdev-dummy/receive command

2017-01-30 Thread Andy Zhou
Currently, there is no way to specify the packet size when injecting
a packet via "netdev-dummy/receive" with a flow specification. Thus
far, packet size is not important for testing OVS features, but it
becomes useful in writing unit tests for the meter implementation
in a later patch.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 lib/netdev-dummy.c | 38 --
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index e6e36cd..10df0a7 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010, 2011, 2012, 2013, 2015, 2016 Nicira, Inc.
+ * Copyright (c) 2010, 2011, 2012, 2013, 2015, 2016, 2017 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1433,7 +1433,15 @@ pkt_list_delete(struct ovs_list *l)
 }
 
 static struct dp_packet *
-eth_from_packet_or_flow(const char *s)
+eth_from_packet(const char *s)
+{
+struct dp_packet *packet;
+eth_from_hex(s, );
+return packet;
+}
+
+static struct dp_packet *
+eth_from_flow(const char *s)
 {
 enum odp_key_fitness fitness;
 struct dp_packet *packet;
@@ -1441,10 +1449,6 @@ eth_from_packet_or_flow(const char *s)
 struct flow flow;
 int error;
 
-if (!eth_from_hex(s, )) {
-return packet;
-}
-
 /* Convert string to datapath key.
  *
  * It would actually be nicer to parse an OpenFlow-like flow key here, but
@@ -1540,10 +1544,24 @@ netdev_dummy_receive(struct unixctl_conn *conn,
 for (i = k; i < argc; i++) {
 struct dp_packet *packet;
 
-packet = eth_from_packet_or_flow(argv[i]);
+/* Try to parse 'argv[i]' as packet in hex. */
+packet = eth_from_packet(argv[i]);
+
 if (!packet) {
-unixctl_command_reply_error(conn, "bad packet syntax");
-goto exit;
+/* Try parse 'argv[i]' as odp flow. */
+packet = eth_from_flow(argv[i]);
+
+if (!packet) {
+unixctl_command_reply_error(conn, "bad packet or flow syntax");
+goto exit;
+}
+
+/* Parse optional --len argument immediately follow a 'flow'.  */
+if (argc >= i + 2 && !strcmp(argv[i + 1], "--len")) {
+int packet_size = strtol(argv[i + 2], NULL, 10);
+dp_packet_set_size(packet, packet_size);
+i+=2;
+}
 }
 
 netdev_dummy_queue_packet(dummy_dev, packet, rx_qid);
@@ -1757,7 +1775,7 @@ void
 netdev_dummy_register(enum dummy_level level)
 {
 unixctl_command_register("netdev-dummy/receive",
- "name [--qid queue_id] packet|flow...",
+ "name [--qid queue_id] packet|flow [--len packet 
len]..",
  2, INT_MAX, netdev_dummy_receive, NULL);
 unixctl_command_register("netdev-dummy/set-admin-state",
  "[netdev] up|down", 1, 2,
-- 
1.9.1



[userspace meter v3 0/5] Userspace meter

2017-01-30 Thread Andy Zhou
Repost user space meter support. This is based Jarno's original work
at: https://mail.openvswitch.org/pipermail/ovs-dev/2015-November/306304.html.
With some enhancements, and rebased to current master.

---
v1-v2: rebase and repost.
v2-v3: simplify patch 2/5. 

Andy Zhou (2):
  netdev-dummy: Add --len option for netdev-dummy/receive command
  dp-packet: Enhance packet batch APIs.

Jarno Rajahalme (3):
  dpif: Meter framework.
  ofproto: Meter translation.
  dpif-netdev: Simple DROP meter implementation.

 datapath/linux/compat/include/linux/openvswitch.h |   4 +-
 include/openvswitch/ofp-actions.h |   1 +
 lib/dp-packet.h   | 140 +--
 lib/dpif-netdev.c | 433 --
 lib/dpif-netlink.c|  46 ++-
 lib/dpif-provider.h   |  29 ++
 lib/dpif.c| 128 ++-
 lib/dpif.h|  13 +-
 lib/netdev-dummy.c|  48 ++-
 lib/netdev-linux.c|   3 +-
 lib/netdev.c  |  24 +-
 lib/odp-execute.c |  58 +--
 lib/odp-util.c|  14 +
 lib/ofp-actions.c |   1 +
 ofproto/ofproto-dpif-sflow.c  |   1 +
 ofproto/ofproto-dpif-xlate.c  |  13 +-
 ofproto/ofproto-dpif.c|  60 ++-
 ofproto/ofproto-provider.h|  13 +-
 ofproto/ofproto.c |  51 +--
 tests/dpif-netdev.at  | 106 ++
 tests/test-conntrack.c|  56 ++-
 21 files changed, 1036 insertions(+), 206 deletions(-)

-- 
1.9.1



[net-next v2] openvswitch: Simplify do_execute_actions().

2017-01-27 Thread Andy Zhou
do_execute_actions() implements a worthwhile optimization: in case
an output action is the last action in an action list, skb_clone()
can be avoided by outputing the current skb. However, the
implementation is more complicated than necessary.  This patch
simplify this logic.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
v1->v2:  drop skb NULL check in do_output()


---
 net/openvswitch/actions.c | 42 --
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bc..efa9a88 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1141,12 +1141,6 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *attr, int len)
 {
-   /* Every output action needs a separate clone of 'skb', but the common
-* case is just a single output action, so that doing a clone and
-* then freeing the original skbuff is wasteful.  So the following code
-* is slightly obscure just to avoid that.
-*/
-   int prev_port = -1;
const struct nlattr *a;
int rem;
 
@@ -1154,20 +1148,28 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
 a = nla_next(a, )) {
int err = 0;
 
-   if (unlikely(prev_port != -1)) {
-   struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
-   if (out_skb)
-   do_output(dp, out_skb, prev_port, key);
+   switch (nla_type(a)) {
+   case OVS_ACTION_ATTR_OUTPUT: {
+   int port = nla_get_u32(a);
+   struct sk_buff *clone;
+
+   /* Every output action needs a separate clone
+* of 'skb', In case the output action is the
+* last action, cloning can be avoided.
+*/
+   if (nla_is_last(a, rem)) {
+   do_output(dp, skb, port, key);
+   /* 'skb' has been used for output.
+*/
+   return 0;
+   }
 
+   clone = skb_clone(skb, GFP_ATOMIC);
+   if (clone)
+   do_output(dp, clone, port, key);
OVS_CB(skb)->cutlen = 0;
-   prev_port = -1;
-   }
-
-   switch (nla_type(a)) {
-   case OVS_ACTION_ATTR_OUTPUT:
-   prev_port = nla_get_u32(a);
break;
+   }
 
case OVS_ACTION_ATTR_TRUNC: {
struct ovs_action_trunc *trunc = nla_data(a);
@@ -1257,11 +1259,7 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
}
}
 
-   if (prev_port != -1)
-   do_output(dp, skb, prev_port, key);
-   else
-   consume_skb(skb);
-
+   consume_skb(skb);
return 0;
 }
 
-- 
1.8.3.1



Re: [net-next] openvswitch: Simplify do_execute_actions().

2017-01-27 Thread Andy Zhou
On Fri, Jan 27, 2017 at 12:42 PM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Wed, Jan 25, 2017 at 9:24 PM, Andy Zhou <az...@ovn.org> wrote:
>> do_execute_actions() implements a worthwhile optimization: in case
>> an output action is the last action in an action list, skb_clone()
>> can be avoided by outputing the current skb. However, the
>> implementation is more complicated than necessary.  This patch
>> simplify this logic.
>>
>> Signed-off-by: Andy Zhou <az...@ovn.org>
>> ---
>>  net/openvswitch/actions.c | 40 +++-
>>  1 file changed, 19 insertions(+), 21 deletions(-)
>>
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index 514f7bc..3866608 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -830,6 +830,9 @@ static void do_output(struct datapath *dp, struct 
>> sk_buff *skb, int out_port,
>>  {
>> struct vport *vport = ovs_vport_rcu(dp, out_port);
>>
>> +   if (unlikely(!skb))
>> +   return;
>> +
> Patch looks good to me. But I wanted to know if you considered moving
> this check to do_execute_actions() in case skb-clone is done? This way
> we can avoid this unlikely check from likely case :)
>
Good point.  O.K. I will repost a version without this check.  Thanks
for the review and comment.


[net-next] openvswitch: Simplify do_execute_actions().

2017-01-25 Thread Andy Zhou
do_execute_actions() implements a worthwhile optimization: in case
an output action is the last action in an action list, skb_clone()
can be avoided by outputing the current skb. However, the
implementation is more complicated than necessary.  This patch
simplify this logic.

Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 40 +++-
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bc..3866608 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -830,6 +830,9 @@ static void do_output(struct datapath *dp, struct sk_buff 
*skb, int out_port,
 {
struct vport *vport = ovs_vport_rcu(dp, out_port);
 
+   if (unlikely(!skb))
+   return;
+
if (likely(vport)) {
u16 mru = OVS_CB(skb)->mru;
u32 cutlen = OVS_CB(skb)->cutlen;
@@ -1141,12 +1144,6 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
  struct sw_flow_key *key,
  const struct nlattr *attr, int len)
 {
-   /* Every output action needs a separate clone of 'skb', but the common
-* case is just a single output action, so that doing a clone and
-* then freeing the original skbuff is wasteful.  So the following code
-* is slightly obscure just to avoid that.
-*/
-   int prev_port = -1;
const struct nlattr *a;
int rem;
 
@@ -1154,20 +1151,25 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
 a = nla_next(a, )) {
int err = 0;
 
-   if (unlikely(prev_port != -1)) {
-   struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
+   switch (nla_type(a)) {
+   case OVS_ACTION_ATTR_OUTPUT: {
+   int port = nla_get_u32(a);
 
-   if (out_skb)
-   do_output(dp, out_skb, prev_port, key);
+   /* Every output action needs a separate clone
+* of 'skb', In case the output action is the
+* last action, cloning can be avoided.
+*/
+   if (nla_is_last(a, rem)) {
+   do_output(dp, skb, port, key);
+   /* 'skb' has been used for output.
+*/
+   return 0;
+   }
 
+   do_output(dp, skb_clone(skb, GFP_ATOMIC), port, key);
OVS_CB(skb)->cutlen = 0;
-   prev_port = -1;
-   }
-
-   switch (nla_type(a)) {
-   case OVS_ACTION_ATTR_OUTPUT:
-   prev_port = nla_get_u32(a);
break;
+   }
 
case OVS_ACTION_ATTR_TRUNC: {
struct ovs_action_trunc *trunc = nla_data(a);
@@ -1257,11 +1259,7 @@ static int do_execute_actions(struct datapath *dp, 
struct sk_buff *skb,
}
}
 
-   if (prev_port != -1)
-   do_output(dp, skb, prev_port, key);
-   else
-   consume_skb(skb);
-
+   consume_skb(skb);
return 0;
 }
 
-- 
1.8.3.1



[net] openvswitch: Allow deferred action fifo to expand during run time

2016-03-19 Thread Andy Zhou
Current openvswitch implementation allows up to 10 recirculation actions
for each packet. This limit was sufficient for most use cases in the
past, but with more new features, such as supporting connection
tracking, and testing in larger scale network environment,
This limit may be too restrictive.

An obvious design choice is to increase the hard coded limit
from 10 to a larger number.  However, there are two draw backs to this
approach. First, it is not obvious what the upper limit should be.
Second, the fifos are allocated per CPU core. Statically allocated
large buffers will be over kill for most packet execution, and will
increase the module's memory footprint unnecessarily for some
smaller scale applications.

This patch makes a different design choice that accommodates packets
that require more than 10 recirculation actions without significantly
increase module's memory footprint.  kmalloc() is used to expand
deferred action fifo at run time whenever a packet's action contains
more than 10 recirculation actions. The dynamically allocated fifos are
freed as soon as the packet's actions execution are completed.

Reported-and-tested-by: Ramu Ramamurthy <srama...@linux.vnet.ibm.com>
Reported-at: http://openvswitch.org/pipermail/dev/2016-March/067672.html
Signed-off-by: Andy Zhou <az...@ovn.org>
---
 net/openvswitch/actions.c | 45 -
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2d59df5..eb068c7 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -70,12 +70,14 @@ struct ovs_frag_data {
 
 static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
 
-#define DEFERRED_ACTION_FIFO_SIZE 10
+#define DEFERRED_ACTION_FIFO_INIT_SIZE 10
 struct action_fifo {
int head;
int tail;
/* Deferred action fifo queue storage. */
-   struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
+   struct deferred_action *fifo;
+   size_t size; /* Current size of the fifo in bytes */
+   struct deferred_action initial_fifo[DEFERRED_ACTION_FIFO_INIT_SIZE];
 };
 
 static struct action_fifo __percpu *action_fifos;
@@ -85,6 +87,16 @@ static void action_fifo_init(struct action_fifo *fifo)
 {
fifo->head = 0;
fifo->tail = 0;
+   fifo->fifo = fifo->initial_fifo;
+   fifo->size = sizeof(*fifo->fifo) * DEFERRED_ACTION_FIFO_INIT_SIZE;
+}
+
+static void action_fifo_clear(struct action_fifo *fifo)
+{
+   if (fifo->fifo != fifo->initial_fifo)
+   kfree(fifo->fifo);
+
+   action_fifo_init(fifo);
 }
 
 static bool action_fifo_is_empty(const struct action_fifo *fifo)
@@ -102,8 +114,22 @@ static struct deferred_action *action_fifo_get(struct 
action_fifo *fifo)
 
 static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
 {
-   if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1)
-   return NULL;
+   if (fifo->head >= fifo->size / sizeof(*fifo->fifo) - 1) {
+   /* out of fifo buffer space, try to allocate a new fifo
+* buffer. */
+   struct deferred_action *new_fifo;
+   int new_size = fifo->size * 2;
+
+   new_fifo = kmalloc(new_size, GFP_ATOMIC);
+   if (new_fifo) {
+   memcpy(new_fifo, fifo->fifo, fifo->size);
+   if (fifo->fifo != fifo->initial_fifo)
+   kfree(fifo->fifo);
+   fifo->fifo = new_fifo;
+   fifo->size = new_size;
+   } else
+   return NULL;
+   }
 
return >fifo[fifo->head++];
 }
@@ -1152,7 +1178,7 @@ static void process_deferred_actions(struct datapath *dp)
} while (!action_fifo_is_empty(fifo));
 
/* Reset FIFO for the next packet.  */
-   action_fifo_init(fifo);
+   action_fifo_clear(fifo);
 }
 
 /* Execute a list of actions against 'skb'. */
@@ -1185,10 +1211,19 @@ out:
 
 int action_fifos_init(void)
 {
+   int i;
+
action_fifos = alloc_percpu(struct action_fifo);
if (!action_fifos)
return -ENOMEM;
 
+   /* Use pre allocated fifo. */
+   for_each_possible_cpu(i) {
+   struct action_fifo *action_fifo;
+
+   action_fifo = per_cpu_ptr(action_fifos, i);
+   action_fifo_init(action_fifo);
+   }
return 0;
 }
 
-- 
1.9.1