[jkirsher/next-queue PATCH v2 2/7] net: Add support for subordinate device traffic classes

2018-07-09 Thread Alexander Duyck
This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUQUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to a s16 value and use the negative values to represent the suboordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |   16 
 net/core/dev.c|   89 +
 net/core/net-sysfs.c  |   21 ++-
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971..4648a9a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,6 +575,9 @@ struct netdev_queue {
 * (/sys/class/net/DEV/Q/trans_timeout)
 */
unsigned long   trans_timeout;
+
+   /* Suboordinate device that the queue has been assigned to */
+   struct net_device   *sb_dev;
 /*
  * write-mostly part
  */
@@ -1991,7 +1994,7 @@ struct net_device {
 #ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-   u8  num_tc;
+   s16 num_tc;
struct netdev_tc_txqtc_to_txq[TC_MAX_QUEUE];
u8  prio_tc_map[TC_BITMASK + 1];
 
@@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev)
return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+struct net_device *sb_dev,
+u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+   return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1..cc1d6bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
struct netdev_tc_txq *tc = >tc_to_txq[0];
int i;
 
+   /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
 
+   /* didn't find it, just return -1 to indicate no match */
return -1;
}
 
@@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
unsigned int nr_ids;
 
if (dev->num_tc) {
+   /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+   if (num_tc < 0)
+   return -EINVAL;
+
+   /* If queue belongs to subordinate dev use its map */
+   dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+   struct netdev_queue *txq = >_tx[dev->num_tx_queues];
+
+   /* Unbind any subordinate channels */
+   while (txq-- != >_tx[0]) {
+   if (txq->sb_dev)
+   netdev_unbind_sb_channel(dev, txq->sb_dev);
+   }
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
 #endif
+   netdev_unbind_all_sb_channels(dev);
+
+   /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS

Re: [Intel-wired-lan] [jkirsher/next-queue PATCH v2 2/7] net: Add support for subordinate device traffic classes

2018-06-12 Thread Alexander Duyck
On Tue, Jun 12, 2018 at 10:49 AM, Florian Fainelli  wrote:
> On 06/12/2018 08:18 AM, Alexander Duyck wrote:
>> This patch is meant to provide the basic tools needed to allow us to create
>> subordinate device traffic classes. The general idea here is to allow
>> subdividing the queues of a device into queue groups accessible through an
>> upper device such as a macvlan.
>>
>> The idea here is to enforce the idea that an upper device has to be a
>> single queue device, ideally with IFF_NO_QUQUE set. With that being the
>> case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
>> for the upper device are unused. As such we could reuse those in order to
>> support subdividing the lower device and distributing those queues between
>> the subordinate devices.
>
> This is not necessarily a valid paradigm to work with. For instance in
> DSA we have IFF_NO_QUEUE devices, but we still expose multiple egress
> queues because that is how an application can choose how it wants to get
> packets transmitted at the switch level. We have a 1:1 representation
> between a queue at the net_device level, and what an egress queue at the
> switch level is, so things like buffer reservation etc. can be configured.

I'm not saying that IFF_NO_QUEUE implies that a device is single
queue, but in this case we enforce that the upper device has to be a
single queue device so that the code in netdev_pick_tx will ignore the
XPS and tc_to_txq mappings for that netdev. I had mentioned
IFF_NO_QUEUE as a suggestion as that allows us to avoid head-of-line
blocking if the lower device starts to apply back-pressure.

> I think you should consider that an upper device might want to have a
> 1:1 mapping to the lower device's queues and make that permissible.
> Thoughts?

I had considered that. However the issue becomes that at that point it
makes the setup much more rigid. With this approach I can enable and
disable the offload without needing to stop the upper device to either
create or remove qdiscs. I would much rather keep the upper device
generic and leave it to the lower device to populate the rings and
such.


Re: [jkirsher/next-queue PATCH v2 2/7] net: Add support for subordinate device traffic classes

2018-06-12 Thread Florian Fainelli
On 06/12/2018 08:18 AM, Alexander Duyck wrote:
> This patch is meant to provide the basic tools needed to allow us to create
> subordinate device traffic classes. The general idea here is to allow
> subdividing the queues of a device into queue groups accessible through an
> upper device such as a macvlan.
> 
> The idea here is to enforce the idea that an upper device has to be a
> single queue device, ideally with IFF_NO_QUQUE set. With that being the
> case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
> for the upper device are unused. As such we could reuse those in order to
> support subdividing the lower device and distributing those queues between
> the subordinate devices.

This is not necessarily a valid paradigm to work with. For instance in
DSA we have IFF_NO_QUEUE devices, but we still expose multiple egress
queues because that is how an application can choose how it wants to get
packets transmitted at the switch level. We have a 1:1 representation
between a queue at the net_device level, and what an egress queue at the
switch level is, so things like buffer reservation etc. can be configured.

I think you should consider that an upper device might want to have a
1:1 mapping to the lower device's queues and make that permissible.
Thoughts?

> 
> In order to distinguish between a regular set of traffic classes and if a
> device is carrying subordinate traffic classes I changed num_tc from a u8
> to a s16 value and use the negative values to represent the suboordinate
> pool values. So starting at -1 and running to -32768 we can encode those as
> pool values, and the existing values of 0 to 15 can be maintained.
> 
> Signed-off-by: Alexander Duyck 
> ---
>  include/linux/netdevice.h |   16 
>  net/core/dev.c|   89 
> +
>  net/core/net-sysfs.c  |   21 ++-
>  3 files changed, 124 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3ec9850..41b4660 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -569,6 +569,9 @@ struct netdev_queue {
>* (/sys/class/net/DEV/Q/trans_timeout)
>*/
>   unsigned long   trans_timeout;
> +
> + /* Suboordinate device that the queue has been assigned to */
> + struct net_device   *sb_dev;
>  /*
>   * write-mostly part
>   */
> @@ -1978,7 +1981,7 @@ struct net_device {
>  #ifdef CONFIG_DCB
>   const struct dcbnl_rtnl_ops *dcbnl_ops;
>  #endif
> - u8  num_tc;
> + s16 num_tc;
>   struct netdev_tc_txqtc_to_txq[TC_MAX_QUEUE];
>   u8  prio_tc_map[TC_BITMASK + 1];
>  
> @@ -2032,6 +2035,17 @@ int netdev_get_num_tc(struct net_device *dev)
>   return dev->num_tc;
>  }
>  
> +void netdev_unbind_sb_channel(struct net_device *dev,
> +   struct net_device *sb_dev);
> +int netdev_bind_sb_channel_queue(struct net_device *dev,
> +  struct net_device *sb_dev,
> +  u8 tc, u16 count, u16 offset);
> +int netdev_set_sb_channel(struct net_device *dev, u16 channel);
> +static inline int netdev_get_sb_channel(struct net_device *dev)
> +{
> + return max_t(int, -dev->num_tc, 0);
> +}
> +
>  static inline
>  struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
>unsigned int index)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 6e18242..27fe4f2 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2068,11 +2068,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
> int txq)
>   struct netdev_tc_txq *tc = >tc_to_txq[0];
>   int i;
>  
> + /* walk through the TCs and see if it falls into any of them */
>   for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
>   if ((txq - tc->offset) < tc->count)
>   return i;
>   }
>  
> + /* didn't find it, just return -1 to indicate no match */
>   return -1;
>   }
>  
> @@ -2215,7 +2217,14 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
>   bool active = false;
>  
>   if (dev->num_tc) {
> + /* Do not allow XPS on subordinate device directly */
>   num_tc = dev->num_tc;
> + if (num_tc < 0)
> + return -EINVAL;
> +
> + /* If queue belongs to subordinate dev use its map */
> + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> +
>   tc = netdev_txq_to_tc(dev, index);
>   if (tc < 0)
>   return -EINVAL;
> @@ -2366,11 +2375,25 @@ int netif_set_xps_queue(struct net_device *dev, const 
> struct cpumask *mask,
>  EXPORT_SYMBOL(netif_set_xps_queue);
>  
>  #endif
> +static void 

[jkirsher/next-queue PATCH v2 2/7] net: Add support for subordinate device traffic classes

2018-06-12 Thread Alexander Duyck
This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUQUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to a s16 value and use the negative values to represent the suboordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |   16 
 net/core/dev.c|   89 +
 net/core/net-sysfs.c  |   21 ++-
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850..41b4660 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -569,6 +569,9 @@ struct netdev_queue {
 * (/sys/class/net/DEV/Q/trans_timeout)
 */
unsigned long   trans_timeout;
+
+   /* Suboordinate device that the queue has been assigned to */
+   struct net_device   *sb_dev;
 /*
  * write-mostly part
  */
@@ -1978,7 +1981,7 @@ struct net_device {
 #ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-   u8  num_tc;
+   s16 num_tc;
struct netdev_tc_txqtc_to_txq[TC_MAX_QUEUE];
u8  prio_tc_map[TC_BITMASK + 1];
 
@@ -2032,6 +2035,17 @@ int netdev_get_num_tc(struct net_device *dev)
return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+struct net_device *sb_dev,
+u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+   return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 6e18242..27fe4f2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2068,11 +2068,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
struct netdev_tc_txq *tc = >tc_to_txq[0];
int i;
 
+   /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
 
+   /* didn't find it, just return -1 to indicate no match */
return -1;
}
 
@@ -2215,7 +2217,14 @@ int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask,
bool active = false;
 
if (dev->num_tc) {
+   /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+   if (num_tc < 0)
+   return -EINVAL;
+
+   /* If queue belongs to subordinate dev use its map */
+   dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -2366,11 +2375,25 @@ int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+   struct netdev_queue *txq = >_tx[dev->num_tx_queues];
+
+   /* Unbind any subordinate channels */
+   while (txq-- != >_tx[0]) {
+   if (txq->sb_dev)
+   netdev_unbind_sb_channel(dev, txq->sb_dev);
+   }
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
 #endif
+   netdev_unbind_all_sb_channels(dev);
+
+   /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2399,11 +2422,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS