Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath

2015-06-20 Thread Peter Nørlund
On Thu, 18 Jun 2015 12:42:05 -0700
Alexander Duyck  wrote:

> On 06/17/2015 01:08 PM, Peter Nørlund wrote:
> > The current multipath attempted to be quasi random, but in most
> > cases it behaved just like a round robin balancing. This patch
> > refactors the algorithm to be exactly that and in doing so, avoids
> > the spin lock.
> >
> > The new design paves the way for hash-based multipath, replacing the
> > modulo with thresholds, minimizing disruption in case of failing
> > paths or route replacements.
> >
> > Signed-off-by: Peter Nørlund 
> > ---
> >   include/net/ip_fib.h |   6 +--
> >   net/ipv4/Kconfig |   1 +
> >   net/ipv4/fib_semantics.c | 116
> > ++- 3 files changed, 68
> > insertions(+), 55 deletions(-)
> >
> > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> > index 54271ed..4be4f25 100644
> > --- a/include/net/ip_fib.h
> > +++ b/include/net/ip_fib.h
> > @@ -76,8 +76,8 @@ struct fib_nh {
> > unsigned intnh_flags;
> > unsigned char   nh_scope;
> >   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> > -   int nh_weight;
> > -   int nh_power;
> > +   int nh_mp_weight;
> > +   atomic_tnh_mp_upper_bound;
> >   #endif
> >   #ifdef CONFIG_IP_ROUTE_CLASSID
> > __u32   nh_tclassid;
> > @@ -115,7 +115,7 @@ struct fib_info {
> >   #define fib_advmss fib_metrics[RTAX_ADVMSS-1]
> > int fib_nhs;
> >   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> > -   int fib_power;
> > +   int fib_mp_weight;
> >   #endif
> > struct rcu_head rcu;
> > struct fib_nh   fib_nh[0];
> 
> I could do without some of this renaming.  For example you could 
> probably not bother with adding the _mp piece to the name.  That way
> we don't have to track all the nh_weight -> nh_mp_weight changes.
> Also you could probably just use the name fib_weight since not
> including the _mp was already the convention for the multipath
> portions of the structure anyway.
> 
> This isn't really improving readability at all so I would say don't 
> bother renaming it.
> 

Good point. I'll skip the renaming.

> > diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> > index d83071d..cb91f67 100644
> > --- a/net/ipv4/Kconfig
> > +++ b/net/ipv4/Kconfig
> > @@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES
> >   config IP_ROUTE_MULTIPATH
> > bool "IP: equal cost multipath"
> > depends on IP_ADVANCED_ROUTER
> > +   select BITREVERSE
> > help
> >   Normally, the routing tables specify a single action to
> > be taken in a deterministic manner for a given packet. If you say Y
> > here diff --git a/net/ipv4/fib_semantics.c
> > b/net/ipv4/fib_semantics.c index 28ec3c1..8c8df80 100644
> > --- a/net/ipv4/fib_semantics.c
> > +++ b/net/ipv4/fib_semantics.c
> > @@ -15,6 +15,7 @@
> >
> >   #include 
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -57,7 +58,7 @@ static struct hlist_head
> > fib_info_devhash[DEVINDEX_HASHSIZE];
> >
> >   #ifdef CONFIG_IP_ROUTE_MULTIPATH
> >
> > -static DEFINE_SPINLOCK(fib_multipath_lock);
> > +static DEFINE_PER_CPU(u8, fib_mp_rr_counter);
> >
> >   #define for_nexthops(fi)
> > {   \ int nhsel; const
> > struct fib_nh *nh;  \ @@ -261,7
> > +262,7 @@ static inline int nh_comp(const struct fib_info *fi,
> > const struct fib_info *ofi) nh->nh_gw  != onh->nh_gw ||
> > nh->nh_scope != onh->nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH
> > -   nh->nh_weight != onh->nh_weight ||
> > +   nh->nh_mp_weight != onh->nh_mp_weight ||
> >   #endif
> >   #ifdef CONFIG_IP_ROUTE_CLASSID
> > nh->nh_tclassid != onh->nh_tclassid ||
> > @@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop
> > *rtnh, int remaining) return remaining > 0 ? 0 : nhs;
> >   }
> >
> 
> This is a good example.  If we don't do the rename we don't have to 
> review changes like the one above which just add extra overhead to
> the patch.
> 

Right.

> > +static void fib_rebalance(struct fib_info *fi)
> > +{
> > +   int factor;
> > +   int total;
> > +   int w;
> > +
> > +   if (fi->fib_nhs < 2)
> > +   return;
> > +
> > +   total = 0;
> > +   for_nexthops(fi) {
> > +   if (!(nh->nh_flags & RTNH_F_DEAD))
> > +   total += nh->nh_mp_weight;
> > +   } endfor_nexthops(fi);
> > +
> > +   if (likely(total != 0)) {
> > +   factor = DIV_ROUND_UP(total, 8388608);
> > +   total /= factor;
> > +   } else {
> > +   factor = 1;
> > +   }
> > +
> 
> So where does the 8388608 value come from?  Is it just here to help 
> restrict the upper_bound to a u8 value?
> 

Yes. Although I think it is rare for the weight to be that large, the
API supports it, so I'd better make sure nothing weird happens. Or is
it too hypothetical? Today, if one were

Re: [PATCH net-next 1/3] ipv4: Lock-less per-packet multipath

2015-06-18 Thread Alexander Duyck

On 06/17/2015 01:08 PM, Peter Nørlund wrote:

The current multipath attempted to be quasi random, but in most cases it
behaved just like a round robin balancing. This patch refactors the
algorithm to be exactly that and in doing so, avoids the spin lock.

The new design paves the way for hash-based multipath, replacing the
modulo with thresholds, minimizing disruption in case of failing paths or
route replacements.

Signed-off-by: Peter Nørlund 
---
  include/net/ip_fib.h |   6 +--
  net/ipv4/Kconfig |   1 +
  net/ipv4/fib_semantics.c | 116 ++-
  3 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed..4be4f25 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -76,8 +76,8 @@ struct fib_nh {
unsigned intnh_flags;
unsigned char   nh_scope;
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   int nh_weight;
-   int nh_power;
+   int nh_mp_weight;
+   atomic_tnh_mp_upper_bound;
  #endif
  #ifdef CONFIG_IP_ROUTE_CLASSID
__u32   nh_tclassid;
@@ -115,7 +115,7 @@ struct fib_info {
  #define fib_advmss fib_metrics[RTAX_ADVMSS-1]
int fib_nhs;
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   int fib_power;
+   int fib_mp_weight;
  #endif
struct rcu_head rcu;
struct fib_nh   fib_nh[0];


I could do without some of this renaming.  For example you could 
probably not bother with adding the _mp piece to the name.  That way we 
don't have to track all the nh_weight -> nh_mp_weight changes.   Also 
you could probably just use the name fib_weight since not including the 
_mp was already the convention for the multipath portions of the 
structure anyway.


This isn't really improving readability at all so I would say don't 
bother renaming it.



diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d83071d..cb91f67 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES
  config IP_ROUTE_MULTIPATH
bool "IP: equal cost multipath"
depends on IP_ADVANCED_ROUTER
+   select BITREVERSE
help
  Normally, the routing tables specify a single action to be taken in
  a deterministic manner for a given packet. If you say Y here
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1..8c8df80 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -15,6 +15,7 @@

  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -57,7 +58,7 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

  #ifdef CONFIG_IP_ROUTE_MULTIPATH

-static DEFINE_SPINLOCK(fib_multipath_lock);
+static DEFINE_PER_CPU(u8, fib_mp_rr_counter);

  #define for_nexthops(fi) {\
int nhsel; const struct fib_nh *nh; \
@@ -261,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const 
struct fib_info *ofi)
nh->nh_gw  != onh->nh_gw ||
nh->nh_scope != onh->nh_scope ||
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   nh->nh_weight != onh->nh_weight ||
+   nh->nh_mp_weight != onh->nh_mp_weight ||
  #endif
  #ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
@@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int 
remaining)
return remaining > 0 ? 0 : nhs;
  }



This is a good example.  If we don't do the rename we don't have to 
review changes like the one above which just add extra overhead to the 
patch.



+static void fib_rebalance(struct fib_info *fi)
+{
+   int factor;
+   int total;
+   int w;
+
+   if (fi->fib_nhs < 2)
+   return;
+
+   total = 0;
+   for_nexthops(fi) {
+   if (!(nh->nh_flags & RTNH_F_DEAD))
+   total += nh->nh_mp_weight;
+   } endfor_nexthops(fi);
+
+   if (likely(total != 0)) {
+   factor = DIV_ROUND_UP(total, 8388608);
+   total /= factor;
+   } else {
+   factor = 1;
+   }
+


So where does the 8388608 value come from?  Is it just here to help 
restrict the upper_bound to a u8 value?



+   w = 0;
+   change_nexthops(fi) {
+   int upper_bound;
+
+   if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+   upper_bound = -1;
+   } else {
+   w += nexthop_nh->nh_mp_weight / factor;
+   upper_bound = DIV_ROUND_CLOSEST(256 * w, total);
+   }


This is doing some confusing stuff.  I assume the whole point is to get 
the value to convert the upper_bound into a u8 value based on the weight 
where you end

[PATCH net-next 1/3] ipv4: Lock-less per-packet multipath

2015-06-17 Thread Peter Nørlund
The current multipath attempted to be quasi random, but in most cases it
behaved just like a round robin balancing. This patch refactors the
algorithm to be exactly that and in doing so, avoids the spin lock.

The new design paves the way for hash-based multipath, replacing the
modulo with thresholds, minimizing disruption in case of failing paths or
route replacements.

Signed-off-by: Peter Nørlund 
---
 include/net/ip_fib.h |   6 +--
 net/ipv4/Kconfig |   1 +
 net/ipv4/fib_semantics.c | 116 ++-
 3 files changed, 68 insertions(+), 55 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed..4be4f25 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -76,8 +76,8 @@ struct fib_nh {
unsigned intnh_flags;
unsigned char   nh_scope;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   int nh_weight;
-   int nh_power;
+   int nh_mp_weight;
+   atomic_tnh_mp_upper_bound;
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
__u32   nh_tclassid;
@@ -115,7 +115,7 @@ struct fib_info {
 #define fib_advmss fib_metrics[RTAX_ADVMSS-1]
int fib_nhs;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   int fib_power;
+   int fib_mp_weight;
 #endif
struct rcu_head rcu;
struct fib_nh   fib_nh[0];
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d83071d..cb91f67 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -81,6 +81,7 @@ config IP_MULTIPLE_TABLES
 config IP_ROUTE_MULTIPATH
bool "IP: equal cost multipath"
depends on IP_ADVANCED_ROUTER
+   select BITREVERSE
help
  Normally, the routing tables specify a single action to be taken in
  a deterministic manner for a given packet. If you say Y here
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1..8c8df80 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -15,6 +15,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,7 +58,7 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
-static DEFINE_SPINLOCK(fib_multipath_lock);
+static DEFINE_PER_CPU(u8, fib_mp_rr_counter);
 
 #define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -261,7 +262,7 @@ static inline int nh_comp(const struct fib_info *fi, const 
struct fib_info *ofi)
nh->nh_gw  != onh->nh_gw ||
nh->nh_scope != onh->nh_scope ||
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   nh->nh_weight != onh->nh_weight ||
+   nh->nh_mp_weight != onh->nh_mp_weight ||
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
@@ -449,6 +450,43 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int 
remaining)
return remaining > 0 ? 0 : nhs;
 }
 
+static void fib_rebalance(struct fib_info *fi)
+{
+   int factor;
+   int total;
+   int w;
+
+   if (fi->fib_nhs < 2)
+   return;
+
+   total = 0;
+   for_nexthops(fi) {
+   if (!(nh->nh_flags & RTNH_F_DEAD))
+   total += nh->nh_mp_weight;
+   } endfor_nexthops(fi);
+
+   if (likely(total != 0)) {
+   factor = DIV_ROUND_UP(total, 8388608);
+   total /= factor;
+   } else {
+   factor = 1;
+   }
+
+   w = 0;
+   change_nexthops(fi) {
+   int upper_bound;
+
+   if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+   upper_bound = -1;
+   } else {
+   w += nexthop_nh->nh_mp_weight / factor;
+   upper_bound = DIV_ROUND_CLOSEST(256 * w, total);
+   }
+
+   atomic_set(&nexthop_nh->nh_mp_upper_bound, upper_bound);
+   } endfor_nexthops(fi);
+}
+
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
   int remaining, struct fib_config *cfg)
 {
@@ -461,7 +499,7 @@ static int fib_get_nhs(struct fib_info *fi, struct 
rtnexthop *rtnh,
nexthop_nh->nh_flags =
(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
-   nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+   nexthop_nh->nh_mp_weight = rtnh->rtnh_hops + 1;
 
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
@@ -884,7 +922,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-   nh->nh_weight = 1;
+   nh->nh_mp_w