This is a first swat and not in final form.  I hope folks here will help vet
my thinking on it.

This fills in a missed niche in policy routing support.  It allows multipath
routes to select nexthop based on the source realm, inside the routing
decision step, immediately after RPF is performed.  It moves RPF before
multipath selection.

This would be for people wanting to do policy routing based on a table
injected by a dynamic routing protocol, e.g. quagga, rather than static rules.

The existing methods for achieving this effect are all a bit tacky for
various reasons:

1) "iptables -m realm --realm X -j ROUTE" in FORWARD,mangle
because ipt_ROUTE is not a well supported iptables target
and has started to get dropped from mainstream distros.  Maybe
for lack of maintenance, but perhaps it is intentionally
deprecated. (?)

2) "tc route from" ... "action mirred egress redirect" happens
too late in the packet processing to do much else to the
packet, like say edit the MAC addresses which remain what they
were on the original output dev.  Doing this is really an
abuse of the queueing system and involves setting up qdiscs in
weird ways when one may only want to route.

3) Userspace scripts to glue loading from a kernel routing table
to a pre-routing ipset, iptables -j MARK, then "ip rule add fwmark"
because the kernel then has to check the source address against two
tables rather than one, and they could get quite large.  Plus it's
hackery.

This patch is a raw proof-of-concept I put together to get
things working just enough to ensure that nothing blows up when
packets are routed this way.  As such it does a couple of distasteful
things and has a couple rough edges:

  Reuses the nh_weight field as the realm
  Does not allow normal load balancing to fully mix in
  ipv4 only
  forward only, no code for local/output route.
  probably will break ifndef CONFIG_NET_CLS_ROUTE

Were this general idea to be deemed worthy, and as long as limiting
sizeof(struct fib_nh) is not a major concern to any linux routing
application, I could work up a more thorough/cleaner patch allowing
statistical multipath and SAD policy-routing multipath to play nicely
together.

I especially need comments on proper multipath RPF: The mainline code
only checks the selected path and if RPF fails it does not choose a
different one.  From this I assumed it is OK to do RPF on any old nexthop,
and we just assume the user won't or can't put any PR rule in that would gum
that up.  Otherwise both the mainline code and this code would have to
RPF multiple times, defeating the goal of good performance.  (Not to
mention that could get extra confusing when you are using the source
realm to choose.)  Please pay special attention to the spec_dst handling,
which should be (?) OK since this is forward-only.

Also to consider is what this means to multipath caching should that
make a comeback.

I've only tested this code lightly so far, just bouncing things around
to static ARP maps on the same interface.

After patching iproute2, just substitute "weight X" with "byrealm X" to
activate it.  Probably you want to avoid realm 0.  You should be able to
put catch-all nexthops in with "weight X" alongside the "byrealm" ones
but they do not interact statistically.  Comments on that syntax
also welcome.

Sorry about the attachments, no real MUAs available here that won't
corrupt tabs.
diff -r -U2 linux-source-2.6.23-dsc/include/linux/rtnetlink.h 
linux-source-2.6.23-dsc-dsad/include/linux/rtnetlink.h
--- linux-source-2.6.23-dsc/include/linux/rtnetlink.h   2007-10-09 
16:31:38.000000000 -0400
+++ linux-source-2.6.23-dsc-dsad/include/linux/rtnetlink.h      2007-12-06 
20:23:25.000000000 -0500
@@ -294,4 +294,5 @@
 #define RTNH_F_PERVASIVE       2       /* Do recursive gateway lookup  */
 #define RTNH_F_ONLINK          4       /* Gateway is forced on link    */
+#define RTNH_F_DSAD            8       /* Dynamic PBR (weight = source realm)  
*/
 
 /* Macros to handle hexthops */
diff -r -U2 linux-source-2.6.23-dsc/include/net/ip_fib.h 
linux-source-2.6.23-dsc-dsad/include/net/ip_fib.h
--- linux-source-2.6.23-dsc/include/net/ip_fib.h        2007-10-09 
16:31:38.000000000 -0400
+++ linux-source-2.6.23-dsc-dsad/include/net/ip_fib.h   2007-12-06 
20:23:25.000000000 -0500
@@ -202,5 +202,6 @@
 extern int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
                               struct net_device *dev, __be32 *spec_dst, u32 
*itag);
-extern void fib_select_multipath(const struct flowi *flp, struct fib_result 
*res);
+extern void fib_select_multipath(const struct flowi *flp, 
+                              struct fib_result *res, u32 itag);
 
 struct rtentry;
diff -r -U2 linux-source-2.6.23-dsc/net/ipv4/fib_semantics.c 
linux-source-2.6.23-dsc-dsad/net/ipv4/fib_semantics.c
--- linux-source-2.6.23-dsc/net/ipv4/fib_semantics.c    2007-10-09 
16:31:38.000000000 -0400
+++ linux-source-2.6.23-dsc-dsad/net/ipv4/fib_semantics.c       2007-12-07 
14:36:10.000000000 -0500
@@ -1164,5 +1164,6 @@
  */
 
-void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+void fib_select_multipath(const struct flowi *flp, struct fib_result *res,
+                         u32 itag)
 {
        struct fib_info *fi = res->fi;
@@ -1170,8 +1171,20 @@
 
        spin_lock_bh(&fib_multipath_lock);
+       change_nexthops(fi) {
+         if (net_ratelimit())
+           printk(KERN_CRIT "DSAD Considering %x %x\n",nh->nh_weight,itag);
+               if ((nh->nh_flags & RTNH_F_DSAD) && 
+                   (((u32)nh->nh_weight - 1) == (itag >> 16))) {
+                       res->nh_sel = nhsel;
+                       spin_unlock_bh(&fib_multipath_lock);
+                       if (net_ratelimit())
+                         printk(KERN_CRIT "Chose a DSAD multiroute\n");
+                       return;
+               }
+       } endfor_nexthops(fi);
        if (fi->fib_power <= 0) {
                int power = 0;
                change_nexthops(fi) {
-                       if (!(nh->nh_flags&RTNH_F_DEAD)) {
+                       if (!(nh->nh_flags&(RTNH_F_DEAD|RTNH_F_DSAD))) {
                                power += nh->nh_weight;
                                nh->nh_power = nh->nh_weight;
@@ -1195,5 +1208,5 @@
 
        change_nexthops(fi) {
-               if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+               if (!(nh->nh_flags&(RTNH_F_DEAD|RTNH_F_DSAD)) && nh->nh_power) {
                        if ((w -= nh->nh_power) <= 0) {
                                nh->nh_power--;
diff -r -U2 linux-source-2.6.23-dsc/net/ipv4/route.c 
linux-source-2.6.23-dsc-dsad/net/ipv4/route.c
--- linux-source-2.6.23-dsc/net/ipv4/route.c    2007-10-09 16:31:38.000000000 
-0400
+++ linux-source-2.6.23-dsc-dsad/net/ipv4/route.c       2007-12-06 
20:24:45.000000000 -0500
@@ -1622,19 +1622,20 @@
 }
 
-static inline int __mkroute_input(struct sk_buff *skb,
-                                 struct fib_result* res,
-                                 struct in_device *in_dev,
-                                 __be32 daddr, __be32 saddr, u32 tos,
-                                 struct rtable **result)
+static inline int ip_mkroute_input(struct sk_buff *skb,
+                                  struct fib_result* res,
+                                  const struct flowi *fl,
+                                  struct in_device *in_dev,
+                                  __be32 daddr, __be32 saddr, u32 tos)
 {
-
-       struct rtable *rth;
-       int err;
+       struct rtable* rth = NULL;
        struct in_device *out_dev;
        unsigned flags = 0;
        __be32 spec_dst;
        u32 itag;
+       int err;
+       unsigned hash;
 
-       /* get a working reference to the output device */
+       /* get a working reference to a potential output device */
+       res->nh_sel = 0; /* needed? */
        out_dev = in_dev_get(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
@@ -1645,7 +1646,8 @@
        }
 
-
+       /* Do not check each.  spec_dest entirely source dep (I hope.) */
        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, &spec_dst, &itag);
+
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
@@ -1659,4 +1661,19 @@
                flags |= RTCF_DIRECTSRC;
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+       if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
+               fib_select_multipath(fl, res, itag);
+#endif
+
+       /* Get reference to actual output device */
+       in_dev_put(out_dev);
+       out_dev = in_dev_get(FIB_RES_DEV(*res));
+       if (out_dev == NULL) {
+               if (net_ratelimit())
+                       printk(KERN_CRIT "Bug in ip_route_input" \
+                              "_slow(). Please, report\n");
+               return -EINVAL;
+       }
+
        if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
@@ -1708,36 +1725,13 @@
 
        rth->rt_flags = flags;
-
-       *result = rth;
        err = 0;
- cleanup:
-       /* release the working reference to the output device */
-       in_dev_put(out_dev);
-       return err;
-}
-
-static inline int ip_mkroute_input(struct sk_buff *skb,
-                                  struct fib_result* res,
-                                  const struct flowi *fl,
-                                  struct in_device *in_dev,
-                                  __be32 daddr, __be32 saddr, u32 tos)
-{
-       struct rtable* rth = NULL;
-       int err;
-       unsigned hash;
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
-               fib_select_multipath(fl, res);
-#endif
-
-       /* create a routing cache entry */
-       err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
-       if (err)
-               return err;
 
        /* put it into the cache */
        hash = rt_hash(daddr, saddr, fl->iif);
        return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ cleanup:
+       /* release the working reference to the output device */
+       in_dev_put(out_dev);
+       return err;
 }
 
@@ -2305,5 +2299,5 @@
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl.oif == 0)
-               fib_select_multipath(&fl, &res);
+               fib_select_multipath(&fl, &res, 0);
        else
 #endif
diff -r -U2 iproute-20070313-dsc/include/linux/rtnetlink.h 
iproute-20070313-dsc-dsad/include/linux/rtnetlink.h
--- iproute-20070313-dsc/include/linux/rtnetlink.h      2007-03-13 
17:50:56.000000000 -0400
+++ iproute-20070313-dsc-dsad/include/linux/rtnetlink.h 2007-12-06 
19:56:50.000000000 -0500
@@ -295,4 +295,5 @@
 #define RTNH_F_PERVASIVE       2       /* Do recursive gateway lookup  */
 #define RTNH_F_ONLINK          4       /* Gateway is forced on link    */
+#define RTNH_F_DSAD            8       /* Dynamic PBR (choose nh by src) */
 
 /* Macros to handle hexthops */
Binary files iproute-20070313-dsc/ip/ip and iproute-20070313-dsc-dsad/ip/ip 
differ
diff -r -U2 iproute-20070313-dsc/ip/iproute.c 
iproute-20070313-dsc-dsad/ip/iproute.c
--- iproute-20070313-dsc/ip/iproute.c   2007-12-07 16:39:48.000000000 -0500
+++ iproute-20070313-dsc-dsad/ip/iproute.c      2007-12-06 19:57:01.000000000 
-0500
@@ -70,5 +70,5 @@
        fprintf(stderr, "             [ mpath MP_ALGO ]\n");
        fprintf(stderr, "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n");
-       fprintf(stderr, "NH := [ via ADDRESS ] [ dev STRING ] [ weight NUMBER ] 
NHFLAGS\n");
+       fprintf(stderr, "NH := [ via ADDRESS ] [ dev STRING ] [ weight NUMBER | 
byrealm REALM ] NHFLAGS\n");
        fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ]\n");
        fprintf(stderr, "           [ rtt NUMBER ] [ rttvar NUMBER ]\n");
@@ -583,5 +583,9 @@
                        } else {
                                fprintf(fp, " dev %s", 
ll_index_to_name(nh->rtnh_ifindex));
-                               fprintf(fp, " weight %d", nh->rtnh_hops+1);
+                               if (nh->rtnh_flags & RTNH_F_DSAD) {
+                                       fprintf(fp, " byrealm %x", 
nh->rtnh_hops);
+                               } else {
+                                       fprintf(fp, " weight %d", 
nh->rtnh_hops+1);
+                               }
                        }
                        if (nh->rtnh_flags & RTNH_F_DEAD)
@@ -623,4 +627,11 @@
                                invarg("\"weight\" is invalid\n", *argv);
                        rtnh->rtnh_hops = w - 1;
+               } else if (strcmp(*argv, "byrealm") == 0) {
+                       unsigned r;
+                       NEXT_ARG();
+                       if (get_unsigned(&r, *argv, 0) || r == 0 || r > 0x7fff)
+                               invarg("\"byrealm\" is invalid\n", *argv);
+                       rtnh->rtnh_hops = r;
+                       rtnh->rtnh_flags |= RTNH_F_DSAD;
                } else if (strcmp(*argv, "onlink") == 0) {
                        rtnh->rtnh_flags |= RTNH_F_ONLINK;

Reply via email to