[PATCH] via-velocity: unconditionally drop frames with bad l2 length

2015-11-16 Thread Timo Teräs
By default the driver allowed incorrect frames to be received. What is
worse the code does not handle very short frames correctly. The FCS
length is unconditionally subtracted, and the underflow can cause
skb_put to be called with large number after implicit cast to unsigned.
And indeed, an skb_over_panic() was observed with via-velocity.

This removes the module parameter as it does not work in it's
current state, and should be implemented via NETIF_F_RXALL if needed.

Suggested-by: Francois Romieu <rom...@fr.zoreil.com>
Signed-off-by: Timo Teräs <timo.te...@iki.fi>
---
Francois, is this something like you had in mind? I can try give this
a test spin in the known bad location, if this looks otherwise ok.

 drivers/net/ethernet/via/via-velocity.c | 24 +++-
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/via/via-velocity.c 
b/drivers/net/ethernet/via/via-velocity.c
index a43e849..03ce386 100644
--- a/drivers/net/ethernet/via/via-velocity.c
+++ b/drivers/net/ethernet/via/via-velocity.c
@@ -345,13 +345,6 @@ VELOCITY_PARAM(flow_control, "Enable flow control 
ability");
 */
 VELOCITY_PARAM(speed_duplex, "Setting the speed and duplex mode");
 
-#define VAL_PKT_LEN_DEF 0
-/* ValPktLen[] is used for setting the checksum offload ability of NIC.
-   0: Receive frame with invalid layer 2 length (Default)
-   1: Drop frame with invalid layer 2 length
-*/
-VELOCITY_PARAM(ValPktLen, "Receiving or Drop invalid 802.3 frame");
-
 #define WOL_OPT_DEF 0
 #define WOL_OPT_MIN 0
 #define WOL_OPT_MAX 7
@@ -494,7 +487,6 @@ static void velocity_get_options(struct velocity_opt *opts, 
int index,
 
velocity_set_int_opt(>flow_cntl, flow_control[index], 
FLOW_CNTL_MIN, FLOW_CNTL_MAX, FLOW_CNTL_DEF, "flow_control", devname);
velocity_set_bool_opt(>flags, IP_byte_align[index], IP_ALIG_DEF, 
VELOCITY_FLAGS_IP_ALIGN, "IP_byte_align", devname);
-   velocity_set_bool_opt(>flags, ValPktLen[index], VAL_PKT_LEN_DEF, 
VELOCITY_FLAGS_VAL_PKT_LEN, "ValPktLen", devname);
velocity_set_int_opt((int *) >spd_dpx, speed_duplex[index], 
MED_LNK_MIN, MED_LNK_MAX, MED_LNK_DEF, "Media link mode", devname);
velocity_set_int_opt(>wol_opts, wol_opts[index], WOL_OPT_MIN, 
WOL_OPT_MAX, WOL_OPT_DEF, "Wake On Lan options", devname);
opts->numrx = (opts->numrx & ~3);
@@ -2055,8 +2047,9 @@ static int velocity_receive_frame(struct velocity_info 
*vptr, int idx)
int pkt_len = le16_to_cpu(rd->rdesc0.len) & 0x3fff;
struct sk_buff *skb;
 
-   if (rd->rdesc0.RSR & (RSR_STP | RSR_EDP)) {
-   VELOCITY_PRT(MSG_LEVEL_VERBOSE, KERN_ERR " %s : the received 
frame spans multiple RDs.\n", vptr->netdev->name);
+   if (unlikely(rd->rdesc0.RSR & (RSR_STP | RSR_EDP | RSR_RL))) {
+   if (rd->rdesc0.RSR & (RSR_STP | RSR_EDP))
+   VELOCITY_PRT(MSG_LEVEL_VERBOSE, KERN_ERR " %s : the 
received frame spans multiple RDs.\n", vptr->netdev->name);
stats->rx_length_errors++;
return -EINVAL;
}
@@ -2069,17 +2062,6 @@ static int velocity_receive_frame(struct velocity_info 
*vptr, int idx)
dma_sync_single_for_cpu(vptr->dev, rd_info->skb_dma,
vptr->rx.buf_sz, DMA_FROM_DEVICE);
 
-   /*
-*  Drop frame not meeting IEEE 802.3
-*/
-
-   if (vptr->flags & VELOCITY_FLAGS_VAL_PKT_LEN) {
-   if (rd->rdesc0.RSR & RSR_RL) {
-   stats->rx_length_errors++;
-   return -EINVAL;
-   }
-   }
-
velocity_rx_csum(rd, skb);
 
if (velocity_rx_copy(, pkt_len, vptr) < 0) {
-- 
2.6.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] ip_tunnel: fix ipv4 pmtu check to honor inner ip header df

2015-07-06 Thread Timo Teräs
Frag needed should be sent only if the inner header asked
to not fragment. Currently fragmentation is broken if the
tunnel has df set. The tunnel's df needs to be still checked
to update internally the pmtu cache.

This got broken in commit 23a3647bc4f93bac and this fixes
the pmtu check back to the way it was.

Fixes: 23a3647bc4f93bac (ip_tunnels: Use skb-len to PMTU check.)
Cc: Pravin B Shelar pshe...@nicira.com
---
Should go to -stable queues (3.12.y and never).

 net/ipv4/ip_tunnel.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0bb8e14..6822572 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -587,7 +587,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel 
*t,
 EXPORT_SYMBOL(ip_tunnel_encap);
 
 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
-   struct rtable *rt, __be16 df)
+   struct rtable *rt, __be16 df,
+   const struct iphdr *inner_iph)
 {
struct ip_tunnel *tunnel = netdev_priv(dev);
int pkt_size = skb-len - tunnel-hlen - dev-hard_header_len;
@@ -604,7 +605,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct 
sk_buff *skb,
 
if (skb-protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) 
-   (df  htons(IP_DF))  mtu  pkt_size) {
+   (inner_iph-frag_off  htons(IP_DF)) 
+   mtu  pkt_size) {
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 
htonl(mtu));
return -E2BIG;
@@ -738,7 +740,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device 
*dev,
goto tx_error;
}
 
-   if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off)) {
+   if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off, inner_iph)) {
ip_rt_put(rt);
goto tx_error;
}
-- 
2.4.5

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net,v2] ip_tunnel: fix ipv4 pmtu check to honor inner ip header df

2015-07-06 Thread Timo Teräs
Frag needed should be sent only if the inner header asked
to not fragment. Currently fragmentation is broken if the
tunnel has df set, but df was not asked in the original
packet. The tunnel's df needs to be still checked to update
internally the pmtu cache.

Commit 23a3647bc4f93bac broke it, and this commit fixes
the ipv4 df check back to the way it was.

Fixes: 23a3647bc4f93bac (ip_tunnels: Use skb-len to PMTU check.)
Cc: Pravin B Shelar pshe...@nicira.com
Signed-off-by: Timo Teräs timo.te...@iki.fi
---
Should go to -stable queues (3.12.y and newer).

v2: revised commit message wording a bit, and added
signed-off-by line that was forgotten accidentally.

 net/ipv4/ip_tunnel.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 4c2c3ba..626d9e5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -586,7 +586,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel 
*t,
 EXPORT_SYMBOL(ip_tunnel_encap);
 
 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
-   struct rtable *rt, __be16 df)
+   struct rtable *rt, __be16 df,
+   const struct iphdr *inner_iph)
 {
struct ip_tunnel *tunnel = netdev_priv(dev);
int pkt_size = skb-len - tunnel-hlen - dev-hard_header_len;
@@ -603,7 +604,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct 
sk_buff *skb,
 
if (skb-protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) 
-   (df  htons(IP_DF))  mtu  pkt_size) {
+   (inner_iph-frag_off  htons(IP_DF)) 
+   mtu  pkt_size) {
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 
htonl(mtu));
return -E2BIG;
@@ -737,7 +739,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device 
*dev,
goto tx_error;
}
 
-   if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off)) {
+   if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off, inner_iph)) {
ip_rt_put(rt);
goto tx_error;
}
-- 
2.4.5

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2] [XFRM]: Speed up xfrm_policy and xfrm_state walking

2008-02-15 Thread Timo Teräs
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n).
This is achieved adding the entries to one more list which is used
solely for walking the entries.

This also fixes some races where the dump can have duplicate or missing
entries when the SPD/SADB is modified during an ongoing dump.

Dumping SADB with 2 entries using time ip xfrm state the sys
time dropped from 1.012s to 0.080s.

Signed-off-by: Timo Teras [EMAIL PROTECTED]
---
David Miller wrote:
 This will potentially leak walk-policy in the != -EEXIST case.
 
 I think what needs to happen is we invoke xfrm_policy_walk_done()
 unconditionally, then we'll potentially return reqid.

Good catch. Fixed as suggested.

Also the walk context structure layout was changed to align
better on 64-bit machines as suggested by Margit Schubert-While.

 include/linux/xfrm.h   |3 +-
 include/net/xfrm.h |   52 ++-
 net/key/af_key.c   |   24 --
 net/xfrm/xfrm_policy.c |   79 
 net/xfrm/xfrm_state.c  |   53 ++--
 net/xfrm/xfrm_user.c   |   71 ++-
 6 files changed, 197 insertions(+), 85 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index e31b8c8..0c82c80 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -113,7 +113,8 @@ enum
 {
XFRM_POLICY_TYPE_MAIN   = 0,
XFRM_POLICY_TYPE_SUB= 1,
-   XFRM_POLICY_TYPE_MAX= 2
+   XFRM_POLICY_TYPE_MAX= 2,
+   XFRM_POLICY_TYPE_ANY= 255
 };
 
 enum
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index eea7785..9b62056 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex;
 struct xfrm_state
 {
/* Note: bydst is re-used during gc */
+   struct list_headall;
struct hlist_node   bydst;
struct hlist_node   bysrc;
struct hlist_node   byspi;
@@ -424,6 +425,7 @@ struct xfrm_tmpl
 struct xfrm_policy
 {
struct xfrm_policy  *next;
+   struct list_headbytype;
struct hlist_node   bydst;
struct hlist_node   byidx;
 
@@ -1160,6 +1162,18 @@ struct xfrm6_tunnel {
int priority;
 };
 
+struct xfrm_state_walk {
+   struct xfrm_state *state;
+   int count;
+   u8 proto;
+};
+
+struct xfrm_policy_walk {
+   struct xfrm_policy *policy;
+   int count;
+   u8 type, cur_type;
+};
+
 extern void xfrm_init(void);
 extern void xfrm4_init(void);
 extern void xfrm_state_init(void);
@@ -1184,7 +1198,23 @@ static inline void xfrm6_fini(void)
 extern int xfrm_proc_init(void);
 #endif
 
-extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, 
void*), void *);
+static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+{
+   walk-proto = proto;
+   walk-state = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk)
+{
+   if (walk-state != NULL) {
+   xfrm_state_put(walk-state);
+   walk-state = NULL;
+   }
+}
+
+extern int xfrm_state_walk(struct xfrm_state_walk *walk,
+  int (*func)(struct xfrm_state *, int, void*), void 
*);
 extern struct xfrm_state *xfrm_state_alloc(void);
 extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, 
xfrm_address_t *saddr, 
  struct flowi *fl, struct xfrm_tmpl 
*tmpl,
@@ -1306,7 +1336,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, 
struct sk_buff *skb)
 #endif
 
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
-extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, 
int, void*), void *);
+
+static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 
type)
+{
+   walk-cur_type = XFRM_POLICY_TYPE_MAIN;
+   walk-type = type;
+   walk-policy = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+{
+   if (walk-policy != NULL) {
+   xfrm_pol_put(walk-policy);
+   walk-policy = NULL;
+   }
+}
+
+extern int xfrm_policy_walk(struct xfrm_policy_walk *walk,
+   int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
 struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
  struct xfrm_selector *sel,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1c85392..0d7c195 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff 
*skb, struct sadb_msg *hdr
 {
u8 proto;
struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
+   struct xfrm_state_walk walk;
+   int rc;
 
proto = 

[PATCH RESEND] [XFRM]: Speed up xfrm_policy and xfrm_state walking

2008-02-01 Thread Timo Teräs
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n).
This is achieved adding the entries to one more list which is used
solely for walking the entries.

This also fixes some races where the dump can have duplicate or missing
entries when the SPD/SADB is modified during an ongoing dump.

Dumping SADB with 2 entries using time ip xfrm state the sys
time dropped from 1.012s to 0.080s.

Signed-off-by: Timo Teras [EMAIL PROTECTED]
---

 include/linux/xfrm.h   |3 +-
 include/net/xfrm.h |   52 ++-
 net/key/af_key.c   |   22 +++--
 net/xfrm/xfrm_policy.c |   79 
 net/xfrm/xfrm_state.c  |   53 ++--
 net/xfrm/xfrm_user.c   |   71 ++-
 6 files changed, 195 insertions(+), 85 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index e31b8c8..0c82c80 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -113,7 +113,8 @@ enum
 {
XFRM_POLICY_TYPE_MAIN   = 0,
XFRM_POLICY_TYPE_SUB= 1,
-   XFRM_POLICY_TYPE_MAX= 2
+   XFRM_POLICY_TYPE_MAX= 2,
+   XFRM_POLICY_TYPE_ANY= 255
 };
 
 enum
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ac72116..7bd021b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex;
 struct xfrm_state
 {
/* Note: bydst is re-used during gc */
+   struct list_headall;
struct hlist_node   bydst;
struct hlist_node   bysrc;
struct hlist_node   byspi;
@@ -424,6 +425,7 @@ struct xfrm_tmpl
 struct xfrm_policy
 {
struct xfrm_policy  *next;
+   struct list_headbytype;
struct hlist_node   bydst;
struct hlist_node   byidx;
 
@@ -1157,6 +1159,18 @@ struct xfrm6_tunnel {
int priority;
 };
 
+struct xfrm_state_walk {
+   u8 proto;
+   struct xfrm_state *state;
+   int count;
+};
+
+struct xfrm_policy_walk {
+   u8 type, cur_type;
+   struct xfrm_policy *policy;
+   int count;
+};
+
 extern void xfrm_init(void);
 extern void xfrm4_init(void);
 extern void xfrm_state_init(void);
@@ -1181,7 +1195,23 @@ static inline void xfrm6_fini(void)
 extern int xfrm_proc_init(void);
 #endif
 
-extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, 
void*), void *);
+static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+{
+   walk-proto = proto;
+   walk-state = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk)
+{
+   if (walk-state != NULL) {
+   xfrm_state_put(walk-state);
+   walk-state = NULL;
+   }
+}
+
+extern int xfrm_state_walk(struct xfrm_state_walk *walk,
+  int (*func)(struct xfrm_state *, int, void*), void 
*);
 extern struct xfrm_state *xfrm_state_alloc(void);
 extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, 
xfrm_address_t *saddr, 
  struct flowi *fl, struct xfrm_tmpl 
*tmpl,
@@ -1303,7 +1333,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, 
struct sk_buff *skb)
 #endif
 
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
-extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, 
int, void*), void *);
+
+static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 
type)
+{
+   walk-cur_type = XFRM_POLICY_TYPE_MAIN;
+   walk-type = type;
+   walk-policy = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+{
+   if (walk-policy != NULL) {
+   xfrm_pol_put(walk-policy);
+   walk-policy = NULL;
+   }
+}
+
+extern int xfrm_policy_walk(struct xfrm_policy_walk *walk,
+   int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
 struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
  struct xfrm_selector *sel,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 16b72b5..d24bb9c 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff 
*skb, struct sadb_msg *hdr
 {
u8 proto;
struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
+   struct xfrm_state_walk walk;
+   int rc;
 
proto = pfkey_satype2proto(hdr-sadb_msg_satype);
if (proto == 0)
return -EINVAL;
 
-   return xfrm_state_walk(proto, dump_sa, data);
+   xfrm_state_walk_init(walk, proto);
+   rc = xfrm_state_walk(walk, dump_sa, data);
+   xfrm_state_walk_done(walk);
+
+   return rc;
 }
 
 static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg 

[PATCH net-2.6.25] [XFRM]: Speed up xfrm_policy and xfrm_state walking

2008-01-25 Thread Timo Teräs
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n).
This is achieved adding the entries to one more list which is used
solely for walking the entries.

This also fixes some races where the dump can have duplicate or missing
entries when the SPD/SADB is modified during an ongoing dump.

Dumping SADB with 2 entries using time ip xfrm state the sys
time dropped from 1.012s to 0.080s.

Signed-off-by: Timo Teras [EMAIL PROTECTED]
---
 include/linux/xfrm.h   |3 +-
 include/net/xfrm.h |   52 ++-
 net/key/af_key.c   |   22 +++--
 net/xfrm/xfrm_policy.c |   79 
 net/xfrm/xfrm_state.c  |   53 ++--
 net/xfrm/xfrm_user.c   |   71 ++-
 6 files changed, 195 insertions(+), 85 deletions(-)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 9b5b00c..60e7395 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -106,7 +106,8 @@ enum
 {
XFRM_POLICY_TYPE_MAIN   = 0,
XFRM_POLICY_TYPE_SUB= 1,
-   XFRM_POLICY_TYPE_MAX= 2
+   XFRM_POLICY_TYPE_MAX= 2,
+   XFRM_POLICY_TYPE_ANY= 255
 };
 
 enum
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 5ebb9ba..a6a6adf 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex;
 struct xfrm_state
 {
/* Note: bydst is re-used during gc */
+   struct list_headall;
struct hlist_node   bydst;
struct hlist_node   bysrc;
struct hlist_node   byspi;
@@ -423,6 +424,7 @@ struct xfrm_tmpl
 struct xfrm_policy
 {
struct xfrm_policy  *next;
+   struct list_headbytype;
struct hlist_node   bydst;
struct hlist_node   byidx;
 
@@ -1151,6 +1153,18 @@ struct xfrm6_tunnel {
int priority;
 };
 
+struct xfrm_state_walk {
+   u8 proto;
+   struct xfrm_state *state;
+   int count;
+};
+
+struct xfrm_policy_walk {
+   u8 type, cur_type;
+   struct xfrm_policy *policy;
+   int count;
+};
+
 extern void xfrm_init(void);
 extern void xfrm4_init(void);
 extern void xfrm_state_init(void);
@@ -1175,7 +1189,23 @@ static inline void xfrm6_fini(void)
 extern int xfrm_proc_init(void);
 #endif
 
-extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, 
void*), void *);
+static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+{
+   walk-proto = proto;
+   walk-state = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk)
+{
+   if (walk-state != NULL) {
+   xfrm_state_put(walk-state);
+   walk-state = NULL;
+   }
+}
+
+extern int xfrm_state_walk(struct xfrm_state_walk *walk,
+  int (*func)(struct xfrm_state *, int, void*), void 
*);
 extern struct xfrm_state *xfrm_state_alloc(void);
 extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, 
xfrm_address_t *saddr, 
  struct flowi *fl, struct xfrm_tmpl 
*tmpl,
@@ -1297,7 +1327,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, 
struct sk_buff *skb)
 #endif
 
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
-extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, 
int, void*), void *);
+
+static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 
type)
+{
+   walk-cur_type = XFRM_POLICY_TYPE_MAIN;
+   walk-type = type;
+   walk-policy = NULL;
+   walk-count = 0;
+}
+
+static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+{
+   if (walk-policy != NULL) {
+   xfrm_pol_put(walk-policy);
+   walk-policy = NULL;
+   }
+}
+
+extern int xfrm_policy_walk(struct xfrm_policy_walk *walk,
+   int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
 struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
  struct xfrm_selector *sel,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 16b72b5..d24bb9c 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff 
*skb, struct sadb_msg *hdr
 {
u8 proto;
struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
+   struct xfrm_state_walk walk;
+   int rc;
 
proto = pfkey_satype2proto(hdr-sadb_msg_satype);
if (proto == 0)
return -EINVAL;
 
-   return xfrm_state_walk(proto, dump_sa, data);
+   xfrm_state_walk_init(walk, proto);
+   rc = xfrm_state_walk(walk, dump_sa, data);
+   xfrm_state_walk_done(walk);
+
+   return rc;
 }
 
 static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg 

Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 Doing anything other than life support bug fixes for AF_KEY is
 inappropriate.

Yes. I thought my patch would qualify as life support bug fix.
Currently racoon fails to work if there are too many SPDs or SAs
because the kernel cannot handle the dump request properly. And
this is what my patch fixes for pfkey. It adds no new features or
functionality; just makes the dumping work with large databases.

Then there's also the xfrm dumping changes which change the
algorithm from O(n^2) to O(n) with some memory overhead, but
that is a different story. Any comments on that?

- Timo
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 10:11:17 +0200
 
 I thought my patch would qualify as life support bug fix.
 Currently racoon fails to work if there are too many SPDs or SAs
 because the kernel cannot handle the dump request properly. And this
 is what my patch fixes for pfkey. It adds no new features or
 functionality; just makes the dumping work with large databases.
 
 Racoon should use netlink for reasons far and beyond the
 problem you are trying to address.

Yes. But this is fairly major thing to do. One needs to create
API abstraction layer (still need to use pfkey in *BSD). Test it.
A lot of work that is not going to happen very soon.

Where as the pfkey bug fix is non-intrusive and helps all
legacy applications still using af_key by _fixing a bug in
kernel_.

 The dumping behavior of AF_KEY is just horrific, as one of
 several examples.

If af_key is all that bad and does not qualify to get maintanace
bug fixes, why not remove it complitely?

That would make userland adapt faster.

 Then there's also the xfrm dumping changes which change the
 algorithm from O(n^2) to O(n) with some memory overhead, but
 that is a different story. Any comments on that?
 
 I have no general objections to those changes although I am
 backlogged and thus have not studied them in detail.  Jamal
 is having what appears to be a healthy dialogue with you about
 the details so I'm not concerned much :)

Ok. I hope someone can also give feedback on the naming
conventions. And about the api changes to xfrm policy/state
walking.

- Timo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 11:20:42 +0200
 
 Where as the pfkey bug fix is non-intrusive and helps all
 legacy applications still using af_key by _fixing a bug in
 kernel_.
 
 It's not a bug.  You're fixing a speed issue, not a crash
 or a case where AF_KEY is providing incorrect data.
 
 That is what I mean when I mean life support, we fix crashes and
 data corruption.  We don't make performance tweaks.

No. The speed issue is complitely handled in xfrm_state
and xfrm_user changes.

The af_key issue is that in big dumps you get only first X
entries. The rest of the entries are dropped because the
socket receive buffer goes full. You get data corruption:
missing entries.

- Timo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 11:38:13 +0200
 
 The af_key issue is that in big dumps you get only first X
 entries. The rest of the entries are dropped because the
 socket receive buffer goes full. You get data corruption:
 missing entries.
 
 This is an inherent aspect of AF_KEY (and what it was
 derived from, BSD routing sockets).

Yes, this is the way BSD does it.
 
 It has to provide dumps atomically, and if there is no
 space there is no way to provide those entries which
 would require more rcvbuf space.

RFC does not say it has to be atomic.

It does say that the dump is terminated with SADB_DUMP
message having sadb_seq field set to zero. Currently
that is dropped too when the problem occurs. Thus the
socket is left in a bad state: dump ends never. This
can cause applications without any workarounds to hang.

- Timo
 
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 12:01:17 +0200
 
 David Miller wrote:
 This is an inherent aspect of AF_KEY (and what it was
 derived from, BSD routing sockets).
 Yes, this is the way BSD does it.
  
 It has to provide dumps atomically, and if there is no
 space there is no way to provide those entries which
 would require more rcvbuf space.
 RFC does not say it has to be atomic.
 
 Every application out there in the universe expects BSD socket
 semantics, and therefore atomic dumps.  You cannot fix things
 without breaking applications.

IMHO, it's a lot better then losing 50% of entries and the end
of sequence message on big dumps. SPD and SADB are not that
volatile; in most of the cases the dump would be as good as an
atomic one.

Even if it did change during ongoing dump you still get an usable
dump. All the entries reflect real data and there is no dependency
between different entries.

I'm not sure if there's other major applications that we should
be concerned about, but at least ipsec-tools racoon does not
expect to get atomic dumps (which btw, comes originally from BSD).

Cheers,
  Timo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
Herbert Xu wrote:
 On Thu, Jan 17, 2008 at 07:54:32AM +0200, Timo Teräs wrote:
 Racoon doesn't use pfkey dumping as far as I know.
 ipsec-tools racoon uses pfkey and only pfkey. And it's non trivial to
 make it use netlink; it relies heavily all around the code to pfkey
 structs. It also runs on BSD so we cannot rip pfkey away; adding a
 layer to work with both pfkey and netlink would be doable, but just a
 lot of work.
 
 Sure racoon uses pfkey but the question is does it use pfkey dumping?

Yes it does.

It does SPD dump at startup and keeps the SP database in sync by
listening to notifications.

It also does SA dumps when it is figuring out which SAs to purge.

I started to work on the xfrm/pfkey patch only because racoon is
having problems with (as is anything else using pfkey).

- Timo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 13:00:09 +0200
 
 IMHO, it's a lot better then losing 50% of entries and the end
 of sequence message on big dumps. SPD and SADB are not that
 volatile; in most of the cases the dump would be as good as an
 atomic one.
 
 I humbly disagree with you.  Interface behavior stability
 is more important.

Small SPDs/SADBs would still be dumped atomically. The patch
affects only the cases when the receive queue is getting full.

 I'm not sure if there's other major applications that we should
 be concerned about, but at least ipsec-tools racoon does not
 expect to get atomic dumps (which btw, comes originally from BSD).
 
 Racoon was written as an addon to the BSD stack by an IPV6/IPSEC
 project in Japan named KAME, it did not come from BSD.  It was
 added to BSD.
 
 There are also other BSD based IPSEC daemons such as the one written
 by the OpenBSD folks.

Yes. I meant that it was originally written to be used in BSD. The
Linux port came later. Sorry for the ambiguous wording.

 I don't think this is arguable at all.  We're not changing semantics
 over what we've done for 4+ years and applications might depend upon.
 It's for a deprecated interface, which makes any semantic changes that
 much less inviting.
 
 You can argue all you want, but it will not change the invariants in
 the previous paragraph.

True. If no one else agrees with me, I'll drop it. I can always run
my own patched kernel.

I'd appreciate feedback on the xfrm changes. I'll try to make that
part usable patch against net-2.6.25 git tree next week.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
Herbert Xu wrote:
 On Thu, Jan 17, 2008 at 07:42:30AM -0500, jamal wrote:
 Looking at the pfkey RFC one more time, heres a funny quote:
 
 The dump message is used for debugging
 purposes only and is not intended for production use.
 
 
 In fact it goes much further:
 
Support for the dump message MAY be discontinued in future versions
of PF_KEY.  Key management applications MUST NOT depend on this
message for basic operation.

I guess the idea was that application should know about the SAs it
created. Though a SA dump needs to be done if you want to check
for existing entries (created by other processes, or if you are
recovering from a crash).

SPD dumping is still a must if you want to work nicely with kernel.

As noted earlier pfkey is not really standardized. E.g. the SPD
dumping message are not in the RFC as David noted. The above RFC
comments and the fact that SPD stuff is unspecified made me think
that making non-atomic dumps would be a lot better alternative then
leaving the socket to bad state which would make the application
completely unusable.

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-17 Thread Timo Teräs
Herbert Xu wrote:
 On Thu, Jan 17, 2008 at 03:31:14PM +0200, Timo Teräs wrote:
 I guess the idea was that application should know about the SAs it
 created. Though a SA dump needs to be done if you want to check
 for existing entries (created by other processes, or if you are
 recovering from a crash).
 
 That's what SADB_GET is for.  In any case KMs cannot coexist so this
 is pointless.  After a crash you should simply flush all states and
 policies.

I thought KMs could coexist. Actually, in strongSwan you have two
daemons: charon and pluto. Both can run at the same time. The other
is for IKEv1 and the other one for IKEv2. It uses netlink, though.
Looking the way pfkey works, it looks like being designed to work
with multiple KMs (e.g. acquires are sent to all registered sockets).

 SPD dumping is still a must if you want to work nicely with kernel.
 
 No it isn't.  Look at how Openswan does it.  No dumping anywhere at all.

Then you have to have all policy/static association configuration in
the application configuration. ipsec-tools wanted separate that. As
this is more robust if someone decided to run multiple KMs.

My point (as an ipsec-tools developer) is that the patch would fix
a lot of problems until ipsec-tools is netlink enabled. It would
handle perfectly the dumping even as non-atomic. And that ipsec-tools
is the KM you get by default in almost all distributions.

But apparently you are too worried that all the so many other KMs
still using pfkey (is there others? I think most others use netlink
already) in Linux might break because the rely on an unspecified
feature of pfkey.

I'm also tired of arguing since this is going nowhere. I'll run my
patched kernel and try to get ipsec-tools fixed to use netlink...
eventually.

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-16 Thread Timo Teräs
jamal wrote:
 On Sun, 2008-13-01 at 14:26 +0200, Timo Teräs wrote:
 I tried to address all these problems, and added the xfrm_policy
 and xfrm_state structure to a new linked list that is used solely
 for dumping. This way the dumps can be done in O(n) and have an
 iterator point to them. I also modified to af_key to stop dumping
 when receive queue is filling up and continue it from a callback at
 recvfrom() when there's more room.

 But I'd like to hear about:
 - if the approach is ok (adding the entries in one more list)?
 - if the new/modified variables, structures and function names are ok?
 - if I should port these against net-2.6 or net-2.6.25 git tree?
 - if I should split this to more than one commit? (maybe xfrm core,
 xfrm user and af_key parts?)
 
 To answer your last 2 questions:
 There are two issues that are inter-mingled in there. The most
 important is pf_key not being robust on dump. The other being the
 accurracy of netlink dumping - this has much broader implications. I
 think you need to separate the patches into those two at least and
 prioritize on the pf_key as a patch that you push in. I would also
 try to make the pf_key dumping approach to mirror what netlink does
 today - in the worst case it would be as innacurate as netlink; which
 is not bad. I think youll find no resistance getting such a patch in.

Creating a separate af_key patch would not be a big problem. I was
just hoping avoiding it as the xfrm_state / xfrm_policy changes
modify the API and requires changing af_key also.

 To answer your first 2 questions:
 
 My main dilema with your approach is the policy of maintaining such a
  list in the kernel and memory consumption needed vs the innacuracy 
 of netlink dumps.With your approach of maintaining extra SA/P D, i
 would need double the RAM amount.

No. I'm not creating second copies of the SADB/SPD entries. The entries
are just added to one more list. Memory requirements go up on per entry
(figures based on 2.6.22.9 kernel):
sizeof(struct xfrm_policy) 636 - 644 bytes
sizeof(struct xfrm_state)  452 - 460 bytes

For 400K entries it would take about 3 megs of more memory. Where the
total database size is around 240-250 megs.

 Netlink dumping gives up accuracy[1], uses 50% of the memory you are 
 proposing but abuses more cpu cycles. User space could maintain the
 list you are proposing instead - and compensate for the innaccuracies
 by watching events[2] Of course that shifts the accuracy to events
 which could be lost on their way to user space. This issue is
 alleviated a little more with your approach of keeping the list in
 the kernel and adding new updates to the tail of the dump list
 (which, btw, your patch didnt seem to do); however, even if you solve
 that: ** you are still will be faced with challenges of not being
 atomic on updates; example an entry already on your dump list could
 be deleted before being read by user space. I cant see you solving
 that without abusing a _lot_ more cpu (trying to find the entry on
 the dump list that may have gotten updated or deleted). Theres also
 the issue of how long to keep the dump list before aging it out and
 how to deal with multiple users.

If more entries are added, you can get notifications of them.
 
Cheers,
  Timo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-16 Thread Timo Teräs
jamal wrote:
 On Wed, 2008-16-01 at 16:28 +0200, Timo Teräs wrote:
  No. I'm not creating second copies of the SADB/SPD entries. The entries
  are just added to one more list.
 
 Ah, sorry - yes, that sounds reasonable.
 So what happens if i delete an entry; does it get removed from the list?
 Also what happens on modification?

If the entry is removed befored it is dumped, it wont be dumped at all.
The state during dump code execution is returned. Depending when the
modification occurs it might or might not be reflected in the dumped
entry.

  If more entries are added, you can get notifications of them.
 
 how would a user app (example racoon) appropriately deal with it?
 Example an entry sits in the dump-list, it gets deleted - an event gets
 generated user-space and later that entry shows up in user space dump.

You listen for the events. It is guaranteed that if the dumping code
does return the entry to be deleted, the deletion notification will
occur after that dump entry.

Herbert Xu wrote:
 On Wed, Jan 16, 2008 at 08:39:40PM -0500, jamal wrote:
 I wouldnt disagree except some apps like racoon which depend on pfkey
 are unfortunately beyond repair. Timo has a pretty good handle on the
 
 Racoon doesn't use pfkey dumping as far as I know.

ipsec-tools racoon uses pfkey and only pfkey. And it's non trivial to
make it use netlink; it relies heavily all around the code to pfkey
structs. It also runs on BSD so we cannot rip pfkey away; adding a
layer to work with both pfkey and netlink would be doable, but just a
lot of work.

Also ipsec-tools racoon seems to be the default IKE daemon in some
popular distros. So for the time being I think pfkey is an evil we have
to live with.

Cheers,
  Timo
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-16 Thread Timo Teräs
Herbert Xu wrote:
 jamal [EMAIL PROTECTED] wrote:
 There are two issues that are inter-mingled in there. The most important
 is pf_key not being robust on dump. The other being the accurracy of
 
 IMHO doing significant work on af_key is a waste of time.  It has no
 advantages at all over xfrm_user since neither is portable.  So we
 should discourage people from using af_key wherever possible.

I don't know about netlink. But pfkey works in *BSD too and it is RFC'd.
So I'd say pfkey might be a bit more portable. Though netlink is definitely
more robust and extensive.

Cheers,
  Timo


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-16 Thread Timo Teräs
David Miller wrote:
 From: Timo_Teräs [EMAIL PROTECTED]
 Date: Thu, 17 Jan 2008 08:27:14 +0200
 
 I don't know about netlink. But pfkey works in *BSD too and it is RFC'd.
 So I'd say pfkey might be a bit more portable. Though netlink is definitely
 more robust and extensive.
 
 The RFCs say absolutely nothing about policy interfaces for AF_KEY,
 everybody rolls their own in slightly incompatible ways.
 
 It is therefore anything but standardized.

Yes, there's non-standardized extensions. But the point was that there are
other implementations of pfkey. And ipsec-tools racoon is an example of
a widely used application that runs in Linux and *BSD using this API. So
for the time being I'd consider having pfkey fixes as a good thing. This
pfkey dumping problem seems to be affecting many users.

- Timo
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH] Fixing SA/SP dumps on netlink/af_key

2008-01-13 Thread Timo Teräs
Hi all,

The problem with IPsec SP/SA dumping is that they don't work well with large 
SPD/SADB:s. In netlink the dumping code is O(n^2) and can miss some 
entries/dump duplicates if the DB is changed between the recv() calls to read 
the dump entries. This is due to the entry index counting done in xfrm_user 
code. With af_key the dump entries are just dropped when the socket receive 
queue is filled.

I tried to address all these problems, and added the xfrm_policy and xfrm_state 
structure to a new linked list that is used solely for dumping. This way the 
dumps can be done in O(n) and have an iterator point to them. I also modified 
to af_key to stop dumping when receive queue is filling up and continue it from 
a callback at recvfrom() when there's more room.

This patch is against 2.6.23 vanilla. I wanted to get some feedback before I 
make it against the git tree.

But I'd like to hear about:
- if the approach is ok (adding the entries in one more list)?
- if the new/modified variables, structures and function names are ok?
- if I should port these against net-2.6 or net-2.6.25 git tree?
- if I should split this to more than one commit? (maybe xfrm core, xfrm user 
and af_key parts?)

Cheers,
  Timo

Index: linux-2.6.23/include/net/xfrm.h
===
--- linux-2.6.23.orig/include/net/xfrm.h2008-01-13 14:13:21.0 
+0200
+++ linux-2.6.23/include/net/xfrm.h 2008-01-13 14:13:34.0 +0200
@@ -104,6 +104,7 @@
 struct xfrm_state
 {
/* Note: bydst is re-used during gc */
+   struct list_headall;
struct hlist_node   bydst;
struct hlist_node   bysrc;
struct hlist_node   byspi;
@@ -346,6 +347,7 @@
 struct xfrm_policy
 {
struct xfrm_policy  *next;
+   struct list_headbytype;
struct hlist_node   bydst;
struct hlist_node   byidx;
 
@@ -912,6 +914,17 @@
int priority;
 };
 
+struct xfrm_state_walker {
+   struct xfrm_state *state;
+   int count;
+};
+
+struct xfrm_policy_walker {
+   struct xfrm_policy *policy;
+   int count;
+   int type;
+};
+
 extern void xfrm_init(void);
 extern void xfrm4_init(void);
 extern void xfrm6_init(void);
@@ -921,7 +934,15 @@
 extern void xfrm6_state_init(void);
 extern void xfrm6_state_fini(void);
 
-extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, 
void*), void *);
+static inline void xfrm_state_walk_start(struct xfrm_state_walker *walk)
+{
+   walk-state = NULL;
+   walk-count = 0;
+}
+
+extern int xfrm_state_walk(struct xfrm_state_walker *walk, u8 proto,
+  int (*func)(struct xfrm_state *, int, void*), void 
*);
+extern void xfrm_state_walk_end(struct xfrm_state_walker *walk);
 extern struct xfrm_state *xfrm_state_alloc(void);
 extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, 
xfrm_address_t *saddr, 
  struct flowi *fl, struct xfrm_tmpl 
*tmpl,
@@ -1025,7 +1046,17 @@
 #endif
 
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
-extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, 
int, void*), void *);
+
+static inline void xfrm_policy_walk_start(struct xfrm_policy_walker *walk)
+{
+   walk-policy = NULL;
+   walk-count = 0;
+   walk-type = XFRM_POLICY_TYPE_MAIN;
+}
+
+extern int xfrm_policy_walk(struct xfrm_policy_walker *walk, u8 type,
+   int (*func)(struct xfrm_policy *, int, int, void*), void *);
+extern void xfrm_policy_walk_end(struct xfrm_policy_walker *walk);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
 struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
  struct xfrm_selector *sel,
Index: linux-2.6.23/net/key/af_key.c
===
--- linux-2.6.23.orig/net/key/af_key.c  2008-01-13 14:13:21.0 +0200
+++ linux-2.6.23/net/key/af_key.c   2008-01-13 14:13:34.0 +0200
@@ -48,6 +48,16 @@
struct sock sk;
int registered;
int promisc;
+
+   u8  dump_type;
+   uint8_t dump_msg_version;
+   uint32_tdump_msg_pid;
+   int (*dump)(struct pfkey_sock *sk);
+   void(*dump_done)(struct pfkey_sock *sk);
+   union {
+   struct xfrm_policy_walker   policy;
+   struct xfrm_state_walkerstate;
+   } u;
 };
 
 static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
@@ -55,6 +65,30 @@
return (struct pfkey_sock *)sk;
 }
 
+static int pfkey_can_dump(struct sock *sk)
+{
+   if (3 * atomic_read(sk-sk_rmem_alloc) = 2 * sk-sk_rcvbuf)
+   return 1;
+   return 0;
+}
+
+static int pfkey_do_dump(struct pfkey_sock *pfk)
+{
+   int rc;
+
+   rc = pfk-dump(pfk);
+   if (rc != 0) {
+   

[PATCH][IPV4] ip_gre: set mac_header correctly in receive path

2007-12-19 Thread Timo Teräs
From: Timo Teras [EMAIL PROTECTED]

mac_header update in ipgre_recv() was incorrectly changed to
skb_reset_mac_header() when it was introduced.

Signed-off-by: Timo Teras [EMAIL PROTECTED]
---
This replaces my earlier patch titled ip_gre: use skb-{mac,
network}_header consistently. Apparently I hadn't done my homework how
to use *_header correctly. And I should have done a bit more testing to
figure out the previous patch does not work.

But the main problem was the receive path in the first place, and this
patch fixes it.

The bug was introduced in commit 459a98ed881802dee55897441bc7f77af614368e.
There might be other similar incorrect replaces.


 net/ipv4/ip_gre.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 02b02a8..4b93f32 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -613,7 +613,7 @@ static int ipgre_rcv(struct sk_buff *skb)
offset += 4;
}
 
-   skb_reset_mac_header(skb);
+   skb-mac_header = skb-network_header;
__pskb_pull(skb, offset);
skb_reset_network_header(skb);
skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
-- 
1.5.2.5
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH][IPV4] ip_gre: use skb-{mac,network}_header consistently

2007-12-18 Thread Timo Teräs
From: Timo Teras [EMAIL PROTECTED]

Make both send and receive paths use consistently skb-*_header:
  mac_header = outer IPv4 header
  network_header = encapsulated packet (e.g. inner IPv4 header)

Signed-off-by: Timo Teras [EMAIL PROTECTED]
---
Currently in send path:
  network_header = outer IPv4 header
  transport_header = encapsulated packet
And receive path:
  network_header = encapsulated packet
  mac_header = GRE header

mac_header used to be the outer IPv4 header on receive path, but it was
broken/changed when skb_reset_mac_header() was introduced.

This also makes ipgre_header_parse() work correctly as it assumes that
mac_header points to the outer IPv4 header always. Though, it returns
zeros for outgoing packets that are tcpdumped from gre tunnel without
explicit local address, since the routing is done in ipgre_tunnel_xmit().

 net/ipv4/ip_gre.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 02b02a8..eba1ade 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -613,7 +613,7 @@ static int ipgre_rcv(struct sk_buff *skb)
offset += 4;
}
 
-   skb_reset_mac_header(skb);
+   skb-mac_header = skb-network_header;
__pskb_pull(skb, offset);
skb_reset_network_header(skb);
skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
@@ -826,9 +826,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct 
net_device *dev)
old_iph = ip_hdr(skb);
}
 
-   skb-transport_header = skb-network_header;
skb_push(skb, gre_hlen);
-   skb_reset_network_header(skb);
+   skb_reset_mac_header(skb);
memset((IPCB(skb)-opt), 0, sizeof(IPCB(skb)-opt));
IPCB(skb)-flags = ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
  IPSKB_REROUTED);
@@ -839,7 +838,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct 
net_device *dev)
 *  Push down and install the IPIP header.
 */
 
-   iph =   ip_hdr(skb);
+   iph =   (struct iphdr *) skb_mac_header(skb);
iph-version=   4;
iph-ihl=   sizeof(struct iphdr)  2;
iph-frag_off   =   df;
@@ -1070,6 +1069,7 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
struct iphdr *iph = (struct iphdr *)skb_push(skb, t-hlen);
__be16 *p = (__be16*)(iph+1);
 
+   skb_reset_mac_header(skb);
memcpy(iph, t-parms.iph, sizeof(struct iphdr));
p[0]= t-parms.o_flags;
p[1]= htons(type);
-- 
1.5.2.5


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address

2007-10-23 Thread Timo Teräs

When GRE tunnel is in NBMA mode, this patch allows an application to use
a PF_PACKET socket to:
- send a packet to specific NBMA address with sendto()
- use recvfrom() to receive packet and check which NBMA address it came from

Signed-off-by: Timo Teras [EMAIL PROTECTED]

---

This is useful for implementing NHRP (and other low level protocols) over
NBMA GRE tunnels.

Posted about a week ago, but got no reply. Could someone comment on this?
Or commit it? Thanks.

net/ipv4/ip_gre.c |   12 +++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f151900..b1e3816 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device 
*dev, int new_mtu)
return 0;
}

-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
 It allows to construct virtual multiprotocol broadcast LAN
 over the Internet, provided multicast routing is tuned.
@@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
return -t-hlen;
}

+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+   struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
+   memcpy(haddr, iph-saddr, 4);
+   return 4;
+}
+
static const struct header_ops ipgre_header_ops = {
.create = ipgre_header,
+   .parse  = ipgre_header_parse,
};

+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev-stop = ipgre_close;
}
#endif
+   } else {
+   dev-header_ops = ipgre_header_ops;
}

if (!tdev  tunnel-parms.link)


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address

2007-10-23 Thread Timo Teräs

When GRE tunnel is in NBMA mode, this patch allows an application to use
a PF_PACKET socket to:
- send a packet to specific NBMA address with sendto()
- use recvfrom() to receive packet and check which NBMA address it came from

This is required to implement properly NHRP over GRE tunnel.

Signed-off-by: Timo Teras [EMAIL PROTECTED]

---

Your mailer mangled tabs, it won't apply like this. Try sending to
yourself for testing, then repost when it works properly. 


Sorry. Tabs were ok, but the initial white space on context lines was somehow
removed. Apparently a bug in my editor where I cut/pasted from.

net/ipv4/ip_gre.c |   12 +++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f151900..b1e3816 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device 
*dev, int new_mtu)
return 0;
}

-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
   It allows to construct virtual multiprotocol broadcast LAN
   over the Internet, provided multicast routing is tuned.
@@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
return -t-hlen;
}

+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+   struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
+   memcpy(haddr, iph-saddr, 4);
+   return 4;
+}
+
static const struct header_ops ipgre_header_ops = {
.create = ipgre_header,
+   .parse  = ipgre_header_parse,
};

+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev-stop = ipgre_close;
}
#endif
+   } else {
+   dev-header_ops = ipgre_header_ops;
}

if (!tdev  tunnel-parms.link)



-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address

2007-10-23 Thread Timo Teräs

When GRE tunnel is in NBMA mode, this patch allows an application to use
a PF_PACKET socket to:
- send a packet to specific NBMA address with sendto()
- use recvfrom() to receive packet and check which NBMA address it came from

This is required to implement properly NHRP over GRE tunnel.

Signed-off-by: Timo Teras [EMAIL PROTECTED]

---
Patrick McHardy wrote:

Your mailer mangled tabs, it won't apply like this. Try sending to
yourself for testing, then repost when it works properly.


Tabs were ok, I think. But somehow it seems that cut'n'paste from my
default editor loses the space from context lines. It should be good now.

net/ipv4/ip_gre.c |   12 +++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f151900..b1e3816 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device 
*dev, int new_mtu)
return 0;
}

-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
   It allows to construct virtual multiprotocol broadcast LAN
   over the Internet, provided multicast routing is tuned.
@@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
return -t-hlen;
}

+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+   struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
+   memcpy(haddr, iph-saddr, 4);
+   return 4;
+}
+
static const struct header_ops ipgre_header_ops = {
.create = ipgre_header,
+   .parse  = ipgre_header_parse,
};

+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev-stop = ipgre_close;
}
#endif
+   } else {
+   dev-header_ops = ipgre_header_ops;
}

if (!tdev  tunnel-parms.link)
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND] ip_gre: sendto/recvfrom NBMA address

2007-10-23 Thread Timo Teräs

Hi,

Alexey Kuznetsov wrote:

The only dubious case is when next hop is set using routing tables.
But code in ipgre_tunnel_xmit() is ready to accept this situation,
it checks for zero destination address and fixes it when it is able to.


Nevertheless, it does not work.

Another thoughts?


Could you explain this a little more?

I was able to set a nbma gre tunnel, add routes to it and it worked
perfectly ok.

Link-level next hop worked:
ip route add route via link-level-address dev tunnel-dev onlink

Also normal route with static neighbor worked:
ip neigh add router-tunnel-ip lladdr link-level-address nud permanent dev 
tunnel-dev
ip route add route via router-tunnel-ip

In both cases packets were routed as expected to hosts accessible using
the added route. And sendto() sent a packet correctly with destination
link-level-address set. Did I miss/misunderstand something?

Cheers,
 Timo
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] ip_gre: sendto/recvfrom NBMA address

2007-10-17 Thread Timo Teräs

When GRE tunnel is in NBMA mode, this patch allows an application to use
a PF_PACKET socket to:
- send a packet to specific NBMA address with sendto()
- use recvfrom() to receive packet and check which NBMA address it came from

Signed-off-by: Timo Teras [EMAIL PROTECTED]

---

This is useful for implementing NHRP over NBMA GRE tunnels.

Sent earlier to netdev list only. Resending to wider audience with
a style issue fixed.

net/ipv4/ip_gre.c |   12 +++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f151900..b1e3816 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device 
*dev, int new_mtu)
return 0;
}

-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
  It allows to construct virtual multiprotocol broadcast LAN
  over the Internet, provided multicast routing is tuned.
@@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
return -t-hlen;
}

+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+   struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
+   memcpy(haddr, iph-saddr, 4);
+   return 4;
+}
+
static const struct header_ops ipgre_header_ops = {
.create = ipgre_header,
+   .parse  = ipgre_header_parse,
};

+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev-stop = ipgre_close;
}
#endif
+   } else {
+   dev-header_ops = ipgre_header_ops;
}

if (!tdev  tunnel-parms.link)

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


ip_gre: sendto/recvfrom NBMA address

2007-10-15 Thread Timo Teräs

When GRE tunnel is in NBMA mode, this patch allows an application to use
a PF_PACKET socket to:
- send a packet to specific NBMA address with sendto()
- use recvfrom() to receive packet and check which NBMA address it came from

This is required to implement properly NHRP over GRE tunnel.

Signed-off-by: Timo Teras [EMAIL PROTECTED]

---
commit 14ebc80596af5e8422b5db2120cf3e0d801ddd3a
tree b9fc7b2b8465d506deb5a5f05f22c04f5049cd5f
parent 4271e0f7e12bdbbd7ce131187aaae2ba5233a309
author Timo Teras [EMAIL PROTECTED] Mon, 15 Oct 2007 19:49:52 +0300
committer Timo Teras [EMAIL PROTECTED] Mon, 15 Oct 2007 19:49:52 +0300

net/ipv4/ip_gre.c |   12 +++-
1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f151900..b1e3816 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device 
*dev, int new_mtu)
return 0;
}

-#ifdef CONFIG_NET_IPGRE_BROADCAST
/* Nice toy. Unfortunately, useless in real life :-)
   It allows to construct virtual multiprotocol broadcast LAN
   over the Internet, provided multicast routing is tuned.
@@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct 
net_device *dev,
return -t-hlen;
}

+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+   struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
+   memcpy(haddr, iph-saddr, 4);
+   return 4;
+}
+
static const struct header_ops ipgre_header_ops = {
.create = ipgre_header,
+   .parse  = ipgre_header_parse,
};

+#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev-stop = ipgre_close;
}
#endif
+   } else {
+   dev-header_ops = ipgre_header_ops;
}

if (!tdev  tunnel-parms.link)
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html