[PATCH] via-velocity: unconditionally drop frames with bad l2 length
By default the driver allowed incorrect frames to be received. What is worse the code does not handle very short frames correctly. The FCS length is unconditionally subtracted, and the underflow can cause skb_put to be called with large number after implicit cast to unsigned. And indeed, an skb_over_panic() was observed with via-velocity. This removes the module parameter as it does not work in it's current state, and should be implemented via NETIF_F_RXALL if needed. Suggested-by: Francois Romieu <rom...@fr.zoreil.com> Signed-off-by: Timo Teräs <timo.te...@iki.fi> --- Francois, is this something like you had in mind? I can try give this a test spin in the known bad location, if this looks otherwise ok. drivers/net/ethernet/via/via-velocity.c | 24 +++- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index a43e849..03ce386 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -345,13 +345,6 @@ VELOCITY_PARAM(flow_control, "Enable flow control ability"); */ VELOCITY_PARAM(speed_duplex, "Setting the speed and duplex mode"); -#define VAL_PKT_LEN_DEF 0 -/* ValPktLen[] is used for setting the checksum offload ability of NIC. - 0: Receive frame with invalid layer 2 length (Default) - 1: Drop frame with invalid layer 2 length -*/ -VELOCITY_PARAM(ValPktLen, "Receiving or Drop invalid 802.3 frame"); - #define WOL_OPT_DEF 0 #define WOL_OPT_MIN 0 #define WOL_OPT_MAX 7 @@ -494,7 +487,6 @@ static void velocity_get_options(struct velocity_opt *opts, int index, velocity_set_int_opt(>flow_cntl, flow_control[index], FLOW_CNTL_MIN, FLOW_CNTL_MAX, FLOW_CNTL_DEF, "flow_control", devname); velocity_set_bool_opt(>flags, IP_byte_align[index], IP_ALIG_DEF, VELOCITY_FLAGS_IP_ALIGN, "IP_byte_align", devname); - velocity_set_bool_opt(>flags, ValPktLen[index], VAL_PKT_LEN_DEF, VELOCITY_FLAGS_VAL_PKT_LEN, "ValPktLen", devname); velocity_set_int_opt((int *) >spd_dpx, speed_duplex[index], MED_LNK_MIN, MED_LNK_MAX, MED_LNK_DEF, "Media link mode", devname); velocity_set_int_opt(>wol_opts, wol_opts[index], WOL_OPT_MIN, WOL_OPT_MAX, WOL_OPT_DEF, "Wake On Lan options", devname); opts->numrx = (opts->numrx & ~3); @@ -2055,8 +2047,9 @@ static int velocity_receive_frame(struct velocity_info *vptr, int idx) int pkt_len = le16_to_cpu(rd->rdesc0.len) & 0x3fff; struct sk_buff *skb; - if (rd->rdesc0.RSR & (RSR_STP | RSR_EDP)) { - VELOCITY_PRT(MSG_LEVEL_VERBOSE, KERN_ERR " %s : the received frame spans multiple RDs.\n", vptr->netdev->name); + if (unlikely(rd->rdesc0.RSR & (RSR_STP | RSR_EDP | RSR_RL))) { + if (rd->rdesc0.RSR & (RSR_STP | RSR_EDP)) + VELOCITY_PRT(MSG_LEVEL_VERBOSE, KERN_ERR " %s : the received frame spans multiple RDs.\n", vptr->netdev->name); stats->rx_length_errors++; return -EINVAL; } @@ -2069,17 +2062,6 @@ static int velocity_receive_frame(struct velocity_info *vptr, int idx) dma_sync_single_for_cpu(vptr->dev, rd_info->skb_dma, vptr->rx.buf_sz, DMA_FROM_DEVICE); - /* -* Drop frame not meeting IEEE 802.3 -*/ - - if (vptr->flags & VELOCITY_FLAGS_VAL_PKT_LEN) { - if (rd->rdesc0.RSR & RSR_RL) { - stats->rx_length_errors++; - return -EINVAL; - } - } - velocity_rx_csum(rd, skb); if (velocity_rx_copy(, pkt_len, vptr) < 0) { -- 2.6.3 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] ip_tunnel: fix ipv4 pmtu check to honor inner ip header df
Frag needed should be sent only if the inner header asked to not fragment. Currently fragmentation is broken if the tunnel has df set. The tunnel's df needs to be still checked to update internally the pmtu cache. This got broken in commit 23a3647bc4f93bac and this fixes the pmtu check back to the way it was. Fixes: 23a3647bc4f93bac (ip_tunnels: Use skb-len to PMTU check.) Cc: Pravin B Shelar pshe...@nicira.com --- Should go to -stable queues (3.12.y and never). net/ipv4/ip_tunnel.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0bb8e14..6822572 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -587,7 +587,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, EXPORT_SYMBOL(ip_tunnel_encap); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, - struct rtable *rt, __be16 df) + struct rtable *rt, __be16 df, + const struct iphdr *inner_iph) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size = skb-len - tunnel-hlen - dev-hard_header_len; @@ -604,7 +605,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, if (skb-protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) - (df htons(IP_DF)) mtu pkt_size) { + (inner_iph-frag_off htons(IP_DF)) + mtu pkt_size) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; @@ -738,7 +740,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off, inner_iph)) { ip_rt_put(rt); goto tx_error; } -- 2.4.5 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net,v2] ip_tunnel: fix ipv4 pmtu check to honor inner ip header df
Frag needed should be sent only if the inner header asked to not fragment. Currently fragmentation is broken if the tunnel has df set, but df was not asked in the original packet. The tunnel's df needs to be still checked to update internally the pmtu cache. Commit 23a3647bc4f93bac broke it, and this commit fixes the ipv4 df check back to the way it was. Fixes: 23a3647bc4f93bac (ip_tunnels: Use skb-len to PMTU check.) Cc: Pravin B Shelar pshe...@nicira.com Signed-off-by: Timo Teräs timo.te...@iki.fi --- Should go to -stable queues (3.12.y and newer). v2: revised commit message wording a bit, and added signed-off-by line that was forgotten accidentally. net/ipv4/ip_tunnel.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 4c2c3ba..626d9e5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -586,7 +586,8 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, EXPORT_SYMBOL(ip_tunnel_encap); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, - struct rtable *rt, __be16 df) + struct rtable *rt, __be16 df, + const struct iphdr *inner_iph) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size = skb-len - tunnel-hlen - dev-hard_header_len; @@ -603,7 +604,8 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, if (skb-protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) - (df htons(IP_DF)) mtu pkt_size) { + (inner_iph-frag_off htons(IP_DF)) + mtu pkt_size) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; @@ -737,7 +739,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off)) { + if (tnl_update_pmtu(dev, skb, rt, tnl_params-frag_off, inner_iph)) { ip_rt_put(rt); goto tx_error; } -- 2.4.5 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHv2] [XFRM]: Speed up xfrm_policy and xfrm_state walking
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n). This is achieved adding the entries to one more list which is used solely for walking the entries. This also fixes some races where the dump can have duplicate or missing entries when the SPD/SADB is modified during an ongoing dump. Dumping SADB with 2 entries using time ip xfrm state the sys time dropped from 1.012s to 0.080s. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- David Miller wrote: This will potentially leak walk-policy in the != -EEXIST case. I think what needs to happen is we invoke xfrm_policy_walk_done() unconditionally, then we'll potentially return reqid. Good catch. Fixed as suggested. Also the walk context structure layout was changed to align better on 64-bit machines as suggested by Margit Schubert-While. include/linux/xfrm.h |3 +- include/net/xfrm.h | 52 ++- net/key/af_key.c | 24 -- net/xfrm/xfrm_policy.c | 79 net/xfrm/xfrm_state.c | 53 ++-- net/xfrm/xfrm_user.c | 71 ++- 6 files changed, 197 insertions(+), 85 deletions(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index e31b8c8..0c82c80 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -113,7 +113,8 @@ enum { XFRM_POLICY_TYPE_MAIN = 0, XFRM_POLICY_TYPE_SUB= 1, - XFRM_POLICY_TYPE_MAX= 2 + XFRM_POLICY_TYPE_MAX= 2, + XFRM_POLICY_TYPE_ANY= 255 }; enum diff --git a/include/net/xfrm.h b/include/net/xfrm.h index eea7785..9b62056 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex; struct xfrm_state { /* Note: bydst is re-used during gc */ + struct list_headall; struct hlist_node bydst; struct hlist_node bysrc; struct hlist_node byspi; @@ -424,6 +425,7 @@ struct xfrm_tmpl struct xfrm_policy { struct xfrm_policy *next; + struct list_headbytype; struct hlist_node bydst; struct hlist_node byidx; @@ -1160,6 +1162,18 @@ struct xfrm6_tunnel { int priority; }; +struct xfrm_state_walk { + struct xfrm_state *state; + int count; + u8 proto; +}; + +struct xfrm_policy_walk { + struct xfrm_policy *policy; + int count; + u8 type, cur_type; +}; + extern void xfrm_init(void); extern void xfrm4_init(void); extern void xfrm_state_init(void); @@ -1184,7 +1198,23 @@ static inline void xfrm6_fini(void) extern int xfrm_proc_init(void); #endif -extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); +static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto) +{ + walk-proto = proto; + walk-state = NULL; + walk-count = 0; +} + +static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk) +{ + if (walk-state != NULL) { + xfrm_state_put(walk-state); + walk-state = NULL; + } +} + +extern int xfrm_state_walk(struct xfrm_state_walk *walk, + int (*func)(struct xfrm_state *, int, void*), void *); extern struct xfrm_state *xfrm_state_alloc(void); extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -1306,7 +1336,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) #endif struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); -extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*), void *); + +static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) +{ + walk-cur_type = XFRM_POLICY_TYPE_MAIN; + walk-type = type; + walk-policy = NULL; + walk-count = 0; +} + +static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk) +{ + if (walk-policy != NULL) { + xfrm_pol_put(walk-policy); + walk-policy = NULL; + } +} + +extern int xfrm_policy_walk(struct xfrm_policy_walk *walk, + int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir, struct xfrm_selector *sel, diff --git a/net/key/af_key.c b/net/key/af_key.c index 1c85392..0d7c195 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr { u8 proto; struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk }; + struct xfrm_state_walk walk; + int rc; proto =
[PATCH RESEND] [XFRM]: Speed up xfrm_policy and xfrm_state walking
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n). This is achieved adding the entries to one more list which is used solely for walking the entries. This also fixes some races where the dump can have duplicate or missing entries when the SPD/SADB is modified during an ongoing dump. Dumping SADB with 2 entries using time ip xfrm state the sys time dropped from 1.012s to 0.080s. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- include/linux/xfrm.h |3 +- include/net/xfrm.h | 52 ++- net/key/af_key.c | 22 +++-- net/xfrm/xfrm_policy.c | 79 net/xfrm/xfrm_state.c | 53 ++-- net/xfrm/xfrm_user.c | 71 ++- 6 files changed, 195 insertions(+), 85 deletions(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index e31b8c8..0c82c80 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -113,7 +113,8 @@ enum { XFRM_POLICY_TYPE_MAIN = 0, XFRM_POLICY_TYPE_SUB= 1, - XFRM_POLICY_TYPE_MAX= 2 + XFRM_POLICY_TYPE_MAX= 2, + XFRM_POLICY_TYPE_ANY= 255 }; enum diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ac72116..7bd021b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex; struct xfrm_state { /* Note: bydst is re-used during gc */ + struct list_headall; struct hlist_node bydst; struct hlist_node bysrc; struct hlist_node byspi; @@ -424,6 +425,7 @@ struct xfrm_tmpl struct xfrm_policy { struct xfrm_policy *next; + struct list_headbytype; struct hlist_node bydst; struct hlist_node byidx; @@ -1157,6 +1159,18 @@ struct xfrm6_tunnel { int priority; }; +struct xfrm_state_walk { + u8 proto; + struct xfrm_state *state; + int count; +}; + +struct xfrm_policy_walk { + u8 type, cur_type; + struct xfrm_policy *policy; + int count; +}; + extern void xfrm_init(void); extern void xfrm4_init(void); extern void xfrm_state_init(void); @@ -1181,7 +1195,23 @@ static inline void xfrm6_fini(void) extern int xfrm_proc_init(void); #endif -extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); +static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto) +{ + walk-proto = proto; + walk-state = NULL; + walk-count = 0; +} + +static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk) +{ + if (walk-state != NULL) { + xfrm_state_put(walk-state); + walk-state = NULL; + } +} + +extern int xfrm_state_walk(struct xfrm_state_walk *walk, + int (*func)(struct xfrm_state *, int, void*), void *); extern struct xfrm_state *xfrm_state_alloc(void); extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -1303,7 +1333,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) #endif struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); -extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*), void *); + +static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) +{ + walk-cur_type = XFRM_POLICY_TYPE_MAIN; + walk-type = type; + walk-policy = NULL; + walk-count = 0; +} + +static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk) +{ + if (walk-policy != NULL) { + xfrm_pol_put(walk-policy); + walk-policy = NULL; + } +} + +extern int xfrm_policy_walk(struct xfrm_policy_walk *walk, + int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir, struct xfrm_selector *sel, diff --git a/net/key/af_key.c b/net/key/af_key.c index 16b72b5..d24bb9c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr { u8 proto; struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk }; + struct xfrm_state_walk walk; + int rc; proto = pfkey_satype2proto(hdr-sadb_msg_satype); if (proto == 0) return -EINVAL; - return xfrm_state_walk(proto, dump_sa, data); + xfrm_state_walk_init(walk, proto); + rc = xfrm_state_walk(walk, dump_sa, data); + xfrm_state_walk_done(walk); + + return rc; } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg
[PATCH net-2.6.25] [XFRM]: Speed up xfrm_policy and xfrm_state walking
Change xfrm_policy and xfrm_state walking algorithm from O(n^2) to O(n). This is achieved adding the entries to one more list which is used solely for walking the entries. This also fixes some races where the dump can have duplicate or missing entries when the SPD/SADB is modified during an ongoing dump. Dumping SADB with 2 entries using time ip xfrm state the sys time dropped from 1.012s to 0.080s. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- include/linux/xfrm.h |3 +- include/net/xfrm.h | 52 ++- net/key/af_key.c | 22 +++-- net/xfrm/xfrm_policy.c | 79 net/xfrm/xfrm_state.c | 53 ++-- net/xfrm/xfrm_user.c | 71 ++- 6 files changed, 195 insertions(+), 85 deletions(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 9b5b00c..60e7395 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -106,7 +106,8 @@ enum { XFRM_POLICY_TYPE_MAIN = 0, XFRM_POLICY_TYPE_SUB= 1, - XFRM_POLICY_TYPE_MAX= 2 + XFRM_POLICY_TYPE_MAX= 2, + XFRM_POLICY_TYPE_ANY= 255 }; enum diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5ebb9ba..a6a6adf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -121,6 +121,7 @@ extern struct mutex xfrm_cfg_mutex; struct xfrm_state { /* Note: bydst is re-used during gc */ + struct list_headall; struct hlist_node bydst; struct hlist_node bysrc; struct hlist_node byspi; @@ -423,6 +424,7 @@ struct xfrm_tmpl struct xfrm_policy { struct xfrm_policy *next; + struct list_headbytype; struct hlist_node bydst; struct hlist_node byidx; @@ -1151,6 +1153,18 @@ struct xfrm6_tunnel { int priority; }; +struct xfrm_state_walk { + u8 proto; + struct xfrm_state *state; + int count; +}; + +struct xfrm_policy_walk { + u8 type, cur_type; + struct xfrm_policy *policy; + int count; +}; + extern void xfrm_init(void); extern void xfrm4_init(void); extern void xfrm_state_init(void); @@ -1175,7 +1189,23 @@ static inline void xfrm6_fini(void) extern int xfrm_proc_init(void); #endif -extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); +static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto) +{ + walk-proto = proto; + walk-state = NULL; + walk-count = 0; +} + +static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk) +{ + if (walk-state != NULL) { + xfrm_state_put(walk-state); + walk-state = NULL; + } +} + +extern int xfrm_state_walk(struct xfrm_state_walk *walk, + int (*func)(struct xfrm_state *, int, void*), void *); extern struct xfrm_state *xfrm_state_alloc(void); extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -1297,7 +1327,25 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) #endif struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); -extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*), void *); + +static inline void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) +{ + walk-cur_type = XFRM_POLICY_TYPE_MAIN; + walk-type = type; + walk-policy = NULL; + walk-count = 0; +} + +static inline void xfrm_policy_walk_done(struct xfrm_policy_walk *walk) +{ + if (walk-policy != NULL) { + xfrm_pol_put(walk-policy); + walk-policy = NULL; + } +} + +extern int xfrm_policy_walk(struct xfrm_policy_walk *walk, + int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir, struct xfrm_selector *sel, diff --git a/net/key/af_key.c b/net/key/af_key.c index 16b72b5..d24bb9c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1742,12 +1742,18 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr { u8 proto; struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk }; + struct xfrm_state_walk walk; + int rc; proto = pfkey_satype2proto(hdr-sadb_msg_satype); if (proto == 0) return -EINVAL; - return xfrm_state_walk(proto, dump_sa, data); + xfrm_state_walk_init(walk, proto); + rc = xfrm_state_walk(walk, dump_sa, data); + xfrm_state_walk_done(walk); + + return rc; } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: Doing anything other than life support bug fixes for AF_KEY is inappropriate. Yes. I thought my patch would qualify as life support bug fix. Currently racoon fails to work if there are too many SPDs or SAs because the kernel cannot handle the dump request properly. And this is what my patch fixes for pfkey. It adds no new features or functionality; just makes the dumping work with large databases. Then there's also the xfrm dumping changes which change the algorithm from O(n^2) to O(n) with some memory overhead, but that is a different story. Any comments on that? - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 10:11:17 +0200 I thought my patch would qualify as life support bug fix. Currently racoon fails to work if there are too many SPDs or SAs because the kernel cannot handle the dump request properly. And this is what my patch fixes for pfkey. It adds no new features or functionality; just makes the dumping work with large databases. Racoon should use netlink for reasons far and beyond the problem you are trying to address. Yes. But this is fairly major thing to do. One needs to create API abstraction layer (still need to use pfkey in *BSD). Test it. A lot of work that is not going to happen very soon. Where as the pfkey bug fix is non-intrusive and helps all legacy applications still using af_key by _fixing a bug in kernel_. The dumping behavior of AF_KEY is just horrific, as one of several examples. If af_key is all that bad and does not qualify to get maintanace bug fixes, why not remove it complitely? That would make userland adapt faster. Then there's also the xfrm dumping changes which change the algorithm from O(n^2) to O(n) with some memory overhead, but that is a different story. Any comments on that? I have no general objections to those changes although I am backlogged and thus have not studied them in detail. Jamal is having what appears to be a healthy dialogue with you about the details so I'm not concerned much :) Ok. I hope someone can also give feedback on the naming conventions. And about the api changes to xfrm policy/state walking. - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 11:20:42 +0200 Where as the pfkey bug fix is non-intrusive and helps all legacy applications still using af_key by _fixing a bug in kernel_. It's not a bug. You're fixing a speed issue, not a crash or a case where AF_KEY is providing incorrect data. That is what I mean when I mean life support, we fix crashes and data corruption. We don't make performance tweaks. No. The speed issue is complitely handled in xfrm_state and xfrm_user changes. The af_key issue is that in big dumps you get only first X entries. The rest of the entries are dropped because the socket receive buffer goes full. You get data corruption: missing entries. - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 11:38:13 +0200 The af_key issue is that in big dumps you get only first X entries. The rest of the entries are dropped because the socket receive buffer goes full. You get data corruption: missing entries. This is an inherent aspect of AF_KEY (and what it was derived from, BSD routing sockets). Yes, this is the way BSD does it. It has to provide dumps atomically, and if there is no space there is no way to provide those entries which would require more rcvbuf space. RFC does not say it has to be atomic. It does say that the dump is terminated with SADB_DUMP message having sadb_seq field set to zero. Currently that is dropped too when the problem occurs. Thus the socket is left in a bad state: dump ends never. This can cause applications without any workarounds to hang. - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 12:01:17 +0200 David Miller wrote: This is an inherent aspect of AF_KEY (and what it was derived from, BSD routing sockets). Yes, this is the way BSD does it. It has to provide dumps atomically, and if there is no space there is no way to provide those entries which would require more rcvbuf space. RFC does not say it has to be atomic. Every application out there in the universe expects BSD socket semantics, and therefore atomic dumps. You cannot fix things without breaking applications. IMHO, it's a lot better then losing 50% of entries and the end of sequence message on big dumps. SPD and SADB are not that volatile; in most of the cases the dump would be as good as an atomic one. Even if it did change during ongoing dump you still get an usable dump. All the entries reflect real data and there is no dependency between different entries. I'm not sure if there's other major applications that we should be concerned about, but at least ipsec-tools racoon does not expect to get atomic dumps (which btw, comes originally from BSD). Cheers, Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
Herbert Xu wrote: On Thu, Jan 17, 2008 at 07:54:32AM +0200, Timo Teräs wrote: Racoon doesn't use pfkey dumping as far as I know. ipsec-tools racoon uses pfkey and only pfkey. And it's non trivial to make it use netlink; it relies heavily all around the code to pfkey structs. It also runs on BSD so we cannot rip pfkey away; adding a layer to work with both pfkey and netlink would be doable, but just a lot of work. Sure racoon uses pfkey but the question is does it use pfkey dumping? Yes it does. It does SPD dump at startup and keeps the SP database in sync by listening to notifications. It also does SA dumps when it is figuring out which SAs to purge. I started to work on the xfrm/pfkey patch only because racoon is having problems with (as is anything else using pfkey). - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 13:00:09 +0200 IMHO, it's a lot better then losing 50% of entries and the end of sequence message on big dumps. SPD and SADB are not that volatile; in most of the cases the dump would be as good as an atomic one. I humbly disagree with you. Interface behavior stability is more important. Small SPDs/SADBs would still be dumped atomically. The patch affects only the cases when the receive queue is getting full. I'm not sure if there's other major applications that we should be concerned about, but at least ipsec-tools racoon does not expect to get atomic dumps (which btw, comes originally from BSD). Racoon was written as an addon to the BSD stack by an IPV6/IPSEC project in Japan named KAME, it did not come from BSD. It was added to BSD. There are also other BSD based IPSEC daemons such as the one written by the OpenBSD folks. Yes. I meant that it was originally written to be used in BSD. The Linux port came later. Sorry for the ambiguous wording. I don't think this is arguable at all. We're not changing semantics over what we've done for 4+ years and applications might depend upon. It's for a deprecated interface, which makes any semantic changes that much less inviting. You can argue all you want, but it will not change the invariants in the previous paragraph. True. If no one else agrees with me, I'll drop it. I can always run my own patched kernel. I'd appreciate feedback on the xfrm changes. I'll try to make that part usable patch against net-2.6.25 git tree next week. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
Herbert Xu wrote: On Thu, Jan 17, 2008 at 07:42:30AM -0500, jamal wrote: Looking at the pfkey RFC one more time, heres a funny quote: The dump message is used for debugging purposes only and is not intended for production use. In fact it goes much further: Support for the dump message MAY be discontinued in future versions of PF_KEY. Key management applications MUST NOT depend on this message for basic operation. I guess the idea was that application should know about the SAs it created. Though a SA dump needs to be done if you want to check for existing entries (created by other processes, or if you are recovering from a crash). SPD dumping is still a must if you want to work nicely with kernel. As noted earlier pfkey is not really standardized. E.g. the SPD dumping message are not in the RFC as David noted. The above RFC comments and the fact that SPD stuff is unspecified made me think that making non-atomic dumps would be a lot better alternative then leaving the socket to bad state which would make the application completely unusable. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
Herbert Xu wrote: On Thu, Jan 17, 2008 at 03:31:14PM +0200, Timo Teräs wrote: I guess the idea was that application should know about the SAs it created. Though a SA dump needs to be done if you want to check for existing entries (created by other processes, or if you are recovering from a crash). That's what SADB_GET is for. In any case KMs cannot coexist so this is pointless. After a crash you should simply flush all states and policies. I thought KMs could coexist. Actually, in strongSwan you have two daemons: charon and pluto. Both can run at the same time. The other is for IKEv1 and the other one for IKEv2. It uses netlink, though. Looking the way pfkey works, it looks like being designed to work with multiple KMs (e.g. acquires are sent to all registered sockets). SPD dumping is still a must if you want to work nicely with kernel. No it isn't. Look at how Openswan does it. No dumping anywhere at all. Then you have to have all policy/static association configuration in the application configuration. ipsec-tools wanted separate that. As this is more robust if someone decided to run multiple KMs. My point (as an ipsec-tools developer) is that the patch would fix a lot of problems until ipsec-tools is netlink enabled. It would handle perfectly the dumping even as non-atomic. And that ipsec-tools is the KM you get by default in almost all distributions. But apparently you are too worried that all the so many other KMs still using pfkey (is there others? I think most others use netlink already) in Linux might break because the rely on an unspecified feature of pfkey. I'm also tired of arguing since this is going nowhere. I'll run my patched kernel and try to get ipsec-tools fixed to use netlink... eventually. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
jamal wrote: On Sun, 2008-13-01 at 14:26 +0200, Timo Teräs wrote: I tried to address all these problems, and added the xfrm_policy and xfrm_state structure to a new linked list that is used solely for dumping. This way the dumps can be done in O(n) and have an iterator point to them. I also modified to af_key to stop dumping when receive queue is filling up and continue it from a callback at recvfrom() when there's more room. But I'd like to hear about: - if the approach is ok (adding the entries in one more list)? - if the new/modified variables, structures and function names are ok? - if I should port these against net-2.6 or net-2.6.25 git tree? - if I should split this to more than one commit? (maybe xfrm core, xfrm user and af_key parts?) To answer your last 2 questions: There are two issues that are inter-mingled in there. The most important is pf_key not being robust on dump. The other being the accurracy of netlink dumping - this has much broader implications. I think you need to separate the patches into those two at least and prioritize on the pf_key as a patch that you push in. I would also try to make the pf_key dumping approach to mirror what netlink does today - in the worst case it would be as innacurate as netlink; which is not bad. I think youll find no resistance getting such a patch in. Creating a separate af_key patch would not be a big problem. I was just hoping avoiding it as the xfrm_state / xfrm_policy changes modify the API and requires changing af_key also. To answer your first 2 questions: My main dilema with your approach is the policy of maintaining such a list in the kernel and memory consumption needed vs the innacuracy of netlink dumps.With your approach of maintaining extra SA/P D, i would need double the RAM amount. No. I'm not creating second copies of the SADB/SPD entries. The entries are just added to one more list. Memory requirements go up on per entry (figures based on 2.6.22.9 kernel): sizeof(struct xfrm_policy) 636 - 644 bytes sizeof(struct xfrm_state) 452 - 460 bytes For 400K entries it would take about 3 megs of more memory. Where the total database size is around 240-250 megs. Netlink dumping gives up accuracy[1], uses 50% of the memory you are proposing but abuses more cpu cycles. User space could maintain the list you are proposing instead - and compensate for the innaccuracies by watching events[2] Of course that shifts the accuracy to events which could be lost on their way to user space. This issue is alleviated a little more with your approach of keeping the list in the kernel and adding new updates to the tail of the dump list (which, btw, your patch didnt seem to do); however, even if you solve that: ** you are still will be faced with challenges of not being atomic on updates; example an entry already on your dump list could be deleted before being read by user space. I cant see you solving that without abusing a _lot_ more cpu (trying to find the entry on the dump list that may have gotten updated or deleted). Theres also the issue of how long to keep the dump list before aging it out and how to deal with multiple users. If more entries are added, you can get notifications of them. Cheers, Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
jamal wrote: On Wed, 2008-16-01 at 16:28 +0200, Timo Teräs wrote: No. I'm not creating second copies of the SADB/SPD entries. The entries are just added to one more list. Ah, sorry - yes, that sounds reasonable. So what happens if i delete an entry; does it get removed from the list? Also what happens on modification? If the entry is removed befored it is dumped, it wont be dumped at all. The state during dump code execution is returned. Depending when the modification occurs it might or might not be reflected in the dumped entry. If more entries are added, you can get notifications of them. how would a user app (example racoon) appropriately deal with it? Example an entry sits in the dump-list, it gets deleted - an event gets generated user-space and later that entry shows up in user space dump. You listen for the events. It is guaranteed that if the dumping code does return the entry to be deleted, the deletion notification will occur after that dump entry. Herbert Xu wrote: On Wed, Jan 16, 2008 at 08:39:40PM -0500, jamal wrote: I wouldnt disagree except some apps like racoon which depend on pfkey are unfortunately beyond repair. Timo has a pretty good handle on the Racoon doesn't use pfkey dumping as far as I know. ipsec-tools racoon uses pfkey and only pfkey. And it's non trivial to make it use netlink; it relies heavily all around the code to pfkey structs. It also runs on BSD so we cannot rip pfkey away; adding a layer to work with both pfkey and netlink would be doable, but just a lot of work. Also ipsec-tools racoon seems to be the default IKE daemon in some popular distros. So for the time being I think pfkey is an evil we have to live with. Cheers, Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
Herbert Xu wrote: jamal [EMAIL PROTECTED] wrote: There are two issues that are inter-mingled in there. The most important is pf_key not being robust on dump. The other being the accurracy of IMHO doing significant work on af_key is a waste of time. It has no advantages at all over xfrm_user since neither is portable. So we should discourage people from using af_key wherever possible. I don't know about netlink. But pfkey works in *BSD too and it is RFC'd. So I'd say pfkey might be a bit more portable. Though netlink is definitely more robust and extensive. Cheers, Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
David Miller wrote: From: Timo_Teräs [EMAIL PROTECTED] Date: Thu, 17 Jan 2008 08:27:14 +0200 I don't know about netlink. But pfkey works in *BSD too and it is RFC'd. So I'd say pfkey might be a bit more portable. Though netlink is definitely more robust and extensive. The RFCs say absolutely nothing about policy interfaces for AF_KEY, everybody rolls their own in slightly incompatible ways. It is therefore anything but standardized. Yes, there's non-standardized extensions. But the point was that there are other implementations of pfkey. And ipsec-tools racoon is an example of a widely used application that runs in Linux and *BSD using this API. So for the time being I'd consider having pfkey fixes as a good thing. This pfkey dumping problem seems to be affecting many users. - Timo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][PATCH] Fixing SA/SP dumps on netlink/af_key
Hi all, The problem with IPsec SP/SA dumping is that they don't work well with large SPD/SADB:s. In netlink the dumping code is O(n^2) and can miss some entries/dump duplicates if the DB is changed between the recv() calls to read the dump entries. This is due to the entry index counting done in xfrm_user code. With af_key the dump entries are just dropped when the socket receive queue is filled. I tried to address all these problems, and added the xfrm_policy and xfrm_state structure to a new linked list that is used solely for dumping. This way the dumps can be done in O(n) and have an iterator point to them. I also modified to af_key to stop dumping when receive queue is filling up and continue it from a callback at recvfrom() when there's more room. This patch is against 2.6.23 vanilla. I wanted to get some feedback before I make it against the git tree. But I'd like to hear about: - if the approach is ok (adding the entries in one more list)? - if the new/modified variables, structures and function names are ok? - if I should port these against net-2.6 or net-2.6.25 git tree? - if I should split this to more than one commit? (maybe xfrm core, xfrm user and af_key parts?) Cheers, Timo Index: linux-2.6.23/include/net/xfrm.h === --- linux-2.6.23.orig/include/net/xfrm.h2008-01-13 14:13:21.0 +0200 +++ linux-2.6.23/include/net/xfrm.h 2008-01-13 14:13:34.0 +0200 @@ -104,6 +104,7 @@ struct xfrm_state { /* Note: bydst is re-used during gc */ + struct list_headall; struct hlist_node bydst; struct hlist_node bysrc; struct hlist_node byspi; @@ -346,6 +347,7 @@ struct xfrm_policy { struct xfrm_policy *next; + struct list_headbytype; struct hlist_node bydst; struct hlist_node byidx; @@ -912,6 +914,17 @@ int priority; }; +struct xfrm_state_walker { + struct xfrm_state *state; + int count; +}; + +struct xfrm_policy_walker { + struct xfrm_policy *policy; + int count; + int type; +}; + extern void xfrm_init(void); extern void xfrm4_init(void); extern void xfrm6_init(void); @@ -921,7 +934,15 @@ extern void xfrm6_state_init(void); extern void xfrm6_state_fini(void); -extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *); +static inline void xfrm_state_walk_start(struct xfrm_state_walker *walk) +{ + walk-state = NULL; + walk-count = 0; +} + +extern int xfrm_state_walk(struct xfrm_state_walker *walk, u8 proto, + int (*func)(struct xfrm_state *, int, void*), void *); +extern void xfrm_state_walk_end(struct xfrm_state_walker *walk); extern struct xfrm_state *xfrm_state_alloc(void); extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, struct flowi *fl, struct xfrm_tmpl *tmpl, @@ -1025,7 +1046,17 @@ #endif struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); -extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*), void *); + +static inline void xfrm_policy_walk_start(struct xfrm_policy_walker *walk) +{ + walk-policy = NULL; + walk-count = 0; + walk-type = XFRM_POLICY_TYPE_MAIN; +} + +extern int xfrm_policy_walk(struct xfrm_policy_walker *walk, u8 type, + int (*func)(struct xfrm_policy *, int, int, void*), void *); +extern void xfrm_policy_walk_end(struct xfrm_policy_walker *walk); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir, struct xfrm_selector *sel, Index: linux-2.6.23/net/key/af_key.c === --- linux-2.6.23.orig/net/key/af_key.c 2008-01-13 14:13:21.0 +0200 +++ linux-2.6.23/net/key/af_key.c 2008-01-13 14:13:34.0 +0200 @@ -48,6 +48,16 @@ struct sock sk; int registered; int promisc; + + u8 dump_type; + uint8_t dump_msg_version; + uint32_tdump_msg_pid; + int (*dump)(struct pfkey_sock *sk); + void(*dump_done)(struct pfkey_sock *sk); + union { + struct xfrm_policy_walker policy; + struct xfrm_state_walkerstate; + } u; }; static inline struct pfkey_sock *pfkey_sk(struct sock *sk) @@ -55,6 +65,30 @@ return (struct pfkey_sock *)sk; } +static int pfkey_can_dump(struct sock *sk) +{ + if (3 * atomic_read(sk-sk_rmem_alloc) = 2 * sk-sk_rcvbuf) + return 1; + return 0; +} + +static int pfkey_do_dump(struct pfkey_sock *pfk) +{ + int rc; + + rc = pfk-dump(pfk); + if (rc != 0) { +
[PATCH][IPV4] ip_gre: set mac_header correctly in receive path
From: Timo Teras [EMAIL PROTECTED] mac_header update in ipgre_recv() was incorrectly changed to skb_reset_mac_header() when it was introduced. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- This replaces my earlier patch titled ip_gre: use skb-{mac, network}_header consistently. Apparently I hadn't done my homework how to use *_header correctly. And I should have done a bit more testing to figure out the previous patch does not work. But the main problem was the receive path in the first place, and this patch fixes it. The bug was introduced in commit 459a98ed881802dee55897441bc7f77af614368e. There might be other similar incorrect replaces. net/ipv4/ip_gre.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 02b02a8..4b93f32 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -613,7 +613,7 @@ static int ipgre_rcv(struct sk_buff *skb) offset += 4; } - skb_reset_mac_header(skb); + skb-mac_header = skb-network_header; __pskb_pull(skb, offset); skb_reset_network_header(skb); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); -- 1.5.2.5 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][PATCH][IPV4] ip_gre: use skb-{mac,network}_header consistently
From: Timo Teras [EMAIL PROTECTED] Make both send and receive paths use consistently skb-*_header: mac_header = outer IPv4 header network_header = encapsulated packet (e.g. inner IPv4 header) Signed-off-by: Timo Teras [EMAIL PROTECTED] --- Currently in send path: network_header = outer IPv4 header transport_header = encapsulated packet And receive path: network_header = encapsulated packet mac_header = GRE header mac_header used to be the outer IPv4 header on receive path, but it was broken/changed when skb_reset_mac_header() was introduced. This also makes ipgre_header_parse() work correctly as it assumes that mac_header points to the outer IPv4 header always. Though, it returns zeros for outgoing packets that are tcpdumped from gre tunnel without explicit local address, since the routing is done in ipgre_tunnel_xmit(). net/ipv4/ip_gre.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 02b02a8..eba1ade 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -613,7 +613,7 @@ static int ipgre_rcv(struct sk_buff *skb) offset += 4; } - skb_reset_mac_header(skb); + skb-mac_header = skb-network_header; __pskb_pull(skb, offset); skb_reset_network_header(skb); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); @@ -826,9 +826,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) old_iph = ip_hdr(skb); } - skb-transport_header = skb-network_header; skb_push(skb, gre_hlen); - skb_reset_network_header(skb); + skb_reset_mac_header(skb); memset((IPCB(skb)-opt), 0, sizeof(IPCB(skb)-opt)); IPCB(skb)-flags = ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); @@ -839,7 +838,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) * Push down and install the IPIP header. */ - iph = ip_hdr(skb); + iph = (struct iphdr *) skb_mac_header(skb); iph-version= 4; iph-ihl= sizeof(struct iphdr) 2; iph-frag_off = df; @@ -1070,6 +1069,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, struct iphdr *iph = (struct iphdr *)skb_push(skb, t-hlen); __be16 *p = (__be16*)(iph+1); + skb_reset_mac_header(skb); memcpy(iph, t-parms.iph, sizeof(struct iphdr)); p[0]= t-parms.o_flags; p[1]= htons(type); -- 1.5.2.5 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address
When GRE tunnel is in NBMA mode, this patch allows an application to use a PF_PACKET socket to: - send a packet to specific NBMA address with sendto() - use recvfrom() to receive packet and check which NBMA address it came from Signed-off-by: Timo Teras [EMAIL PROTECTED] --- This is useful for implementing NHRP (and other low level protocols) over NBMA GRE tunnels. Posted about a week ago, but got no reply. Could someone comment on this? Or commit it? Thanks. net/ipv4/ip_gre.c | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f151900..b1e3816 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } -#ifdef CONFIG_NET_IPGRE_BROADCAST /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast LAN over the Internet, provided multicast routing is tuned. @@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -t-hlen; } +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); + memcpy(haddr, iph-saddr, 4); + return 4; +} + static const struct header_ops ipgre_header_ops = { .create = ipgre_header, + .parse = ipgre_header_parse, }; +#ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev) dev-stop = ipgre_close; } #endif + } else { + dev-header_ops = ipgre_header_ops; } if (!tdev tunnel-parms.link) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address
When GRE tunnel is in NBMA mode, this patch allows an application to use a PF_PACKET socket to: - send a packet to specific NBMA address with sendto() - use recvfrom() to receive packet and check which NBMA address it came from This is required to implement properly NHRP over GRE tunnel. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- Your mailer mangled tabs, it won't apply like this. Try sending to yourself for testing, then repost when it works properly. Sorry. Tabs were ok, but the initial white space on context lines was somehow removed. Apparently a bug in my editor where I cut/pasted from. net/ipv4/ip_gre.c | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f151900..b1e3816 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } -#ifdef CONFIG_NET_IPGRE_BROADCAST /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast LAN over the Internet, provided multicast routing is tuned. @@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -t-hlen; } +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct iphdr *iph = (struct iphdr*) skb_mac_header(skb); + memcpy(haddr, iph-saddr, 4); + return 4; +} + static const struct header_ops ipgre_header_ops = { .create = ipgre_header, + .parse = ipgre_header_parse, }; +#ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev) dev-stop = ipgre_close; } #endif + } else { + dev-header_ops = ipgre_header_ops; } if (!tdev tunnel-parms.link) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RESEND] ip_gre: sendto/recvfrom NBMA address
When GRE tunnel is in NBMA mode, this patch allows an application to use a PF_PACKET socket to: - send a packet to specific NBMA address with sendto() - use recvfrom() to receive packet and check which NBMA address it came from This is required to implement properly NHRP over GRE tunnel. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- Patrick McHardy wrote: Your mailer mangled tabs, it won't apply like this. Try sending to yourself for testing, then repost when it works properly. Tabs were ok, I think. But somehow it seems that cut'n'paste from my default editor loses the space from context lines. It should be good now. net/ipv4/ip_gre.c | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f151900..b1e3816 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } -#ifdef CONFIG_NET_IPGRE_BROADCAST /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast LAN over the Internet, provided multicast routing is tuned. @@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -t-hlen; } +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct iphdr *iph = (struct iphdr*) skb_mac_header(skb); + memcpy(haddr, iph-saddr, 4); + return 4; +} + static const struct header_ops ipgre_header_ops = { .create = ipgre_header, + .parse = ipgre_header_parse, }; +#ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev) dev-stop = ipgre_close; } #endif + } else { + dev-header_ops = ipgre_header_ops; } if (!tdev tunnel-parms.link) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RESEND] ip_gre: sendto/recvfrom NBMA address
Hi, Alexey Kuznetsov wrote: The only dubious case is when next hop is set using routing tables. But code in ipgre_tunnel_xmit() is ready to accept this situation, it checks for zero destination address and fixes it when it is able to. Nevertheless, it does not work. Another thoughts? Could you explain this a little more? I was able to set a nbma gre tunnel, add routes to it and it worked perfectly ok. Link-level next hop worked: ip route add route via link-level-address dev tunnel-dev onlink Also normal route with static neighbor worked: ip neigh add router-tunnel-ip lladdr link-level-address nud permanent dev tunnel-dev ip route add route via router-tunnel-ip In both cases packets were routed as expected to hosts accessible using the added route. And sendto() sent a packet correctly with destination link-level-address set. Did I miss/misunderstand something? Cheers, Timo - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] ip_gre: sendto/recvfrom NBMA address
When GRE tunnel is in NBMA mode, this patch allows an application to use a PF_PACKET socket to: - send a packet to specific NBMA address with sendto() - use recvfrom() to receive packet and check which NBMA address it came from Signed-off-by: Timo Teras [EMAIL PROTECTED] --- This is useful for implementing NHRP over NBMA GRE tunnels. Sent earlier to netdev list only. Resending to wider audience with a style issue fixed. net/ipv4/ip_gre.c | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f151900..b1e3816 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } -#ifdef CONFIG_NET_IPGRE_BROADCAST /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast LAN over the Internet, provided multicast routing is tuned. @@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -t-hlen; } +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); + memcpy(haddr, iph-saddr, 4); + return 4; +} + static const struct header_ops ipgre_header_ops = { .create = ipgre_header, + .parse = ipgre_header_parse, }; +#ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev) dev-stop = ipgre_close; } #endif + } else { + dev-header_ops = ipgre_header_ops; } if (!tdev tunnel-parms.link) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
ip_gre: sendto/recvfrom NBMA address
When GRE tunnel is in NBMA mode, this patch allows an application to use a PF_PACKET socket to: - send a packet to specific NBMA address with sendto() - use recvfrom() to receive packet and check which NBMA address it came from This is required to implement properly NHRP over GRE tunnel. Signed-off-by: Timo Teras [EMAIL PROTECTED] --- commit 14ebc80596af5e8422b5db2120cf3e0d801ddd3a tree b9fc7b2b8465d506deb5a5f05f22c04f5049cd5f parent 4271e0f7e12bdbbd7ce131187aaae2ba5233a309 author Timo Teras [EMAIL PROTECTED] Mon, 15 Oct 2007 19:49:52 +0300 committer Timo Teras [EMAIL PROTECTED] Mon, 15 Oct 2007 19:49:52 +0300 net/ipv4/ip_gre.c | 12 +++- 1 files changed, 11 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f151900..b1e3816 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1033,7 +1033,6 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) return 0; } -#ifdef CONFIG_NET_IPGRE_BROADCAST /* Nice toy. Unfortunately, useless in real life :-) It allows to construct virtual multiprotocol broadcast LAN over the Internet, provided multicast routing is tuned. @@ -1092,10 +1091,19 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, return -t-hlen; } +static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) +{ + struct iphdr *iph = (struct iphdr*) skb_mac_header(skb); + memcpy(haddr, iph-saddr, 4); + return 4; +} + static const struct header_ops ipgre_header_ops = { .create = ipgre_header, + .parse = ipgre_header_parse, }; +#ifdef CONFIG_NET_IPGRE_BROADCAST static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1197,6 +1205,8 @@ static int ipgre_tunnel_init(struct net_device *dev) dev-stop = ipgre_close; } #endif + } else { + dev-header_ops = ipgre_header_ops; } if (!tdev tunnel-parms.link) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html