Hi Dave!

Unfortunately we've again discovered a bug within netfilter.  This time
it's about the handling of ICMP error messages that are created on the
firewall itself.

I'll describe the two problems, so you can understand why there
are changes to the ipv4 networking code.

1) ICMP errors for unconfirmed connections

normal case:
- Let's say a TCP SYN packet arrives at the PRE_ROUTING hook.
- conntrack at PRE_ROUTING creates conntrack entry, not placed in hashes
- packet is forwarded by ip_forward.c code
- packet arrives at POST_ROUTING, conntrack entry is placed in hashes
  (called 'confirm' step)

broken case:
- TCP SYN arrives at PRE_ROUTING
- conntrack at PRE_ROUTING creates conntrack entry, not placed in hashes
- packet is dropped by ip_forward.c code because TTL <= 1
- icmp packet is sent through LOCAL_OUT hook
- icmp packet can not be associated RELATED with TCP SYN, since conntrack
  entry is not in conntrack hash.

So for ICMP error messages, we need to manually associate it to a particular
conntrack entry.  This needs to be done in the icmp send code.

(There are possible other solutions which do not need to touch the core ipv4
 code, but add significant complexity and overhead)

If the ICMP packet is not associated (like with the current code), the
problem is getting worse when NAT happens:  We leak un-nat'ed addresses
in the inner IP packet (there was a recent bug report about this on
linux-kernel).

2) NAT of inner IP packet (encapsulated in ICMP error)

In certain cases, the current code does not correctly reverse-NAT the
inner IP packet in gateway-generated ICMP error messges - even if the
connection is fully established. 


The two bugs are solved in the following patch (by James Morris).  Please tell
me if this solution is not possible because of the core ipv4 changes, and we
will implement one of the sub-optimal other possible solutions.

Patch for 2.5.x will follow in the next mail.


diff -urN linux-2.4.19-pre6.orig/include/linux/skbuff.h 
linux-2.4.19-pre6-nf-01/include/linux/skbuff.h
--- linux-2.4.19-pre6.orig/include/linux/skbuff.h       Sun Apr  7 15:27:29 2002
+++ linux-2.4.19-pre6-nf-01/include/linux/skbuff.h      Fri Apr 12 00:52:31 2002
@@ -1144,6 +1144,17 @@
        if (nfct)
                atomic_inc(&nfct->master->use);
 }
+static inline struct nf_ct_info *
+skb_nf_ct(struct sk_buff *skb)
+{
+       return skb->nfct;
+}
+#else
+static inline struct nf_ct_info *
+skb_nf_ct(struct sk_buff *skb)
+{
+       return NULL;
+}
 #endif
 
 #endif /* __KERNEL__ */
diff -urN linux-2.4.19-pre6.orig/include/net/ip.h 
linux-2.4.19-pre6-nf-01/include/net/ip.h
--- linux-2.4.19-pre6.orig/include/net/ip.h     Sat Apr 28 22:01:26 2001
+++ linux-2.4.19-pre6-nf-01/include/net/ip.h    Fri Apr 12 00:52:31 2002
@@ -66,6 +66,7 @@
 
 extern struct ip_ra_chain *ip_ra_chain;
 extern rwlock_t ip_ra_lock;
+struct nf_ct_info;
 
 /* IP flags. */
 #define IP_CE          0x8000          /* Flag: "Congestion"           */
@@ -106,7 +107,8 @@
                                      unsigned length,
                                      struct ipcm_cookie *ipc,
                                      struct rtable *rt,
-                                     int flags);
+                                     int flags,
+                                     struct nf_ct_info *nfct);
 
 /*
  *     Map a multicast IP onto multicast MAC for type Token Ring.
diff -urN linux-2.4.19-pre6.orig/net/ipv4/icmp.c 
linux-2.4.19-pre6-nf-01/net/ipv4/icmp.c
--- linux-2.4.19-pre6.orig/net/ipv4/icmp.c      Sun Apr  7 15:27:29 2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/icmp.c     Fri Apr 12 00:52:31 2002
@@ -370,7 +370,7 @@
                               icmp_param->data.icmph.code)) { 
                ip_build_xmit(sk, icmp_glue_bits, icmp_param, 
                              icmp_param->data_len+icmp_param->head_len,
-                             &ipc, rt, MSG_DONTWAIT);
+                             &ipc, rt, MSG_DONTWAIT, NULL);
        }
        ip_rt_put(rt);
 out:
@@ -528,7 +529,7 @@
 
        ip_build_xmit(icmp_socket->sk, icmp_glue_bits, &icmp_param, 
                icmp_param.data_len+sizeof(struct icmphdr),
-               &ipc, rt, MSG_DONTWAIT);
+               &ipc, rt, MSG_DONTWAIT, skb_nf_ct(skb_in));
 
 ende:
        ip_rt_put(rt);
diff -urN linux-2.4.19-pre6.orig/net/ipv4/ip_output.c 
linux-2.4.19-pre6-nf-01/net/ipv4/ip_output.c
--- linux-2.4.19-pre6.orig/net/ipv4/ip_output.c Sun Apr  7 15:27:29 2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/ip_output.c        Fri Apr 12 00:52:31 2002
@@ -405,6 +405,22 @@
        return -EHOSTUNREACH;
 }
 
+#ifdef CONFIG_NETFILTER
+/* If the original packet is part of a connection, but the connection
+   is not confirmed, our manufactured reply will not be associated
+   with it, so we need to do this manually. */
+static void nfct_attach(struct sk_buff *new_skb, struct nf_ct_info *nfct)
+{
+       void (*attach)(struct sk_buff *, struct nf_ct_info *);
+
+       /* Avoid module unload race with ip_ct_attach being NULLed out */
+       if (nfct && (attach = ip_ct_attach) != NULL)
+               attach(new_skb, nfct);
+}
+#else
+static void nfct_attach(struct sk_buff *new_skb, struct nf_ct_info *nfct) { }
+#endif
+
 /*
  *     Build and send a packet, with as little as one copy
  *
@@ -434,7 +450,8 @@
                  unsigned length,
                  struct ipcm_cookie *ipc,
                  struct rtable *rt,
-                 int flags)
+                 int flags,
+                 struct nf_ct_info *nfct)
 {
        unsigned int fraglen, maxfraglen, fragheaderlen;
        int err;
@@ -599,6 +616,7 @@
 
                nfrags++;
 
+               nfct_attach(skb, nfct);
                err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
                              skb->dst->dev, output_maybe_reroute);
                if (err) {
@@ -633,7 +651,8 @@
                  unsigned length,
                  struct ipcm_cookie *ipc,
                  struct rtable *rt,
-                 int flags)
+                 int flags,
+                 struct nf_ct_info *nfct)
 {
        int err;
        struct sk_buff *skb;
@@ -652,7 +671,7 @@
                 *      Check for slow path.
                 */
                if (length > rt->u.dst.pmtu || ipc->opt != NULL)  
-                       return 
ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); 
+                       return 
+ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags,nfct); 
        } else {
                if (length > rt->u.dst.dev->mtu) {
                        ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, 
rt->u.dst.dev->mtu);
@@ -710,6 +729,7 @@
        if (err)
                goto error_fault;
 
+       nfct_attach(skb, nfct);
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                      output_maybe_reroute);
        if (err > 0)
@@ -977,7 +997,8 @@
        sk->protinfo.af_inet.tos = skb->nh.iph->tos;
        sk->priority = skb->priority;
        sk->protocol = skb->nh.iph->protocol;
-       ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
+       ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT,
+                     NULL);
        bh_unlock_sock(sk);
 
        ip_rt_put(rt);
diff -urN linux-2.4.19-pre6.orig/net/ipv4/netfilter/ip_nat_core.c 
linux-2.4.19-pre6-nf-01/net/ipv4/netfilter/ip_nat_core.c
--- linux-2.4.19-pre6.orig/net/ipv4/netfilter/ip_nat_core.c     Sun Apr  7 15:27:29 
2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/netfilter/ip_nat_core.c    Fri Apr 12 00:52:31 
+2002
@@ -780,6 +780,18 @@
        } else return NF_ACCEPT;
 }
 
+/*
+ * Decide whether to map inner header of an ICMP reply, including when
+ * we generate the reply ourselves.
+ */
+static inline int
+map_innards(unsigned int maniphook, unsigned int hooknum)
+{
+       return (maniphook == opposite_hook[hooknum]
+               || (hooknum == NF_IP_LOCAL_OUT
+                    && HOOK2MANIP(maniphook) == IP_NAT_MANIP_SRC));
+}
+
 unsigned int
 icmp_reply_translation(struct sk_buff *skb,
                       struct ip_conntrack *conntrack,
@@ -837,7 +849,7 @@
                   packet, except it was never src/dst reversed, so
                   where we would normally apply a dst manip, we apply
                   a src, and vice versa. */
-               if (info->manips[i].hooknum == opposite_hook[hooknum]) {
+               if (map_innards(info->manips[i].hooknum, hooknum)) {
                        DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
                               info->manips[i].maniptype == IP_NAT_MANIP_SRC
                               ? "DST" : "SRC",
diff -urN linux-2.4.19-pre6.orig/net/ipv4/netfilter/ipt_REJECT.c 
linux-2.4.19-pre6-nf-01/net/ipv4/netfilter/ipt_REJECT.c
--- linux-2.4.19-pre6.orig/net/ipv4/netfilter/ipt_REJECT.c      Sun Apr  7 15:27:29 
2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/netfilter/ipt_REJECT.c     Fri Apr 12 00:52:31 
+2002
@@ -32,7 +32,8 @@
                attach(new_skb, nfct);
 }
 
-/* Send RST reply */
+/* Send RST reply: we want to use the dest as the RST src ip, so can't
+   use normal RST routine. --RR */
 static void send_reset(struct sk_buff *oldskb, int local)
 {
        struct sk_buff *nskb;
@@ -153,6 +154,7 @@
        kfree_skb(nskb);
 }
 
+#if 0
 static void send_unreach(struct sk_buff *skb_in, int code)
 {
        struct iphdr *iph;
@@ -270,6 +272,12 @@
        NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
                ip_finish_output);
 }      
+#else
+static void send_unreach(struct sk_buff *skb_in, int code)
+{
+       icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+#endif
 
 static unsigned int reject(struct sk_buff **pskb,
                           unsigned int hooknum,
diff -urN linux-2.4.19-pre6.orig/net/ipv4/raw.c linux-2.4.19-pre6-nf-01/net/ipv4/raw.c
--- linux-2.4.19-pre6.orig/net/ipv4/raw.c       Sun Apr  7 15:27:29 2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/raw.c      Fri Apr 12 00:54:14 2002
@@ -427,7 +427,8 @@
        if (!ipc.addr)
                ipc.addr = rt->rt_dst;
        err = ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag :
-                           raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags);
+                           raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags,
+                           NULL);
 
 done:
        if (free)
diff -urN linux-2.4.19-pre6.orig/net/ipv4/udp.c linux-2.4.19-pre6-nf-01/net/ipv4/udp.c
--- linux-2.4.19-pre6.orig/net/ipv4/udp.c       Sun Apr  7 15:27:29 2002
+++ linux-2.4.19-pre6-nf-01/net/ipv4/udp.c      Fri Apr 12 00:52:32 2002
@@ -548,7 +548,7 @@
                            (sk->no_check == UDP_CSUM_NOXMIT ?
                             udp_getfrag_nosum :
                             udp_getfrag),
-                           &ufh, ulen, &ipc, rt, msg->msg_flags);
+                           &ufh, ulen, &ipc, rt, msg->msg_flags, NULL);
 
 out:
        ip_rt_put(rt);
-- 
Live long and prosper
- Harald Welte / [EMAIL PROTECTED]               http://www.gnumonks.org/
============================================================================
GCS/E/IT d- s-: a-- C+++ UL++++$ P+++ L++++$ E--- W- N++ o? K- w--- O- M+ 
V-- PS++ PE-- Y++ PGP++ t+ 5-- !X !R tv-- b+++ !DI !D G+ e* h--- r++ y+(*)

Reply via email to