[PATCH 3/4] UDP memory accounting and limitation(take 5): memory accounting
This patch introduces memory usage accounting for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-udp_limit/net/ipv4/ip_output.c @@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str /* specify the length of each IP datagram fragment*/ skb_shinfo(skb)-gso_size = mtu - fragheaderlen; skb_shinfo(skb)-gso_type = SKB_GSO_UDP; + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); return 0; @@ -924,6 +926,9 @@ alloc_new_skb: } if (skb == NULL) goto error; + if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1023,6 +1028,8 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; skb-truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, sk-sk_wmem_alloc); + if (sk-sk_prot-memory_allocated) + atomic_inc(sk-sk_prot-memory_allocated); } else { err = -EMSGSIZE; goto error; @@ -1123,7 +1130,9 @@ ssize_t ip_append_page(struct sock *sk, if (unlikely(!skb)) { err = -ENOBUFS; goto error; - } + } else if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1202,13 +1211,14 @@ int ip_push_pending_frames(struct sock * struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; + int err = 0, send_page_size; if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL) goto out; tail_skb = (skb_shinfo(skb)-frag_list); /* move skb-data to ip header from ext header */ + send_page_size = sk_datagram_pages(skb-truesize); if (skb-data skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) { @@ -1218,6 +1228,7 @@ int ip_push_pending_frames(struct sock * skb-len += tmp_skb-len; skb-data_len += tmp_skb-len; skb-truesize += 
tmp_skb-truesize; + send_page_size += sk_datagram_pages(tmp_skb-truesize); __sock_put(tmp_skb-sk); tmp_skb-destructor = NULL; tmp_skb-sk = NULL; @@ -1269,6 +1280,8 @@ int ip_push_pending_frames(struct sock * /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb-dst-dev, dst_output); + if (sk-sk_prot-memory_allocated) + atomic_sub(send_page_size, sk-sk_prot-memory_allocated); if (err) { if (err 0) err = inet-recverr ? net_xmit_errno(err) : 0; @@ -1298,9 +1311,15 @@ void ip_flush_pending_frames(struct sock { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; + int num_flush_mem = 0; - while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) { + num_flush_mem += sk_datagram_pages(skb-truesize); kfree_skb(skb); + } + + if (sk-sk_prot-memory_allocated) + atomic_sub(num_flush_mem, sk-sk_prot-memory_allocated); inet-cork.flags = ~IPCORK_OPT; kfree(inet-cork.opt); Index: 2.6.23-udp_limit/net/ipv4/udp.c === --- 2.6.23-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-udp_limit/net/ipv4/udp.c @@ -885,6 +885,9 @@ try_again: err = ulen; out_free: + atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); + skb_free_datagram(sk, skb); out: return err; @@ -892,6 +895,9 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated
[PATCH 2/4] UDP memory accounting and limitation(take 5): accounting unit and variable
This patch introduces global variable for UDP memory accounting. The unit is page. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-udp_limit/include/net/sock.h === --- 2.6.23-udp_limit.orig/include/net/sock.h +++ 2.6.23-udp_limit/include/net/sock.h @@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul sk_stream_mem_schedule(sk, size, 0); } +#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_datagram_pages(int amt) +{ + return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM); +} + /* Used by processes to lock a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming Index: 2.6.23-udp_limit/include/net/udp.h === --- 2.6.23-udp_limit.orig/include/net/udp.h +++ 2.6.23-udp_limit/include/net/udp.h @@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +extern atomic_t udp_memory_allocated; + struct sk_buff; /* Index: 2.6.23-udp_limit/net/ipv4/proc.c === --- 2.6.23-udp_limit.orig/net/ipv4/proc.c +++ 2.6.23-udp_limit/net/ipv4/proc.c @@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_ fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count), tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated), atomic_read(tcp_memory_allocated)); - seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot)); + seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot), + atomic_read(udp_memory_allocated)); seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot)); seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot)); seq_printf(seq, FRAG: inuse %d memory %d\n, ip_frag_nqueues, Index: 2.6.23-udp_limit/net/ipv4/udp.c === --- 2.6.23-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-udp_limit/net/ipv4/udp.c @@ -113,6 +113,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); +atomic_t udp_memory_allocated; + static int 
udp_port_rover; static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4]UDP memory accounting and limitation(take 5)
Hi, I revised a patch set of UDP memory accounting and limitation. This patch set is for kernel 2.6.23. The differences from take 4 are * removing unnecessary EXPORT_SYMBOLs * adding minimal limit of /proc/sys/net/ipv4/udp_mem * bugfix of UDP limit affecting protocols other than UDP * introducing __ip_check_max_skb_pages() * using CTL_UNNUMBERED * adding udp_mem usage to Documentation/networking/ip_sysctl.txt How to use UDP memory limitation: This patch set adds /proc/sys/net/ipv4/udp_mem as a tuning parameter. When you give a number that is greater than 4096, UDP memory limitation will work. The number of pages for socket buffers is limited to udp_mem [pages]. Currently this function drops the packet when it is sent or received and the number of pages for socket buffers is beyond the limit. It won't reclaim buffers that are already allocated. On the other hand, if udp_mem is specified as 4096, UDP memory limitation will not work. The default value of udp_mem is 4096. Comments, reviews and tests are welcome. Thanks, Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 4/4] UDP memory usage accounting (take 4): memory limitation
Hi Stephen, On Thu, 11 Oct 2007 21:51:14 +0900 Satoshi OSHIMA [EMAIL PROTECTED] wrote: Hi Stephen, Thank you for your comment. { + .ctl_name = NET_UDP_MEM, + .procname = udp_mem, + .data = sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = NET_TCP_APP_WIN, .procname = tcp_app_win, .data = sysctl_tcp_app_win, if you use proc_dointvec_minmax, then you could enforce min/max values for udp_mem for the sysctl One other comment. Sysctl value indexes are deprecated at this point so all new values should use CTL_UNNUMBERED. Therefore unless NET_UDP_MEM already exists, please don't add it. Thank you for letting me know. I will fix it. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] UDP memory accounting and limitation(take 5): fix send buffer check
This patch introduces sndbuf size check before memory allocation for send buffer. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -1004,6 +1004,11 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; } } else if (i MAX_SKB_FRAGS) { + if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE +2 * sk-sk_sndbuf) { + err = -ENOBUFS; + goto error; + } if (copy PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk-sk_allocation, 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] UDP memory accounting and limitation(take 5): memory limitation
This patch introduces memory limitation for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-udp_limit/include/net/udp.h === --- 2.6.23-udp_limit.orig/include/net/udp.h +++ 2.6.23-udp_limit/include/net/udp.h @@ -65,7 +65,10 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +/* Used by memory accounting and capping */ +#define UDP_MIN_SKB_PAGES 4096 extern atomic_t udp_memory_allocated; +extern int sysctl_udp_mem; struct sk_buff; Index: 2.6.23-udp_limit/net/ipv4/udp.c === --- 2.6.23-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-udp_limit/net/ipv4/udp.c @@ -114,6 +114,7 @@ struct hlist_head udp_hash[UDP_HTABLE_SI DEFINE_RWLOCK(udp_hash_lock); atomic_t udp_memory_allocated; +int sysctl_udp_mem = UDP_MIN_SKB_PAGES; static int udp_port_rover; @@ -1016,6 +1017,16 @@ int udp_queue_rcv_skb(struct sock * sk, goto drop; } + if (sk-sk_prot-sysctl_mem[0] UDP_MIN_SKB_PAGES) { + if ((atomic_read(sk-sk_prot-memory_allocated) + + sk_datagram_pages(skb-truesize)) + = sk-sk_prot-sysctl_mem[0]) { + UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, + up-pcflag); + goto drop; + } + } + if ((rc = sock_queue_rcv_skb(sk,skb)) 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) @@ -1451,6 +1462,7 @@ struct proto udp_prot = { .unhash= udp_lib_unhash, .get_port = udp_v4_get_port, .memory_allocated = udp_memory_allocated, + .sysctl_mem= sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, Index: 2.6.23-udp_limit/net/ipv4/sysctl_net_ipv4.c === --- 2.6.23-udp_limit.orig/net/ipv4/sysctl_net_ipv4.c +++ 2.6.23-udp_limit/net/ipv4/sysctl_net_ipv4.c @@ -17,6 +17,7 @@ #include net/ip.h #include net/route.h #include net/tcp.h +#include net/udp.h #include net/cipso_ipv4.h /* From af_inet.c */ @@ -25,6 +26,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL static int zero; static int tcp_retr1_max = 255; +static int udp_mem_min = UDP_MIN_SKB_PAGES; 
static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; #endif @@ -599,6 +601,16 @@ ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .ctl_name = CTL_UNNUMBERED, + .procname = udp_mem, + .data = sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = udp_mem_min + }, + { .ctl_name = NET_TCP_APP_WIN, .procname = tcp_app_win, .data = sysctl_tcp_app_win, Index: 2.6.23-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-udp_limit/net/ipv4/ip_output.c @@ -75,6 +75,7 @@ #include net/icmp.h #include net/checksum.h #include net/inetpeer.h +#include net/udp.h #include linux/igmp.h #include linux/netfilter_ipv4.h #include linux/netfilter_bridge.h @@ -699,6 +700,21 @@ csum_page(struct page *page, int offset, return csum; } +static inline int __ip_check_max_skb_pages(struct sock *sk, int size) +{ + switch(sk-sk_protocol) { + case IPPROTO_UDP: + if (sk-sk_prot-sysctl_mem[0] UDP_MIN_SKB_PAGES) + if (atomic_read(sk-sk_prot-memory_allocated)+size + = sk-sk_prot-sysctl_mem[0]) + return -ENOBUFS; + /* Fall through */ + default: + break; + } + return 0; +} + static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), @@ -910,6 +926,12 @@ alloc_new_skb: if (datalen == length + fraggap) alloclen += rt-u.dst.trailer_len; + err = __ip_check_max_skb_pages(sk, + sk_datagram_pages(SKB_DATA_ALIGN(alloclen + hh_len + 15) + + sizeof(struct sk_buff))); + if (err) + goto error
Re: [RFC/PATCH 4/4] UDP memory usage accounting (take 4): memory limitation
Hi Stephen, Thank you for your comment. { +.ctl_name = NET_UDP_MEM, +.procname = udp_mem, +.data = sysctl_udp_mem, +.maxlen = sizeof(sysctl_udp_mem), +.mode = 0644, +.proc_handler = proc_dointvec +}, +{ .ctl_name = NET_TCP_APP_WIN, .procname = tcp_app_win, .data = sysctl_tcp_app_win, if you use proc_dointvec_minmax, then you could enforce min/max values for udp_mem for the sysctl udp_mem has two meanings: * turn off this limitation function (currently udp_mem<=4096) * limit udp memory (currently udp_mem>4096) To realize this, udp_mem is evaluated in the UDP and IP layers to see whether it is 4096 or smaller. Even if udp_mem uses proc_dointvec_minmax or a dedicated proc handler, the turn-off check must still be done in the UDP and IP layers. This means there is no reduction of the check in the UDP and IP layers. If you are pointing out that a negative value of udp_mem is strange, I agree. I'll fix it. How about this? min=4096 (and turn off limitation) udp_mem>4096 (and turn on limitation) Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 2/4] UDP memory usage accounting (take 4): accounting unit and variable
Hi Evgeniy, Thank you for your comment. Hi. On Sat, Oct 06, 2007 at 12:01:07AM +0900, Satoshi OSHIMA ([EMAIL PROTECTED]) wrote: --- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c @@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); +atomic_t udp_memory_allocated; + +EXPORT_SYMBOL(udp_memory_allocated); + Why do you export this variable? It is not accessed from modules in your patchset. Good point! I'll fix it. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 1/4] UDP memory usage accounting (take 4): fix send buffer check
This patch introduces sndbuf size check before memory allocation for send buffer. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -1004,6 +1004,11 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; } } else if (i MAX_SKB_FRAGS) { + if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE +2 * sk-sk_sndbuf) { + err = -ENOBUFS; + goto error; + } if (copy PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk-sk_allocation, 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 2/4] UDP memory usage accounting (take 4): accounting unit and variable
This patch introduces global variable for UDP memory accounting. The unit is page. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc3-udp_limit/include/net/sock.h === --- 2.6.23-rc3-udp_limit.orig/include/net/sock.h +++ 2.6.23-rc3-udp_limit/include/net/sock.h @@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul sk_stream_mem_schedule(sk, size, 0); } +#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_datagram_pages(int amt) +{ + return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM); +} + /* Used by processes to lock a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming Index: 2.6.23-rc3-udp_limit/include/net/udp.h === --- 2.6.23-rc3-udp_limit.orig/include/net/udp.h +++ 2.6.23-rc3-udp_limit/include/net/udp.h @@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +extern atomic_t udp_memory_allocated; + struct sk_buff; /* Index: 2.6.23-rc3-udp_limit/net/ipv4/proc.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/proc.c +++ 2.6.23-rc3-udp_limit/net/ipv4/proc.c @@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_ fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count), tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated), atomic_read(tcp_memory_allocated)); - seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot)); + seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot), + atomic_read(udp_memory_allocated)); seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot)); seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot)); seq_printf(seq, FRAG: inuse %d memory %d\n, ip_frag_nqueues, Index: 2.6.23-rc3-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c @@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); 
+atomic_t udp_memory_allocated; + +EXPORT_SYMBOL(udp_memory_allocated); + static int udp_port_rover; static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 3/4] UDP memory usage accounting (take 4): memory usage accounting
This patch introduces memory usage accounting for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c @@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str /* specify the length of each IP datagram fragment*/ skb_shinfo(skb)-gso_size = mtu - fragheaderlen; skb_shinfo(skb)-gso_type = SKB_GSO_UDP; + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); return 0; @@ -924,6 +926,9 @@ alloc_new_skb: } if (skb == NULL) goto error; + if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1023,6 +1028,8 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; skb-truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, sk-sk_wmem_alloc); + if (sk-sk_prot-memory_allocated) + atomic_inc(sk-sk_prot-memory_allocated); } else { err = -EMSGSIZE; goto error; @@ -1123,7 +1130,9 @@ ssize_t ip_append_page(struct sock *sk, if (unlikely(!skb)) { err = -ENOBUFS; goto error; - } + } else if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1202,13 +1211,14 @@ int ip_push_pending_frames(struct sock * struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; + int err = 0, send_page_size; if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL) goto out; tail_skb = (skb_shinfo(skb)-frag_list); /* move skb-data to ip header from ext header */ + send_page_size = sk_datagram_pages(skb-truesize); if (skb-data skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) { @@ -1218,6 +1228,7 @@ int ip_push_pending_frames(struct sock * skb-len += tmp_skb-len; skb-data_len += tmp_skb-len; 
skb-truesize += tmp_skb-truesize; + send_page_size += sk_datagram_pages(tmp_skb-truesize); __sock_put(tmp_skb-sk); tmp_skb-destructor = NULL; tmp_skb-sk = NULL; @@ -1269,6 +1280,8 @@ int ip_push_pending_frames(struct sock * /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb-dst-dev, dst_output); + if (sk-sk_prot-memory_allocated) + atomic_sub(send_page_size, sk-sk_prot-memory_allocated); if (err) { if (err 0) err = inet-recverr ? net_xmit_errno(err) : 0; @@ -1298,9 +1311,15 @@ void ip_flush_pending_frames(struct sock { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; + int num_flush_mem = 0; - while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) { + num_flush_mem += sk_datagram_pages(skb-truesize); kfree_skb(skb); + } + + if (sk-sk_prot-memory_allocated) + atomic_sub(num_flush_mem, sk-sk_prot-memory_allocated); inet-cork.flags = ~IPCORK_OPT; kfree(inet-cork.opt); Index: 2.6.23-rc8-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc8-udp_limit/net/ipv4/udp.c @@ -887,6 +887,9 @@ try_again: err = ulen; out_free: + atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); + skb_free_datagram(sk, skb); out: return err; @@ -894,6 +897,9 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot
[RFC/PATCH 4/4] UDP memory usage accounting (take 4): memory limitation
This patch introduces memory limitation for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc9-udp_limit/include/net/udp.h === --- 2.6.23-rc9-udp_limit.orig/include/net/udp.h +++ 2.6.23-rc9-udp_limit/include/net/udp.h @@ -65,7 +65,10 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +/* Used by memory accounting and capping */ +#define UDP_MIN_SKB_PAGES 4096 extern atomic_t udp_memory_allocated; +extern int sysctl_udp_mem; struct sk_buff; Index: 2.6.23-rc9-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc9-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc9-udp_limit/net/ipv4/udp.c @@ -114,8 +114,10 @@ struct hlist_head udp_hash[UDP_HTABLE_SI DEFINE_RWLOCK(udp_hash_lock); atomic_t udp_memory_allocated; +int sysctl_udp_mem = 0; EXPORT_SYMBOL(udp_memory_allocated); +EXPORT_SYMBOL(sysctl_udp_mem); static int udp_port_rover; @@ -1018,6 +1020,16 @@ int udp_queue_rcv_skb(struct sock * sk, goto drop; } + if (sk-sk_prot-sysctl_mem[0] UDP_MIN_SKB_PAGES) { + if ((atomic_read(sk-sk_prot-memory_allocated) + + sk_datagram_pages(skb-truesize)) + = sk-sk_prot-sysctl_mem[0]) { + UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, + up-pcflag); + goto drop; + } + } + if ((rc = sock_queue_rcv_skb(sk,skb)) 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) @@ -1453,6 +1465,7 @@ struct proto udp_prot = { .unhash= udp_lib_unhash, .get_port = udp_v4_get_port, .memory_allocated = udp_memory_allocated, + .sysctl_mem= sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, Index: 2.6.23-rc9-udp_limit/net/ipv4/sysctl_net_ipv4.c === --- 2.6.23-rc9-udp_limit.orig/net/ipv4/sysctl_net_ipv4.c +++ 2.6.23-rc9-udp_limit/net/ipv4/sysctl_net_ipv4.c @@ -17,6 +17,7 @@ #include net/ip.h #include net/route.h #include net/tcp.h +#include net/udp.h #include net/cipso_ipv4.h /* From af_inet.c */ @@ -599,6 +600,14 @@ ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, 
{ + .ctl_name = NET_UDP_MEM, + .procname = udp_mem, + .data = sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = NET_TCP_APP_WIN, .procname = tcp_app_win, .data = sysctl_tcp_app_win, Index: 2.6.23-rc9-udp_limit/include/linux/sysctl.h === --- 2.6.23-rc9-udp_limit.orig/include/linux/sysctl.h +++ 2.6.23-rc9-udp_limit/include/linux/sysctl.h @@ -441,6 +441,7 @@ enum NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_UDP_MEM=126, }; enum { Index: 2.6.23-rc9-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc9-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc9-udp_limit/net/ipv4/ip_output.c @@ -75,6 +75,7 @@ #include net/icmp.h #include net/checksum.h #include net/inetpeer.h +#include net/udp.h #include linux/igmp.h #include linux/netfilter_ipv4.h #include linux/netfilter_bridge.h @@ -910,6 +911,17 @@ alloc_new_skb: if (datalen == length + fraggap) alloclen += rt-u.dst.trailer_len; + if (sk-sk_prot-sysctl_mem) + if (sk-sk_prot-sysctl_mem[0] UDP_MIN_SKB_PAGES) + if ((atomic_read(sk-sk_prot-memory_allocated) + + sk_datagram_pages( + SKB_DATA_ALIGN(alloclen + hh_len + 15) ++ sizeof(struct sk_buff))) += sk-sk_prot-sysctl_mem[0]) { + err = -ENOBUFS; + goto error; + } + if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -1009,6 +1021,15 @@ alloc_new_skb
[RFC/PATCH 0/4]UDP memory accounting (take 4)
Hi, I am posting a patch set of UDP memory accounting and limitation. This patch set is for kernel 2.6.23-rc9. The differences from take 3 are * fixing double accounting bug of ip_append_page() * adding UDP memory limitation. How to use UDP memory limitation: This patch set adds /proc/sys/net/ipv4/udp_mem as a tuning parameter. When you give a number that is greater than 4096, UDP memory limitation will work. The number of pages for socket buffers is limited to udp_mem [pages]. Currently this function drops the packet when it is sent or received and the number of pages for socket buffers is beyond the limit. It won't reclaim buffers that are already allocated. On the other hand, if udp_mem is specified as 4096 or smaller, UDP memory limitation will not work. The default number is 0. Comments, reviews and tests are welcome. By the way, David pointed out that we should have a better solution such as memory reclaiming by callback from vmm. I am seriously considering it. But as Herbert pointed out, it is very difficult to apply it to TCP because some of the buffer is already acked. I couldn't find a good solution that is applicable for TCP, UDP, route cache and so on. Let me know if you find a good way to solve this problem. Thanks, Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 0/3] UDP memory usage accounting (take 3)
This patch set tries to introduce memory usage accounting for UDP (currently ipv4 only). This is the second post of the take 2 patch, because the previous post was broken by my MUA setting. The only thing I changed is my MUA setting. There is no code change from take 2. This patch set is for 2.6.23-rc8. I appreciate your comment/test/feedback. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 1/3] UDP memory usage accounting (take 3): fix send buffer check
This patch introduces sndbuf size check before memory allocation for send buffer. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -1004,6 +1004,11 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; } } else if (i MAX_SKB_FRAGS) { + if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE +2 * sk-sk_sndbuf) { + err = -ENOBUFS; + goto error; + } if (copy PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk-sk_allocation, 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 2/3] UDP memory usage accounting (take 3): accounting unit and variable
This patch introduces global variable for UDP memory accounting. The unit is page. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc3-udp_limit/include/net/sock.h === --- 2.6.23-rc3-udp_limit.orig/include/net/sock.h +++ 2.6.23-rc3-udp_limit/include/net/sock.h @@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul sk_stream_mem_schedule(sk, size, 0); } +#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_datagram_pages(int amt) +{ + return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM); +} + /* Used by processes to lock a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming Index: 2.6.23-rc3-udp_limit/include/net/udp.h === --- 2.6.23-rc3-udp_limit.orig/include/net/udp.h +++ 2.6.23-rc3-udp_limit/include/net/udp.h @@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +extern atomic_t udp_memory_allocated; + struct sk_buff; /* Index: 2.6.23-rc3-udp_limit/net/ipv4/proc.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/proc.c +++ 2.6.23-rc3-udp_limit/net/ipv4/proc.c @@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_ fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count), tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated), atomic_read(tcp_memory_allocated)); - seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot)); + seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot), + atomic_read(udp_memory_allocated)); seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot)); seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot)); seq_printf(seq, FRAG: inuse %d memory %d\n, ip_frag_nqueues, Index: 2.6.23-rc3-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c @@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); 
+atomic_t udp_memory_allocated; + +EXPORT_SYMBOL(udp_memory_allocated); + static int udp_port_rover; static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 3/3] UDP memory usage accounting (take 3): measurement
This patch introduces memory usage measurement for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c @@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str /* specify the length of each IP datagram fragment*/ skb_shinfo(skb)-gso_size = mtu - fragheaderlen; skb_shinfo(skb)-gso_type = SKB_GSO_UDP; + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); return 0; @@ -924,6 +926,9 @@ alloc_new_skb: } if (skb == NULL) goto error; + if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1023,6 +1028,8 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; skb-truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, sk-sk_wmem_alloc); + if (sk-sk_prot-memory_allocated) + atomic_inc(sk-sk_prot-memory_allocated); } else { err = -EMSGSIZE; goto error; @@ -1123,7 +1130,9 @@ ssize_t ip_append_page(struct sock *sk, if (unlikely(!skb)) { err = -ENOBUFS; goto error; - } + } else if (sk-sk_prot-memory_allocated) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1152,6 +1161,8 @@ ssize_t ip_append_page(struct sock *sk, /* * Put the packet on the pending queue. 
*/ + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); continue; } @@ -1202,13 +1213,14 @@ int ip_push_pending_frames(struct sock * struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; + int err = 0, send_page_size; if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL) goto out; tail_skb = (skb_shinfo(skb)-frag_list); /* move skb-data to ip header from ext header */ + send_page_size = sk_datagram_pages(skb-truesize); if (skb-data skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) { @@ -1218,6 +1230,7 @@ int ip_push_pending_frames(struct sock * skb-len += tmp_skb-len; skb-data_len += tmp_skb-len; skb-truesize += tmp_skb-truesize; + send_page_size += sk_datagram_pages(tmp_skb-truesize); __sock_put(tmp_skb-sk); tmp_skb-destructor = NULL; tmp_skb-sk = NULL; @@ -1269,6 +1282,8 @@ int ip_push_pending_frames(struct sock * /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb-dst-dev, dst_output); + if (sk-sk_prot-memory_allocated) + atomic_sub(send_page_size, sk-sk_prot-memory_allocated); if (err) { if (err 0) err = inet-recverr ? net_xmit_errno(err) : 0; @@ -1298,9 +1313,15 @@ void ip_flush_pending_frames(struct sock { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; + int num_flush_mem = 0; - while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) { + num_flush_mem += sk_datagram_pages(skb-truesize); kfree_skb(skb); + } + + if (sk-sk_prot-memory_allocated) + atomic_sub(num_flush_mem, sk-sk_prot-memory_allocated); inet-cork.flags = ~IPCORK_OPT; kfree(inet-cork.opt); Index: 2.6.23-rc8-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc8-udp_limit
Re: [RFC/PATCH 3/3] UDP memory usage accounting (take 2): measurement
Evgeniy Polyakov wrote: On Fri, Sep 28, 2007 at 10:41:31PM +0900, Satoshi OSHIMA ([EMAIL PROTECTED]) wrote: This patch introduces memory usage measurement for UDP. These 3 points were updated. - UDP specific codes in IP layer were removed. - atomic_sub() in a loop was removed - accounting during socket destruction Another approach is to account only at the highest UDP layer and having datagram skb destructor just like it is done in TCP, but this approach is also reasonable. This patch set tries to introduce memory accounting by the page because TCP does so. And ip_append_data() merges payloads to a sk_buff if previous sk_buff has enough space. The problem is that udp_append_data() doesn't recognize whether this merge happens or not. If the accounting must be in UDP layer, we need to change the interface of ip_append_data() to know this merge happens. Once the interface is changed, we have to maintain other protocol stacks to keep up with the change. But I didn't want to do it to keep this patch set small in the first step. I already told that patches 1 and 3 have broken indent, please fix that. Oops! I will fix that. A hint: when you are about to submit something network related for inclusion, and strongly believe it is ready, it can be a not that bad idea to add David Miller [EMAIL PROTECTED] to copy list, he can complain about backlog and so on, but will read your mail twice :) but do not tell anyone. Thank you for your advice. I will do that! Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 0/3] UDP memory usage accounting
Herbert Xu wrote: On Fri, Sep 28, 2007 at 09:51:59PM -0700, David Miller wrote: There is a per-socket send buffer limit, and there is a per-user open file descriptor limit. Multiply the two to determine how much system memory the user can consume using sockets. We do have these limits but they're per-process, not per-user. Unless you lock down the number of processes each user can have to no more than a handful then this is basically useless. For example, let's say each socket can lock down 64K of kernel memory (which is quite easy to do BTW, just open a TCP/UDP socket, send data to it from another socket but keep the data in the socket by not calling recvmsg), and that each process can have 1024 file descriptors (the default), then each process can pin 64K x 1024 = 64M of memory. So if the user can have 10 processes, then that's 640M of kernel memory that can be pinned down. Usually the process limit is at least 10 times higher. Thank you very much for your comment. What you pointed out is my motivation to make this patch. I think that per-process limits won't help to solve this problem. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 0/3] UDP memory usage accounting
On Fri, Sep 28, 2007 at 09:47:37PM -0700, David Miller wrote: There are two things we (might) need to guard against, one local and one remote. Right I was focusing on the local threat. If you do a per-user limit, apache would basically just stop at that redzone point. In some sense making the attack more effective because then it's trivial to shut down an entire web server this way. Having a per-user limit doesn't necessarily mean that we have to apply the limit differently to how we apply the system-wide limits. We could keep exactly the same code as we have now but check against a per-user limit instead of a system-wide one. In other words your apache scenario will continue to work as is even with a per-user limit. I'm afraid that per-user limit won't work for system administrator, because he can't know who is the rogue user in advance (before such attack is made). And once the attack is made, the system will not respond because of the lack of memory for slab. So if he only has per-user limit, he needs to split the memory budget for UDP to each user. The limit per user will be very small if number of users in the system is large. Now where it does become useful is when we have a rogue local user. As it is that user can chew up all of the budgeted TCP memory by simply not calling recvmsg. As I've stated in the other email, the existing rlimits don't help because they're per-process rather than per-user. BTW, this is not fatal for TCP because TCP provides a minimum amount of memory for each socket even when we are over the limit. However, if this was implemented for UDP without a minimum guarantee then it'd be quite useless. Hmm, I didn't realize that. Thank you for your good suggestion. I will think of it. I see no valid argument against doing something similar for sockets. 
Such a register_shrinker() handler for TCP could, for example, look for TCP flows which haven't made forward progress in more than a certain amount of time and attempt to trim SKB memory from them. Yes I agree this would be quite useful for sending. However, it'll be tough to shrink skbs that we've already acked for but the app for some reason has decided to leave in the socket by not calling recvmsg. UDP and other datagram sockets are troublesome because the memory gets wholly tied up immediately during the send call and it's not easy to liberate anything. The nice part about datagram sockets, however, is that they make forward progress quickly and their memory is liberated as soon as the device transmits the packet. They don't have to wait for ACKs, windows opening up, or anything like that to happen. Agreed. Also the recvmsg case I've described above is much simpler for UDP as we can just go through all the sockets and free skbs at random :) To be honest I don't even think UDP is much of a real problem for this reason. It's not a hard problem but we do need to have some code for it. I believe so. Currently, a nasty user can easily stop the system without root privilege. This may not be a serious problem, but this is the problem to be fixed. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 3/3] UDP memory usage accounting (take 2): measurement
This patch introduces memory usage measurement for UDP. These 3 points were updated. - UDP specific codes in IP layer were removed. - atomic_sub() in a loop was removed - accounting during socket destruction signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc8-udp_limit/net/ipv4/ip_output.c @@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str /* specify the length of each IP datagram fragment*/ skb_shinfo(skb)-gso_size = mtu - fragheaderlen; skb_shinfo(skb)-gso_type = SKB_GSO_UDP; +atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); return 0; @@ -924,6 +926,9 @@ alloc_new_skb: } if (skb == NULL) goto error; +if (sk-sk_prot-memory_allocated) +atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* *Fill in the control structures @@ -1023,6 +1028,8 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; skb-truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, sk-sk_wmem_alloc); +if (sk-sk_prot-memory_allocated) +atomic_inc(sk-sk_prot-memory_allocated); } else { err = -EMSGSIZE; goto error; @@ -1123,7 +1130,9 @@ ssize_tip_append_page(struct sock *sk, if (unlikely(!skb)) { err = -ENOBUFS; goto error; -} +} else if (sk-sk_prot-memory_allocated) +atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* *Fill in the control structures @@ -1152,6 +1161,8 @@ ssize_tip_append_page(struct sock *sk, /* * Put the packet on the pending queue. 
*/ +atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); continue; } @@ -1202,13 +1213,14 @@ int ip_push_pending_frames(struct sock * struct iphdr *iph; __be16 df = 0; __u8 ttl; -int err = 0; +int err = 0, send_page_size; if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL) goto out; tail_skb = (skb_shinfo(skb)-frag_list); /* move skb-data to ip header from ext header */ +send_page_size = sk_datagram_pages(skb-truesize); if (skb-data skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) { @@ -1218,6 +1230,7 @@ int ip_push_pending_frames(struct sock * skb-len += tmp_skb-len; skb-data_len += tmp_skb-len; skb-truesize += tmp_skb-truesize; +send_page_size += sk_datagram_pages(tmp_skb-truesize); __sock_put(tmp_skb-sk); tmp_skb-destructor = NULL; tmp_skb-sk = NULL; @@ -1269,6 +1282,8 @@ int ip_push_pending_frames(struct sock * /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb-dst-dev, dst_output); +if (sk-sk_prot-memory_allocated) +atomic_sub(send_page_size, sk-sk_prot-memory_allocated); if (err) { if (err 0) err = inet-recverr ? 
net_xmit_errno(err) : 0; @@ -1298,9 +1313,15 @@ void ip_flush_pending_frames(struct sock { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; +int num_flush_mem = 0; -while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) +while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) { +num_flush_mem += sk_datagram_pages(skb-truesize); kfree_skb(skb); +} + +if (sk-sk_prot-memory_allocated) +atomic_sub(num_flush_mem, sk-sk_prot-memory_allocated); inet-cork.flags = ~IPCORK_OPT; kfree(inet-cork.opt); Index: 2.6.23-rc8-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc8-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc8-udp_limit/net/ipv4/udp.c @@ -887,6 +887,9 @@ try_again: err = ulen; out_free: +atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); + skb_free_datagram(sk, skb); out: return err; @@ -894,6 +897,9 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); +atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot
[RFC/PATCH 2/3] UDP memory usage accounting: accounting unit and variable
This patch introduces global variable for UDP memory accounting. The unit is page. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc3-udp_limit/include/net/sock.h === --- 2.6.23-rc3-udp_limit.orig/include/net/sock.h +++ 2.6.23-rc3-udp_limit/include/net/sock.h @@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul sk_stream_mem_schedule(sk, size, 0); } +#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_datagram_pages(int amt) +{ + return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM); +} + /* Used by processes to lock a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming Index: 2.6.23-rc3-udp_limit/include/net/udp.h === --- 2.6.23-rc3-udp_limit.orig/include/net/udp.h +++ 2.6.23-rc3-udp_limit/include/net/udp.h @@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +extern atomic_t udp_memory_allocated; + struct sk_buff; /* Index: 2.6.23-rc3-udp_limit/net/ipv4/proc.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/proc.c +++ 2.6.23-rc3-udp_limit/net/ipv4/proc.c @@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_ fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count), tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated), atomic_read(tcp_memory_allocated)); - seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot)); + seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot), + atomic_read(udp_memory_allocated)); seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot)); seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot)); seq_printf(seq, FRAG: inuse %d memory %d\n, ip_frag_nqueues, Index: 2.6.23-rc3-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c @@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); 
+atomic_t udp_memory_allocated; + +EXPORT_SYMBOL(udp_memory_allocated); + static int udp_port_rover; static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 0/3] UDP memory usage accounting
Hi, Thank you for your comment. Evgeniy Polyakov wrote: Hi. On Fri, Sep 21, 2007 at 09:18:07PM +0900, Satoshi OSHIMA ([EMAIL PROTECTED]) wrote: This patch set try to introduce memory usage accounting for UDP(currently ipv4 only). Currently, memory usage of UDP can be observed as the sum of usage of tx_queue and rx_queue. But I believe that the system wide accounting is useful when heavy loaded condition. In the next step, I would like to add memory usage quota for UDP to avoid unlimited memory consumption problem under DDOS attack. Could you please describe such attack in more details? Each UDP socket has its queue length which can not be exceeded (roughly), no new sockets are created when remote side sends a packet (like after special steps in TCP), so where is possibility to eat all the mem? For example, sk_buff is put on the slab and slab can be acquired only from ZONE_NORMAL in i386. In such case, from 300 to 500MB memory consumption will be fatal. Users can easily open 1000 sockets per process under default ulimit. If such sockets hold messages but user processes don't receive them, almost all slab will be occupied by sk_buff. This patch set is for 2.6.23-rc7. I seriously doubt you want to put udp specific hacks and zillions of atomic ops all around the code just to know exact number of bytes eaten for UDP. Please use udp specific code (like udp_sendmsg()) for proper accounting if you need that, but not hacks in generic ip code. I couldn't find the way to account UDP memory consumption in UDP layer. In receive path, accounting can be done in UDP layer because sk_buff is marked for UDP in UDP layer and it is released in UDP layer. In send path, sk_buff is acquired in IP layer and also released in IP layer. Especially, there is a possibility of appending data to the preceding sk_buff in send queue. On the other hand, I agree that UDP specific code in IP layer is not preferable. So I generalize UDP specific code in IP layer in take 2. 
Could you take a look at my take 2 patch set? Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 1/3] UDP memory usage accounting (take 2): fix send buffer check
This patch introduces sndbuf size check before memory allocation for send buffer. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -1004,6 +1004,11 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; } } else if (i MAX_SKB_FRAGS) { +if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE + 2 * sk-sk_sndbuf) { +err = -ENOBUFS; +goto error; +} if (copy PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk-sk_allocation, 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC/PATCH 2/3] UDP memory usage accounting: accounting unit and variable
Hi, Thank you for your comment. Andi Kleen wrote: Satoshi OSHIMA [EMAIL PROTECTED] writes: This patch introduces global variable for UDP memory accounting. The unit is page. The global variable doesn't seem to be very MP scalable, especially if you change it for each packet. This will be a very hot cache line, in the worst case bouncing around a large machine. I understand what you pointed out. But I think the accounting method I'm proposing is very similar to TCP accounting and per socket accounting. What do you think of it? Possible alternatives: - Per CPU variables I'm afraid that sockets and socket buffers are handled on various CPUs. I mean that socket creation might be done on CPU-A but socket receiving might be done on CPU-B. And per CPU variables must be counted up when socket cap is checked. I'm afraid that per CPU variables are also costly enough. - You only change the global on socket creation time (by pre allocating a large amount) or when the system comes under memory pressure. - Batching of the global updates for multiple packets [that's a variant of the previous one, might be still too costly though] Also for such variables it's usually good to cache line pad them on SMP to avoid false sharing with something else. I believe that memory usage accounting should be done accurately. Currently I couldn't see how can we know the accurate memory accounting only when the system is under memory pressure. But I revised the patch to avoid some atomic operations. If I could find the good way to avoid atomic operation more, I will add it. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 0/3] UDP memory usage accounting(take 2)
This patch set tries to introduce memory usage accounting for UDP (currently ipv4 only). 3 points are improved along with some feedback. (a) to improve scalability, avoiding atomic_*()s as much as possible (b) avoiding UDP specific code in IP layer (c) supporting socket destruction accounting To implement (b), there is a side effect which affects accounting on TCP socket. If you find the good solution to avoid this side effect, please let me know. Unfortunately, I don't have any NIC with UFO. So this patch set is not tested with UFO supported device. This patch set is for 2.6.23-rc8. I appreciate your comment/test/feedback. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 0/3] UDP memory usage accounting
This patch set tries to introduce memory usage accounting for UDP (currently ipv4 only). Currently, memory usage of UDP can be observed as the sum of usage of tx_queue and rx_queue. But I believe that the system wide accounting is useful under heavily loaded conditions. In the next step, I would like to add memory usage quota for UDP to avoid unlimited memory consumption problem under DDOS attack. This patch set is for 2.6.23-rc7. Unfortunately, I don't have any NIC with UFO. So this patch set is not tested with UFO supported device. I appreciate your comment/test/feedback. Satoshi Oshima - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 3/3] UDP memory usage accounting: measurement
This patch introduces memory usage measurement for UDP. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -743,6 +743,8 @@ static inline int ip_ufo_append_data(str /* specify the length of each IP datagram fragment*/ skb_shinfo(skb)-gso_size = mtu - fragheaderlen; skb_shinfo(skb)-gso_type = SKB_GSO_UDP; + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); return 0; @@ -924,6 +926,9 @@ alloc_new_skb: } if (skb == NULL) goto error; + if (sk-sk_protocol == IPPROTO_UDP) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1023,6 +1028,8 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; skb-truesize += PAGE_SIZE; atomic_add(PAGE_SIZE, sk-sk_wmem_alloc); + if (sk-sk_protocol == IPPROTO_UDP) + atomic_inc(sk-sk_prot-memory_allocated); } else { err = -EMSGSIZE; goto error; @@ -1123,7 +1130,9 @@ ssize_t ip_append_page(struct sock *sk, if (unlikely(!skb)) { err = -ENOBUFS; goto error; - } + } else if (sk-sk_protocol == IPPROTO_UDP) + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); /* * Fill in the control structures @@ -1152,6 +1161,8 @@ ssize_t ip_append_page(struct sock *sk, /* * Put the packet on the pending queue. 
*/ + atomic_add(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); __skb_queue_tail(sk-sk_write_queue, skb); continue; } @@ -1202,13 +1213,14 @@ int ip_push_pending_frames(struct sock * struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; + int err = 0, send_page_size; if ((skb = __skb_dequeue(sk-sk_write_queue)) == NULL) goto out; tail_skb = (skb_shinfo(skb)-frag_list); /* move skb-data to ip header from ext header */ + send_page_size = sk_datagram_pages(skb-truesize); if (skb-data skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(sk-sk_write_queue)) != NULL) { @@ -1218,6 +1230,7 @@ int ip_push_pending_frames(struct sock * skb-len += tmp_skb-len; skb-data_len += tmp_skb-len; skb-truesize += tmp_skb-truesize; + send_page_size += sk_datagram_pages(tmp_skb-truesize); __sock_put(tmp_skb-sk); tmp_skb-destructor = NULL; tmp_skb-sk = NULL; @@ -1269,6 +1282,8 @@ int ip_push_pending_frames(struct sock * /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb-dst-dev, dst_output); + if (sk-sk_protocol == IPPROTO_UDP) + atomic_sub(send_page_size, sk-sk_prot-memory_allocated); if (err) { if (err 0) err = inet-recverr ? net_xmit_errno(err) : 0; @@ -1299,8 +1314,12 @@ void ip_flush_pending_frames(struct sock struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(sk-sk_write_queue)) != NULL) { + if (sk-sk_protocol == IPPROTO_UDP) + atomic_sub(sk_datagram_pages(skb-truesize), + sk-sk_prot-memory_allocated); kfree_skb(skb); + } inet-cork.flags = ~IPCORK_OPT; kfree(inet-cork.opt); Index: 2.6.23-rc7-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc7-udp_limit/net/ipv4/udp.c
[RFC/PATCH 1/3] UDP memory usage accounting: fix send buffer check
This patch introduces sndbuf size check before memory allocation for send buffer. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c === --- 2.6.23-rc7-udp_limit.orig/net/ipv4/ip_output.c +++ 2.6.23-rc7-udp_limit/net/ipv4/ip_output.c @@ -1004,6 +1004,11 @@ alloc_new_skb: frag = skb_shinfo(skb)-frags[i]; } } else if (i MAX_SKB_FRAGS) { + if (atomic_read(sk-sk_wmem_alloc) + PAGE_SIZE +2 * sk-sk_sndbuf) { + err = -ENOBUFS; + goto error; + } if (copy PAGE_SIZE) copy = PAGE_SIZE; page = alloc_pages(sk-sk_allocation, 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC/PATCH 2/3] UDP memory usage accounting: accounting unit and variable
This patch introduces global variable for UDP memory accounting. The unit is page. signed-off-by: Satoshi Oshima [EMAIL PROTECTED] signed-off-by: Hideo Aoki [EMAIL PROTECTED] Index: 2.6.23-rc3-udp_limit/include/net/sock.h === --- 2.6.23-rc3-udp_limit.orig/include/net/sock.h +++ 2.6.23-rc3-udp_limit/include/net/sock.h @@ -723,6 +723,13 @@ static inline int sk_stream_wmem_schedul sk_stream_mem_schedule(sk, size, 0); } +#define SK_DATAGRAM_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline int sk_datagram_pages(int amt) +{ + return DIV_ROUND_UP(amt, SK_DATAGRAM_MEM_QUANTUM); +} + /* Used by processes to lock a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming Index: 2.6.23-rc3-udp_limit/include/net/udp.h === --- 2.6.23-rc3-udp_limit.orig/include/net/udp.h +++ 2.6.23-rc3-udp_limit/include/net/udp.h @@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock; extern struct proto udp_prot; +extern atomic_t udp_memory_allocated; + struct sk_buff; /* Index: 2.6.23-rc3-udp_limit/net/ipv4/proc.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/proc.c +++ 2.6.23-rc3-udp_limit/net/ipv4/proc.c @@ -66,7 +66,8 @@ static int sockstat_seq_show(struct seq_ fold_prot_inuse(tcp_prot), atomic_read(tcp_orphan_count), tcp_death_row.tw_count, atomic_read(tcp_sockets_allocated), atomic_read(tcp_memory_allocated)); - seq_printf(seq, UDP: inuse %d\n, fold_prot_inuse(udp_prot)); + seq_printf(seq, UDP: inuse %d mem %d\n, fold_prot_inuse(udp_prot), + atomic_read(udp_memory_allocated)); seq_printf(seq, UDPLITE: inuse %d\n, fold_prot_inuse(udplite_prot)); seq_printf(seq, RAW: inuse %d\n, fold_prot_inuse(raw_prot)); seq_printf(seq, FRAG: inuse %d memory %d\n, ip_frag_nqueues, Index: 2.6.23-rc3-udp_limit/net/ipv4/udp.c === --- 2.6.23-rc3-udp_limit.orig/net/ipv4/udp.c +++ 2.6.23-rc3-udp_limit/net/ipv4/udp.c @@ -113,6 +113,10 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); 
+atomic_t udp_memory_allocated; + +EXPORT_SYMBOL(udp_memory_allocated); + static int udp_port_rover; static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html