[PATCH 4/6] [IPV6]: Reorganize strut ipv6_mc_socklist to save 8 bytes
/home/acme/git/net-2.6/net/dccp/ipv6.c: struct ipv6_mc_socklist | -8 1 struct changed Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/net/if_inet6.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b0b882e..b2509d4 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -82,9 +82,9 @@ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; + unsigned intsfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist *next; rwlock_tsflock; - unsigned intsfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip6_sf_socklist *sflist; }; -- 1.5.3.8 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] [INET6]: Reorganize struct inet6_dev to save 8 bytes
And make it a multiple of a 64 bytes, reducing cacheline trashing: Before: [EMAIL PROTECTED] net-2.6]$ pahole -C inet6_dev net/dccp/ipv6.o struct inet6_dev { SNIP long unsigned int mc_maxdelay; /*48 8 */ unsigned char mc_qrv; /*56 1 */ unsigned char mc_gq_running;/*57 1 */ unsigned char mc_ifc_count; /*58 1 */ /* XXX 5 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct timer_list mc_gq_timer; /*6448 */ SNIP __u32 if_flags; /* 180 4 */ intdead; /* 184 4 */ u8 rndid[8]; /* 188 8 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) was 8 bytes ago --- */ struct timer_list regen_timer; /* 20048 */ SNIP /* size: 456, cachelines: 8 */ /* sum members: 447, holes: 2, sum holes: 9 */ /* last cacheline: 8 bytes */ }; After: [EMAIL PROTECTED] net-2.6]$ codiff net/dccp/ipv6.o.old net/dccp/ipv6.o /home/acme/git/net-2.6/net/dccp/ipv6.c: struct inet6_dev | -8 1 struct changed Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/net/if_inet6.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b24508a..b0b882e 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -171,6 +171,7 @@ struct inet6_dev unsigned char mc_qrv; unsigned char mc_gq_running; unsigned char mc_ifc_count; + int dead; struct timer_list mc_gq_timer;/* general query timer */ struct timer_list mc_ifc_timer; /* interface change timer */ @@ -178,7 +179,6 @@ struct inet6_dev rwlock_tlock; atomic_trefcnt; __u32 if_flags; - int dead; #ifdef CONFIG_IPV6_PRIVACY u8 rndid[8]; -- 1.5.3.8 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] [TCP]: Reorganize struct tcp_sock to save 16 bytes on 64-bit arch
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c: struct tcp_sock | -16 struct tcp6_sock | -16 2 structs changed Now it is at: /* size: 1552, cachelines: 25 */ /* paddings: 2, sum paddings: 8 */ /* last cacheline: 16 bytes */ As soon as we stop using skb_queue_list we'll get it down to 24 cachelines. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/tcp.h |6 -- 1 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 08027f1..f48644d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -304,7 +304,6 @@ struct tcp_sock { u32 rtt_seq;/* sequence number to update rttvar */ u32 packets_out;/* Packets which are in flight*/ - u32 retrans_out;/* Retransmitted packets out*/ /* * Options received (usually on last packet, some only on SYN packets). */ @@ -332,6 +331,8 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; + u32 retrans_out;/* Retransmitted packets out*/ + struct sk_buff *highest_sack; /* highest skb with SACK received * (validity guaranteed only if * sacked_out 0) @@ -372,7 +373,6 @@ struct tcp_sock { unsigned intkeepalive_time; /* time before keep alive takes place */ unsigned intkeepalive_intvl; /* time interval between keep alive probes */ - int linger2; unsigned long last_synq_overflow; @@ -398,6 +398,8 @@ struct tcp_sock { u32 probe_seq_end; } mtu_probe; + int linger2; + #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ struct tcp_sock_af_ops *af_specific; -- 1.5.3.8 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] [SOCK] proto: Add hashinfo member to struct proto
This way we can remove TCP and DCCP specific versions of sk-sk_prot-get_port: both v4 and v6 use inet_csk_get_port sk-sk_prot-hash: inet_hash is directly used, only v6 need a specific version to deal with mapped sockets sk-sk_prot-unhash: both v4 and v6 use inet_hash directly struct inet_connection_sock_af_ops also gets a new member, bind_conflict, so that inet_csk_get_port can find the per family routine. Now only the lookup routines receive as a parameter a struct inet_hashtable. With this we further reuse code, reducing the difference among INET transport protocols. Eventually work has to be done on UDP and SCTP to make them share this infrastructure and get as a bonus inet_diag interfaces so that iproute can be used with these protocols. vmlinux: add/remove: 2/4 grow/shrink: 13/6 up/down: 539/-508 (31) function old new delta inet_hash - 237+237 inet_unhash- 190+190 inet_csk_get_port558 573 +15 tcp_sendmsg 27682777 +9 unix_proto 344 352 +8 udplite_prot 344 352 +8 udp_prot 344 352 +8 tcp_prot 344 352 +8 raw_prot 344 352 +8 packet_proto 344 352 +8 netlink_proto344 352 +8 ipv4_specific 96 104 +8 inet_put_port137 145 +8 __inet_hash_nolisten 310 318 +8 __inet6_hash 352 360 +8 tcp_v4_syn_recv_sock 666 664 -2 superblock_doinit587 582 -5 tcp_v4_destroy_sock 557 550 -7 tcp_set_state420 413 -7 __inet_hash_connect 673 665 -8 __inet_lookup_listener 333 313 -20 tcp_v4_get_port 31 - -31 tcp_v4_hash 48 - -48 tcp_unhash 179 --179 __inet_hash 201 --201 ipv6.ko: add/remove: 0/1 grow/shrink: 6/2 up/down: 48/-42 (6) function old new delta udpv6_prot 344 352 +8 udplitev6_prot 344 352 +8 tcpv6_prot 344 352 +8 rawv6_prot 344 352 +8 ipv6_specific 96 104 +8 ipv6_mapped 96 104 +8 tcp_v6_syn_recv_sock16331629 -4 tcp_v6_hash 70 63 -7 tcp_v6_get_port 31 - -31 dccp.ko: add/remove: 0/2 grow/shrink: 1/4 up/down: 2/-243 (-241) function old new delta dccp_feat_change_recv 10501052 +2 init_module 961 960 -1 dccp_init961 960 -1 dccp_set_state 474 467 -7 dccp_destroy_sock191 184 -7 dccp_hash 48 - -48 dccp_unhash 179 --179 dccp_ipv4.ko: add/remove: 0/1 grow/shrink: 2/1 up/down: 16/-33 (-17) function old new delta dccp_v4_prot 344 352 +8 dccp_ipv4_af_ops 96 104 +8 dccp_v4_request_recv_sock522 520 -2 dccp_v4_get_port 31 - -31 dccp_ipv6.ko: add/remove: 0/1 grow/shrink: 4/1 up/down: 29/-38 (-9) function old new delta dccp_v6_prot 344 352 +8 dccp_ipv6_mapped 96 104 +8 dccp_ipv6_af_ops 96 104 +8 dccp_v6_request_recv_sock 15141519 +5 dccp_v6_hash 68 61 -7 dccp_v6_get_port 31 - -31 Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/net/inet6_hashtables.h |2 +- include/net/inet_connection_sock.h |8 ++--- include/net/inet_hashtables.h | 51
[PATCH 3/6] [DCCP]: Reorganize struct dccp_sock to save 8 bytes
/home/acme/git/net-2.6/net/dccp/ipv6.c: struct dccp_sock | -8 struct dccp6_sock | -8 2 structs changed Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/dccp.h |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 484e45c..aa07370 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -525,6 +525,7 @@ struct dccp_sock { __u64 dccps_gsr; __u64 dccps_gar; __be32 dccps_service; + __u32 dccps_mss_cache; struct dccp_service_list*dccps_service_list; __u32 dccps_timestamp_echo; __u32 dccps_timestamp_time; @@ -533,7 +534,6 @@ struct dccp_sock { __u16 dccps_pcslen; __u16 dccps_pcrlen; unsigned long dccps_ndp_count; - __u32 dccps_mss_cache; unsigned long dccps_rate_last; struct dccp_minisockdccps_minisock; struct dccp_ackvec *dccps_hc_rx_ackvec; -- 1.5.3.8 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/6]: Move hashinfo to sk_prot and struct reorgs
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6 Best Regards, - Arnaldo include/linux/dccp.h |2 - include/linux/tcp.h|6 ++- include/net/if_inet6.h |4 +- include/net/inet6_hashtables.h |2 - include/net/inet_connection_sock.h |8 + include/net/inet_hashtables.h | 51 +--- include/net/inet_timewait_sock.h |3 - include/net/sock.h |3 + net/dccp/dccp.h|2 - net/dccp/ipv4.c| 18 --- net/dccp/ipv6.c| 20 +--- net/dccp/proto.c | 18 +-- net/ipv4/inet_connection_sock.c|8 + net/ipv4/inet_hashtables.c | 58 ++--- net/ipv4/tcp.c |2 - net/ipv4/tcp_ipv4.c| 31 +-- net/ipv6/inet6_hashtables.c|4 +- net/ipv6/tcp_ipv6.c| 19 +--- 18 files changed, 111 insertions(+), 148 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] [INET_TIMEWAIT_SOCK]: Reorganize struct inet_timewait_sock to save some bytes
/home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c: struct inet_timewait_sock | -8 struct tcp_timewait_sock | -8 2 structs changed tcp_v6_rcv| -6 1 function changed, 6 bytes removed, diff: -6 Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/net/inet_timewait_sock.h |3 +-- 1 files changed, 1 insertions(+), 2 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67e9250..faead52 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -116,8 +116,8 @@ struct inet_timewait_sock { #define tw_hash__tw_common.skc_hash #define tw_prot__tw_common.skc_prot #define tw_net __tw_common.skc_net + int tw_timeout; volatile unsigned char tw_substate; - /* 3 bits hole, try to pack */ unsigned char tw_rcv_wscale; /* Socket demultiplex comparisons on incoming packets. */ /* these five are in inet_sock */ @@ -130,7 +130,6 @@ struct inet_timewait_sock { __u8tw_ipv6only:1; /* 15 bits hole, try to pack */ __u16 tw_ipv6_offset; - int tw_timeout; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; -- 1.5.3.8 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/6] [TCP]: Reorganize struct tcp_sock to save 16 bytes on 64-bit arch
Em Thu, Jan 31, 2008 at 08:57:53PM +0100, Eric Dumazet escreveu: Arnaldo Carvalho de Melo a écrit : /home/acme/git/net-2.6/net/ipv6/tcp_ipv6.c: struct tcp_sock | -16 struct tcp6_sock | -16 2 structs changed Now it is at: /* size: 1552, cachelines: 25 */ /* paddings: 2, sum paddings: 8 */ /* last cacheline: 16 bytes */ As soon as we stop using skb_queue_list we'll get it down to 24 cachelines. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/tcp.h |6 -- 1 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 08027f1..f48644d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -304,7 +304,6 @@ struct tcp_sock { u32 rtt_seq;/* sequence number to update rttvar */ u32 packets_out;/* Packets which are in flight*/ -u32 retrans_out;/* Retransmitted packets out*/ /* * Options received (usually on last packet, some only on SYN packets). */ @@ -332,6 +331,8 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; + u32 retrans_out;/* Retransmitted packets out*/ + Hum... retrans_out should sit close to packets_out (or lost_out/sacked_out ???), please. 'struct tcp_sock' is very large on 64 bits, so I would prefer to make sure most paths dont need to touch all 24 cache lines (or 25 cache lines). That is perfectly fine, I'll replace my patch with another, that states this beyond doubt. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: FW: ccid2/ccid3 oopses
Em Wed, Jan 09, 2008 at 12:28:27PM +, Gerrit Renker escreveu: Roland, - apparently, i got crashes when loading/unloading other driver modules just after ccid2 or ccid3 had been loaded/unloaded _once_ (have not used them at all, just modprobe module;modprobe -r module) snip the easiest way to reproduce is: while true;do modprobe dccp_ccid2/3;modprobe -r dccp_ccid2/3;done after short time, the kernel oopses (messages below) i`m not sure if this is worth to be filed at kernel bugzilla, so i`m contacting you personally first. The issue is known: once loaded, the DCCP modules can not be unloaded without causing a crash as the one you have observed. This is due to the fact that dccp_ipv{4,6} use control sockets which need to be released before the module can be unloaded. When the control sockets are not released then crashes will always result. In earlier versions of DCCP there was a kernel option known as unload hack, which conditionally inserted sock_release(dccp_v{4,6}_ctl_socket); in dccp_v{4,6}_exit() However, as the name says, it is a hack since there are other issues to be considered: * sockets in timewait state * other wait states (e.g. half-open connections) * memory which has not been released * module dependencies With regard to the latter, I am normally using the Unload Hack and release modules in the following order: dccp_probe = dccp_ccid2 = dccp_ccid3 = dccp_tfrc_lib = dccp_ipv6 = dccp_ipv4 = dccp_diag = dccp Long story short * the CCID/DCCP modules can currently not safely be unloaded * maybe we should disable module unloading for the mainline kernel * if anyone is interested to use the unload hack, here is the old patch http://www.erg.abdn.ac.uk/users/gerrit/dccp/testing_dccp/Unload_Hack.diff Gerrit, the control socket isn't attached to any CCID module, so the CCID modules should be safe to remove, and IIRC they were safe to unload. The unload hack was for something else, for the core DCCP modules. We can't unload because there are refcounts held by the control sock, so the unload hack would just destroy the control sock and thus the module refcount would reach zero and it could then be unloaded. I've been consistently being sidetracked with work (huh :-)) and couldn't look at this issue, but the CCID modules should be safe to unload. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: dccp send
Em Fri, Jan 04, 2008 at 11:25:29AM +1300, Ian McDonald escreveu: On Jan 4, 2008 11:19 AM, Tomasz Grobelny [EMAIL PROTECTED] wrote: I think I almost got it. Thanks a lot for the detailed explanation. But I've got two more questions: 1. How can I control the amount of memory allowed to be allocated for the socket send buffer? It it somehow connected with tx_qlen or is it not? I wrote the code here so let me comment - there is no fixed amount of memory allocated - it is determined by the tx_qlen. I was looking at there is no memory allocated in advance nor explicit, DCCP specific mechanism for limiting the amount of memory a socket can use for its write queue, but DCCP uses sock_alloc_send_skb and that function does memory accounting and checks if the maximum memory (SO_SNDBUF) has been reached, when it makes the app to sleep if O_NONBLOCK is not set. implementing memory limits but it got too hard and as DCCP doesn't combine packets I thought it made it much simpler just to put a limit by packets. 2. If I decide that for whatever reason a given skb already in sk_write_queue should not be transmitted I should remove it from queue and call kfree_skb on it, it that right? Correct. Remember of course all your locking mechanisms. If you want a good example look at the below towards the end: http://wand.net.nz/~iam4/dccp/patches20/30-best_packet_next.diff (NB this patch applies against 2.6.20 after previous patches in series). Correct if he is in-kernel. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: dccp send
Em Wed, Jan 02, 2008 at 11:34:25PM +0100, Tomasz Grobelny escreveu: Dnia Wednesday 02 of January 2008, Arnaldo Carvalho de Melo napisał: Em Wed, Jan 02, 2008 at 01:41:16AM +0100, Tomasz Grobelny escreveu: When I use dccp does sendmsg function block (until it sends the packet)? If so, should it? In either case, how to make it just queue the packet and return? The interface is the same as for other AF_INET transports, use O_NONBLOCK (open, fcntl) if you want it to be non blocking. But what are the semantics? In TCP if sendmsg blocks the application has no choice but to wait. The time for which sendmsg blocks it tightly connected In DCCP its exactly the same semantic, sendmsg may block waiting for send buffer space. If you don't want that use O_NONBLOCK and it will return right away with EAGAIN. with speed at which packets can be transmitted over the network. Is it the case for dccp as well? If so, does it mean that provided stable network conditions over longer period of time and using blocking version of sendmsg in a loop very few packets should be lost? (By very few I mean 10%) If the application sends many packets in rapid succession before the DCCP core gets permission from the CCID in use to send those packets to the network what will happen is that sk_write_queue will have packets waiting, using the memory allowed to be allocated for the socket send buffer, which will make sock_alloc_send_skb wait for the packets in sk_write_queue to be sent, freeing up send buffer space, when it will be possible for sock_alloc_send_skb to actually allocate memory for the packet and return to dccp_sendmsg, that in turn will return to the application after putting the newly allocated packet in sk_write_queue, doing the send buffer accounting and trying to send something in dccp_write_xmit, where it will ask the CCID if it can send some more packets, got it? Look at these sequences: sock_alloc_send_skb - skb_set_owner_w This will set the skb destructor to be sock_wfree Now look at what happens when the NIC driver finally frees the sk_buff (the network packet): kfree_skb - __kfree_skb - skb_release_all - skb-destructor (sock_wfree) sock_wfree, in turn, will call sk-sk_write_space, that for DCCP is dccp_write_space (see inet_create - sk-sk_prot-init (dccp_v4_init_sock) - dccp_init_sock), that will wake up any tasks that are sleeping in sk_sleep, that is... the process doing the dccp_sendmsg - sock_alloc_send_skb. All this behaviour can be controlled thru O_NONBLOCK _and_ setsockopt(SO_SNDTIMEO), just like TCP. It queues it in the write routine and tries to send it right away, but doesn't waits for actually sending the packet, i.e. it only checks if there is write space available, if you set O_NONBLOCK and there is no space it returns ENOBUFS, if O_NONBLOCK is not set it will sleep waiting for write space to be made available, when the process will be awaken. dccp_sendmsg seems to block on call to sock_alloc_send_skb. Which at first look does not seems to send anything anywhere. So why does it block at all? Can't it just allocate the data needed and return? See above. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: panic on 2.6.24rc5
Em Sun, Dec 30, 2007 at 04:18:36PM +0100, Tomasz Grobelny escreveu: On Friday 28 December 2007, I wrote: Dnia Wednesday 26 of December 2007, napisałeś: What are the panics you are getting? It might be worth posting them to the list. Here is the screenshot I captured a few days ago. Details: - kernel-vanilla 2.6.24rc5, Now I'm using kernel as described in Arnaldo's mail (davem/net-2.6.25 + patches 0001 to 0051). dccp_hdlr_ack_ratio is not on net-2.6.25, which means it is in one of the 0001 to 0051 patches from Gerrit. So, to help us understand where is the problem you could try building a kernel without applying any of the 0001 to 0051 patches. Could you do this at and report the results? I'm also assuming you are using CCID2 either by explicitely using feature negotiation setsockopt calls or by using the default, that is CCID2. If this is the case it would also be interesting to, before rebuilding the kernel, to try using CCID3 as the problem you're experiencing when using netem is exactly in the interface between the core DCCP code and the CCID being used. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: panic on 2.6.24rc5
Em Tue, Jan 01, 2008 at 10:30:56PM +0100, Tomasz Grobelny escreveu: Dnia Tuesday 01 of January 2008, Arnaldo Carvalho de Melo napisał: Em Sun, Dec 30, 2007 at 04:18:36PM +0100, Tomasz Grobelny escreveu: On Friday 28 December 2007, I wrote: Dnia Wednesday 26 of December 2007, napisałeś: What are the panics you are getting? It might be worth posting them to the list. Here is the screenshot I captured a few days ago. Details: - kernel-vanilla 2.6.24rc5, Now I'm using kernel as described in Arnaldo's mail (davem/net-2.6.25 + patches 0001 to 0051). dccp_hdlr_ack_ratio is not on net-2.6.25, which means it is in one of the 0001 to 0051 patches from Gerrit. So, to help us understand where is the problem you could try building a kernel without applying any of the 0001 to 0051 patches. Could you do this at and report the results? But what should I exactly test? Just whether the delays are gone or something more? I'll try to when I have some time (hopefully during weekend). If the kernel oopses, if the results are the same or are some problem introduced in the patches by Gerrit. I.e. you would help us to narrow down the problem by trying a binary search of changeset history built kernels. Please take a look at Documentation/BUG-HUNTING in the kernel sources. The process is somehow time consuming and its understandable if you can't perform it, your reports are already of great help, but if you can try helping us to narrow down exactly when some bugs you notice appeared, or if they were always present after some kernel builds, we'd be really grateful :-) I'm also assuming you are using CCID2 either by explicitely using feature negotiation setsockopt calls or by using the default, that is In fact I was using ccid3. When I switched to ccid2 it started to work more or less ok. It seems that for whatever reason ccid_hc_tx_send_packet is returning too big values (up to 64000). That is an excellent data point, ccid3 code is way more complex than ccid2, so trying with both is always a valuable data point. CCID2. If this is the case it would also be interesting to, before rebuilding the kernel, to try using CCID3 as the problem you're experiencing when using netem is exactly in the interface between the core DCCP code and the CCID being used. The problem with netem exists with both ccid2 and ccid3. I suspect that when all three elements of the connection (server, client and netem) are on one host netem is able to communicate packet loss by returning error. If netem was on a diffrent host the packet would be sent correctly (no BUG: err=1 after ccid_hc_tx_packet_sent) but dropped on another host. I think that in this situation dccp should behave as if the packet was simply dropped. I can't work on this right now, will look at it tomorrow, but thanks for the data points! - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: dccp send
Em Wed, Jan 02, 2008 at 01:41:16AM +0100, Tomasz Grobelny escreveu: When I use dccp does sendmsg function block (until it sends the packet)? If so, should it? In either case, how to make it just queue the packet and return? The interface is the same as for other AF_INET transports, use O_NONBLOCK (open, fcntl) if you want it to be non blocking. It queues it in the write routine and tries to send it right away, but doesn't waits for actually sending the packet, i.e. it only checks if there is write space available, if you set O_NONBLOCK and there is no space it returns ENOBUFS, if O_NONBLOCK is not set it will sleep waiting for write space to be made available, when the process will be awaken. Use setsockopt(SO_SNDTIMEO) to change the default send timeout, etc. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: panic on 2.6.24rc5
Em Wed, Jan 02, 2008 at 01:57:14AM +0100, Tomasz Grobelny escreveu: Dnia Wednesday 02 of January 2008, Arnaldo Carvalho de Melo napisał: If the kernel oopses, if the results are the same or are some problem introduced in the patches by Gerrit. I.e. you would help us to narrow down the problem by trying a binary search of changeset history built kernels. Oh, and by the way: does there exist any set of automated tests for dccp? It would be nice to have one, wouldn't it? Otherwise accepting any patch is quite risky... There are test programs, documented in the wiki, and there is peer review too :-) And DCCP on Linux was written in such a way that a large part of its core engine is actually shared with TCP, benefiting from a much bigger set of developers and testers. But please feel free to add more automated tests, it'll benefit us all. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: mirror?
Em Tue, Dec 25, 2007 at 08:23:05AM +1300, Ian McDonald escreveu: On 12/25/07, Tomasz Grobelny [EMAIL PROTECTED] wrote: As http://www.erg.abdn.ac.uk/ seems to be down at least since yesterday I'd like to ask whether any mirror of dccp git tree and/or dccp patches to mainline kernel is available. TIA, -- Regards, Tomasz Grobelny Here is the announcement of the outage: Both the dccp and the ccid4 trees have been brought up-to-date and uploaded. Due to necessary building work, we have to power down almost all servers in the department from Friday 21st December 2007until Sunday 6th January 2008 I have a clone and just made the whole series available as a single patch at: http://oops.ghostprotocols.net:81/acme/dccp/gerrit-2.6/gerrit-pending-review.patch.bz2 The broken out patch series is also at: http://oops.ghostprotocols.net:81/acme/dccp/gerrit-2.6/ You have to clone from DaveM's net-2.6.25 and then apply the series, for your convenience: git clone git://.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.25 cd net-2.6.25 wget http://oops.ghostprotocols.net:81/acme/dccp/gerrit-2.6/gerrit-pending-review.patch.bz2 bzcat gerrit-pending-review.patch.bz2 | patch -p1 I plan to review the feature negotiation patch series that is the bulk of what I still haven't reviewed in Gerrit's pending tree while its still 2007. Regards, - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [DCCP] [PATCH 1/1]: Whitespace / outdated documentation
Em Tue, Dec 18, 2007 at 12:41:08PM +, Gerrit Renker escreveu: [CCID3]: Whitespace cleanups and outdated documentation This removes outdated documentation which had been forgotten to be removed (x_recv, rtt now appear twice, p was removed from rx_sock); and removes new whitespace. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, I'll get my scripts fixed... :-\ - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC]: Break up a patch in two (rfc3448bis changes to feedback reception)
Hi Gerrit, Please take a look at the two attached patches, they were made from your patch [CCID3]: Implement rfc3448bis changes to feedback reception, that has this changeset comment: -- [CCID 3]: Implement rfc3448bis changes to feedback reception This implements the algorithm to update the allowed sending rate X upon receiving feedback packets, as described in draft rfc3448bis, 4.2/4.3. The patch further removes two irrelevant states in TX feedback handling: * the NO_SENT state is only triggered in bidirectional mode, costing unnecessary processing. * the TERM (terminating) state is irrelevant. -- The second part further removes, was made a separate patch that I have applied before the rfc3448bis changes to feedback reception. Doing it this way eases understanding the change as we don't mixup identantion changes made due to removing the switch statement. The end result should be equivalent, but please take a look and let me know if the algorithm was changed in any way. Thanks a lot, - Arnaldo From 9165240abd3d4e8280647b9372dc3f223a802347 Mon Sep 17 00:00:00 2001 From: Gerrit Renker [EMAIL PROTECTED] Date: Mon, 17 Dec 2007 10:25:06 -0200 Subject: [PATCH 2/3] [CCID3]: Remove two irrelevant states in TX feedback handling * the NO_SENT state is only triggered in bidirectional mode, costing unnecessary processing. * the TERM (terminating) state is irrelevant. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 173 +++- 1 files changed, 84 insertions(+), 89 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 90e0454..1b1cb74 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -402,112 +402,107 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (!(DCCP_SKB_CB(skb)-dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)-dccpd_type == DCCP_PKT_DATAACK)) return; + /* ... and only in the established state */ + if (hctx-ccid3hctx_state != TFRC_SSTATE_FBACK + hctx-ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; opt_recv = hctx-ccid3hctx_options_received; + now = ktime_get_real(); - switch (hctx-ccid3hctx_state) { - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: - now = ktime_get_real(); - - /* estimate RTT from history if ACK number is valid */ - r_sample = tfrc_tx_hist_rtt(hctx-ccid3hctx_hist, - DCCP_SKB_CB(skb)-dccpd_ack_seq, now); - if (r_sample == 0) { - DCCP_WARN(%s(%p): %s with bogus ACK-%llu\n, dccp_role(sk), sk, - dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type), - (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq); - return; - } + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx-ccid3hctx_hist, + DCCP_SKB_CB(skb)-dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN(%s(%p): %s with bogus ACK-%llu\n, dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq); + return; + } + + /* Update receive rate in units of 64 * bytes/second */ + hctx-ccid3hctx_x_recv = opt_recv-ccid3or_receive_rate; + hctx-ccid3hctx_x_recv = 6; - /* Update receive rate in units of 64 * bytes/second */ - hctx-ccid3hctx_x_recv = opt_recv-ccid3or_receive_rate; - hctx-ccid3hctx_x_recv = 6; + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv-ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx-ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx-ccid3hctx_p = 100 / pinv; + /* +* Validate new RTT sample and update moving average +*/ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx-ccid3hctx_rtt = tfrc_ewma(hctx-ccid3hctx_rtt, r_sample, 9); - /* Update loss event rate */ - pinv = opt_recv-ccid3or_loss_event_rate; - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ - hctx-ccid3hctx_p = 0; - else /* can not exceed 100% */ - hctx-ccid3hctx_p = 100 / pinv; + if (hctx-ccid3hctx_state == TFRC_SSTATE_NO_FBACK
[PATCHES 0/5]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo net/dccp/ccids/ccid3.c | 252 +- net/dccp/ccids/ccid3.h |8 - net/dccp/dccp.h|6 - 3 files changed, 130 insertions(+), 136 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [CCID3]: Use a function to update p_inv, and p is never used
From: Gerrit Renker [EMAIL PROTECTED] This patch 1) concentrates previously scattered computation of p_inv into one function; 2) removes the `p' element of the CCID3 RX sock (it is redundant); 3) makes the tfrc_rx_info structure standalone, only used on demand. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 11 --- net/dccp/ccids/ccid3.h |8 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 876c747..90e0454 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -917,6 +917,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_rx_sock *hcrx; + struct tfrc_rx_info rx_info; const void *val; /* Listen socks doesn't have a private CCID block */ @@ -926,10 +927,14 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: - if (len sizeof(hcrx-ccid3hcrx_tfrc)) + if (len sizeof(rx_info)) return -EINVAL; - len = sizeof(hcrx-ccid3hcrx_tfrc); - val = hcrx-ccid3hcrx_tfrc; + rx_info.tfrcrx_x_recv = hcrx-ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt= hcrx-ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx-ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx-ccid3hcrx_pinv); + len = sizeof(rx_info); + val = rx_info; break; default: return -ENOPROTOOPT; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index e9f6ff4..49ca32b 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -139,6 +139,8 @@ enum ccid3_hc_rx_states { * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) @@ -147,13 +149,11 @@ enum ccid3_hc_rx_states { * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { - struct tfrc_rx_info ccid3hcrx_tfrc; -#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv -#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt -#define ccid3hcrx_pccid3hcrx_tfrc.tfrcrx_p u8 ccid3hcrx_last_counter:4; enum ccid3_hc_rx_states ccid3hcrx_state:8; u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; ktime_t ccid3hcrx_tstamp_last_feedback; struct tfrc_rx_hist ccid3hcrx_hist; struct tfrc_loss_hist ccid3hcrx_li_hist; -- 1.5.3.6 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] [DCCP]: Remove unused inline function
From: Gerrit Renker [EMAIL PROTECTED] The function follows48(), which is a special-case of dccp_delta_seqno(), is nowhere used in the DCCP code, thus removed by this patch. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/dccp.h |6 -- 1 files changed, 0 insertions(+), 6 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b138e20..ebe59d9 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -153,12 +153,6 @@ static inline u64 max48(const u64 seq1, const u64 seq2) return after48(seq1, seq2) ? seq1 : seq2; } -/* is seq1 next seqno after seq2 */ -static inline int follows48(const u64 seq1, const u64 seq2) -{ - return dccp_delta_seqno(seq2, seq1) == 1; -} - enum { DCCP_MIB_NUM = 0, DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ -- 1.5.3.6 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] [CCID3]: Nofeedback timer according to rfc3448bis
From: Gerrit Renker [EMAIL PROTECTED] This implements the changes to the nofeedback timer handling suggested in draft rfc3448bis00, section 4.4. In particular, these changes mean: * better handling of the lossless case (p == 0) * the timestamp for computing t_ld becomes obsolete * much more recent document (RFC 3448 is almost 5 years old) * concepts in rfc3448bis arose from a real, working implementation (cf. sec. 12) Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 63 ++-- 1 files changed, 29 insertions(+), 34 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 7c8e9ad..00b5f11 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -131,12 +131,11 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) * */ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) - { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); __u64 min_rate = 2 * hctx-ccid3hctx_x_recv; const __u64 old_x = hctx-ccid3hctx_x; - ktime_t now = stamp? *stamp : ktime_get_real(); + ktime_t now = stamp ? *stamp : ktime_get_real(); /* * Handle IDLE periods: do not reduce below RFC3390 initial sending rate @@ -230,27 +229,27 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug(%s(%p, state=%s) - entry \n, dccp_role(sk), sk, ccid3_tx_state_name(hctx-ccid3hctx_state)); - switch (hctx-ccid3hctx_state) { - case TFRC_SSTATE_NO_FBACK: - /* RFC 3448, 4.4: Halve send rate directly */ + if (hctx-ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx-ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + goto out; + + /* +* Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 +*/ + if (hctx-ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx-ccid3hctx_p == 0) { + + /* halve send rate directly */ hctx-ccid3hctx_x = max(hctx-ccid3hctx_x / 2, (((__u64)hctx-ccid3hctx_s) 6) / TFRC_T_MBI); - - ccid3_pr_debug(%s(%p, state=%s), updated tx rate to %u - bytes/s\n, dccp_role(sk), sk, - ccid3_tx_state_name(hctx-ccid3hctx_state), - (unsigned)(hctx-ccid3hctx_x 6)); - /* The value of R is still undefined and so we can not recompute -* the timout value. Keep initial value as per [RFC 4342, 5]. */ - t_nfb = TFRC_INITIAL_TIMEOUT; ccid3_update_send_interval(hctx); - break; - case TFRC_SSTATE_FBACK: + } else { /* -* Modify the cached value of X_recv [RFC 3448, 4.4] +* Modify the cached value of X_recv * -* If (p == 0 || X_calc 2 * X_recv) +* If (X_calc 2 * X_recv) *X_recv = max(X_recv / 2, s / (2 * t_mbi)); * Else *X_recv = X_calc / 4; @@ -259,32 +258,28 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) */ BUG_ON(hctx-ccid3hctx_p !hctx-ccid3hctx_x_calc); - if (hctx-ccid3hctx_p == 0 || - (hctx-ccid3hctx_x_calc (hctx-ccid3hctx_x_recv 5))) { - + if (hctx-ccid3hctx_x_calc (hctx-ccid3hctx_x_recv 5)) hctx-ccid3hctx_x_recv = max(hctx-ccid3hctx_x_recv / 2, (((__u64)hctx-ccid3hctx_s) 6) / (2 * TFRC_T_MBI)); - } else { + else { hctx-ccid3hctx_x_recv = hctx-ccid3hctx_x_calc; hctx-ccid3hctx_x_recv = 4; } - /* Now recalculate X [RFC 3448, 4.3, step (4)] */ ccid3_hc_tx_update_x(sk, NULL); - /* -* Schedule no feedback timer to expire in -* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) -* See comments in packet_recv() regarding the value of t_RTO. -*/ - t_nfb = max(hctx-ccid3hctx_t_rto, 2 * hctx-ccid3hctx_t_ipi); - break; - case TFRC_SSTATE_NO_SENT: - DCCP_BUG(%s(%p) - Illegal state NO_SENT, dccp_role(sk), sk); - /* fall through */ - case TFRC_SSTATE_TERM: - goto out; } + ccid3_pr_debug(Reduced X to %llu/64
Re: [PATCHES 0/3]: DCCP patches for 2.6.25
Em Fri, Dec 14, 2007 at 11:29:14AM -0800, David Miller escreveu: From: Arnaldo Carvalho de Melo [EMAIL PROTECTED] Date: Thu, 13 Dec 2007 23:41:59 -0200 Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Pulled, but could you please reformat Gerrit's changelog entries in the future? They have these 80+ long lines which are painful to read in ascii email clients and in terminal output. I'll do this by hand during my next rebase for this case, but I will push back when I see it again in future pull requests. OK, will take that into account in future requests, Thanks a lot, - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/12] [DCCP]: Collapse repeated `len' statements into one
From: Gerrit Renker [EMAIL PROTECTED] This replaces 4 individual assignments for `len' with a single one, placed where the control flow of those 4 leads to. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/proto.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index cc87c50..0bed4a6 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -657,19 +657,15 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = dp-dccps_mss_cache; - len = sizeof(val); break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp-dccps_server_timewait; - len = sizeof(val); break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp-dccps_pcslen; - len = sizeof(val); break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp-dccps_pcrlen; - len = sizeof(val); break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp-dccps_hc_rx_ccid, sk, optname, @@ -681,6 +677,7 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } + len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, val, len)) return -EFAULT; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/12] [DCCP]: Shift the retransmit timer for active-close into output.c
From: Gerrit Renker [EMAIL PROTECTED] When performing active close, RFC 4340, 8.3. requires to retransmit the Close/CloseReq with a backoff-retransmit timer starting at intially 2 RTTs. This patch shifts the existing code for active-close retransmit timer into output.c, so that the retransmit timer is started when the first Close/CloseReq is sent. Previously, the timer was started when, after releasing the socket in dccp_close(), the actively-closing side had not yet reached the CLOSED/TIMEWAIT state. The patch further reduces the initial timeout from 3 seconds to the required 2 RTTs, where - in absence of a known RTT - the fallback value specified in RFC 4340, 3.4 is used. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/output.c | 13 - net/dccp/proto.c | 18 -- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/net/dccp/output.c b/net/dccp/output.c index 7caa7f5..e97584a 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -574,7 +574,18 @@ void dccp_send_close(struct sock *sk, const int active) dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); - /* FIXME do we need a retransmit timer here? */ + /* +* Retransmission timer for active-close: RFC 4340, 8.3 requires +* to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ +* state can be left. The initial timeout is 2 RTTs. +* Since RTT measurement is done by the CCIDs, there is no easy +* way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 +* is too low (200ms); we use a high value to avoid unnecessary +* retransmissions when the link RTT is 0.2 seconds. +* FIXME: Let main module sample RTTs and use that instead. +*/ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); } else dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 60f40ec..8a73c8f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -996,24 +996,6 @@ adjudge_to_death: if (state != DCCP_CLOSED sk-sk_state == DCCP_CLOSED) goto out; - /* -* The last release_sock may have processed the CLOSE or RESET -* packet moving sock to CLOSED state, if not we have to fire -* the CLOSE/CLOSEREQ retransmission timer, see 8.3. Termination -* in draft-ietf-dccp-spec-11. -acme -*/ - if (sk-sk_state == DCCP_CLOSING) { - /* FIXME: should start at 2 * RTT */ - /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)-icsk_rto, - DCCP_RTO_MAX); -#if 0 - /* Yeah, we should use sk-sk_prot-orphan_count, etc */ - dccp_set_state(sk, DCCP_CLOSED); -#endif - } - if (sk-sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/12]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo Documentation/networking/dccp.txt |6 + include/linux/dccp.h | 24 ++- net/dccp/dccp.h | 10 ++- net/dccp/feat.c | 29 net/dccp/feat.h | 26 --- net/dccp/input.c | 28 +--- net/dccp/ipv4.c |8 +- net/dccp/ipv6.c |8 +- net/dccp/minisocks.c | 33 + net/dccp/options.c| 126 +- net/dccp/output.c | 21 +- net/dccp/proto.c | 34 +++--- 12 files changed, 208 insertions(+), 145 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/12] [DCCP]: Allow to parse options on Request Sockets
From: Gerrit Renker [EMAIL PROTECTED] The option parsing code currently only parses on full sk's. This causes a problem for options sent during the initial handshake (in particular timestamps and feature-negotiation options). Therefore, this patch extends the option parsing code with an additional argument for request_socks: if it is non-NULL, options are parsed on the request socket, otherwise the normal path (parsing on the sk) is used. Subsequent patches, which implement feature negotiation during connection setup, make use of this facility. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/dccp.h |5 +++-- net/dccp/input.c |6 +++--- net/dccp/ipv4.c |8 net/dccp/ipv6.c |8 net/dccp/options.c | 34 +++--- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index c676021..7214031 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -407,8 +407,6 @@ struct dccp_opt_pend { extern void dccp_minisock_init(struct dccp_minisock *dmsk); -extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb); - struct dccp_request_sock { struct inet_request_sock dreq_inet_rsk; __u64dreq_iss; @@ -423,6 +421,9 @@ static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) extern struct inet_timewait_death_row dccp_death_row; +extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + struct sk_buff *skb); + struct dccp_options_received { u32 dccpor_ndp; /* only 24 bits */ u32 dccpor_timestamp; diff --git a/net/dccp/input.c b/net/dccp/input.c index dacd4fd..08392ed 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -369,7 +369,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dccp_check_seqno(sk, skb)) goto discard; - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto discard; if (DCCP_SKB_CB(skb)-dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) @@ -427,7 +427,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto out_invalid_packet; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ @@ -609,7 +609,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* * Step 8: Process options and mark acknowledgeable */ - if (dccp_parse_options(sk, skb)) + if (dccp_parse_options(sk, NULL, skb)) goto discard; if (dcb-dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index db17b83..02fc91c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -600,11 +600,12 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_parse_options(sk, skb)) - goto drop_and_free; - dccp_reqsk_init(req, skb); + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -621,7 +622,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * In fact we defer setting S.GSR, S.SWL, S.SWH to * dccp_create_openreq_child. */ - dreq = dccp_rsk(req); dreq-dreq_isr = dcb-dccpd_seq; dreq-dreq_iss = dccp_v4_init_sequence(skb); dreq-dreq_service = service; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index a08e2cb..f42b75c 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -415,11 +415,12 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_parse_options(sk, skb)) - goto drop_and_free; - dccp_reqsk_init(req, skb); + dreq = dccp_rsk(req); + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; @@ -449,7 +450,6 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) * In fact we defer setting S.GSR, S.SWL, S.SWH to * dccp_create_openreq_child. */ - dreq = dccp_rsk(req); dreq-dreq_isr = dcb-dccpd_seq; dreq-dreq_iss = dccp_v6_init_sequence(skb); dreq-dreq_service = service
[PATCH 09/12] [DCCP]: Support inserting options during the 3-way handshake
From: Gerrit Renker [EMAIL PROTECTED] This provides a separate routine to insert options during the initial handshake. The main purpose is to conduct feature negotiation, for the moment the only user is the timestamp echo needed for the (CCID3) handshake RTT sample. Padding of options has been put into a small separate routine, to be shared among the two functions. This could also be used as a generic routine to finish inserting options. Also removed an `XXX' comment since its content was obvious. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/dccp.h|1 + net/dccp/options.c | 32 ++-- net/dccp/output.c |2 +- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3af3320..b138e20 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -428,6 +428,7 @@ static inline int dccp_ack_pending(const struct sock *sk) } extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb, u32 elapsed_time); diff --git a/net/dccp/options.c b/net/dccp/options.c index 0c996d8..bedb5da 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -537,6 +537,18 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) return 0; } +/* The length of all options needs to be a multiple of 4 (5.8) */ +static void dccp_insert_option_padding(struct sk_buff *skb) +{ + int padding = DCCP_SKB_CB(skb)-dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)-dccpd_opt_len += padding; + } +} + int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); @@ -580,18 +592,18 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; - /* XXX: insert other options when appropriate */ + dccp_insert_option_padding(skb); + return 0; +} - if (DCCP_SKB_CB(skb)-dccpd_opt_len != 0) { - /* The length of all options has to be a multiple of 4 */ - int padding = DCCP_SKB_CB(skb)-dccpd_opt_len % 4; +int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) +{ + DCCP_SKB_CB(skb)-dccpd_opt_len = 0; - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)-dccpd_opt_len += padding; - } - } + if (dreq-dreq_timestamp_echo != 0 + dccp_insert_option_timestamp_echo(NULL, dreq, skb)) + return -1; + dccp_insert_option_padding(skb); return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index b2e1791..5589a5e 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -303,7 +303,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)-dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)-dccpd_seq = dreq-dreq_iss; - if (dccp_insert_options(sk, skb)) { + if (dccp_insert_options_rsk(dreq, skb)) { kfree_skb(skb); return NULL; } -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/12] [DCCP]: Add (missing) option parsing to request_sock processing
From: Gerrit Renker [EMAIL PROTECTED] This adds option-parsing code to processing of Acks in the listening state on request_socks on the server, serving two purposes (i) resolves a FIXME (removed); (ii) paves the way for feature-negotiation during connection-setup. There is an intended subtlety here with regard to dccp_check_req: Parsing options happens only after testing whether the received packet is a retransmitted Request. Otherwise, if the Request contained (a possibly large number of) feature-negotiation options, recomputing state would have to happen each time a retransmitted Request arrives, which opens the door to an easy DoS attack. Since in a genuine retransmission the options should not be different from the original, reusing the already computed state seems better. The other point is - if there are timestamp options on the Request, they will not be answered; which means that in the presence of retransmission (likely due to loss and/or other problems), the use of Request/Response RTT sampling is suspended, so that startup problems here do not propagate. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/minisocks.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 831b76e..b1d5da6 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -200,10 +200,10 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock **prev) { struct sock *child = NULL; + struct dccp_request_sock *dreq = dccp_rsk(req); /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)-dccph_type == DCCP_PKT_REQUEST) { - struct dccp_request_sock *dreq = dccp_rsk(req); if (after48(DCCP_SKB_CB(skb)-dccpd_seq, dreq-dreq_isr)) { dccp_pr_debug(Retransmitted REQUEST\n); @@ -227,22 +227,22 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; /* Invalid ACK */ - if (DCCP_SKB_CB(skb)-dccpd_ack_seq != dccp_rsk(req)-dreq_iss) { + if (DCCP_SKB_CB(skb)-dccpd_ack_seq != dreq-dreq_iss) { dccp_pr_debug(Invalid ACK number: ack_seq=%llu, dreq_iss=%llu\n, (unsigned long long) DCCP_SKB_CB(skb)-dccpd_ack_seq, - (unsigned long long) - dccp_rsk(req)-dreq_iss); + (unsigned long long) dreq-dreq_iss); goto drop; } + if (dccp_parse_options(sk, dreq, skb)) +goto drop; + child = inet_csk(sk)-icsk_af_ops-syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; - /* FIXME: deal with options */ - inet_csk_reqsk_queue_unlink(sk, req, prev); inet_csk_reqsk_queue_removed(sk, req); inet_csk_reqsk_queue_add(sk, req, child); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/12] [DCCP]: Ignore feature negotiation on Data packets
From: Gerrit Renker [EMAIL PROTECTED] This implements [RFC 4340, p. 32]: any feature negotiation options received on DCCP-Data packets MUST be ignored. Also added a FIXME for further processing, since the code currently (wrongly) classifies empty Confirm options as invalid - this needs to be resolved in a separate patch. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/options.c |6 +- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/net/dccp/options.c b/net/dccp/options.c index bedb5da..d2a84a2 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -132,6 +132,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_L: /* fall through */ case DCCPO_CHANGE_R: + if (pkt_type == DCCP_PKT_DATA) + break; if (len 2) goto out_invalid_option; rc = dccp_feat_change_recv(sk, opt, *value, value + 1, @@ -148,7 +150,9 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CONFIRM_L: /* fall through */ case DCCPO_CONFIRM_R: - if (len 2) + if (pkt_type == DCCP_PKT_DATA) + break; + if (len 2)/* FIXME this disallows empty confirm */ goto out_invalid_option; if (dccp_feat_confirm_recv(sk, opt, *value, value + 1, len - 1)) -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/12] [DCCP]: Perform SHUT_RD and SHUT_WR on receiving close
From: Gerrit Renker [EMAIL PROTECTED] This patch performs two changes: 1) Close the write-end in addition to the read-end when a fin-like segment (Close or CloseReq) is received by DCCP. This accounts for the fact that DCCP, in contrast to TCP, does not have a half-close. RFC 4340 says in this respect that when a fin-like segment has been sent there is no guarantee at all that any further data will be processed. Thus this patch performs SHUT_WR in addition to the SHUT_RD when a fin-like segment is encountered. 2) Minor change: I noted that code appears twice in different places and think it makes sense to put this into a self-contained function (dccp_enqueue()). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/input.c | 22 +++--- 1 files changed, 15 insertions(+), 7 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index decf2f2..dacd4fd 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -22,16 +22,27 @@ /* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; -static void dccp_fin(struct sock *sk, struct sk_buff *skb) +static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) { - sk-sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(sk, SOCK_DONE); __skb_pull(skb, dccp_hdr(skb)-dccph_doff * 4); __skb_queue_tail(sk-sk_receive_queue, skb); skb_set_owner_r(skb, sk); sk-sk_data_ready(sk, 0); } +static void dccp_fin(struct sock *sk, struct sk_buff *skb) +{ + /* +* On receiving Close/CloseReq, both RD/WR shutdown are performed. +* RFC 4340, 8.3 says that we MAY send further Data/DataAcks after +* receiving the closing segment, but there is no guarantee that such +* data will be processed at all. +*/ + sk-sk_shutdown = SHUTDOWN_MASK; + sock_set_flag(sk, SOCK_DONE); + dccp_enqueue_skb(sk, skb); +} + static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) { int queued = 0; @@ -282,10 +293,7 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, * - sk_shutdown == RCV_SHUTDOWN, use Code 1, Not Listening * - sk_receive_queue is full, use Code 2, Receive Buffer */ - __skb_pull(skb, dh-dccph_doff * 4); - __skb_queue_tail(sk-sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk-sk_data_ready(sk, 0); + dccp_enqueue_skb(sk, skb); return 0; case DCCP_PKT_ACK: goto discard; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/12] [DCCP]: Use maximum-RTO backoff from DCCP spec
From: Gerrit Renker [EMAIL PROTECTED] This removes another Fixme, using the TCP maximum RTO rather than the value specified by the DCCP specification. Across the sections in RFC 4340, 64 seconds is consistently suggested as maximum RTO backoff value; and this is the value which is now used. I have checked both termination cases for retransmissions of Close/CloseReq: with the default value 15 of `retries2', and an initial icsk_retransmit = 0, it takes about 614 seconds to declare a non-responding peer as dead, after which the final terminating Reset is sent. With the TCP maximum RTO value of 120 seconds it takes (as might be expected) almost twice as long, about 23 minutes. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/dccp.h |9 - 1 files changed, 8 insertions(+), 1 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 07dcbe7..3af3320 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -72,7 +72,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); /* RFC 1122, 4.2.3.1 initial RTO value */ #define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) -#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ +/* + * The maximum back-off value for retransmissions. This is needed for + * - retransmitting client-Requests (sec. 8.1.1), + * - retransmitting Close/CloseReq when closing (sec. 8.3), + * - feature-negotiation retransmission (sec. 6.6.3), + * - Acks in client-PARTOPEN state (sec. 8.1.5). + */ +#define DCCP_RTO_MAX ((unsigned)(64 * HZ)) /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/12] [DCCP]: Support for server holding timewait state
From: Gerrit Renker [EMAIL PROTECTED] This adds a socket option and signalling support for the case where the server holds timewait state on closing the connection, as described in RFC 4340, 8.3. Since holding timewait state at the server is the non-usual case, it is enabled via a socket option. Documentation for this socket option has been added. The setsockopt statement has been made resilient against different possible cases of expressing boolean `true' values using a suggestion by Ian McDonald. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- Documentation/networking/dccp.txt |6 ++ include/linux/dccp.h |3 +++ net/dccp/output.c |6 -- net/dccp/proto.c | 13 - 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index d76905a..39131a3 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,12 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold +timewait state when closing the connection (RFC 4340, 8.3). The usual case is +that the closing server sends a CloseReq, whereupon the client holds timewait +state. When this boolean socket option is on, the server sends a Close instead +and will enter TIMEWAIT. This option must be set after accept() returns. + DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums always cover the entire packet and that only fully covered application data is diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 312b989..c676021 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -205,6 +205,7 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_CHANGE_L 3 #define DCCP_SOCKOPT_CHANGE_R 4 #define DCCP_SOCKOPT_GET_CUR_MPS 5 +#define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV10 #define DCCP_SOCKOPT_RECV_CSCOV11 #define DCCP_SOCKOPT_CCID_RX_INFO 128 @@ -492,6 +493,7 @@ struct dccp_ackvec; * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending + * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ @@ -528,6 +530,7 @@ struct dccp_sock { enum dccp_role dccps_role:2; __u8dccps_hc_rx_insert_options:1; __u8dccps_hc_tx_insert_options:1; + __u8dccps_server_timewait:1; struct timer_list dccps_xmit_timer; }; diff --git a/net/dccp/output.c b/net/dccp/output.c index e97584a..b2e1791 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -567,8 +567,10 @@ void dccp_send_close(struct sock *sk, const int active) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, sk-sk_prot-max_header); - DCCP_SKB_CB(skb)-dccpd_type = dp-dccps_role == DCCP_ROLE_CLIENT ? - DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + if (dp-dccps_role == DCCP_ROLE_SERVER !dp-dccps_server_timewait) + DCCP_SKB_CB(skb)-dccpd_type = DCCP_PKT_CLOSEREQ; + else + DCCP_SKB_CB(skb)-dccpd_type = DCCP_PKT_CLOSE; if (active) { dccp_write_xmit(sk, 1); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8a73c8f..cc87c50 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -551,6 +551,12 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, (struct dccp_so_feat __user *) optval); break; + case DCCP_SOCKOPT_SERVER_TIMEWAIT: + if (dp-dccps_role != DCCP_ROLE_SERVER) + err = -EOPNOTSUPP; + else + dp-dccps_server_timewait = (val != 0); + break; case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ if (val 0 || val 15) err = -EINVAL; @@ -653,6 +659,10 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, val = dp-dccps_mss_cache; len = sizeof(val
[PATCH 08/12] [DCCP]: Handle timestamps on Request/Response exchange separately
From: Gerrit Renker [EMAIL PROTECTED] In DCCP, timestamps can occur on packets anytime, CCID3 uses a timestamp(/echo) on the Request/Response exchange. This patch addresses the following situation: * timestamps are recorded on the listening socket; * Responses are sent from dccp_request_sockets; * suppose two connections reach the listening socket with very small time in between: * the first timestamp value gets overwritten by the second connection request. This is not really good, so this patch separates timestamps into * those which are received by the server during the initial handshake (on dccp_request_sock); * those which are received by the client or the client after connection establishment. As before, a timestamp of 0 is regarded as indicating that no (meaningful) timestamp has been received (in addition, a warning message is printed if hosts send 0-valued timestamps). The timestamp-echoing now works as follows: * when a timestamp is present on the initial Request, it is placed into dreq, due to the call to dccp_parse_options in dccp_v{4,6}_conn_request; * when a timestamp is present on the Ack leading from RESPOND = OPEN, it is copied over from the request_sock into the child cocket in dccp_create_openreq_child; * timestamps received on an (established) dccp_sock are treated as before. Since Elapsed Time is measured in hundredths of milliseconds (13.2), the new dccp_timestamp() function is used, as it is expected that the time between receiving the timestamp and sending the timestamp echo will be very small against the wrap-around time. As a byproduct, this allows smaller timestamping-time fields. Furthermore, inserting the Timestamp Echo option has been taken out of the block starting with '!dccp_packet_without_ack()', since Timestamp Echo can be carried on any packet (5.8 and 13.3). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Acked-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/dccp.h | 16 - net/dccp/minisocks.c | 21 +++--- net/dccp/options.c | 56 - 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7214031..484e45c 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -407,11 +407,23 @@ struct dccp_opt_pend { extern void dccp_minisock_init(struct dccp_minisock *dmsk); +/** + * struct dccp_request_sock - represent DCCP-specific connection request + * @dreq_inet_rsk: structure inherited from + * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) + * @dreq_isr: initial sequence number received on the Request + * @dreq_service: service code present on the Request (there is just one) + * The following two fields are analogous to the ones in dccp_sock: + * @dreq_timestamp_echo: last received timestamp to echo (13.1) + * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo + */ struct dccp_request_sock { struct inet_request_sock dreq_inet_rsk; __u64dreq_iss; __u64dreq_isr; __be32 dreq_service; + __u32dreq_timestamp_echo; + __u32dreq_timestamp_time; }; static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) @@ -477,8 +489,8 @@ struct dccp_ackvec; * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_service - first (passive sock) or unique (active sock) service code * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_time - time of latest TIMESTAMP option * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option + * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) @@ -514,8 +526,8 @@ struct dccp_sock { __u64 dccps_gar; __be32 dccps_service; struct dccp_service_list*dccps_service_list; - ktime_t dccps_timestamp_time; __u32 dccps_timestamp_echo; + __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; __u16 dccps_pcslen; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b1d5da6..027d181 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -117,11 +117,13 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_sock *newdp = dccp_sk(newsk
[PATCH 1/3] [DCCP]: Documentation for CCID operations
From: Gerrit Renker [EMAIL PROTECTED] This adds documentation for the ccid_operations structure. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccid.h | 35 --- 1 files changed, 28 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c65cb24..e3cdd8a 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -23,14 +23,35 @@ struct tcp_info; +/** + * struct ccid_operations - Interface to Congestion-Control Infrastructure + * + * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) + * @ccid_name: alphabetical identifier string for @ccid_id + * @ccid_owner: module which implements/owns this CCID + * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection + * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket + * + * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) + * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) + * @ccid_hc_rx_packet_recv: implements the HC-receiver side + * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options + * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options + * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender + * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender + * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender + * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender + * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender + */ struct ccid_operations { - unsigned char ccid_id; - const char *ccid_name; - struct module *ccid_owner; - struct kmem_cache *ccid_hc_rx_slab; - __u32 ccid_hc_rx_obj_size; - struct kmem_cache *ccid_hc_tx_slab; - __u32 ccid_hc_tx_obj_size; + unsigned char ccid_id; + const char *ccid_name; + struct module *ccid_owner; + struct kmem_cache *ccid_hc_rx_slab, + *ccid_hc_tx_slab; + __u32 ccid_hc_rx_obj_size, + ccid_hc_tx_obj_size; + /* Interface Routines */ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); void(*ccid_hc_rx_exit)(struct sock *sk); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] [CCID]: More informative registration
From: Gerrit Renker [EMAIL PROTECTED] The patch makes the registration messages of CCID 2/3 a bit more informative: instead of repeating the CCID number as currently done, CCID: Registered CCID 2 (ccid2) or CCID: Registered CCID 3 (ccid3), the descriptive names of the CCID's (from RFCs) are now used: CCID: Registered CCID 2 (TCP-like) and CCID: Registered CCID 3 (TCP-Friendly Rate Control). To allow spaces in the name, the slab name string has been changed to refer to the numeric CCID identifier, using the same format as before. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccid.c|8 net/dccp/ccids/ccid2.c |2 +- net/dccp/ccids/ccid3.c |2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index c45088b..4809753 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -92,15 +92,15 @@ int ccid_register(struct ccid_operations *ccid_ops) ccid_ops-ccid_hc_rx_slab = ccid_kmem_cache_create(ccid_ops-ccid_hc_rx_obj_size, - %s_hc_rx_sock, - ccid_ops-ccid_name); + ccid%u_hc_rx_sock, + ccid_ops-ccid_id); if (ccid_ops-ccid_hc_rx_slab == NULL) goto out; ccid_ops-ccid_hc_tx_slab = ccid_kmem_cache_create(ccid_ops-ccid_hc_tx_obj_size, - %s_hc_tx_sock, - ccid_ops-ccid_name); + ccid%u_hc_tx_sock, + ccid_ops-ccid_id); if (ccid_ops-ccid_hc_tx_slab == NULL) goto out_free_rx_slab; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 48eeb44..b5b52eb 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -770,7 +770,7 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) static struct ccid_operations ccid2 = { .ccid_id= DCCPC_CCID2, - .ccid_name = ccid2, + .ccid_name = TCP-like, .ccid_owner = THIS_MODULE, .ccid_hc_tx_obj_size= sizeof(struct ccid2_hc_tx_sock), .ccid_hc_tx_init= ccid2_hc_tx_init, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index a818a1e..876c747 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -943,7 +943,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, static struct ccid_operations ccid3 = { .ccid_id = DCCPC_CCID3, - .ccid_name = ccid3, + .ccid_name = TCP-Friendly Rate Control, .ccid_owner= THIS_MODULE, .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), .ccid_hc_tx_init = ccid3_hc_tx_init, -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] [DCCP]: Introducing CCMPS
From: Gerrit Renker [EMAIL PROTECTED] This introduces a CCMPS field for setting a CCID-specific upper bound on the application payload size, as is defined in RFC 4340, section 14. Only the TX CCID is considered in setting this limit, since the RX CCID generates comparatively small (DCCP-Ack) feedback packets. The CCMPS field includes network and transport layer header lengths. The only current CCMPS customer is CCID4 (via RFC 4828). A wrapper is used to allow querying the CCMPS even at times where the CCID modules may not have been fully negotiated yet. In dccp_sync_mss() the variable `mss_now' has been renamed into `cur_mps', to reflect that we are dealing with an MPS, but not an MSS. Since the DCCP code closely follows the TCP code, the identifiers `dccp_sync_mss' and `dccps_mss_cache' have been kept, as they have direct TCP counterparts. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccid.h |2 ++ net/dccp/output.c | 30 +++--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index e3cdd8a..fdeae7b 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -27,6 +27,7 @@ struct tcp_info; * struct ccid_operations - Interface to Congestion-Control Infrastructure * * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) + * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) * @ccid_name: alphabetical identifier string for @ccid_id * @ccid_owner: module which implements/owns this CCID * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection @@ -45,6 +46,7 @@ struct tcp_info; */ struct ccid_operations { unsigned char ccid_id; + __u32 ccid_ccmps; const char *ccid_name; struct module *ccid_owner; struct kmem_cache *ccid_hc_rx_slab, diff --git a/net/dccp/output.c b/net/dccp/output.c index 5589a5e..3b763db 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -133,15 +133,31 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) return -ENOBUFS; } +/** + * dccp_determine_ccmps - Find out about CCID-specfic packet-size limits + * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), + * since the RX CCID is restricted to feedback packets (Acks), which are small + * in comparison with the data traffic. A value of 0 means no current CCMPS. + */ +static u32 dccp_determine_ccmps(const struct dccp_sock *dp) +{ + const struct ccid *tx_ccid = dp-dccps_hc_tx_ccid; + + if (tx_ccid == NULL || tx_ccid-ccid_ops == NULL) + return 0; + return tx_ccid-ccid_ops-ccid_ccmps; +} + unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now = (pmtu - icsk-icsk_af_ops-net_header_len - - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); + u32 ccmps = dccp_determine_ccmps(dp); + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - /* Now subtract optional transport overhead */ - mss_now -= icsk-icsk_ext_hdr_len; + /* Account for header lengths and IPv4/v6 option overhead */ + cur_mps -= (icsk-icsk_af_ops-net_header_len + icsk-icsk_ext_hdr_len + + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -151,13 +167,13 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) * make it a multiple of 4 */ - mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ icsk-icsk_pmtu_cookie = pmtu; - dp-dccps_mss_cache = mss_now; + dp-dccps_mss_cache = cur_mps; - return mss_now; + return cur_mps; } EXPORT_SYMBOL_GPL(dccp_sync_mss); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/3]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo net/dccp/ccid.c|8 net/dccp/ccid.h| 37 ++--- net/dccp/ccids/ccid2.c |2 +- net/dccp/ccids/ccid3.c |2 +- net/dccp/output.c | 30 +++--- 5 files changed, 59 insertions(+), 20 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/7] [TFRC]: Loss interval code needs the macros/inlines that were moved
From: Gerrit Renker [EMAIL PROTECTED] This moves the inlines (which were previously declared as macros) back into packet_history.h since the loss detection code needs to be able to read entries from the RX history in order to create the relevant loss entries: it needs at least tfrc_rx_hist_loss_prev() and tfrc_rx_hist_last_rcv(), which in turn require the definition of the other inlines (macros). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/lib/packet_history.c | 35 --- net/dccp/ccids/lib/packet_history.h | 35 +++ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 727b17d..dd2cf2d 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -151,23 +151,6 @@ void tfrc_rx_packet_history_exit(void) } } -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h-loss_start + n) TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h-ring[tfrc_rx_hist_index(h, h-loss_count)]; -} - void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u32 ndp) @@ -183,24 +166,6 @@ void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, } EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet); -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h-ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h-ring[h-loss_start]; -} - /* has the packet contained in skb been seen before? */ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) { diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 3dfd182..e58b0fc 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -84,6 +84,41 @@ struct tfrc_rx_hist { #define rtt_sample_prev loss_start }; +/** + * tfrc_rx_hist_index - index to reach n-th entry after loss_start + */ +static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) +{ + return (h-loss_start + n) TFRC_NDUPACK; +} + +/** + * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) +{ + return h-ring[tfrc_rx_hist_index(h, h-loss_count)]; +} + +/** + * tfrc_rx_hist_entry - return the n-th history entry after loss_start + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) +{ + return h-ring[tfrc_rx_hist_index(h, n)]; +} + +/** + * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) +{ + return h-ring[h-loss_start]; +} + extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u32 ndp); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] [TFRC]: CCID3 (and CCID4) needs to access these inlines
From: Gerrit Renker [EMAIL PROTECTED] This moves two inlines back to packet_history.h: these are not private to packet_history.c, but are needed by CCID3/4 to detect whether a new loss is indicated, or whether a loss is already pending. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/lib/packet_history.c | 26 -- net/dccp/ccids/lib/packet_history.h | 35 +++ 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 5b10a1e..20af1a6 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -191,32 +191,6 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); -/* initialise loss detection and disable RTT sampling */ -static inline void tfrc_rx_hist_loss_indicated(struct tfrc_rx_hist *h) -{ - h-loss_count = 1; -} - -/* indicate whether previously a packet was detected missing */ -static inline int tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h-loss_count; -} - -/* any data packets missing between last reception and skb ? */ -int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h, - const struct sk_buff *skb, u32 ndp) -{ - int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)-tfrchrx_seqno, -DCCP_SKB_CB(skb)-dccpd_seq); - - if (delta 1 ndp delta) - tfrc_rx_hist_loss_indicated(h); - - return tfrc_rx_hist_loss_pending(h); -} -EXPORT_SYMBOL_GPL(tfrc_rx_hist_new_loss_indicated); - static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { const u8 idx_a = tfrc_rx_hist_index(h, a), diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 24edd8d..c7eeda4 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -118,16 +118,43 @@ static inline struct tfrc_rx_hist_entry * return h-ring[h-loss_start]; } +/* initialise loss detection and disable RTT sampling */ +static inline void tfrc_rx_hist_loss_indicated(struct tfrc_rx_hist *h) +{ + h-loss_count = 1; +} + +/* indicate whether previously a packet was detected missing */ +static inline int tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) +{ + return h-loss_count; +} + +/* any data packets missing between last reception and skb ? */ +static inline int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h, + const struct sk_buff *skb, + u32 ndp) +{ + int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)-tfrchrx_seqno, +DCCP_SKB_CB(skb)-dccpd_seq); + + if (delta 1 ndp delta) + tfrc_rx_hist_loss_indicated(h); + + return tfrc_rx_hist_loss_pending(h); +} + extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u32 ndp); extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); -extern int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h, - const struct sk_buff *skb, u32 ndp); + struct tfrc_loss_hist; -extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *, struct tfrc_loss_hist *, +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, struct sk_buff *skb, u32 ndp, - u32 (*first_li)(struct sock *), struct sock *); + u32 (*first_li)(struct sock *sk), + struct sock *sk); extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] [CCID3]: Redundant debugging output / documentation
From: Gerrit Renker [EMAIL PROTECTED] Each time feedback is sent two lines are printed: ccid3_hc_rx_send_feedback: client ... - entry ccid3_hc_rx_send_feedback: Interval ...usec, X_recv=..., 1/p=... The first line is redundant and thus removed. Further, documentation of ccid3_hc_rx_sock (capitalisation) is made consistent. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c |2 -- net/dccp/ccids/ccid3.h |4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 60fcb31..b92069b 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -685,8 +685,6 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, ktime_t now; s64 delta = 0; - ccid3_pr_debug(%s(%p) - entry \n, dccp_role(sk), sk); - if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_TERM)) return; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 3c33dc6..6ceeb80 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -135,9 +135,9 @@ enum ccid3_hc_rx_states { * * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) - * @ccid3hcrx_p - current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) - * @ccid3hcrx_state - receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/7]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo b/net/dccp/ccids/ccid3.c |2 b/net/dccp/ccids/ccid3.h |5 b/net/dccp/ccids/lib/loss_interval.c | 161 ++- b/net/dccp/ccids/lib/loss_interval.h | 56 ++ b/net/dccp/ccids/lib/packet_history.c | 68 +++- b/net/dccp/ccids/lib/packet_history.h | 36 b/net/dccp/ccids/lib/tfrc.c | 32 ++- b/net/dccp/ccids/lib/tfrc.h |4 b/net/dccp/dccp.h |8 net/dccp/ccids/ccid3.c| 72 +++- net/dccp/ccids/ccid3.h| 10 - net/dccp/ccids/lib/loss_interval.c| 284 +- net/dccp/ccids/lib/loss_interval.h| 11 - net/dccp/ccids/lib/packet_history.c | 279 + net/dccp/ccids/lib/packet_history.h | 47 - net/dccp/ccids/lib/tfrc.c | 10 - 16 files changed, 643 insertions(+), 442 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] [CCID3]: Interface CCID3 code with newer Loss Intervals Database
From: Gerrit Renker [EMAIL PROTECTED] This hooks up the TFRC Loss Interval database with CCID 3 packet reception. In addition, it makes the CCID-specific computation of the first loss interval (which requires access to all the guts of CCID3) local to ccid3.c. The patch also fixes an omission in the DCCP code, that of a default / fallback RTT value (defined in section 3.4 of RFC 4340 as 0.2 sec); while at it, the upper bound of 4 seconds for an RTT sample has been reduced to match the initial TCP RTO value of 3 seconds from[RFC 1122, 4.2.3.1]. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 72 ++-- net/dccp/ccids/ccid3.h | 10 ++-- net/dccp/ccids/lib/loss_interval.c | 18 net/dccp/ccids/lib/tfrc.c | 10 ++-- net/dccp/dccp.h|7 ++- 5 files changed, 84 insertions(+), 33 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b92069b..a818a1e 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -1,6 +1,7 @@ /* * net/dccp/ccids/ccid3.c * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald [EMAIL PROTECTED] * @@ -33,11 +34,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include ../ccid.h #include ../dccp.h -#include lib/packet_history.h -#include lib/loss_interval.h -#include lib/tfrc.h #include ccid3.h #include asm/unaligned.h @@ -757,6 +754,46 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) return 0; } +/** ccid3_first_li - Implements [RFC 3448, 6.3.1] + * + * Determine the length of the first loss interval via inverse lookup. + * Assume that X_recv can be computed by the throughput equation + * s + * X_recv = + * R * fval + * Find some p such that f(p) = fval; return 1/p (scaled). + */ +static u32 ccid3_first_li(struct sock *sk) +{ + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + u32 x_recv, p, delta; + u64 fval; + + if (hcrx-ccid3hcrx_rtt == 0) { + DCCP_WARN(No RTT estimate available, using fallback RTT\n); + hcrx-ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + } + + delta = ktime_to_us(net_timedelta(hcrx-ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx-ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN(X_recv==0\n); + if ((x_recv = hcrx-ccid3hcrx_x_recv) == 0) { + DCCP_BUG(stored value of X_recv is zero); + return ~0U; + } + } + + fval = scaled_div(hcrx-ccid3hcrx_s, hcrx-ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); + p = tfrc_calc_x_reverse_lookup(fval); + + ccid3_pr_debug(%s(%p), receive rate=%u bytes/s, implied + loss rate=%u\n, dccp_role(sk), sk, x_recv, p); + + return p == 0 ? ~0U : scaled_div(1, p); +} + static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); @@ -794,6 +831,14 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) /* * Handle pending losses and otherwise check for new loss */ + if (tfrc_rx_hist_loss_pending(hcrx-ccid3hcrx_hist) + tfrc_rx_handle_loss(hcrx-ccid3hcrx_hist, + hcrx-ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk) ) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + if (tfrc_rx_hist_new_loss_indicated(hcrx-ccid3hcrx_hist, skb, ndp)) goto update_records; @@ -803,7 +848,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(!is_data_packet)) goto update_records; - if (list_empty(hcrx-ccid3hcrx_li_hist)) { /* no loss so far: p = 0 */ + if (!tfrc_lh_is_initialised(hcrx-ccid3hcrx_li_hist)) { const u32 sample = tfrc_rx_hist_sample_rtt(hcrx-ccid3hcrx_hist, skb); /* * Empty loss history: no loss so far, hence p stays 0. @@ -812,6 +857,13 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) */ if (sample != 0) hcrx-ccid3hcrx_rtt = tfrc_ewma(hcrx-ccid3hcrx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(hcrx-ccid3hcrx_li_hist, skb)) { + /* +* Step (3
[PATCH 3/7] [TFRC]: Ringbuffer to track loss interval history
From: Gerrit Renker [EMAIL PROTECTED] A ringbuffer-based implementation of loss interval history is easier to maintain, allocate, and update. The `swap' routine to keep the RX history sorted is due to and was written by Arnaldo Carvalho de Melo, simplifying an earlier macro-based variant. Details: * access to the Loss Interval Records via macro wrappers (with safety checks); * simplified, on-demand allocation of entries (no extra memory consumption on lossless links); cache allocation is local to the module / exported as service; * provision of RFC-compliant algorithm to re-compute average loss interval; * provision of comprehensive, new loss detection algorithm - support for all cases of loss, including re-ordered/duplicate packets; - waiting for NDUPACK=3 packets to fill the hole; - updating loss records when a late-arriving packet fills a hole. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/lib/loss_interval.c | 161 +- net/dccp/ccids/lib/loss_interval.h | 56 +- net/dccp/ccids/lib/packet_history.c | 218 ++- net/dccp/ccids/lib/packet_history.h | 11 +- net/dccp/ccids/lib/tfrc.h |3 + 5 files changed, 435 insertions(+), 14 deletions(-) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index c0a933a..39980d1 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -1,6 +1,7 @@ /* * net/dccp/ccids/lib/loss_interval.c * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald [EMAIL PROTECTED] * Copyright (c) 2005 Arnaldo Carvalho de Melo [EMAIL PROTECTED] @@ -10,12 +11,7 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ - -#include linux/module.h #include net/sock.h -#include ../../dccp.h -#include loss_interval.h -#include packet_history.h #include tfrc.h #define DCCP_LI_HIST_IVAL_F_LENGTH 8 @@ -27,6 +23,54 @@ struct dccp_li_hist_entry { u32 dccplih_interval; }; +static struct kmem_cache *tfrc_lh_slab __read_mostly; +/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ +static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; + +/* implements LIFO semantics on the array */ +static inline u8 LIH_INDEX(const u8 ctr) +{ + return (LIH_SIZE - 1 - (ctr % LIH_SIZE)); +} + +/* the `counter' index always points at the next entry to be populated */ +static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) +{ + return lh-counter ? lh-ring[LIH_INDEX(lh-counter - 1)] : NULL; +} + +/* given i with 0 = i = k, return I_i as per the rfc3448bis notation */ +static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) +{ + BUG_ON(i = lh-counter); + return lh-ring[LIH_INDEX(lh-counter - i - 1)]-li_length; +} + +/* + * On-demand allocation and de-allocation of entries + */ +static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) +{ + if (lh-ring[LIH_INDEX(lh-counter)] == NULL) + lh-ring[LIH_INDEX(lh-counter)] = kmem_cache_alloc(tfrc_lh_slab, + GFP_ATOMIC); + return lh-ring[LIH_INDEX(lh-counter)]; +} + +void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) +{ + if (!tfrc_lh_is_initialised(lh)) + return; + + for (lh-counter = 0; lh-counter LIH_SIZE; lh-counter++) + if (lh-ring[LIH_INDEX(lh-counter)] != NULL) { + kmem_cache_free(tfrc_lh_slab, + lh-ring[LIH_INDEX(lh-counter)]); + lh-ring[LIH_INDEX(lh-counter)] = NULL; + } +} +EXPORT_SYMBOL_GPL(tfrc_lh_cleanup); + static struct kmem_cache *dccp_li_cachep __read_mostly; static inline struct dccp_li_hist_entry *dccp_li_hist_entry_new(const gfp_t prio) @@ -98,6 +142,65 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list) EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); +static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) +{ + u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; + int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ + + for (i=0; i = k; i++) { + i_i = tfrc_lh_get_interval(lh, i); + + if (i k) { + i_tot0 += i_i * tfrc_lh_weights[i]; + w_tot += tfrc_lh_weights[i]; + } + if (i 0) + i_tot1 += i_i * tfrc_lh_weights[i-1]; + } + + BUG_ON(w_tot == 0); + lh-i_mean = max(i_tot0, i_tot1) / w_tot
Re: [PATCH 8/8] [PATCH v2] [CCID3]: Interface CCID3 code with newer Loss Intervals Database
Em Wed, Dec 12, 2007 at 04:56:32PM +, Gerrit Renker escreveu: | This time around I'm not doing any reordering, just trying to use your | patches as is, but adding this patch as-is produces a kernel that will | crash, no? | | The loss history and the RX/TX packet history slabs are all created in | tfrc.c using the three different __init routines of the dccp_tfrc_lib. | | Yes, the init routines are called and in turn they create the slab | caches, but up to the patch [PATCH 8/8] [PATCH v2] [CCID3]: Interface | CCID3 code with newer Loss Intervals Database the new li slab is not | being created, no? See what I'm talking? | Sorry, there is some weird kind of mix-up going on. Can you please check your patch set: it seems this email exchange refers to an older variant. In the most recent patch set, the slab is introduced in the patch [TFRC]: Ringbuffer to track loss interval history --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -27,6 +23,54 @@ struct dccp_li_hist_entry { u32 dccplih_interval; }; +static struct kmem_cache *tfrc_lh_slab __read_mostly; /* === */ Yup, this one, is introduced as above but is not initialized at the module init routine, please see, it should be OK and we can move on: http://git.kernel.org/?p=linux/kernel/git/acme/net-2.6.25.git;a=commitdiff;h=a925429ce2189b548dc19037d3ebd4ff35ae4af7 +/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ +static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; // ... And this is 6/8, i.e. before 8/8, cf. http://www.mail-archive.com/dccp@vger.kernel.org/msg03000.html I don't know which tree you are working off, would it be possible to check against the test tree git://eden-feed.erg.abdn.ac.uk/dccp_exp [dccp] I'm doing a fresh clone now. But I think that everything is OK after today's merge request I sent to David. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [PATCH v2] [CCID3]: Interface CCID3 code with newer Loss Intervals Database
Em Tue, Dec 11, 2007 at 09:42:38AM +, Gerrit Renker escreveu: | When interfacing we must make sure that ccid3 tfrc_lh_slab is created | and then tfrc_li_cachep is not needed. I'm doing this while keeping | the structure of the patches, i.e. one introducing, the other removing. | But we need to create tfrc_lh_slab if we want the tree to be bisectable. | | I'm doing this and keeping your Signed-off-line, please holler if you | disagree for some reason. If you are just shifting and reordering then that is fine with me. But it seems you mean a different patch since in this one there is no slab initialisation. This time around I'm not doing any reordering, just trying to use your patches as is, but adding this patch as-is produces a kernel that will crash, no? The loss history and the RX/TX packet history slabs are all created in tfrc.c using the three different __init routines of the dccp_tfrc_lib. Yes, the init routines are called and in turn they create the slab caches, but up to the patch [PATCH 8/8] [PATCH v2] [CCID3]: Interface CCID3 code with newer Loss Intervals Database the new li slab is not being created, no? See what I'm talking? - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Announce]: Test results with latest CCID3 patch set
Em Tue, Dec 11, 2007 at 02:21:28PM +, Gerrit Renker escreveu: | I am new of this mailing list and I am really interested in the | measurements you are performing with DCCP. This was more of a regression test, as there had been recent changes in the test tree, to see that the kernel (not userspace) still performs in a predictable way. | Which tool are you using ? Are you using Iperf for such measurements ? The setup is the one from http://www.linux-foundation.org/en/Net:DCCP_Testing#Regression_testing and, yes, it uses iperf. | Have you ever heard about D-ITG ? | You can find more information here: | http://www.grid.unina.it/software/ITG | | I am one of the authors of such platform and I have also | performed some very preliminary tests with DCCP. | | I would be very glad to have your opinion on that and I'm very interested | in improving its features, also with specific regard to the support of | transport protocols. | It is a very nice tool with many features. I only ran simple tests with it (version 2.6), again only as basic sanity tests -- the throughput result was similar to the one tested with iperf. I think that the tool has more to offer and can help improve/extend DCCP testing. Here is my list of points, hoping that the others will add theirs, too: * would be good to have a standardised set of scripts, for comparison/benchmarking * the built-in VoIP module only works for UDP -- is it possible to port this to DCCP? * as per previous email, more complex traffic scenarios would be good, in particular - switching on/off background traffic at times to observe TCP/flow-friendliness - running multiple DCCP flows in parallel and at overlapping times Does this tool records results in a database keyed by kernel version/buildid for us to use it as a regression tool? Something that would produce results around these lines: WARNING: test #23 counter #3 variance bigger than specified since the last kernel tested (git cset 55ed793afb4a8025d33a8e6a5f2f89d5ac4d8432)! - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] [TFRC]: Loss interval code needs the macros/inlines that were moved
Em Mon, Dec 10, 2007 at 11:31:53AM +, Gerrit Renker escreveu: | |distcc[24516] ERROR: compile /root/.ccache/packet_his.tmp.aspire.home.net.24512.i on _tiptop failed |/usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c: In function '__one_after_loss': |/usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:266: error: lvalue required as unary '' operand snip | | Because you do it this way: | | tfrc_rx_hist_swap(TFRC_RX_HIST_ENTRY(h, 0), TFRC_RX_HIST_ENTRY(h, 3)); | | I checked and at least in this patch series all uses are of this type, | so why not do it using just the indexes, which would be simpler: | | tfrc_rx_hist_swap(h, 0, 3); | | With this implementation: | | static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const int a, const int b) | { | const int idx_a = tfrc_rx_hist_index(h, a), | int idx_b = tfrc_rx_hist_index(h, b); | struct tfrc_rx_hist_entry *tmp = h-ring[idx_a]; | | h-ring[idx_a] = h-ring[idx_b]; | h-ring[idx_b] = tmp; | } | Agreed, that is useful in the present case, since then everything uses inlines. The only suggestion I'd like to make is to use `u8' instead of `int' since the indices will have very low values. Agreed. There is a related point: you will probably have noticed that loss_interval.c also uses macros. I don't know if you are planning to convert these also into inlines. I think that there would be less benefit in converting these, since they are locl to loss_interval.c and mostly serve to improve readability. In general I'm against using macros for functions, so please always consider doing things as inlines. I'll read some more patches today and provide comments as to if I think it is ok for now to keep it as macros. As I have at least one other patch to revise (plus another minor one), I'll rework this according to the above. Thank you. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [PATCH v2] [CCID3]: Interface CCID3 code with newer Loss Intervals Database
Em Sat, Dec 08, 2007 at 10:06:28AM +, Gerrit Renker escreveu: This hooks up the TFRC Loss Interval database with CCID 3 packet reception. In addition, it makes the CCID-specific computation of the first loss interval (which requires access to all the guts of CCID3) local to ccid3.c. The patch also fixes an omission in the DCCP code, that of a default / fallback RTT value (defined in section 3.4 of RFC 4340 as 0.2 sec); while at it, the upper bound of 4 seconds for an RTT sample has been reduced to match the initial TCP RTO value of 3 seconds from[RFC 1122, 4.2.3.1]. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] When interfacing we must make sure that ccid3 tfrc_lh_slab is created and then tfrc_li_cachep is not needed. I'm doing this while keeping the structure of the patches, i.e. one introducing, the other removing. But we need to create tfrc_lh_slab if we want the tree to be bisectable. I'm doing this and keeping your Signed-off-line, please holler if you disagree for some reason. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/8] [TFRC]: Whitespace cleanups
Em Sat, Dec 08, 2007 at 10:06:21AM +, Gerrit Renker escreveu: Just some tidy-ups to keep git/quilt happy. Also moved up the comment Receiver routines above the first occurrence of RX history routines. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, applied. - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/8] [TFRC]: Put RX/TX initialisation into tfrc.c
Em Sat, Dec 08, 2007 at 10:06:22AM +, Gerrit Renker escreveu: This separates RX/TX initialisation and puts all packet history / loss intervals initialisation into tfrc.c. The organisation is uniform: slab declaration - {rx,tx}_init() - {rx,tx}_exit() NAK, you can't call a __exit marked routine from a __init marked routine. - Arnaldo Signed-off-by: Gerrit Renker [EMAIL PROTECTED] --- net/dccp/ccids/lib/packet_history.c | 68 -- net/dccp/ccids/lib/tfrc.c | 31 2 files changed, 55 insertions(+), 44 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 54cd23e..af0db71 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -57,6 +57,22 @@ struct tfrc_tx_hist_entry { */ SNIP + +void __exit tfrc_tx_packet_history_exit(void) +{ + if (tfrc_tx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_tx_hist_slab); + tfrc_tx_hist_slab = NULL; + } +} + SNIP diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c index 3a7a183..20763fa 100644 --- a/net/dccp/ccids/lib/tfrc.c +++ b/net/dccp/ccids/lib/tfrc.c @@ -14,27 +14,42 @@ module_param(tfrc_debug, bool, 0444); MODULE_PARM_DESC(tfrc_debug, Enable debug messages); #endif +extern int tfrc_tx_packet_history_init(void); +extern void tfrc_tx_packet_history_exit(void); +extern int tfrc_rx_packet_history_init(void); +extern void tfrc_rx_packet_history_exit(void); + extern int dccp_li_init(void); extern void dccp_li_exit(void); -extern int packet_history_init(void); -extern void packet_history_exit(void); static int __init tfrc_module_init(void) { int rc = dccp_li_init(); - if (rc == 0) { - rc = packet_history_init(); - if (rc != 0) - dccp_li_exit(); - } + if (rc) + goto out; + + rc = tfrc_tx_packet_history_init(); + if (rc) + goto out_free_loss_intervals; + rc = tfrc_rx_packet_history_init(); + if (rc) + goto out_free_tx_history; + return 0; + +out_free_tx_history: + tfrc_tx_packet_history_exit(); +out_free_loss_intervals: + dccp_li_exit(); +out: return rc; } SNIP - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/8] [TFRC/CCID3]: Remove now unused functions / function calls
Em Sat, Dec 08, 2007 at 10:06:23AM +, Gerrit Renker escreveu: This removes two things which now have become redundant: 1. The function tfrc_rx_hist_entry_delete() is no longer referenced anywhere. 2. The CCID3 HC-receiver still inserted timestamps, but received timestamps are not parsed/referenced/used by the HC-sender, it serves no function. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, applying as two separate patches. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] [TFRC]: Loss interval code needs the macros/inlines that were moved
Em Sat, Dec 08, 2007 at 10:06:25AM +, Gerrit Renker escreveu: This moves the inlines (which were previously declared as macros) back into packet_history.h since the loss detection code needs to be able to read entries from the RX history in order to create the relevant loss entries: it needs at least tfrc_rx_hist_loss_prev() and tfrc_rx_hist_last_rcv(), which in turn require the definition of the other inlines (macros). Additionally, inn one case the use of inlines instead of a macro broke the algorithm: rx_hist_swap() (introduced in next patch) needs to be able to swap the history entries; when using an inline returning a pointer instead, one gets compilation errors such as: distcc[24516] ERROR: compile /root/.ccache/packet_his.tmp.aspire.home.net.24512.i on _tiptop failed /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c: In function '__one_after_loss': /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:266: error: lvalue required as unary '' operand /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:267: error: lvalue required as unary '' operand /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c: In function '__two_after_loss': /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:298: error: lvalue required as unary '' operand /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:299: error: lvalue required as unary '' operand /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:336: error: lvalue required as unary '' operand /usr/src/davem-2.6/net/dccp/ccids/lib/packet_history.c:337: error: lvalue required as unary '' operand make[4]: *** [net/dccp/ccids/lib/packet_history.o] Error 1 make[3]: *** [net/dccp/ccids/lib] Error 2 make[2]: *** [net/dccp/ccids] Error 2 make[1]: *** [net/dccp/] Error 2 make: *** [sub-make] Error 2 Because you do it this way: tfrc_rx_hist_swap(TFRC_RX_HIST_ENTRY(h, 0), TFRC_RX_HIST_ENTRY(h, 3)); I checked and at least in this patch series all uses are of this type, so why not do it using just the indexes, which would be simpler: tfrc_rx_hist_swap(h, 0, 3); With this implementation: static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const int a, const int b) { const int idx_a = tfrc_rx_hist_index(h, a), int idx_b = tfrc_rx_hist_index(h, b); struct tfrc_rx_hist_entry *tmp = h-ring[idx_a]; h-ring[idx_a] = h-ring[idx_b]; h-ring[idx_b] = tmp; } - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] [TFRC]: The function tfrc_rx_hist_entry_delete() is not used anymore
From: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/lib/packet_history.c |5 - 1 files changed, 0 insertions(+), 5 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 54cd23e..af44082 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -151,11 +151,6 @@ void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, } EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet); -static inline void tfrc_rx_hist_entry_delete(struct tfrc_rx_hist_entry *entry) -{ - kmem_cache_free(tfrc_rx_hist_slab, entry); -} - /** * tfrc_rx_hist_entry - return the n-th history entry after loss_start */ -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/3]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo b/net/dccp/ccids/ccid3.c | 16 b/net/dccp/ccids/lib/loss_interval.c |2 +- b/net/dccp/ccids/lib/packet_history.c | 13 ++--- net/dccp/ccids/ccid3.c|4 +--- net/dccp/ccids/lib/packet_history.c |6 -- 5 files changed, 16 insertions(+), 25 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC2][PATCH 7/7] [TFRC]: New rx history code
Em Thu, Dec 06, 2007 at 02:02:25PM +, Gerrit Renker escreveu: | The first six patches in this series are unmodified, so if you | are OK with them please send me your Signed-off-by. Patches [1/7], [2/7], and [6/7] already have a signed-off and there are no changes. Just acknowledged [3..5/7], will look at [7/7] now. OK, please let me know if there are still any problems. The removal of timestamp insertion in ccid3_hc_rx_insert_options will be put in another cset. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] [CCID3]: The receiver of a half-connection does not set window counter values
From: Gerrit Renker [EMAIL PROTECTED] Only the sender sets window counters [RFC 4342, sections 5 and 8.1]. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index c95dca8..5ff5aab 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -733,7 +733,6 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) return 0; hcrx = ccid3_hc_rx_sk(sk); - DCCP_SKB_CB(skb)-dccpd_ccval = hcrx-ccid3hcrx_ccval_last_counter; if (dccp_packet_without_ack(skb)) return 0; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] [TFRC]: Make the rx history slab be global
This is in preparation for merging the new rx history code written by Gerrit Renker. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 35 ++--- net/dccp/ccids/lib/packet_history.c | 95 ++- net/dccp/ccids/lib/packet_history.h | 43 ++-- 3 files changed, 60 insertions(+), 113 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 5dea690..07920bb 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,8 +49,6 @@ static int ccid3_debug; #define ccid3_pr_debug(format, a...) #endif -static struct dccp_rx_hist *ccid3_rx_hist; - /* * Transmitter Half-Connection Routines */ @@ -807,9 +805,9 @@ static int ccid3_hc_rx_detect_loss(struct sock *sk, } detect_out: - dccp_rx_hist_add_packet(ccid3_rx_hist, hcrx-ccid3hcrx_hist, - hcrx-ccid3hcrx_li_hist, packet, - hcrx-ccid3hcrx_seqno_nonloss); + dccp_rx_hist_add_packet(hcrx-ccid3hcrx_hist, + hcrx-ccid3hcrx_li_hist, packet, + hcrx-ccid3hcrx_seqno_nonloss); return loss; } @@ -852,8 +850,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) return; } - packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv-dccpor_ndp, - skb, GFP_ATOMIC); + packet = dccp_rx_hist_entry_new(opt_recv-dccpor_ndp, skb, GFP_ATOMIC); if (unlikely(packet == NULL)) { DCCP_WARN(%s(%p), Not enough mem to add rx packet to history, consider it lost!\n, dccp_role(sk), sk); @@ -936,7 +933,7 @@ static void ccid3_hc_rx_exit(struct sock *sk) ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); /* Empty packet history */ - dccp_rx_hist_purge(ccid3_rx_hist, hcrx-ccid3hcrx_hist); + dccp_rx_hist_purge(hcrx-ccid3hcrx_hist); /* Empty loss interval history */ dccp_li_hist_purge(hcrx-ccid3hcrx_li_hist); @@ -1013,33 +1010,13 @@ MODULE_PARM_DESC(ccid3_debug, Enable debug messages); static __init int ccid3_module_init(void) { - int rc = -ENOBUFS; - - ccid3_rx_hist = dccp_rx_hist_new(ccid3); - if (ccid3_rx_hist == NULL) - goto out; - - rc = ccid_register(ccid3); - if (rc != 0) - goto out_free_rx; -out: - return rc; - -out_free_rx: - dccp_rx_hist_delete(ccid3_rx_hist); - ccid3_rx_hist = NULL; - goto out; + return ccid_register(ccid3); } module_init(ccid3_module_init); static __exit void ccid3_module_exit(void) { ccid_unregister(ccid3); - - if (ccid3_rx_hist != NULL) { - dccp_rx_hist_delete(ccid3_rx_hist); - ccid3_rx_hist = NULL; - } } module_exit(ccid3_module_exit); diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index b628714..e1ab853 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -114,48 +114,33 @@ EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); /* * Receiver History Routines */ -struct dccp_rx_hist *dccp_rx_hist_new(const char *name) +static struct kmem_cache *tfrc_rx_hist_slab; + +struct dccp_rx_hist_entry *dccp_rx_hist_entry_new(const u32 ndp, + const struct sk_buff *skb, + const gfp_t prio) { - struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); - static const char dccp_rx_hist_mask[] = rx_hist_%s; - char *slab_name; - - if (hist == NULL) - goto out; - - slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, - GFP_ATOMIC); - if (slab_name == NULL) - goto out_free_hist; - - sprintf(slab_name, dccp_rx_hist_mask, name); - hist-dccprxh_slab = kmem_cache_create(slab_name, -sizeof(struct dccp_rx_hist_entry), -0, SLAB_HWCACHE_ALIGN, NULL); - if (hist-dccprxh_slab == NULL) - goto out_free_slab_name; -out: - return hist; -out_free_slab_name: - kfree(slab_name); -out_free_hist: - kfree(hist); - hist = NULL; - goto out; -} + struct dccp_rx_hist_entry *entry = kmem_cache_alloc(tfrc_rx_hist_slab, + prio); -EXPORT_SYMBOL_GPL(dccp_rx_hist_new); + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); -void dccp_rx_hist_delete(struct dccp_rx_hist *hist) -{ - const char* name = kmem_cache_name(hist-dccprxh_slab); + entry-dccphrx_seqno = DCCP_SKB_CB(skb)-dccpd_seq; + entry-dccphrx_ccval = dh
[PATCH 7/7] [TFRC]: New rx history code
Credit here goes to Gerrit Renker, that provided the initial implementation for this new codebase. I modified it just to try to make it closer to the existing API, renaming some functions, add namespacing and fix one bug where the tfrc_rx_hist_alloc was not freeing the allocated ring entries on the error path. Original changeset comment from Gerrit: --- This provides a new, self-contained and generic RX history service for TFRC based protocols. Details: * new data structure, initialisation and cleanup routines; * allocation of dccp_rx_hist entries local to packet_history.c, as a service exported by the dccp_tfrc_lib module. * interface to automatically track highest-received seqno; * receiver-based RTT estimation (needed for instance by RFC 3448, 6.3.1); * a generic function to test for `data packets' as per RFC 4340, sec. 7.7. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 288 -- net/dccp/ccids/ccid3.h | 14 +- net/dccp/ccids/lib/loss_interval.c | 13 ++- net/dccp/ccids/lib/packet_history.c | 290 +-- net/dccp/ccids/lib/packet_history.h | 83 +-- 5 files changed, 330 insertions(+), 358 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 5ff5aab..faacffa 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -641,6 +641,15 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) { @@ -667,59 +676,60 @@ static void ccid3_hc_rx_set_state(struct sock *sk, hcrx-ccid3hcrx_state = state; } -static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len) -{ - if (likely(len 0))/* don't update on empty packets (e.g. ACKs) */ - hcrx-ccid3hcrx_s = tfrc_ewma(hcrx-ccid3hcrx_s, len, 9); -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk) +static void ccid3_hc_rx_send_feedback(struct sock *sk, + const struct sk_buff *skb, + enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - struct tfrc_rx_hist_entry *packet; ktime_t now; - suseconds_t delta; + s64 delta = 0; ccid3_pr_debug(%s(%p) - entry \n, dccp_role(sk), sk); + if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; + now = ktime_get_real(); - switch (hcrx-ccid3hcrx_state) { - case TFRC_RSTATE_NO_DATA: + switch (fbtype) { + case CCID3_FBACK_INITIAL: hcrx-ccid3hcrx_x_recv = 0; + hcrx-ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; - case TFRC_RSTATE_DATA: - delta = ktime_us_delta(now, - hcrx-ccid3hcrx_tstamp_last_feedback); - DCCP_BUG_ON(delta 0); - hcrx-ccid3hcrx_x_recv = - scaled_div32(hcrx-ccid3hcrx_bytes_recv, delta); + case CCID3_FBACK_PARAM_CHANGE: + /* +* When parameters change (new loss or p p_prev), we do not +* have a reliable estimate for R_m of [RFC 3448, 6.2] and so +* need to reuse the previous value of X_recv. However, when +* X_recv was 0 (due to early loss), this would kill X down to +* s/t_mbi (i.e. one packet in 64 seconds). +* To avoid such drastic reduction, we approximate X_recv as +* the number of bytes since last feedback. +* This is a safe fallback, since X is bounded above by X_calc. +*/ + if (hcrx-ccid3hcrx_x_recv 0) + break; + /* fall through */ + case CCID3_FBACK_PERIODIC: + delta = ktime_us_delta(now, hcrx-ccid3hcrx_tstamp_last_feedback); + if (delta = 0) + DCCP_BUG(delta (%ld) = 0, (long)delta); + else + hcrx-ccid3hcrx_x_recv = + scaled_div32(hcrx-ccid3hcrx_bytes_recv, delta); break; - case TFRC_RSTATE_TERM: - DCCP_BUG(%s(%p) - Illegal state TERM, dccp_role(sk), sk); + default: return; } - packet = tfrc_rx_hist_find_data_packet(hcrx-ccid3hcrx_hist); - if (unlikely(packet == NULL)) { - DCCP_WARN(%s(%p), no data
[PATCHES 0/7]: DCCP patches for 2.6.25
Hi David, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo b/net/dccp/ccids/Kconfig | 13 b/net/dccp/ccids/ccid3.c | 35 -- b/net/dccp/ccids/ccid3.h | 14 b/net/dccp/ccids/lib/Makefile |2 b/net/dccp/ccids/lib/loss_interval.c | 14 b/net/dccp/ccids/lib/packet_history.c | 27 - b/net/dccp/ccids/lib/packet_history.h |3 b/net/dccp/ccids/lib/tfrc.c | 48 +++ b/net/dccp/ccids/lib/tfrc.h | 18 - b/net/dccp/dccp.h | 13 net/dccp/ccids/ccid3.c| 322 -- net/dccp/ccids/lib/loss_interval.c| 13 net/dccp/ccids/lib/packet_history.c | 496 +++--- net/dccp/ccids/lib/packet_history.h | 177 14 files changed, 579 insertions(+), 616 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/7] [TFRC]: Rename tfrc_tx_hist to tfrc_tx_hist_slab, for consistency
Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/lib/packet_history.c | 20 ++-- 1 files changed, 10 insertions(+), 10 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 1d4d6ee..b628714 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -53,7 +53,7 @@ struct tfrc_tx_hist_entry { /* * Transmitter History Routines */ -static struct kmem_cache *tfrc_tx_hist; +static struct kmem_cache *tfrc_tx_hist_slab; static struct tfrc_tx_hist_entry * tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) @@ -66,7 +66,7 @@ static struct tfrc_tx_hist_entry * int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist, gfp_any()); + struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); if (entry == NULL) return -ENOBUFS; @@ -85,7 +85,7 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) while (head != NULL) { struct tfrc_tx_hist_entry *next = head-next; - kmem_cache_free(tfrc_tx_hist, head); + kmem_cache_free(tfrc_tx_hist_slab, head); head = next; } @@ -278,17 +278,17 @@ EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); __init int packet_history_init(void) { - tfrc_tx_hist = kmem_cache_create(tfrc_tx_hist, -sizeof(struct tfrc_tx_hist_entry), 0, -SLAB_HWCACHE_ALIGN, NULL); + tfrc_tx_hist_slab = kmem_cache_create(tfrc_tx_hist, + sizeof(struct tfrc_tx_hist_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist == NULL ? -ENOBUFS : 0; + return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0; } void packet_history_exit(void) { - if (tfrc_tx_hist != NULL) { - kmem_cache_destroy(tfrc_tx_hist); - tfrc_tx_hist = NULL; + if (tfrc_tx_hist_slab != NULL) { + kmem_cache_destroy(tfrc_tx_hist_slab); + tfrc_tx_hist_slab = NULL; } } -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] [TFRC]: Provide central source file and debug facility
From: Gerrit Renker [EMAIL PROTECTED] This patch changes the tfrc_lib module in the following manner: (1) a dedicated tfrc source file to call the packet history loss interval init/exit functions. (2) a dedicated tfrc_pr_debug macro with toggle switch `tfrc_debug'. Commiter note: renamed tfrc_module.c to tfrc.c, and made CONFIG_IP_DCCP_CCID3 select IP_DCCP_TFRC_LIB. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/Kconfig | 13 ++--- net/dccp/ccids/lib/Makefile |2 +- net/dccp/ccids/lib/packet_history.c | 27 ++- net/dccp/ccids/lib/packet_history.h |3 +- net/dccp/ccids/lib/tfrc.c | 48 +++ net/dccp/ccids/lib/tfrc.h | 17 +--- 6 files changed, 75 insertions(+), 35 deletions(-) create mode 100644 net/dccp/ccids/lib/tfrc.c diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 3d7d867..1227594 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -38,6 +38,7 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 tristate CCID3 (TCP-Friendly) (EXPERIMENTAL) def_tristate IP_DCCP + select IP_DCCP_TFRC_LIB ---help--- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to @@ -63,10 +64,6 @@ config IP_DCCP_CCID3 If in doubt, say M. -config IP_DCCP_TFRC_LIB - depends on IP_DCCP_CCID3 - def_tristate IP_DCCP_CCID3 - config IP_DCCP_CCID3_DEBUG bool CCID3 debugging messages depends on IP_DCCP_CCID3 @@ -110,5 +107,13 @@ config IP_DCCP_CCID3_RTO is serious network congestion: experimenting with larger values should therefore not be performed on WANs. +config IP_DCCP_TFRC_LIB + tristate + default n + +config IP_DCCP_TFRC_DEBUG + bool + depends on IP_DCCP_TFRC_LIB + default y if IP_DCCP_CCID3_DEBUG endmenu diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile index 5f940a6..68c93e3 100644 --- a/net/dccp/ccids/lib/Makefile +++ b/net/dccp/ccids/lib/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o -dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o +dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 4805de9..1d4d6ee 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -35,7 +35,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include linux/module.h #include linux/string.h #include packet_history.h @@ -277,39 +276,19 @@ void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); -extern int __init dccp_li_init(void); -extern void dccp_li_exit(void); - -static __init int packet_history_init(void) +__init int packet_history_init(void) { - if (dccp_li_init() != 0) - goto out; - tfrc_tx_hist = kmem_cache_create(tfrc_tx_hist, sizeof(struct tfrc_tx_hist_entry), 0, SLAB_HWCACHE_ALIGN, NULL); - if (tfrc_tx_hist == NULL) - goto out_li_exit; - return 0; -out_li_exit: - dccp_li_exit(); -out: - return -ENOBUFS; + return tfrc_tx_hist == NULL ? -ENOBUFS : 0; } -module_init(packet_history_init); -static __exit void packet_history_exit(void) +void packet_history_exit(void) { if (tfrc_tx_hist != NULL) { kmem_cache_destroy(tfrc_tx_hist); tfrc_tx_hist = NULL; } - dccp_li_exit(); } -module_exit(packet_history_exit); - -MODULE_AUTHOR(Ian McDonald [EMAIL PROTECTED], - Arnaldo Carvalho de Melo [EMAIL PROTECTED]); -MODULE_DESCRIPTION(DCCP TFRC library); -MODULE_LICENSE(GPL); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 0670f46..9a2642e 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -39,8 +39,7 @@ #include linux/ktime.h #include linux/list.h #include linux/slab.h - -#include ../../dccp.h +#include tfrc.h /* Number of later packets received before one is considered lost */ #define TFRC_RECV_NUM_LATE_LOSS 3 diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c new file mode 100644 index 000..3a7a183 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.c @@ -0,0 +1,48 @@ +/* + * TFRC: main module holding the pieces of the TFRC library together + * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2007 Arnaldo Carvalho de Melo [EMAIL PROTECTED] + */ +#include linux
Re: [PATCH 7/7] [TFRC] New rx history code
Em Wed, Dec 05, 2007 at 09:35:30AM +, Gerrit Renker escreveu: Quoting Arnaldo: | Em Tue, Dec 04, 2007 at 06:55:17AM +, Gerrit Renker escreveu: | NAK. You have changed the control flow of the algorithm and the underlying | data structure. Originally it had been an array of pointers, and this had | been a design decision right from the first submissions in March. From I | of http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid3_packet_reception/loss_detection/loss_detection_algorithm_notes.txt | | 1. 'ring' is an array of pointers rather than a contiguous area in | memory. The reason is that, when having to swap adjacent entries | to sort the history, it is easier to swap pointers than to copy | (heavy-weight) history-entry data structs. | | But in your algorithm it becomes a normal linear array. | | As a consequence all subsequent code needs to be rewritten to use | copying instead of swapping pointers. And there is a lot of that, since | the loss detection code makes heavy use of swapping entries in order to | cope with reordered and/or delayed packets. | | So lets not do that, the decision was made based on the RX history patch | alone, where, as pointed out, no swapping occurs. | | This is not what I would call entirely equivalent as you claim. Your | algorithm is fundamentally different, the control flow is not the same, | nor are the underlying data structures. | | Its is equivalent up to what is in the tree, thank you for point out | that in subsequent patches it will be used. | | Had this design decision been made explicit in the changeset comment | that introduces the data structure I wouldn't have tried to reduce the | number of allocations. | | And you even said that it was a good decision when first reacting to | this change, which I saw as positive feedback for the RFC I sent. | That was my fault. Your solution looked much better to me but even I had forgotten that the algorithm is based on an array of pointers. When I finally sat down And you wrote that code, I acted on looking the code together with reading the changeset comment, to me it also looked much better to do that, but as there were many changes, I sent an [RFC] message. It served its purpose, as you after two iterations realised it was in fact not a good idea as later an array of pointers is required. I should have taken that swap routine as the definitive hint that indeed it was needed. and looked through the patch set I found the original notes from March and saw that using a linear array instead of an array of pointers will require quite a few changes and means changing the algorithm. I then thought through whether there could be any advantage in using a linear array as in this patch, but could find none. In the end this went back to the same questions and issues that were tackled between February and March this year. I wish we could have had this dicussion then; it would have made this email unnecessary. That is why it is important that the changeset comment collects these scattered notes and discussions. Think about in some years from now we will be left with a situation where we would have to look at many places to understando some aspects of what is in the code. While this happens and we have google for that I think that since you keep such detailed notes it is not a problem to get them in the changesets. I thank you for your replay and I am sorry if you took offense at my perhaps a little strong reaction, here is the essence what I tried to convey but what maybe did not succeed in conveying: * I am absolutely ok with you making changes to variable names, namespaces, file organisation, overall arrangement etc (as per previous email). You have experience and this will often be a benefit. For instance, you combined tfrc_entry_from_skb() with tfrc_rx_hist_update() to give a new function, tfrc_rx_hist_add_packet(). This is ok since entry_from_skb() is not otherwise used (and I only found this out when reading your patch). (On the other hand, since this is a 4-element ring buffer, it is no real adding, the same entry will frequently be overwritten, this is why it was initially called hist_update().) From the RX history API user perspective (CCID3 right now, others may come) it is adding a packet to the history. In the past it went to a list, now we have a ring and don't keep more than TFRC_NDUPACK + 1 entries, but this is an internal detail that users don't need to know. * I am also not so much concerned about credit. As long as the (changed) end result is as least as good as or hopefully better than the submitted input, the author name is a side issue; so I do think that your approach is sound and fair. The only place where credits are needed is for the SatSix project (www.ist-satsix.org) which funded this work. This is why some of the
Re: [PATCH v2 0/3][BUG-FIX]: Test tree updates and bug fixes
Em Wed, Dec 05, 2007 at 02:53:09PM +, Gerrit Renker escreveu: | Thanks, I folded this into the reorganized RX history handling patch, | together with reverting ccid3_hc_rx_packet_recv to very close to your | original patch, with this changes: | | 1. no need to calculate the payload size for non data packets as this |value won't be used. | 2. Initialize hcrx-ccid3hcrx_bytes_recv with the payload size when |hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA. | 3. tfrc_rx_hist_duplicate() is only called when ccid3hcrx_state != |TFRC_RSTATE_NO_DATA, so it doesn't need to goto the done_receiving |label (that was removed as this was its only use) as do_feedback |would always be CCID3_FBACK_NONE and so the test would always fail |and no feedback would be sent, so just return right there. | | Now it reads: | | static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) | { | struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); | enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; | const u32 ndp = dccp_sk(sk)-dccps_options_received.dccpor_ndp; | const bool is_data_packet = dccp_data_packet(skb); | | if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { | if (is_data_packet) { | const u32 payload = skb-len - dccp_hdr(skb)-dccph_doff * 4; | do_feedback = FBACK_INITIAL; | ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); | hcrx-ccid3hcrx_s = | hcrx-ccid3hcrx_bytes_recv = payload_size; == Please see other email regarding bytes_recv, but I think you already got that. Maybe one could then write hcrx-ccid3hcrx_s = skb-len - dccp_hdr(skb)-dccph_doff * 4; OK, I got that. | } | goto update_records; | } | | if (tfrc_rx_hist_duplicate(hcrx-ccid3hcrx_hist, skb)) | return; /* done receiving */ | | if (is_data_packet) { | const u32 payload = skb-len - dccp_hdr(skb)-dccph_doff * 4; | /* | * Update moving-average of s and the sum of received payload bytes | */ | hcrx-ccid3hcrx_s = tfrc_ewma(hcrx-ccid3hcrx_s, payload_size, 9); | hcrx-ccid3hcrx_bytes_recv += payload_size; | } | | /* | * Handle pending losses and otherwise check for new loss | */ | if (tfrc_rx_hist_new_loss_indicated(hcrx-ccid3hcrx_hist, skb, ndp)) | goto update_records; | | /* | * Handle data packets: RTT sampling and monitoring p | */ | if (unlikely(!is_data_packet)) | goto update_records; | | if (list_empty(hcrx-ccid3hcrx_li_hist)) { /* no loss so far: p = 0 */ | const u32 sample = tfrc_rx_hist_sample_rtt(hcrx-ccid3hcrx_hist, skb); == If you like, you could add the original comment here that p=0 if no loss occured, i.e. /* * Empty loss history: no loss so far, hence p stays 0. * Sample RTT values, since an RTT estimate is required for the * computation of p when the first loss occurs; RFC 3448, 6.3.1. */ done | if (sample != 0) | hcrx-ccid3hcrx_rtt = tfrc_ewma(hcrx-ccid3hcrx_rtt, sample, 9); | } | | /* | * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 | */ | if (SUB16(dccp_hdr(skb)-dccph_ccval, hcrx-ccid3hcrx_last_counter) 3) | do_feedback = CCID3_FBACK_PERIODIC; | | update_records: | tfrc_rx_hist_add_packet(hcrx-ccid3hcrx_hist, skb, ndp); | == here a jump label is missing. It is not needed by this patch and above you have replaced it with a return + comment, but it is needed in a later patch: when a new loss event occurs, the control jumps to `done_receiving' and sends a feedback packet with type FBACK_PARAM_CHANGE. OK, I was wondering that the user for FBACK_PARAM_CHANGE in ccid3_hc_rx_send_feedback was in another patch. done_receiving: Ok, we can add the jump label when we make use of it | if (do_feedback) | ccid3_hc_rx_send_feedback(sk, skb, do_feedback); | } | | Now to some questions and please bear with me as I haven't got to the | patches after this: | | tfrc_rx_hist-loss_count as of now is a boolean, my understanding is | that you are counting loss events, i.e. it doesn't matter in: | It is not a boolean, but uses a hidden trick which maybe should be commented: * here and in the TCP world NDUPACK = 3 * hence the bitfield size for loss_count is 2 bits, which can express at most 3 = NDUPACK (that is why it is declared as loss_count:2) * the trick is that when the loss count increases beyond 3, it automatically cycles back to 0 (although the code does not rely on that features and does this
Re: [PATCH v2 0/3][BUG-FIX]: Test tree updates and bug fixes
Em Wed, Dec 05, 2007 at 01:55:11PM +, Gerrit Renker escreveu: | @@ -788,8 +782,8 @@ static void ccid3_hc_rx_packet_recv(stru |if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { |if (is_data_packet) { |do_feedback = FBACK_INITIAL; | + hcrx-ccid3hcrx_s = payload_size; |ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); | - ccid3_hc_rx_update_s(hcrx, payload_size); | | We have to set ccid3hcrx_bytes_recv to the payload_size here too, I'm | fixing this on the reworked patch that introduces the RX history. | I almost did the same error again by wanting to agree too prematurely. But updating ccid3hcrx_bytes_recv is in fact not needed here and if it would be done it would not have a useable effect. The reason is that the first data packet will trigger the initial feedback; and in the initial feedback packet X_recv (which is ccid3hcrx_bytes_recv / the_time_spent) is set to 0 (RFC 3448, 6.3). For this reason, updating bytes_recv for the first data packet is also not in the flowchart on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid3_packet_reception/ OK, I will add a comment on the code stating why it is not needed so that new people don't commit the same mistake again. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/3][BUG-FIX]: Test tree updates and bug fixes
Em Wed, Dec 05, 2007 at 11:19:44AM +, Gerrit Renker escreveu: This revision fixes a bug present in the per-socket allocation of RX history entries; identification of this bug is thanks to Arnaldo Carvalho de Melo. The bug was in not deallocating history entries when the allocation of one array element failed. The solution in this revised patch set is the original one written by Arnaldo. Ok, this one I posted with some other comments and explanations about changes other than namespacing, renaming of some functions and removal of functions not used in this patch. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/3][BUG-FIX]: Test tree updates and bug fixes
Em Wed, Dec 05, 2007 at 11:19:45AM +, Gerrit Renker escreveu: This fixes a problem in the initial revision of the patch: The loss interval history was not de-allocated when the initialisation of the packet history failed. The identification of this problem is also thanks due to Arnaldo. The interdiff to the previous revision is: --- b/net/dccp/ccids/lib/tfrc_module.c +++ b/net/dccp/ccids/lib/tfrc_module.c @@ -26,7 +26,12 @@ if (rc == 0) rc = packet_history_init(); + if (rc == 0) + goto out; +out_free_loss_intervals: + dccp_li_exit(); +out: return rc; } Ok, this one is kept on the series I have pending submission. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCHES 0/7]: Reorganization of RX history patches
Note: removed Ingo from the CC list, I had added it first just because he advocated reducing the number of mailing lists, so I wanted him to know that we're trying to do that. Em Wed, Dec 05, 2007 at 10:27:36AM +, Gerrit Renker escreveu: Today being Wednesday, below is some feedback after working through the patch set. Hope this is helpful. Patch #1: - Several new good points are introduced: - IP_DCCP_TFRC_DEBUG is made dependent on IP_DCCP_TFRC_DEBUG which is useful - the select from CONFIG_IP_DCCP_CCID3 = CONFIG_DCCP_TFRC_LIB - the cleanup action in tfrc_module_init() (when packet_history_init() fails) was previously missing, this is a good catch. Also a note: tfrc_pr_debug() is not currently used (but may be later should the code common to both CCID3 and CCID4 be shared). OK Patches #2/#6: -- Separated from main patch, no problems (were initially submitted in this format). I wonder whether switching back to smaller patch sizes is better? Patches that do one thing that is logically separated from others are preferred, as its analisys is made faster. Patches that rewrite an existing functionality are done best when the functions being replaced keep as much as possible the same name. For instance, in the new RX handling code we need to alloc the RX hist internal structures, in the previous implementation we also needed to do that, it also needs to purge entries, as the old one needed. Previous one was a linked list, the new one is a ring. As we go we can and should add new APIs when needed, but trying to keep an API so that if in the future we decide to rework the internal structures, this can be done in a plugin fashion, without changing too much its users (CCIDs in this case), so that we can get back to the old one with minor disruption to the users if needed. Sometimes we manage to do that, some times we need to improve upon the current API, but the goal remains the same. Patch #3: - Renames s/tfrc_tx_hist/tfrc_tx_hist_slab/g; no problems. This was for consistency with the other slabs in DCCP: [EMAIL PROTECTED] net-2.6.25]$ find net/dccp/ -name *.c | xargs grep 'struct kmem_cache' net/dccp/ccids/lib/packet_history.c:static struct kmem_cache *tfrc_tx_hist_slab; net/dccp/ccids/lib/packet_history.c:static struct kmem_cache *tfrc_rx_hist_slab; net/dccp/ccids/lib/loss_interval.c:static struct kmem_cache *dccp_li_cachep __read_mostly; net/dccp/ackvec.c:static struct kmem_cache *dccp_ackvec_slab; net/dccp/ackvec.c:static struct kmem_cache *dccp_ackvec_record_slab; net/dccp/ccid.c:struct kmem_cache *slab; net/dccp/ccid.c:static void ccid_kmem_cache_destroy(struct kmem_cache *slab) [EMAIL PROTECTED] net-2.6.25]$ Patch #4: - packet_history_init() initialises both RX and TX history and is later called by the module_init() function in net/dccp/ccids/lib/tfrc.c. Just a suggestion, it is possible to simplify this further, by doing all the initialisation / cleanup in tfrc.c: int __init {rx,tx}_packet_history_init() { tfrc_{rx,tx}_hist_slab = kmem_cache_create(tfrc_{rx,tx}_hist, ...); return tfrc_{rx,tx}_hist_slab == NULL ? - ENOBUFS : 0; } and then call these successively in tfrc_module_init(). Idea here was to have each C source file to have a init module. Perhaps we should try to break packet_history.c into tx_packet_history and rx_packet_history.c. We can do that later to try to meet the goal of being able to see what is being replaced. Patch #5: - The naming scheme introduced here is s/dccp_rx/tfrc_rx/g; s/dccphrx/tfrchrx/g; I wonder, while at it, would it be possible to extend this and extend this to other parts of the code? Basically this is the same discussion as earlier on [EMAIL PROTECTED] with Leandro, who first came up with this naming scheme. There the same question came up and the result of the discussion was that a prefix of `tfrchrx' is cryptic; if something simpler is possible then it would be great. As we did with the ackvecs, will do that later. Patch #7: - * ccid3_hc_rx_detect_loss() can be fully removed since no other code references it any further. I left it to try have the diff not mixing up removal of its implementation with the implementation of the function just after it, ccid3_hx_rx_packet_recv. Will remove it later. * bytes_recv also counts the payload_size's of non-data packets, which is wrong (it should only sum up the sum of bytes transferred as data packets) As said to you in private message I'll get back to the way you wrote, as you mentioned that upcoming patches will make good use of that. I'll try to restrain me to not do too many changes. * loss handling is not correctly taken care of: unlike in the other part, both data and non-data packets are used to detect loss (this is
Re: [PATCH v2 0/3][BUG-FIX]: Test tree updates and bug fixes
Em Wed, Dec 05, 2007 at 11:19:46AM +, Gerrit Renker escreveu: This patch removes the following redundancies: * ccid3_hc_rx_update_s() is only called for data packets (that is what it should be called for); * each call to ccid3_hc_rx_update_s() is wrapped inside a if (is_data_packet) test'; * therefore testing each time if len 0 is redundant (pointed out by Arnaldo); * no other code calls this function, hence the inline function can go. Also replaced the first call to updating s with direct assignment of `payload_size'; this has also been pointed out by Arnaldo. Thanks, I folded this into the reorganized RX history handling patch, together with reverting ccid3_hc_rx_packet_recv to very close to your original patch, with this changes: 1. no need to calculate the payload size for non data packets as this value won't be used. 2. Initialize hcrx-ccid3hcrx_bytes_recv with the payload size when hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA. 3. tfrc_rx_hist_duplicate() is only called when ccid3hcrx_state != TFRC_RSTATE_NO_DATA, so it doesn't need to goto the done_receiving label (that was removed as this was its only use) as do_feedback would always be CCID3_FBACK_NONE and so the test would always fail and no feedback would be sent, so just return right there. Now it reads: static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u32 ndp = dccp_sk(sk)-dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { if (is_data_packet) { const u32 payload = skb-len - dccp_hdr(skb)-dccph_doff * 4; do_feedback = FBACK_INITIAL; ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); hcrx-ccid3hcrx_s = hcrx-ccid3hcrx_bytes_recv = payload_size; } goto update_records; } if (tfrc_rx_hist_duplicate(hcrx-ccid3hcrx_hist, skb)) return; /* done receiving */ if (is_data_packet) { const u32 payload = skb-len - dccp_hdr(skb)-dccph_doff * 4; /* * Update moving-average of s and the sum of received payload bytes */ hcrx-ccid3hcrx_s = tfrc_ewma(hcrx-ccid3hcrx_s, payload_size, 9); hcrx-ccid3hcrx_bytes_recv += payload_size; } /* * Handle pending losses and otherwise check for new loss */ if (tfrc_rx_hist_new_loss_indicated(hcrx-ccid3hcrx_hist, skb, ndp)) goto update_records; /* * Handle data packets: RTT sampling and monitoring p */ if (unlikely(!is_data_packet)) goto update_records; if (list_empty(hcrx-ccid3hcrx_li_hist)) { /* no loss so far: p = 0 */ const u32 sample = tfrc_rx_hist_sample_rtt(hcrx-ccid3hcrx_hist, skb); if (sample != 0) hcrx-ccid3hcrx_rtt = tfrc_ewma(hcrx-ccid3hcrx_rtt, sample, 9); } /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ if (SUB16(dccp_hdr(skb)-dccph_ccval, hcrx-ccid3hcrx_last_counter) 3) do_feedback = CCID3_FBACK_PERIODIC; update_records: tfrc_rx_hist_add_packet(hcrx-ccid3hcrx_hist, skb, ndp); if (do_feedback) ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } Now to some questions and please bear with me as I haven't got to the patches after this: tfrc_rx_hist-loss_count as of now is a boolean, my understanding is that you are counting loss events, i.e. it doesn't matter in: /* any data packets missing between last reception and skb ? */ int tfrc_rx_hist_new_loss_indicated(struct tfrc_rx_hist *h, const struct sk_buff *skb, u32 ndp) { int delta = dccp_delta_seqno(tfrc_rx_hist_last_rcv(h)-tfrchrx_seqno, DCCP_SKB_CB(skb)-dccpd_seq); if (delta 1 ndp delta) tfrc_rx_hist_loss_indicated(h); return tfrc_rx_hist_loss_pending(h); } if (delta - ndp) is 1, i.e. tfrc_rx_hist-loss_count only indicates that there was loss. But then in other parts of the code it assumes it can be more than 1: /** * tfrc_rx_hist - RX history structure for TFRC-based protocols * * @ring: Packet history for RTT sampling and loss detection * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev:Used during RTT sampling, points to candidate entry */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry
Re: [RFC][PATCHES 0/7]: Reorganization of RX history patches
Em Mon, Dec 03, 2007 at 08:35:12AM +, Gerrit Renker escreveu: Hi Arnaldo, hank you for going through this. I have just backported your recent patches of 2.6.25 to the DCCP/CCID4/Faster Restart test tree at git://eden-feed.erg.abdn.ac.uk/dccp_exp {dccp,ccid4,dccp_fr} as per subsequent message. | do, so please consider moving DCCP discussion to [EMAIL PROTECTED], | where lots of smart networking folks are present and can help our efforts | on turning RFCs to code. Are you suggesting using netdev exclusively or in addition to [EMAIL PROTECTED] Well, since at least one person that has contributed significantly in the past has said he can't cope with traffic on netdev, we can CC [EMAIL PROTECTED] | Please take a look at this patch series where I reorganized your work on the new | TFRC rx history handling code. I'll wait for your considerations and then do as many | interactions as reasonable to get your work merged. | | It should be completely equivalent, plus some fixes and optimizations, such as: It will be necessary to address these points one-by-one. Before diving into making fixes and `optimisations', have you tested your code? The patches you are referring to I have performed basic tests, and when doing so noticed a bug in inet_diag, that I commented with Herbert Xu and he has since provided a fix. have been posted and re-posted for a period of over 9 months on [EMAIL PROTECTED], and Being posted and re-posted does not guarantee that the patch is OK or that is in a form that is acceptable to all tree maintainers. DaveM is subscribed to [EMAIL PROTECTED] and could have picked this if he had the time to do the review. I didn't, but now I'm trying to spend my time on reviewing your patches and in the process doing modifications I find appropriate while trying hard not to introduce bugs in your hard work to get it merged. there are regression tests which show that this code improves on the existing Linux implementation. These are labelled as `test tree' on http://www.linux-foundation.org/en/Net:DCCP_Testing#Regression_testing So if you are making changes to the code, I would like to ask if you have run similar regression tests, to avoid having to step back later. Fair enough, I will do that before asking Herbert or Dave to pull from my tree. | . The code that allocates the RX ring deals with failures when one of the entries in | the ring buffer is not successfully allocated, the original code was leaking the | successfully allocated entries. Sorry for not point out exactly this, here it goes: Your original patch: +int tfrc_rx_hist_init(struct tfrc_rx_hist *h) +{ + int i; + + for (i = 0; i = NDUPACK; i++) { + h-ring[i] = kmem_cache_alloc(tfrc_rxh_cache, GFP_ATOMIC); + if (h-ring[i] == NULL) + return 1; + } + h-loss_count = 0; + h-loss_start = 0; + return 0; +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); Then, in ccid3_hc_rx_init you do: static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); ccid3_pr_debug(entry\n); hcrx-ccid3hcrx_state = TFRC_RSTATE_NO_DATA; tfrc_lh_init(hcrx-ccid3hcrx_li_hist); return tfrc_rx_hist_init(hcrx-ccid3hcrx_hist); } So if tfrc_rx_hist_init fail to allocate, say, the last entry in the ring, it will return 1, and from looking at how you initialize h-loss_{count,start} to zero I assumed that the whole tfrc_rx_hist is not zeroed when tfrc_rx_hist_init is called, so one can also assume that the ring entries are not initialized to NULL and that if the error recovery is to assume that later on tfrc_rx_hist_cleanup is called we would not have a leak, but an OOPS when tfrc_rx_hist_cleanup tries to call kfree_cache_free on the uninitialized ring entries. But if you look at ccid_new(), that calls ccid3_hc_rx_init(), you'll see that when the ccid rx or hx routine fails, it just frees the struct ccid with the area where h-ring is, so, what was not cleaned up by the ccid init routine is effectively forgot, leaked. I first did the cleanup at tfrc_rx_hist_init (that I renamed to tfrc_rx_hist_alloc, since it doesn't just initializes things, but allocates entries from slab), but then I just made the rx history slab have arrays of tfrc_rx_hist_entry objects, not individual ones as your code always allocates them as arrays. | . We do just one allocation for the ring buffer, as the number of entries is fixed we | should just do one allocation and not TFRC_NDUPACK times. Will look at the first point in the patch; with regard to the second point I agree, this will make the code simpler, which is good. good | . I haven't checked if all the code was commited, as I tried to introduce just what was | immediatelly used, probably we'll need to do some changes when working on the merge | of
Re: [RFC][PATCHES 0/7]: Reorganization of RX history patches
Em Mon, Dec 03, 2007 at 01:49:47PM +, Gerrit Renker escreveu: | Are you suggesting using netdev exclusively or in addition to [EMAIL PROTECTED] | | Well, since at least one person that has contributed significantly in | the past has said he can't cope with traffic on netdev, we can CC | dccp@vger.kernel.org I have a similar problem with the traffic but agree and will copy as well. | have been posted and re-posted for a period of over 9 months on [EMAIL PROTECTED], and | | Being posted and re-posted does not guarantee that the patch is OK or | that is in a form that is acceptable to all tree maintainers. With the first point I couldn't agree more, but this is not really what I meant - the point was that despite posting and re-posting there was often silence. And now there is feedback, in form of a patchset made by you; and all that I am asking for is just to be given the time to look that through. Last time a RFC patch appeared at 3pm and was checked in the same evening (only found out next morning). I was too optimistic about that one, feeling that it was safe, sorry about that, will avoid doing that in the future. With your experience and long background as a maintainer maybe you expect quicker turn-around times, but I also had waited patiently until you had had a chance to review the stuff I sent. Agreed, my bad, will be more patient with my side as you've been with yours :-) | | . The code that allocates the RX ring deals with failures when one of the entries in | | the ring buffer is not successfully allocated, the original code was leaking the | | successfully allocated entries. | | Sorry for not point out exactly this, here it goes: | | Your original patch: | | +int tfrc_rx_hist_init(struct tfrc_rx_hist *h) | +{ | + int i; | + | + for (i = 0; i = NDUPACK; i++) { | + h-ring[i] = kmem_cache_alloc(tfrc_rxh_cache, GFP_ATOMIC); | + if (h-ring[i] == NULL) | + return 1; | + } | + h-loss_count = 0; | + h-loss_start = 0; | + return 0; | +} | +EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); | I expected this and it actually was very clear from your original message. I fully back up your solution in this point, see below. But my question above was rather: are there any other bugs rather than the above leakage, which is what the previous email seemed to indicate? With regard to your solution - you are using int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { h-ring = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); h-loss_count = h-loss_start = 0; return h-ring == NULL; } which is better not only for one but for two reasons. It solves the leakage and in addition makes the entire code simpler. Fully agreed. good | | | . I haven't checked if all the code was commited, as I tried to introduce just what was | | immediatelly used, probably we'll need to do some changes when working on the merge | | of your loss intervals code. | Sorry I don't understand this point. | | Let me check now and tell you for sure: | | tfrc_rx_hist_delta_seqno and tfrc_rx_hist_swap were not included, as | they were not used, we should introduce them later, when getting to the | working on the loss interval code. Ah thanks, that was really not clear. Just beginning to work through the set. great |static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) |{ |// ... |u32 sample, payload_size = skb-len - dccp_hdr(skb)-dccph_doff * 4; | |if (unlikely(hcrx-ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { |if (is_data_packet) { |do_feedback = FBACK_INITIAL; |ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); |ccid3_hc_rx_update_s(hcrx, payload_size); |} |goto update_records; |} | | == Non-data packets are ignored for the purposes of computing s (this is in the RFC), | consequently update_s() is only called for data packets; using the two following | functions: | | |static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) |{ |return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval; |} | | I hadn't considered that tfrc_ewma would for every packet check if the | avg was 0 and I find it suboptimal now that I look at it, we are just | feeding data packets, no? Yes exactly, only data packets are used for s. |static inline void ccid3_hc_rx_update_s(struct ccid3_hc_rx_sock *hcrx, int len) |{ |if (likely(len 0))/* don't update on empty packets (e.g. ACKs) */ |hcrx-ccid3hcrx_s = tfrc_ewma(hcrx-ccid3hcrx_s, len, 9); |} | | And we
[PATCH 6/7] [CCID3]: The receiver of a half-connection does not set window counter values
From: Gerrit Renker [EMAIL PROTECTED] Only the sender sets window counters [RFC 4342, sections 5 and 8.1]. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index c95dca8..5ff5aab 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -733,7 +733,6 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) return 0; hcrx = ccid3_hc_rx_sk(sk); - DCCP_SKB_CB(skb)-dccpd_ccval = hcrx-ccid3hcrx_ccval_last_counter; if (dccp_packet_without_ack(skb)) return 0; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] [TFRC]: Provide central source file and debug facility
From: Gerrit Renker [EMAIL PROTECTED] This patch changes the tfrc_lib module in the following manner: (1) a dedicated tfrc source file to call the packet history loss interval init/exit functions. (2) a dedicated tfrc_pr_debug macro with toggle switch `tfrc_debug'. Commiter note: renamed tfrc_module.c to tfrc.c, and made CONFIG_IP_DCCP_CCID3 select IP_DCCP_TFRC_LIB. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/Kconfig | 13 ++--- net/dccp/ccids/lib/Makefile |2 +- net/dccp/ccids/lib/packet_history.c | 27 ++- net/dccp/ccids/lib/packet_history.h |3 +- net/dccp/ccids/lib/tfrc.c | 48 +++ net/dccp/ccids/lib/tfrc.h | 17 +--- 6 files changed, 75 insertions(+), 35 deletions(-) create mode 100644 net/dccp/ccids/lib/tfrc.c diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 3d7d867..1227594 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -38,6 +38,7 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 tristate CCID3 (TCP-Friendly) (EXPERIMENTAL) def_tristate IP_DCCP + select IP_DCCP_TFRC_LIB ---help--- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to @@ -63,10 +64,6 @@ config IP_DCCP_CCID3 If in doubt, say M. -config IP_DCCP_TFRC_LIB - depends on IP_DCCP_CCID3 - def_tristate IP_DCCP_CCID3 - config IP_DCCP_CCID3_DEBUG bool CCID3 debugging messages depends on IP_DCCP_CCID3 @@ -110,5 +107,13 @@ config IP_DCCP_CCID3_RTO is serious network congestion: experimenting with larger values should therefore not be performed on WANs. +config IP_DCCP_TFRC_LIB + tristate + default n + +config IP_DCCP_TFRC_DEBUG + bool + depends on IP_DCCP_TFRC_LIB + default y if IP_DCCP_CCID3_DEBUG endmenu diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile index 5f940a6..68c93e3 100644 --- a/net/dccp/ccids/lib/Makefile +++ b/net/dccp/ccids/lib/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o -dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o +dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 4805de9..1d4d6ee 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -35,7 +35,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include linux/module.h #include linux/string.h #include packet_history.h @@ -277,39 +276,19 @@ void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); -extern int __init dccp_li_init(void); -extern void dccp_li_exit(void); - -static __init int packet_history_init(void) +__init int packet_history_init(void) { - if (dccp_li_init() != 0) - goto out; - tfrc_tx_hist = kmem_cache_create(tfrc_tx_hist, sizeof(struct tfrc_tx_hist_entry), 0, SLAB_HWCACHE_ALIGN, NULL); - if (tfrc_tx_hist == NULL) - goto out_li_exit; - return 0; -out_li_exit: - dccp_li_exit(); -out: - return -ENOBUFS; + return tfrc_tx_hist == NULL ? -ENOBUFS : 0; } -module_init(packet_history_init); -static __exit void packet_history_exit(void) +void packet_history_exit(void) { if (tfrc_tx_hist != NULL) { kmem_cache_destroy(tfrc_tx_hist); tfrc_tx_hist = NULL; } - dccp_li_exit(); } -module_exit(packet_history_exit); - -MODULE_AUTHOR(Ian McDonald [EMAIL PROTECTED], - Arnaldo Carvalho de Melo [EMAIL PROTECTED]); -MODULE_DESCRIPTION(DCCP TFRC library); -MODULE_LICENSE(GPL); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 0670f46..9a2642e 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -39,8 +39,7 @@ #include linux/ktime.h #include linux/list.h #include linux/slab.h - -#include ../../dccp.h +#include tfrc.h /* Number of later packets received before one is considered lost */ #define TFRC_RECV_NUM_LATE_LOSS 3 diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c new file mode 100644 index 000..3a7a183 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.c @@ -0,0 +1,48 @@ +/* + * TFRC: main module holding the pieces of the TFRC library together + * + * Copyright (c) 2007 The University of Aberdeen, Scotland, UK + * Copyright (c) 2007 Arnaldo Carvalho de Melo [EMAIL PROTECTED] + */ +#include linux
[PATCH 2/7] [DCCP]: Introduce generic function to test for `data packets'
From: Gerrit Renker [EMAIL PROTECTED] as per RFC 4340, sec. 7.7. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/dccp.h | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index ee97950..f4a5ea1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -334,6 +334,7 @@ struct dccp_skb_cb { #define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)((__skb)-cb[0])) +/* RFC 4340, sec. 7.7 */ static inline int dccp_non_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)-dccpd_type; @@ -346,6 +347,17 @@ static inline int dccp_non_data_packet(const struct sk_buff *skb) type == DCCP_PKT_SYNCACK; } +/* RFC 4340, sec. 7.7 */ +static inline int dccp_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)-dccpd_type; + + return type == DCCP_PKT_DATA || + type == DCCP_PKT_DATAACK || + type == DCCP_PKT_REQUEST || + type == DCCP_PKT_RESPONSE; +} + static inline int dccp_packet_without_ack(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)-dccpd_type; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] [TFRC]: Rename dccp_rx_ to tfrc_rx_
This is in preparation for merging the new rx history code written by Gerrit Renker. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 32 ++-- net/dccp/ccids/lib/loss_interval.c | 14 +++--- net/dccp/ccids/lib/packet_history.c | 90 +- net/dccp/ccids/lib/packet_history.h | 48 +- 4 files changed, 92 insertions(+), 92 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 07920bb..c95dca8 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -677,7 +677,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - struct dccp_rx_hist_entry *packet; + struct tfrc_rx_hist_entry *packet; ktime_t now; suseconds_t delta; @@ -701,7 +701,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) return; } - packet = dccp_rx_hist_find_data_packet(hcrx-ccid3hcrx_hist); + packet = tfrc_rx_hist_find_data_packet(hcrx-ccid3hcrx_hist); if (unlikely(packet == NULL)) { DCCP_WARN(%s(%p), no data packet in history!\n, dccp_role(sk), sk); @@ -709,7 +709,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) } hcrx-ccid3hcrx_tstamp_last_feedback = now; - hcrx-ccid3hcrx_ccval_last_counter = packet-dccphrx_ccval; + hcrx-ccid3hcrx_ccval_last_counter = packet-tfrchrx_ccval; hcrx-ccid3hcrx_bytes_recv = 0; if (hcrx-ccid3hcrx_p == 0) @@ -752,12 +752,12 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) } static int ccid3_hc_rx_detect_loss(struct sock *sk, - struct dccp_rx_hist_entry *packet) + struct tfrc_rx_hist_entry *packet) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - struct dccp_rx_hist_entry *rx_hist = - dccp_rx_hist_head(hcrx-ccid3hcrx_hist); - u64 seqno = packet-dccphrx_seqno; + struct tfrc_rx_hist_entry *rx_hist = + tfrc_rx_hist_head(hcrx-ccid3hcrx_hist); + u64 seqno = packet-tfrchrx_seqno; u64 tmp_seqno; int loss = 0; u8 ccval; @@ -766,9 +766,9 @@ static int ccid3_hc_rx_detect_loss(struct sock *sk, tmp_seqno = hcrx-ccid3hcrx_seqno_nonloss; if (!rx_hist || - follows48(packet-dccphrx_seqno, hcrx-ccid3hcrx_seqno_nonloss)) { + follows48(packet-tfrchrx_seqno, hcrx-ccid3hcrx_seqno_nonloss)) { hcrx-ccid3hcrx_seqno_nonloss = seqno; - hcrx-ccid3hcrx_ccval_nonloss = packet-dccphrx_ccval; + hcrx-ccid3hcrx_ccval_nonloss = packet-tfrchrx_ccval; goto detect_out; } @@ -789,7 +789,7 @@ static int ccid3_hc_rx_detect_loss(struct sock *sk, dccp_inc_seqno(tmp_seqno); hcrx-ccid3hcrx_seqno_nonloss = tmp_seqno; dccp_inc_seqno(tmp_seqno); - while (dccp_rx_hist_find_entry(hcrx-ccid3hcrx_hist, + while (tfrc_rx_hist_find_entry(hcrx-ccid3hcrx_hist, tmp_seqno, ccval)) { hcrx-ccid3hcrx_seqno_nonloss = tmp_seqno; hcrx-ccid3hcrx_ccval_nonloss = ccval; @@ -799,13 +799,13 @@ static int ccid3_hc_rx_detect_loss(struct sock *sk, /* FIXME - this code could be simplified with above while */ /* but works at moment */ - if (follows48(packet-dccphrx_seqno, hcrx-ccid3hcrx_seqno_nonloss)) { + if (follows48(packet-tfrchrx_seqno, hcrx-ccid3hcrx_seqno_nonloss)) { hcrx-ccid3hcrx_seqno_nonloss = seqno; - hcrx-ccid3hcrx_ccval_nonloss = packet-dccphrx_ccval; + hcrx-ccid3hcrx_ccval_nonloss = packet-tfrchrx_ccval; } detect_out: - dccp_rx_hist_add_packet(hcrx-ccid3hcrx_hist, + tfrc_rx_hist_add_packet(hcrx-ccid3hcrx_hist, hcrx-ccid3hcrx_li_hist, packet, hcrx-ccid3hcrx_seqno_nonloss); return loss; @@ -815,7 +815,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); const struct dccp_options_received *opt_recv; - struct dccp_rx_hist_entry *packet; + struct tfrc_rx_hist_entry *packet; u32 p_prev, r_sample, rtt_prev; int loss, payload_size; ktime_t now; @@ -850,7 +850,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) return; } - packet = dccp_rx_hist_entry_new(opt_recv-dccpor_ndp, skb, GFP_ATOMIC); + packet = tfrc_rx_hist_entry_new(opt_recv-dccpor_ndp, skb, GFP_ATOMIC); if (unlikely(packet == NULL
Re: [RFC]: tfrc_tx_hist_rtt
Em Fri, Nov 30, 2007 at 12:19:36PM +, Gerrit Renker escreveu: Sorry I only got this email today and it is a busy day, too. The changes look good and are in general a further improvement on the code. I like the idea of hiding the internals of the list structure in the source file. I wonder if one could go one step further and also take the timestamp directly when looking up the previous history sample, e.g. u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno) { u32 rtt = 0; struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); if (packet != NULL) { rtt = ktime_us_delta(ktime_get_real(), packet-stamp); /* * Garbage-collect older (irrelevant) entries: */ tfrc_tx_hist_purge(packet-next); } return rtt; } Just a suggestion. I thought about that, but ccid3_hc_tx_packet_recv needs a timestamp some lines below, when we reuse the ktime_get_real call. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC]: tfrc_tx_hist_rtt
Hi Gerrit, Coming back to hiding the TX history data structures, take a look at what I'm commiting to my tree: 1. struct tfrc_tx_hist_entry is completely hidden in net/dccp/ccids/lib/packet_history.c 2. I found tfrc_tx_hist_when confusing and returning two results, if it had found the ackno in the history and the timestamp, so I introduced tfrc_tx_hist_rtt instead, that returns 0 if the ackno was not found. Does this looks reasonable? - Arnaldo diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 2668de8..5dea690 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -399,7 +399,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv; - struct tfrc_tx_hist_entry *packet; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -414,19 +413,17 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (hctx-ccid3hctx_state) { case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: + now = ktime_get_real(); + /* estimate RTT from history if ACK number is valid */ - packet = tfrc_tx_hist_find_entry(hctx-ccid3hctx_hist, - DCCP_SKB_CB(skb)-dccpd_ack_seq); - if (packet == NULL) { + r_sample = tfrc_tx_hist_rtt(hctx-ccid3hctx_hist, + DCCP_SKB_CB(skb)-dccpd_ack_seq, now); + if (r_sample == 0) { DCCP_WARN(%s(%p): %s with bogus ACK-%llu\n, dccp_role(sk), sk, dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type), (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq); return; } - /* -* Garbage-collect older (irrelevant) entries -*/ - tfrc_tx_hist_purge(packet-next); /* Update receive rate in units of 64 * bytes/second */ hctx-ccid3hctx_x_recv = opt_recv-ccid3or_receive_rate; @@ -438,12 +435,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx-ccid3hctx_p = 0; else /* can not exceed 100% */ hctx-ccid3hctx_p = 100 / pinv; - - now = ktime_get_real(); /* -* Calculate new RTT sample and update moving average +* Validate new RTT sample and update moving average */ - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet-stamp)); + r_sample = dccp_sample_rtt(sk, r_sample); hctx-ccid3hctx_rtt = tfrc_ewma(hctx-ccid3hctx_rtt, r_sample, 9); if (hctx-ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 1397360..4805de9 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -39,12 +39,24 @@ #include linux/string.h #include packet_history.h +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ static struct kmem_cache *tfrc_tx_hist; -struct tfrc_tx_hist_entry * +static struct tfrc_tx_hist_entry * tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) { while (head != NULL head-seqno != seqno) @@ -52,7 +64,6 @@ struct tfrc_tx_hist_entry * return head; } -EXPORT_SYMBOL_GPL(tfrc_tx_hist_find_entry); int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { @@ -83,6 +94,24 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, +const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet-stamp); + /* +* Garbage-collect older (irrelevant) entries: +*/ + tfrc_tx_hist_purge(packet-next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + /* * Receiver History Routines */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 5c07182..0670f46 100644 ---
[PATCH 1/1][TFRC]: Hide tx history details from the CCIDs
Hi Herbert, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Thanks a lot! - Arnaldo commit f1ff09e7955a2cffa3dfdda8a63a1ce991f11a7b Author: Arnaldo Carvalho de Melo [EMAIL PROTECTED] Date: Thu Nov 29 22:47:15 2007 -0200 [TFRC]: Hide tx history details from the CCIDs Based on a previous patch by Gerrit Renker. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 2668de8..5dea690 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -399,7 +399,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv; - struct tfrc_tx_hist_entry *packet; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -414,19 +413,17 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (hctx-ccid3hctx_state) { case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: + now = ktime_get_real(); + /* estimate RTT from history if ACK number is valid */ - packet = tfrc_tx_hist_find_entry(hctx-ccid3hctx_hist, - DCCP_SKB_CB(skb)-dccpd_ack_seq); - if (packet == NULL) { + r_sample = tfrc_tx_hist_rtt(hctx-ccid3hctx_hist, + DCCP_SKB_CB(skb)-dccpd_ack_seq, now); + if (r_sample == 0) { DCCP_WARN(%s(%p): %s with bogus ACK-%llu\n, dccp_role(sk), sk, dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type), (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq); return; } - /* -* Garbage-collect older (irrelevant) entries -*/ - tfrc_tx_hist_purge(packet-next); /* Update receive rate in units of 64 * bytes/second */ hctx-ccid3hctx_x_recv = opt_recv-ccid3or_receive_rate; @@ -438,12 +435,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx-ccid3hctx_p = 0; else /* can not exceed 100% */ hctx-ccid3hctx_p = 100 / pinv; - - now = ktime_get_real(); /* -* Calculate new RTT sample and update moving average +* Validate new RTT sample and update moving average */ - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet-stamp)); + r_sample = dccp_sample_rtt(sk, r_sample); hctx-ccid3hctx_rtt = tfrc_ewma(hctx-ccid3hctx_rtt, r_sample, 9); if (hctx-ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 1397360..4805de9 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -39,12 +39,24 @@ #include linux/string.h #include packet_history.h +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ static struct kmem_cache *tfrc_tx_hist; -struct tfrc_tx_hist_entry * +static struct tfrc_tx_hist_entry * tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) { while (head != NULL head-seqno != seqno) @@ -52,7 +64,6 @@ struct tfrc_tx_hist_entry * return head; } -EXPORT_SYMBOL_GPL(tfrc_tx_hist_find_entry); int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { @@ -83,6 +94,24 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, +const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet-stamp); + /* +* Garbage-collect older (irrelevant) entries: +*/ + tfrc_tx_hist_purge(packet-next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + /* * Receiver History Routines */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 5c07182..0670f46 100644 --- a/net/dccp/ccids/lib
Re: [PATCH 1/4]: Use AF-independent rebuild_header routine
Em Wed, Nov 28, 2007 at 08:35:08AM +, Gerrit Renker escreveu: [DCCP]: Use AF-independent rebuild_header routine This fixes a nasty bug: dccp_send_reset() is called by both DCCPv4 and DCCPv6, but uses inet_sk_rebuild_header() in each case. This leads to unpredictable and weird behaviour: under some conditions, DCCPv6 Resets were sent, in other not. The fix is to use the AF-independent rebuild_header routine. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, applied. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4]: Remove duplicate test for CloseReq
Em Wed, Nov 28, 2007 at 08:35:11AM +, Gerrit Renker escreveu: [DCCP]: Remove duplicate test for CloseReq This removes a redundant test for unexpected packet types. In dccp_rcv_state_process it is tested twice whether a DCCP-server has received a CloseReq (Step 7): * first in the combined if-statement, * then in the call to dccp_rcv_closereq(). The latter is necesssary since dccp_rcv_closereq() is also called from __dccp_rcv_established(). This patch removes the duplicate test. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, applied. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4]: Integrate state transitions for passive-close
Em Wed, Nov 28, 2007 at 08:35:10AM +, Gerrit Renker escreveu: [DCCP]: Integrate state transitions for passive-close This adds the necessary state transitions for the two forms of passive-close * PASSIVE_CLOSE- which is entered when a host receives a Close; * PASSIVE_CLOSEREQ - which is entered when a client receives a CloseReq. Here is a detailed account of what the patch does in each state. 1) Receiving CloseReq -- The pseudo-code in 8.5 says: Step 13: Process CloseReq If P.type == CloseReq and S.state CLOSEREQ, Generate Close S.state := CLOSING Set CLOSING timer. This means we need to address what to do in CLOSED, LISTEN, REQUEST, RESPOND, PARTOPEN, and OPEN. * CLOSED: silently ignore - it may be a late or duplicate CloseReq; * LISTEN/RESPOND: will not appear, since Step 7 is performed first (we know we are the client); * REQUEST:perform Step 13 directly (no need to enqueue packet); * OPEN/PARTOPEN: enter PASSIVE_CLOSEREQ so that the application has a chance to process unread data. When already in PASSIVE_CLOSEREQ, no second CloseReq is enqueued. In any other state, the CloseReq is ignored. I think that this offers some robustness against rare and pathological cases: e.g. a simultaneous close where the client sends a Close and the server a CloseReq. The client will then be retransmitting its Close until it gets the Reset, so ignoring the CloseReq while in state CLOSING is sane. 2) Receiving Close --- The code below from 8.5 is unconditional. Step 14: Process Close If P.type == Close, Generate Reset(Closed) Tear down connection Drop packet and return Thus we need to consider all states: * CLOSED: silently ignore, since this can happen when a retransmitted or late Close arrives; * LISTEN: dccp_rcv_state_process() will generate a Reset (No Connection); * REQUEST: perform Step 14 directly (no need to enqueue packet); * RESPOND: dccp_check_req() will generate a Reset (Packet Error) -- left it at that; * OPEN/PARTOPEN:enter PASSIVE_CLOSE so that application has a chance to process unread data; * CLOSEREQ: server performed active-close -- perform Step 14; * CLOSING: simultaneous-close: use a tie-breaker to avoid message ping-pong (see comment); * PASSIVE_CLOSEREQ: ignore - the peer has a bug (sending first a CloseReq and now a Close); * TIMEWAIT: packet is ignored. Note that the condition of receiving a packet in state CLOSED here is different from the condition there is no socket for such a connection: the socket still exists, but its state indicates it is unusable. Last, dccp_finish_passive_close sets either DCCP_CLOSED or DCCP_CLOSING = TCP_CLOSING, so that sk_stream_wait_close() will wait for the final Reset (which will trigger CLOSING = CLOSED). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Applied - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4]: Dedicated auxiliary states to support passive-close
Em Wed, Nov 28, 2007 at 08:35:09AM +, Gerrit Renker escreveu: [DCCP]: Dedicated auxiliary states to support passive-close This adds two auxiliary states to deal with passive closes: * PASSIVE_CLOSE(reached from OPEN via reception of Close)and * PASSIVE_CLOSEREQ (reached from OPEN via reception of CloseReq) as internal intermediate states. These states are used to allow a receiver to process unread data before acknowledging the received connection-termination-request (the Close/CloseReq). Without such support, it will happen that passively-closed sockets enter CLOSED state while there is still unprocessed data in the queue; leading to unexpected and erratic API behaviour. PASSIVE_CLOSE has been mapped into TCPF_CLOSE_WAIT, so that the code will seamlessly work with inet_accept() (which tests for this state). The state names are thanks to Arnaldo, who suggested this naming scheme following an earlier revision of this patch. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Thanks, applied. - Arnaldo - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TFRC]: Migrate TX history to singly-linked lis
This patch was based on another made by Gerrit Renker, his changelog was: -- The patch set migrates TFRC TX history to a singly-linked list. The details are: * use of a consistent naming scheme (all TFRC functions now begin with `tfrc_'); * allocation and cleanup are taken care of internally; * provision of a lookup function, which is used by the CCID TX infrastructure to determine the time a packet was sent (in turn used for RTT sampling); * integration of the new interface with the present use in CCID3. -- Simplifications I did: . removing the tfrc_tx_hist_head that had a pointer to the list head and another for the slabcache. . No need for creating a slabcache for each CCID that wants to use the TFRC tx history routines, create a single slabcache when the dccp_tfrc_lib module init routine is called. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid3.c | 57 -- net/dccp/ccids/ccid3.h |3 +- net/dccp/ccids/lib/loss_interval.c | 12 ++-- net/dccp/ccids/lib/packet_history.c | 138 --- net/dccp/ccids/lib/packet_history.h | 79 5 files changed, 102 insertions(+), 187 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 964ec91..2668de8 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,7 +49,6 @@ static int ccid3_debug; #define ccid3_pr_debug(format, a...) #endif -static struct dccp_tx_hist *ccid3_tx_hist; static struct dccp_rx_hist *ccid3_rx_hist; /* @@ -389,28 +388,18 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct dccp_tx_hist_entry *packet; ccid3_hc_tx_update_s(hctx, len); - packet = dccp_tx_hist_entry_new(ccid3_tx_hist, GFP_ATOMIC); - if (unlikely(packet == NULL)) { + if (tfrc_tx_hist_add(hctx-ccid3hctx_hist, dccp_sk(sk)-dccps_gss)) DCCP_CRIT(packet history - out of memory!); - return; - } - dccp_tx_hist_add_entry(hctx-ccid3hctx_hist, packet); - - packet-dccphtx_tstamp = ktime_get_real(); - packet-dccphtx_seqno = dccp_sk(sk)-dccps_gss; - packet-dccphtx_rtt= hctx-ccid3hctx_rtt; - packet-dccphtx_sent = 1; } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv; - struct dccp_tx_hist_entry *packet; + struct tfrc_tx_hist_entry *packet; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -425,16 +414,19 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (hctx-ccid3hctx_state) { case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - /* get packet from history to look up t_recvdata */ - packet = dccp_tx_hist_find_entry(hctx-ccid3hctx_hist, - DCCP_SKB_CB(skb)-dccpd_ack_seq); - if (unlikely(packet == NULL)) { - DCCP_WARN(%s(%p), seqno %llu(%s) doesn't exist - in history!\n, dccp_role(sk), sk, - (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq, - dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type)); + /* estimate RTT from history if ACK number is valid */ + packet = tfrc_tx_hist_find_entry(hctx-ccid3hctx_hist, + DCCP_SKB_CB(skb)-dccpd_ack_seq); + if (packet == NULL) { + DCCP_WARN(%s(%p): %s with bogus ACK-%llu\n, dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)-dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)-dccpd_ack_seq); return; } + /* +* Garbage-collect older (irrelevant) entries +*/ + tfrc_tx_hist_purge(packet-next); /* Update receive rate in units of 64 * bytes/second */ hctx-ccid3hctx_x_recv = opt_recv-ccid3or_receive_rate; @@ -451,7 +443,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* * Calculate new RTT sample and update moving average */ - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet-dccphtx_tstamp)); + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, packet-stamp)); hctx-ccid3hctx_rtt = tfrc_ewma(hctx-ccid3hctx_rtt, r_sample, 9); if (hctx
[PATCH 4/5] [DCCP]: Integrate state transitions for passive-close
From: Gerrit Renker [EMAIL PROTECTED] This adds the necessary state transitions for the two forms of passive-close * PASSIVE_CLOSE- which is entered when a host receives a Close; * PASSIVE_CLOSEREQ - which is entered when a client receives a CloseReq. Here is a detailed account of what the patch does in each state. 1) Receiving CloseReq The pseudo-code in 8.5 says: Step 13: Process CloseReq If P.type == CloseReq and S.state CLOSEREQ, Generate Close S.state := CLOSING Set CLOSING timer. This means we need to address what to do in CLOSED, LISTEN, REQUEST, RESPOND, PARTOPEN, and OPEN. * CLOSED: silently ignore - it may be a late or duplicate CloseReq; * LISTEN/RESPOND: will not appear, since Step 7 is performed first (we know we are the client); * REQUEST:perform Step 13 directly (no need to enqueue packet); * OPEN/PARTOPEN: enter PASSIVE_CLOSEREQ so that the application has a chance to process unread data. When already in PASSIVE_CLOSEREQ, no second CloseReq is enqueued. In any other state, the CloseReq is ignored. I think that this offers some robustness against rare and pathological cases: e.g. a simultaneous close where the client sends a Close and the server a CloseReq. The client will then be retransmitting its Close until it gets the Reset, so ignoring the CloseReq while in state CLOSING is sane. 2) Receiving Close The code below from 8.5 is unconditional. Step 14: Process Close If P.type == Close, Generate Reset(Closed) Tear down connection Drop packet and return Thus we need to consider all states: * CLOSED: silently ignore, since this can happen when a retransmitted or late Close arrives; * LISTEN: dccp_rcv_state_process() will generate a Reset (No Connection); * REQUEST: perform Step 14 directly (no need to enqueue packet); * RESPOND: dccp_check_req() will generate a Reset (Packet Error) -- left it at that; * OPEN/PARTOPEN:enter PASSIVE_CLOSE so that application has a chance to process unread data; * CLOSEREQ: server performed active-close -- perform Step 14; * CLOSING: simultaneous-close: use a tie-breaker to avoid message ping-pong (see comment); * PASSIVE_CLOSEREQ: ignore - the peer has a bug (sending first a CloseReq and now a Close); * TIMEWAIT: packet is ignored. Note that the condition of receiving a packet in state CLOSED here is different from the condition there is no socket for such a connection: the socket still exists, but its state indicates it is unusable. Last, dccp_finish_passive_close sets either DCCP_CLOSED or DCCP_CLOSING = TCP_CLOSING, so that sk_stream_wait_close() will wait for the final Reset (which will trigger CLOSING = CLOSED). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/dccp.h |1 - net/dccp/input.c | 88 + net/dccp/proto.c | 88 +- 3 files changed, 131 insertions(+), 46 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 8b3f9ad..312b989 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -260,7 +260,6 @@ enum dccp_state { }; #define DCCP_STATE_MASK 0x1f -#define DCCP_ACTION_FIN (17) enum { DCCPF_OPEN= TCPF_ESTABLISHED, diff --git a/net/dccp/input.c b/net/dccp/input.c index ef299fb..fe4b0fb 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -32,16 +32,56 @@ static void dccp_fin(struct sock *sk, struct sk_buff *skb) sk-sk_data_ready(sk, 0); } -static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) { - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_CLOSED); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + int queued = 0; + + switch (sk-sk_state) { + /* +* We ignore Close when received in one of the following states: +* - CLOSED(may be a late or duplicate packet) +* - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) +* - RESPOND (already handled by dccp_check_req) +*/ + case DCCP_CLOSING: + /* +* Simultaneous-close: receiving a Close after sending one. This +* can happen if both client and server perform active-close and +* will result in an endless ping-pong of crossing and retrans- +* mitted Close packets, which only terminates when one of the +* nodes times out (min. 64 seconds). Quicker convergence can be +* achieved when one of the nodes acts as tie
[PATCH 3/5] [DCCP]: Dedicated auxiliary states to support passive-close
From: Gerrit Renker [EMAIL PROTECTED] This adds two auxiliary states to deal with passive closes: * PASSIVE_CLOSE(reached from OPEN via reception of Close)and * PASSIVE_CLOSEREQ (reached from OPEN via reception of CloseReq) as internal intermediate states. These states are used to allow a receiver to process unread data before acknowledging the received connection-termination-request (the Close/CloseReq). Without such support, it will happen that passively-closed sockets enter CLOSED state while there is still unprocessed data in the queue; leading to unexpected and erratic API behaviour. PASSIVE_CLOSE has been mapped into TCPF_CLOSE_WAIT, so that the code will seamlessly work with inet_accept() (which tests for this state). The state names are thanks to Arnaldo, who suggested this naming scheme following an earlier revision of this patch. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- include/linux/dccp.h | 56 ++--- net/dccp/proto.c | 22 ++- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index a007326..8b3f9ad 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -227,29 +227,51 @@ struct dccp_so_feat { #include net/tcp_states.h enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND= TCP_SYN_RECV, - DCCP_CLOSING= TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_PARTOPEN = TCP_MAX_STATES, + DCCP_OPEN= TCP_ESTABLISHED, + DCCP_REQUESTING = TCP_SYN_SENT, + DCCP_LISTEN = TCP_LISTEN, + DCCP_RESPOND = TCP_SYN_RECV, + /* +* States involved in closing a DCCP connection: +* 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. +* +* 2) CLOSING can have three different meanings (RFC 4340, 8.3): +* a. Client has performed active-close, has sent a Close to the server +* from state OPEN or PARTOPEN, and is waiting for the final Reset +* (in this case, SOCK_DONE == 1). +* b. Client is asked to perform passive-close, by receiving a CloseReq +* in (PART)OPEN state. It sends a Close and waits for final Reset +* (in this case, SOCK_DONE == 0). +* c. Server performs an active-close as in (a), keeps TIMEWAIT state. +* +* 3) The following intermediate states are employed to give passively +*closing nodes a chance to process their unread data: +*- PASSIVE_CLOSE(from OPEN = CLOSED) and +*- PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). +*/ + DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, + DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ + DCCP_CLOSING = TCP_CLOSING, + DCCP_TIME_WAIT = TCP_TIME_WAIT, + DCCP_CLOSED = TCP_CLOSE, + DCCP_PARTOPEN= TCP_MAX_STATES, + DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ DCCP_MAX_STATES }; -#define DCCP_STATE_MASK 0xf +#define DCCP_STATE_MASK 0x1f #define DCCP_ACTION_FIN (17) enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND= TCPF_SYN_RECV, - DCCPF_CLOSING= TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_PARTOPEN = 1 DCCP_PARTOPEN, + DCCPF_OPEN= TCPF_ESTABLISHED, + DCCPF_REQUESTING = TCPF_SYN_SENT, + DCCPF_LISTEN = TCPF_LISTEN, + DCCPF_RESPOND = TCPF_SYN_RECV, + DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, + DCCPF_CLOSING = TCPF_CLOSING, + DCCPF_TIME_WAIT = TCPF_TIME_WAIT, + DCCPF_CLOSED = TCPF_CLOSE, + DCCPF_PARTOPEN= (1 DCCP_PARTOPEN), }; static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 73006b7..3489d3f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -60,8 +60,7 @@ void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk-sk_state; - dccp_pr_debug(%s(%p) %-10.10s - %s\n, - dccp_role(sk), sk, + dccp_pr_debug(%s(%p) %s -- %s\n, dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); @@ -134,14 +133,17 @@ EXPORT_SYMBOL_GPL(dccp_packet_name); const char *dccp_state_name(const int state) { static char *dccp_state_names[] = { - [DCCP_OPEN] = OPEN, - [DCCP_REQUESTING
[PATCH 2/5] Use AF-independent rebuild_header routine
From: Gerrit Renker [EMAIL PROTECTED] [DCCP]: Use AF-independent rebuild_header routine This fixes a nasty bug: dccp_send_reset() is called by both DCCPv4 and DCCPv6, but uses inet_sk_rebuild_header() in each case. This leads to unpredictable and weird behaviour: under some conditions, DCCPv6 Resets were sent, in other not. The fix is to use the AF-independent rebuild_header routine. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/output.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/dccp/output.c b/net/dccp/output.c index 33ce737..7caa7f5 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -391,7 +391,7 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) * FIXME: what if rebuild_header fails? * Should we be doing a rebuild_header here? */ - int err = inet_sk_rebuild_header(sk); + int err = inet_csk(sk)-icsk_af_ops-rebuild_header(sk); if (err != 0) return err; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHES 0/5]: DCCP patches for 2.6.25
Hi Herbert, Please consider pulling from: master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6.25 Best Regards, - Arnaldo b/include/linux/dccp.h| 56 + b/net/dccp/ccids/ccid3.c | 57 -- b/net/dccp/ccids/ccid3.h |3 b/net/dccp/ccids/lib/loss_interval.c | 12 +- b/net/dccp/ccids/lib/packet_history.c | 138 +++--- b/net/dccp/ccids/lib/packet_history.h | 80 +++ b/net/dccp/input.c| 88 + b/net/dccp/output.c |3 b/net/dccp/proto.c| 23 ++--- include/linux/dccp.h |1 net/dccp/input.c |7 - net/dccp/proto.c | 89 ++--- 12 files changed, 287 insertions(+), 270 deletions(-) - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] [DCCP]: Remove duplicate test for CloseReq
From: Gerrit Renker [EMAIL PROTECTED] This removes a redundant test for unexpected packet types. In dccp_rcv_state_process it is tested twice whether a DCCP-server has received a CloseReq (Step 7): * first in the combined if-statement, * then in the call to dccp_rcv_closereq(). The latter is necesssary since dccp_rcv_closereq() is also called from __dccp_rcv_established(). This patch removes the duplicate test. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/input.c |6 ++ 1 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index fe4b0fb..decf2f2 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -629,16 +629,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; /* * Step 7: Check for unexpected packet types -* If (S.is_server and P.type == CloseReq) -* or (S.is_server and P.type == Response) +* If (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state == RESPOND and P.type == Data), *Send Sync packet acknowledging P.seqno *Drop packet and return */ } else if ((dp-dccps_role != DCCP_ROLE_CLIENT - (dh-dccph_type == DCCP_PKT_RESPONSE || -dh-dccph_type == DCCP_PKT_CLOSEREQ)) || + dh-dccph_type == DCCP_PKT_RESPONSE) || (dp-dccps_role == DCCP_ROLE_CLIENT dh-dccph_type == DCCP_PKT_REQUEST) || (sk-sk_state == DCCP_RESPOND -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/18] [CCID2]: Don't assign negative values to Ack Ratio
From: Gerrit Renker [EMAIL PROTECTED] Since it makes not sense to assign negative values to Ack Ratio, this patch disallows this possibility. As a consequence, a Bug test for negative Ack Ratio values becomes obsolete. Furthermore, a check against overflow (as Ack Ratio may not exceed 2 bytes, due to RFC 4340, 11.3) has been added. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Acked-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c |9 + 1 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5552218..f18235e 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,7 +140,7 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return 1; /* XXX CCID should dequeue when ready instead of polling */ } -static void ccid2_change_l_ack_ratio(struct sock *sk, int val) +static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); /* @@ -159,9 +159,10 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, int val) if (val max) val = max; } + if (val 0x) /* RFC 4340, 11.3 */ + val = 0x; - ccid2_pr_debug(changing local ack ratio to %d\n, val); - WARN_ON(val = 0); + ccid2_pr_debug(changing local ack ratio to %u\n, val); dp-dccps_l_ack_ratio = val; } @@ -572,7 +573,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx-ccid2hctx_rpdupack = -1; /* XXX lame */ hctx-ccid2hctx_rpseq = 0; - ccid2_change_l_ack_ratio(sk, dp-dccps_l_ack_ratio 1); + ccid2_change_l_ack_ratio(sk, 2 * dp-dccps_l_ack_ratio); } } } -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/18] [CCID2]: Fix sequence number arithmetic/comparisons
From: Gerrit Renker [EMAIL PROTECTED] This replaces use of normal subtraction with modulo-48 subtraction. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Acked-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c |9 ++--- 1 files changed, 2 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 7873dc7..5552218 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -24,9 +24,6 @@ /* * This implementation should follow RFC 4341 - * - * BUGS: - * - sequence number wrapping */ #include ../ccid.h @@ -619,9 +616,8 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* go through this ack vector */ while (veclen--) { const u8 rl = *vector DCCP_ACKVEC_LEN_MASK; - u64 ackno_end_rl; + u64 ackno_end_rl = SUB48(ackno, rl); - dccp_set_seqno(ackno_end_rl, ackno - rl); ccid2_pr_debug(ackvec start:%llu end:%llu\n, (unsigned long long)ackno, (unsigned long long)ackno_end_rl); @@ -671,8 +667,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done) break; - - dccp_set_seqno(ackno, ackno_end_rl - 1); + ackno = SUB48(ackno_end_rl, 1); vector++; } if (done) -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/18] [CCID2]: Bug in reading Ack Vectors
From: Gerrit Renker [EMAIL PROTECTED] In CCID2 the receiver-history is sorted in ascending order of sequence number, but the processing of received Ack Vectors requires the list traversal in the opposite direction. The current code has a bug in this regard: the list traversal is upwards. As a consequence, only Ack Vectors with a run length of 1 will pass, in all other Ack Vectors the remaining (acked) sequence numbers are missed, and may later falsely be identified as lost. Note: This bug is only visible when Ack Ratio 1, since otherwise the run lengths of Ack Vectors are 0. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index c9c465e..7873dc7 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -666,7 +666,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) done = 1; break; } - seqp = seqp-ccid2s_next; + seqp = seqp-ccid2s_prev; } if (done) break; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/18] [DCCP]: Initialize dccp_sock before calling the ccid constructors
This is because in the next patch CCID2 will assume that dccps_mss_cache is non-zero. Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/proto.c | 25 + 1 files changed, 13 insertions(+), 12 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0aec735..d48005f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -174,6 +174,19 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_minisock_init(dp-dccps_minisock); + icsk-icsk_rto = DCCP_TIMEOUT_INIT; + icsk-icsk_syn_retries = sysctl_dccp_request_retries; + sk-sk_state= DCCP_CLOSED; + sk-sk_write_space = dccp_write_space; + icsk-icsk_sync_mss = dccp_sync_mss; + dp-dccps_mss_cache = 536; + dp-dccps_rate_last = jiffies; + dp-dccps_role = DCCP_ROLE_UNDEFINED; + dp-dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp-dccps_l_ack_ratio = dp-dccps_r_ack_ratio = 1; + + dccp_init_xmit_timers(sk); + /* * FIXME: We're hardcoding the CCID, and doing this at this point makes * the listening (master) sock get CCID control blocks, which is not @@ -213,18 +226,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) INIT_LIST_HEAD(dmsk-dccpms_conf); } - dccp_init_xmit_timers(sk); - icsk-icsk_rto = DCCP_TIMEOUT_INIT; - icsk-icsk_syn_retries = sysctl_dccp_request_retries; - sk-sk_state= DCCP_CLOSED; - sk-sk_write_space = dccp_write_space; - icsk-icsk_sync_mss = dccp_sync_mss; - dp-dccps_mss_cache = 536; - dp-dccps_rate_last = jiffies; - dp-dccps_role = DCCP_ROLE_UNDEFINED; - dp-dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp-dccps_l_ack_ratio = dp-dccps_r_ack_ratio = 1; - return 0; } -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/18] [CCID2]: Larger initial windows also for CCID2
From: Gerrit Renker [EMAIL PROTECTED] RFC 4341, sec. 5 states that The cwnd parameter is initialized to at most four packets for new connections, following the rules from [RFC3390], which is implemented by this patch. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Acked-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c | 20 +++- 1 files changed, 15 insertions(+), 5 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index ef19fb8..9c5b6c7 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -741,15 +741,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid); + struct dccp_sock *dp = dccp_sk(sk); + u32 max_ratio; - ccid2_change_cwnd(hctx, 1); - /* Initialize ssthresh to infinity. This means that we will exit the -* initial slow-start after the first packet loss. This is what we -* want. -*/ + /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hctx-ccid2hctx_ssthresh = ~0; hctx-ccid2hctx_numdupack = 3; + /* +* RFC 4341, 5: The cwnd parameter is initialized to at most four +* packets for new connections, following the rules from [RFC3390]. +* We need to convert the bytes of RFC3390 into the packets of RFC 4341. +*/ + hctx-ccid2hctx_cwnd = min(4U, max(2U, 4380U / dp-dccps_mss_cache)); + + /* Make sure that Ack Ratio is enabled and within bounds. */ + max_ratio = DIV_ROUND_UP(hctx-ccid2hctx_cwnd, 2); + if (dp-dccps_l_ack_ratio == 0 || dp-dccps_l_ack_ratio max_ratio) + dp-dccps_l_ack_ratio = max_ratio; + /* XXX init ~ to window size... */ if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/18] [CCID2]: Disable broken Ack Ratio adaptation algorithm
From: Gerrit Renker [EMAIL PROTECTED] This comments out a problematic section comprising a half-finished algorithm: - The variable `ccid2hctx_ackloss' is never initialised to a value different from 0 and hence in fact is a read-only constant. - The `arsent' variable counts packets other than Acks (it is incremented for every packet), and there is no test for Ack Loss. - The concept of counting Acks as such leads to a complex calculation, and the calculation at the moment is inconsistent with this concept. The problem is that the number of Acks - rather than the number of windows - is counted, which leads to a complex (cubic/quadratic) expression - this is not even implemented. In its current state, the commented-out algorithm interfers with normal processing by changing Ack Ratio incorrectly, and at the wrong times. A new algorithm is necessary, which will not necessarily use the same variables as used by the unfinished one; hence the old variables have been removed. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Acked-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c | 23 +-- net/dccp/ccids/ccid2.h |4 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9c5b6c7..a901202 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -224,8 +224,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hctx-ccid2hctx_sent= 0; /* clear ack ratio state. */ - hctx-ccid2hctx_arsent = 0; - hctx-ccid2hctx_ackloss = 0; hctx-ccid2hctx_rpseq= 0; hctx-ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); @@ -289,6 +287,26 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hctx-ccid2hctx_sent++; + /* +* FIXME: The code below is broken and the variables have been removed +* from the socket struct. The `ackloss' variable was always set to 0, +* and with arsent there are several problems: +* (i) it doesn't just count the number of Acks, but all sent packets; +* (ii) it is expressed in # of packets, not # of windows, so the +* comparison below uses the wrong formula: Appendix A of RFC 4341 +* comes up with the number K = cwnd / (R^2 - R) of consecutive windows +* of data with no lost or marked Ack packets. If arsent were the # of +* consecutive Acks received without loss, then Ack Ratio needs to be +* decreased by 1 when +*arsent = K * cwnd / R = cwnd^2 / (R^3 - R^2) +* where cwnd / R is the number of Acks received per window of data +* (cf. RFC 4341, App. A). The problems are that +* - arsent counts other packets as well; +* - the comparison uses a formula different from RFC 4341; +* - computing a cubic/quadratic equation each time is too complicated. +* Hence a different algorithm is needed. +*/ +#if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ hctx-ccid2hctx_arsent++; /* We had an ack loss in this window... */ @@ -316,6 +334,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hctx-ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } +#endif /* setup RTO timer */ if (!timer_pending(hctx-ccid2hctx_rtotimer)) diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index d9daa53..ed95f8f 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -44,8 +44,6 @@ struct ccid2_seq { * @ccid2hctx_acks - ACKS recv in AI phase * @ccid2hctx_sent - packets sent in this window * @ccid2hctx_lastrtt -time RTT was last measured - * @ccid2hctx_arsent - packets sent [ack ratio] - * @ccid2hctx_ackloss - ack was lost in this win * @ccid2hctx_rpseq - last consecutive seqno * @ccid2hctx_rpdupack - dupacks since rpseq */ @@ -66,8 +64,6 @@ struct ccid2_hc_tx_sock { int ccid2hctx_sent; unsigned long ccid2hctx_lastrtt; struct timer_list ccid2hctx_rtotimer; - unsigned long ccid2hctx_arsent; - int ccid2hctx_ackloss; u64 ccid2hctx_rpseq; int ccid2hctx_rpdupack; int ccid2hctx_sendwait; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/18] [CCID2]: Replace read-only variable with constant
From: Gerrit Renker [EMAIL PROTECTED] This replaces the field member `numdupack', which was used as a read-only constant in the code, with a #define. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c |8 +++- net/dccp/ccids/ccid2.h |3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9fa0eb5..a3c42cd 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -586,8 +586,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx-ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx-ccid2hctx_rpdupack = - hctx-ccid2hctx_numdupack) { + if (hctx-ccid2hctx_rpdupack = NUMDUPACK) { hctx-ccid2hctx_rpdupack = -1; /* XXX lame */ hctx-ccid2hctx_rpseq = 0; @@ -708,7 +707,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) while (1) { if (seqp-ccid2s_acked) { done++; - if (done == hctx-ccid2hctx_numdupack) + if (done == NUMDUPACK) break; } if (seqp == hctx-ccid2hctx_seqt) @@ -719,7 +718,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* If there are at least 3 acknowledgements, anything unacknowledged * below the last sequence number is considered lost */ - if (done == hctx-ccid2hctx_numdupack) { + if (done == NUMDUPACK) { struct ccid2_seq *last_acked = seqp; /* check for lost packets */ @@ -761,7 +760,6 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hctx-ccid2hctx_ssthresh = ~0; - hctx-ccid2hctx_numdupack = 3; /* * RFC 4341, 5: The cwnd parameter is initialized to at most four diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 6d0b4e3..443f08a 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -24,6 +24,8 @@ #include linux/timer.h #include linux/types.h #include ../ccid.h +/* NUMDUPACK parameter from RFC 4341, p. 6 */ +#define NUMDUPACK 3 struct sock; @@ -52,7 +54,6 @@ struct ccid2_hc_tx_sock { int ccid2hctx_acks; unsigned intccid2hctx_ssthresh; int ccid2hctx_pipe; - int ccid2hctx_numdupack; struct ccid2_seq*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; int ccid2hctx_seqbufc; struct ccid2_seq*ccid2hctx_seqh; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/18] [DCCP]: Check for unread data on close
From: Gerrit Renker [EMAIL PROTECTED] This removes one FIXME with regard to close when there is still unread data. The mechanism is implemented similar to TCP: with regard to DCCP-specifics, a Reset with Code 2, Aborted is sent to the peer. This corresponds in part to RFC 4340, 8.1.1 and 8.1.5. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/proto.c | 10 -- 1 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d48005f..5f47b45 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -891,6 +891,7 @@ void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; + u32 data_was_unread = 0; int state; lock_sock(sk); @@ -913,12 +914,17 @@ void dccp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ - /* FIXME: check for unread data */ while ((skb = __skb_dequeue(sk-sk_receive_queue)) != NULL) { + data_was_unread += skb-len; __kfree_skb(skb); } - if (sock_flag(sk, SOCK_LINGER) !sk-sk_lingertime) { + if (data_was_unread) { + /* Unread data was tossed, send an appropriate Reset Code */ + DCCP_WARN(DCCP: ABORT -- %u bytes unread\n, data_was_unread); + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_set_state(sk, DCCP_CLOSED); + } else if (sock_flag(sk, SOCK_LINGER) !sk-sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk-sk_prot-disconnect(sk, 0); } else if (dccp_close_state(sk)) { -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/18] [CCID2]: Replace pipe assignment-function with assignment
From: Gerrit Renker [EMAIL PROTECTED] The function ccid2_change_pipe only does an assignment. This patch simplifies the code by replacing the function with the assignment it performs. Furthermore, the type of pipe is promoted from `signed' to unsigned (increasing the range). As a result, a BUG_ON test for negative values now becomes obsolete (for safety not removed, but replaced with a less annoying `DCCP_BUG'). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c | 16 ++-- net/dccp/ccids/ccid2.h |3 ++- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 747fa1c..237007b 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -171,11 +171,6 @@ static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) hctx-ccid2hctx_srtt = val; } -static void ccid2_change_pipe(struct ccid2_hc_tx_sock *hctx, long val) -{ - hctx-ccid2hctx_pipe = val; -} - static void ccid2_start_rto_timer(struct sock *sk); static void ccid2_hc_tx_rto_expire(unsigned long data) @@ -205,11 +200,11 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - ccid2_change_pipe(hctx, 0); hctx-ccid2hctx_ssthresh = hctx-ccid2hctx_cwnd / 2; if (hctx-ccid2hctx_ssthresh 2) hctx-ccid2hctx_ssthresh = 2; hctx-ccid2hctx_cwnd = 1; + hctx-ccid2hctx_pipe = 0; /* clear state about stuff we sent */ hctx-ccid2hctx_seqt= hctx-ccid2hctx_seqh; @@ -248,8 +243,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) BUG_ON(!hctx-ccid2hctx_sendwait); hctx-ccid2hctx_sendwait = 0; - ccid2_change_pipe(hctx, hctx-ccid2hctx_pipe + 1); - BUG_ON(hctx-ccid2hctx_pipe 0); + hctx-ccid2hctx_pipe++; /* There is an issue. What if another packet is sent between * packet_send() and packet_sent(). Then the sequence number would be @@ -519,8 +513,10 @@ static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - ccid2_change_pipe(hctx, hctx-ccid2hctx_pipe-1); - BUG_ON(hctx-ccid2hctx_pipe 0); + if (hctx-ccid2hctx_pipe == 0) + DCCP_BUG(pipe == 0); + else + hctx-ccid2hctx_pipe--; if (hctx-ccid2hctx_pipe == 0) ccid2_hc_tx_kill_rto_timer(sk); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index b72e955..bc659f0 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,6 +42,7 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @ccid2hctx_ssacks - ACKs recv in slow start * @ccid2hctx_acks - ACKS recv in AI phase * @ccid2hctx_lastrtt -time RTT was last measured @@ -51,9 +52,9 @@ struct ccid2_seq { struct ccid2_hc_tx_sock { u32 ccid2hctx_cwnd; u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; int ccid2hctx_ssacks; int ccid2hctx_acks; - int ccid2hctx_pipe; struct ccid2_seq*ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; int ccid2hctx_seqbufc; struct ccid2_seq*ccid2hctx_seqh; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/18] [DCCP]: Add support for abortive release
From: Gerrit Renker [EMAIL PROTECTED] This continues from the previous patch and adds support for actively aborting a DCCP connection, using a Reset Code 2, Aborted to inform the peer of an abortive release. I have tried this in various client/server settings and it works as expected. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/proto.c | 15 +-- 1 files changed, 13 insertions(+), 2 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5f47b45..73006b7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -276,6 +276,12 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) return inet_csk_listen_start(sk, backlog); } +static inline int dccp_need_reset(int state) +{ + return state != DCCP_CLOSED state != DCCP_LISTEN + state != DCCP_REQUESTING; +} + int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -286,10 +292,15 @@ int dccp_disconnect(struct sock *sk, int flags) if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); - /* ABORT function of RFC793 */ + /* +* This corresponds to the ABORT function of RFC793, sec. 3.8 +* TCP uses a RST segment, DCCP a Reset packet with Code 2, Aborted. +*/ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); - /* FIXME: do the active reset thing */ + } else if (dccp_need_reset(old_state)) { + dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); + sk-sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk-sk_err = ECONNRESET; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/18] [CCID2]: Remove unused variable
From: Gerrit Renker [EMAIL PROTECTED] This removes a variable `ccid2hctx_sent' which is incremented but never referenced/read (i.e., dead code). Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Ian McDonald [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ccids/ccid2.c |4 net/dccp/ccids/ccid2.h |2 -- 2 files changed, 0 insertions(+), 6 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index a901202..9fa0eb5 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -221,7 +221,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hctx-ccid2hctx_seqt= hctx-ccid2hctx_seqh; hctx-ccid2hctx_ssacks = 0; hctx-ccid2hctx_acks= 0; - hctx-ccid2hctx_sent= 0; /* clear ack ratio state. */ hctx-ccid2hctx_rpseq= 0; @@ -285,8 +284,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) ccid2_pr_debug(cwnd=%d pipe=%d\n, hctx-ccid2hctx_cwnd, hctx-ccid2hctx_pipe); - hctx-ccid2hctx_sent++; - /* * FIXME: The code below is broken and the variables have been removed * from the socket struct. The `ackloss' variable was always set to 0, @@ -517,7 +514,6 @@ static inline void ccid2_new_ack(struct sock *sk, ccid2_pr_debug(srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n, hctx-ccid2hctx_srtt, hctx-ccid2hctx_rttvar, hctx-ccid2hctx_rto, HZ, r); - hctx-ccid2hctx_sent = 0; } /* we got a new ack, so re-start RTO timer */ diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index ed95f8f..6d0b4e3 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,7 +42,6 @@ struct ccid2_seq { * * @ccid2hctx_ssacks - ACKs recv in slow start * @ccid2hctx_acks - ACKS recv in AI phase - * @ccid2hctx_sent - packets sent in this window * @ccid2hctx_lastrtt -time RTT was last measured * @ccid2hctx_rpseq - last consecutive seqno * @ccid2hctx_rpdupack - dupacks since rpseq @@ -61,7 +60,6 @@ struct ccid2_hc_tx_sock { longccid2hctx_rto; longccid2hctx_srtt; longccid2hctx_rttvar; - int ccid2hctx_sent; unsigned long ccid2hctx_lastrtt; struct timer_list ccid2hctx_rtotimer; u64 ccid2hctx_rpseq; -- 1.5.3.4 - To unsubscribe from this list: send the line unsubscribe dccp in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/18] [ACKVEC]: Reduce length of identifiers
From: Gerrit Renker [EMAIL PROTECTED] This is reduces the length of the struct ackvec/ackvec_record fields. It is a purely text-based replacement: s#dccpavr_#avr_#g; s#dccpav_#av_#g; and increases readability somewhat. Signed-off-by: Gerrit Renker [EMAIL PROTECTED] Signed-off-by: Arnaldo Carvalho de Melo [EMAIL PROTECTED] --- net/dccp/ackvec.c | 163 + net/dccp/ackvec.h | 62 ++-- 2 files changed, 108 insertions(+), 117 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 83378f3..6de4bd1 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -30,7 +30,7 @@ static struct dccp_ackvec_record *dccp_ackvec_record_new(void) kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr != NULL) - INIT_LIST_HEAD(avr-dccpavr_node); + INIT_LIST_HEAD(avr-avr_node); return avr; } @@ -40,7 +40,7 @@ static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) if (unlikely(avr == NULL)) return; /* Check if deleting a linked record */ - WARN_ON(!list_empty(avr-dccpavr_node)); + WARN_ON(!list_empty(avr-avr_node)); kmem_cache_free(dccp_ackvec_record_slab, avr); } @@ -52,16 +52,15 @@ static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, * just add the AVR at the head of the list. * -sorbo. */ - if (!list_empty(av-dccpav_records)) { + if (!list_empty(av-av_records)) { const struct dccp_ackvec_record *head = - list_entry(av-dccpav_records.next, + list_entry(av-av_records.next, struct dccp_ackvec_record, - dccpavr_node); - BUG_ON(before48(avr-dccpavr_ack_seqno, - head-dccpavr_ack_seqno)); + avr_node); + BUG_ON(before48(avr-avr_ack_seqno, head-avr_ack_seqno)); } - list_add(avr-dccpavr_node, av-dccpav_records); + list_add(avr-avr_node, av-av_records); } int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) @@ -69,9 +68,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp-dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av-dccpav_vec_len, -DCCP_MAX_ACKVEC_OPT_LEN); - u16 len = av-dccpav_vec_len + 2 * nr_opts, i; + const u16 nr_opts = DIV_ROUND_UP(av-av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + u16 len = av-av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; unsigned char *to; @@ -81,7 +79,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) if (DCCP_SKB_CB(skb)-dccpd_opt_len + len DCCP_MAX_OPT_LEN) return -1; - delta = ktime_us_delta(ktime_get_real(), av-dccpav_time); + delta = ktime_us_delta(ktime_get_real(), av-av_time); elapsed_time = delta / 10; if (elapsed_time != 0 @@ -95,9 +93,9 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)-dccpd_opt_len += len; to = skb_push(skb, len); - len = av-dccpav_vec_len; - from = av-dccpav_buf + av-dccpav_buf_head; - tail = av-dccpav_buf + DCCP_MAX_ACKVEC_LEN; + len = av-av_vec_len; + from = av-av_buf + av-av_buf_head; + tail = av-av_buf + DCCP_MAX_ACKVEC_LEN; for (i = 0; i nr_opts; ++i) { int copylen = len; @@ -116,7 +114,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) to += tailsize; len -= tailsize; copylen -= tailsize; - from= av-dccpav_buf; + from= av-av_buf; } memcpy(to, from, copylen); @@ -134,19 +132,19 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will * equal buf_nonce. */ - avr-dccpavr_ack_seqno = DCCP_SKB_CB(skb)-dccpd_seq; - avr-dccpavr_ack_ptr = av-dccpav_buf_head; - avr-dccpavr_ack_ackno = av-dccpav_buf_ackno; - avr-dccpavr_ack_nonce = av-dccpav_buf_nonce; - avr-dccpavr_sent_len = av-dccpav_vec_len; + avr-avr_ack_seqno = DCCP_SKB_CB(skb)-dccpd_seq; + avr-avr_ack_ptr = av-av_buf_head; + avr-avr_ack_ackno = av-av_buf_ackno; + avr-avr_ack_nonce = av-av_buf_nonce; + avr-avr_sent_len = av