Sometimes we have to provision boxes across regions, such as California to Sweden. The http server has a 10 minute timeout, so if we can't get our 250mb image transferred fast enough our provisioning fails, which is not ideal. So add tcp window scaling on open connections and set the window size to 1mb. With this change we're able to get higher sustained transfers between regions and can transfer our image in well below 10 minutes. Without this patch we'd time out every time halfway through the transfer.
RTTM is needed in order to make window scaling work well under heavy congestion or packet loss. In most cases grub could recover with just window scaling enabled, but on some machines the congestion would be so high that it would never recover and would timeout. I've made the window size configureable with the grub env variable "tcp_window_size". By default this is set to 1mb but can be configured to whatever a user wants, and we will calculate the appropriate window size and scale settings. Thanks, Signed-off-by: Josef Bacik <jba...@fb.com> --- V1->V2: -Address Andrei's concerns about making the window size configurable. -Also make the tcp option stuff more dynamic. -Add RTTM support to make higher window sizes more stable. grub-core/net/tcp.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 9 deletions(-) diff --git a/grub-core/net/tcp.c b/grub-core/net/tcp.c index 5da8b11..be5ef30 100644 --- a/grub-core/net/tcp.c +++ b/grub-core/net/tcp.c @@ -22,6 +22,7 @@ #include <grub/net/netbuff.h> #include <grub/time.h> #include <grub/priority_queue.h> +#include <grub/env.h> #define TCP_SYN_RETRANSMISSION_TIMEOUT GRUB_NET_INTERVAL #define TCP_SYN_RETRANSMISSION_COUNT GRUB_NET_TRIES @@ -65,6 +66,7 @@ struct grub_net_tcp_socket grub_uint32_t my_cur_seq; grub_uint32_t their_start_seq; grub_uint32_t their_cur_seq; + grub_uint32_t cur_tsecr; grub_uint16_t my_window; struct unacked *unack_first; struct unacked *unack_last; @@ -94,6 +96,8 @@ struct grub_net_tcp_listen void *hook_data; }; +#define ALIGNWORD(var) ((var) + 3) & (~3) + struct tcphdr { grub_uint16_t src; @@ -106,6 +110,25 @@ struct tcphdr grub_uint16_t urgent; } GRUB_PACKED; +struct tcp_opt +{ + grub_uint8_t kind; + grub_uint8_t length; +} GRUB_PACKED; + +struct tcp_scale_opt +{ + struct tcp_opt opt; + grub_uint8_t scale; +} GRUB_PACKED; + +struct tcp_timestamp_opt +{ + struct tcp_opt opt; + grub_uint32_t tsval; + grub_uint32_t tsecr; +} GRUB_PACKED; + struct tcp_pseudohdr { grub_uint32_t src; @@ -299,9 +322,12 @@ ack_real (grub_net_tcp_socket_t sock, int res) { struct grub_net_buff *nb_ack; struct tcphdr *tcph_ack; + struct tcp_timestamp_opt *timestamp; + grub_size_t headersize; grub_err_t err; - nb_ack = grub_netbuff_alloc (sizeof (*tcph_ack) + 128); + headersize = ALIGNWORD(sizeof (*tcph_ack) + sizeof (*timestamp)); + nb_ack = grub_netbuff_alloc (headersize + 128); if (!nb_ack) return; err = grub_netbuff_reserve (nb_ack, 128); @@ -313,7 +339,7 @@ ack_real (grub_net_tcp_socket_t sock, int res) return; } - err = grub_netbuff_put (nb_ack, sizeof (*tcph_ack)); + err = grub_netbuff_put (nb_ack, headersize); if (err) { grub_netbuff_free (nb_ack); @@ -322,22 +348,28 @@ ack_real (grub_net_tcp_socket_t sock, int res) return; } tcph_ack = (void *) nb_ack->data; + grub_memset (tcph_ack, 0, headersize); if (res) { tcph_ack->ack = grub_cpu_to_be32_compile_time (0); - tcph_ack->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_RST); + tcph_ack->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | TCP_RST); tcph_ack->window = grub_cpu_to_be16_compile_time (0); } else { tcph_ack->ack = grub_cpu_to_be32 (sock->their_cur_seq); - tcph_ack->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_ACK); + tcph_ack->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | TCP_ACK); tcph_ack->window = !sock->i_stall ? grub_cpu_to_be16 (sock->my_window) : 0; } tcph_ack->urgent = 0; tcph_ack->src = grub_cpu_to_be16 (sock->in_port); tcph_ack->dst = grub_cpu_to_be16 (sock->out_port); + timestamp = (struct tcp_timestamp_opt *)(tcph_ack + 1); + timestamp->opt.kind = 8; + timestamp->opt.length = 10; + timestamp->tsval = grub_cpu_to_be32 (grub_get_time_ms ()); + timestamp->tsecr = grub_cpu_to_be32 (sock->cur_tsecr); err = tcp_send (nb_ack, sock); if (err) { @@ -503,7 +535,7 @@ grub_net_tcp_accept (grub_net_tcp_socket_t sock, sock->error_hook = error_hook; sock->fin_hook = fin_hook; sock->hook_data = hook_data; - nb_ack = grub_netbuff_alloc (sizeof (*tcph) + nb_ack = grub_netbuff_alloc (sizeof (*tcph) + + GRUB_NET_OUR_MAX_IP_HEADER_SIZE + GRUB_NET_MAX_LINK_HEADER_SIZE); if (!nb_ack) @@ -536,6 +568,37 @@ grub_net_tcp_accept (grub_net_tcp_socket_t sock, return GRUB_ERR_NONE; } +static grub_uint32_t +tcp_window_size (void) +{ + const char *val; + grub_uint32_t ret = 1048576; + + val = grub_env_get ("tcp_window_size"); + if (!val) + return ret; + + grub_error_push (); + ret = (grub_uint32_t) grub_strtoul (val, 0, 0); + + if (grub_errno != GRUB_ERR_NONE) + { + grub_env_unset ("tcp_window_size"); + grub_errno = GRUB_ERR_NONE; + ret = 1048576; + } + + grub_error_pop (); + + /* A window size greater than 1gb is invalid */ + if (ret > 1024 * 1024 * 1024) + { + grub_env_unset ("tcp_window_size"); + ret = 1048576; + } + return ret; +} + grub_net_tcp_socket_t grub_net_tcp_open (char *server, grub_uint16_t out_port, @@ -556,9 +619,14 @@ grub_net_tcp_open (char *server, static grub_uint16_t in_port = 21550; struct grub_net_buff *nb; struct tcphdr *tcph; + struct tcp_scale_opt *scale; + struct tcp_timestamp_opt *timestamp; int i; grub_uint8_t *nbd; grub_net_link_level_address_t ll_target_addr; + grub_size_t headersize; + grub_uint32_t window_size = tcp_window_size (); + grub_uint8_t window_scale = 0; err = grub_net_resolve_address (server, &addr); if (err) @@ -583,6 +651,14 @@ grub_net_tcp_open (char *server, if (socket == NULL) return NULL; + /* Have to scale down the window_size so it fits in 16bits and calculate the + scale at the same time. */ + while (window_size > 65535) + { + window_size >>= 1; + window_scale += 1; + } + socket->out_port = out_port; socket->inf = inf; socket->out_nla = addr; @@ -593,7 +669,9 @@ grub_net_tcp_open (char *server, socket->fin_hook = fin_hook; socket->hook_data = hook_data; - nb = grub_netbuff_alloc (sizeof (*tcph) + 128); + headersize = ALIGNWORD (sizeof (*tcph) + sizeof (*scale) + + sizeof (*timestamp)); + nb = grub_netbuff_alloc (headersize + 128); if (!nb) return NULL; err = grub_netbuff_reserve (nb, 128); @@ -603,7 +681,7 @@ grub_net_tcp_open (char *server, return NULL; } - err = grub_netbuff_put (nb, sizeof (*tcph)); + err = grub_netbuff_put (nb, headersize); if (err) { grub_netbuff_free (nb); @@ -617,17 +695,30 @@ grub_net_tcp_open (char *server, } tcph = (void *) nb->data; + grub_memset(tcph, 0, headersize); socket->my_start_seq = grub_get_time_ms (); socket->my_cur_seq = socket->my_start_seq + 1; - socket->my_window = 8192; + socket->my_window = window_size; tcph->seqnr = grub_cpu_to_be32 (socket->my_start_seq); tcph->ack = grub_cpu_to_be32_compile_time (0); - tcph->flags = grub_cpu_to_be16_compile_time ((5 << 12) | TCP_SYN); + tcph->flags = grub_cpu_to_be16_compile_time ((headersize << 10) | TCP_SYN); tcph->window = grub_cpu_to_be16 (socket->my_window); tcph->urgent = 0; tcph->src = grub_cpu_to_be16 (socket->in_port); tcph->dst = grub_cpu_to_be16 (socket->out_port); tcph->checksum = 0; + + scale = (struct tcp_scale_opt *)(tcph + 1); + scale->opt.kind = 3; + scale->opt.length = 3; + scale->scale = window_scale; + + timestamp = (struct tcp_timestamp_opt *)(scale + 1); + timestamp->opt.kind = 8; + timestamp->opt.length = 10; + timestamp->tsval = grub_cpu_to_be32 (grub_get_time_ms ()); + timestamp->tsecr = 0; + tcph->checksum = grub_net_ip_transport_checksum (nb, GRUB_NET_IP_TCP, &socket->inf->address, &socket->out_nla); @@ -745,6 +836,7 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb, { struct tcphdr *tcph; grub_net_tcp_socket_t sock; + grub_uint32_t tsecr = 0; grub_err_t err; /* Ignore broadcast. */ @@ -771,6 +863,35 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb, return GRUB_ERR_NONE; } + if ((grub_be_to_cpu16 (tcph->flags >> 12)) > 5) + { + struct tcp_opt *opt; + grub_size_t remaining = nb->tail - nb->data; + + opt = (struct tcp_opt *)(tcph + 1); + while (remaining > 0) + { + grub_uint8_t len = opt->length; + if (opt->kind == 8 || opt->kind == 0) + break; + if (opt->kind == 1) + len = 1; + if (len > remaining) + remaining = 0; + remaining -= len; + opt = (struct tcp_opt *)((grub_uint8_t *)opt + len); + } + + /* Ok we definitely have the timestamp option. */ + if (opt->kind == 8) + { + struct tcp_timestamp_opt *timestamp; + + timestamp = (struct tcp_timestamp_opt *)opt; + tsecr = grub_be_to_cpu32 (timestamp->tsval); + } + } + FOR_TCP_SOCKETS (sock) { if (!(grub_be_to_cpu16 (tcph->dst) == sock->in_port @@ -906,6 +1027,8 @@ grub_net_recv_tcp_packet (struct grub_net_buff *nb, return err; } + /* We only update the tsecr when we advance the window. */ + sock->cur_tsecr = tsecr; sock->their_cur_seq += (nb_top->tail - nb_top->data); if (grub_be_to_cpu16 (tcph->flags) & TCP_FIN) { -- 1.8.1 _______________________________________________ Grub-devel mailing list Grub-devel@gnu.org https://lists.gnu.org/mailman/listinfo/grub-devel