(CC'ing Felix) Aaron Conole <[email protected]> writes:
> Felix Huettner via dev <[email protected]> writes: > >> All previous benchmarks did not test individual connections that >> are quickly opened and closed across multiple zones. This allows us to >> better evaluate how much conntrack is optimized for multithreading. >> >> Signed-off-by: Felix Huettner <[email protected]> >> --- >> tests/test-conntrack.c | 269 ++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 264 insertions(+), 5 deletions(-) >> >> diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c >> index 22db95f91..ae88538cf 100644 >> --- a/tests/test-conntrack.c >> +++ b/tests/test-conntrack.c >> @@ -176,10 +176,14 @@ destroy_packets(struct dp_packet_batch *pkt_batch) >> struct thread_aux { >> pthread_t thread; >> unsigned tid; >> + bool first; >> + uint64_t sleep_counter; >> + uint64_t conn_lookup_errors; >> }; >> >> static struct conntrack *ct; >> -static unsigned long n_threads, n_pkts, batch_size; >> +static unsigned long n_threads, n_pkts, batch_size, n_conns, >> + n_zones, n_data_pkts, n_iterations; > > Can we keep these local instead of promoting to global scope? This > seems like they could be kept local. > >> static bool change_conn = false; >> static struct ovs_barrier barrier; >> >> @@ -193,6 +197,7 @@ ct_thread_main(void *aux_) >> size_t i; >> long long now = time_msec(); >> >> + ovs_assert(batch_size > 0); >> pkt_batch = prepare_packets(batch_size, change_conn, aux->tid, >> &dl_type); >> ovs_barrier_block(&barrier); >> for (i = 0; i < n_pkts; i += batch_size) { >> @@ -259,10 +264,257 @@ test_benchmark(struct ovs_cmdl_context *ctx) >> free(threads); >> } >> >> +static struct dp_packet_batch *** >> +gen_tcp_connections(size_t n_packets_per_conn, unsigned tid) >> +{ >> + struct eth_addr eth_src, eth_dst; >> + uint16_t sport, dport; >> + ovs_be32 ip_src, ip_dst; >> + struct dp_packet *pkt; >> + size_t i, j; >> + >> + /* Some generic stuff we use for all conns */ >> + struct eth_addr glob_eth_src = ETH_ADDR_C(00, 01, 02, 03, 04, 05); >> + struct eth_addr glob_eth_dst = ETH_ADDR_C(00, 06, 07, 08, 09, 0a); >> + ovs_be32 glob_ip_dst = inet_addr("192.168.0.1"); >> + uint16_t glob_sport = 12345; >> + uint16_t glob_dport = 80; >> + >> + struct dp_packet_batch ***pkt_batches_per_pkt = xzalloc( >> + n_packets_per_conn * sizeof(*pkt_batches_per_pkt)); >> + for (i = 0; i < n_packets_per_conn; i++) { >> + struct dp_packet_batch **pkt_batches = xzalloc( >> + n_conns * sizeof(*pkt_batches)); >> + pkt_batches_per_pkt[i] = pkt_batches; >> + >> + for (j = 0; j < n_conns; j++) { >> + /* We just define 10.X.Y.Y for source ips. >> + * X is the thread id and YY is the connection id. */ >> + ovs_be32 glob_ip_src = htonl( >> + 0x0A000000 | ((tid & 0xFF) << 16) | (j & 0xFFFF)); > > while it isn't common, there may be machines with enough cores to go > beyond 255, and this will cause a weird kind of wrap. Maybe limit it? > >> + bool forward = true; >> + uint16_t tcp_flags = TCP_ACK; >> + >> + if (i == 0) { >> + tcp_flags = TCP_SYN; >> + } else if (i == 1) { >> + tcp_flags |= TCP_SYN; >> + forward = false; >> + } else if (i == n_packets_per_conn - 3) { >> + tcp_flags |= TCP_FIN; >> + } else if (i == n_packets_per_conn - 2) { >> + tcp_flags |= TCP_FIN; >> + forward = false; >> + } else if (i % 2 == 1) { >> + /* Data. */ >> + tcp_flags |= TCP_PSH; >> + } else { >> + /* ACK for Data. */ >> + forward = false; >> + } > > Am I misunderstanding the counter here? > > Let's say n_packets_per_conn == 15, then we get: > > [0] = tcp_flags=tcp_syn > [1] = tcp_flags=tcp_syn|tcp_ack > [2,4,6,8,10] = tcp_flags=tcp_ack > [3,5,7,9,11] = tcp_flags=tcp_ack|tcp_psh > > [12] = tcp_flags=tcp_ack|tcp_fin > [13] = tcp_flags=tcp_ack|tcp_fin > [14] = tcp_flags=tcp_ack > > Now questions. > > There seems to be a weird ending packet here. > > For i=2, forward will be false - but shouldn't it be in the forward > direction (same as i==0). > > For n_packets_per_conn == 16, it looks like the last packet will contain > a tcp_psh flag (and maybe be in the wrong direction)? > >> + >> + if (forward) { >> + eth_src = glob_eth_src; >> + eth_dst = glob_eth_dst; >> + ip_src = glob_ip_src; >> + ip_dst = glob_ip_dst; >> + sport = glob_sport; >> + dport = glob_dport; >> + } else { >> + eth_src = glob_eth_dst; >> + eth_dst = glob_eth_src; >> + ip_src = glob_ip_dst; >> + ip_dst = glob_ip_src; >> + sport = glob_dport; >> + dport = glob_sport; >> + } >> + >> + pkt = build_eth_ip_packet(NULL, eth_src, eth_dst, >> + ip_src, ip_dst, >> + IPPROTO_TCP, 0); >> + build_tcp_packet(pkt, sport, dport, tcp_flags, NULL, 0); >> + pkt_batches[j] = xzalloc(sizeof(struct dp_packet_batch)); >> + dp_packet_batch_init_packet(pkt_batches[j], pkt); >> + } >> + } >> + return pkt_batches_per_pkt; >> +} >> + >> +static void * >> +ct_thread_tcp_main(void *aux_) >> +{ >> + struct thread_aux *aux = aux_; >> + size_t i, j; >> + >> + /* Prepare packets to run complete tcp connections. >> + * n_data_pkts defines how many normal packets come as part of the >> + * connection. >> + * So we need to generate: >> + * src->dst: SYN >> + * dst->src: SYN+ACK >> + * src->dst: ACK >> + * for each n_data_pkts: >> + * src->dst: Data >> + * dst->src: ACK >> + * src->dst: FIN+ACK >> + * dst->src: FIN+ACK >> + * src->dst: ACK >> + * >> + * So we need 'n_data_pkts * 2 + 6' packets for each connection.*/ >> + size_t n_packets_per_conn = n_data_pkts * 2 + 6; >> + struct dp_packet_batch ***pkt_batches_per_pkt = gen_tcp_connections( >> + n_packets_per_conn, aux->tid); >> + >> + ovs_barrier_block(&barrier); >> + >> + /* Each thread has a zone_offset, so that not all threads try to access >> + * the same zone at the same time and thereby move in >> lockstep. That would >> + * be unrealistic in real cases. We therefor distribute them as much as >> + * possible over all zones. >> + * The thread id will generally be only different by 1 between different >> + * threads as they are started directly afterwards. So we need to >> equally >> + * distribute it. */ >> + uint16_t zone_offset = aux->tid * n_zones / n_threads; >> + >> + for (i = 0; i < n_packets_per_conn * n_iterations; i++) { >> + size_t packet_offset = i % n_packets_per_conn; >> + if (packet_offset == 0 && aux->first) { >> + printf("Iteration: %"PRIuSIZE"/%lu\n", >> + i / n_packets_per_conn, n_iterations); >> + } >> + >> + struct dp_packet_batch **pkt_batches = pkt_batches_per_pkt[ >> + i % n_packets_per_conn]; >> + for (j = 0; j < n_conns; j++) { >> + long long now = time_msec(); >> + uint16_t zone = (j + zone_offset) % n_zones; >> + /* We set mark here to some value since it allows us to detect >> if >> + * conntrack actually inserted the connection or not. >> + * If e.g. the conntrack table is full then mark will not be >> set. >> + * We can not use CT_TRACKED, since it is also set on a full >> + * table. */ >> + uint32_t mark[2] = {aux->tid + 1, 0xFFFF}; >> + >> + pkt_metadata_init(&pkt_batches[j]->packets[0]->md, 0); >> + bool commit = packet_offset == 0; >> + conntrack_execute(ct, pkt_batches[j], htons(ETH_TYPE_IP), >> + false, commit, zone, mark, NULL, >> + NULL, NULL, now, 0); >> + >> + uint8_t ct_state = pkt_batches[j]->packets[0]->md.ct_state; >> + >> + if (packet_offset == 0) { >> + /* Check if something prevented a new connection to be added to >> + * conntrack we should detect it here. This can generally >> + * happen if the ct_sweep is too slow for all the >> connections >> + * we generate. In this case we will just sleep a little and >> + * then try again. */ >> + if (pkt_batches[j]->packets[0]->md.ct_mark == 0) { > > Is it possible that this never converges? We should have a bail on the > number of times through this, I think. Otherwise, we'll keep iterating > on the same element over and over. > >> + xnanosleep(1000000); >> + aux->sleep_counter++; >> + j--; >> + continue; >> + } >> + ovs_assert(ct_state & CS_NEW); >> + } >> + >> + if (packet_offset != 0 && (ct_state & CS_ESTABLISHED) == 0) { >> + aux->conn_lookup_errors++; >> + continue; >> + } >> + >> + ovs_assert((ct_state & CS_INVALID) == 0); >> + ovs_assert(pkt_batches[j]->packets[0]->md.ct_mark != 0); >> + >> + } >> + ovsrcu_quiesce(); >> + } >> + >> + ovs_barrier_block(&barrier); >> + >> + for (i = 0; i < n_packets_per_conn; i++) { >> + for (j = 0; j < n_conns; j++) { >> + destroy_packets(pkt_batches_per_pkt[i][j]); >> + } >> + free(pkt_batches_per_pkt[i]); >> + } >> + free(pkt_batches_per_pkt); >> + >> + return NULL; >> +} >> + >> +static void >> +test_benchmark_tcp(struct ovs_cmdl_context *ctx) >> +{ >> + struct thread_aux *threads; >> + uint64_t sleep_counter = 0; >> + uint64_t lookup_errors = 0; >> + long long start; >> + unsigned i; >> + >> + fatal_signal_init(); >> + >> + /* Parse arguments */ >> + n_threads = strtoul(ctx->argv[1], NULL, 0); >> + if (!n_threads) { >> + ovs_fatal(0, "n_threads must be at least one"); >> + } >> + n_conns = strtoul(ctx->argv[2], NULL, 0); >> + n_zones = strtoul(ctx->argv[3], NULL, 0); >> + n_data_pkts = strtoul(ctx->argv[4], NULL, 0); >> + n_iterations = strtoul(ctx->argv[5], NULL, 0); > > maybe use str_to_ullong here so it gets validation? > >> + if (n_conns == 0 || n_zones == 0) { >> + ovs_fatal(0, "n_conns and n_zones must be at least 1"); >> + } >> + if (n_conns > UINT16_MAX) { > > NOTE: n_conns can be larger than uint16_max in test_benchmark_zones, so > having it as a global looks even stranger since the semantic is changing > based on the test. > >> + ovs_fatal(0, "n_conns may be at most %u", UINT16_MAX); >> + } >> + >> + threads = xcalloc(n_threads, sizeof *threads); >> + ovs_barrier_init(&barrier, n_threads + 1); >> + ct = conntrack_init(); >> + conntrack_set_sweep_interval(ct, 10); >> + >> + /* Create threads */ >> + start = time_msec(); >> + for (i = 0; i < n_threads; i++) { >> + threads[i].first = i == 0; >> + threads[i].tid = i; >> + threads[i].sleep_counter = 0; >> + threads[i].conn_lookup_errors = 0; >> + threads[i].thread = ovs_thread_create("ct_thread", >> ct_thread_tcp_main, >> + &threads[i]); >> + } >> + /* Starts the work inside the threads */ >> + ovs_barrier_block(&barrier); >> + printf("initial packet generation time: %lld ms\n", time_msec() - >> start); >> + start = time_msec(); >> + >> + /* Wait for the threads to finish the work */ >> + ovs_barrier_block(&barrier); >> + printf("conntrack: %5lld ms\n", time_msec() - start); >> + >> + for (i = 0; i < n_threads; i++) { >> + xpthread_join(threads[i].thread, NULL); >> + sleep_counter += threads[i].sleep_counter; >> + lookup_errors += threads[i].conn_lookup_errors; >> + } >> + >> + printf("total sleep to wait for conntrack space " >> + "(sum over all threads): %" PRIu64 " ms\n", sleep_counter); >> + if (lookup_errors) { >> + printf("total conntrack lookup errors. Probably the connection >> expired" >> + "because of a timeout policy: %" PRIu64 "\n", lookup_errors); >> + } >> + >> + conntrack_destroy(ct); >> + ovs_barrier_destroy(&barrier); >> + free(threads); >> +} >> + >> static void >> test_benchmark_zones(struct ovs_cmdl_context *ctx) >> { >> - unsigned long n_conns, n_zones, iterations; >> long long start; >> unsigned i, j; >> ovs_be16 dl_type; >> @@ -279,8 +531,8 @@ test_benchmark_zones(struct ovs_cmdl_context *ctx) >> if (n_zones == 0 || n_zones >= UINT16_MAX) { >> ovs_fatal(0, "n_zones must be between 1 and 2^16"); >> } >> - iterations = strtoul(ctx->argv[3], NULL, 0); >> - if (iterations == 0) { >> + n_iterations = strtoul(ctx->argv[3], NULL, 0); >> + if (n_iterations == 0) { >> ovs_fatal(0, "iterations must be greater than 0"); >> } >> >> @@ -289,6 +541,7 @@ test_benchmark_zones(struct ovs_cmdl_context *ctx) >> /* Create initial connection entries */ >> start = time_msec(); >> struct dp_packet_batch **pkt_batch = xzalloc(n_conns * sizeof >> *pkt_batch); >> + ovs_assert(n_conns > 0); > > This looks dead. Earlier in this function, we validate: > > if (n_conns == 0 || n_conns >= UINT32_MAX) { > ovs_fatal(0, "n_conns must be between 1 and 2^32"); > } > >> for (i = 0; i < n_conns; i++) { >> pkt_batch[i] = xzalloc(sizeof(struct dp_packet_batch)); >> dp_packet_batch_init(pkt_batch[i]); >> @@ -322,7 +575,7 @@ test_benchmark_zones(struct ovs_cmdl_context *ctx) >> stopwatch_create(STOPWATCH_FLUSH_FULL_ZONE, SW_US); >> stopwatch_create(STOPWATCH_FLUSH_EMPTY_ZONE, SW_US); >> start = time_msec(); >> - for (i = 0; i < iterations; i++) { >> + for (i = 0; i < n_iterations; i++) { >> /* Testing flushing a full zone */ >> stopwatch_start(STOPWATCH_FLUSH_FULL_ZONE, time_usec()); >> uint16_t zone = 1; >> @@ -597,6 +850,12 @@ static const struct ovs_cmdl_command commands[] = { >> * is rewritten to the SNAT target rather than causing a crash. */ >> {"ftp-alg-large-payload", "", 0, 0, >> test_ftp_alg_large_payload, OVS_RO}, >> + /* Starts 'n_threads' threads. Each thread will open 'n_conns' tcp >> + * connections across 'n_zones' separate conntrack zones. Each conn will >> + * send 'n_data_pkts' data packets within each connection. >> + * Each conn will run through conntrack 'iterations' times. */ >> + {"benchmark-tcp", "n_threads n_conns n_zones n_data_pkts iterations", >> 5, 5, >> + test_benchmark_tcp, OVS_RO}, >> >> {NULL, NULL, 0, 0, NULL, OVS_RO}, >> }; _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
