While looking at Viswa's example, I've found what seems to be a
problem using lots of QPs on mem-free HCAs.  This could easily be an
mthca driver bug, but I'd appreciate it if Mellanox would take a look
and help track down the issue.  I looked at the mthca code and don't
see anything wrong, so either narrowing down the software bug or
telling me it's actually a FW/HW bug would be great.

I'm attaching a fairly simple program that shows the problem on my
systems.  It just creates a bunch of QPs and has one side send one
message from each QP.  The other side waits for receives and sends a
reply back for every receive it gets.  When all the replies are
received, it loops around and does it again.

To build the example, just do:

   gcc -o rc-test rc-test.c -libverbs

To run, do

    rc-test

on one system, and

    rc-test <listening address>

on the other.  In fact, I can reproduce the problem even on a single
system just with

    rc-test &
    rc-test localhost

On a system with a PCI-X HCA, this works perfectly.  However, on a
system with Arbel HCAs (with mem-free FW 5.1.0), I get the following
output (going on forever):

      local address:  LID 0x0008
      remote address: LID 0x0007
    After 1.000066 sec, 104/4000 comps
    After 2.000276 sec, 104/4000 comps
    After 3.000295 sec, 104/4000 comps
    After 4.000332 sec, 104/4000 comps
    After 5.000375 sec, 104/4000 comps

which shows that only 104 out of the 4000 send/receive pairs ever
complete.  On the other side I see the same number of completions.  It
seems the HCA loses a bunch of doorbells, although IPoIB traffic
running in the background continues fine.

Viswa seems to have seen the same problem with Sinai & FW 1.0.1.

Let me know if you need more info.

Thanks,
  Roland

/*
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <time.h>

#include <infiniband/verbs.h>

/* Compile-time sizing of the test. */
enum {
	NQP = 2000,	/* number of QPs (and CQs) created on each side */
	SZ  = 100	/* bytes per message buffer / scatter-gather entry */
};

/* Set in main() from sysconf(_SC_PAGESIZE); used as the work buffer alignment. */
static int page_size;
/*
 * Connected TCP socket to the peer, or -1 if none.  Used to exchange QP
 * addresses and the one-byte per-iteration sync message.
 */
static int sockfd = -1;

/* All verbs resources used by one side of the test. */
struct pingpong_context {
	struct ibv_context *context;	/* opened device */
	struct ibv_pd      *pd;		/* protection domain shared by all QPs */
	struct ibv_mr      *mr;		/* registration covering all of buf */
	struct ibv_cq      *cq[NQP];	/* cq[i] is both send and recv CQ of qp[i] */
	struct ibv_qp      *qp[NQP];	/* RC queue pairs */
	/*
	 * NQP * 2 * SZ bytes: the first NQP slots of SZ bytes are receive
	 * buffers (zero-filled), the next NQP slots are send buffers
	 * (filled with 1).
	 */
	void               *buf;
};

/*
 * Address of one QP endpoint, exchanged with the peer over TCP as the
 * text "%04x:%06x:%06x" (lid:qpn:psn).
 */
struct pingpong_dest {
	int lid;	/* port LID */
	int qpn;	/* QP number */
	int psn;	/* initial packet sequence number */
};

static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
{
	struct ibv_port_attr attr;

	if (ibv_query_port(ctx->context, port, &attr))
		return 0;

	return attr.lid;
}

static int pp_connect_ctx(struct pingpong_context *ctx, int port,
			  const struct pingpong_dest *my_dest,
			  const struct pingpong_dest *dest)
{
	int i;

	for (i = 0; i < NQP; ++i) {
		struct ibv_qp_attr attr = {
			.qp_state		= IBV_QPS_RTR,
			.path_mtu		= IBV_MTU_1024,
			.dest_qp_num		= dest[i].qpn,
			.rq_psn 		= dest[i].psn,
			.max_dest_rd_atomic	= 1,
			.min_rnr_timer		= 12,
			.ah_attr		= {
				.is_global	= 0,
				.dlid		= dest[i].lid,
				.sl		= 0,
				.src_path_bits	= 0,
				.port_num	= port
			}
		};
		if (ibv_modify_qp(ctx->qp[i], &attr,
				  IBV_QP_STATE              |
				  IBV_QP_AV                 |
				  IBV_QP_PATH_MTU           |
				  IBV_QP_DEST_QPN           |
				  IBV_QP_RQ_PSN             |
				  IBV_QP_MAX_DEST_RD_ATOMIC |
				  IBV_QP_MIN_RNR_TIMER)) {
			fprintf(stderr, "Failed to modify QP[%d] to RTR\n", i);
			return 1;
		}

		attr.qp_state 	    = IBV_QPS_RTS;
		attr.timeout 	    = 16;
		attr.retry_cnt 	    = 7;
		attr.rnr_retry 	    = 7;
		attr.sq_psn 	    = my_dest[i].psn;
		attr.max_rd_atomic  = 1;
		if (ibv_modify_qp(ctx->qp[i], &attr,
				  IBV_QP_STATE              |
				  IBV_QP_TIMEOUT            |
				  IBV_QP_RETRY_CNT          |
				  IBV_QP_RNR_RETRY          |
				  IBV_QP_SQ_PSN             |
				  IBV_QP_MAX_QP_RD_ATOMIC)) {
			fprintf(stderr, "Failed to modify QP[%d] to RTS\n", i);
			return 1;
		}
	}

	return 0;
}

/*
 * Client side of the out-of-band address exchange.
 *
 * Connects a TCP socket (stored in the file-scope sockfd) to
 * servername:port, sends the NQP local QP addresses in my_dest, then
 * reads back NQP remote addresses and finally sends "done".
 *
 * Returns a malloc()ed array of NQP remote addresses owned by the
 * caller, or NULL on any failure.  The socket is left open on success
 * because main() keeps using it for the per-iteration sync byte.
 */
static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port,
						 const struct pingpong_dest *my_dest)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char *service;
	char msg[sizeof "0000:000000:000000"];
	int n;
	int i;
	struct pingpong_dest *rem_dest = NULL;

	if (asprintf(&service, "%d", port) < 0) {
		fprintf(stderr, "Couldn't format port %d\n", port);
		return NULL;
	}

	n = getaddrinfo(servername, service, &hints, &res);
	free(service);

	/* getaddrinfo() reports failure with any non-zero code. */
	if (n != 0) {
		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
		return NULL;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}

	freeaddrinfo(res);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
		return NULL;
	}

	/* Each address is sent as a fixed-size record including its NUL. */
	for (i = 0; i < NQP; ++i) {
		snprintf(msg, sizeof msg, "%04x:%06x:%06x",
			 (unsigned int) my_dest[i].lid,
			 (unsigned int) my_dest[i].qpn,
			 (unsigned int) my_dest[i].psn);
		if (write(sockfd, msg, sizeof msg) != (ssize_t) sizeof msg) {
			fprintf(stderr, "Couldn't send local address\n");
			return NULL;
		}
	}

	rem_dest = malloc(NQP * sizeof *rem_dest);
	if (!rem_dest)
		return NULL;

	for (i = 0; i < NQP; ++i) {
		unsigned int lid, qpn, psn;
		size_t got = 0;

		/* TCP may deliver a record in pieces; keep reading until full. */
		while (got < sizeof msg) {
			ssize_t r = read(sockfd, msg + got, sizeof msg - got);

			/* r == 0 is EOF: the peer went away, so stop instead of spinning. */
			if (r <= 0) {
				if (r < 0)
					perror("client read");
				fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n",
					(int) got, (int) sizeof msg, i);
				goto fail;
			}
			got += (size_t) r;
		}

		/* Never trust the peer to have NUL-terminated the record. */
		msg[sizeof msg - 1] = '\0';
		if (sscanf(msg, "%x:%x:%x", &lid, &qpn, &psn) != 3) {
			fprintf(stderr, "Couldn't parse remote address [%d]\n", i);
			goto fail;
		}
		rem_dest[i].lid = (int) lid;
		rem_dest[i].qpn = (int) qpn;
		rem_dest[i].psn = (int) psn;
	}

	/* Best-effort acknowledgement; the addresses are already complete. */
	if (write(sockfd, "done", sizeof "done") != (ssize_t) sizeof "done")
		fprintf(stderr, "Warning: couldn't send completion message\n");

	return rem_dest;

fail:
	/* Do not hand a partially filled table back as if it were valid. */
	free(rem_dest);
	return NULL;
}

/*
 * Server side of the out-of-band address exchange.
 *
 * Listens on TCP `port`, accepts one connection (stored in the
 * file-scope sockfd), reads the NQP remote QP addresses, connects the
 * local QPs to them with pp_connect_ctx() using IB port ib_port, sends
 * the NQP local addresses in my_dest, and then waits for the peer's
 * final message.
 *
 * Returns a malloc()ed array of NQP remote addresses owned by the
 * caller, or NULL on any failure.  The accepted socket is left open on
 * success because main() keeps using it for the per-iteration sync byte.
 */
static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx,
						 int ib_port, int port,
						 const struct pingpong_dest *my_dest)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char *service;
	char msg[sizeof "0000:000000:000000"];
	int n;
	int i;
	int listenfd = -1;
	struct pingpong_dest *rem_dest = NULL;

	if (asprintf(&service, "%d", port) < 0) {
		fprintf(stderr, "Couldn't format port %d\n", port);
		return NULL;
	}

	n = getaddrinfo(NULL, service, &hints, &res);
	free(service);

	/* getaddrinfo() reports failure with any non-zero code. */
	if (n != 0) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
		return NULL;
	}

	for (t = res; t; t = t->ai_next) {
		listenfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (listenfd >= 0) {
			n = 1;

			setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);

			if (!bind(listenfd, t->ai_addr, t->ai_addrlen))
				break;
			close(listenfd);
			listenfd = -1;
		}
	}

	freeaddrinfo(res);

	if (listenfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return NULL;
	}

	if (listen(listenfd, 1)) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		close(listenfd);
		return NULL;
	}

	sockfd = accept(listenfd, NULL, NULL);
	close(listenfd);
	if (sockfd < 0) {
		fprintf(stderr, "accept() failed\n");
		return NULL;
	}

	rem_dest = malloc(NQP * sizeof *rem_dest);
	if (!rem_dest)
		return NULL;

	for (i = 0; i < NQP; ++i) {
		unsigned int lid, qpn, psn;
		size_t got = 0;

		/* TCP may deliver a record in pieces; keep reading until full. */
		while (got < sizeof msg) {
			ssize_t r = read(sockfd, msg + got, sizeof msg - got);

			/* r == 0 is EOF: the peer went away, so stop instead of spinning. */
			if (r <= 0) {
				if (r < 0)
					perror("server read");
				fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n",
					(int) got, (int) sizeof msg, i);
				goto fail;
			}
			got += (size_t) r;
		}

		/* Never trust the peer to have NUL-terminated the record. */
		msg[sizeof msg - 1] = '\0';
		if (sscanf(msg, "%x:%x:%x", &lid, &qpn, &psn) != 3) {
			fprintf(stderr, "Couldn't parse remote address [%d]\n", i);
			goto fail;
		}
		rem_dest[i].lid = (int) lid;
		rem_dest[i].qpn = (int) qpn;
		rem_dest[i].psn = (int) psn;
	}

	/*
	 * Connect the local QPs before replying, so they are ready to
	 * receive by the time the client learns our addresses.
	 */
	if (pp_connect_ctx(ctx, ib_port, my_dest, rem_dest)) {
		fprintf(stderr, "Couldn't connect to remote QP\n");
		goto fail;
	}

	for (i = 0; i < NQP; ++i) {
		snprintf(msg, sizeof msg, "%04x:%06x:%06x",
			 (unsigned int) my_dest[i].lid,
			 (unsigned int) my_dest[i].qpn,
			 (unsigned int) my_dest[i].psn);
		if (write(sockfd, msg, sizeof msg) != (ssize_t) sizeof msg) {
			fprintf(stderr, "Couldn't send local address\n");
			goto fail;
		}
	}

	/* Best-effort wait for the client's final message. */
	if (read(sockfd, msg, sizeof msg) <= 0)
		fprintf(stderr, "Warning: couldn't read completion message\n");

	return rem_dest;

fail:
	/* Do not hand a partially filled table back as if it were valid. */
	free(rem_dest);
	return NULL;
}

static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int port)
{
	struct pingpong_context *ctx;
	int i;

	ctx = malloc(sizeof *ctx);
	if (!ctx)
		return NULL;

	ctx->buf = memalign(page_size, NQP * 2 * SZ);
	if (!ctx->buf) {
		fprintf(stderr, "Couldn't allocate work buf.\n");
		return NULL;
	}

	memset(ctx->buf,            0, SZ * NQP);
	memset(ctx->buf + SZ * NQP, 1, SZ * NQP);

	ctx->context = ibv_open_device(ib_dev);
	if (!ctx->context) {
		fprintf(stderr, "Couldn't get context for %s\n",
			ibv_get_device_name(ib_dev));
		return NULL;
	}

	ctx->pd = ibv_alloc_pd(ctx->context);
	if (!ctx->pd) {
		fprintf(stderr, "Couldn't allocate PD\n");
		return NULL;
	}

	ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, NQP * 2 * SZ, IBV_ACCESS_LOCAL_WRITE);
	if (!ctx->mr) {
		fprintf(stderr, "Couldn't allocate MR\n");
		return NULL;
	}

	for (i = 0; i < NQP; ++i) {
		ctx->cq[i] = ibv_create_cq(ctx->context, 2, NULL);
		if (!ctx->cq[i]) {
			fprintf(stderr, "Couldn't create CQ %d\n", i);
			return NULL;
		}
	}

	for (i = 0; i < NQP; ++i) {
		struct ibv_qp_init_attr attr = {
			.send_cq = ctx->cq[i],
			.recv_cq = ctx->cq[i],
			.cap     = {
				.max_send_wr  = 2,
				.max_send_sge = 1,
				.max_recv_wr  = 2,
				.max_recv_sge = 1,
			},
			.qp_type = IBV_QPT_RC
		};

		ctx->qp[i] = ibv_create_qp(ctx->pd, &attr);
		if (!ctx->qp[i])  {
			fprintf(stderr, "Couldn't create QP[%d]\n", i);
			return NULL;
		}
	}

	for (i = 0; i < NQP; ++i) {
		struct ibv_qp_attr attr;

		attr.qp_state        = IBV_QPS_INIT;
		attr.pkey_index      = 0;
		attr.port_num        = port;
		attr.qp_access_flags = 0;

		if (ibv_modify_qp(ctx->qp[i], &attr,
				  IBV_QP_STATE              |
				  IBV_QP_PKEY_INDEX         |
				  IBV_QP_PORT               |
				  IBV_QP_ACCESS_FLAGS)) {
			fprintf(stderr, "Failed to modify QP[%d] to INIT\n", i);
			return NULL;
		}
	}

	return ctx;
}

static int pp_post_recv(struct pingpong_context *ctx)
{
	struct ibv_sge list = {
		.length = SZ,
		.lkey 	= ctx->mr->lkey
	};
	struct ibv_recv_wr wr = {
		.sg_list    = &list,
		.num_sge    = 1,
	};
	struct ibv_recv_wr *bad_wr;
	int i;

	for (i = 0; i < NQP; ++i) {
		list.addr = (uintptr_t) ctx->buf + i * SZ;
		wr.wr_id  = i;
		if (ibv_post_recv(ctx->qp[i], &wr, &bad_wr))
			break;
	}

	return i;
}

static inline int pp_post_send(struct pingpong_context *ctx)
{
	struct ibv_sge list = {
		.length = SZ,
		.lkey 	= ctx->mr->lkey
	};
	struct ibv_send_wr wr = {
		.sg_list    = &list,
		.num_sge    = 1,
		.opcode     = IBV_WR_SEND,
		.send_flags = IBV_SEND_SIGNALED,
	};
	struct ibv_send_wr *bad_wr;
	int i;

	for (i = 0; i < NQP; ++i) {
		list.addr = (uintptr_t) ctx->buf + SZ * (NQP + i);
		wr.wr_id  = i;
		if (ibv_post_send(ctx->qp[i], &wr, &bad_wr))
			break;
	}

	return i;
}

static inline int pp_post_reply(struct pingpong_context *ctx, int n)
{
	struct ibv_sge list = {
		.length = SZ,
		.lkey 	= ctx->mr->lkey,
		.addr   = (uintptr_t) ctx->buf + SZ * (NQP + n)
	};
	struct ibv_send_wr wr = {
		.sg_list    = &list,
		.num_sge    = 1,
		.opcode     = IBV_WR_SEND,
		.send_flags = IBV_SEND_SIGNALED,
		.wr_id      = NQP + n
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(ctx->qp[n], &wr, &bad_wr);
}

/* Print the command-line synopsis and option list to stdout. */
static void usage(const char *argv0)
{
	printf("Usage:\n"
	       "  %s            start a server and wait for connection\n"
	       "  %s <host>     connect to server at <host>\n"
	       "\n",
	       argv0, argv0);
	fputs("Options:\n"
	      "  -p, --port=<port>      listen on/connect to port <port> (default 18515)\n"
	      "  -d, --ib-dev=<dev>     use IB device <dev> (default first device found)\n"
	      "  -i, --ib-port=<port>   use port <port> of IB device (default 1)\n",
	      stdout);
}

/*
 * Test driver.
 *
 * Parses the options, picks an IB device, creates NQP RC QPs, and
 * exchanges addresses with the peer over TCP.  With a <host> argument
 * this process is the client, otherwise it is the server.
 *
 * Each iteration of the endless outer loop:
 *   - both sides post one receive per QP;
 *   - the server writes a one-byte sync message; the client waits for
 *     it and then posts one send per QP;
 *   - both sides poll their CQs until every outstanding work request
 *     has completed; the server posts a reply send on QP i for each
 *     receive completion it sees on CQ i.
 * Progress is printed about once per second while polling.
 *
 * Returns 1 on any setup or runtime error; never returns otherwise.
 */
int main(int argc, char *argv[])
{
	struct dlist 	  	*dev_list;
	struct ibv_device 	*ib_dev;
	struct pingpong_context *ctx;
	struct pingpong_dest     my_dest[NQP];
	struct pingpong_dest    *rem_dest;
	struct timeval           start, end;
	char                    *ib_devname = NULL;
	char                    *servername = NULL;
	int                      port = 18515;
	int                      ib_port = 1;
	int                      outs;
	int                      cnt;
	int                      i;
	/* 64-bit so the microsecond counter doesn't overflow after ~35 minutes. */
	long long		 usec, tmp = 0;
	/* Only the arrival of the byte matters, but don't send garbage. */
	char			 sync = 0;

	srand48(getpid() * time(NULL));

	while (1) {
		int c;

		static struct option long_options[] = {
			{ .name = "port",     .has_arg = 1, .val = 'p' },
			{ .name = "ib-dev",   .has_arg = 1, .val = 'd' },
			{ .name = "ib-port",  .has_arg = 1, .val = 'i' },
			{ 0 }
		};

		c = getopt_long(argc, argv, "p:d:i:s:q:r:n:e", long_options, NULL);
		if (c == -1)
			break;

		switch (c) {
		case 'p':
			port = strtol(optarg, NULL, 0);
			if (port < 0 || port > 65535) {
				usage(argv[0]);
				return 1;
			}
			break;

		case 'd':
			ib_devname = strdupa(optarg);
			break;

		case 'i':
			ib_port = strtol(optarg, NULL, 0);
			if (ib_port < 0) {
				usage(argv[0]);
				return 1;
			}
			break;

		default:
			usage(argv[0]);
			return 1;
		}
	}

	/* At most one positional argument: the server to connect to. */
	if (optind == argc - 1)
		servername = strdupa(argv[optind]);
	else if (optind < argc) {
		usage(argv[0]);
		return 1;
	}

	page_size = sysconf(_SC_PAGESIZE);

	dev_list = ibv_get_devices();
	if (!dev_list) {
		fprintf(stderr, "No IB devices found\n");
		return 1;
	}

	dlist_start(dev_list);
	if (!ib_devname) {
		ib_dev = dlist_next(dev_list);
		if (!ib_dev) {
			fprintf(stderr, "No IB devices found\n");
			return 1;
		}
	} else {
		dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
				break;
		if (!ib_dev) {
			fprintf(stderr, "IB device %s not found\n", ib_devname);
			return 1;
		}
	}

	ctx = pp_init_ctx(ib_dev, ib_port);
	if (!ctx)
		return 1;

	for (i = 0; i < NQP; ++i) {
		my_dest[i].qpn = ctx->qp[i]->qp_num;
		my_dest[i].psn = lrand48() & 0xffffff;
		my_dest[i].lid = pp_get_local_lid(ctx, ib_port);
		if (!my_dest[i].lid) {
			fprintf(stderr, "Couldn't get local LID\n");
			return 1;
		}
	}

	printf("  local address:  LID 0x%04x\n",  my_dest[0].lid);

	if (servername)
		rem_dest = pp_client_exch_dest(servername, port, my_dest);
	else
		rem_dest = pp_server_exch_dest(ctx, ib_port, port, my_dest);

	if (!rem_dest)
		return 1;

	printf("  remote address: LID 0x%04x\n", rem_dest[0].lid);

	/* The server already connected its QPs inside pp_server_exch_dest(). */
	if (servername)
		if (pp_connect_ctx(ctx, ib_port, my_dest, rem_dest))
			return 1;

	if (gettimeofday(&start, NULL)) {
		perror("gettimeofday");
		return 1;
	}

	while (1) {
		outs = pp_post_recv(ctx);

		if (outs < NQP) {
			fprintf(stderr, "Couldn't post recvs (%d)\n", outs);
			return 1;
		}

		if (servername) {
			/* Wait until the server has posted its receives. */
			if (read(sockfd, &sync, 1) != 1) {
				perror("read");
				return 1;
			}

			outs += pp_post_send(ctx);

			if (outs < 2 * NQP) {
				fprintf(stderr, "Couldn't post sends (%d)\n", outs);
				return 1;
			}
		} else {
			if (write(sockfd, &sync, 1) != 1) {
				perror("write");
				return 1;
			}
		}


		cnt = 0;
		while (cnt < outs) {
			struct ibv_wc wc;
			int ne = 0;

			/* Take at most one completion from the first CQ that has one. */
			for (i = 0; i < NQP; ++i) {
				ne = ibv_poll_cq(ctx->cq[i], 1, &wc);
				if (ne < 0) {
					fprintf(stderr, "poll CQ failed %d\n", ne);
					return 1;
				}
				if (ne)
					break;
			}

			/*
			 * wc is only filled in when a completion was returned;
			 * with ne == 0 its contents are indeterminate and must
			 * not be inspected.
			 */
			if (ne && wc.status != IBV_WC_SUCCESS) {
				fprintf(stderr, "Failed status %d for wr_id %d (%d/%d done)\n",
					wc.status, (int) wc.wr_id, cnt, outs);
				return 1;
			}

			/*
			 * Server: wr_id < NQP marks a receive completion, so
			 * answer on the same QP and expect one more completion.
			 */
			if (!servername && ne && wc.wr_id < NQP) {
				if (pp_post_reply(ctx, i)) {
					fprintf(stderr, "Failed to send reply %d\n", i);
					return 1;
				}
				++outs;
			}

			cnt += ne;

			if (gettimeofday(&end, NULL)) {
				perror("gettimeofday");
				return 1;
			}

			usec = (long long) (end.tv_sec - start.tv_sec) * 1000000 +
				(end.tv_usec - start.tv_usec);
			/* Progress report, at most about once per second. */
			if (usec >= tmp + 1000000) {
				printf("After %lld.%06lld sec, %d/%d comps\n",
				       usec / 1000000, usec % 1000000, cnt, outs);
				tmp = usec;
			}
		}

		if (gettimeofday(&end, NULL)) {
			perror("gettimeofday");
			return 1;
		}

		usec = (long long) (end.tv_sec - start.tv_sec) * 1000000 +
			(end.tv_usec - start.tv_usec);

		if (!servername) {
			cnt = 0;
			/* Debug dump of the first byte of each receive slot; disabled. */
			if (0) {
				printf("\nMemory contents:\n  ");
				for (i = 0; i < NQP; ++i) {
					printf("%d", ((uint8_t *) ctx->buf)[i * SZ]);
					if ((i + 1) % 72 == 0 || i == NQP - 1)
						printf("\n  ");
				}
				printf("\n");
			}

			/*
			 * Receive slots start out as 0 and the send slots are
			 * filled with 1, so summing the first byte of each
			 * receive slot counts the slots that received data.
			 */
			for (i = 0; i < NQP; ++i)
				cnt += ((uint8_t *) ctx->buf)[i * SZ];

			printf("After %lld.%06lld sec, %d comps (%d recvs)\n",
			       usec / 1000000, usec % 1000000, outs, cnt);
		} else
			printf("After %lld.%06lld sec, %d comps\n",
			       usec / 1000000, usec % 1000000, cnt);
	}

	return 0;
}
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to