While looking at Viswa's example, I've found what seems to be a
problem using lots of QPs on mem-free HCAs. This could easily be an
mthca driver bug, but I'd appreciate it if Mellanox would take a look
and help track down the issue. I looked at the mthca code and don't
see anything wrong, so either narrowing down the software bug or
telling me it's actually a FW/HW bug would be great.
I'm attaching a fairly simple program that shows the problem on my
systems. It just creates a bunch of QPs and has one side send one
message from each QP. The other side waits for receives and sends a
reply back for every receive it gets. When all the replies are
received, it loops around and does it again.
To build the example, just do:
gcc -o rc-test rc-test.c -libverbs
To run, do
rc-test
on one system, and
rc-test <listening address>
on the other. In fact, I can reproduce the problem even on a single
system just with
rc-test &
rc-test localhost
On a system with a PCI-X HCA, this works perfectly. However, on a
system with Arbel HCAs (with mem-free FW 5.1.0), I get the following
output (going on forever):
local address: LID 0x0008
remote address: LID 0x0007
After 1.000066 sec, 104/4000 comps
After 2.000276 sec, 104/4000 comps
After 3.000295 sec, 104/4000 comps
After 4.000332 sec, 104/4000 comps
After 5.000375 sec, 104/4000 comps
which shows that only 104 out of the 4000 send/receive pairs ever
complete. On the other side I see the same number of completions. It
seems the HCA loses a bunch of doorbells, although IPoIB traffic
running in the background continues fine.
Viswa seems to have seen the same problem with Sinai & FW 1.0.1.
Let me know if you need more info.
Thanks,
Roland
/*
* Copyright (c) 2005 Cisco Systems. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <time.h>
#include <infiniband/verbs.h>
/* Compile-time sizing of the test. */
enum {
	NQP = 2000,	/* number of QPs (and CQs) created by each side */
	SZ  = 100	/* bytes per message slot in the work buffer */
};
/* System page size; main() fills it in via sysconf() and pp_init_ctx() uses it to align the work buffer. */
static int page_size;
/* TCP socket to the peer; set by the address-exchange functions and reused by main() for per-iteration sync bytes. -1 while unconnected. */
static int sockfd = -1;
/*
 * Everything one side of the test owns: the opened device, one PD,
 * one MR covering the work buffer, and NQP CQ/QP pairs.
 */
struct pingpong_context {
	struct ibv_context	*context;	/* opened device */
	struct ibv_pd		*pd;		/* protection domain for MR and QPs */
	struct ibv_mr		*mr;		/* registration of buf */
	struct ibv_cq		*cq[NQP];	/* cq[i] serves both send and recv of qp[i] */
	struct ibv_qp		*qp[NQP];
	void			*buf;		/* 2 * NQP * SZ bytes: recv slots, then send slots */
};
/* Addressing data for one QP, exchanged with the peer over TCP. */
struct pingpong_dest {
	int	lid;	/* port LID */
	int	qpn;	/* QP number */
	int	psn;	/* initial packet sequence number */
};
static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
{
struct ibv_port_attr attr;
if (ibv_query_port(ctx->context, port, &attr))
return 0;
return attr.lid;
}
static int pp_connect_ctx(struct pingpong_context *ctx, int port,
const struct pingpong_dest *my_dest,
const struct pingpong_dest *dest)
{
int i;
for (i = 0; i < NQP; ++i) {
struct ibv_qp_attr attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = IBV_MTU_1024,
.dest_qp_num = dest[i].qpn,
.rq_psn = dest[i].psn,
.max_dest_rd_atomic = 1,
.min_rnr_timer = 12,
.ah_attr = {
.is_global = 0,
.dlid = dest[i].lid,
.sl = 0,
.src_path_bits = 0,
.port_num = port
}
};
if (ibv_modify_qp(ctx->qp[i], &attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER)) {
fprintf(stderr, "Failed to modify QP[%d] to RTR\n", i);
return 1;
}
attr.qp_state = IBV_QPS_RTS;
attr.timeout = 16;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
attr.sq_psn = my_dest[i].psn;
attr.max_rd_atomic = 1;
if (ibv_modify_qp(ctx->qp[i], &attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC)) {
fprintf(stderr, "Failed to modify QP[%d] to RTS\n", i);
return 1;
}
}
return 0;
}
/*
 * Client side of the out-of-band address exchange.
 *
 * Connects a TCP socket to servername:port (stored in the file-scope
 * `sockfd`), sends the NQP local addresses in my_dest, reads back NQP
 * remote addresses, and finally sends a "done" message.
 *
 * Returns a malloc()ed array of NQP remote addresses that the caller
 * owns, or NULL on any failure. On failure after the socket has been
 * connected, the socket is closed and `sockfd` is reset to -1.
 */
static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port,
						 const struct pingpong_dest *my_dest)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char *service;
	char msg[sizeof "0000:000000:000000"];
	size_t got;
	ssize_t r;
	int n;
	int i;
	struct pingpong_dest *rem_dest = NULL;

	if (asprintf(&service, "%d", port) < 0) {
		fprintf(stderr, "Couldn't format service name for port %d\n", port);
		return NULL;
	}

	n = getaddrinfo(servername, service, &hints, &res);
	free(service);

	/* getaddrinfo() reports failure with any non-zero value. */
	if (n) {
		fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port);
		return NULL;
	}

	for (t = res; t; t = t->ai_next) {
		sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (sockfd >= 0) {
			if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
				break;
			close(sockfd);
			sockfd = -1;
		}
	}

	freeaddrinfo(res);

	if (sockfd < 0) {
		fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port);
		return NULL;
	}

	for (i = 0; i < NQP; ++i) {
		n = snprintf(msg, sizeof msg, "%04x:%06x:%06x",
			     (unsigned) my_dest[i].lid,
			     (unsigned) my_dest[i].qpn,
			     (unsigned) my_dest[i].psn);
		if (n < 0 || (size_t) n >= sizeof msg ||
		    write(sockfd, msg, sizeof msg) != sizeof msg) {
			fprintf(stderr, "Couldn't send local address\n");
			goto err;
		}
	}

	rem_dest = malloc(NQP * sizeof *rem_dest);
	if (!rem_dest)
		goto err;

	for (i = 0; i < NQP; ++i) {
		unsigned int lid, qpn, psn;

		/* A stream socket may deliver one fixed-size record in pieces. */
		got = 0;
		while (got < sizeof msg) {
			r = read(sockfd, msg + got, sizeof msg - got);
			if (r <= 0) {
				/* r == 0 means the peer closed; looping on it would never end. */
				if (r < 0)
					perror("client read");
				fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n",
					(int) got, (int) sizeof msg, i);
				goto err;
			}
			got += (size_t) r;
		}

		/* Never parse past the buffer, whatever the peer sent. */
		msg[sizeof msg - 1] = '\0';
		if (sscanf(msg, "%x:%x:%x", &lid, &qpn, &psn) != 3) {
			fprintf(stderr, "Couldn't parse remote address [%d]\n", i);
			goto err;
		}
		rem_dest[i].lid = (int) lid;
		rem_dest[i].qpn = (int) qpn;
		rem_dest[i].psn = (int) psn;
	}

	if (write(sockfd, "done", sizeof "done") != sizeof "done") {
		fprintf(stderr, "Couldn't send completion message\n");
		goto err;
	}

	return rem_dest;

err:
	/* Never hand a partially filled table back to the caller. */
	free(rem_dest);
	close(sockfd);
	sockfd = -1;
	return NULL;
}
/*
 * Server side of the out-of-band address exchange.
 *
 * Listens on `port`, accepts one TCP connection (stored in the
 * file-scope `sockfd`), reads NQP remote addresses, connects the local
 * QPs to them with pp_connect_ctx(), sends the NQP local addresses in
 * my_dest, and then waits for the client's trailing message.
 *
 * Returns a malloc()ed array of NQP remote addresses that the caller
 * owns, or NULL on any failure. On failure after accept(), the socket
 * is closed and `sockfd` is reset to -1.
 */
static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx,
						 int ib_port, int port,
						 const struct pingpong_dest *my_dest)
{
	struct addrinfo *res, *t;
	struct addrinfo hints = {
		.ai_flags    = AI_PASSIVE,
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM
	};
	char *service;
	char msg[sizeof "0000:000000:000000"];
	size_t got;
	ssize_t r;
	int n;
	int i;
	int listenfd = -1;
	struct pingpong_dest *rem_dest = NULL;

	if (asprintf(&service, "%d", port) < 0) {
		fprintf(stderr, "Couldn't format service name for port %d\n", port);
		return NULL;
	}

	n = getaddrinfo(NULL, service, &hints, &res);
	free(service);

	/* getaddrinfo() reports failure with any non-zero value. */
	if (n) {
		fprintf(stderr, "%s for port %d\n", gai_strerror(n), port);
		return NULL;
	}

	for (t = res; t; t = t->ai_next) {
		listenfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
		if (listenfd >= 0) {
			n = 1;

			/* Best effort: a failure here only affects quick restarts. */
			setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);

			if (!bind(listenfd, t->ai_addr, t->ai_addrlen))
				break;
			close(listenfd);
			listenfd = -1;
		}
	}

	freeaddrinfo(res);

	if (listenfd < 0) {
		fprintf(stderr, "Couldn't listen to port %d\n", port);
		return NULL;
	}

	if (listen(listenfd, 1)) {
		perror("listen");
		close(listenfd);
		return NULL;
	}

	sockfd = accept(listenfd, NULL, NULL);
	close(listenfd);
	if (sockfd < 0) {
		fprintf(stderr, "accept() failed\n");
		return NULL;
	}

	rem_dest = malloc(NQP * sizeof *rem_dest);
	if (!rem_dest)
		goto err;

	for (i = 0; i < NQP; ++i) {
		unsigned int lid, qpn, psn;

		/* A stream socket may deliver one fixed-size record in pieces. */
		got = 0;
		while (got < sizeof msg) {
			r = read(sockfd, msg + got, sizeof msg - got);
			if (r <= 0) {
				/* r == 0 means the peer closed; looping on it would never end. */
				if (r < 0)
					perror("server read");
				fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n",
					(int) got, (int) sizeof msg, i);
				goto err;
			}
			got += (size_t) r;
		}

		/* Never parse past the buffer, whatever the peer sent. */
		msg[sizeof msg - 1] = '\0';
		if (sscanf(msg, "%x:%x:%x", &lid, &qpn, &psn) != 3) {
			fprintf(stderr, "Couldn't parse remote address [%d]\n", i);
			goto err;
		}
		rem_dest[i].lid = (int) lid;
		rem_dest[i].qpn = (int) qpn;
		rem_dest[i].psn = (int) psn;
	}

	if (pp_connect_ctx(ctx, ib_port, my_dest, rem_dest)) {
		fprintf(stderr, "Couldn't connect to remote QP\n");
		goto err;
	}

	for (i = 0; i < NQP; ++i) {
		n = snprintf(msg, sizeof msg, "%04x:%06x:%06x",
			     (unsigned) my_dest[i].lid,
			     (unsigned) my_dest[i].qpn,
			     (unsigned) my_dest[i].psn);
		if (n < 0 || (size_t) n >= sizeof msg ||
		    write(sockfd, msg, sizeof msg) != sizeof msg) {
			fprintf(stderr, "Couldn't send local address\n");
			goto err;
		}
	}

	/* Block until the client sends its trailing message; its content is not inspected. */
	if (read(sockfd, msg, sizeof msg) <= 0) {
		fprintf(stderr, "Couldn't read completion message from client\n");
		goto err;
	}

	return rem_dest;

err:
	/* Never hand a partially filled table back to the caller. */
	free(rem_dest);
	close(sockfd);
	sockfd = -1;
	return NULL;
}
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int port)
{
struct pingpong_context *ctx;
int i;
ctx = malloc(sizeof *ctx);
if (!ctx)
return NULL;
ctx->buf = memalign(page_size, NQP * 2 * SZ);
if (!ctx->buf) {
fprintf(stderr, "Couldn't allocate work buf.\n");
return NULL;
}
memset(ctx->buf, 0, SZ * NQP);
memset(ctx->buf + SZ * NQP, 1, SZ * NQP);
ctx->context = ibv_open_device(ib_dev);
if (!ctx->context) {
fprintf(stderr, "Couldn't get context for %s\n",
ibv_get_device_name(ib_dev));
return NULL;
}
ctx->pd = ibv_alloc_pd(ctx->context);
if (!ctx->pd) {
fprintf(stderr, "Couldn't allocate PD\n");
return NULL;
}
ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, NQP * 2 * SZ, IBV_ACCESS_LOCAL_WRITE);
if (!ctx->mr) {
fprintf(stderr, "Couldn't allocate MR\n");
return NULL;
}
for (i = 0; i < NQP; ++i) {
ctx->cq[i] = ibv_create_cq(ctx->context, 2, NULL);
if (!ctx->cq[i]) {
fprintf(stderr, "Couldn't create CQ %d\n", i);
return NULL;
}
}
for (i = 0; i < NQP; ++i) {
struct ibv_qp_init_attr attr = {
.send_cq = ctx->cq[i],
.recv_cq = ctx->cq[i],
.cap = {
.max_send_wr = 2,
.max_send_sge = 1,
.max_recv_wr = 2,
.max_recv_sge = 1,
},
.qp_type = IBV_QPT_RC
};
ctx->qp[i] = ibv_create_qp(ctx->pd, &attr);
if (!ctx->qp[i]) {
fprintf(stderr, "Couldn't create QP[%d]\n", i);
return NULL;
}
}
for (i = 0; i < NQP; ++i) {
struct ibv_qp_attr attr;
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0;
attr.port_num = port;
attr.qp_access_flags = 0;
if (ibv_modify_qp(ctx->qp[i], &attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS)) {
fprintf(stderr, "Failed to modify QP[%d] to INIT\n", i);
return NULL;
}
}
return ctx;
}
static int pp_post_recv(struct pingpong_context *ctx)
{
struct ibv_sge list = {
.length = SZ,
.lkey = ctx->mr->lkey
};
struct ibv_recv_wr wr = {
.sg_list = &list,
.num_sge = 1,
};
struct ibv_recv_wr *bad_wr;
int i;
for (i = 0; i < NQP; ++i) {
list.addr = (uintptr_t) ctx->buf + i * SZ;
wr.wr_id = i;
if (ibv_post_recv(ctx->qp[i], &wr, &bad_wr))
break;
}
return i;
}
static inline int pp_post_send(struct pingpong_context *ctx)
{
struct ibv_sge list = {
.length = SZ,
.lkey = ctx->mr->lkey
};
struct ibv_send_wr wr = {
.sg_list = &list,
.num_sge = 1,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
};
struct ibv_send_wr *bad_wr;
int i;
for (i = 0; i < NQP; ++i) {
list.addr = (uintptr_t) ctx->buf + SZ * (NQP + i);
wr.wr_id = i;
if (ibv_post_send(ctx->qp[i], &wr, &bad_wr))
break;
}
return i;
}
static inline int pp_post_reply(struct pingpong_context *ctx, int n)
{
struct ibv_sge list = {
.length = SZ,
.lkey = ctx->mr->lkey,
.addr = (uintptr_t) ctx->buf + SZ * (NQP + n)
};
struct ibv_send_wr wr = {
.sg_list = &list,
.num_sge = 1,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
.wr_id = NQP + n
};
struct ibv_send_wr *bad_wr;
return ibv_post_send(ctx->qp[n], &wr, &bad_wr);
}
/* Print the command-line synopsis and option list to stdout. */
static void usage(const char *argv0)
{
	printf("Usage:\n"
	       " %s start a server and wait for connection\n"
	       " %s <host> connect to server at <host>\n"
	       "\n"
	       "Options:\n"
	       " -p, --port=<port> listen on/connect to port <port> (default 18515)\n"
	       " -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"
	       " -i, --ib-port=<port> use port <port> of IB device (default 1)\n",
	       argv0, argv0);
}
/*
 * Entry point of the many-QP RC test.
 *
 * With no positional argument the program acts as the server; with one
 * it acts as the client and connects to that host. After exchanging QP
 * addresses over TCP, both sides loop forever:
 *   - post one receive per QP,
 *   - exchange a one-byte TCP sync (server writes, client reads),
 *   - the client posts one send per QP; the server answers each
 *     receive completion with a reply on the same QP,
 *   - poll the CQs until every expected completion has arrived,
 *     printing progress about once a second.
 *
 * Returns 1 on any error; the loop itself never terminates normally.
 */
int main(int argc, char *argv[])
{
	struct dlist *dev_list;
	struct ibv_device *ib_dev;
	struct pingpong_context *ctx;
	struct pingpong_dest my_dest[NQP];
	struct pingpong_dest *rem_dest;
	struct timeval start, end;
	char *ib_devname = NULL;
	char *servername = NULL;
	int port = 18515;
	int ib_port = 1;
	int local_lid;
	int outs;
	int cnt;
	int i;
	/* 64-bit: microseconds since start overflow an int after ~35 minutes. */
	long long usec, tmp = 0;
	/* Only the arrival of the sync byte matters, but send a defined value. */
	char sync = 0;

	srand48(getpid() * time(NULL));

	while (1) {
		int c;
		static struct option long_options[] = {
			{ .name = "port",    .has_arg = 1, .val = 'p' },
			{ .name = "ib-dev",  .has_arg = 1, .val = 'd' },
			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
			{ 0 }
		};

		c = getopt_long(argc, argv, "p:d:i:s:q:r:n:e", long_options, NULL);
		if (c == -1)
			break;

		switch (c) {
		case 'p':
			port = strtol(optarg, NULL, 0);
			if (port < 0 || port > 65535) {
				usage(argv[0]);
				return 1;
			}
			break;

		case 'd':
			ib_devname = strdupa(optarg);
			break;

		case 'i':
			ib_port = strtol(optarg, NULL, 0);
			if (ib_port < 0) {
				usage(argv[0]);
				return 1;
			}
			break;

		default:
			usage(argv[0]);
			return 1;
		}
	}

	if (optind == argc - 1) {
		servername = strdupa(argv[optind]);
	} else if (optind < argc) {
		usage(argv[0]);
		return 1;
	}

	page_size = sysconf(_SC_PAGESIZE);

	dev_list = ibv_get_devices();
	if (!dev_list) {
		fprintf(stderr, "No IB devices found\n");
		return 1;
	}

	dlist_start(dev_list);
	if (!ib_devname) {
		ib_dev = dlist_next(dev_list);
		if (!ib_dev) {
			fprintf(stderr, "No IB devices found\n");
			return 1;
		}
	} else {
		dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
			if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
				break;
		if (!ib_dev) {
			fprintf(stderr, "IB device %s not found\n", ib_devname);
			return 1;
		}
	}

	ctx = pp_init_ctx(ib_dev, ib_port);
	if (!ctx)
		return 1;

	/* The LID belongs to the port, not the QP: query it once for all QPs. */
	local_lid = pp_get_local_lid(ctx, ib_port);
	if (!local_lid) {
		fprintf(stderr, "Couldn't get local LID\n");
		return 1;
	}

	for (i = 0; i < NQP; ++i) {
		my_dest[i].qpn = ctx->qp[i]->qp_num;
		my_dest[i].psn = lrand48() & 0xffffff;
		my_dest[i].lid = local_lid;
	}

	printf(" local address: LID 0x%04x\n", my_dest[0].lid);

	if (servername)
		rem_dest = pp_client_exch_dest(servername, port, my_dest);
	else
		rem_dest = pp_server_exch_dest(ctx, ib_port, port, my_dest);

	if (!rem_dest)
		return 1;

	printf(" remote address: LID 0x%04x\n", rem_dest[0].lid);

	/* The server already connected its QPs during the address exchange. */
	if (servername) {
		if (pp_connect_ctx(ctx, ib_port, my_dest, rem_dest))
			return 1;
	}

	if (gettimeofday(&start, NULL)) {
		perror("gettimeofday");
		return 1;
	}

	while (1) {
		outs = pp_post_recv(ctx);
		if (outs < NQP) {
			fprintf(stderr, "Couldn't post recvs (%d)\n", outs);
			return 1;
		}

		/*
		 * TCP sync: the client does not start sending until the
		 * server has posted its receives and written one byte.
		 */
		if (servername) {
			if (read(sockfd, &sync, 1) != 1) {
				perror("read");
				return 1;
			}

			outs += pp_post_send(ctx);
			if (outs < 2 * NQP) {
				fprintf(stderr, "Couldn't post sends (%d)\n", outs);
				return 1;
			}
		} else {
			if (write(sockfd, &sync, 1) != 1) {
				perror("write");
				return 1;
			}
		}

		cnt = 0;
		while (cnt < outs) {
			struct ibv_wc wc;
			int ne = 0;

			/* Scan the CQs in order and stop at the first one with a completion. */
			for (i = 0; i < NQP; ++i) {
				ne = ibv_poll_cq(ctx->cq[i], 1, &wc);
				if (ne < 0) {
					fprintf(stderr, "poll CQ failed %d\n", ne);
					return 1;
				}
				if (ne)
					break;
			}

			/* wc is only valid when a completion was actually returned. */
			if (ne && wc.status != IBV_WC_SUCCESS) {
				fprintf(stderr, "Failed status %d for wr_id %d (%d/%d done)\n",
					wc.status, (int) wc.wr_id, cnt, outs);
				return 1;
			}

			/* Server: a receive completion (id below NQP) triggers a reply on that QP. */
			if (!servername && ne && wc.wr_id < NQP) {
				if (pp_post_reply(ctx, i)) {
					fprintf(stderr, "Failed to send reply %d\n", i);
					return 1;
				}
				++outs;
			}

			cnt += ne;

			if (gettimeofday(&end, NULL)) {
				perror("gettimeofday");
				return 1;
			}

			usec = (end.tv_sec - start.tv_sec) * 1000000LL +
				(end.tv_usec - start.tv_usec);

			/* Progress report, at most once per second. */
			if (usec >= tmp + 1000000) {
				printf("After %lld.%06lld sec, %d/%d comps\n",
				       usec / 1000000, usec % 1000000, cnt, outs);
				tmp = usec;
			}
		}

		if (gettimeofday(&end, NULL)) {
			perror("gettimeofday");
			return 1;
		}

		usec = (end.tv_sec - start.tv_sec) * 1000000LL +
			(end.tv_usec - start.tv_usec);

		if (!servername) {
			cnt = 0;

			/* Debug dump of the first byte of each receive slot; disabled. */
			if (0) {
				printf("\nMemory contents:\n ");
				for (i = 0; i < NQP; ++i) {
					printf("%d", ((uint8_t *) ctx->buf)[i * SZ]);
					if ((i + 1) % 72 == 0 || i == NQP - 1)
						printf("\n ");
				}
				printf("\n");
			}

			/* Sum the first byte of every receive slot. */
			for (i = 0; i < NQP; ++i)
				cnt += ((uint8_t *) ctx->buf)[i * SZ];

			printf("After %lld.%06lld sec, %d comps (%d recvs)\n",
			       usec / 1000000, usec % 1000000, outs, cnt);
		} else {
			printf("After %lld.%06lld sec, %d comps\n",
			       usec / 1000000, usec % 1000000, cnt);
		}
	}

	return 0;
}
_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general