Hi again,
We understand that our last issue report was a bit minimalistic.
We hope that this report will be of more interest. If you have ANY
questions, please send me an email and I will answer!
We have run into a strange problem:
* RDMA(ibv_post_send) fails when using malloc in a special way
* We have an issue which we have reduced down to the enclosed
program. It is not neat, but it is able to demonstrate what we think
is a problem with either ibverbs, libc, or the kernel
* The issue is related to the sending process
* Receiver sees incorrect data
* The receiver will use "private" receive buffers for all RDMAs,
i.e., each RDMA "put" will place memory into a distinct receive
memory area
* The sender will re-use the memory area
* To trigger the problem, we need a malloc() that attempts to allocate
a huge amount of memory, but which fails. Without this failing
malloc(), everything is OK. Please note that malloc changes its
allocation policy after this failing malloc (see below), and this
behavior is what we observed in a pthreads program where we first
discovered the issue. It must also be noted that if we allocate
buffers with malloc instead of valloc it works fine...
* Someone reviewing this would probably say: "The problem comes from
potential munmap()+mmap() or an mremap()". We acknowledge that the
failing program is vulnerable in this context, but strace does not
reveal any such change in virtual to physical mapping. (And be
aware, this is a stripped down example of a much more complicated
scenario)
* We are concerned that the call to ibv_reg_mr() does not imply a
call to madvise()
* We have tested with
rhel4.6,
kernel: 2.6.9-67.ELsmp, x86_64
libibverbs-1.1.1-1.ofed1.3.1,
Mellanox Technologies MT23108 InfiniHost (rev a1)
and with:
rhel5.2,
kernel: 2.6.18-92.el5, x86_64
libibverbs-1.1.2-1.ofed1.4.rc6,
Mellanox Technologies MT25418 (rev a0)
Here is the special malloc behaviour:
[ 3afb6c40bc] mmap(NULL, 39999000576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
[ 3afb6bfb0a] brk(0x95073b000) = 0x526000
[ 3afb6c40bc] mmap(NULL, 39999135744, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
[ 3afb6c40bc] mmap(NULL, 2097152, PROT_NONE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x46e8632000
[ 3afb6c40e9] munmap(0x46e8632000, 843776) = 0
[ 3afb6c40e9] munmap(0x46e8800000, 204800) = 0
[ 3afb6c4119] mprotect(0x46e8700000, 135168, PROT_READ|PROT_WRITE) = 0
[ 3afb6c40bc] mmap(NULL, 39999000576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
Please note the mmap() returning 0x46e8632000. It uses PROT_NONE and
MAP_NORESERVE. We are not sure if this is related. The RDMA send buffer
that fails comes from this mmap.
Here is a shortform description of the way we allocate/free buffers:
* Buffers are allocated using valloc
* Buffers are registered using ibv_reg_mr if they are not already registered
* Buffers are initiated with unique data
* Data is copied to receiver with ibv_post_send
* We wait with ibv_poll_cq
* Buffers are freed using free
* When we start getting the same buffer addresses from valloc and
therefore do not register the memory again, the data becomes wrong on
the receiver side. We get partial data from the previous buffer.
strace/ltrace are available if anyone is interested.
/*
* Copyright (c) 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
* Copyright (c) 2005 Hewlett Packard, Inc (Grant Grundler)
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* $Id$
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif /* HAVE_CONFIG_H */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netdb.h>
#include <malloc.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <byteswap.h>
#include <time.h>
#include <assert.h>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include "get_clock.h"
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* Payload size constants. Neither macro is referenced by the code visible in
 * this file; send_buffer() hard-codes 40000 instead. */
#define PMPI_SIZE 40000
#define PMPI_BUFFER_SIZE (40000+8192)
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* Work-request id placed in every RDMA write posted by this program. */
#define PINGPONG_RDMA_WRID 3
/* Default for inline_size below. */
#define MAX_INLINE 400
/* Max inline data requested for the QP; overridden by -I / --inline_size. */
static int inline_size = MAX_INLINE;
/* System page size, set from sysconf(_SC_PAGESIZE) in main(); used as the
 * alignment of the work buffer in pp_init_ctx(). */
static int page_size;
/* Process id, set in main(); used as a prefix in diagnostic messages. */
static pid_t pid;
/* Reporting switches filled in by main() from -U, -H and -C.
 * No code visible in this file reads them back. */
struct report_options {
int unsorted;
int histogram;
int cycles; /* report delta's in cycles, not microsec's */
};
/* Per-process verbs state created by pp_init_ctx(). */
struct pingpong_context {
struct ibv_context *context; /* device handle from ibv_open_device() */
struct ibv_pd *pd; /* protection domain for mr, qp and the sender's extra MRs */
struct ibv_mr *mr; /* registration covering buf (local + remote write) */
struct ibv_cq *rcq; /* receive CQ (depth 1) */
struct ibv_cq *scq; /* send CQ (depth tx_depth); polled by post_send_and_poll() */
struct ibv_qp *qp; /* RC queue pair */
void *buf; /* page-aligned work buffer of `size` bytes, zeroed at init */
volatile char *post_buf; /* == buf; source byte of the "data ready" write */
volatile char *poll_buf; /* == buf + 4; byte the receiver spins on in main() */
int size; /* size of buf in bytes */
int tx_depth; /* copy of pp_data.tx_depth */
struct ibv_sge list; /* SGE for the "data ready" write, filled in main() */
struct ibv_send_wr wr; /* work request for the "data ready" write, filled in main() */
};
/* Connection parameters of one endpoint, exchanged as text over the TCP
 * socket by pp_write_keys() / pp_read_keys(). */
struct pingpong_dest {
int lid; /* port LID */
int qpn; /* queue pair number */
int psn; /* initial packet sequence number */
unsigned rkey; /* rkey of the endpoint's work buffer MR */
unsigned long long vaddr; /* virtual address of the endpoint's work buffer */
};
/* Command-line configuration plus connection bookkeeping shared by the
 * setup helpers. */
struct pp_data {
int port; /* TCP port used for the out-of-band key exchange (-p) */
int ib_port; /* IB device port number (-i) */
unsigned size; /* -s value; parsed but not used by the code visible here */
int tx_depth; /* send queue / send CQ depth (-t) */
int use_cma; /* set by -c; not read by the code visible here */
int sockfd; /* connected TCP socket, set by pp_*_connect() */
char *servername; /* peer host name; NULL when running as the TCP server */
struct pingpong_dest my_dest; /* local endpoint parameters */
struct pingpong_dest *rem_dest; /* peer parameters, (re)allocated by pp_*_exch_dest() */
struct ibv_device *ib_dev; /* device chosen by pp_find_dev() */
struct rdma_event_channel *cm_channel; /* initialised to NULL in main(); otherwise unused here */
struct rdma_cm_id *cm_id; /* initialised to NULL in main(); otherwise unused here */
};
/* Forward declaration: the connect helpers below call pp_init_ctx() before
 * its definition. The first argument is the struct ibv_device * to open. */
static struct pingpong_context *pp_init_ctx(void *, struct pp_data *);
static uint16_t pp_get_local_lid(struct pingpong_context *ctx, int port)
{
struct ibv_port_attr attr;
if (ibv_query_port(ctx->context, port, &attr))
return 0;
return attr.lid;
}
/*
 * Select the IB device to use.
 *
 * ib_devname: device name to look for, or NULL to take the first device.
 * Returns the matching device, or NULL (after printing a diagnostic) when
 * the device list cannot be obtained, is empty, or has no matching entry.
 *
 * The device list is deliberately not released here: the returned pointer
 * refers into it and is opened later by pp_init_ctx().
 */
static struct ibv_device *pp_find_dev(const char *ib_devname)
{
    struct ibv_device **dev_list;
    struct ibv_device *ib_dev = NULL;

    dev_list = ibv_get_device_list(NULL);
    if (!dev_list) {
        /* ibv_get_device_list() returns NULL on failure; the original
         * dereferenced it unconditionally. */
        fprintf(stderr, "No IB devices found\n");
        return NULL;
    }

    if (!ib_devname) {
        ib_dev = dev_list[0];
        if (!ib_dev)
            fprintf(stderr, "No IB devices found\n");
    } else {
        /* The list is NULL-terminated, so ib_dev ends up NULL when no
         * entry matches. */
        for (; (ib_dev = *dev_list); ++dev_list) {
            if (!strcmp(ibv_get_device_name(ib_dev), ib_devname))
                break;
        }
        if (!ib_dev)
            fprintf(stderr, "IB device %s not found\n", ib_devname);
    }
    return ib_dev;
}
/* Wire format of one endpoint description: lid:qpn:psn:rkey:vaddr in hex.
 * KEY_MSG_SIZE is the sizeof of a template string literal, so it includes
 * the terminating NUL. */
#define KEY_MSG_SIZE (sizeof "0000:000000:000000:00000000:0000000000000000")
/* NOTE(review): "%Lx" is used for an unsigned long long argument; standard C
 * spells that "%llx" -- confirm the target libc accepts "L" here. */
#define KEY_PRINT_FMT "%04x:%06x:%06x:%08x:%016Lx"
/*
 * Format the local endpoint description and send it over the TCP socket.
 *
 * sockfd:  connected socket.
 * my_dest: endpoint parameters to send.
 * Returns 0 on success, -1 if the message cannot be formatted into
 * KEY_MSG_SIZE bytes or the full message could not be written.
 */
static int pp_write_keys(int sockfd, const struct pingpong_dest *my_dest)
{
    /* Zero-filled so that no uninitialised stack bytes go on the wire when
     * the formatted text is shorter than the buffer. */
    char msg[KEY_MSG_SIZE] = { 0 };
    int len;

    /* Bounded formatting: an out-of-range field (e.g. a negative int) would
     * expand past its nominal width and overflow msg with sprintf(). */
    len = snprintf(msg, sizeof msg, KEY_PRINT_FMT, my_dest->lid, my_dest->qpn,
                   my_dest->psn, my_dest->rkey, my_dest->vaddr);
    if (len < 0 || (size_t)len >= sizeof msg) {
        fprintf(stderr, "Couldn't format local address\n");
        return -1;
    }

    /* The whole fixed-size record, including the NUL, is sent. */
    if (write(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
        perror("client write");
        fprintf(stderr, "Couldn't send local address\n");
        return -1;
    }
    return 0;
}
/*
 * Receive the peer's endpoint description from the TCP socket and parse it.
 *
 * sockfd:   connected socket.
 * my_dest:  unused; kept so existing callers need no change.
 * rem_dest: filled with the parsed lid, qpn, psn, rkey and vaddr.
 * Returns 0 on success, -1 on a short read or a parse failure.
 */
static int pp_read_keys(int sockfd, const struct pingpong_dest *my_dest,
                        struct pingpong_dest *rem_dest)
{
    int parsed;
    char msg[KEY_MSG_SIZE];

    (void)my_dest;

    if (read(sockfd, msg, sizeof msg) != (ssize_t)sizeof msg) {
        perror("pp_read_keys");
        fprintf(stderr, "Couldn't read remote address\n");
        return -1;
    }
    /* The bytes come from the peer: never let sscanf() run past the buffer
     * if the last byte is not the expected NUL. */
    msg[sizeof msg - 1] = '\0';

    parsed = sscanf(msg, KEY_PRINT_FMT, &rem_dest->lid, &rem_dest->qpn,
                    &rem_dest->psn, &rem_dest->rkey, &rem_dest->vaddr);
    if (parsed != 5) {
        fprintf(stderr, "Couldn't parse line <%.*s>\n",
                (int)sizeof msg, msg);
        return -1;
    }
    return 0;
}
/*
 * Client side of the out-of-band setup: connect a TCP socket to
 * data->servername:data->port, then create the verbs context.
 *
 * On success the connected socket is stored in data->sockfd and the new
 * context is returned. On failure NULL is returned and no socket, address
 * list or temporary string is left allocated.
 */
static struct pingpong_context *pp_client_connect(struct pp_data *data)
{
    struct addrinfo *res = NULL, *t;
    struct addrinfo hints = {
        .ai_family   = AF_UNSPEC,
        .ai_socktype = SOCK_STREAM
    };
    char *service = NULL;
    int n;
    int sockfd = -1;
    struct pingpong_context *ctx;

    if (asprintf(&service, "%d", data->port) < 0)
        return NULL;

    n = getaddrinfo(data->servername, service, &hints, &res);
    free(service);          /* only needed for the lookup */
    if (n != 0) {           /* any non-zero return is an error */
        fprintf(stderr, "%d:%s: %s for %s:%d\n",
                pid, __func__, gai_strerror(n),
                data->servername, data->port);
        return NULL;
    }

    /* Try each resolved address until one connects. */
    for (t = res; t; t = t->ai_next) {
        sockfd = socket(t->ai_family, t->ai_socktype,
                        t->ai_protocol);
        if (sockfd >= 0) {
            if (!connect(sockfd, t->ai_addr, t->ai_addrlen))
                break;
            close(sockfd);
            sockfd = -1;
        }
    }
    freeaddrinfo(res);

    if (sockfd < 0) {
        fprintf(stderr, "%d:%s: Couldn't connect to %s:%d\n",
                pid, __func__, data->servername, data->port);
        return NULL;
    }

    ctx = pp_init_ctx(data->ib_dev, data);
    if (!ctx) {
        close(sockfd);      /* was leaked on this path */
        return NULL;
    }
    data->sockfd = sockfd;
    return ctx;
}
static int pp_client_exch_dest(struct pp_data *data)
{
if (data->rem_dest != NULL)
free(data->rem_dest);
data->rem_dest = malloc(sizeof *data->rem_dest);
if (!data->rem_dest)
return -1;
if (pp_write_keys(data->sockfd, &data->my_dest))
return -1;
return pp_read_keys(data->sockfd, &data->my_dest, data->rem_dest);
}
/*
 * Server side of the out-of-band setup: listen on data->port, accept one
 * TCP connection, then create the verbs context.
 *
 * On success the accepted socket is stored in data->sockfd and the new
 * context is returned. On failure NULL is returned and no socket, address
 * list or temporary string is left allocated.
 */
static struct pingpong_context *pp_server_connect(struct pp_data *data)
{
    struct addrinfo *res = NULL, *t;
    struct addrinfo hints = {
        .ai_flags    = AI_PASSIVE,
        .ai_family   = AF_UNSPEC,
        .ai_socktype = SOCK_STREAM
    };
    char *service = NULL;
    int sockfd = -1, connfd;
    int n;
    struct pingpong_context *ctx;

    if (asprintf(&service, "%d", data->port) < 0)
        return NULL;

    n = getaddrinfo(NULL, service, &hints, &res);
    free(service);          /* only needed for the lookup */
    if (n != 0) {           /* any non-zero return is an error */
        fprintf(stderr, "%d:%s: %s for port %d\n", pid, __func__,
                gai_strerror(n), data->port);
        return NULL;
    }

    /* Bind to the first usable local address. */
    for (t = res; t; t = t->ai_next) {
        sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
        if (sockfd >= 0) {
            n = 1;
            setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n);
            if (!bind(sockfd, t->ai_addr, t->ai_addrlen))
                break;
            close(sockfd);
            sockfd = -1;
        }
    }
    freeaddrinfo(res);

    if (sockfd < 0) {
        fprintf(stderr, "%d:%s: Couldn't listen to port %d\n", pid,
                __func__, data->port);
        return NULL;
    }

    if (listen(sockfd, 1) < 0) {    /* previously unchecked */
        perror("server listen");
        fprintf(stderr, "%d:%s: listen() failed\n", pid, __func__);
        close(sockfd);
        return NULL;
    }

    connfd = accept(sockfd, NULL, NULL);
    if (connfd < 0) {
        perror("server accept");
        fprintf(stderr, "%d:%s: accept() failed\n", pid, __func__);
        close(sockfd);
        return NULL;
    }
    close(sockfd);          /* only one peer is ever served */

    ctx = pp_init_ctx(data->ib_dev, data);
    if (!ctx) {
        close(connfd);      /* was leaked on this path */
        return NULL;
    }
    data->sockfd = connfd;
    return ctx;
}
static int pp_server_exch_dest(struct pp_data *data)
{
if (data->rem_dest != NULL)
free(data->rem_dest);
data->rem_dest = malloc(sizeof *data->rem_dest);
if (!data->rem_dest)
return -1;
if (pp_read_keys(data->sockfd, &data->my_dest, data->rem_dest))
return -1;
return pp_write_keys(data->sockfd, &data->my_dest);
}
static struct pingpong_context *pp_init_ctx(void *ptr, struct pp_data *data)
{
struct pingpong_context *ctx;
struct ibv_device *ib_dev;
ctx = malloc(sizeof *ctx);
if (!ctx)
return NULL;
ctx->size = 4096 + 12 * 40960;
ctx->tx_depth = data->tx_depth;
ctx->buf = memalign(page_size, ctx->size );
if (!ctx->buf) {
fprintf(stderr, "%d:%s: Couldn't allocate work buf.\n",
pid, __func__);
return NULL;
}
memset(ctx->buf, 0, ctx->size);
ctx->post_buf = (char *)ctx->buf;
ctx->poll_buf = (char *)ctx->buf + 4;
ib_dev = (struct ibv_device *)ptr;
ctx->context = ibv_open_device(ib_dev);
if (!ctx->context) {
fprintf(stderr, "%d:%s: Couldn't get context for %s\n",
pid, __func__, ibv_get_device_name(ib_dev));
return NULL;
}
ctx->pd = ibv_alloc_pd(ctx->context);
if (!ctx->pd) {
fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__);
return NULL;
}
/* We dont really want IBV_ACCESS_LOCAL_WRITE, but IB spec says:
* The Consumer is not allowed to assign Remote Write or Remote Atomic
to
* a Memory Region that has not been assigned Local Write. */
ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, ctx->size,
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
if (!ctx->mr) {
fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__);
return NULL;
}
ctx->rcq = ibv_create_cq(ctx->context, 1, NULL, NULL, 0);
if (!ctx->rcq) {
fprintf(stderr, "%d:%s: Couldn't create recv CQ\n", pid,
__func__);
return NULL;
}
ctx->scq = ibv_create_cq(ctx->context, ctx->tx_depth, ctx, NULL, 0);
if (!ctx->scq) {
fprintf(stderr, "%d:%s: Couldn't create send CQ\n", pid,
__func__);
return NULL;
}
struct ibv_qp_init_attr attr = {
.send_cq = ctx->scq,
.recv_cq = ctx->rcq,
.cap = {
.max_send_wr = ctx->tx_depth,
/* Work around: driver doesnt support
* recv_wr = 0 */
.max_recv_wr = 1,
.max_send_sge = 1,
.max_recv_sge = 1,
.max_inline_data = inline_size,
},
.qp_type = IBV_QPT_RC
};
ctx->qp = ibv_create_qp(ctx->pd, &attr);
if (!ctx->qp) {
fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__);
return NULL;
}
{
struct ibv_qp_attr attr;
attr.qp_state = IBV_QPS_INIT;
attr.pkey_index = 0;
attr.port_num = data->ib_port;
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE;
if (ibv_modify_qp(ctx->qp, &attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS)) {
fprintf(stderr, "%d:%s: Failed to modify QP to INIT\n",
pid, __func__);
return NULL;
}
}
return ctx;
}
static int pp_connect_ctx(struct pingpong_context *ctx, struct pp_data *data)
{
struct ibv_qp_attr attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = IBV_MTU_256,
.dest_qp_num = data->rem_dest->qpn,
.rq_psn = data->rem_dest->psn,
.max_dest_rd_atomic = 1,
.min_rnr_timer = 12,
.ah_attr.is_global = 0,
.ah_attr.dlid = data->rem_dest->lid,
.ah_attr.sl = 0,
.ah_attr.src_path_bits = 0,
.ah_attr.port_num = data->ib_port
};
if (ibv_modify_qp(ctx->qp, &attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER)) {
fprintf(stderr, "%s: Failed to modify QP to RTR\n", __func__);
return 1;
}
attr.qp_state = IBV_QPS_RTS;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
attr.sq_psn = data->my_dest.psn;
attr.max_rd_atomic = 1;
if (ibv_modify_qp(ctx->qp, &attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC)) {
fprintf(stderr, "%s: Failed to modify QP to RTS\n", __func__);
return 1;
}
return 0;
}
/*
 * Establish the RC connection: publish the local endpoint parameters,
 * exchange them with the peer over the TCP socket, move the QP to RTS,
 * perform a second exchange as a post-RTR handshake, then close the socket.
 *
 * Returns 0 on success and a non-zero value (1 or -1, depending on the
 * failing step) otherwise; the caller only tests for non-zero.
 */
static int pp_open_port(struct pingpong_context *ctx, struct pp_data *data )
{
    /* This literal was split across two source lines by mail wrapping,
     * which does not compile; it is rejoined here. */
    static const char addr_fmt[] =
        "%8s address: LID %#04x QPN %#06x PSN %#06x RKey %#08x VAddr %#016Lx\n";

    /* Create connection between client and server.
     * We do it by exchanging data over a TCP socket connection. */
    data->my_dest.lid = pp_get_local_lid(ctx, data->ib_port);
    data->my_dest.qpn = ctx->qp->qp_num;
    data->my_dest.psn = lrand48() & 0xffffff;
    if (!data->my_dest.lid) {
        fprintf(stderr, "Local lid 0x0 detected. Is an SM running?\n");
        return -1;
    }
    data->my_dest.rkey = ctx->mr->rkey;
    data->my_dest.vaddr = (uintptr_t)ctx->buf;
    printf(addr_fmt, "local", data->my_dest.lid, data->my_dest.qpn,
           data->my_dest.psn,
           data->my_dest.rkey, data->my_dest.vaddr);

    /* The client writes first and the server reads first, so the two
     * sides never block on each other. */
    if (data->servername) {
        if (pp_client_exch_dest(data))
            return 1;
    } else {
        if (pp_server_exch_dest(data))
            return 1;
    }
    printf(addr_fmt, "remote", data->rem_dest->lid, data->rem_dest->qpn,
           data->rem_dest->psn, data->rem_dest->rkey,
           data->rem_dest->vaddr);

    if (pp_connect_ctx(ctx, data))
        return 1;

    /* An additional handshake is required *after* moving qp to RTR.
       Arbitrarily reuse exch_dest for this purpose. */
    if (data->servername) {
        if (pp_client_exch_dest(data))
            return -1;
    } else {
        if (pp_server_exch_dest(data))
            return -1;
    }

    if (write(data->sockfd, "done", sizeof "done") != sizeof "done"){
        perror("write");
        fprintf(stderr, "Couldn't write to socket\n");
        return 1;
    }
    close(data->sockfd);
    return 0;
}
/*
 * Print the command-line help to stdout.
 *
 * argv0: program name shown in the synopsis lines.
 *
 * Several of these string literals had been split across source lines by
 * mail wrapping, which does not compile; each is rejoined on one line.
 */
static void usage(const char *argv0)
{
    printf("Usage:\n");
    printf(" %s start a server and wait for connection\n",
           argv0);
    printf(" %s <host> connect to server at <host>\n", argv0);
    printf("\n");
    printf("Options:\n");
    printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n");
    printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n");
    printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n");
    printf(" -s, --size=<size> size of message to exchange (default 1)\n");
    printf(" -t, --tx-depth=<dep> size of tx queue (default 50)\n");
    printf(" -n, --iters=<iters> number of exchanges (at least 2, default 1000)\n");
    printf(" -I, --inline_size=<size> max size of message to be sent in inline mode (default 400)\n");
    printf(" -C, --report-cycles report times in cpu cycle units (default microseconds)\n");
    printf(" -H, --report-histogram print out all results (default print summary only)\n");
    printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n");
    printf(" -c, --cma Use the RDMA CMA to setup the RDMA connection\n");
}
/* -------------------------------------------------------------------------*/
/* Code added to show problem */
/* Start */
/* -------------------------------------------------------------------------*/
void
post_send_and_poll(struct ibv_qp *qp,
struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr,
struct ibv_cq *cq)
{
int ne;
struct ibv_wc wc;
if (ibv_post_send(qp, wr, bad_wr)) {
fprintf(stderr, "Couldn't post send:\n");
assert(0);
}
do {
ne = ibv_poll_cq(cq, 1, &wc);
} while (ne == 0);
if (ne < 0) {
fprintf(stderr, "poll CQ failed %d\n", ne);
assert(0);
}
if (wc.status != IBV_WC_SUCCESS) {
fprintf(stderr, "Failed status %d: wr_id %d\n",
wc.status, (int) wc.wr_id);
assert(0);
}
}
/* -------------------------------------------------------------------------*/
/* Registration cache used by send_buffer(): pin_addr[i] is a buffer address
 * and pin_mr[i] the MR registered for it. An entry with a NULL address is
 * free. Entries are never removed by the code visible in this file. */
static char* pin_addr[12];
static struct ibv_mr* pin_mr[12];
/* Non-zero once send_buffer() has cleared pin_addr[] on its first call. */
static int pinlist_init_done = 0;
/* -------------------------------------------------------------------------*/
/*
 * RDMA-write 40000 bytes from `buffer` into the peer's work buffer and wait
 * for the completion.
 *
 * ctx:    local verbs state (PD, QP, send CQ).
 * data:   connection data; rem_dest supplies the remote vaddr and rkey.
 * buffer: local source buffer.
 * tag:    buffer tag (11..13 in this program).
 * loop:   round number (0..2 in this program).
 *
 * The destination slot is 4096 + ((loop * 3) + tag - 11) * 40960 bytes into
 * the remote buffer, i.e. one private 40960-byte slot per (loop, tag) pair.
 *
 * A memory region is registered for `buffer` unless its address is already
 * present in the pin_addr[] cache, in which case the cached MR is reused.
 *
 * NOTE(review): the cache is keyed on the address alone and entries are
 * never deregistered or invalidated, while do_sender() frees and
 * re-allocates its buffers between rounds. A cache hit can therefore reuse
 * an MR that was registered for an earlier allocation at the same address.
 * This looks like the behaviour the reproducer is meant to exercise, so it
 * is kept as is -- confirm before changing.
 */
void
send_buffer(struct pingpong_context* ctx, struct pp_data data, char* buffer,
            int tag, int loop)
{
    struct ibv_sge list;
    struct ibv_send_wr wr;
    struct ibv_send_wr *bad_wr = NULL;
    struct ibv_mr* mr = NULL;
    int i;
    int found = 0;
    int cached = 0;
    int displ = (((loop *3) + tag - 11) * 40960) +4096;

    if (pinlist_init_done == 0) {
        for (i = 0; i < 12; i++) {
            pin_addr[i] = NULL;
        }
        pinlist_init_done = 1;
    }

    /* Look for an existing registration of this address. */
    for (i = 0; i < 12; i++) {
        if (pin_addr[i] == buffer) {
            mr = pin_mr[i];
            found = 1;
            cached = 1;
            break;
        }
    }

    if (found == 0) {
        printf(" %p Must be pinned\n", buffer);
        mr = ibv_reg_mr(ctx->pd, buffer, 40000,
                        IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
        if (mr == NULL) {
            fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__);
            assert(0);
        }
        /* Remember the registration in the first free cache slot. */
        for (i = 0; i < 12; i++) {
            if (pin_addr[i] == 0) {
                pin_addr[i] =buffer;
                pin_mr[i] = mr;
                cached = 1;
                break;
            }
        }
    } else {
        printf(" %p Already pinned\n", buffer);
    }

    /* Zero both descriptors first: the original left every field it did
     * not assign (e.g. imm_data) indeterminate when posting. */
    memset(&list, 0, sizeof list);
    list.addr = (uintptr_t) buffer;
    list.length = 40000;
    list.lkey = mr->lkey;

    memset(&wr, 0, sizeof wr);
    wr.wr.rdma.remote_addr = data.rem_dest->vaddr + displ;
    wr.wr.rdma.rkey = data.rem_dest->rkey;
    wr.wr_id = PINGPONG_RDMA_WRID;
    wr.sg_list = &list;
    wr.num_sge = 1;
    wr.opcode = IBV_WR_RDMA_WRITE;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.next = NULL;

    post_send_and_poll(ctx->qp, &wr, &bad_wr, ctx->scq);

    /* If the cache table was full the MR is owned by this call only;
     * release it now that the write has completed instead of leaking it. */
    if (!cached)
        ibv_dereg_mr(mr);
}
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/*
 * Fill the first 10000 ints of `b` with a recognisable pattern:
 * word i is (l << 24) | (tag << 16) | i.
 *
 * b:   destination; accessed as an int array, so it must be suitably
 *      aligned and at least 10000 * sizeof(int) bytes long.
 * tag: buffer tag stored in bits 16..23.
 * l:   round number stored from bit 24 upwards.
 */
void
init(char* b, int tag, int l)
{
    int *words = (int *) b;
    const int prefix = l << 24 | tag << 16;
    int idx = 0;

    while (idx < 10000) {
        words[idx] = prefix | idx;
        idx++;
    }
}
/* -------------------------------------------------------------------------*/
void
do_sender(struct pingpong_context* ctx, struct pp_data data)
{
int i;
char* rbuf1 = 0;
char* rbuf2 = 0;
char* rbuf3 = 0;
char* rbuf4 = 0;
char* sbuf1 = 0;
char* sbuf2 = 0;
char* sbuf3 = 0;
char* sbuf4 = 0;
i = 0;
rbuf1 = (char*)valloc(88192);
sbuf1 = (char*)valloc(48192);
rbuf2 = (char*)valloc(88192);
sbuf2 = (char*)valloc(48192);
rbuf3 = (char*)valloc(88192);
sbuf3 = (char*)valloc(48192);
rbuf4 = (char*)valloc(88192);
sbuf4 = (char*)valloc(48192);
init(sbuf1,11,i);
init(sbuf2,12,i);
init(sbuf3,13,i);
init(sbuf4,14,i);
printf("sbuf %p %p %p %p\n", sbuf1, sbuf2, sbuf3, sbuf4);
send_buffer(ctx, data, sbuf1, 11, i);
send_buffer(ctx, data, sbuf2, 12, i);
send_buffer(ctx, data, sbuf3, 13, i);
free(sbuf1);
free(sbuf2);
free(sbuf3);
i = 1;
sbuf1 = (char*)valloc(48192);
sbuf2 = (char*)valloc(48192);
sbuf3 = (char*)valloc(48192);
init(sbuf1,11,i);
init(sbuf2,12,i);
init(sbuf3,13,i);
init(sbuf4,14,i);
printf("sbuf %p %p %p %p\n", sbuf1, sbuf2, sbuf3, sbuf4);
send_buffer(ctx, data, sbuf1, 11, i);
send_buffer(ctx, data, sbuf2, 12, i);
send_buffer(ctx, data, sbuf3, 13, i);
free(sbuf1);
free(sbuf2);
free(sbuf3);
i = 2;
sbuf1 = (char*)valloc(48192);
sbuf2 = (char*)valloc(48192);
sbuf3 = (char*)valloc(48192);
init(sbuf1,11,i);
init(sbuf2,12,i);
init(sbuf3,13,i);
init(sbuf4,14,i);
printf("sbuf %p %p %p %p\n", sbuf1, sbuf2, sbuf3, sbuf4);
send_buffer(ctx, data, sbuf1, 11, i);
send_buffer(ctx, data, sbuf2, 12, i);
send_buffer(ctx, data, sbuf3, 13, i);
free(sbuf1);
free(sbuf2);
free(sbuf3);
}
/* -------------------------------------------------------------------------*/
/*
 * Verify that the first 10000 ints of `b` hold the pattern written by
 * init(b, tag, l), i.e. word i == (l << 24) | (tag << 16) | i.
 *
 * The first 10 mismatching words are printed with the expected and actual
 * values decoded into (loop, tag, index); if any mismatch was seen a
 * summary line with the total count follows. Nothing is printed when the
 * buffer matches.
 *
 * Both format strings had been split across source lines by mail
 * wrapping, which does not compile; they are rejoined here.
 */
void
check(char* b, int tag, int l)
{
    int i;
    int count = 0;
    int* p = (int*) b;

    for (i = 0; i < 10000; i++) {
        int expected = l << 24 | tag << 16 | i;

        if (p[i] != expected) {
            /* Decode what actually arrived. */
            int act_loop = (p[i] >> 24) & 0xff;
            int act_tag = (p[i] >> 16) & 0xff;
            int act_idx = p[i] & 0xffff;

            if (count++ < 10) {
                printf("----- check buff loop: %d tag: %d buff[%d] exp: %x(%d %d %d) act: %x(%d %d %d)\n",
                       l, tag, i, expected, l, tag, i,
                       p[i], act_loop, act_tag, act_idx);
            }
        }
    }
    if (count != 0) {
        printf("----- check buff loop: %d tag: %d --- found %d errors (but just 10 printed)\n",
               l, tag, count);
    }
}
/* -------------------------------------------------------------------------*/
/*
 * Receiver side of the reproducer: validate the nine 40960-byte slots that
 * start 4096 bytes into the work buffer. Slots are ordered by round
 * (0..2) and, within a round, by tag (11..13).
 *
 * `data` is not used.
 */
void
do_receiver(struct pingpong_context* ctx, struct pp_data data)
{
    char *first_slot = (char *) ctx->buf + 4096;
    int round;
    int tag;

    for (round = 0; round < 3; round++) {
        for (tag = 11; tag <= 13; tag++) {
            int slot = round * 3 + (tag - 11);

            check(first_slot + slot * 40960, tag, round);
        }
    }
}
/* -------------------------------------------------------------------------*/
/* End */
/* Code added to show problem */
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/*
 * Entry point of the reproducer.
 *
 * Parses the options, sets up the TCP side channel and the RC queue pair,
 * then runs one of two roles:
 *   - started without a <host> argument (the TCP server): attempts one huge
 *     malloc() that is asserted to fail, runs do_sender(), and finally
 *     RDMA-writes a single "data ready" byte to the peer;
 *   - started with a <host> argument (the TCP client): spins until that
 *     byte arrives, then runs do_receiver() to validate the data.
 *
 * Returns 0 on success and a small positive code identifying the failing
 * setup step otherwise.
 */
int main(int argc, char *argv[])
{
const char *ib_devname = NULL;
int iters = 1000;
struct report_options report = {};
struct pingpong_context *ctx;
struct ibv_qp *qp;
struct ibv_send_wr *wr;
volatile char *poll_buf;
volatile char *post_buf;
struct ibv_send_wr *bad_wr;
/* NOTE(review): scnt, rcnt and ccnt are only zeroed below and `report` is
 * only written; none of them is read again in this function. */
int scnt, rcnt, ccnt;
cycles_t *tstamp;
struct pp_data data = {
.port = 18515,
.ib_port = 1,
.size = 1,
.tx_depth = 50,
.use_cma = 0,
.servername = NULL,
.rem_dest = NULL,
.ib_dev = NULL,
.cm_channel = NULL,
.cm_id = NULL
};
/* Parameter parsing. */
while (1) {
int c;
static struct option long_options[] = {
{ .name = "port", .has_arg = 1, .val = 'p' },
{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
{ .name = "ib-port", .has_arg = 1, .val = 'i' },
{ .name = "size", .has_arg = 1, .val = 's' },
{ .name = "iters", .has_arg = 1, .val = 'n' },
{ .name = "tx-depth", .has_arg = 1, .val = 't' },
{ .name = "inline_size", .has_arg = 1, .val = 'I' },
{ .name = "report-cycles", .has_arg = 0, .val = 'C' },
{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
{ .name = "cma", .has_arg = 0, .val = 'c' },
{ 0 }
};
c = getopt_long(argc, argv, "p:d:i:s:n:t:I:CHUc", long_options,
NULL);
if (c == -1)
break;
switch (c) {
case 'p':
data.port = strtol(optarg, NULL, 0);
if (data.port < 0 || data.port > 65535) {
usage(argv[0]);
return 1;
}
break;
case 'd':
ib_devname = strdup(optarg);
break;
case 'i':
data.ib_port = strtol(optarg, NULL, 0);
if (data.ib_port < 0) {
usage(argv[0]);
return 2;
}
break;
case 's':
data.size = strtol(optarg, NULL, 0);
/* NOTE(review): data.size is unsigned, so this test only rejects 0. */
if (data.size < 1) { usage(argv[0]); return 3; }
break;
case 't':
data.tx_depth = strtol(optarg, NULL, 0);
if (data.tx_depth < 1) { usage(argv[0]); return
4; }
break;
case 'n':
iters = strtol(optarg, NULL, 0);
if (iters < 2) {
usage(argv[0]);
return 5;
}
break;
case 'I':
inline_size = strtol(optarg, NULL, 0);
break;
case 'C':
report.cycles = 1;
break;
case 'H':
report.histogram = 1;
break;
case 'U':
report.unsorted = 1;
break;
case 'c':
data.use_cma = 1;
break;
default:
usage(argv[0]);
return 5;
}
}
/* Exactly one positional argument selects client mode; more is an error. */
if (optind == argc - 1)
data.servername = strdup(argv[optind]);
else if (optind < argc) {
usage(argv[0]);
return 6;
}
/*
* Done with parameter parsing. Perform setup.
*/
pid = getpid();
srand48(pid * time(NULL));
page_size = sysconf(_SC_PAGESIZE);
data.ib_dev = pp_find_dev(ib_devname);
if (!data.ib_dev)
return 7;
if (data.servername) {
ctx = pp_client_connect(&data);
if (!ctx)
return 8;
} else {
ctx = pp_server_connect(&data);
if (!ctx)
return 8;
}
if (pp_open_port(ctx, &data))
return 9;
/* Prepare the one-byte "data ready" write: source is byte 0 of the local
 * work buffer, destination is byte 4 of the peer's work buffer (the
 * address the peer polls through poll_buf). */
wr = &ctx->wr;
ctx->list.addr = (uintptr_t) ctx->buf;
ctx->list.length = 1;
ctx->list.lkey = ctx->mr->lkey;
wr->wr.rdma.remote_addr = data.rem_dest->vaddr + 4;
wr->wr.rdma.rkey = data.rem_dest->rkey;
ctx->wr.wr_id = PINGPONG_RDMA_WRID;
ctx->wr.sg_list = &ctx->list;
ctx->wr.num_sge = 1;
ctx->wr.opcode = IBV_WR_RDMA_WRITE;
/* The comparison uses the work buffer size (ctx->size), not the one-byte
 * payload; with the default inline_size of 400 the first branch is taken. */
if (ctx->size > inline_size || ctx->size == 0) {
ctx->wr.send_flags = IBV_SEND_SIGNALED;
} else {
ctx->wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
}
ctx->wr.next = NULL;
scnt = 0;
rcnt = 0;
ccnt = 0;
poll_buf = ctx->poll_buf;
post_buf = ctx->post_buf;
qp = ctx->qp;
/* NOTE(review): tstamp is allocated but never used or freed in this
 * function. It is left in place because it is one of the allocations
 * made before the reproducer section -- confirm before removing. */
tstamp = malloc(iters * sizeof *tstamp);
if (!tstamp) {
perror("malloc");
return 10;
}
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* -------------------------------------------------------------------------*/
/* Code added to show problem */
/* Start */
/* -------------------------------------------------------------------------*/
if (data.servername == 0) {
/* Sender role. This allocation is expected to fail; the assert below
 * enforces that. */
char* dummy = (char*) malloc(39999000000);
assert(dummy == NULL);
do_sender(ctx, data);
*post_buf = (char)123; /* Send "data-ready" to receiver */
post_send_and_poll(qp, wr, &bad_wr, ctx->scq);
} else {
/* Receiver role: busy-wait on the byte the sender writes last. */
while (*poll_buf == 0) { /* Wait for data-ready from sender */
}
do_receiver(ctx, data);
}
/* -------------------------------------------------------------------------*/
/* End */
/* Code added to show problem */
/* -------------------------------------------------------------------------*/
return 0;
}
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general