From: Majd Dibbiny <[email protected]>
Memory Windows (MWs) are a method to raise the remote privileges of a memory
range within a Memory Region (MR).
A MW is allocated using ibv_alloc_mw, and for it to be useful it should be
bound to a MR using ibv_bind_mw.
The bind operation generates a new R_key with the new permissions for the MW.
The advantage of MWs is the lightweight generation of R_keys with changing
permissions.
MW type 1's R_key can be invalidated by binding the MW to a MR where the length
of the MW is zero.
A local MW type 2's R_key can be invalidated by sending a work request(WR),
where the immediate data contains the MW's R_key and the opcode is
IBV_WR_LOCAL_INV.
When done, the user can unbind and deallocate the MW using ibv_dealloc_mw.
Add the following verbs:
1. ibv_alloc_mw: Takes a Protection Domain (PD) and the type of MW, and returns
a MW.
2. ibv_bind_mw: Takes a Queue Pair(QP), a type 1 MW and bind information (MR,
address, length, access flags). Then it posts a bind request
to the given QP. Upon success, the MW's R_key is updated.
For type 2 MW, one should directly post bind WQE to the QP,
using ibv_post_send.
3. ibv_dealloc_mw: Unbinds and deallocates the MW.
Signed-off-by: Majd Dibbiny <[email protected]>
---
Makefile.am | 7 ++-
include/infiniband/driver.h | 6 +++
include/infiniband/kern-abi.h | 23 ++++++++++
include/infiniband/verbs.h | 75 ++++++++++++++++++++++++++++++---
man/ibv_alloc_mw.3 | 49 ++++++++++++++++++++++
man/ibv_bind_mw.3 | 91 +++++++++++++++++++++++++++++++++++++++++
man/ibv_inc_rkey.3 | 29 +++++++++++++
man/ibv_post_send.3 | 22 ++++++++++
src/cmd.c | 37 +++++++++++++++++
src/libibverbs.map | 3 +
10 files changed, 333 insertions(+), 9 deletions(-)
create mode 100644 man/ibv_alloc_mw.3
create mode 100644 man/ibv_bind_mw.3
create mode 100644 man/ibv_inc_rkey.3
diff --git a/Makefile.am b/Makefile.am
index a6767de..5ae1dab 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -58,7 +58,8 @@ man_MANS = man/ibv_asyncwatch.1 man/ibv_devices.1
man/ibv_devinfo.1 \
man/ibv_query_srq.3 man/ibv_rate_to_mult.3 man/ibv_reg_mr.3
\
man/ibv_req_notify_cq.3 man/ibv_resize_cq.3 man/ibv_rate_to_mbps.3 \
man/ibv_create_qp_ex.3 man/ibv_create_srq_ex.3 man/ibv_open_xrcd.3 \
- man/ibv_get_srq_num.3 man/ibv_open_qp.3
+ man/ibv_get_srq_num.3 man/ibv_open_qp.3 man/ibv_alloc_mw.3 \
+ man/ibv_bind_mw.3 man/ibv_inc_rkey.3
DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
debian/ibverbs-utils.install debian/libibverbs1.install \
@@ -94,6 +95,7 @@ install-data-hook:
$(RM) ibv_port_state_str.3 && \
$(RM) mbps_to_ibv_rate.3 && \
$(RM) ibv_close_xrcd.3 && \
+ $(RM) ibv_dealloc_mw.3 && \
$(LN_S) ibv_get_async_event.3 ibv_ack_async_event.3 && \
$(LN_S) ibv_get_cq_event.3 ibv_ack_cq_events.3 && \
$(LN_S) ibv_open_device.3 ibv_close_device.3 && \
@@ -111,4 +113,5 @@ install-data-hook:
$(LN_S) ibv_event_type_str.3 ibv_node_type_str.3 && \
$(LN_S) ibv_event_type_str.3 ibv_port_state_str.3 && \
$(LN_S) ibv_rate_to_mbps.3 mbps_to_ibv_rate.3 && \
- $(LN_S) ibv_open_xrcd.3 ibv_close_xrcd.3
+ $(LN_S) ibv_open_xrcd.3 ibv_close_xrcd.3 && \
+ $(LN_S) ibv_alloc_mw.3 ibv_dealloc_mw.3
diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
index 5cc092b..e3b7401 100644
--- a/include/infiniband/driver.h
+++ b/include/infiniband/driver.h
@@ -129,6 +129,12 @@ int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t
length,
size_t cmd_size,
struct ibv_reg_mr_resp *resp, size_t resp_size);
int ibv_cmd_dereg_mr(struct ibv_mr *mr);
+int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type,
+ struct ibv_mw *mw, struct ibv_alloc_mw *cmd,
+ size_t cmd_size,
+ struct ibv_alloc_mw_resp *resp, size_t resp_size);
+int ibv_cmd_dealloc_mw(struct ibv_mw *mw,
+ struct ibv_dealloc_mw *cmd, size_t cmd_size);
int ibv_cmd_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector, struct ibv_cq *cq,
diff --git a/include/infiniband/kern-abi.h b/include/infiniband/kern-abi.h
index 91b45d8..ceb2ca9 100644
--- a/include/infiniband/kern-abi.h
+++ b/include/infiniband/kern-abi.h
@@ -340,6 +340,29 @@ struct ibv_dereg_mr {
__u32 mr_handle;
};
+struct ibv_alloc_mw {
+ __u32 command;
+ __u16 in_words;
+ __u16 out_words;
+ __u64 response;
+ __u32 pd_handle;
+ __u8 mw_type;
+ __u8 reserved[3];
+};
+
+struct ibv_alloc_mw_resp {
+ __u32 mw_handle;
+ __u32 rkey;
+};
+
+struct ibv_dealloc_mw {
+ __u32 command;
+ __u16 in_words;
+ __u16 out_words;
+ __u32 mw_handle;
+ __u32 reserved;
+};
+
struct ibv_create_comp_channel {
__u32 command;
__u16 in_words;
diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
index cfa1156..dcee050 100644
--- a/include/infiniband/verbs.h
+++ b/include/infiniband/verbs.h
@@ -115,8 +115,11 @@ enum ibv_device_cap_flags {
IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12,
IBV_DEVICE_SRQ_RESIZE = 1 << 13,
IBV_DEVICE_N_NOTIFY_CQ = 1 << 14,
+ IBV_DEVICE_MEM_WINDOW = 1 << 17,
IBV_DEVICE_XRC = 1 << 20,
- IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+ IBV_DEVICE_MEM_WINDOW_TYPE_2A = 1 << 23,
+ IBV_DEVICE_MEM_WINDOW_TYPE_2B = 1 << 24,
+ IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29,
};
enum ibv_atomic_cap {
@@ -280,6 +283,7 @@ enum ibv_wc_opcode {
IBV_WC_COMP_SWAP,
IBV_WC_FETCH_ADD,
IBV_WC_BIND_MW,
+ IBV_WC_LOCAL_INV,
/*
* Set value of IBV_WC_RECV so consumers can test if a completion is a
* receive by testing (opcode & IBV_WC_RECV).
@@ -314,7 +318,15 @@ enum ibv_access_flags {
IBV_ACCESS_REMOTE_WRITE = (1<<1),
IBV_ACCESS_REMOTE_READ = (1<<2),
IBV_ACCESS_REMOTE_ATOMIC = (1<<3),
- IBV_ACCESS_MW_BIND = (1<<4)
+ IBV_ACCESS_MW_BIND = (1<<4),
+ IBV_ACCESS_ZERO_BASED = (1<<5)
+};
+
+struct ibv_mw_bind_info {
+ struct ibv_mr *mr;
+ uint64_t addr;
+ uint64_t length;
+ uint64_t mw_access_flags; /* use ibv_access_flags */
};
struct ibv_pd {
@@ -364,6 +376,8 @@ struct ibv_mw {
struct ibv_context *context;
struct ibv_pd *pd;
uint32_t rkey;
+ uint32_t handle;
+ enum ibv_mw_type type;
};
struct ibv_global_route {
@@ -620,7 +634,9 @@ enum ibv_wr_opcode {
IBV_WR_SEND_WITH_IMM,
IBV_WR_RDMA_READ,
IBV_WR_ATOMIC_CMP_AND_SWP,
- IBV_WR_ATOMIC_FETCH_AND_ADD
+ IBV_WR_ATOMIC_FETCH_AND_ADD,
+ IBV_WR_LOCAL_INV,
+ IBV_WR_BIND_MW
};
enum ibv_send_flags {
@@ -666,6 +682,11 @@ struct ibv_send_wr {
uint32_t remote_srqn;
} xrc;
} qp_type;
+ struct {
+ struct ibv_mw *mw;
+ uint32_t rkey;
+ struct ibv_mw_bind_info bind_info;
+ } bind_mw;
};
struct ibv_recv_wr {
@@ -677,11 +698,8 @@ struct ibv_recv_wr {
struct ibv_mw_bind {
uint64_t wr_id;
- struct ibv_mr *mr;
- void *addr;
- size_t length;
int send_flags;
- int mw_access_flags;
+ struct ibv_mw_bind_info bind_info;
};
struct ibv_srq {
@@ -1167,6 +1185,49 @@ int ibv_dereg_mr(struct ibv_mr *mr);
struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context);
/**
+ * ibv_alloc_mw - Allocate a memory window
+ */
+static inline struct ibv_mw *ibv_alloc_mw(struct ibv_pd *pd,
+ enum ibv_mw_type type)
+{
+ if (!pd->context->ops.alloc_mw) {
+ errno = ENOSYS;
+ return NULL;
+ }
+
+ struct ibv_mw *mw = pd->context->ops.alloc_mw(pd, type);
+
+ return mw;
+}
+
+/**
+ * ibv_dealloc_mw - Free a memory window
+ */
+static inline int ibv_dealloc_mw(struct ibv_mw *mw)
+{
+ return mw->context->ops.dealloc_mw(mw);
+}
+
+/**
+ * ibv_inc_rkey - increase the 8 lsb in the given rkey
+ */
+static inline uint32_t ibv_inc_rkey(uint32_t rkey)
+{
+ const uint32_t mask = 0x000000ff;
+ uint8_t newtag = (uint8_t) ((rkey + 1) & mask);
+ return (rkey & ~mask) | newtag;
+}
+
+/**
+ * ibv_bind_mw - Bind a memory window to a region
+ */
+static inline int ibv_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind)
+{
+ return mw->context->ops.bind_mw(qp, mw, mw_bind);
+}
+
+/**
* ibv_destroy_comp_channel - Destroy a completion event channel
*/
int ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
diff --git a/man/ibv_alloc_mw.3 b/man/ibv_alloc_mw.3
new file mode 100644
index 0000000..5da9d69
--- /dev/null
+++ b/man/ibv_alloc_mw.3
@@ -0,0 +1,49 @@
+.\" -*- nroff -*-
+.\"
+.TH IBV_ALLOC_MW 3 2015-01-27 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_alloc_mw, ibv_dealloc_mw \- allocate or deallocate a memory window (MW)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_mw *ibv_alloc_mw(struct ibv_pd " "*pd" ,
+.BI " enum ibv_mw_type " "type");
+.sp
+.BI "int ibv_dealloc_mw(struct ibv_mw " "*mw" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_alloc_mw()
+allocates a memory window (MW) associated with the protection domain
+.I pd\fR.
+The MW's type (1 or 2A/2B) is
+.I type\fR.
+.PP
+The MW is created not bound. For it to be useful, the MW must be bound, through either ibv_bind_mw (type 1) or a special WQE (type 2). Once bound, the memory window allows RDMA (remote) access to a subset of the MR to which it was bound, until invalidated/unbound/deallocated.
+.PP
+.B ibv_dealloc_mw()
+Unbinds and deallocates the MW
+.I mw\fR.
+.SH "RETURN VALUE"
+.B ibv_alloc_mw()
+returns a pointer to the allocated MW, or NULL if the request fails.
+The remote key (\fBR_Key\fR)
+field
+.B rkey
+is used by remote processes to perform Atomic and RDMA operations. This key will be changed during bind operations. The remote process places this
+.B rkey
+as the rkey field of struct ibv_send_wr passed to the ibv_post_send function.
+.PP
+.B ibv_dealloc_mw()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.B ibv_dereg_mr()
+fails if any memory window is still bound to this MR.
+.SH "SEE ALSO"
+.BR ibv_alloc_pd (3),
+.BR ibv_post_send (3),
+.BR ibv_bind_mw (3),
+.BR ibv_reg_mr (3)
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <[email protected]>
diff --git a/man/ibv_bind_mw.3 b/man/ibv_bind_mw.3
new file mode 100644
index 0000000..54e7bcb
--- /dev/null
+++ b/man/ibv_bind_mw.3
@@ -0,0 +1,91 @@
+.\" -*- nroff -*-
+.\"
+.TH IBV_BIND_MW 3 2015-01-27 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_bind_mw \- post a request to bind a type 1 memory window to a memory region
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_bind_mw(struct ibv_qp " "*qp" ", struct ibv_mw " "*mw" ",
+.BI " struct ibv_mw_bind " "*mw_bind" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_bind_mw()
+posts to the queue pair
+.I qp
+a request to bind the memory window
+.I mw
+according to the details in
+.I mw_bind\fR.
+.PP
+The argument
+.I mw_bind
+is an ibv_mw_bind struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_mw_bind {
+.in +8
+uint64_t wr_id; /* User defined WR ID */
+int send_flags; /* Use ibv_send_flags */
+struct ibv_mw_bind_info bind_info; /* MW bind information */
+.in -8
+}
+.fi
+.PP
+.nf
+struct ibv_mw_bind_info {
+.in +8
+struct ibv_mr *mr; /* The MR to bind the MW to */
+uint64_t addr; /* The address the MW should start at */
+uint64_t length; /* The length (in bytes) the MW should span */
+uint64_t mw_access_flags; /* Access flags to the MW. Use ibv_access_flags */
+.in -8
+};
+.fi
+.PP
+The QP Transport Service Type must be either UC or RC for bind operations.
+.PP
+The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags:
+.PP
+.TP
+.B IBV_SEND_FENCE \fR Set the fence indicator. Valid only for QPs with Transport Service Type \fBIBV_QPT_RC
+.TP
+.B IBV_SEND_SIGNALED \fR Set the completion notification indicator. Relevant only if QP was created with sq_sig_all=0
+.TP
+.B IBV_SEND_SOLICITED \fR Set the solicited event indicator. Valid only for Send and RDMA Write with immediate
+.PP
+The mw_access_flags define the allowed access to the MW after the bind
+completes successfully. It is either 0 or the bitwise \s-1OR\s0 of one
+or more of the following flags:
+.TP
+.B IBV_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access. Requires local write access to the MR.
+.TP
+.B IBV_ACCESS_REMOTE_READ\fR Enable Remote Read Access
+.TP
+.B IBV_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported). Requires local write access to the MR.
+.TP
+.B IBV_ACCESS_ZERO_BASED\fR If set, the address given in post send is offset from the MW's start address.
+.SH "RETURN VALUE"
+.B ibv_bind_mw()
+returns 0 on success, or the value of errno on failure (which
+indicates the failure reason). In case of a success, the R_Key of the
+memory window after the bind is returned in the mw->rkey field.
+.SH "NOTES"
+The bind does not complete when the function returns - it is merely
+posted to the QP. The user should keep a copy of the old R_Key, and
+fix the mw structure if the subsequent CQE for the bind operation
+indicates a failure. The user may safely send the R_Key using a send
+request on the same QP, but must not transfer it to the remote in any
+other manner before reading a successful CQE.
+.PP
+Note that for type 2 MW, one should directly post bind WQE to the QP,
+using ibv_post_send.
+.SH "SEE ALSO"
+.BR ibv_alloc_mw (3),
+.BR ibv_post_send (3),
+.BR ibv_poll_cq (3),
+.BR ibv_reg_mr (3)
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <[email protected]>
diff --git a/man/ibv_inc_rkey.3 b/man/ibv_inc_rkey.3
new file mode 100644
index 0000000..9864179
--- /dev/null
+++ b/man/ibv_inc_rkey.3
@@ -0,0 +1,29 @@
+.\" -*- nroff -*-
+.\"
+.TH IBV_INC_RKEY 3 2015-01-29 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+.nf
+ibv_inc_rkey \- creates a new rkey from the given one
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "uint32_t ibv_inc_rkey(uint32_t " "rkey" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_inc_rkey()
+increments the 8 least significant bits of
+.I rkey
+and returns the new value.
+.PP
+.SH "RETURN VALUE"
+.B ibv_inc_rkey()
+returns the new rkey.
+.SH "NOTES"
+.PP
+A use case for this verb can be to create a new rkey from a Memory window's rkey
+when binding it to a Memory region.
+.SH "SEE ALSO"
+.SH "AUTHORS"
+.TP
+Majd Dibbiny <[email protected]>
diff --git a/man/ibv_post_send.3 b/man/ibv_post_send.3
index 9571575..0d599ad 100644
--- a/man/ibv_post_send.3
+++ b/man/ibv_post_send.3
@@ -69,6 +69,24 @@ uint32_t remote_srqn; /* Number of the remote SRQ
*/
} xrc;
.in -8
} qp_type;
+struct {
+.in +8
+struct ibv_mw *mw; /* Memory window (MW) of type 2 to bind */
+uint32_t rkey; /* The desired new rkey of the MW */
+struct ibv_mw_bind_info bind_info; /* MW additional bind information */
+.in -8
+} bind_mw;
+.in -8
+};
+.fi
+.sp
+.nf
+struct ibv_mw_bind_info {
+.in +8
+struct ibv_mr *mr; /* The Memory region (MR) to bind the MW to */
+uint64_t addr; /* The address the MW should start at */
+uint64_t length; /* The length (in bytes) the MW should span */
+uint64_t mw_access_flags; /* Access flags to the MW. Use ibv_access_flags */
.in -8
};
.fi
@@ -95,6 +113,8 @@ IBV_WR_RDMA_WRITE_WITH_IMM | | X | X
IBV_WR_RDMA_READ | | | X
IBV_WR_ATOMIC_CMP_AND_SWP | | | X
IBV_WR_ATOMIC_FETCH_AND_ADD | | | X
+IBV_WR_LOCAL_INV | | X | X
+IBV_WR_BIND_MW | | X | X
.fi
.PP
The attribute send_flags describes the properties of the \s-1WR\s0. It is
either 0 or the bitwise \s-1OR\s0 of one or more of the following flags:
@@ -131,3 +151,5 @@ after the call returns.
.SH "AUTHORS"
.TP
Dotan Barak <[email protected]>
+.TP
+Majd Dibbiny <[email protected]>
diff --git a/src/cmd.c b/src/cmd.c
index 45ea06f..4230d0f 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -280,6 +280,43 @@ int ibv_cmd_dereg_mr(struct ibv_mr *mr)
return 0;
}
+int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type,
+ struct ibv_mw *mw, struct ibv_alloc_mw *cmd,
+ size_t cmd_size,
+ struct ibv_alloc_mw_resp *resp, size_t resp_size)
+{
+ IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_MW, resp, resp_size);
+ cmd->pd_handle = pd->handle;
+ cmd->mw_type = type;
+ memset(cmd->reserved, 0, sizeof(cmd->reserved));
+
+ if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size)
+ return errno;
+
+ VALGRIND_MAKE_MEM_DEFINED(resp, resp_size);
+
+ mw->context = pd->context;
+ mw->pd = pd;
+ mw->rkey = resp->rkey;
+ mw->handle = resp->mw_handle;
+ mw->type = type;
+
+ return 0;
+}
+
+int ibv_cmd_dealloc_mw(struct ibv_mw *mw,
+ struct ibv_dealloc_mw *cmd, size_t cmd_size)
+{
+ IBV_INIT_CMD(cmd, cmd_size, DEALLOC_MW);
+ cmd->mw_handle = mw->handle;
+ cmd->reserved = 0;
+
+ if (write(mw->context->cmd_fd, cmd, cmd_size) != cmd_size)
+ return errno;
+
+ return 0;
+}
+
int ibv_cmd_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector, struct ibv_cq *cq,
diff --git a/src/libibverbs.map b/src/libibverbs.map
index 30212f3..bbb2259 100644
--- a/src/libibverbs.map
+++ b/src/libibverbs.map
@@ -100,6 +100,9 @@ IBVERBS_1.1 {
ibv_event_type_str;
ibv_wc_status_str;
+ ibv_cmd_alloc_mw;
+ ibv_cmd_dealloc_mw;
+
ibv_rate_to_mbps;
mbps_to_ibv_rate;
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html