Re: [PATCH for-next V1 2/3] IB/core: RoCE GID management separate cleanup and release

2015-08-04 Thread Matan Barak
On Tue, Aug 4, 2015 at 6:10 AM, Jason Gunthorpe
jguntho...@obsidianresearch.com wrote:
 On Mon, Aug 03, 2015 at 04:09:01PM +0300, Matan Barak wrote:

 The release function is called after the device was put.
 Although vendor drivers aren't expected to use IB cache in their
 removal process, we postpone freeing the cache in order to avoid
 possible use-after-free errors.

 It isn't so much that they are not expected, it is that they may not
 have a choice. A driver cannot tear down things like tasklets and work
 queues until after removal finishes, and any of those async things
 could call into the core. That is why a driver audit would be so hard..


Correct, I'll change this comment to:
The release function is called after the device was put.
This is in order to avoid use-after-free errors if the vendor
driver's teardown code uses IB cache.


 @@ -902,9 +925,7 @@ int ib_cache_setup_one(struct ib_device *device)
 (rdma_end_port(device) -
  rdma_start_port(device) + 1),
 GFP_KERNEL);
 - err = gid_table_setup_one(device);
 -
 - if (!device->cache.pkey_cache || !device->cache.gid_cache ||
 + if (!device->cache.pkey_cache ||
   !device->cache.lmc_cache) {
   printk(KERN_WARNING "Couldn't allocate cache "
  "for %s\n", device->name);
 @@ -912,6 +933,10 @@ int ib_cache_setup_one(struct ib_device *device)
   goto err;
   }

 + err = gid_table_setup_one(device);
 + if (err)
 + goto err;
 +
   for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) 
 {
   device->cache.pkey_cache[p] = NULL;
   ib_cache_update(device, p + rdma_start_port(device));
 @@ -929,29 +954,46 @@ err_cache:
   for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
   kfree(device->cache.pkey_cache[p]);

 + gid_table_cleanup_one(device);
 + gid_table_release_one(device);
  err:
   kfree(device->cache.pkey_cache);
 - gid_table_cleanup_one(device);
   kfree(device->cache.lmc_cache);

 This still seems to double kfree on error..

 Pick a scheme and use it consistently, either rely on release to be
 called on error via kref-put, or kfree and null, for all the kfress in
 release_one.


I'll switch to kref-put cleanup. That means:
gid_table_setup_one - should only call cleanup and not release in error-flow
ib_cache_setup_one - shouldn't free any allocated memory/release the GID cache
  but just cleanup the GID cache.
ib_cache_release_one - should check if the pkey_cache is NULL before freeing it.


 + ib_cache_cleanup_one(device);
   ib_device_unregister_sysfs(device);

 I didn't check closely, but I suspect the above order should be
 swapped, and the matching swap in register. sysfs can legitimately
 call into core code, but vice-versa shouldn't happen...


I didn't understand this comment. The cleanup code calls del_gid which tells the
vendor to delete this GID (and dev_put the ndevs). The kref-put (which
is called when
the device is unregistered) frees the memory. If we switch the order,
we would have
use-after-free scenario.

 Jason

Thanks for looking at this patch.

Matan

 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V7 1/6] IB/uverbs: Fix reference counting usage of event files

2015-08-04 Thread Yishai Hadas
Fix the reference counting usage to be handled in the event file
creation/destruction function, instead of being done by the caller.
This is done for both async/non-async event files.

Based on Jason Gunthorpe report at https://www.mail-archive.com/
linux-rdma@vger.kernel.org/msg24680.html:
The existing code for this is broken, in ib_uverbs_get_context all
the error paths between ib_uverbs_alloc_event_file and the
kref_get(file->ref) are wrong - this will result in fput() which will
call ib_uverbs_event_close, which will try to do kref_put and
ib_unregister_event_handler - which are no longer paired.

Signed-off-by: Yishai Hadas yish...@mellanox.com
Signed-off-by: Shachar Raindel rain...@mellanox.com
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |   11 +---
 drivers/infiniband/core/uverbs_main.c |   44 
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ba365b6..60e6e3d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -178,6 +178,7 @@ void idr_remove_uobj(struct idr *idp, struct ib_uobject 
*uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
 
 void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index bbb02ff..5720a92 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_file;
}
 
-   file->async_file = filp->private_data;
-
-   INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
- ib_uverbs_event_handler);
-   ret = ib_register_event_handler(&file->event_handler);
-   if (ret)
-   goto err_file;
-
-   kref_get(&file->async_file->ref);
-   kref_get(&file->ref);
file-ucontext = ucontext;
 
fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
 
 err_file:
+   ib_uverbs_free_async_event_file(file);
fput(filp);
 
 err_fd:
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index f6eef2d..c238eba 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -406,10 +406,9 @@ static int ib_uverbs_event_close(struct inode *inode, 
struct file *filp)
}
spin_unlock_irq(file-lock);
 
-   if (file->is_async) {
+   if (file->is_async)
	ib_unregister_event_handler(&file->uverbs_file->event_handler);
-   kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
-   }
+   kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
	kref_put(&file->ref, ib_uverbs_release_event_file);
 
return 0;
@@ -541,13 +540,20 @@ void ib_uverbs_event_handler(struct ib_event_handler 
*handler,
NULL, NULL);
 }
 
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+   kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+   file->async_file = NULL;
+}
+
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
int is_async)
 {
struct ib_uverbs_event_file *ev_file;
struct file *filp;
+   int ret;
 
-   ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+   ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
 
@@ -556,15 +562,41 @@ struct file *ib_uverbs_alloc_event_file(struct 
ib_uverbs_file *uverbs_file,
	INIT_LIST_HEAD(&ev_file->event_list);
	init_waitqueue_head(&ev_file->poll_wait);
	ev_file->uverbs_file = uverbs_file;
+   kref_get(&ev_file->uverbs_file->ref);
	ev_file->async_queue = NULL;
-   ev_file->is_async= is_async;
	ev_file->is_closed   = 0;

	filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
  ev_file, O_RDONLY);
	if (IS_ERR(filp))
-   kfree(ev_file);
+   goto err_put_refs;
+
+   if (is_async) {
+   WARN_ON(uverbs_file-async_file);
+   uverbs_file-async_file = ev_file;
+   kref_get(uverbs_file-async_file-ref);
+   INIT_IB_EVENT_HANDLER(uverbs_file-event_handler,
+ uverbs_file-device-ib_dev,
+ ib_uverbs_event_handler);
+   ret = ib_register_event_handler(uverbs_file-event_handler);
+  

[PATCH for-next V7 3/6] IB/uverbs: Explicitly pass ib_dev to uverbs commands

2015-08-04 Thread Yishai Hadas
Done in preparation for deploying RCU for the device removal
flow. Allows isolating the RCU handling to the uverb_main layer and
keeping the uverbs_cmd code as is.

Signed-off-by: Yishai Hadas yish...@mellanox.com
Signed-off-by: Shachar Raindel rain...@mellanox.com
---
 drivers/infiniband/core/uverbs.h  |3 +
 drivers/infiniband/core/uverbs_cmd.c  |  103 ++---
 drivers/infiniband/core/uverbs_main.c |   21 +--
 3 files changed, 88 insertions(+), 39 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 92ec765..ea52db1 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -178,6 +178,7 @@ extern struct idr ib_uverbs_rule_idr;
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+   struct ib_device *ib_dev,
int is_async);
 void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
@@ -214,6 +215,7 @@ struct ib_uverbs_flow_spec {
 
 #define IB_UVERBS_DECLARE_CMD(name)\
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,   \
+struct ib_device *ib_dev,  \
 const char __user *buf, int in_len,\
 int out_len)
 
@@ -255,6 +257,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 
 #define IB_UVERBS_DECLARE_EX_CMD(name) \
int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\
+   struct ib_device *ib_dev,   \
struct ib_udata *ucore, \
struct ib_udata *uhw)
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 5720a92..29443c0 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
 }
 
 ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
  const char __user *buf,
  int in_len, int out_len)
 {
struct ib_uverbs_get_context  cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata   udata;
-   struct ib_device *ibdev = file-device-ib_dev;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
 #endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
   (unsigned long) cmd.response + sizeof resp,
   in_len - sizeof cmd, out_len - sizeof resp);
 
-   ucontext = ibdev-alloc_ucontext(ibdev, udata);
+   ucontext = ib_dev-alloc_ucontext(ib_dev, udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto err;
}
 
-   ucontext-device = ibdev;
+   ucontext-device = ib_dev;
INIT_LIST_HEAD(ucontext-pd_list);
INIT_LIST_HEAD(ucontext-mr_list);
INIT_LIST_HEAD(ucontext-mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext-odp_mrs_count = 0;
INIT_LIST_HEAD(ucontext-no_private_counters);
 
-   ret = ib_query_device(ibdev, dev_attr);
+   ret = ib_query_device(ib_dev, dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags  IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_free;
resp.async_fd = ret;
 
-   filp = ib_uverbs_alloc_event_file(file, 1);
+   filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
@@ -384,7 +384,7 @@ err_fd:
 
 err_free:
put_pid(ucontext-tgid);
-   ibdev-dealloc_ucontext(ucontext);
+   ib_dev-dealloc_ucontext(ucontext);
 
 err:
mutex_unlock(file-mutex);
@@ -392,11 +392,12 @@ err:
 }
 
 static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
  struct ib_uverbs_query_device_resp *resp,
  struct ib_device_attr *attr)
 {
resp-fw_ver= attr-fw_ver;
-   resp-node_guid = file-device-ib_dev-node_guid;
+   resp-node_guid = ib_dev-node_guid;
resp-sys_image_guid= attr-sys_image_guid;
resp-max_mr_size   = attr-max_mr_size;
resp-page_size_cap = attr-page_size_cap;
@@ -434,10 +435,11 @@ static void 

[PATCH for-next V7 0/6] HW Device hot-removal support

2015-08-04 Thread Yishai Hadas
Currently, if there is any user space application using an IB device,
it is impossible to unload the HW device driver for this device.

Similarly, if the device is hot-unplugged or reset, the device driver
hardware removal flow blocks until all user contexts are destroyed.

This patchset removes the above limitations from both uverbs and ucma.

The IB-core and uverbs layers are still required to remain loaded as
long as there are user applications using the verbs API. However, the
hardware device drivers are not blocked any more by the user space
activity.

To support this, the hardware device needs to expose a new kernel API
named 'disassociate_ucontext'. The device driver is given a ucontext
to detach from, and it should block this user context from any future
hardware access. In the IB-core level, we use this interface to
deactivate all ucontext that address a specific device when handling a
remove_one callback for it.

In the RDMA-CM layer, a similar change is needed in the ucma module,
to prevent blocking of the remove_one operation. This allows for
hot-removal of devices with RDMA-CM users in the user space.

The first three patches are preparation for this series.
The first patch fixes a reference counting issue pointed by Jason Gunthorpe.
The second patch fixes a race condition issue pointed by Jason Gunthorpe.
The third patch is a preparation step for deploying RCU for the device
removal flow.

The fourth patch introduces the new API between the HW device driver and
the IB core. For devices which implement the functionality, IB core
will use it in remove_one, disassociating any active ucontext from the
hardware device. Other drivers that didn't implement it will behave as
today, remove_one will block until all ucontexts referring the device
are destroyed before returning.

The fifth patch provides implementation of this API for the mlx4
driver.

The last patch extends ucma to avoid blocking remove_one operation in
the cma module. When such device removal event is received, ucma is
turning all user contexts to zombie contexts. This is done by
releasing all underlying resources and preventing any further user
operations on the context.

Changes from V6:
Added an extra patch #2 to solve a race that was introduced 5 years ago and was 
reported by Jason.
patch #4 (previously #3): Adapted to the fix of patch #2.

Changes from V5:
Addressed Jason's comments for below patches:
patch #1: Improve kref usage.
patch #3: Use 2 different krefs for complete and memory, improve some comments.

Changes from V4:
patch #1,#3 - addressed Jason's comments.
patch #2, #4 - rebased upon last stuff.

Changes from V3:
Add 2 patches as a preparation for this series, details above.
patch #3: Change the locking schema based on Jason's comments.

Changes from V2:
patch #1: Rebase over ODP patches.

Changes from V1:
patch #1: Use uverbs flags instead of disassociate support, drop 
fatal_event_raised flag.
patch #3: Add support in ucma for handling device removal.

Changes from V0:
patch #1: ib_uverbs_close, reduced mutex scope to enable tasks run in parallel.
Yishai Hadas (6):
  IB/uverbs: Fix reference counting usage of event files
  IB/uverbs: Fix race between ib_uverbs_open and remove_one
  IB/uverbs: Explicitly pass ib_dev to uverbs commands
  IB/uverbs: Enable device removal when there are active user space
applications
  IB/mlx4_ib: Disassociate support
  IB/ucma: HW Device hot-removal support

 drivers/infiniband/core/ucma.c|  130 +-
 drivers/infiniband/core/uverbs.h  |   16 +-
 drivers/infiniband/core/uverbs_cmd.c  |  114 ++
 drivers/infiniband/core/uverbs_main.c |  442 +++--
 drivers/infiniband/hw/mlx4/main.c |  139 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |   13 +
 include/rdma/ib_verbs.h   |1 +
 7 files changed, 718 insertions(+), 137 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V7 2/6] IB/uverbs: Fix race between ib_uverbs_open and remove_one

2015-08-04 Thread Yishai Hadas
Fixes: 2a72f212263701b927559f6850446421d5906c41 (IB/uverbs: Remove dev_table)

Before this commit there was a device look-up table that was protected
by a spin_lock used by ib_uverbs_open and by ib_uverbs_remove_one. When
it was dropped and container_of was used instead, it enabled the race
with remove_one as dev might be freed just after:
dev = container_of(inode-i_cdev, struct ib_uverbs_device, cdev) but
before the kref_get.

In addition, this buggy patch added some dead code as
container_of(x,y,z) can never be NULL and so dev can never be NULL.
As a result the comment above ib_uverbs_open saying "the open method
will either immediately run -ENXIO" is wrong as it can never happen.

The solution follows Jason Gunthorpe suggestion from below URL:
https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg25692.html

cdev will hold a kref on the parent (the containing structure,
ib_uverbs_device) and only when that kref is released it is
guaranteed that open will never be called again.

In addition, fixes the active count scheme to use an atomic
not a kref to prevent WARN_ON as pointed by above comment
from Jason.

Signed-off-by: Yishai Hadas yish...@mellanox.com
Signed-off-by: Shachar Raindel rain...@mellanox.com
---
 drivers/infiniband/core/uverbs.h  |3 +-
 drivers/infiniband/core/uverbs_main.c |   43 +++--
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 60e6e3d..92ec765 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -85,7 +85,7 @@
  */
 
 struct ib_uverbs_device {
-   struct kref ref;
+   atomic_trefcount;
int num_comp_vectors;
struct completion   comp;
struct device  *dev;
@@ -94,6 +94,7 @@ struct ib_uverbs_device {
struct cdev cdev;
struct rb_root  xrcd_tree;
struct mutexxrcd_tree_mutex;
+   struct kobject  kobj;
 };
 
 struct ib_uverbs_event_file {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index c238eba..9f39978 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -130,14 +130,18 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file 
*file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device);
 
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
 {
struct ib_uverbs_device *dev =
-   container_of(ref, struct ib_uverbs_device, ref);
+   container_of(kobj, struct ib_uverbs_device, kobj);
 
-   complete(dev-comp);
+   kfree(dev);
 }
 
+static struct kobj_type ib_uverbs_dev_ktype = {
+   .release = ib_uverbs_release_dev,
+};
+
 static void ib_uverbs_release_event_file(struct kref *ref)
 {
struct ib_uverbs_event_file *file =
@@ -303,13 +307,19 @@ static int ib_uverbs_cleanup_ucontext(struct 
ib_uverbs_file *file,
return context-device-dealloc_ucontext(context);
 }
 
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+   complete(dev-comp);
+}
+
 static void ib_uverbs_release_file(struct kref *ref)
 {
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
 
	module_put(file->device->ib_dev->owner);
-   kref_put(&file->device->ref, ib_uverbs_release_dev);
+   if (atomic_dec_and_test(&file->device->refcount))
+   ib_uverbs_comp_dev(file->device);
 
kfree(file);
 }
@@ -775,9 +785,7 @@ static int ib_uverbs_open(struct inode *inode, struct file 
*filp)
int ret;
 
dev = container_of(inode-i_cdev, struct ib_uverbs_device, cdev);
-   if (dev)
-   kref_get(dev-ref);
-   else
+   if (!atomic_inc_not_zero(dev-refcount))
return -ENXIO;
 
if (!try_module_get(dev-ib_dev-owner)) {
@@ -798,6 +806,7 @@ static int ib_uverbs_open(struct inode *inode, struct file 
*filp)
mutex_init(file-mutex);
 
filp-private_data = file;
+   kobject_get(dev-kobj);
 
return nonseekable_open(inode, filp);
 
@@ -805,13 +814,16 @@ err_module:
module_put(dev-ib_dev-owner);
 
 err:
-   kref_put(dev-ref, ib_uverbs_release_dev);
+   if (atomic_dec_and_test(dev-refcount))
+   ib_uverbs_comp_dev(dev);
+
return ret;
 }
 
 static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
struct ib_uverbs_file *file = filp-private_data;
+   struct ib_uverbs_device *dev = file-device;
 
ib_uverbs_cleanup_ucontext(file, file-ucontext);
 
@@ -819,6 +831,7 @@ static int 

[PATCH for-next V7 4/6] IB/uverbs: Enable device removal when there are active user space applications

2015-08-04 Thread Yishai Hadas
Enables the uverbs_remove_one to succeed despite the fact that there are
running IB applications working with the given ib device.  This
functionality enables a HW device to be unbound/reset despite the fact that
there are running user space applications using it.

It exposes a new IB kernel API named 'disassociate_ucontext' which lets
a driver detach its HW resources from a given user context without
crashing/terminating the application. In case a driver implemented the
above API and registered with ib_uverbs, there will be no dependency between its
device and its uverbs_device. Upon calling remove_one of ib_uverbs, the call
should return after disassociating the open HW resources without waiting for
clients to disconnect. In case the driver didn't implement this API, there will be no
change to the current behaviour and uverbs_remove_one will return only when the last
client has disconnected and the reference count on the uverbs device became 0.

In case the lower driver device was removed, any application will
continue working over some zombie HCA; further calls will end with an
immediate error.

Signed-off-by: Yishai Hadas yish...@mellanox.com
Signed-off-by: Shachar Raindel rain...@mellanox.com
---
 drivers/infiniband/core/uverbs.h  |9 +-
 drivers/infiniband/core/uverbs_main.c |  360 +++--
 include/rdma/ib_verbs.h   |1 +
 3 files changed, 302 insertions(+), 68 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ea52db1..3863d33 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -89,12 +89,16 @@ struct ib_uverbs_device {
int num_comp_vectors;
struct completion   comp;
struct device  *dev;
-   struct ib_device   *ib_dev;
+   struct ib_device__rcu  *ib_dev;
int devnum;
struct cdev cdev;
struct rb_root  xrcd_tree;
struct mutexxrcd_tree_mutex;
struct kobject  kobj;
+   struct srcu_struct  disassociate_srcu;
+   struct mutexlists_mutex; /* protect lists */
+   struct list_headuverbs_file_list;
+   struct list_headuverbs_events_file_list;
 };
 
 struct ib_uverbs_event_file {
@@ -106,6 +110,7 @@ struct ib_uverbs_event_file {
wait_queue_head_t   poll_wait;
struct fasync_struct   *async_queue;
struct list_headevent_list;
+   struct list_headlist;
 };
 
 struct ib_uverbs_file {
@@ -115,6 +120,8 @@ struct ib_uverbs_file {
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
struct ib_uverbs_event_file*async_file;
+   struct list_headlist;
+   int is_closed;
 };
 
 struct ib_uverbs_event {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index dc968df..59b28a6 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -137,6 +137,7 @@ static void ib_uverbs_release_dev(struct kobject *kobj)
struct ib_uverbs_device *dev =
container_of(kobj, struct ib_uverbs_device, kobj);
 
+   cleanup_srcu_struct(dev-disassociate_srcu);
kfree(dev);
 }
 
@@ -207,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file 
*file,
 {
struct ib_uobject *uobj, *tmp;
 
-   if (!context)
-   return 0;
-
context-closing = 1;
 
list_for_each_entry_safe(uobj, tmp, context-ah_list, list) {
@@ -318,8 +316,16 @@ static void ib_uverbs_release_file(struct kref *ref)
 {
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
+   struct ib_device *ib_dev;
+   int srcu_key;
+
+   srcu_key = srcu_read_lock(file-device-disassociate_srcu);
+   ib_dev = srcu_dereference(file-device-ib_dev,
+ file-device-disassociate_srcu);
+   if (ib_dev  !ib_dev-disassociate_ucontext)
+   module_put(ib_dev-owner);
+   srcu_read_unlock(file-device-disassociate_srcu, srcu_key);
 
-   module_put(file-device-ib_dev-owner);
if (atomic_dec_and_test(file-device-refcount))
ib_uverbs_comp_dev(file-device);
 
@@ -343,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, 
char __user *buf,
return -EAGAIN;
 
if (wait_event_interruptible(file-poll_wait,
-

[PATCH for-next V7 5/6] IB/mlx4_ib: Disassociate support

2015-08-04 Thread Yishai Hadas
Implements the IB core disassociate_ucontext API. The driver detaches the HW
resources for a given user context to prevent a dependency between application
termination and device disconnecting. This is done by managing the VMAs that
were mapped to the HW bars such as door bell and blueflame. When a detach is
needed, they are remapped to an arbitrary kernel page returned by the zap API.

Signed-off-by: Yishai Hadas yish...@mellanox.com
Signed-off-by: Jack Morgenstein ja...@mellanox.com
---
 drivers/infiniband/hw/mlx4/main.c|  139 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   13 +++
 2 files changed, 150 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 8be6db8..3097a27 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -692,7 +692,7 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct 
ib_device *ibdev,
resp.cqe_size = dev-dev-caps.cqe_size;
}
 
-   context = kmalloc(sizeof *context, GFP_KERNEL);
+   context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return ERR_PTR(-ENOMEM);
 
@@ -729,21 +729,143 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext 
*ibcontext)
return 0;
 }
 
+static void  mlx4_ib_vma_open(struct vm_area_struct *area)
+{
+   /* vma_open is called when a new VMA is created on top of our VMA.
+* This is done through either mremap flow or split_vma (usually due
+* to mlock, madvise, munmap, etc.). We do not support a clone of the
+* vma, as this VMA is strongly hardware related. Therefore we set the
+* vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+* calling us again and trying to do incorrect actions. We assume that
+* the original vma size is exactly a single page that there will be no
+* splitting operations on.
+*/
+   area-vm_ops = NULL;
+}
+
+static void  mlx4_ib_vma_close(struct vm_area_struct *area)
+{
+   struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
+
+   /* It's guaranteed that all VMAs opened on a FD are closed before the
+* file itself is closed, therefore no sync is needed with the regular
+* closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync
+* with accessing the vma as part of mlx4_ib_disassociate_ucontext.
+* The close operation is usually called under mm-mmap_sem except when
+* process is exiting.  The exiting case is handled explicitly as part
+* of mlx4_ib_disassociate_ucontext.
+*/
+   mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
+   area-vm_private_data;
+
+   /* set the vma context pointer to null in the mlx4_ib driver's private
+* data to protect against a race condition in 
mlx4_ib_dissassociate_ucontext().
+*/
+   mlx4_ib_vma_priv_data-vma = NULL;
+}
+
+static const struct vm_operations_struct mlx4_ib_vm_ops = {
+   .open = mlx4_ib_vma_open,
+   .close = mlx4_ib_vma_close
+};
+
+static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+   int i;
+   int ret = 0;
+   struct vm_area_struct *vma;
+   struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+   struct task_struct *owning_process  = NULL;
+   struct mm_struct   *owning_mm   = NULL;
+
+   owning_process = get_pid_task(ibcontext-tgid, PIDTYPE_PID);
+   if (!owning_process)
+   return;
+
+   owning_mm = get_task_mm(owning_process);
+   if (!owning_mm) {
+   pr_info("no mm, disassociate ucontext is pending task termination\n");
+   while (1) {
+   /* make sure that task is dead before returning, it may
+* prevent a rare case of module down in parallel to a
+* call to mlx4_ib_vma_close.
+*/
+   put_task_struct(owning_process);
+   msleep(1);
+   owning_process = get_pid_task(ibcontext-tgid,
+ PIDTYPE_PID);
+   if (!owning_process ||
+   owning_process-state == TASK_DEAD) {
+   pr_info(disassociate ucontext done, task was 
terminated\n);
+   /* in case task was dead need to release the 
task struct */
+   if (owning_process)
+   put_task_struct(owning_process);
+   return;
+   }
+   }
+   }
+
+   /* need to protect from a race on closing the vma as part of
+* mlx4_ib_vma_close().
+*/
+   down_read(owning_mm-mmap_sem);
+   for (i = 0; i  HW_BAR_COUNT; i++) {
+   vma = 

[RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
Hi all,

please take a look at my RFC patch here:


http://git.infradead.org/users/hch/scsi.git/commitdiff/751774250b71da83a26ba8584cff70f5e7bb7b1e

the commit contains my explanation, but apparently the patch is too
large for the list limit and didn't make it through.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 2/4] IB/core: Add rdma netlink helper functions

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 08:48:31PM -0400, ira.weiny wrote:
 We are using Netlink multicast.  I believe that netlink_bind only allows root
 to bind to multicast.

That is a good start...

 That said I have not tested the ability to change the timeout settings if one
 were to bind without multicast and send a message.

I rather expect that needs fixing..

And I bet sequence numbers are sufficiently predictable that the path
reply should be restricted to root as well.

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 2/4] IB/core: Add rdma netlink helper functions

2015-08-04 Thread ira.weiny
On Mon, Aug 03, 2015 at 09:15:34PM -0600, Jason Gunthorpe wrote:
 On Thu, Jul 09, 2015 at 01:34:26PM -0400, kaike@intel.com wrote:
  From: Kaike Wan kaike@intel.com
  
  This patch adds a function to check if listeners for a netlink multicast
  group are present. It also adds a function to receive netlink response
  messages.
  
  Signed-off-by: Kaike Wan kaike@intel.com
  Signed-off-by: John Fleck john.fl...@intel.com
  Signed-off-by: Ira Weiny ira.we...@intel.com
   drivers/infiniband/core/netlink.c |   55 
  +
   include/rdma/rdma_netlink.h   |7 +
   2 files changed, 62 insertions(+), 0 deletions(-)
  
  diff --git a/drivers/infiniband/core/netlink.c 
  b/drivers/infiniband/core/netlink.c
  index 23dd5a5..d47df93 100644
  +++ b/drivers/infiniband/core/netlink.c
  @@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
   static struct sock *nls;
   static LIST_HEAD(client_list);
   
  +int ibnl_chk_listeners(unsigned int group)
  +{
  +   if (netlink_has_listeners(nls, group) == 0)
  +   return -1;
  +   return 0;
  +}
  +EXPORT_SYMBOL(ibnl_chk_listeners);
 
 I was thinking about this today, and, where is the security?
 
 What prevents a non-root user from making the above true and/or worse?

We are using Netlink multicast.  I believe that netlink_bind only allows root
to bind to multicast.

static int netlink_bind(struct socket *sock, struct sockaddr *addr,
int addr_len)
{

...
/* Only superuser is allowed to listen multicasts */
if (groups) {
if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
return -EPERM;
err = netlink_realloc_groups(sk);
if (err)
return err;
}
...


That said I have not tested the ability to change the timeout settings if one
were to bind without multicast and send a message.

I'll see if I can get some time to test this as Kaike is out on vacation.

Ira

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V7 2/6] IB/uverbs: Fix race between ib_uverbs_open and remove_one

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 05:03:24PM +0300, Yishai Hadas wrote:

 Signed-off-by: Yishai Hadas yish...@mellanox.com
 Signed-off-by: Shachar Raindel rain...@mellanox.com
  drivers/infiniband/core/uverbs.h  |3 +-
  drivers/infiniband/core/uverbs_main.c |   43 
 +++--
  2 files changed, 32 insertions(+), 14 deletions(-)

This looks OK

Reviewed-by: Jason Gunthorpe jguntho...@obsidianresearch.com

 @@ -798,6 +806,7 @@ static int ib_uverbs_open(struct inode *inode, struct 
 file *filp)
   mutex_init(file-mutex);
  
   filp-private_data = file;
 + kobject_get(dev-kobj);

Minor quibble, this and the matching put can be dropped. Seeing
container_of followed by a get with out a copy is a red flag to anyone
reading this code.

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 20/22] IB/iser: Support up to 8MB data transfer in a single command

2015-08-04 Thread Or Gerlitz
On Tue, Aug 4, 2015 at 8:10 PM, Sagi Grimberg sa...@dev.mellanox.co.il wrote:

 Why SIZE_4K and not PAGE_SIZE?


 Yes, I'll change that to PAGE_SIZE.

 Thanks.


 Would non-4KB pages (e.g. PPC 64KB) be an issue? Would this work between
 hosts with different page sizes?


 iser was always using 4K segments for reasons I don't perfectly
 understand. Maybe Or can comment on this? I do plan to move it to work
 with system page size (soon I hope).

Yep, back when we integrated the code upstream, we were testing on
IA64 too - where the system page size was 16KB but for some reasons
SGs provided by the block layer were many times NON FMR aligned if you
worked with 16KB FMR chunks. I would recommend to test on PPC64 and if
possible on IA64 before/after removing the usage of 4k segments.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V7 6/6] IB/ucma: HW Device hot-removal support

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 05:03:28PM +0300, Yishai Hadas wrote:
 Currently, IB/cma remove_one flow blocks until all user descriptor managed by
 IB/ucma are released. This prevents hot-removal of IB devices. This patch
 allows IB/cma to remove devices regardless of user space activity. Upon 
 getting
 the RDMA_CM_EVENT_DEVICE_REMOVAL event we close all the underlying HW 
 resources
 for the given ucontext. The ucontext itself is still alive till its explicit
 destroying by its creator.
 
 Running applications at that time will have some zombie device, further
 operations may fail.

I've never made it far enough to look at this patch before... It is
Sean's stuff so he is best qualified to comment on it. The rest of the
series is OK now Sean.

 +static void ucma_close_id(struct work_struct *work)
 +{
 + struct ucma_context *ctx =  container_of(work, struct ucma_context, 
 close_work);
 +
 + /* Fence to ensure that ctx->closing was seen by all
 +  * ucma_get_ctx running
 +  */
 + mutex_lock(&mut);
 + mutex_unlock(&mut);

Uh, no. That isn't how to use mutex's.

I don't really see any consistent locking scheme for closing..

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/ucma: Fix theoretical user triggered use-after-free

2015-08-04 Thread Jason Gunthorpe
Something like this:

CPU A CPU B
  
ucma_destroy_id()
 wait_for_completion()
  .. anything
ucma_put_ctx()
  complete()
 .. continues ...
  ucma_leave_multicast()
   mutex_lock(&mut)
 atomic_inc(&ctx->ref)
   mutex_unlock(&mut)
 ucma_free_ctx()
  ucma_cleanup_multicast()
   mutex_lock(&mut)
 kfree(mc)
   rdma_leave_multicast(mc->ctx->cm_id, ..

Fix it by latching the ref at 0. Once it goes to 0 mc and ctx cannot
leave the mutex(mut) protection.

The other atomic_inc in ucma_get_ctx is OK because mutex(mut) protects
it from racing with ucma_destroy_id.

Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 drivers/infiniband/core/ucma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b21213ea75..acac9eafdbf6 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1321,10 +1321,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file 
*file,
		mc = ERR_PTR(-ENOENT);
	else if (mc->ctx->file != file)
		mc = ERR_PTR(-EINVAL);
-	else {
+	else if (!atomic_inc_not_zero(&mc->ctx->ref))
+		mc = ERR_PTR(-ENXIO);
+	else
		idr_remove(&multicast_idr, mc->id);
-		atomic_inc(&mc->ctx->ref);
-	}
	mutex_unlock(&mut);
 
if (IS_ERR(mc)) {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V1 2/3] IB/core: RoCE GID management separate cleanup and release

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 09:55:03PM +0300, Matan Barak wrote:

 If it fails in ib_device_register_sysfs, the device release function
 isn't called and the device pointer isn't freed.

Ugh, yes, the abuse in ib_dealloc_device needs to go too.

Attached is a compile tested patch that fixes that up. Feel free to
fix it up as necessary. It should be sequenced before your 'Add RoCE
GID table management'.. It would probably be helpful for doug to
resend that one patch adjusted.

 If ib_cache_setup_one fails, we call ib_device_unregister_sysfs and
 the memory will be freed.

?? ib_device_unregister_sysfs doesn't free memory..

At the driver the sequence should be:
 ib_alloc_device
 ib_register_device
 ib_unregister_device
 ib_dealloc_device

The issue I'm looking (which I suspected before, but just went and
confirmed) at is that sysfs's show_port_gid calls ib_query_gid which
your series now makes use the cache, so sysfs should ideally be shut
off before the cache stuff.

.. and we must setup the cache before setting up sysfs, otherwise
there is a race where userspace can trigger a cache call before
setup.. (null deref?)

 ib_device_unregister_sysfs (otherwise I'll have use-after-free).
 What do you think?

I'm still not sure what you are seeing..

You might also want the cache code to have an entry from
ib_alloc_device to setup locks and other no-fail initalization
things. AFIAK there isn't a strong reason to do this other than to
keep with the basic idiom.

Jason

From 32bafdef61a9e571ef1457f7f1966a7372d1b8d7 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe jguntho...@obsidianresearch.com
Date: Tue, 4 Aug 2015 15:11:41 -0600
Subject: [PATCH] IB/core: Make ib_alloc_device initialize the kobject

This gets rid of the weird in-between state where struct ib_device
was allocated but the kobject didn't work.

Consequently ib_device_release is now guaranteed to be called in
all situations and we can't duplicate its kfrees on error paths.
Choose to just drop kfree(port_immutable)

Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 drivers/infiniband/core/core_priv.h |  3 --
 drivers/infiniband/core/device.c| 92 -
 drivers/infiniband/core/sysfs.c | 51 ++--
 3 files changed, 65 insertions(+), 81 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h 
b/drivers/infiniband/core/core_priv.h
index 87d1936f5c1c..950583a62e3b 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,9 +43,6 @@ int  ib_device_register_sysfs(struct ib_device *device,
   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
 
-int  ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
 int  ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756ca4f9..0a2c28610026 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -152,6 +152,35 @@ static int alloc_name(char *name)
return 0;
 }
 
+static void ib_device_release(struct device *device)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	kfree(dev->port_immutable);
+	kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	if (add_uevent_var(env, "NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+
+	return 0;
+}
+
+static struct class ib_class = {
+	.name        = "infiniband",
+	.dev_release = ib_device_release,
+	.dev_uevent  = ib_device_uevent,
+};
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -164,9 +193,27 @@ static int alloc_name(char *name)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
-	BUG_ON(size < sizeof(struct ib_device));
+	struct ib_device *device;
+
+	if (WARN_ON(size < sizeof(struct ib_device)))
+		return NULL;
+
+	device = kzalloc(size, GFP_KERNEL);
+	if (!device)
+		return NULL;
+
+	device->dev.class = &ib_class;
+	device_initialize(&device->dev);
+
+	dev_set_drvdata(&device->dev, device);
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+	INIT_LIST_HEAD(&device->client_data_list);
+	INIT_LIST_HEAD(&device->port_list);
 
-	return kzalloc(size, GFP_KERNEL);
+	return device;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -178,13 +225,8 @@ EXPORT_SYMBOL(ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-   if (device-reg_state == IB_DEV_UNINITIALIZED) {
-   

Re: [PATCH for-next V7 6/6] IB/ucma: HW Device hot-removal support

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 05:03:28PM +0300, Yishai Hadas wrote:
 Currently, IB/cma remove_one flow blocks until all user descriptor managed by
 IB/ucma are released. This prevents hot-removal of IB devices. This patch
 allows IB/cma to remove devices regardless of user space activity. Upon 
 getting
 the RDMA_CM_EVENT_DEVICE_REMOVAL event we close all the underlying HW 
 resources
 for the given ucontext. The ucontext itself is still alive till its explicit
 destroying by its creator.

Implementation aside,

This changes the policy of the ucma from
 Tell user space and expect it to synchronously clean up
To
 Tell user space we already nuked the RDMA device asynchronously

Do we even want to do that unconditionally?

Shouldn't the kernel at least give userspace some time to respond to
the event before nuking the world?

I'm now also wondering if we have some ordering issues on
removal. Will uverbs continue to work while ucma is active?

It looks like at least we can rmmod ib_uverbs and stuff goes really
sideways for poor apps. That is probably a corner case though..

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 04:07:42PM +, Hefty, Sean wrote:
 This looks like a reasonable start.  It may help with feedback if you
 could just post the changes to ib_verbs.h.

Not sure it's all that useful, but here we go:


diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0940051..666b571 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,55 +1100,96 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   union {
-   struct {
-   u64 remote_addr;
-   u32 rkey;
-   } rdma;
-   struct {
-   u64 remote_addr;
-   u64 compare_add;
-   u64 swap;
-   u64 compare_add_mask;
-   u64 swap_mask;
-   u32 rkey;
-   } atomic;
-   struct {
-   struct ib_ah *ah;
-   void   *header;
-   int hlen;
-   int mss;
-   u32 remote_qpn;
-   u32 remote_qkey;
-   u16 pkey_index; /* valid for GSI only */
-   u8  port_num;   /* valid for DR SMPs on switch only 
*/
-   } ud;
-   struct {
-   u64 iova_start;
-   struct ib_fast_reg_page_list   *page_list;
-   unsigned intpage_shift;
-   unsigned intpage_list_len;
-   u32 length;
-   int access_flags;
-   u32 rkey;
-   } fast_reg;
-   struct {
-   struct ib_mw*mw;
-   /* The new rkey for the memory window. */
-   u32  rkey;
-   struct ib_mw_bind_info   bind_info;
-   } bind_mw;
-   struct {
-   struct ib_sig_attrs*sig_attrs;
-   struct ib_mr   *sig_mr;
-   int access_flags;
-   struct ib_sge  *prot;
-   } sig_handover;
-   } wr;
u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
+struct ib_rdma_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u32 rkey;
+};
+
+static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_rdma_wr, wr);
+}
+
+struct ib_atomic_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u64 compare_add;
+   u64 swap;
+   u64 compare_add_mask;
+   u64 swap_mask;
+   u32 rkey;
+};
+
+static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_atomic_wr, wr);
+}
+
+struct ib_ud_wr {
+   struct ib_send_wr   wr;
+   struct ib_ah*ah;
+   void*header;
+   int hlen;
+   int mss;
+   u32 remote_qpn;
+   u32 remote_qkey;
+   u16 pkey_index; /* valid for GSI only */
+   u8  port_num;   /* valid for DR SMPs on switch only 
*/
+};
+
+static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_ud_wr, wr);
+}
+
+struct ib_fast_reg_wr {
+   struct ib_send_wr   wr;
+   u64 iova_start;
+   struct ib_fast_reg_page_list *page_list;
+   unsigned intpage_shift;
+   unsigned intpage_list_len;
+   u32 length;
+   int access_flags;
+   u32 rkey;
+};
+
+static inline struct ib_fast_reg_wr *fast_reg_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_fast_reg_wr, wr);
+}
+
+struct ib_bind_mw_wr {
+   struct ib_send_wr   wr;
+   struct ib_mw*mw;
+   /* The new rkey for the memory window. */
+   u32 rkey;
+   struct ib_mw_bind_info  bind_info;
+};
+
+static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_bind_mw_wr, wr);
+}
+
+struct ib_sig_handover_wr {
+   struct ib_send_wr   wr;
+   struct ib_sig_attrs*sig_attrs;
+   struct ib_mr   *sig_mr;
+   int   

Re: [PATCH for-next V1 2/3] IB/core: RoCE GID management separate cleanup and release

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 03:09:39PM +0300, Matan Barak wrote:
 Correct, I'll change this comment to:
 The release function is called after the device was put.
 This is in order to avoid use-after-free errors if the vendor
 driver's teardown code uses IB cache.

.. the vendor driver uses IB cache from async contexts ..

  + ib_cache_cleanup_one(device);
ib_device_unregister_sysfs(device);
 
  I didn't check closely, but I suspect the above order should be
  swapped, and the matching swap in register. sysfs can legitimately
  call into core code, but vice-versa shouldn't happen...
 
 
 I didn't understand this comment. The cleanup code calls del_gid
 which tells the vendor to delete this GID (and dev_put the
 ndevs). The kref-put (which is called when the device is
 unregistered) frees the memory. If we switch the order, we would
 have use-after-free scenario.

I don't understand your comment either.

What code path from ib_cache will go into ib_sysfs?

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-04 Thread Bart Van Assche

On 08/04/2015 09:29 AM, Christoph Hellwig wrote:

On Tue, Aug 04, 2015 at 04:07:42PM +, Hefty, Sean wrote:

This looks like a reasonable start.  It may help with feedback if you
could just post the changes to ib_verbs.h.


Not sure it's all that useful, but here we go:

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h

 [ ... ]

  struct ib_recv_wr {
+   struct ib_send_wr   wr;
struct ib_recv_wr  *next;
u64 wr_id;
struct ib_sge  *sg_list;


Hello Christoph,

This part of the patch surprised me ?

Bart.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider

2015-08-04 Thread Steve Wise

 -Original Message-
 From: linux-rdma-ow...@vger.kernel.org 
 [mailto:linux-rdma-ow...@vger.kernel.org] On Behalf Of Marciniszyn, Mike
 Sent: Tuesday, August 04, 2015 11:33 AM
 To: Doug Ledford; Steve Wise
 Cc: linux-rdma@vger.kernel.org; t...@opengridcomputing.com
 Subject: RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider
 
  Subject: Re: [PATCH] RDMA/amso1100: deprecate the amso1100 provider
 
  On 07/29/2015 10:44 AM, Steve Wise wrote:
   The HW hasn't been sold since 2005, and the SW has definite bit rot.
   Its time to remove it.  So move it to staging for a few releases and
   then remove it after that.
  
   Signed-off-by: Steve Wise sw...@opengridcomputing.com
 
  Thanks, applied.
 
 
 This might run into the same issue as in 
 https://lists.01.org/pipermail/kbuild-all/2015-August/011216.html.


Hey Mike, what is the issue exactly?



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/12] IB: Replace safe uses for ib_get_dma_mr with pd-local_dma_lkey

2015-08-04 Thread Jason Gunthorpe
On Mon, Aug 03, 2015 at 11:33:51AM -0700, Bart Van Assche wrote:
 Bart, do you know what hardware this workaround is for?
 
 I hope the HW vendors can comment on this. Sorry but I'm not sure which HCA
 models and/or firmware versions do not support FMR mapping with a non-zero
 offset.

Perhaps David can remember why he added this:

commit 8f26c9ff9cd0317ad867bce972f69e0c6c2cbe3c
IB/srp: rework mapping engine to use multiple FMR entries

+	/* If we start at an offset into the FMR page, don't merge into
+	 * the current FMR. Finish it out, and use the kernel's MR for this
+	 * sg entry. This is to avoid potential bugs on some SRP targets
+	 * that were never quite defined, but went away when the initiator
+	 * avoided using FMR on such page fragments.
+	 */
+	if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) {
+		ret = srp_map_finish_fmr(state, target);
+		if (ret)
+			return ret;
+
+		srp_map_desc(state, dma_addr, dma_len, target->rkey);
+		srp_map_update_start(state, NULL, 0, 0);
+		return 0;
	}

The way it is phrased makes it seem like a target bug, not a HCA bug?

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [RFC] split struct ib_send_wr

2015-08-04 Thread Hefty, Sean
 please take a look at my RFC patch here:
 
   http://git.infradead.org/users/hch/scsi.git/commitdiff/751774250b71d
 a83a26ba8584cff70f5e7bb7b1e
 
 the commit contains my explanation, but apparently the patch is too
 large for the list limit and didn't make it through.

This looks like a reasonable start.  It may help with feedback if you could 
just post the changes to ib_verbs.h.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/hfi1: Add Infiniband dependency to Kconfig

2015-08-04 Thread Mike Marciniszyn
From: Jubin John jubin.j...@intel.com

The hfi1 driver depends on the infiniband core stack.
Building a kernel without the infiniband core but with the
hfi1 driver causes undefined reference errors to the ib_* functions.

Reported by 0-day build:
https://lists.01.org/pipermail/kbuild-all/2015-August/011216.html

This is a temporary patch and is only needed while the hfi1 driver is
in staging. Remove when moving driver back to drivers/infiniband.

Reviewed-by: Dennis Dalessandro dennis.dalessan...@intel.com
Reviewed-by: Mike Marciniszyn mike.marcinis...@intel.com
Signed-off-by: Jubin John jubin.j...@intel.com
---
 drivers/staging/hfi1/Kconfig |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/hfi1/Kconfig b/drivers/staging/hfi1/Kconfig
index 87a385a..78bc89e 100644
--- a/drivers/staging/hfi1/Kconfig
+++ b/drivers/staging/hfi1/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_HFI1
	tristate "Intel OPA Gen1 support"
-	depends on X86_64
+	depends on X86_64 && INFINIBAND
	default m
---help---
This is a low-level driver for Intel OPA Gen1 adapter.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] RDMA/ipath: Do not build ipath driver without infiniband subsystem

2015-08-04 Thread dennis . dalessandro
From: Dennis Dalessandro dennis.dalessan...@intel.com

Moving the ipath driver to staging now requires a guard on the INFINIBAND
subsytem in order to prevent the driver from being built without infiniband
enabled. This will lead to a broken build.

Reviewed-by: Mike Marciniszyn mike.marcinis...@intel.com
Signed-off-by: Dennis Dalessandro dennis.dalessan...@intel.com
---
 drivers/staging/ipath/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/ipath/Kconfig b/drivers/staging/ipath/Kconfig
index 041ce06..12dcd40 100644
--- a/drivers/staging/ipath/Kconfig
+++ b/drivers/staging/ipath/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_IPATH
	tristate "QLogic HTX HCA support"
-	depends on 64BIT && NET && HT_IRQ
+	depends on 64BIT && NET && HT_IRQ && INFINIBAND
---help---
This is a driver for the deprecated QLogic Hyper-Transport
IB host channel adapter (model QHT7140),
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 20/22] IB/iser: Support up to 8MB data transfer in a single command

2015-08-04 Thread Sagi Grimberg



Why SIZE_4K and not PAGE_SIZE?


Yes, I'll change that to PAGE_SIZE.

Thanks.


Would non-4KB pages (e.g. PPC 64KB) be an issue? Would this work between hosts 
with different page sizes?


iser was always using 4K segments for reasons I don't perfectly
understand. Maybe Or can comment on this? I do plan to move it to work
with system page size (soon I hope).
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 08:06:16PM +0300, Sagi Grimberg wrote:
 Question though, a ULP may want to keep a couple of WRs around instead
 of having each allocated in the stack and handled one by one. We need
 to provide it with a hint of what is the size it needs.

Note that with the drastic shrink of the struct size the typical case
of needing a fast_reg + one send or a few rdma wr should be just fine
on stack now.  Even more so with your registration cleanups which
will drastically shrink the size of the fast reg mrs again.

 I just posted a patch to do that in iser 
 (http://www.spinics.net/lists/linux-rdma/msg27632.html).

That's actually part of the reason why I didn't manage to convert iser
as it seems so convoluted..

 So if I would want to preallocate an array of work requests, what is the
 size of the space I'd need?
 is it some form of max(sizeof(struct ib_send_wr),
sizeof(struct ib_fastreg_wr),
sizeof(struct sig_handover), ..)?

Preferably you'd preallocate them in a type safe manner, if you really
need to overlay them do it as a proper union.  For example the rds
code is doing that although it only preallocates on per work item.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 08:44:26PM +0300, Sagi Grimberg wrote:
 I do agree that the size on the stack is less of an issue now. What
 still can matter is handling each wr one by one vs. doing a collective
 post.

But if structured correctly you can still do that with on-stack WRs.

 I can understand that. In the last patches I tried to simplify the flow
 (I noticed that these aren't in your tree correct?). There always room
 for cleanups... I'm trying to have a gradual progress towards getting it
 to be simpler.

I used Doug's to be rebased for 4.3 tree from yesterday or so.  So
whatever he merged is in.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider

2015-08-04 Thread Steve Wise
   This might run into the same issue as in
  https://lists.01.org/pipermail/kbuild-all/2015-August/011216.html.
  
 
  Hey Mike, what is the issue exactly?
 
 The problem is when CONFIG_INFINIBAND=n and CONFIG_INFINIBAND_HFI1=y. Here is 
 what I did for ipath which had the same issue:
 
 http://marc.info/?l=linux-rdma&m=143870511731459&w=2
 
 -Denny

Thanks.  I'll submit a patch for amso1100 shortly.

Steve.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Sagi Grimberg

On 8/4/2015 8:47 PM, Christoph Hellwig wrote:

On Tue, Aug 04, 2015 at 08:44:26PM +0300, Sagi Grimberg wrote:

I do agree that the size on the stack is less of an issue now. What
still can matter is handling each wr one by one vs. doing a collective
post.


But if structured correctly you can still do that with on-stack WRs.


No argues.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 09:36:49AM -0700, Bart Van Assche wrote:
 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
  [ ... ]
   struct ib_recv_wr {
 +struct ib_send_wr   wr;
  struct ib_recv_wr  *next;
  u64 wr_id;
  struct ib_sge  *sg_list;
 
 Hello Christoph,
 
 This part of the patch surprised me ?

It should.  It's a stupid cut  paste error, the new member isn't used
at all.  I'll fix it up.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/core, cma: Make __attribute_const__ declarations sparse-friendly

2015-08-04 Thread Sagi Grimberg

On 8/3/2015 8:01 PM, Bart Van Assche wrote:

Move the __attribute_const__ declarations such that sparse understands
that these apply to the function itself and not to the return type.
This avoids that sparse reports error messages like the following:

drivers/infiniband/core/verbs.c:73:12: error: symbol 'ib_event_msg' redeclared 
with different type (originally declared at include/rdma/ib_verbs.h:470) - 
different modifiers

Fixes: 2b1b5b601230 (IB/core, cma: Nice log-friendly string helpers)
Signed-off-by: Bart Van Assche bart.vanass...@sandisk.com
Cc: Sagi Grimberg sa...@mellanox.com


Thanks Bart.

Reviewed-by: Sagi Grimberg sa...@mellanox.com
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Sagi Grimberg

On 8/4/2015 10:16 AM, Christoph Hellwig wrote:

This patch split up struct ib_send_wr so that all non-trivial verbs
use their own structure which embedds struct ib_send_wr.  This dramaticly
shrinks the size of a WR for most common operations.


Hey Christoph,

I think this looks good in general.

Question though, a ULP may want to keep a couple of WRs around instead
of having each allocated in the stack and handled one by one. We need
to provide it with a hint of what is the size it needs.

I just posted a patch to do that in iser 
(http://www.spinics.net/lists/linux-rdma/msg27632.html).


So if I would want to preallocate an array of work requests, what is the
size of the space I'd need?
is it some form of max(sizeof(struct ib_send_wr),
   sizeof(struct ib_fastreg_wr),
   sizeof(struct sig_handover), ..)?



This is just a WIP with basic testing for now.  While all in-tree drivers
except for iSER and iSERt are converted testing coverage is very limited.

For iSER I could really use a helping hand as the WR usage is rather
confusing..


I can do it. no problem.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] RDMA/amso1100: Do not build amso1100 without infiniband subsystem

2015-08-04 Thread Steve Wise
Moving the amso1100 driver to staging now requires a guard on the
INFINIBAND subsytem in order to prevent the driver from being built
without infiniband enabled. This will lead to a broken build.

Signed-off-by: Steve Wise sw...@opengridcomputing.com
---

 drivers/staging/amso1100/Kconfig |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/staging/amso1100/Kconfig b/drivers/staging/amso1100/Kconfig
index e6ce5f2..809cb14 100644
--- a/drivers/staging/amso1100/Kconfig
+++ b/drivers/staging/amso1100/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_AMSO1100
	tristate "Ammasso 1100 HCA support"
-	depends on PCI && INET
+	depends on PCI && INET && INFINIBAND
---help---
  This is a low-level driver for the Ammasso 1100 host
  channel adapter (HCA).

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider

2015-08-04 Thread Dalessandro, Dennis
 -Original Message-
 From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
 ow...@vger.kernel.org] On Behalf Of Steve Wise
 Sent: Tuesday, August 4, 2015 1:13 PM
 To: Marciniszyn, Mike; 'Doug Ledford'
 Cc: linux-rdma@vger.kernel.org; t...@opengridcomputing.com
 Subject: RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider
 
 
  -Original Message-
  From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
 ow...@vger.kernel.org] On Behalf Of Marciniszyn, Mike
  Sent: Tuesday, August 04, 2015 11:33 AM
  To: Doug Ledford; Steve Wise
  Cc: linux-rdma@vger.kernel.org; t...@opengridcomputing.com
  Subject: RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider
 
   Subject: Re: [PATCH] RDMA/amso1100: deprecate the amso1100
 provider
  
   On 07/29/2015 10:44 AM, Steve Wise wrote:
The HW hasn't been sold since 2005, and the SW has definite bit rot.
    It's time to remove it.  So move it to staging for a few releases and
    then remove it after that.
   
Signed-off-by: Steve Wise sw...@opengridcomputing.com
  
   Thanks, applied.
  
 
  This might run into the same issue as in
 https://lists.01.org/pipermail/kbuild-all/2015-August/011216.html.
 
 
 Hey Mike, what is the issue exactly?

The problem is when CONFIG_INFINIBAND=n and CONFIG_INFINIBAND_HFI1=y. Here is 
what I did for ipath which had the same issue:

http://marc.info/?l=linux-rdmam=143870511731459w=2

-Denny


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] RDMA/amso1100: deprecate the amso1100 provider

2015-08-04 Thread Marciniszyn, Mike
 Subject: Re: [PATCH] RDMA/amso1100: deprecate the amso1100 provider
 
 On 07/29/2015 10:44 AM, Steve Wise wrote:
  The HW hasn't been sold since 2005, and the SW has definite bit rot.
  It's time to remove it.  So move it to staging for a few releases and
  then remove it after that.
 
  Signed-off-by: Steve Wise sw...@opengridcomputing.com
 
 Thanks, applied.
 

This might run into the same issue as in 
https://lists.01.org/pipermail/kbuild-all/2015-August/011216.html.

Mike
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Sagi Grimberg

On 8/4/2015 8:23 PM, Christoph Hellwig wrote:

On Tue, Aug 04, 2015 at 08:06:16PM +0300, Sagi Grimberg wrote:

Question though, a ULP may want to keep a couple of WRs around instead
of having each allocated in the stack and handled one by one. We need
to provide it with a hint of what is the size it needs.


Note that with the drastic shrink of the struct size the typical case
of needing a fast_reg + one send or a few rdma wr should be just fine
on stack now.  Even more so with your registration cleanups which
will drastically shrink the size of the fast reg mrs again.


I do agree that the size on the stack is less of an issue now. What
still can matter is handling each wr one by one vs. doing a collective
post.




I just posted a patch to do that in iser
(http://www.spinics.net/lists/linux-rdma/msg27632.html).


That's actually part of the reason why I didn't manage to convert iser
as it seems so convoluted..


I can understand that. In the last patches I tried to simplify the flow
(I noticed that these aren't in your tree, correct?). There's always room
for cleanups... I'm trying to make gradual progress towards getting it
to be simpler.




So if I would want to preallocate an array of work requests, what is the
size of the space I'd need?
is it some form of max(sizeof(struct ib_send_wr),
sizeof(struct ib_fastreg_wr),
sizeof(struct sig_handover), ..)?


Preferably you'd preallocate them in a type safe manner, if you really
need to overlay them do it as a proper union.


Perhaps...
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH V6 6/9] isert: Rename IO functions to more descriptive names

2015-08-04 Thread Steve Wise


 -Original Message-
 From: Sagi Grimberg [mailto:sa...@dev.mellanox.co.il]
 Sent: Tuesday, August 04, 2015 12:26 PM
 To: Steve Wise; dledf...@redhat.com
 Cc: infinip...@intel.com; sa...@mellanox.com; ogerl...@mellanox.com; 
 r...@mellanox.com; linux-rdma@vger.kernel.org;
 e...@mellanox.com; target-de...@vger.kernel.org; linux-...@vger.kernel.org; 
 bfie...@fieldses.org
 Subject: Re: [PATCH V6 6/9] isert: Rename IO functions to more descriptive 
 names
 
  Hey Sagi, how is this coming along?  How can I help?
 
 
 Hi Steve,
 
 This is taking longer than I expected, the changes needed seem
 pretty extensive throughout the IO path. I don't think it will be ready
 for 4.3


Perhaps then we should go with my version that adds iwarp-only FRMR IO handlers 
for 4.3.  Then they can be phased out as the rework matures.  Thoughts?
 
 I'll try to send you soon a preliminary version to play with.
 Acceptable?

I can test the iwarp parts once you think the code is ready to try.

Steve.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V1 2/3] IB/core: RoCE GID management separate cleanup and release

2015-08-04 Thread Matan Barak
On Tue, Aug 4, 2015 at 7:46 PM, Jason Gunthorpe
jguntho...@obsidianresearch.com wrote:
 On Tue, Aug 04, 2015 at 03:09:39PM +0300, Matan Barak wrote:
 Correct, I'll change this comment to:
 The release function is called after the device was put.
 This is in order to avoid use-after-free errors if the vendor
 driver's teardown code uses IB cache.

 .. the vendor driver uses IB cache from async contexts ..


right (or just from the unregister code path of the vendor driver) :)

  + ib_cache_cleanup_one(device);
ib_device_unregister_sysfs(device);
 
  I didn't check closely, but I suspect the above order should be
  swapped, and the matching swap in register. sysfs can legitimately
  call into core code, but vice-versa shouldn't happen...
 

 I didn't understand this comment. The cleanup code calls del_gid
 which tells the vendor to delete this GID (and dev_put the
 ndevs). The kref-put (which is called when the device is
 unregistered) frees the memory. If we switch the order, we would
 have use-after-free scenario.

 I don't understand your comment either.

 What code path from ib_cache will go into ib_sysfs?


The device code goes to ib_sysfs and ib_cache. I was rethinking this
and I think it still might be a bit problematic.
The ib_register_device error flow could fail either in
ib_device_register_sysfs or ib_cache_setup_one.
If it fails in ib_device_register_sysfs, the device release function
isn't called and the device pointer isn't freed.
If ib_cache_setup_one fails, we call ib_device_unregister_sysfs and
the memory will be freed. So it seems like
ib_device_register_sysfs should be the last call in
ib_register_device. But in the ib_unregister_device path, I still
need to first call ib_cache_cleanup_one and then call
ib_device_unregister_sysfs (otherwise I'll have use-after-free).
What do you think?

 Jason

Thanks again for the helpful comments.

Matan
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3] IB/srp: Do not create an all physical insecure rkey by default

2015-08-04 Thread Jason Gunthorpe
The ULP only needs this if the insecure register_always performance
optimization is enabled, or if FRWR/FMR is not supported in the driver.

This is a WIP for this functionality, there are several WARN_ONs in
this patch that can be hit under certain work loads. Additional patches
will be needed to re-arrange each of those cases appropriately.

This patch also changes the default mode to FRWR as FMR will always
hit a WARN_ON.

Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 drivers/infiniband/ulp/srp/ib_srp.c | 72 ++---
 drivers/infiniband/ulp/srp/ib_srp.h |  4 +--
 2 files changed, 53 insertions(+), 23 deletions(-)

This version includes the changes Christoph and Bart requested, it
might work a bit out of the box now.
 - Make prefer_fr = true
 - Get rid of target->rkey and use a mr pointer instead.
   Check the mr pointer for null before every usage.
   This gets rid of register_always checks in the data flow.

Doug, as discussed, this patch is still known not to work, the two
cases that were ID's are guarded by WARN_ON's that will trigger.

Someone else more familiar with SRP will need to build additional
patches to correct those cases. They can be done incrementally before
this patch by testing for register_always.

This is probably unmergable until all the WARN_ONs are fully audited and
removed after proving they cannot trigger.

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c 
b/drivers/infiniband/ulp/srp/ib_srp.c
index 19a1356f8b2a..22d936d47f71 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -68,8 +68,8 @@ static unsigned int srp_sg_tablesize;
 static unsigned int cmd_sg_entries;
 static unsigned int indirect_sg_entries;
 static bool allow_ext_sg;
-static bool prefer_fr;
-static bool register_always;
+static bool prefer_fr = true;
+static bool register_always = true;
 static int topspin_workarounds = 1;
 
 module_param(srp_sg_tablesize, uint, 0444);
@@ -1338,9 +1338,9 @@ static int srp_finish_mapping(struct srp_map_state *state,
if (state-npages == 0)
return 0;
 
-   if (state-npages == 1  !register_always)
+   if (state-npages == 1  target-global_rkey_mr)
srp_map_desc(state, state-base_dma_addr, state-dma_len,
-target-rkey);
+target-global_rkey_mr-rkey);
else
ret = target-srp_host-srp_dev-use_fast_reg ?
srp_map_finish_fr(state, ch) :
@@ -1385,7 +1385,11 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 * go back to FMR or FR mode, so no need to update anything
 * other than the descriptor.
 */
-   srp_map_desc(state, dma_addr, dma_len, target-rkey);
+   if (WARN_ON(!target-global_rkey_mr))
+   return -EINVAL;
+
+   srp_map_desc(state, dma_addr, dma_len,
+target-global_rkey_mr-rkey);
return 0;
}
 
@@ -1397,11 +1401,14 @@ static int srp_map_sg_entry(struct srp_map_state *state,
 */
if ((!dev-use_fast_reg  dma_addr  ~dev-mr_page_mask) ||
dma_len  dev-mr_max_size) {
+   if (WARN_ON(!target-global_rkey_mr))
+   return -EINVAL;
ret = srp_finish_mapping(state, ch);
if (ret)
return ret;
 
-   srp_map_desc(state, dma_addr, dma_len, target-rkey);
+   srp_map_desc(state, dma_addr, dma_len,
+target-global_rkey_mr-rkey);
srp_map_update_start(state, NULL, 0, 0);
return 0;
}
@@ -1462,12 +1469,16 @@ static int srp_map_sg(struct srp_map_state *state, 
struct srp_rdma_ch *ch,
 
state-desc = req-indirect_desc;
state-pages= req-map_page;
+
+   use_mr = !target-global_rkey_mr;
if (dev-use_fast_reg) {
state-next_fr = req-fr_list;
-   use_mr = !!ch-fr_pool;
+   if (WARN_ON(!ch-fr_pool))
+   return -EINVAL;
} else {
state-next_fmr = req-fmr_list;
-   use_mr = !!ch-fmr_pool;
+   if (WARN_ON(!ch-fmr_pool))
+   return -EINVAL;
}
 
for_each_sg(scat, sg, count, i) {
@@ -1489,7 +1500,11 @@ backtrack:
dma_len -= (state-unmapped_addr - dma_addr);
dma_addr = state-unmapped_addr;
use_mr = false;
-   srp_map_desc(state, dma_addr, dma_len, target-rkey);
+   /* FIXME: This is surely the wrong error out */
+   if (WARN_ON(!target-global_rkey_mr))
+   return -EINVAL;
+   srp_map_desc(state, dma_addr, dma_len,
+

Re: [PATCH for-next V7 4/6] IB/uverbs: Enable device removal when there are active user space applications

2015-08-04 Thread Jason Gunthorpe
On Tue, Aug 04, 2015 at 05:03:26PM +0300, Yishai Hadas wrote:
 Enables the uverbs_remove_one to succeed despite the fact that there are
 running IB applications working with the given ib device.  This
 functionality enables a HW device to be unbind/reset despite the fact that
 there are running user space applications using it.
 
 It exposes a new IB kernel API named 'disassociate_ucontext' which lets
 a driver detaching its HW resources from a given user context without
 crashing/terminating the application. In case a driver implemented the
 above API and registered with ib_uverb there will be no dependency between its
 device to its uverbs_device. Upon calling remove_one of ib_uverbs the call
 should return after disassociating the open HW resources without waiting to
 clients disconnecting. In case driver didn't implement this API there will be 
 no
 change to current behaviour and uverbs_remove_one will return only when last
 client has disconnected and reference count on uverbs device became 0.
 
 In case the lower driver device was removed, any application will
 continue working over some zombie HCA; further calls will end with an
 immediate error.

Looks OK now..

Reviewed-by: Jason Gunthorpe jguntho...@obsidianresearch.com

I think I provided reviews for 1-4 now; you forgot to copy them
forward from v6 to help Doug.

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 01/12] IB/core: Guarantee that a local_dma_lkey is available

2015-08-04 Thread ira.weiny
On Tue, Aug 04, 2015 at 04:21:30PM -0600, Jason Gunthorpe wrote:
 Every single ULP requires a local_dma_lkey to do anything with
 a QP, so let us ensure one exists for every PD created.
 
 If the driver can supply a global local_dma_lkey then use that, otherwise
 ask the driver to create a local use all physical memory MR associated
 with the new PD.
 
 Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
 Reviewed-by: Sagi Grimberg sa...@dev.mellanox.co.il
 Acked-by: Christoph Hellwig h...@infradead.org
 Reviewed-by: Steve Wise sw...@opengridcomputing.com

I hit this bug as well.

Reviewed-by: Ira Weiny ira.we...@intel.com
Tested-by: Ira Weiny ira.we...@intel.com


 ---
  drivers/infiniband/core/uverbs_cmd.c |  1 +
  drivers/infiniband/core/verbs.c  | 47 
 
  include/rdma/ib_verbs.h  |  9 ++-
  3 files changed, 45 insertions(+), 12 deletions(-)
 
 This has the extra null assignment in uverbs that Haggai discovered. No 
 changes
 to other patches in the series.
 
 I also wrote a cleanup patch for ib_dealloc_pd:
 
  
 https://github.com/jgunthorpe/linux/commit/7ea87a7f394c2113cc2232edfe785089eb0aea32
 
 I'll post it when I've tested it a bit..
 
 diff --git a/drivers/infiniband/core/uverbs_cmd.c 
 b/drivers/infiniband/core/uverbs_cmd.c
 index bbb02ffe87df..258485ee46b2 100644
 --- a/drivers/infiniband/core/uverbs_cmd.c
 +++ b/drivers/infiniband/core/uverbs_cmd.c
 @@ -562,6 +562,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
  
   pd-device  = file-device-ib_dev;
   pd-uobject = uobj;
 + pd-local_mr = NULL;
   atomic_set(pd-usecnt, 0);
  
   uobj-object = pd;
 diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
 index bac3fb406a74..e9d72a28a9a5 100644
 --- a/drivers/infiniband/core/verbs.c
 +++ b/drivers/infiniband/core/verbs.c
 @@ -213,18 +213,49 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
  
  /* Protection domains */
  
 +/**
 + * ib_alloc_pd - Allocates an unused protection domain.
 + * @device: The device on which to allocate the protection domain.
 + *
 + * A protection domain object provides an association between QPs, shared
 + * receive queues, address handles, memory regions, and memory windows.
 + *
 + * Every PD has a local_dma_lkey which can be used as the lkey value for 
 local
 + * memory operations.
 + */
  struct ib_pd *ib_alloc_pd(struct ib_device *device)
  {
   struct ib_pd *pd;
 + struct ib_device_attr devattr;
 + int rc;
 +
 + rc = ib_query_device(device, devattr);
 + if (rc)
 + return ERR_PTR(rc);
  
   pd = device-alloc_pd(device, NULL, NULL);
 + if (IS_ERR(pd))
 + return pd;
 +
 + pd-device = device;
 + pd-uobject = NULL;
 + pd-local_mr = NULL;
 + atomic_set(pd-usecnt, 0);
 +
 + if (devattr.device_cap_flags  IB_DEVICE_LOCAL_DMA_LKEY)
 + pd-local_dma_lkey = device-local_dma_lkey;
 + else {
 + struct ib_mr *mr;
 +
 + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
 + if (IS_ERR(mr)) {
 + ib_dealloc_pd(pd);
 + return (struct ib_pd *)mr;
 + }
  
 - if (!IS_ERR(pd)) {
 - pd-device  = device;
 - pd-uobject = NULL;
 - atomic_set(pd-usecnt, 0);
 + pd-local_mr = mr;
 + pd-local_dma_lkey = pd-local_mr-lkey;
   }
 -
   return pd;
  }
  EXPORT_SYMBOL(ib_alloc_pd);
 @@ -234,6 +265,12 @@ int ib_dealloc_pd(struct ib_pd *pd)
   if (atomic_read(pd-usecnt))
   return -EBUSY;
  
 + if (pd-local_mr) {
 + if (ib_dereg_mr(pd-local_mr))
 + return -EBUSY;
 + pd-local_mr = NULL;
 + }
 +
   return pd-device-dealloc_pd(pd);
  }
  EXPORT_SYMBOL(ib_dealloc_pd);
 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
 index b0f898e3b2e7..eaec3081fb87 100644
 --- a/include/rdma/ib_verbs.h
 +++ b/include/rdma/ib_verbs.h
 @@ -1252,9 +1252,11 @@ struct ib_udata {
  };
  
  struct ib_pd {
 + u32 local_dma_lkey;
   struct ib_device   *device;
   struct ib_uobject  *uobject;
   atomic_tusecnt; /* count all resources */
 + struct ib_mr   *local_mr;
  };
  
  struct ib_xrcd {
 @@ -2135,13 +2137,6 @@ int ib_find_gid(struct ib_device *device, union ib_gid 
 *gid,
  int ib_find_pkey(struct ib_device *device,
u8 port_num, u16 pkey, u16 *index);
  
 -/**
 - * ib_alloc_pd - Allocates an unused protection domain.
 - * @device: The device on which to allocate the protection domain.
 - *
 - * A protection domain object provides an association between QPs, shared
 - * receive queues, address handles, memory regions, and memory windows.
 - */
  struct ib_pd *ib_alloc_pd(struct ib_device *device);
  
  /**
 -- 
 1.9.1
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the