Re: [PATCH 02/25] IB/mthca, net/mlx4: remove counting semaphores

2015-10-29 Thread Jack Morgenstein
On Wed, 28 Oct 2015 03:45:50 +0100
Arnd Bergmann  wrote:

> As far as I can tell, there is a preexisting race condition
> regarding the cmd->use_events flag, which is not protected
> by any lock. When this flag is toggled while another command
> is being started, that command gets stuck until the mode is
> toggled back.

We fixed this issue in mellanox ofed in a manner that allowed keeping
the same counting mechanism.  IMHO, this is preferable, rather than
totally changing the mechanism.

We will submit a similar patch to the upstream kernel shortly.

-Jack

net/mlx4: Switching between sending commands via polling and events may results 
in hung tasks

When switching between those methonds of sending commands, it's possbile that a
task will keep waiting for the polling sempahore, but may never be able to 
acquire it.
This is due to mlx4_cmd_use_events which "down"s the sempahore back to 0.

Reproducing it involves in sending commands while chaning between 
mlx4_cmd_use_polling
and mlx4_cmd_use_events.

Solving it by adding a read-write semaphore when switching between modes.

issue: 402565
Change-Id: I19f0d40dbb327c49b39a9abbcb2bb002b0279b0b
Signed-off-by: Matan Barak 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c  | 23 +--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  2 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index def1338..f94a960 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -766,17 +766,23 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 
*out_param,
return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO);
 
if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+   int ret;
+
if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
return mlx4_internal_err_ret_value(dev, op,
  op_modifier);
+   down_read(_priv(dev)->cmd.switch_sem);
if (mlx4_priv(dev)->cmd.use_events)
-   return mlx4_cmd_wait(dev, in_param, out_param,
-out_is_imm, in_modifier,
-op_modifier, op, timeout);
+   ret = mlx4_cmd_wait(dev, in_param, out_param,
+   out_is_imm, in_modifier,
+   op_modifier, op, timeout);
else
-   return mlx4_cmd_poll(dev, in_param, out_param,
-out_is_imm, in_modifier,
-op_modifier, op, timeout);
+   ret = mlx4_cmd_poll(dev, in_param, out_param,
+   out_is_imm, in_modifier,
+   op_modifier, op, timeout);
+
+   up_read(_priv(dev)->cmd.switch_sem);
+   return ret;
}
return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm,
  in_modifier, op_modifier, op, timeout);
@@ -2437,6 +2443,7 @@ int mlx4_cmd_init(struct mlx4_dev *dev)
int flags = 0;
 
if (!priv->cmd.initialized) {
+   init_rwsem(>cmd.switch_sem);
mutex_init(>cmd.slave_cmd_mutex);
sema_init(>cmd.poll_sem, 1);
priv->cmd.use_events = 0;
@@ -2566,6 +2573,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev)
if (!priv->cmd.context)
return -ENOMEM;
 
+   down_write(>cmd.switch_sem);
for (i = 0; i < priv->cmd.max_cmds; ++i) {
priv->cmd.context[i].token = i;
priv->cmd.context[i].next  = i + 1;
@@ -2590,6 +2598,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev)
 
down(>cmd.poll_sem);
priv->cmd.use_events = 1;
+   up_write(>cmd.switch_sem);
 
return err;
 }
@@ -2602,6 +2611,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev)
struct mlx4_priv *priv = mlx4_priv(dev);
int i;
 
+   down_write(>cmd.switch_sem);
priv->cmd.use_events = 0;
 
for (i = 0; i < priv->cmd.max_cmds; ++i)
@@ -2610,6 +2620,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev)
kfree(priv->cmd.context);
 
up(>cmd.poll_sem);
+   up_write(>cmd.switch_sem);
 }
 
 struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 6c58021..2f03e6e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -626,6 +627,7 @@ struct mlx4_cmd {
struct 

Re: [PATCH 02/25] IB/mthca, net/mlx4: remove counting semaphores

2015-10-28 Thread Sagi Grimberg

Hi Arnd,


Since we want to make counting semaphores go away,


Why do we want to make counting semaphores go away? completely?
or just for binary use cases?

I have a use case in iser target code where a counting semaphore is the
best suited synchronizing mechanism.

I have a single thread handling connect requests (one at a time) while
connect requests are event driven and come asynchronously. This is
why I use a queue and a counting semaphore to handle this situation.

I'd need to rethink of a new strategy to handle this without counting
semaphores and I'm not entirely sure it would be simpler.


this patch replaces the semaphore counting the event-driven
commands with an open-coded wait-queue, which should
be an equivalent transformation of the code, although
it does not make it any nicer.

As far as I can tell, there is a preexisting race condition
regarding the cmd->use_events flag, which is not protected
by any lock. When this flag is toggled while another command
is being started, that command gets stuck until the mode is
toggled back.

A better solution that would solve the race condition and
at the same time improve the code readability would create
a new locking primitive that replaces both semaphores, like

static int mlx4_use_events(struct mlx4_cmd *cmd)
{
int ret = -EAGAIN;
spin_lock(>lock);
if (cmd->use_events && cmd->commands < cmd->max_commands) {
cmd->commands++;
ret = 1;
} else if (!cmd->use_events && cmd->commands == 0) {
cmd->commands = 1;
ret = 0;
}
spin_unlock(>lock);
return ret;
}

static bool mlx4_use_events(struct mlx4_cmd *cmd)
{
int ret;
wait_event(cmd->events_wq, ret = __mlx4_use_events(cmd) >= 0);
return ret;
}

Cc: Roland Dreier 
Cc: Eli Cohen 
Cc: Yevgeny Petrilin 
Cc: netdev@vger.kernel.org
Cc: linux-r...@vger.kernel.org
Signed-off-by: Arnd Bergmann 

Conflicts:

drivers/net/mlx4/cmd.c
drivers/net/mlx4/mlx4.h
---
  drivers/infiniband/hw/mthca/mthca_cmd.c   | 12 
  drivers/infiniband/hw/mthca/mthca_dev.h   |  3 ++-
  drivers/net/ethernet/mellanox/mlx4/cmd.c  | 12 
  drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++-
  4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c 
b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 9d3e5c1ac60e..aad1852e8e10 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -417,7 +417,8 @@ static int mthca_cmd_wait(struct mthca_dev *dev,
int err = 0;
struct mthca_cmd_context *context;

-   down(>cmd.event_sem);
+   wait_event(dev->cmd.event_wait,
+  atomic_add_unless(>cmd.commands, -1, 0));

spin_lock(>cmd.context_lock);
BUG_ON(dev->cmd.free_head < 0);
@@ -459,7 +460,8 @@ out:
dev->cmd.free_head = context - dev->cmd.context;
spin_unlock(>cmd.context_lock);

-   up(>cmd.event_sem);
+   atomic_inc(>cmd.commands);
+   wake_up(>cmd.event_wait);
return err;
  }

@@ -571,7 +573,8 @@ int mthca_cmd_use_events(struct mthca_dev *dev)
dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
dev->cmd.free_head = 0;

-   sema_init(>cmd.event_sem, dev->cmd.max_cmds);
+   init_waitqueue_head(>cmd.event_wait);
+   atomic_set(>cmd.commands, dev->cmd.max_cmds);
spin_lock_init(>cmd.context_lock);

for (dev->cmd.token_mask = 1;
@@ -597,7 +600,8 @@ void mthca_cmd_use_polling(struct mthca_dev *dev)
dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;

for (i = 0; i < dev->cmd.max_cmds; ++i)
-   down(>cmd.event_sem);
+   wait_event(dev->cmd.event_wait,
+  atomic_add_unless(>cmd.commands, -1, 0));

kfree(dev->cmd.context);

diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h 
b/drivers/infiniband/hw/mthca/mthca_dev.h
index 7e6a6d64ad4e..3055f5c12ac8 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -121,7 +121,8 @@ struct mthca_cmd {
struct pci_pool  *pool;
struct mutex  hcr_mutex;
struct semaphore  poll_sem;
-   struct semaphore  event_sem;
+   wait_queue_head_t event_wait;
+   atomic_t  commands;
int   max_cmds;
spinlock_tcontext_lock;
int   free_head;
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 78f5a1a0b8c8..60134a4245ef 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -273,7 +273,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 
in_param, u64 *out_param,
struct 

[PATCH 02/25] IB/mthca, net/mlx4: remove counting semaphores

2015-10-27 Thread Arnd Bergmann
The mthca and mlx4 device drivers use the same method
to switch between polling and event-driven command mode,
abusing two semaphores to create a mutual exclusion between
one polled command or multiple concurrent event driven
commands.

Since we want to make counting semaphores go away, this
patch replaces the semaphore counting the event-driven
commands with an open-coded wait-queue, which should
be an equivalent transformation of the code, although
it does not make it any nicer.

As far as I can tell, there is a preexisting race condition
regarding the cmd->use_events flag, which is not protected
by any lock. When this flag is toggled while another command
is being started, that command gets stuck until the mode is
toggled back.

A better solution that would solve the race condition and
at the same time improve the code readability would create
a new locking primitive that replaces both semaphores, like

static int mlx4_use_events(struct mlx4_cmd *cmd)
{
int ret = -EAGAIN;
spin_lock(>lock);
if (cmd->use_events && cmd->commands < cmd->max_commands) {
cmd->commands++;
ret = 1;
} else if (!cmd->use_events && cmd->commands == 0) {
cmd->commands = 1;
ret = 0;
}
spin_unlock(>lock);
return ret;
}

static bool mlx4_use_events(struct mlx4_cmd *cmd)
{
int ret;
wait_event(cmd->events_wq, ret = __mlx4_use_events(cmd) >= 0);
return ret;
}

Cc: Roland Dreier 
Cc: Eli Cohen 
Cc: Yevgeny Petrilin 
Cc: netdev@vger.kernel.org
Cc: linux-r...@vger.kernel.org
Signed-off-by: Arnd Bergmann 

Conflicts:

drivers/net/mlx4/cmd.c
drivers/net/mlx4/mlx4.h
---
 drivers/infiniband/hw/mthca/mthca_cmd.c   | 12 
 drivers/infiniband/hw/mthca/mthca_dev.h   |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/cmd.c  | 12 
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++-
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c 
b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 9d3e5c1ac60e..aad1852e8e10 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -417,7 +417,8 @@ static int mthca_cmd_wait(struct mthca_dev *dev,
int err = 0;
struct mthca_cmd_context *context;
 
-   down(>cmd.event_sem);
+   wait_event(dev->cmd.event_wait,
+  atomic_add_unless(>cmd.commands, -1, 0));
 
spin_lock(>cmd.context_lock);
BUG_ON(dev->cmd.free_head < 0);
@@ -459,7 +460,8 @@ out:
dev->cmd.free_head = context - dev->cmd.context;
spin_unlock(>cmd.context_lock);
 
-   up(>cmd.event_sem);
+   atomic_inc(>cmd.commands);
+   wake_up(>cmd.event_wait);
return err;
 }
 
@@ -571,7 +573,8 @@ int mthca_cmd_use_events(struct mthca_dev *dev)
dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
dev->cmd.free_head = 0;
 
-   sema_init(>cmd.event_sem, dev->cmd.max_cmds);
+   init_waitqueue_head(>cmd.event_wait);
+   atomic_set(>cmd.commands, dev->cmd.max_cmds);
spin_lock_init(>cmd.context_lock);
 
for (dev->cmd.token_mask = 1;
@@ -597,7 +600,8 @@ void mthca_cmd_use_polling(struct mthca_dev *dev)
dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;
 
for (i = 0; i < dev->cmd.max_cmds; ++i)
-   down(>cmd.event_sem);
+   wait_event(dev->cmd.event_wait,
+  atomic_add_unless(>cmd.commands, -1, 0));
 
kfree(dev->cmd.context);
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h 
b/drivers/infiniband/hw/mthca/mthca_dev.h
index 7e6a6d64ad4e..3055f5c12ac8 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -121,7 +121,8 @@ struct mthca_cmd {
struct pci_pool  *pool;
struct mutex  hcr_mutex;
struct semaphore  poll_sem;
-   struct semaphore  event_sem;
+   wait_queue_head_t event_wait;
+   atomic_t  commands;
int   max_cmds;
spinlock_tcontext_lock;
int   free_head;
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 78f5a1a0b8c8..60134a4245ef 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -273,7 +273,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 
in_param, u64 *out_param,
struct mlx4_cmd_context *context;
int err = 0;
 
-   down(>event_sem);
+   wait_event(cmd->event_wait,
+  atomic_add_unless(>commands, -1, 0));
 
spin_lock(>context_lock);
BUG_ON(cmd->free_head < 0);
@@ -305,7 +306,8 @@ out:
cmd->free_head = context - cmd->context;