Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-13 Thread jackm
On Fri,  8 Jun 2018 10:42:18 -0700
Matthew Wilcox  wrote:

> From: Matthew Wilcox 
> 
> Allocate agent IDs from a global IDR instead of an atomic variable.
> This eliminates the possibility of reusing an ID which is already in
> use after 4 billion registrations, and we can also limit the assigned
> ID to be less than 2^24, which fixes a bug in the mlx4 device.
> 
> We look up the agent under protection of the RCU lock, which means we
> have to free the agent using kfree_rcu, and only increment the
> reference counter if it is not 0.
> 
> Signed-off-by: Matthew Wilcox 
> ---
>  drivers/infiniband/core/mad.c  | 78
> ++ drivers/infiniband/core/mad_priv.h |
> 7 +-- include/linux/idr.h|  9 
>  3 files changed, 59 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/infiniband/core/mad.c
> b/drivers/infiniband/core/mad.c index 68f4dda916c8..62384a3dd3ec
> 100644 --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -38,6 +38,7 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -58,8 +59,8 @@ MODULE_PARM_DESC(send_queue_size, "Size of send
> queue in number of work requests module_param_named(recv_queue_size,
> mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size
> of receive queue in number of work requests"); 
> +static DEFINE_IDR(ib_mad_clients);
>  static struct list_head ib_mad_port_list;
> -static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
>  
>  /* Port list lock */
>  static DEFINE_SPINLOCK(ib_mad_port_list_lock);
> @@ -377,13 +378,24 @@ struct ib_mad_agent
> *ib_register_mad_agent(struct ib_device *device, goto error4;
>   }
>  
> - spin_lock_irq(_priv->reg_lock);
> - mad_agent_priv->agent.hi_tid =
> atomic_inc_return(_mad_client_id);
> + idr_preload(GFP_KERNEL);
> + idr_lock(_mad_clients);
> + ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
> + (1 << 24), GFP_ATOMIC);
> + idr_unlock(_mad_clients);
> + idr_preload_end();
> +
> + if (ret2 < 0) {
> + ret = ERR_PTR(ret2);
> + goto error5;
> + }
> + mad_agent_priv->agent.hi_tid = ret2;
>  
>   /*
>* Make sure MAD registration (if supplied)
>* is non overlapping with any existing ones
>*/
> + spin_lock_irq(_priv->reg_lock);
>   if (mad_reg_req) {
>   mgmt_class =
> convert_mgmt_class(mad_reg_req->mgmt_class); if
> (!is_vendor_class(mgmt_class)) { @@ -394,7 +406,7 @@ struct
> ib_mad_agent *ib_register_mad_agent(struct ib_device *device, if
> (method) { if (method_in_use(,
>  mad_reg_req))
> - goto error5;
> + goto error6;
>   }
>   }
>   ret2 = add_nonoui_reg_req(mad_reg_req,
> mad_agent_priv, @@ -410,24 +422,25 @@ struct ib_mad_agent
> *ib_register_mad_agent(struct ib_device *device, if
> (is_vendor_method_in_use( vendor_class,
>   mad_reg_req))
> - goto error5;
> + goto error6;
>   }
>   }
>   ret2 = add_oui_reg_req(mad_reg_req,
> mad_agent_priv); }
>   if (ret2) {
>   ret = ERR_PTR(ret2);
> - goto error5;
> + goto error6;
>   }
>   }
> -
> - /* Add mad agent into port's agent list */
> - list_add_tail(_agent_priv->agent_list,
> _priv->agent_list); spin_unlock_irq(_priv->reg_lock);
>  
>   return _agent_priv->agent;
> -error5:
> +error6:
>   spin_unlock_irq(_priv->reg_lock);
> + idr_lock(_mad_clients);
> + idr_remove(_mad_clients, mad_agent_priv->agent.hi_tid);
> + idr_unlock(_mad_clients);
> +error5:
>   ib_mad_agent_security_cleanup(_agent_priv->agent);
>  error4:
>   kfree(reg_req);
> @@ -589,8 +602,10 @@ static void unregister_mad_agent(struct
> ib_mad_agent_private *mad_agent_priv) 
>   spin_lock_irq(_priv->reg_lock);
>   remove_mad_reg_req(mad_agent_priv);
> - list_del(_agent_priv->agent_list);
>   spin_unlock_irq(_priv->reg_lock);
> + idr_lock(_mad_clients);
> + idr_remove(_mad_clients, mad_agent_priv->agent.hi_tid);
> + idr_unlock(_mad_clients);
>  
>   flush_workqueue(port_priv->wq);
>   ib_cancel_rmpp_recvs(mad_agent_priv);
> @@ -601,7 +616,7 @@ static void unregister_mad_agent(struct
> ib_mad_agent_private *mad_agent_priv)
> ib_mad_agent_security_cleanup(_agent_priv->agent); 
>   kfree(mad_agent_priv->reg_req);
> - kfree(mad_agent_priv);
> + kfree_rcu(mad_agent_priv, rcu);
>  }
>  
>  static void unregister_mad_snoop(struct ib_mad_snoop_private
> 

Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-13 Thread Leon Romanovsky
On Tue, Jun 12, 2018 at 05:07:39PM -0700, Matthew Wilcox wrote:
> On Tue, Jun 12, 2018 at 02:33:22PM -0600, Jason Gunthorpe wrote:
> > > @@ -377,13 +378,24 @@ struct ib_mad_agent *ib_register_mad_agent(struct 
> > > ib_device *device,
> > >   goto error4;
> > >   }
> > >
> > > - spin_lock_irq(_priv->reg_lock);
> > > - mad_agent_priv->agent.hi_tid = atomic_inc_return(_mad_client_id);
> > > + idr_preload(GFP_KERNEL);
> > > + idr_lock(_mad_clients);
> > > + ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
> > > + (1 << 24), GFP_ATOMIC);
> >
> > I like this series, my only concern is this magic number here, at the
> > very least it deserves a big comment explaining why it
> > exists..
>
> Yes, you're right.  Maybe something like this ...
>
> /*
>  * The mlx4 driver uses the top byte to distinguish which virtual function
>  * generated the MAD, so we must avoid using it.
>  */
> #define AGENT_ID_LIMIT(1 << 24)
>
> ...
>
>   ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
>   AGENT_ID_LIMIT, GFP_ATOMIC);
>
> > Let me see if I can get someone to give it some test time, I assume
> > you haven't been able to test it it?
>
> I don't have any IB hardware.  Frankly I'm scared of Infiniband ;-)

I took it for regression, reliable results will be after -rc1 only.

Thanks


signature.asc
Description: PGP signature


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-12 Thread Matthew Wilcox
On Tue, Jun 12, 2018 at 02:33:22PM -0600, Jason Gunthorpe wrote:
> > @@ -377,13 +378,24 @@ struct ib_mad_agent *ib_register_mad_agent(struct 
> > ib_device *device,
> > goto error4;
> > }
> >  
> > -   spin_lock_irq(_priv->reg_lock);
> > -   mad_agent_priv->agent.hi_tid = atomic_inc_return(_mad_client_id);
> > +   idr_preload(GFP_KERNEL);
> > +   idr_lock(_mad_clients);
> > +   ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
> > +   (1 << 24), GFP_ATOMIC);
> 
> I like this series, my only concern is this magic number here, at the
> very least it deserves a big comment explaining why it
> exists..

Yes, you're right.  Maybe something like this ...

/*
 * The mlx4 driver uses the top byte to distinguish which virtual function
 * generated the MAD, so we must avoid using it.
 */
#define AGENT_ID_LIMIT  (1 << 24)

...

ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
AGENT_ID_LIMIT, GFP_ATOMIC);

> Let me see if I can get someone to give it some test time, I assume
> you haven't been able to test it it?

I don't have any IB hardware.  Frankly I'm scared of Infiniband ;-)

> > +#define idr_lock(idr)  xa_lock(&(idr)->idr_rt)
> > +#define idr_unlock(idr)xa_unlock(&(idr)->idr_rt)
> > +#define idr_lock_irq(idr)  xa_lock_irq(&(idr)->idr_rt)
> > +#define idr_unlock_irq(idr)xa_unlock_irq(&(idr)->idr_rt)
> > +#define idr_lock_irqsave(idr, flags) \
> > +   xa_lock_irqsave(&(idr)->idr_rt, flags)
> > +#define idr_unlock_irqrestore(idr, flags) \
> > +   xa_unlock_irqrestore(&(idr)->idr_rt, flags)
> > +
> 
> And you are Ok to take these through the rdma tree?

Yes, that's fine with me; I'm not planning on merging any IDR patches
this cycle.  Thanks!


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-12 Thread Jason Gunthorpe
On Fri, Jun 08, 2018 at 10:42:18AM -0700, Matthew Wilcox wrote:
> From: Matthew Wilcox 
> 
> Allocate agent IDs from a global IDR instead of an atomic variable.
> This eliminates the possibility of reusing an ID which is already in
> use after 4 billion registrations, and we can also limit the assigned
> ID to be less than 2^24, which fixes a bug in the mlx4 device.
> 
> We look up the agent under protection of the RCU lock, which means we
> have to free the agent using kfree_rcu, and only increment the reference
> counter if it is not 0.
> 
> Signed-off-by: Matthew Wilcox 
>  drivers/infiniband/core/mad.c  | 78 ++
>  drivers/infiniband/core/mad_priv.h |  7 +--
>  include/linux/idr.h|  9 
>  3 files changed, 59 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 68f4dda916c8..62384a3dd3ec 100644
> +++ b/drivers/infiniband/core/mad.c
> @@ -38,6 +38,7 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -58,8 +59,8 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in 
> number of work requests
>  module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
>  MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work 
> requests");
>  
> +static DEFINE_IDR(ib_mad_clients);
>  static struct list_head ib_mad_port_list;
> -static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
>  
>  /* Port list lock */
>  static DEFINE_SPINLOCK(ib_mad_port_list_lock);
> @@ -377,13 +378,24 @@ struct ib_mad_agent *ib_register_mad_agent(struct 
> ib_device *device,
>   goto error4;
>   }
>  
> - spin_lock_irq(_priv->reg_lock);
> - mad_agent_priv->agent.hi_tid = atomic_inc_return(_mad_client_id);
> + idr_preload(GFP_KERNEL);
> + idr_lock(_mad_clients);
> + ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
> + (1 << 24), GFP_ATOMIC);

I like this series, my only concern is this magic number here, at the
very least it deserves a big comment explaining why it
exists..

Let me see if I can get someone to give it some test time, I assume
you haven't been able to test it it?

> diff --git a/include/linux/idr.h b/include/linux/idr.h
> index e856f4e0ab35..bef0df8600e2 100644
> +++ b/include/linux/idr.h
> @@ -81,6 +81,15 @@ static inline void idr_set_cursor(struct idr *idr, 
> unsigned int val)
>   WRITE_ONCE(idr->idr_next, val);
>  }
>  
> +#define idr_lock(idr)xa_lock(&(idr)->idr_rt)
> +#define idr_unlock(idr)  xa_unlock(&(idr)->idr_rt)
> +#define idr_lock_irq(idr)xa_lock_irq(&(idr)->idr_rt)
> +#define idr_unlock_irq(idr)  xa_unlock_irq(&(idr)->idr_rt)
> +#define idr_lock_irqsave(idr, flags) \
> + xa_lock_irqsave(&(idr)->idr_rt, flags)
> +#define idr_unlock_irqrestore(idr, flags) \
> + xa_unlock_irqrestore(&(idr)->idr_rt, flags)
> +

And you are Ok to take these through the rdma tree?

Thanks,
Jason


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-12 Thread Jason Gunthorpe
On Tue, Jun 12, 2018 at 07:59:42AM +0300, jackm wrote:
> On Mon, 11 Jun 2018 10:19:18 -0600
> Jason Gunthorpe  wrote:
> 
> > On Mon, Jun 11, 2018 at 09:19:14AM +0300, jackm wrote:
> > > On Sun, 10 Jun 2018 22:42:03 -0600
> > > Jason Gunthorpe  wrote:
> > >   
> > > > Er, the spec has nothing to do with this. In Linux the TID is made
> > > > unique because the core code provides 32 bits that are unique and
> > > > the user provides another 32 bits that are unique. The driver
> > > > cannot change any of those bits without risking non-uniquenes,
> > > > which is exactly the bug mlx4 created when it stepped outside its
> > > > bounds and improperly overrode bits in the TID for its own
> > > > internal use.  
> > > 
> > > Actually, the opposite is true here.  When SRIOV is active, each VM
> > > generates its *own* TIDs -- with 32 bits of agent number and 32 bits
> > > of counter.  
> > 
> > And it does it while re-using the LRH of the host, so all VMs and the
> > host are now forced to share a TID space, yes I know.
> > 
> > > There is a chance that two different VMs can generate the same TID!
> > > Encoding the slave (VM) number in the packet actually guarantees
> > > uniqueness here.  
> > 
> > Virtualizing the TID in the driver would be fine, but it must
> > virtualize all the TIDs (even those generated by the HOST).
> 
> It DOES do so.  The host slave id is 0. Slave numbers start with 1.
> If the MS byte contains a zero after paravirtualization, the MAD
> was sent by the host.
> In fact, ALL mads are paravirtualized -- including those to/from the host.

Just assuming the byte is 0 and replacing it with something else is
*NOT* virtualization.

Jason


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-12 Thread Matthew Wilcox
On Tue, Jun 12, 2018 at 11:50:46AM +0300, jackm wrote:
> On Fri,  8 Jun 2018 10:42:18 -0700
> Matthew Wilcox  wrote:
> 
> > +   rcu_read_lock();
> > +   mad_agent = idr_find(_mad_clients, hi_tid);
> > +   if (mad_agent
> > && !atomic_inc_not_zero(_agent->refcount))
> > +   mad_agent = NULL;
> > +   rcu_read_unlock();
> 
> Hi Matthew,
> 
> I don't see the flow which can explain using atomic_inc_not_zero() here.
> 
> The refcount will go to zero only when unregister_mad_agent() is
> called (code below, see asterisks):
> idr_lock(_mad_clients);
>  ***idr_remove(_mad_clients, mad_agent_priv->agent.hi_tid);
> idr_unlock(_mad_clients);
> 
> flush_workqueue(port_priv->wq);
> ib_cancel_rmpp_recvs(mad_agent_priv);
> 
>  ***deref_mad_agent(mad_agent_priv);
>   [JPM] The call to idr_find in the interrupt context
>   would need to occur here for the refcount to have a
>   possibility of being zero.
>   Shouldn't idr_find in the interrupt context fail, since
>   idr_remove has already been invoked?

RCU is tricky.  Here's the flow:

CPU 0   CPU 1
rcu_read_lock();
mad_agent = idr_find(_mad_clients, hi_tid);
idr_lock(_mad_clients);
idr_remove(_mad_clients, mad_agent_priv->agent.hi_tid);
idr_unlock(_mad_clients);
flush_workqueue(port_priv->wq);
ib_cancel_rmpp_recvs(mad_agent_priv);
deref_mad_agent(mad_agent_priv);

Now, you're going to argue that CPU 0 is running in interrupt context, but
with virtualisation, it can have the CPU taken away from it at any time.
This window which looks like a couple of instructions long can actually
be seconds long.

>   wait_for_completion(_agent_priv->comp);
> 
> The refcount will be able to go to zero only after deref_mad_agent is
> called above.  Before this, however, idr_remove() has been called --
> so, if my understanding is correct, the idr_find call in
> find_mad_agent() should not succeed since the refcount can get to zero
> only AFTER the idr_remove call.
> 
> Could you please explain the flow which can result in idr_find
> succeeding (in the interrupt context) after idr_remove has been invoked
> (in the process context)?  Will idr_find succeed even after
> idr_remove, and only fail after kfree_rcu is invoked as well? (or,
> maybe after some garbage-collection delay?)

Ordering is weird in SMP systems.  You can appear to have causality
violations when you're operating locklessly (and rcu_read_lock()
is essentially lockless).  So we can absolutely observe the store to
agent->refcount before we observe the store to idr->agent.

Documentation/memory-barriers.txt has a LOT more information on this.


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-12 Thread jackm
On Fri,  8 Jun 2018 10:42:18 -0700
Matthew Wilcox  wrote:

> + rcu_read_lock();
> + mad_agent = idr_find(_mad_clients, hi_tid);
> + if (mad_agent
> && !atomic_inc_not_zero(_agent->refcount))
> + mad_agent = NULL;
> + rcu_read_unlock();

Hi Matthew,

I don't see the flow which can explain using atomic_inc_not_zero() here.

The refcount will go to zero only when unregister_mad_agent() is
called (code below, see asterisks):
idr_lock(_mad_clients);
 ***idr_remove(_mad_clients, mad_agent_priv->agent.hi_tid);
idr_unlock(_mad_clients);

flush_workqueue(port_priv->wq);
ib_cancel_rmpp_recvs(mad_agent_priv);

 ***deref_mad_agent(mad_agent_priv);
[JPM] The call to idr_find in the interrupt context
would need to occur here for the refcount to have a
possibility of being zero.
Shouldn't idr_find in the interrupt context fail, since
idr_remove has already been invoked?
wait_for_completion(_agent_priv->comp);

The refcount will be able to go to zero only after deref_mad_agent is
called above.  Before this, however, idr_remove() has been called --
so, if my understanding is correct, the idr_find call in
find_mad_agent() should not succeed since the refcount can get to zero
only AFTER the idr_remove call.

Could you please explain the flow which can result in idr_find
succeeding (in the interrupt context) after idr_remove has been invoked
(in the process context)?  Will idr_find succeed even after
idr_remove, and only fail after kfree_rcu is invoked as well? (or,
maybe after some garbage-collection delay?)

Thx!

-Jack


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-11 Thread jackm
On Mon, 11 Jun 2018 10:19:18 -0600
Jason Gunthorpe  wrote:

> On Mon, Jun 11, 2018 at 09:19:14AM +0300, jackm wrote:
> > On Sun, 10 Jun 2018 22:42:03 -0600
> > Jason Gunthorpe  wrote:
> >   
> > > Er, the spec has nothing to do with this. In Linux the TID is made
> > > unique because the core code provides 32 bits that are unique and
> > > the user provides another 32 bits that are unique. The driver
> > > cannot change any of those bits without risking non-uniquenes,
> > > which is exactly the bug mlx4 created when it stepped outside its
> > > bounds and improperly overrode bits in the TID for its own
> > > internal use.  
> > 
> > Actually, the opposite is true here.  When SRIOV is active, each VM
> > generates its *own* TIDs -- with 32 bits of agent number and 32 bits
> > of counter.  
> 
> And it does it while re-using the LRH of the host, so all VMs and the
> host are now forced to share a TID space, yes I know.
> 
> > There is a chance that two different VMs can generate the same TID!
> > Encoding the slave (VM) number in the packet actually guarantees
> > uniqueness here.  
> 
> Virtualizing the TID in the driver would be fine, but it must
> virtualize all the TIDs (even those generated by the HOST).

It DOES do so.  The host slave id is 0. Slave numbers start with 1.
If the MS byte contains a zero after paravirtualization, the MAD
was sent by the host.
In fact, ALL mads are paravirtualized -- including those to/from the host.

> 
> Just blindly assuming the host doesn't generate TID's that overlap
> with the virtualization process is a bug.
> 
Not the case, host mads are also paravirtualized.

> > There is nothing wrong with modifying the TID in a reversible way in
> > order to: a. guarantee uniqueness b. identify the VM which should
> > receive the response packet  
> 
> Sure, as long as *all* TID's sharing a LRH are vitalized like this.
> 
> > The problem was created when the agent-id numbers started to use the
> > most-significant byte (thus making the MSB slave-id addition
> > impossible).  
> 
> It hasn't always been this way? What commit?
> 
Commit: 37bfc7c1e83f1 ("IB/mlx4: SR-IOV multiplex and demultiplex MADs"):

Code snippet which replaces the MS byte is below (file 
drivers/infiniband/hw/mlx4/mad.c,
procedure mlx4_ib_multiplex_mad() ).
Just an aside:  Oracle noted the problem on the *host*: The host's message log
contained the warning issued on line 1529, with slave=0 (which is the 
hypervisor).

37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1519)  switch 
(tunnel->mad.mad_hdr.method) {
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1520)  case 
IB_MGMT_METHOD_SET:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1521)  case 
IB_MGMT_METHOD_GET:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1522)  case 
IB_MGMT_METHOD_REPORT:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1523)  case 
IB_SA_METHOD_GET_TABLE:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1524)  case 
IB_SA_METHOD_DELETE:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1525)  case 
IB_SA_METHOD_GET_MULTI:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1526)  case 
IB_SA_METHOD_GET_TRACE_TBL:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1527)  
slave_id = (u8 *) >mad.mad_hdr.tid;
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1528)  
if (*slave_id) {
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1529)  
mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1530)  
 "class:%d slave:%d\n", *slave_id,
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1531)  
 tunnel->mad.mad_hdr.mgmt_class, slave);
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1532)  
return;
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1533)  
} else
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1534)  
*slave_id = slave;
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1535)  default:
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1536)  
/* nothing */;
37bfc7c1e (Jack Morgenstein2012-08-03 08:40:44 + 1537)  }

> Jason 



Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-11 Thread Jason Gunthorpe
On Mon, Jun 11, 2018 at 09:19:14AM +0300, jackm wrote:
> On Sun, 10 Jun 2018 22:42:03 -0600
> Jason Gunthorpe  wrote:
> 
> > Er, the spec has nothing to do with this. In Linux the TID is made
> > unique because the core code provides 32 bits that are unique and the
> > user provides another 32 bits that are unique. The driver cannot
> > change any of those bits without risking non-uniquenes, which is
> > exactly the bug mlx4 created when it stepped outside its bounds and
> > improperly overrode bits in the TID for its own internal use.
> 
> Actually, the opposite is true here.  When SRIOV is active, each VM
> generates its *own* TIDs -- with 32 bits of agent number and 32 bits
> of counter.

And it does it while re-using the LRH of the host, so all VMs and the
host are now forced to share a TID space, yes I know.

> There is a chance that two different VMs can generate the same TID!
> Encoding the slave (VM) number in the packet actually guarantees
> uniqueness here.

Virtualizing the TID in the driver would be fine, but it must
virtualize all the TIDs (even those generated by the HOST).

Just blindly assuming the host doesn't generate TID's that overlap
with the virtualization process is a bug.

> There is nothing wrong with modifying the TID in a reversible way in
> order to: a. guarantee uniqueness b. identify the VM which should
> receive the response packet

Sure, as long as *all* TID's sharing a LRH are vitalized like this.

> The problem was created when the agent-id numbers started to use the
> most-significant byte (thus making the MSB slave-id addition
> impossible).

It hasn't always been this way? What commit?

Jason 


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-11 Thread jackm
On Sun, 10 Jun 2018 22:42:03 -0600
Jason Gunthorpe  wrote:

> Er, the spec has nothing to do with this. In Linux the TID is made
> unique because the core code provides 32 bits that are unique and the
> user provides another 32 bits that are unique. The driver cannot
> change any of those bits without risking non-uniquenes, which is
> exactly the bug mlx4 created when it stepped outside its bounds and
> improperly overrode bits in the TID for its own internal use.

Actually, the opposite is true here.
When SRIOV is active, each VM generates its *own* TIDs -- with 32 bits
of agent number and 32 bits of counter.

There is a  chance that two different VMs can generate the same TID!
Encoding the slave (VM) number in the packet actually guarantees
uniqueness here.
There is nothing wrong with modifying the TID in a reversible way in
order to:
a. guarantee uniqueness
b. identify the VM which should receive the response packet

The problem was created when the agent-id numbers started to use the
most-significant byte (thus making the MSB slave-id addition
impossible).


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Jason Gunthorpe
On Mon, Jun 11, 2018 at 07:34:25AM +0300, Leon Romanovsky wrote:
> On Sun, Jun 10, 2018 at 02:30:27PM -0600, Jason Gunthorpe wrote:
> > On Sun, Jun 10, 2018 at 03:25:05PM +0300, Leon Romanovsky wrote:
> > > On Sun, Jun 10, 2018 at 03:43:05AM -0700, Matthew Wilcox wrote:
> > > > On Sun, Jun 10, 2018 at 09:30:28AM +0300, Leon Romanovsky wrote:
> > > > > 1. IBTA spec doesn't talk at all about the size of TransactionID, more
> > > > > on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
> > > > > says: "The contents of the TransactionID (TID) field are 
> > > > > implementation-
> > > > > dependent. So let's don't call it mlx4 bug.
> > > >
> > > > I was loosely paraphrasing the original bug report; suggested rewording
> > > > of the comments gratefully accepted.
> > >
> > > Just replace "mlx4 bug" with something like "to comply with mlx4
> > > implementation".
> >
> > Well, it is a bug. Blindly replacing the upper 8 bits of the TID in a
> > driver without accommodation from the core is totally, bonkers wrong.
> 
> I provided cite from spec that says that TID can be whatever you want as
> long as you success to do it unique.

Er, the spec has nothing to do with this. In Linux the TID is made
unique because the core code provides 32 bits that are unique and the
user provides another 32 bits that are unique. The driver cannot
change any of those bits without risking non-uniquenes, which is
exactly the bug mlx4 created when it stepped outside its bounds and
improperly overrode bits in the TID for its own internal use.

Jason


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Leon Romanovsky
On Sun, Jun 10, 2018 at 02:30:27PM -0600, Jason Gunthorpe wrote:
> On Sun, Jun 10, 2018 at 03:25:05PM +0300, Leon Romanovsky wrote:
> > On Sun, Jun 10, 2018 at 03:43:05AM -0700, Matthew Wilcox wrote:
> > > On Sun, Jun 10, 2018 at 09:30:28AM +0300, Leon Romanovsky wrote:
> > > > 1. IBTA spec doesn't talk at all about the size of TransactionID, more
> > > > on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
> > > > says: "The contents of the TransactionID (TID) field are implementation-
> > > > dependent. So let's don't call it mlx4 bug.
> > >
> > > I was loosely paraphrasing the original bug report; suggested rewording
> > > of the comments gratefully accepted.
> >
> > Just replace "mlx4 bug" with something like "to comply with mlx4
> > implementation".
>
> Well, it is a bug. Blindly replacing the upper 8 bits of the TID in a
> driver without accommodation from the core is totally, bonkers wrong.

I provided cite from spec that says that TID can be whatever you want as
long as you success to do it unique.

>
> The original concept was that the 1<<24 limit would come from the
> driver and only mlx4 would have less than 1<<32, because only mlx4
> does this thing..
>
> Thanks Matt,
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: PGP signature


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Jason Gunthorpe
On Sun, Jun 10, 2018 at 03:25:05PM +0300, Leon Romanovsky wrote:
> On Sun, Jun 10, 2018 at 03:43:05AM -0700, Matthew Wilcox wrote:
> > On Sun, Jun 10, 2018 at 09:30:28AM +0300, Leon Romanovsky wrote:
> > > 1. IBTA spec doesn't talk at all about the size of TransactionID, more
> > > on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
> > > says: "The contents of the TransactionID (TID) field are implementation-
> > > dependent. So let's don't call it mlx4 bug.
> >
> > I was loosely paraphrasing the original bug report; suggested rewording
> > of the comments gratefully accepted.
> 
> Just replace "mlx4 bug" with something like "to comply with mlx4
> implementation".

Well, it is a bug. Blindly replacing the upper 8 bits of the TID in a
driver without accommodation from the core is totally, bonkers wrong.

The original concept was that the 1<<24 limit would come from the
driver and only mlx4 would have less than 1<<32, because only mlx4
does this thing..

Thanks Matt,
Jason


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Leon Romanovsky
On Sun, Jun 10, 2018 at 03:43:05AM -0700, Matthew Wilcox wrote:
> On Sun, Jun 10, 2018 at 09:30:28AM +0300, Leon Romanovsky wrote:
> > 1. IBTA spec doesn't talk at all about the size of TransactionID, more
> > on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
> > says: "The contents of the TransactionID (TID) field are implementation-
> > dependent. So let's don't call it mlx4 bug.
>
> I was loosely paraphrasing the original bug report; suggested rewording
> of the comments gratefully accepted.

Just replace "mlx4 bug" with something like "to comply with mlx4
implementation".

>
> > 2. The current implementation means that in mlx5-only network we will
> > still have upto (1 << 24) TIDs. I don't know if it is really important,
> > but worth to mention.
>
> Actually, the TIDs will be up to 2^56.  We're limited to 2^24 clients,
> each of which can send up to 2^32 messages.

It sounds like more than enough.

Thanks

>


signature.asc
Description: PGP signature


Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Matthew Wilcox
On Sun, Jun 10, 2018 at 09:30:28AM +0300, Leon Romanovsky wrote:
> 1. IBTA spec doesn't talk at all about the size of TransactionID, more
> on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
> says: "The contents of the TransactionID (TID) field are implementation-
> dependent. So let's don't call it mlx4 bug.

I was loosely paraphrasing the original bug report; suggested rewording
of the comments gratefully accepted.

> 2. The current implementation means that in mlx5-only network we will
> still have upto (1 << 24) TIDs. I don't know if it is really important,
> but worth to mention.

Actually, the TIDs will be up to 2^56.  We're limited to 2^24 clients,
each of which can send up to 2^32 messages.



Re: [PATCH 2/2] IB/mad: Use IDR for agent IDs

2018-06-10 Thread Leon Romanovsky
On Fri, Jun 08, 2018 at 10:42:18AM -0700, Matthew Wilcox wrote:
> From: Matthew Wilcox 
>
> Allocate agent IDs from a global IDR instead of an atomic variable.
> This eliminates the possibility of reusing an ID which is already in
> use after 4 billion registrations, and we can also limit the assigned
> ID to be less than 2^24, which fixes a bug in the mlx4 device.
>
> We look up the agent under protection of the RCU lock, which means we
> have to free the agent using kfree_rcu, and only increment the reference
> counter if it is not 0.
>
> Signed-off-by: Matthew Wilcox 
> ---
>  drivers/infiniband/core/mad.c  | 78 ++
>  drivers/infiniband/core/mad_priv.h |  7 +--
>  include/linux/idr.h|  9 
>  3 files changed, 59 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 68f4dda916c8..62384a3dd3ec 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -38,6 +38,7 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -58,8 +59,8 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in 
> number of work requests
>  module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
>  MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work 
> requests");
>
> +static DEFINE_IDR(ib_mad_clients);
>  static struct list_head ib_mad_port_list;
> -static atomic_t ib_mad_client_id = ATOMIC_INIT(0);
>
>  /* Port list lock */
>  static DEFINE_SPINLOCK(ib_mad_port_list_lock);
> @@ -377,13 +378,24 @@ struct ib_mad_agent *ib_register_mad_agent(struct 
> ib_device *device,
>   goto error4;
>   }
>
> - spin_lock_irq(_priv->reg_lock);
> - mad_agent_priv->agent.hi_tid = atomic_inc_return(_mad_client_id);
> + idr_preload(GFP_KERNEL);
> + idr_lock(_mad_clients);
> + ret2 = idr_alloc_cyclic(_mad_clients, mad_agent_priv, 0,
> + (1 << 24), GFP_ATOMIC);

Hi Matthew,

Thank you for looking on that, I have a couple of comments to the proposed 
patch.

1. IBTA spec doesn't talk at all about the size of TransactionID, more
on that in section "13.4.6.4 TRANSACTION ID USAGE", the specification
says: "The contents of the TransactionID (TID) field are implementation-
dependent. So let's don't call it mlx4 bug.

2. The current implementation means that in mlx5-only network we will
still have upto (1 << 24) TIDs. I don't know if it is really important,
but worth to mention.

Thanks


signature.asc
Description: PGP signature