Re: [PATCH net] sctp: frag_point sanity check

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 03:56:33PM -0500, Neil Horman wrote:
> On Tue, Dec 04, 2018 at 05:52:02PM -0200, Marcelo Ricardo Leitner wrote:
> > On Tue, Dec 04, 2018 at 02:39:46PM -0500, Neil Horman wrote:
> > > On Tue, Dec 04, 2018 at 08:27:41PM +0100, Jakub Audykowicz wrote:
> > > > If for some reason an association's fragmentation point is zero,
> > > > sctp_datamsg_from_user will endlessly try to divide a message
> > > > into zero-sized chunks. This eventually causes a kernel panic due to
> > > > running out of memory.
> > > > 
> > > > Although this situation is quite unlikely, it has occurred before as
> > > > reported. I propose to add this simple last-ditch sanity check due to
> > > > the severity of the potential consequences.
> > > > 
> > > > Signed-off-by: Jakub Audykowicz 
> > > > ---
> > > >  include/net/sctp/sctp.h | 5 +++++
> > > >  net/sctp/chunk.c        | 6 ++++++
> > > >  net/sctp/socket.c       | 3 +--
> > > >  3 files changed, 12 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > > > index ab9242e51d9e..2abbc15824af 100644
> > > > --- a/include/net/sctp/sctp.h
> > > > +++ b/include/net/sctp/sctp.h
> > > > @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
> > > > return false;
> > > >  }
> > > >  
> > > > +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> > > > +{
> > > > +   return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
> > > > +}
> > > > +
> > > >  #endif /* __net_sctp_h__ */
> > > > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > > > index ce8087846f05..d5b91bc8a377 100644
> > > > --- a/net/sctp/chunk.c
> > > > +++ b/net/sctp/chunk.c
> > > > @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
> > > >  * the packet
> > > >  */
> > > > max_data = asoc->frag_point;
> > > > +   if (unlikely(!max_data)) {
> > > > +   max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk),
> > > > +  sctp_datachk_len(&asoc->stream));
> > > > +   pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%d)",
> > > > +   __func__, asoc, max_data);
> > > > +   }
> > > >  
> > > > /* If the peer requested that we authenticate DATA chunks
> > > >  * we need to account for bundling of the AUTH chunks along with
> > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > index bf618d1b41fd..b8cebd5a87e5 100644
> > > > --- a/net/sctp/socket.c
> > > > +++ b/net/sctp/socket.c
> > > > @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
> > > > __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
> > > >  sizeof(struct sctp_data_chunk);
> > > >  
> > > > -   min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
> > > > -  datasize);
> > > > +   min_len = sctp_min_frag_point(sp, datasize);
> > > > max_len = SCTP_MAX_CHUNK_LEN - datasize;
> > > >  
> > > > if (val < min_len || val > max_len)
> > > > -- 
> > > > 2.17.1
> > > > 
> > > > 
> > > Why not just prevent the frag point from ever going below
> > > SCTP_DEFAULT_MINSEGMENT in the first place in 
> > > sctp_assoc_update_frag_point?
> > > Something like:
> > > 
> > > asoc->frag_point = (SCTP_TRUNC4(frag) < SCTP_DEFAULT_MINSEGMENT) ? \
> > > SCTP_DEFAULT_MINSEGMENT : SCTP_TRUNC4(frag);
> > > 
> > > Should do the trick I would think
> > > Neil
> > 
> > This is in the light of "sctp: update frag_point when
> > stream_interleave is set".
> > 
> > Because of
> > https://www.mail-archive.com/netdev@vger.kernel.org/msg256575.html
> > this wouldn't have helped: sctp_assoc_update_frag_point()
> > didn't get called. The issue is not that the calculation produced a
> > bad value, but that it wasn't performed at all.
> >  
> >   Marcelo
> > 
> Ah, thank you for the clarification.
> 
> Acked-by: Neil Horman 
> 

Cool!

Acked-by: Marcelo Ricardo Leitner 
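
A minimal standalone sketch (illustrative only, not the kernel code) of why
a zero fragmentation point never terminates: the loop below mirrors the
shape of sctp_datamsg_from_user(), allocating one chunk of at most max_data
bytes per iteration; with max_data == 0 it makes no progress, so chunk
allocations grow without bound until memory is exhausted.

    #include <stddef.h>
    #include <stdio.h>

    /* Counts how many chunks a message is split into. */
    static size_t count_chunks(size_t msg_len, size_t max_data)
    {
            size_t chunks = 0;

            while (msg_len > 0) {
                    size_t len = msg_len < max_data ? msg_len : max_data;

                    chunks++;        /* the kernel allocates a chunk here */
                    msg_len -= len;  /* len == 0 when max_data == 0: no progress */
                    if (len == 0)
                            break;   /* guard so this demo terminates */
            }
            return chunks;
    }

    int main(void)
    {
            printf("%zu\n", count_chunks(3000, 1000)); /* 3 */
            printf("%zu\n", count_chunks(3000, 0));    /* unbounded without the guard */
            return 0;
    }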


Re: [PATCH net] sctp: frag_point sanity check

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 02:39:46PM -0500, Neil Horman wrote:
> On Tue, Dec 04, 2018 at 08:27:41PM +0100, Jakub Audykowicz wrote:
> > If for some reason an association's fragmentation point is zero,
> > sctp_datamsg_from_user will endlessly try to divide a message
> > into zero-sized chunks. This eventually causes a kernel panic due to
> > running out of memory.
> > 
> > Although this situation is quite unlikely, it has occurred before as
> > reported. I propose to add this simple last-ditch sanity check due to
> > the severity of the potential consequences.
> > 
> > Signed-off-by: Jakub Audykowicz 
> > ---
> >  include/net/sctp/sctp.h | 5 +++++
> >  net/sctp/chunk.c        | 6 ++++++
> >  net/sctp/socket.c       | 3 +--
> >  3 files changed, 12 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > index ab9242e51d9e..2abbc15824af 100644
> > --- a/include/net/sctp/sctp.h
> > +++ b/include/net/sctp/sctp.h
> > @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
> > return false;
> >  }
> >  
> > +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> > +{
> > +   return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
> > +}
> > +
> >  #endif /* __net_sctp_h__ */
> > diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> > index ce8087846f05..d5b91bc8a377 100644
> > --- a/net/sctp/chunk.c
> > +++ b/net/sctp/chunk.c
> > @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
> >  * the packet
> >  */
> > max_data = asoc->frag_point;
> > +   if (unlikely(!max_data)) {
> > +   max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk),
> > +  sctp_datachk_len(&asoc->stream));
> > +   pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%d)",
> > +   __func__, asoc, max_data);
> > +   }
> >  
> > /* If the peer requested that we authenticate DATA chunks
> >  * we need to account for bundling of the AUTH chunks along with
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index bf618d1b41fd..b8cebd5a87e5 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
> > __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
> >  sizeof(struct sctp_data_chunk);
> >  
> > -   min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
> > -  datasize);
> > +   min_len = sctp_min_frag_point(sp, datasize);
> > max_len = SCTP_MAX_CHUNK_LEN - datasize;
> >  
> > if (val < min_len || val > max_len)
> > -- 
> > 2.17.1
> > 
> > 
> Why not just prevent the frag point from ever going below
> SCTP_DEFAULT_MINSEGMENT in the first place in sctp_assoc_update_frag_point?
> Something like:
> 
> asoc->frag_point = (SCTP_TRUNC4(frag) < SCTP_DEFAULT_MINSEGMENT) ? \
> SCTP_DEFAULT_MINSEGMENT : SCTP_TRUNC4(frag);
> 
> Should do the trick I would think
> Neil

This is in the light of "sctp: update frag_point when
stream_interleave is set".

Because of
https://www.mail-archive.com/netdev@vger.kernel.org/msg256575.html
this wouldn't have helped: sctp_assoc_update_frag_point()
didn't get called. The issue is not that the calculation produced a
bad value, but that it wasn't performed at all.
 
  Marcelo


Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 07:51:54PM +0100, Jakub Audykowicz wrote:
...
> Thanks, I've taken your remarks into account and ended up with this 
> simple solution:

LGTM! Thanks

> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index ab9242e51d9e..3487686f2cf5 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -620,4 +620,9 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
>   return false;
>  }
>  
> +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> +{
> +return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);

Not sure how final the patch is but be sure to run checkpatch.pl on
it before submitting it officially. It flagged some issues like the
above indentation using spaces, for example.

> +}
> +
>  #endif /* __net_sctp_h__ */
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index ce8087846f05..dc12c2ba487f 100644
> --- a/net/sctp/chunk.c
> +++ b/net/sctp/chunk.c
> @@ -191,6 +191,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>* the packet
>*/
>   max_data = asoc->frag_point;
> + if (unlikely(!max_data)) {
> + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum",
> + __func__, asoc);
> + max_data = sctp_min_frag_point(
> + sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream));
> + }
>  
>   /* If the peer requested that we authenticate DATA chunks
>* we need to account for bundling of the AUTH chunks along with
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index bf618d1b41fd..b8cebd5a87e5 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3324,8 +3324,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
>   __u16 datasize = asoc ? sctp_datachk_len(>stream) :
>sizeof(struct sctp_data_chunk);
>  
> - min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
> -datasize);
> + min_len = sctp_min_frag_point(sp, datasize);
>   max_len = SCTP_MAX_CHUNK_LEN - datasize;
>  
>   if (val < min_len || val > max_len)
> 
>  
> 


Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-12-04 Thread Marcelo Ricardo Leitner
On Tue, Dec 04, 2018 at 06:00:51PM +0100, Jakub Audykowicz wrote:
...
> OK, let's forget about that "if" :)
> Coming back to the sanity check, I came up with something like below,
> based on the code from sctp_setsockopt_maxseg, like you mentioned.
> I may have overcomplicated things since I didn't know how to accomplish
> the same thing without passing sctp_sock* to sctp_datamsg_from_user.

Yep. More below.

> I wanted to avoid calling sctp_min_frag_point unless absolutely
> necessary, so I just check the frag_point against the zero that is
> causing the eventual kernel panic.

Makes sense.

>  
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index ab9242e51d9e..7e67c0257b3f 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -620,4 +620,15 @@ static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
>   return false;
>  }
>  
> +static inline __u16 sctp_data_chunk_size(struct sctp_association *asoc)
> +{
> +return asoc ? sctp_datachk_len(&asoc->stream) :
> +  sizeof(struct sctp_data_chunk);

I don't think we need another layer on top of data chunk sizes here.
We don't need this asoc check in the sendmsg callpath because it
cannot be NULL by then. That said, I think we should avoid this helper,
for now at least.

One other way would be to include the check in sctp_datachk_len(),
but we currently have 9 calls to it and only 1 requires the check.

As asoc is initialized and considering the fix we just had in this
area, stream->si will also be initialized so you should be good to
just call sctp_datachk_len() directly.

> +}
> +
> +static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
> +{
> +return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize);
> +}

This is a good helper to have. Makes it clearer.

> +
>  #endif /* __net_sctp_h__ */
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index a11f93790476..d09b5de73c92 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -543,7 +543,8 @@ struct sctp_datamsg {
>  
>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
>   struct sctp_sndrcvinfo *,
> - struct iov_iter *);
> + struct iov_iter *,
> + struct sctp_sock *);
>  void sctp_datamsg_free(struct sctp_datamsg *);
>  void sctp_datamsg_put(struct sctp_datamsg *);
>  void sctp_chunk_fail(struct sctp_chunk *, int error);
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index ce8087846f05..753c2c123767 100644
> --- a/net/sctp/chunk.c
> +++ b/net/sctp/chunk.c
> @@ -164,7 +164,8 @@ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk)
>   */
>  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>   struct sctp_sndrcvinfo *sinfo,
> - struct iov_iter *from)
> + struct iov_iter *from,
> + struct sctp_sock *sp)
>  {
>   size_t len, first_len, max_data, remaining;
>   size_t msg_len = iov_iter_count(from);
> @@ -173,6 +174,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>   struct sctp_chunk *chunk;
>   struct sctp_datamsg *msg;
>   int err;
> + __u32 min_frag_point;

Reverse christmas tree... swap the last two lines:
+   __u32 min_frag_point;
int err;
But I guess we don't need this var anymore:

>  
>   msg = sctp_datamsg_new(GFP_KERNEL);
>   if (!msg)
> @@ -190,6 +192,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
>   /* This is the biggest possible DATA chunk that can fit into
>* the packet
>*/
> + if (unlikely(asoc->frag_point == 0)) {
 !asoc->frag_point   instead

You can get to sctp_sock here with: sctp_sk(asoc->base.sk)
struct sctp_sock *sp = sctp_sk(asoc->base.sk);

> + min_frag_point = sctp_min_frag_point(sp, sctp_data_chunk_size(asoc));
> + pr_warn_ratelimited("%s: asoc:%p frag_point is too low (%d < %d), using default minimum",
> + __func__, asoc, asoc->frag_point, min_frag_point);
> + asoc->frag_point = min_frag_point;

No no. Let's not fix things up here. If we do this assignment, we may
make the disparity even worse.
With that, we can work only on max_data and avoid the need for min_frag_point.
max_data = asoc->frag_point;
if (unlikely(!max_data)) {
...
max_data = ...
}

> + }
>   max_data = asoc->frag_point;
>  
>   /* If the the peer requested that we authenticate DATA chunks
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 

Re: [PATCH net] sctp: kfree_rcu asoc

2018-11-30 Thread Marcelo Ricardo Leitner
On Sat, Dec 01, 2018 at 01:36:59AM +0800, Xin Long wrote:
> In sctp_hash_transport/sctp_epaddr_lookup_transport, it dereferences
> a transport's asoc under rcu_read_lock while the asoc is freed without
> waiting for a grace period, which leads to a use-after-free panic.
> 
> This patch fixes it by calling kfree_rcu to make asoc be freed after
> a grace period.
> 
> Note that only the asoc's memory is delayed to free in the patch, it
> won't cause sk to linger longer.
> 
> Thanks to Neil and Marcelo for making this clear.
> 
> Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
> rhashtable")
> Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> transport")
> Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> Suggested-by: Neil Horman 
> Signed-off-by: Xin Long 

Phew :-)

Acked-by: Marcelo Ricardo Leitner 

> ---
>  include/net/sctp/structs.h | 2 ++
>  net/sctp/associola.c   | 2 +-
>  2 files changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index a11f937..feada35 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -2075,6 +2075,8 @@ struct sctp_association {
>  
>   __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
>   __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
> +
> + struct rcu_head rcu;
>  };
>  
>  
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 6a28b96..3702f48 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -434,7 +434,7 @@ static void sctp_association_destroy(struct sctp_association *asoc)
>  
>   WARN_ON(atomic_read(&asoc->rmem_alloc));
>  
> - kfree(asoc);
> + kfree_rcu(asoc, rcu);
>   SCTP_DBG_OBJCNT_DEC(assoc);
>  }
>  
> -- 
> 2.1.0
> 
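
For readers unfamiliar with the pattern: a minimal sketch (illustrative
names, not the SCTP code) of how embedding an rcu_head turns a synchronous
kfree() into one that waits out concurrent RCU readers:

    #include <linux/slab.h>
    #include <linux/rcupdate.h>

    struct foo {
            int data;
            struct rcu_head rcu;    /* storage for the deferred-free callback */
    };

    /* Readers dereference under rcu_read_lock()... */
    static int reader(struct foo __rcu **slot)
    {
            struct foo *p;
            int val = -1;

            rcu_read_lock();
            p = rcu_dereference(*slot);
            if (p)
                    val = p->data;  /* safe: the free is deferred past here */
            rcu_read_unlock();
            return val;
    }

    /* ...so the writer must not kfree() right after unpublishing. */
    static void writer_free(struct foo __rcu **slot, struct foo *p)
    {
            rcu_assign_pointer(*slot, NULL);
            kfree_rcu(p, rcu);      /* freed only after a grace period */
    }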


Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-30 Thread Marcelo Ricardo Leitner
On Fri, Nov 30, 2018 at 09:05:06AM -0500, Neil Horman wrote:
> On Fri, Nov 30, 2018 at 11:33:15AM -0200, Marcelo Ricardo Leitner wrote:
> > On Fri, Nov 30, 2018 at 07:32:36AM -0500, Neil Horman wrote:
> > > On Fri, Nov 30, 2018 at 02:04:16PM +0900, Xin Long wrote:
> > > > On Fri, Nov 30, 2018 at 5:52 AM Neil Horman  
> > > > wrote:
> > > > >
> > > > > On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> > > > > > Without holding the transport when dereferencing its asoc, a
> > > > > > use-after-free panic can be triggered in sctp_epaddr_lookup_transport. Note
> > > > > > that a sock lock can't protect these transports that belong to
> > > > > > other socks.
> > > > > >
> > > > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport
> > > > > > before accessing its asoc in sctp_transport_get_next") is
> > > > > > needed to hold the transport before accessing its asoc in
> > > > > > sctp_epaddr_lookup_transport.
> > > > > >
> > > > > > Note that this extra atomic operation is on the datapath,
> > > > > > but as rhlist keeps the lists to a small size, it won't
> > > > > > cause a noticeable performance hit.
> > > > > >
> > > > > > v1->v2:
> > > > > >   - improve the changelog.
> > > > > >
> > > > > > Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp 
> > > > > > transport rhashtable")
> > > > > > Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> > > > > > Signed-off-by: Xin Long 
> > > > > > ---
> > > > > >  net/sctp/input.c | 10 --
> > > > > >  1 file changed, 8 insertions(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > > > index 5c36a99..ce7351c 100644
> > > > > > --- a/net/sctp/input.c
> > > > > > +++ b/net/sctp/input.c
> > > > > > @@ -967,9 +967,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
> > > > > >   list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > > > >  sctp_hash_params);
> > > > > >
> > > > > > - rhl_for_each_entry_rcu(t, tmp, list, node)
> > > > > > - if (ep == t->asoc->ep)
> > > > > > + rhl_for_each_entry_rcu(t, tmp, list, node) {
> > > > > > + if (!sctp_transport_hold(t))
> > > > > > + continue;
> > > > > > + if (ep == t->asoc->ep) {
> > > > > > + sctp_transport_put(t);
> > > > > >   return t;
> > > > > > + }
> > > > > > + sctp_transport_put(t);
> > > > > > + }
> > > > > >
> > > > > >   return NULL;
> > > > > >  }
> > > > >
> > > > > Wait a second, what if we just added an rcu_head to the association 
> > > > > structure
> > > > > and changed the kfree call in sctp_association_destroy to a kfree_rcu 
> > > > > call
> > > > > instead?  That would force the actual freeing of the association to 
> > > > > pass through
> > > > > a grace period, during which any in flight list traversal in
> > > > > sctp_epaddr_lookup_transport could complete safely.  It's another two 
> > > > > pointers
> > > > We discussed this in last thread:
> > > > https://www.spinics.net/lists/netdev/msg535191.html
> > > > 
> > > > It will cause closed sk to linger longer.
> > > > 
> > > Yes, but we never really got resolution on that topic.  I don't see that a
> > 
> > Fair point. We should have brought back the discussion online.
> > 
> > > socket lingering for an extra grace period is that big a deal.  I also 
> > > don't see
> > 
> > What we really don't want is to bring back
> > 8c98653f0553 ("sctp: sctp_close: fix release of bindings for deferred 
> > call_rcu's").
> > (more below). That's where our fear lies.
> > 
> > > how sending the actual kfree through a grace period is g

Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-30 Thread Marcelo Ricardo Leitner
On Fri, Nov 30, 2018 at 07:32:36AM -0500, Neil Horman wrote:
> On Fri, Nov 30, 2018 at 02:04:16PM +0900, Xin Long wrote:
> > On Fri, Nov 30, 2018 at 5:52 AM Neil Horman  wrote:
> > >
> > > On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> > > > Without holding the transport when dereferencing its asoc, a
> > > > use-after-free panic can be triggered in sctp_epaddr_lookup_transport. Note
> > > > that a sock lock can't protect these transports that belong to
> > > > other socks.
> > > >
> > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport
> > > > before accessing its asoc in sctp_transport_get_next") is
> > > > needed to hold the transport before accessing its asoc in
> > > > sctp_epaddr_lookup_transport.
> > > >
> > > > Note that this extra atomic operation is on the datapath,
> > > > but as rhlist keeps the lists to a small size, it won't
> > > > cause a noticeable performance hit.
> > > >
> > > > v1->v2:
> > > >   - improve the changelog.
> > > >
> > > > Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
> > > > rhashtable")
> > > > Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/input.c | 10 --
> > > >  1 file changed, 8 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > index 5c36a99..ce7351c 100644
> > > > --- a/net/sctp/input.c
> > > > +++ b/net/sctp/input.c
> > > > @@ -967,9 +967,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
> > > >   list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > >  sctp_hash_params);
> > > >
> > > > - rhl_for_each_entry_rcu(t, tmp, list, node)
> > > > - if (ep == t->asoc->ep)
> > > > + rhl_for_each_entry_rcu(t, tmp, list, node) {
> > > > + if (!sctp_transport_hold(t))
> > > > + continue;
> > > > + if (ep == t->asoc->ep) {
> > > > + sctp_transport_put(t);
> > > >   return t;
> > > > + }
> > > > + sctp_transport_put(t);
> > > > + }
> > > >
> > > >   return NULL;
> > > >  }
> > >
> > > Wait a second, what if we just added an rcu_head to the association 
> > > structure
> > > and changed the kfree call in sctp_association_destroy to a kfree_rcu call
> > > instead?  That would force the actual freeing of the association to pass 
> > > through
> > > a grace period, during which any in flight list traversal in
> > > sctp_epaddr_lookup_transport could complete safely.  It's another two 
> > > pointers
> > We discussed this in last thread:
> > https://www.spinics.net/lists/netdev/msg535191.html
> > 
> > It will cause closed sk to linger longer.
> > 
> Yes, but we never really got resolution on that topic.  I don't see that a

Fair point. We should have brought back the discussion online.

> socket lingering for an extra grace period is that big a deal.  I also don't 
> see

What we really don't want is to bring back
8c98653f0553 ("sctp: sctp_close: fix release of bindings for deferred 
call_rcu's").
(more below). That's where our fear lies.

> how sending the actual kfree through a grace period is going to cause the 
> socket
> to linger.  If you look at sctp_association_destroy, we call sock_put prior to
> calling kfree at the end of the function.  All I'm looking for here is for the
> memory free to wait until any list traversal in sctp_epaddr_lookup_transport 
> is
> done, which is what you are trying to do with your atomics.
> 
> As for your comment regarding sctp_transport_destroy_rcu, yes, that forces a
> grace period when a transport is being destroyed, which will protect against
> list corruption of the transport list here.  That's good, but isn't what you
> are trying to fix.  Your initial claim was that the asoc pointer for a given
> transport was no longer valid, because it was getting freed while the
> transport was still on the list.  That can clearly happen, as we release all
> the transports in sctp_association_free prior to dropping what ostensibly is
> the last reference to their parent association at the end of that function.
> But it's only the transports that pass through a grace period before getting
> freed; the association is freed synchronously, ignoring any grace period, and
> that's what we need to change.
> 
> The more I look at it the more I'm convinced. What you're doing here is
> definitely overkill.  You need to add an rcu_head to the association and
> just do the kfree of its memory after a grace period.  It's actually only
> a single grace period as well.  If someone is traversing the transport
> list, both the transport and association rcu callbacks will get run once
> the rcu_read_lock is released.

Ok, delaying *just* the kfree works too. It wouldn't bring back the
issue I mentioned above.

We have basically 3 options then:

1) your proposal above
   extends sctp_association by 

Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-30 Thread Marcelo Ricardo Leitner
On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> Without holding the transport when dereferencing its asoc, a
> use-after-free panic can be triggered in sctp_epaddr_lookup_transport. Note
> that a sock lock can't protect these transports that belong to
> other socks.
> 
> A similar fix as Commit bab1be79a516 ("sctp: hold transport
> before accessing its asoc in sctp_transport_get_next") is
> needed to hold the transport before accessing its asoc in
> sctp_epaddr_lookup_transport.
> 
> Note that this extra atomic operation is on the datapath,
> but as rhlist keeps the lists to a small size, it won't
> cause a noticeable performance hit.
> 
> v1->v2:
>   - improve the changelog.
> 
> Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
> rhashtable")
> Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/input.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index 5c36a99..ce7351c 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -967,9 +967,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
>   list = rhltable_lookup(&sctp_transport_hashtable, &arg,
>  sctp_hash_params);
>  
> - rhl_for_each_entry_rcu(t, tmp, list, node)
> - if (ep == t->asoc->ep)
> + rhl_for_each_entry_rcu(t, tmp, list, node) {
> + if (!sctp_transport_hold(t))
> + continue;
> + if (ep == t->asoc->ep) {
> + sctp_transport_put(t);
>   return t;
> + }
> + sctp_transport_put(t);
> + }
>  
>   return NULL;
>  }
> -- 
> 2.1.0
> 
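
The general shape of the fix (illustrative names, not the SCTP code):
rcu_read_lock() guarantees the walked object itself stays around, but
nothing pins the objects it points to; taking a reference first closes
that window. sctp_transport_hold() is a refcount_inc_not_zero() on the
transport's refcount.

    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct owner;

    struct node {
            refcount_t refcnt;
            struct owner *owner;    /* torn down when refcnt hits zero */
    };

    static bool node_hold(struct node *n)
    {
            /* Fails once teardown has started: n->owner is then unsafe. */
            return refcount_inc_not_zero(&n->refcnt);
    }

    static void node_put(struct node *n)
    {
            if (refcount_dec_and_test(&n->refcnt))
                    kfree(n);       /* real code defers this past RCU readers */
    }

    /* Lookup pattern:
     *
     *      rcu_read_lock();
     *      list_for_each_entry_rcu(n, head, link) {
     *              if (!node_hold(n))
     *                      continue;        // dying; skip it
     *              ... n->owner is now safe to dereference ...
     *              node_put(n);             // drop the temporary hold
     *      }
     *      rcu_read_unlock();
     */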


Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-30 Thread Marcelo Ricardo Leitner
On Thu, Nov 29, 2018 at 02:49:28PM +0800, Xin Long wrote:
> In sctp_hash_transport, it dereferences a transport's asoc only under
> rcu_read_lock. Without holding the transport, its asoc could already
> be freed, which leads to a use-after-free panic.
> 
> A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> accessing its asoc in sctp_transport_get_next") is needed to hold
> the transport before accessing its asoc in sctp_hash_transport.
> 
> Note that as rhlist keeps the lists to a small size, this extra
> atomic operation won't cause a noticeable latency on inserting
> a transport. Besides, it's not in the datapath.
> 
> v1->v2:
>   - improve the changelog.
> 
> Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> transport")
> Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/input.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index ce7351c..c2c0816 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
>   list = rhltable_lookup(&sctp_transport_hashtable, &arg,
>  sctp_hash_params);
>  
> - rhl_for_each_entry_rcu(transport, tmp, list, node)
> + rhl_for_each_entry_rcu(transport, tmp, list, node) {
> + if (!sctp_transport_hold(transport))
> + continue;
>   if (transport->asoc->ep == t->asoc->ep) {
> + sctp_transport_put(transport);
>   rcu_read_unlock();
>   return -EEXIST;
>   }
> + sctp_transport_put(transport);
> + }
>   rcu_read_unlock();
>  
>   err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> -- 
> 2.1.0
> 


Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-28 Thread Marcelo Ricardo Leitner
On Wed, Nov 28, 2018 at 06:36:45PM +0900, Xin Long wrote:
> On Thu, Nov 22, 2018 at 2:53 AM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Wed, Nov 21, 2018 at 03:47:33PM +0900, Xin Long wrote:
> > > On Wed, Nov 21, 2018 at 9:46 AM Marcelo Ricardo Leitner
> > >  wrote:
> > > >
> > > > On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> > > > > On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > > > > > In sctp_hash_transport, it dereferences a transport's asoc only 
> > > > > > under
> > > > > > rcu_read_lock. Without holding the transport, its asoc could be 
> > > > > > freed
> > > > > > already, which leads to a use-after-free panic.
> > > > > >
> > > > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > > > > > accessing its asoc in sctp_transport_get_next") is needed to hold
> > > > > > the transport before accessing its asoc in sctp_hash_transport.
> > > > > >
> > > > > > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a 
> > > > > > new transport")
> > > > > > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > > > > > Signed-off-by: Xin Long 
> > > > > > ---
> > > > > >  net/sctp/input.c | 7 ++-
> > > > > >  1 file changed, 6 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > > > index 5c36a99..69584e9 100644
> > > > > > --- a/net/sctp/input.c
> > > > > > +++ b/net/sctp/input.c
> > > > > > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
> > > > > > list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > > > >sctp_hash_params);
> > > > > >
> > > > > > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > > > > > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > > > > > +   if (!sctp_transport_hold(transport))
> > > > > > +   continue;
> > > > > > if (transport->asoc->ep == t->asoc->ep) {
> > > > > > +   sctp_transport_put(transport);
> > > > > > rcu_read_unlock();
> > > > > > return -EEXIST;
> > > > > > }
> > > > > > +   sctp_transport_put(transport);
> > > > > > +   }
> > > > > > rcu_read_unlock();
> > > > > >
> > > > > > err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> > > > > > --
> > > > > > 2.1.0
> > > > > >
> > > > > >
> > > > >
> > > > > something doesn't feel at all right about this.  If we are inserting 
> > > > > a transport
> > > > > to an association, it would seem to me that we should have at least 
> > > > > one user of
> > > > > the association (i.e. non-zero refcount).  As such it seems something 
> > > > > is wrong
> > > > > with the association refcount here.  At the very least, if there is a 
> > > > > case where
> > > > > an association is being removed while a transport is being added, the 
> > > > > better
> > > > > solution would be to ensure that sctp_association_destroy goes 
> > > > > through a
> > > > > quiescent point prior to unhashing transports from the list, to 
> > > > > ensure that
> > > > > there is no conflict with the add operation above.
> > > Changing to do call_rcu(&asoc->rcu, sctp_association_destroy) can
> > > work for this case.
> > > But it means asoc and socket (taking the port) will have to wait for a
> > > grace period, which is not expected. We seemed to have talked about
> > > this before, Marcelo?
> >
> > Yes. This would cause it to linger longer and cause bind conflicts
> > meanwhile.
> > Note that we already have sctp_transport_destroy_rcu(), so this would
> > be a 2nd grace period.
> >
> > >
> > > >
> > > > Consider that the rhl_for_each_entry_rcu() is traversi

Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-11-28 Thread Marcelo Ricardo Leitner
On Wed, Nov 28, 2018 at 12:08:38AM -0200, Marcelo Ricardo Leitner wrote:
> On Tue, Nov 27, 2018 at 11:18:02PM +0100, Jakub Audykowicz wrote:
> > On 2018-11-19 08:20, Xin Long wrote:
> > 
> > > On Mon, Nov 19, 2018 at 5:49 AM Jakub Audykowicz
> > >  wrote:
> > >> Calling send on a connected SCTP socket results in a kernel panic if
> > >> spp_pathmtu was configured manually before an association is established
> > >> and it was not reconfigured to another value once the association is
> > >> established.
> > >>
> > >> Steps to reproduce:
> > >> 1. Set up a listening SCTP server socket.
> > >> 2. Set up an SCTP client socket.
> > >> 3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
> > >> spp_pathmtu set to a legal value (e.g. 1000) and
> > >> SPP_PMTUD_DISABLE set in spp_flags.
> > >> 4. Connect client to server.
> > >> 5. Send message from client to server.
> > >>
> > >> At this point oom-killer is invoked, which will eventually lead to:
> > >> [5.197262] Out of memory and no killable processes...
> > >> [5.198107] Kernel panic - not syncing: System is deadlocked on memory
> > >>
> > >> Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> > >> introduces sctp_assoc_update_frag_point, but this function is not called
> > >> in this case, causing frag_point to be zero:
> > >>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
> > >>  {
> > >> -   if (asoc->pathmtu != pmtu)
> > >> +   if (asoc->pathmtu != pmtu) {
> > >> asoc->pathmtu = pmtu;
> > >> +   sctp_assoc_update_frag_point(asoc);
> > >> +   }
> > >>
> > >> In this scenario, on association establishment, asoc->pathmtu is already
> > >> 1000 and pmtu will be as well. Before this commit the frag_point was 
> > >> being
> > >> correctly set in the scenario described. Moving the call outside the if
> > >> block fixes the issue.
> > >>
> > >> I will be providing a separate patch to lksctp-tools with a simple test
> > >> reproducing this problem ("func_tests: frag_point should never be zero").
> > >>
> > >> I have also taken the liberty to introduce a sanity check in chunk.c to
> > >> set the frag_point to a positive value in order to avoid chunking
> > >> endlessly (which is the reason for the eventual panic).
> > >>
> > >> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> > >> Signed-off-by: Jakub Audykowicz 
> > >> ---
> > >>  include/net/sctp/constants.h |  3 +++
> > >>  net/sctp/associola.c | 13 +++--
> > >>  net/sctp/chunk.c |  6 ++
> > >>  3 files changed, 16 insertions(+), 6 deletions(-)
> > >>
> > >> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> > >> index 8dadc74c22e7..90316fab6f04 100644
> > >> --- a/include/net/sctp/constants.h
> > >> +++ b/include/net/sctp/constants.h
> > >> @@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
> > >>  */
> > >>  #define SCTP_DEFAULT_MINSEGMENT 512    /* MTU size ... if no mtu disc */
> > >>
> > >> +/* An association's fragmentation point should never be non-positive */
> > >> +#define SCTP_FRAG_POINT_MIN 1
> > >> +
> > >>  #define SCTP_SECRET_SIZE 32    /* Number of octets in a 256 bits. */
> > >>
> > >>  #define SCTP_SIGNATURE_SIZE 20 /* size of a SHA-1 signature */
> > >> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> > >> index 6a28b96e779e..44d71a1af62e 100644
> > >> --- a/net/sctp/associola.c
> > >> +++ b/net/sctp/associola.c
> > >> @@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct sctp_association *asoc)
> > >>
> > >>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
> > >>  {
> > >> -   if (asoc->pathmtu != pmtu) {
> > >> -   asoc->pathmtu = pmtu;
> > >> -   sctp_assoc_update_frag_point(asoc);
> > >> -   }
> > 

Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-11-27 Thread Marcelo Ricardo Leitner
On Tue, Nov 27, 2018 at 11:18:02PM +0100, Jakub Audykowicz wrote:
> On 2018-11-19 08:20, Xin Long wrote:
> 
> > On Mon, Nov 19, 2018 at 5:49 AM Jakub Audykowicz
> >  wrote:
> >> Calling send on a connected SCTP socket results in a kernel panic if
> >> spp_pathmtu was configured manually before an association is established
> >> and it was not reconfigured to another value once the association is
> >> established.
> >>
> >> Steps to reproduce:
> >> 1. Set up a listening SCTP server socket.
> >> 2. Set up an SCTP client socket.
> >> 3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
> >> spp_pathmtu set to a legal value (e.g. 1000) and
> >> SPP_PMTUD_DISABLE set in spp_flags.
> >> 4. Connect client to server.
> >> 5. Send message from client to server.
> >>
> >> At this point oom-killer is invoked, which will eventually lead to:
> >> [5.197262] Out of memory and no killable processes...
> >> [5.198107] Kernel panic - not syncing: System is deadlocked on memory
> >>
> >> Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> >> introduces sctp_assoc_update_frag_point, but this function is not called
> >> in this case, causing frag_point to be zero:
> >>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
> >>  {
> >> -   if (asoc->pathmtu != pmtu)
> >> +   if (asoc->pathmtu != pmtu) {
> >> asoc->pathmtu = pmtu;
> >> +   sctp_assoc_update_frag_point(asoc);
> >> +   }
> >>
> >> In this scenario, on association establishment, asoc->pathmtu is already
> >> 1000 and pmtu will be as well. Before this commit the frag_point was being
> >> correctly set in the scenario described. Moving the call outside the if
> >> block fixes the issue.
> >>
> >> I will be providing a separate patch to lksctp-tools with a simple test
> >> reproducing this problem ("func_tests: frag_point should never be zero").
> >>
> >> I have also taken the liberty to introduce a sanity check in chunk.c to
> >> set the frag_point to a positive value in order to avoid chunking
> >> endlessly (which is the reason for the eventual panic).
> >>
> >> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> >> Signed-off-by: Jakub Audykowicz 
> >> ---
> >>  include/net/sctp/constants.h |  3 +++
> >>  net/sctp/associola.c | 13 +++--
> >>  net/sctp/chunk.c |  6 ++
> >>  3 files changed, 16 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> >> index 8dadc74c22e7..90316fab6f04 100644
> >> --- a/include/net/sctp/constants.h
> >> +++ b/include/net/sctp/constants.h
> >> @@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
> >>  */
> >>  #define SCTP_DEFAULT_MINSEGMENT 512    /* MTU size ... if no mtu disc */
> >>
> >> +/* An association's fragmentation point should never be non-positive */
> >> +#define SCTP_FRAG_POINT_MIN 1
> >> +
> >>  #define SCTP_SECRET_SIZE 32    /* Number of octets in a 256 bits. */
> >>
> >>  #define SCTP_SIGNATURE_SIZE 20 /* size of a SHA-1 signature */
> >> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> >> index 6a28b96e779e..44d71a1af62e 100644
> >> --- a/net/sctp/associola.c
> >> +++ b/net/sctp/associola.c
> >> @@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct sctp_association *asoc)
> >>
> >>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
> >>  {
> >> -   if (asoc->pathmtu != pmtu) {
> >> -   asoc->pathmtu = pmtu;
> >> -   sctp_assoc_update_frag_point(asoc);
> >> -   }
> >> +   pr_debug("%s: before asoc:%p, pmtu:%d, frag_point:%d\n",
> >> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
> >> +
> >> +   asoc->pathmtu = pmtu;
> >> +   sctp_assoc_update_frag_point(asoc);
> >>
> >> -   pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
> >> -asoc->pathmtu, asoc->frag_point);
> >> +   pr_debug("%s: after asoc:%p, pmtu:%d, frag_point:%d\n",
> >> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
> >>  }
> > The idea was whenever asoc->pathmtu changes,  frag_point should
> > be updated, but we missed one place in sctp_association_init().
> >
> > Another issue is that after the 4-way handshake, the client's asoc->intl_enable
> > may be changed from 0 to 1, which means the frag_point should
> > also be updated, since [1]:
> >
> > void sctp_assoc_update_frag_point(struct sctp_association *asoc)
> > {
> > int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
> > sctp_datachk_len(&asoc->stream)); <--- [1]
> >
> > So one fix for both issues is:
> >
> > diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
> > index 0a78cdf..19d596d 100644
> > --- a/net/sctp/stream_interleave.c
> > +++ b/net/sctp/stream_interleave.c
> > @@ -1327,4 +1327,5 @@ 

Re: [PATCHv2 net] sctp: update frag_point when stream_interleave is set

2018-11-27 Thread Marcelo Ricardo Leitner
On Tue, Nov 27, 2018 at 07:11:50PM +0800, Xin Long wrote:
> sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
> changes, but we missed one place in sctp_association_init(). It would
> cause frag_point is zero when sending data.
> 
> As Jakub's reproducer shows, if sp->pathmtu is set by socket option, the
> new asoc->pathmtu inherits it in sctp_association_init(). Later when
> transports are added and their pmtu >= asoc->pathmtu, it will never
> call sctp_assoc_update_frag_point() to set frag_point.
> 
> This patch is to fix it by updating frag_point after asoc->pathmtu is
> set as sp->pathmtu in sctp_association_init(). Note that it moved them
> after sctp_stream_init(), as stream->si needs to be set first.
> 
> Frag_point's calculation is also related to the datachunk type, so it
> needs to update frag_point when stream->si may be changed in
> sctp_process_init().
> 
> v1->v2:
>   - call sctp_assoc_update_frag_point() separately in sctp_process_init
> and sctp_association_init, per Marcelo's suggestion.
> 
> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> Reported-by: Jakub Audykowicz 
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/associola.c | 7 ---
>  net/sctp/sm_make_chunk.c | 3 +++
>  2 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 6a28b96..dd77ec3 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init(
>   asoc->flowlabel = sp->flowlabel;
>   asoc->dscp = sp->dscp;
>  
> - /* Initialize default path MTU. */
> - asoc->pathmtu = sp->pathmtu;
> -
>   /* Set association default SACK delay */
>   asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
>   asoc->sackfreq = sp->sackfreq;
> @@ -252,6 +249,10 @@ static struct sctp_association *sctp_association_init(
>0, gfp))
>   goto fail_init;
>  
> + /* Initialize default path MTU. */
> + asoc->pathmtu = sp->pathmtu;
> + sctp_assoc_update_frag_point(asoc);
> +
>   /* Assume that peer would support both address types unless we are
>* told otherwise.
>*/
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 4a4fd19..f4ac6c5 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
>asoc->c.sinit_max_instreams, gfp))
>   goto clean_up;
>  
> + /* Update frag_point when stream_interleave may get changed. */
> + sctp_assoc_update_frag_point(asoc);
> +
>   if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
>   goto clean_up;
>  
> -- 
> 2.1.0
> 
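
Worth spelling out the size dependency referenced above (values from the
SCTP chunk definitions, stated here as a hedged aside): a plain DATA chunk
header is 16 bytes while an I-DATA chunk header is 20 bytes, so enabling
stream interleaving shrinks the usable payload per chunk -- and therefore
frag_point -- by 4 bytes:

    frag_point(DATA)   = payload(pmtu) - 16
    frag_point(I-DATA) = payload(pmtu) - 20

which is why the recalculation must run after stream->si is decided.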


Re: [PATCH net-next,v3 00/12] add flow_rule infrastructure

2018-11-26 Thread Marcelo Ricardo Leitner
On Mon, Nov 26, 2018 at 08:33:36PM +0100, Pablo Neira Ayuso wrote:
> Hi Marcelo,

Hello!

> 
> On Thu, Nov 22, 2018 at 07:08:32PM -0200, Marcelo Ricardo Leitner wrote:
> > On Thu, Nov 22, 2018 at 02:22:20PM -0200, Marcelo Ricardo Leitner wrote:
> > > On Wed, Nov 21, 2018 at 03:51:20AM +0100, Pablo Neira Ayuso wrote:
> > > > Hi,
> > > > 
> > > > This patchset is the third iteration [1] [2] [3] to introduce a kernel
> > > > intermediate representation (IR) to express ACL hardware offloads.
> > > 
> > > On v2 cover letter you had:
> > > 
> > > """
> > > However, cost of this layer is very small, adding 1 million rules via
> > > tc -batch, perf shows:
> > > 
> > >  0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action
> > > """
> > > 
> > > The above doesn't include time spent on children calls and I'm worried
> > > about the new allocation done by flow_rule_alloc(), as it can impact
> > > rule insertion rate. I'll run some tests here and report back.
> > 
> > I'm seeing +60ms on 1.75s (~3.4%) to add 40k flower rules on ingress
> > with skip_hw and tc in batch mode, with flows like:
> > 
> > filter add dev p6p2 parent ffff: protocol ip prio 1 flower skip_hw
> > src_mac ec:13:db:00:00:00 dst_mac ec:14:c2:00:00:00 src_ip
> > 56.0.0.0 dst_ip 55.0.0.0 action drop
> > 
> > Only 20ms out of those 60ms were consumed within fl_change() calls
> > (considering children calls), though.
> > 
> > Do you see something similar?  I used current net-next (d59da3fbfe3f)
> > and with this patchset applied.
> 
> I see lots of send() and recv() in tc -batch via strace, using this
> example rule, repeating it N times:
> 
> filter add dev eth0 parent ffff: protocol ip pref 1 flower dst_mac 
> f4:52:14:10:df:92 action mirred egress redirect dev eth1
> 
> This is taking ~8 seconds for 40k rules from my old laptop [*], this
> is already not too fast (without my patchset).

On an E5-2643 v3 @ 3.40GHz I see a total of 1.17s with an old iproute
(4.11) (more below).

> 
> I remember we discussed adding support for real batching for tc
> - we can probably do this transparently by assuming that if the
> skbuff length mismatches the nlmsghdr->len field, then we enter batch
> mode from the kernel. This would require updating iproute2 to use
> libmnl batching routines, or code that follows a similar approach
> otherwise.

Yes, I believe you're referring to

commit 485d0c6001c4aa134b99c86913d6a7089b7b2ab0
Author: Chris Mi 
Date:   Fri Jan 12 14:13:16 2018 +0900

tc: Add batchsize feature for filter and actions

Which is present in 4.16. It does transparent batching on app side.

With tc from today's tip, I get 1.05s for 40k rules, both with this
patchset applied.

> 
> [*] 0.5 seconds in nft (similar ruleset), this is using netlink batching.

Nice.

Cheers,
  Marcelo
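
For reference, the batching described above is driven entirely from the
user's side by feeding tc a batch file (device, MAC and rule count below
are illustrative):

    # generate many filter commands into one file
    for i in $(seq 1 40000); do
            echo "filter add dev eth0 parent ffff: protocol ip pref $i flower dst_mac f4:52:14:10:df:92 action mirred egress redirect dev eth1"
    done > flows.batch

    # iproute2 >= 4.16 transparently coalesces these into batched
    # netlink requests
    tc -batch flows.batch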


Re: [PATCH net] sctp: update frag_point when stream_interleave is set

2018-11-26 Thread Marcelo Ricardo Leitner
On Mon, Nov 26, 2018 at 10:16:50PM +0900, Xin Long wrote:
> On Mon, Nov 26, 2018 at 9:29 PM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Mon, Nov 26, 2018 at 05:02:11PM +0800, Xin Long wrote:
> > > sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
> > > changes, but we missed one place in sctp_association_init(). It would
> > > cause frag_point is zero when sending data.
> > >
> > > As Jakub's reproducer shows, if sp->pathmtu is set by socket option, the
> > > new asoc->pathmtu inherits it in sctp_association_init(). Later when
> > > transports are added and their pmtu >= asoc->pathmtu, it will never
> > > call sctp_assoc_update_frag_point() to set frag_point.
> > >
> > > This patch is to fix it by updating frag_point when stream_interleave
> > > is set in sctp_stream_interleave_init(), which is also called in
> > > sctp_association_init(). We're doing this also because frag_point
> > > is affected by datachunk's type, namely stream_interleave_0/1.
> > >
> > > Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> > > Reported-by: Jakub Audykowicz 
> > > Signed-off-by: Xin Long 
> > > ---
> > >  net/sctp/stream_interleave.c | 1 +
> > >  1 file changed, 1 insertion(+)
> > >
> > > diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
> > > index 0a78cdf..19d596d 100644
> > > --- a/net/sctp/stream_interleave.c
> > > +++ b/net/sctp/stream_interleave.c
> > > @@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct sctp_stream *stream)
> > >   asoc = container_of(stream, struct sctp_association, stream);
> > >   stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
> > >  : &sctp_stream_interleave_0;
> > > + sctp_assoc_update_frag_point(asoc);
> >
> > I get that by adding it here we avoid adding it twice, one in
> > sctp_association_init and another in sctp_process_init, but here it is
> > out of context.
> >
> > The decision on data chunk format is not made in this function but
> > higher in the stack and we can leverage that for sctp_process_init,
> > and for sctp_association_init, we should have it as close as possible
> > to where it initialized pathmtu and did not update the frag point.
> okay, but both have to be after sctp_stream_init().
> though we want sctp_assoc_update_frag_point()
> called right after "asoc->pathmtu = sp->pathmtu;".

Good point, sctp_datachk_len needs stream->si there.

> 
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index a827a1f..a614937 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -252,6 +252,8 @@ static struct sctp_association *sctp_association_init(
>  0, gfp))
> goto fail_init;
> 

Can we move the asoc->pathmtu initialization down here too then?
I don't see anything that would block it.
Otherwise LGTM.

> +   sctp_assoc_update_frag_point(asoc);
> +
> /* Assume that peer would support both address types unless we are
>  * told otherwise.
>  */
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 4a4fd19..600ca0d 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -2462,6 +2462,8 @@ int sctp_process_init(struct sctp_association
> *asoc, struct sctp_chunk *chunk,
>  asoc->c.sinit_max_instreams, gfp))
> goto clean_up;
> 
> +   sctp_assoc_update_frag_point(asoc);
> +
> if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
> goto clean_up;
> 
> >
> > >  }
> > > --
> > > 2.1.0
> > >


Re: [PATCH net] sctp: update frag_point when stream_interleave is set

2018-11-26 Thread Marcelo Ricardo Leitner
On Mon, Nov 26, 2018 at 05:02:11PM +0800, Xin Long wrote:
> sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
> changes, but we missed one place in sctp_association_init(). It would
> cause frag_point is zero when sending data.
> 
> As Jakub's reproducer shows, if sp->pathmtu is set by socket option, the
> new asoc->pathmtu inherits it in sctp_association_init(). Later when
> transports are added and their pmtu >= asoc->pathmtu, it will never
> call sctp_assoc_update_frag_point() to set frag_point.
> 
> This patch is to fix it by updating frag_point when stream_interleave
> is set in sctp_stream_interleave_init(), which is also called in
> sctp_association_init(). We're doing this also because frag_point
> is affected by datachunk's type, namely stream_interleave_0/1.
> 
> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> Reported-by: Jakub Audykowicz 
> Signed-off-by: Xin Long 
> ---
>  net/sctp/stream_interleave.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
> index 0a78cdf..19d596d 100644
> --- a/net/sctp/stream_interleave.c
> +++ b/net/sctp/stream_interleave.c
> @@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct sctp_stream *stream)
>   asoc = container_of(stream, struct sctp_association, stream);
>   stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
>  : &sctp_stream_interleave_0;
> + sctp_assoc_update_frag_point(asoc);

I get that by adding it here we avoid adding it twice, one in
sctp_association_init and another in sctp_process_init, but here it is
out of context.

The decision on data chunk format is not made in this function but
higher in the stack and we can leverage that for sctp_process_init,
and for sctp_association_init, we should have it as close as possible
to where it initialized pathmtu and did not update the frag point.

>  }
> -- 
> 2.1.0
> 


Re: [PATCH net] sctp: increase sk_wmem_alloc when head->truesize is increased

2018-11-26 Thread Marcelo Ricardo Leitner
On Mon, Nov 26, 2018 at 02:52:44PM +0800, Xin Long wrote:
> I changed to count sk_wmem_alloc by skb truesize instead of 1 to
> fix the sk_wmem_alloc leak caused by a later truesize change in
> xfrm in Commit 02968ccf0125 ("sctp: count sk_wmem_alloc by skb
> truesize in sctp_packet_transmit").
> 
> But I should have also increased sk_wmem_alloc when head->truesize
> is increased in sctp_packet_gso_append() as xfrm does. Otherwise,
> sctp gso packet will cause sk_wmem_alloc underflow.
> 
> Fixes: 02968ccf0125 ("sctp: count sk_wmem_alloc by skb truesize in 
> sctp_packet_transmit")
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/output.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index b0e74a3..025f48e 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
>   head->truesize += skb->truesize;
>   head->data_len += skb->len;
>   head->len += skb->len;
> + refcount_add(skb->truesize, &head->sk->sk_wmem_alloc);
>  
>   __skb_header_release(skb);
>  }
> -- 
> 2.1.0
> 
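
The invariant being restored above, in one line: every byte added to
head->truesize while the skb is charged to a socket must also be added to
that socket's sk_wmem_alloc, so that the eventual uncharge of
head->truesize on free balances out. A hedged sketch of the bookkeeping:

    charge (transmit):     sk_wmem_alloc  += skb->truesize
    gso merge (append):    head->truesize += skb->truesize
                           sk_wmem_alloc  += skb->truesize   <-- the fix
    uncharge (destructor): sk_wmem_alloc  -= head->truesize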


Re: [PATCH net-next,v3 00/12] add flow_rule infrastructure

2018-11-22 Thread Marcelo Ricardo Leitner
On Thu, Nov 22, 2018 at 02:22:20PM -0200, Marcelo Ricardo Leitner wrote:
> On Wed, Nov 21, 2018 at 03:51:20AM +0100, Pablo Neira Ayuso wrote:
> > Hi,
> > 
> > This patchset is the third iteration [1] [2] [3] to introduce a kernel
> > intermediate representation (IR) to express ACL hardware offloads.
> 
> On v2 cover letter you had:
> 
> """
> However, cost of this layer is very small, adding 1 million rules via
> tc -batch, perf shows:
> 
>  0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action
> """
> 
> The above doesn't include time spent on children calls and I'm worried
> about the new allocation done by flow_rule_alloc(), as it can impact
> rule insertion rate. I'll run some tests here and report back.

I'm seeing +60ms on 1.75s (~3.4%) to add 40k flower rules on ingress
with skip_hw and tc in batch mode, with flows like:

filter add dev p6p2 parent ffff: protocol ip prio 1 flower skip_hw
src_mac ec:13:db:00:00:00 dst_mac ec:14:c2:00:00:00 src_ip
56.0.0.0 dst_ip 55.0.0.0 action drop

Only 20ms out of those 60ms were consumed within fl_change() calls
(considering children calls), though.

Do you see something similar?  I used current net-next (d59da3fbfe3f)
and with this patchset applied.



Re: [PATCH net-next,v3 12/12] qede: use ethtool_rx_flow_rule() to remove duplicated parser code

2018-11-22 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 03:51:32AM +0100, Pablo Neira Ayuso wrote:
...
>  static int
>  qede_parse_flower_attr(struct qede_dev *edev, __be16 proto,
> -struct tc_cls_flower_offload *f,
> -struct qede_arfs_tuple *tuple)
> +struct flow_rule *rule, struct qede_arfs_tuple *tuple)

What about s/qede_parse_flower_attr/qede_parse_flow_attr/ or so? As it
is not about flower anymore.

It also helps here:

> -static int qede_flow_spec_to_tuple(struct qede_dev *edev,
> -struct qede_arfs_tuple *t,
> -struct ethtool_rx_flow_spec *fs)
> +static int qede_flow_spec_to_rule(struct qede_dev *edev,
> +   struct qede_arfs_tuple *t,
> +   struct ethtool_rx_flow_spec *fs)
>  {
...
> +
> + if (qede_parse_flower_attr(edev, proto, flow->rule, t)) {
> + err = -EINVAL;
> + goto err_out;
> + }
> +
> + /* Make sure location is valid and filter isn't already set */
> + err = qede_flow_spec_validate(edev, &flow->rule->action, t,
> +   fs->location);
...



Re: [PATCH net-next,v3 04/12] cls_api: add translator to flow_action representation

2018-11-22 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 03:51:24AM +0100, Pablo Neira Ayuso wrote:
...
> +int tc_setup_flow_action(struct flow_action *flow_action,
> +  const struct tcf_exts *exts)
> +{
> + const struct tc_action *act;
> + int i, j, k;
> +
> + if (!exts)
> + return 0;
> +
> + j = 0;
> + tcf_exts_for_each_action(i, act, exts) {
> + struct flow_action_entry *key;
   ^  ^^^

> +
> + key = &flow_action->entries[j];
^^^ ^^^

Considering previous changes, what about a s/key/entry/ in the
variable name here too?

> + if (is_tcf_gact_ok(act)) {
> + key->id = FLOW_ACTION_ACCEPT;
> + } else if (is_tcf_gact_shot(act)) {
> + key->id = FLOW_ACTION_DROP;
> + } else if (is_tcf_gact_trap(act)) {
> + key->id = FLOW_ACTION_TRAP;
> + } else if (is_tcf_gact_goto_chain(act)) {
> + key->id = FLOW_ACTION_GOTO;
> + key->chain_index = tcf_gact_goto_chain_index(act);
> + } else if (is_tcf_mirred_egress_redirect(act)) {
> + key->id = FLOW_ACTION_REDIRECT;
> + key->dev = tcf_mirred_dev(act);
> + } else if (is_tcf_mirred_egress_mirror(act)) {
> + key->id = FLOW_ACTION_MIRRED;
> + key->dev = tcf_mirred_dev(act);
> + } else if (is_tcf_vlan(act)) {
> + switch (tcf_vlan_action(act)) {
> + case TCA_VLAN_ACT_PUSH:
> + key->id = FLOW_ACTION_VLAN_PUSH;
> + key->vlan.vid = tcf_vlan_push_vid(act);
> + key->vlan.proto = tcf_vlan_push_proto(act);
> + key->vlan.prio = tcf_vlan_push_prio(act);
> + break;
> + case TCA_VLAN_ACT_POP:
> + key->id = FLOW_ACTION_VLAN_POP;
> + break;
> + case TCA_VLAN_ACT_MODIFY:
> + key->id = FLOW_ACTION_VLAN_MANGLE;
> + key->vlan.vid = tcf_vlan_push_vid(act);
> + key->vlan.proto = tcf_vlan_push_proto(act);
> + key->vlan.prio = tcf_vlan_push_prio(act);
> + break;
> + default:
> + goto err_out;
> + }
> + } else if (is_tcf_tunnel_set(act)) {
> + key->id = FLOW_ACTION_TUNNEL_ENCAP;
> + key->tunnel = tcf_tunnel_info(act);
> + } else if (is_tcf_tunnel_release(act)) {
> + key->id = FLOW_ACTION_TUNNEL_DECAP;
> + key->tunnel = tcf_tunnel_info(act);
> + } else if (is_tcf_pedit(act)) {
> + for (k = 0; k < tcf_pedit_nkeys(act); k++) {
> + switch (tcf_pedit_cmd(act, k)) {
> + case TCA_PEDIT_KEY_EX_CMD_SET:
> + key->id = FLOW_ACTION_MANGLE;
> + break;
> + case TCA_PEDIT_KEY_EX_CMD_ADD:
> + key->id = FLOW_ACTION_ADD;
> + break;
> + default:
> + goto err_out;
> + }
> + key->mangle.htype = tcf_pedit_htype(act, k);
> + key->mangle.mask = tcf_pedit_mask(act, k);
> + key->mangle.val = tcf_pedit_val(act, k);
> + key->mangle.offset = tcf_pedit_offset(act, k);
> + key = &flow_action->entries[++j];
> + }
> + } else if (is_tcf_csum(act)) {
> + key->id = FLOW_ACTION_CSUM;
> + key->csum_flags = tcf_csum_update_flags(act);
> + } else if (is_tcf_skbedit_mark(act)) {
> + key->id = FLOW_ACTION_MARK;
> + key->mark = tcf_skbedit_mark(act);
> + } else {
> + goto err_out;
> + }
> +
> + if (!is_tcf_pedit(act))
> + j++;
> + }
> + return 0;
> +err_out:
> + return -EOPNOTSUPP;
> +}
> +EXPORT_SYMBOL(tc_setup_flow_action);


Re: [PATCH net-next,v3 00/12] add flow_rule infrastructure

2018-11-22 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 03:51:20AM +0100, Pablo Neira Ayuso wrote:
> Hi,
> 
> This patchset is the third iteration [1] [2] [3] to introduce a kernel
> intermediate representation (IR) to express ACL hardware offloads.

On v2 cover letter you had:

"""
However, cost of this layer is very small, adding 1 million rules via
tc -batch, perf shows:

 0.06%  tc   [kernel.vmlinux][k] tc_setup_flow_action
"""

The above doesn't include time spent on children calls and I'm worried
about the new allocation done by flow_rule_alloc(), as it can impact
rule insertion rate. I'll run some tests here and report back.



Re: [PATCH net-next,v3 04/12] cls_api: add translator to flow_action representation

2018-11-21 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 03:51:24AM +0100, Pablo Neira Ayuso wrote:
> This patch implements a new function to translate from native TC action
> to the new flow_action representation. Moreover, this patch also updates
> cls_flower to use this new function.
> 
> Signed-off-by: Pablo Neira Ayuso 
> ---
> v3: add tcf_exts_num_actions() and pass it to flow_rule_alloc() to calculate
> the size of the array of actions.
> 
>  include/net/pkt_cls.h  |   5 +++
>  net/sched/cls_api.c    | 116 +++++++++++++++++++++++++++++++++++++++++++++++
>  net/sched/cls_flower.c |  21 +++--
>  3 files changed, 139 insertions(+), 3 deletions(-)
> 
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index 359876ee32be..abb035f84321 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -620,6 +620,11 @@ tcf_match_indev(struct sk_buff *skb, int ifindex)
>  }
>  #endif /* CONFIG_NET_CLS_IND */
>  
> +unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
> +
> +int tc_setup_flow_action(struct flow_action *flow_action,
> +  const struct tcf_exts *exts);
> +
>  int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
>enum tc_setup_type type, void *type_data, bool err_stop);
>  
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index d92f44ac4c39..6f8b953dabc4 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -31,6 +31,14 @@
>  #include <net/netlink.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
> +#include <net/tc_act/tc_gact.h>
> +#include <net/tc_act/tc_mirred.h>
> +#include <net/tc_act/tc_vlan.h>
> +#include <net/tc_act/tc_tunnel_key.h>
> +#include <net/tc_act/tc_pedit.h>
> +#include <net/tc_act/tc_csum.h>
> +#include <net/tc_act/tc_skbedit.h>
> +#include 
>  
>  extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
>  
> @@ -2567,6 +2575,114 @@ int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
>  }
>  EXPORT_SYMBOL(tc_setup_cb_call);
>  
> +int tc_setup_flow_action(struct flow_action *flow_action,
> +  const struct tcf_exts *exts)
> +{
> + const struct tc_action *act;
> + int i, j, k;
> +
> + if (!exts)
> + return 0;
> +
> + j = 0;
> + tcf_exts_for_each_action(i, act, exts) {
> + struct flow_action_entry *key;
> +
> + key = &flow_action->entries[j];
> + if (is_tcf_gact_ok(act)) {
> + key->id = FLOW_ACTION_ACCEPT;
> + } else if (is_tcf_gact_shot(act)) {
> + key->id = FLOW_ACTION_DROP;
> + } else if (is_tcf_gact_trap(act)) {
> + key->id = FLOW_ACTION_TRAP;
> + } else if (is_tcf_gact_goto_chain(act)) {
> + key->id = FLOW_ACTION_GOTO;
> + key->chain_index = tcf_gact_goto_chain_index(act);
> + } else if (is_tcf_mirred_egress_redirect(act)) {
> + key->id = FLOW_ACTION_REDIRECT;
> + key->dev = tcf_mirred_dev(act);
> + } else if (is_tcf_mirred_egress_mirror(act)) {
> + key->id = FLOW_ACTION_MIRRED;
> + key->dev = tcf_mirred_dev(act);
> + } else if (is_tcf_vlan(act)) {
> + switch (tcf_vlan_action(act)) {
> + case TCA_VLAN_ACT_PUSH:
> + key->id = FLOW_ACTION_VLAN_PUSH;
> + key->vlan.vid = tcf_vlan_push_vid(act);
> + key->vlan.proto = tcf_vlan_push_proto(act);
> + key->vlan.prio = tcf_vlan_push_prio(act);
> + break;
> + case TCA_VLAN_ACT_POP:
> + key->id = FLOW_ACTION_VLAN_POP;
> + break;
> + case TCA_VLAN_ACT_MODIFY:
> + key->id = FLOW_ACTION_VLAN_MANGLE;
> + key->vlan.vid = tcf_vlan_push_vid(act);
> + key->vlan.proto = tcf_vlan_push_proto(act);
> + key->vlan.prio = tcf_vlan_push_prio(act);
> + break;
> + default:
> + goto err_out;
> + }
> + } else if (is_tcf_tunnel_set(act)) {
> + key->id = FLOW_ACTION_TUNNEL_ENCAP;
> + key->tunnel = tcf_tunnel_info(act);
> + } else if (is_tcf_tunnel_release(act)) {
> + key->id = FLOW_ACTION_TUNNEL_DECAP;
> + key->tunnel = tcf_tunnel_info(act);
> + } else if (is_tcf_pedit(act)) {
> + for (k = 0; k < tcf_pedit_nkeys(act); k++) {
> + switch (tcf_pedit_cmd(act, k)) {
> + case TCA_PEDIT_KEY_EX_CMD_SET:
> + key->id = FLOW_ACTION_MANGLE;
> + break;
> + case TCA_PEDIT_KEY_EX_CMD_ADD:
> + key->id = FLOW_ACTION_ADD;
> +

Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-21 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 08:27:21AM -0500, Neil Horman wrote:
> On Tue, Nov 20, 2018 at 10:46:26PM -0200, Marcelo Ricardo Leitner wrote:
> > On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> > > On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > > > In sctp_hash_transport, it dereferences a transport's asoc only under
> > > > rcu_read_lock. Without holding the transport, its asoc could be freed
> > > > already, which leads to a use-after-free panic.
> > > > 
> > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > > > accessing its asoc in sctp_transport_get_next") is needed to hold
> > > > the transport before accessing its asoc in sctp_hash_transport.
> > > > 
> > > > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> > > > transport")
> > > > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/input.c | 7 ++-
> > > >  1 file changed, 6 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > index 5c36a99..69584e9 100644
> > > > --- a/net/sctp/input.c
> > > > +++ b/net/sctp/input.c
> > > > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
> > > > list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > >        sctp_hash_params);
> > > >  
> > > > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > > > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > > > +   if (!sctp_transport_hold(transport))
> > > > +   continue;
> > > > if (transport->asoc->ep == t->asoc->ep) {
> > > > +   sctp_transport_put(transport);
> > > > rcu_read_unlock();
> > > > return -EEXIST;
> > > > }
> > > > +   sctp_transport_put(transport);
> > > > +   }
> > > > rcu_read_unlock();
> > > >  
> > > > err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> > > > -- 
> > > > 2.1.0
> > > > 
> > > > 
> > > 
> > > something doesn't feel at all right about this.  If we are inserting a 
> > > transport
> > > to an association, it would seem to me that we should have at least one 
> > > user of
> > > the association (i.e. non-zero refcount).  As such it seems something is 
> > > wrong
> > > with the association refcount here.  At the very least, if there is a 
> > > case where
> > > an association is being removed while a transport is being added, the 
> > > better
> > > solution would be to ensure that sctp_association_destroy goes through a
> > > quiescent point prior to unhashing transports from the list, to ensure 
> > > that
> > > there is no conflict with the add operation above.
> > 
> > Consider that the rhl_for_each_entry_rcu() is traversing the global
> > rhashtable, and that it may operate on unrelated transports/asocs.
> > E.g., transport->asoc in the for() is potentially different from the
> > asoc under socket lock.
> > 
> Ah, ok, we're comparing associations that are not related to the association
> being searched for, that makes sense.
> 
> > The core of the fix is at:
> > +   if (!sctp_transport_hold(transport))
> > +   continue;
> > If we can get a hold, the asoc will be available for dereferencing in
> > subsequent lines. Otherwise, move on.
> > 
> > With that, the patch makes sense to me.
> > 
> Yes, I agree, but as you note below, this still seems like a lousy way to fix
> the problem.
> 
> > Although I would prefer if we come up with a better way to do this
> > jump, or even avoid the jump. We are only comparing pointers here and
> > if we had asoc->ep cached on sctp_transport itself, we could avoid the
> > atomics here.
> > 
> > This change, in the next patch on sctp_epaddr_lookup_transport, will
> > hurt performance as that is called in datapath. Rhashtable will help
> > on keeping entry lists to a size, but still.
> > 
> I still think the rcu_read_lock would be sufficient here, if we just ensured
> that removals from the list occured after a quiescent point.  The lookup is in

I'm not sure I follow.

> the datapath, but adds/removes can have a little more latency added to them, 
> and
> if it removes the atomic operation from the fast path, I think thats a net 
> win.

Agree.
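
Something like this is what I understand you mean (untested sketch,
assuming we add an rcu_head to struct sctp_association for it):

	static void sctp_association_destroy_rcu(struct rcu_head *head)
	{
		struct sctp_association *asoc =
			container_of(head, struct sctp_association, rcu);

		/* the actual freeing that sctp_association_destroy()
		 * does today would move in here
		 */
	}

	/* and in sctp_association_destroy(), instead of freeing directly: */
	call_rcu(&asoc->rcu, sctp_association_destroy_rcu);

Then a reader iterating the rhashtable under rcu_read_lock() can never
see a freed asoc, and the sctp_transport_hold() in the lookup wouldn't
be needed.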


Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-21 Thread Marcelo Ricardo Leitner
On Wed, Nov 21, 2018 at 03:47:33PM +0900, Xin Long wrote:
> On Wed, Nov 21, 2018 at 9:46 AM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> > > On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > > > In sctp_hash_transport, it dereferences a transport's asoc only under
> > > > rcu_read_lock. Without holding the transport, its asoc could be freed
> > > > already, which leads to a use-after-free panic.
> > > >
> > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > > > accessing its asoc in sctp_transport_get_next") is needed to hold
> > > > the transport before accessing its asoc in sctp_hash_transport.
> > > >
> > > > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> > > > transport")
> > > > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/input.c | 7 ++-
> > > >  1 file changed, 6 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > index 5c36a99..69584e9 100644
> > > > --- a/net/sctp/input.c
> > > > +++ b/net/sctp/input.c
> > > > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
> > > > list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > >        sctp_hash_params);
> > > >
> > > > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > > > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > > > +   if (!sctp_transport_hold(transport))
> > > > +   continue;
> > > > if (transport->asoc->ep == t->asoc->ep) {
> > > > +   sctp_transport_put(transport);
> > > > rcu_read_unlock();
> > > > return -EEXIST;
> > > > }
> > > > +   sctp_transport_put(transport);
> > > > +   }
> > > > rcu_read_unlock();
> > > >
> > > > err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> > > > --
> > > > 2.1.0
> > > >
> > > >
> > >
> > > something doesn't feel at all right about this.  If we are inserting a 
> > > transport
> > > to an association, it would seem to me that we should have at least one 
> > > user of
> > > the association (i.e. non-zero refcount).  As such it seems something is 
> > > wrong
> > > with the association refcount here.  At the very least, if there is a 
> > > case where
> > > an association is being removed while a transport is being added, the 
> > > better
> > > solution would be to ensure that sctp_association_destroy goes through a
> > > quiescent point prior to unhashing transports from the list, to ensure 
> > > that
> > > there is no conflict with the add operation above.
> Changing to do call_rcu(&asoc->rcu, sctp_association_destroy) can
> work for this case.
> But it means asoc and socket (taking the port) will have to wait for a
> grace period, which is not expected. We seemed to have talked about
> this before, Marcelo?

Yes. This would cause it to linger longer and cause bind conflicts
meanwhile.
Note that we already have sctp_transport_destroy_rcu(), so this would
be a 2nd grace period.

> 
> >
> > Consider that the rhl_for_each_entry_rcu() is traversing the global
> > rhashtable, and that it may operate on unrelated transports/asocs.
> > E.g., transport->asoc in the for() is potentially different from the
> > asoc under socket lock.
> >
> > The core of the fix is at:
> > +   if (!sctp_transport_hold(transport))
> > +   continue;
> > If we can get a hold, the asoc will be available for dereferencing in
> > subsequent lines. Otherwise, move on.
> >
> > With that, the patch makes sense to me.
> >
> > Although I would prefer if we come up with a better way to do this
> > jump, or even avoid the jump. We are only comparing pointers here and,
> > if we had asoc->ep cached on sctp_transport itself, we could avoid the
> > atomics here.
> Right, but it's another u64.

Strictly speaking, a pointer :-) (32 bits, on 32-bit archs)
But it was just an idea. It would cost one additional pointer per
transport, but would save the atomics and also one extra dereference
per iteration.

> 
> >
> > This change, in the next patch on sctp_epaddr_lookup_transport, will
> > hurt performance as that is called in datapath. Rhashtable will help
> > on keeping entry lists to a size, but still.
> This loop is not long normally, will only a few atomic operations hurt

Right.

> a noticeable performance?

I guess we can't know without actually testing this.
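
For reference, the caching idea would be something like this (not even
compile-tested, field placement made up):

	/* include/net/sctp/structs.h */
	struct sctp_transport {
		...
		/* cached asoc->ep, so the lookup doesn't deref ->asoc */
		struct sctp_endpoint *ep;
	};

	/* sctp_hash_transport() then only compares pointers: */
	rhl_for_each_entry_rcu(transport, tmp, list, node)
		if (transport->ep == t->asoc->ep) {
			rcu_read_unlock();
			return -EEXIST;
		}

One extra pointer per transport, but no atomics and one less
dereference per entry in the datapath lookup.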


Re: [PATCH net] sctp: count sk_wmem_alloc by skb truesize in sctp_packet_transmit

2018-11-20 Thread Marcelo Ricardo Leitner
On Mon, Nov 19, 2018 at 12:39:55PM -0800, David Miller wrote:
> From: Xin Long 
> Date: Sun, 18 Nov 2018 15:07:38 +0800
> 
> > Now sctp increases sk_wmem_alloc by 1 when doing set_owner_w for the
> > skb allocked in sctp_packet_transmit and decreases by 1 when freeing
> > this skb.
> > 
> > But when this skb goes through networking stack, some subcomponents
> > might change skb->truesize and add the same amount on sk_wmem_alloc.
> > However sctp doesn't know the amount to decrease by, it would cause
> > a leak on sk->sk_wmem_alloc and the sock can never be freed.
> > 
> > Xiumei found this issue when it hit esp_output_head() by using sctp
> > over ipsec, where skb->truesize is added and so is sk->sk_wmem_alloc.
> > 
> > Since sctp has used sk_wmem_queued to count for writable space since
> > Commit cd305c74b0f8 ("sctp: use sk_wmem_queued to check for writable
> > space"), it's ok to fix it by counting sk_wmem_alloc by skb truesize
> > in sctp_packet_transmit.
> > 
> > Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible")
> > Reported-by: Xiumei Mu 
> > Signed-off-by: Xin Long 
> 
> Applied and queued up for -stable.

Dave, is there a way that we can check to which versions you queued it
up?

Asking because even though this patch fixes cac2661c53f3 (v4.10) and
the patch probably applies cleanly, it has a dependency on
cd305c74b0f8 (v4.19), so fixing the issue in older kernels either
needs a different fix or a backport of cd305c74b0f8 too.



Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-20 Thread Marcelo Ricardo Leitner
On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > In sctp_hash_transport, it dereferences a transport's asoc only under
> > rcu_read_lock. Without holding the transport, its asoc could be freed
> > already, which leads to a use-after-free panic.
> > 
> > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > accessing its asoc in sctp_transport_get_next") is needed to hold
> > the transport before accessing its asoc in sctp_hash_transport.
> > 
> > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> > transport")
> > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/input.c | 7 ++-
> >  1 file changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > index 5c36a99..69584e9 100644
> > --- a/net/sctp/input.c
> > +++ b/net/sctp/input.c
> > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
> > list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> >        sctp_hash_params);
> >  
> > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > +   if (!sctp_transport_hold(transport))
> > +   continue;
> > if (transport->asoc->ep == t->asoc->ep) {
> > +   sctp_transport_put(transport);
> > rcu_read_unlock();
> > return -EEXIST;
> > }
> > +   sctp_transport_put(transport);
> > +   }
> > rcu_read_unlock();
> >  
> > err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> > -- 
> > 2.1.0
> > 
> > 
> 
> something doesn't feel at all right about this.  If we are inserting a 
> transport
> to an association, it would seem to me that we should have at least one user 
> of
> the association (i.e. non-zero refcount).  As such it seems something is wrong
> with the association refcount here.  At the very least, if there is a case 
> where
> an association is being removed while a transport is being added, the better
> solution would be to ensure that sctp_association_destroy goes through a
> quiescent point prior to unhashing transports from the list, to ensure that
> there is no conflict with the add operation above.

Consider that the rhl_for_each_entry_rcu() is traversing the global
rhashtable, and that it may operate on unrelated transports/asocs.
E.g., transport->asoc in the for() is potentially different from the
asoc under socket lock.

The core of the fix is at:
+   if (!sctp_transport_hold(transport))
+   continue;
If we can get a hold, the asoc will be available for dereferencing in
subsequent lines. Otherwise, move on.

With that, the patch makes sense to me.

Although I would prefer if we come up with a better way to do this
jump, or even avoid the jump. We are only comparing pointers here and
if we had asoc->ep cached on sctp_transport itself, we could avoid the
atomics here.

This change, in the next patch on sctp_epaddr_lookup_transport, will
hurt performance as that is called in datapath. Rhashtable will help
on keeping entry lists to a size, but still.

  Marcelo


Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-18 Thread Marcelo Ricardo Leitner
On Sun, Nov 18, 2018 at 04:02:25PM +0900, Xin Long wrote:
> On Sat, Nov 17, 2018 at 12:12 AM Neil Horman  wrote:
> >
> > On Thu, Nov 15, 2018 at 09:41:01PM -0200, Marcelo Ricardo Leitner wrote:
> > > [ re-sending, without html this time ]
> > >
> > > > On Thu, Nov 15, 2018, 15:26 Neil Horman wrote:
> > > > On Thu, Nov 15, 2018 at 08:25:36PM -0200, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, Nov 15, 2018 at 04:43:10PM -0500, Neil Horman wrote:
> > > > > > On Thu, Nov 15, 2018 at 03:22:21PM -0200, Marcelo Ricardo Leitner
> > > > wrote:
> > > > > > > On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> > > > > > > > As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> > > > > > > >
> > > > > > > >This socket option allows the enabling or disabling of the
> > > > > > > >negotiation of PR-SCTP support for future associations.  For
> > > > existing
> > > > > > > >associations, it allows one to query whether or not PR-SCTP
> > > > support
> > > > > > > >was negotiated on a particular association.
> > > > > > > >
> > > > > > > > It means only sctp sock's prsctp_enable can be set.
> > > > > > > >
> > > > > > > > Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we 
> > > > > > > > will
> > > > > > > > add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for 
> > > > > > > > linux
> > > > > > > > sctp in another patchset.
> > > > > > > >
> > > > > > > > Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp 
> > > > > > > > sockopt")
> > > > > > > > Reported-by: Ying Xu 
> > > > > > > > Signed-off-by: Xin Long 
> > > > > > > > ---
> > > > > > > >  net/sctp/socket.c | 13 +++--
> > > > > > > >  1 file changed, 3 insertions(+), 10 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > > > > > index 739f3e5..e9b8232 100644
> > > > > > > > --- a/net/sctp/socket.c
> > > > > > > > +++ b/net/sctp/socket.c
> > > > > > > > @@ -3940,7 +3940,6 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > > unsigned int optlen)
> > > > > > > >  {
> > > > > > > > struct sctp_assoc_value params;
> > > > > > > > -   struct sctp_association *asoc;
> > > > > > > > int retval = -EINVAL;
> > > > > > > >
> > > > > > > > if (optlen != sizeof(params))
> > > > > > > > @@ -3951,16 +3950,10 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > > goto out;
> > > > > > > > }
> > > > > > > >
> > > > > > > > -   asoc = sctp_id2assoc(sk, params.assoc_id);
> > > > > > > > -   if (asoc) {
> > > > > > > > -   asoc->prsctp_enable = !!params.assoc_value;
> > > > > > > > -   } else if (!params.assoc_id) {
> > > > > > > > -   struct sctp_sock *sp = sctp_sk(sk);
> > > > > > > > -
> > > > > > > > -   sp->ep->prsctp_enable = !!params.assoc_value;
> > > > > > > > -   } else {
> > > > > > > > +   if (sctp_style(sk, UDP) && sctp_id2assoc(sk, params.assoc_id))
> > > > > > >
> > > > > > > This would allow using a non-existent assoc id on UDP-style sockets to
> > > > > > > set it at the socket, which is not expected. It should be more like:
> > > > > > >
> > > > > > > + if (sctp_style(sk, UDP) && params.assoc_id)
> > > > > > How do you see that to be the case? sctp_id2assoc will return NULL 
> &

Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-15 Thread Marcelo Ricardo Leitner
On Thu, Nov 15, 2018 at 04:43:10PM -0500, Neil Horman wrote:
> On Thu, Nov 15, 2018 at 03:22:21PM -0200, Marcelo Ricardo Leitner wrote:
> > On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> > > As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> > > 
> > >This socket option allows the enabling or disabling of the
> > >negotiation of PR-SCTP support for future associations.  For existing
> > >associations, it allows one to query whether or not PR-SCTP support
> > >was negotiated on a particular association.
> > > 
> > > It means only sctp sock's prsctp_enable can be set.
> > > 
> > > Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
> > > add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
> > > sctp in another patchset.
> > > 
> > > Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt")
> > > Reported-by: Ying Xu 
> > > Signed-off-by: Xin Long 
> > > ---
> > >  net/sctp/socket.c | 13 +++--
> > >  1 file changed, 3 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > index 739f3e5..e9b8232 100644
> > > --- a/net/sctp/socket.c
> > > +++ b/net/sctp/socket.c
> > > @@ -3940,7 +3940,6 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
> > >   unsigned int optlen)
> > >  {
> > >   struct sctp_assoc_value params;
> > > - struct sctp_association *asoc;
> > >   int retval = -EINVAL;
> > >  
> > >   if (optlen != sizeof(params))
> > > @@ -3951,16 +3950,10 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
> > >   goto out;
> > >   }
> > >  
> > > - asoc = sctp_id2assoc(sk, params.assoc_id);
> > > - if (asoc) {
> > > - asoc->prsctp_enable = !!params.assoc_value;
> > > - } else if (!params.assoc_id) {
> > > - struct sctp_sock *sp = sctp_sk(sk);
> > > -
> > > - sp->ep->prsctp_enable = !!params.assoc_value;
> > > - } else {
> > > + if (sctp_style(sk, UDP) && sctp_id2assoc(sk, params.assoc_id))
> > 
> > This would allow using a non-existent assoc id on UDP-style sockets to
> > set it at the socket, which is not expected. It should be more like:
> > 
> > +   if (sctp_style(sk, UDP) && params.assoc_id)
> How do you see that to be the case? sctp_id2assoc will return NULL if an
> association isn't found, so the use of sctp_id2assoc should work just fine.

Right, it will return NULL, and because of that it won't bail out as
it should and will adjust the socket config instead.

> Just checking params.assoc_id would instead fail the setting of any 
> association
> id that isn't 0, which I don't think is what we want at all.

I think it is.

For existing associations, we can't set this anymore because it was
already negotiated on the handshake
(sctp_process_ext_param()/SCTP_CID_FWD_TSN) and there is no way back
after it.
For non-existing assocs, they will always inherit it from the socket
value.

The question then is which semantics we want when validating the
parameter here. We have cases such as sctp_setsockopt_delayed_ack(),
which rejects using invalid asoc_ids as a way to mean the socket
itself for UDP-style sockets:

asoc = sctp_id2assoc(sk, params.sack_assoc_id);
if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP))
return -EINVAL;

As we are returning the same error for both situations (invalid assoc
id and setting it on an existing asoc), we don't need the asoc pointer
itself and can avoid the sctp_id2assoc() call, leading to the if() I
suggested.
  Marcelo

> 
> Neil
> 
> > 
> > >   goto out;
> > > - }
> > > +
> > > + sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value;
> > >  
> > >   retval = 0;
> > >  
> > > -- 
> > > 2.1.0
> > > 
> > 


Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-15 Thread Marcelo Ricardo Leitner
On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> 
>This socket option allows the enabling or disabling of the
>negotiation of PR-SCTP support for future associations.  For existing
>associations, it allows one to query whether or not PR-SCTP support
>was negotiated on a particular association.
> 
> It means only sctp sock's prsctp_enable can be set.
> 
> Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
> add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
> sctp in another patchset.
> 
> Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt")
> Reported-by: Ying Xu 
> Signed-off-by: Xin Long 
> ---
>  net/sctp/socket.c | 13 +++--
>  1 file changed, 3 insertions(+), 10 deletions(-)
> 
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 739f3e5..e9b8232 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3940,7 +3940,6 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
>   unsigned int optlen)
>  {
>   struct sctp_assoc_value params;
> - struct sctp_association *asoc;
>   int retval = -EINVAL;
>  
>   if (optlen != sizeof(params))
> @@ -3951,16 +3950,10 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
>   goto out;
>   }
>  
> - asoc = sctp_id2assoc(sk, params.assoc_id);
> - if (asoc) {
> - asoc->prsctp_enable = !!params.assoc_value;
> - } else if (!params.assoc_id) {
> - struct sctp_sock *sp = sctp_sk(sk);
> -
> - sp->ep->prsctp_enable = !!params.assoc_value;
> - } else {
> + if (sctp_style(sk, UDP) && sctp_id2assoc(sk, params.assoc_id))

This would allow using a non-existent assoc id on UDP-style sockets to
set it at the socket, which is not expected. It should be more like:

+   if (sctp_style(sk, UDP) && params.assoc_id)

>   goto out;
> - }
> +
> + sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value;
>  
>   retval = 0;
>  
> -- 
> 2.1.0
> 


Re: [PATCH net-next 0/3] sctp: add support for sk_reuseport

2018-10-22 Thread Marcelo Ricardo Leitner
On Sun, Oct 21, 2018 at 12:43:35PM +0800, Xin Long wrote:
> sctp sk_reuseport allows multiple socks to listen on the same port and
> addresses, as long as these socks have the same uid. This works pretty
> much as TCP/UDP does, the only difference is that sctp is multi-homing
> and all the bind_addrs in these socks will have to completely matched,
> otherwise listen() will return err.
> 

FWIW, I won't be able to review this patchset thoroughly. The 2 small
comments that I sent are all I have.

Thanks,
Marcelo


Re: [PATCH net-next 1/3] sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint

2018-10-22 Thread Marcelo Ricardo Leitner
On Sun, Oct 21, 2018 at 12:43:36PM +0800, Xin Long wrote:
> This is a part of sk_reuseport support for sctp, and it selects a
> sock by the hashkey of lport, paddr and dport by default. It will
> work until sk_reuseport support is added in sctp_get_port_local()
> in the next patch.
> 
> Signed-off-by: Xin Long 
> ---
>  net/sctp/input.c | 69 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 41 insertions(+), 28 deletions(-)
> 
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index 5c36a99..60ede89 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -57,6 +57,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /* Forward declarations for internal helpers. */
>  static int sctp_rcv_ootb(struct sk_buff *);
> @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
> const union sctp_addr *paddr,
> const union sctp_addr *laddr,
> struct sctp_transport **transportp);
> -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> - const union sctp_addr *laddr);
> +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> + struct net *net, struct sk_buff *skb,
> + const union sctp_addr *laddr,
> + const union sctp_addr *daddr);
>  static struct sctp_association *__sctp_lookup_association(
>   struct net *net,
>   const union sctp_addr *local,
> @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
>   asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
>  
>   if (!asoc)
> - ep = __sctp_rcv_lookup_endpoint(net, &dest);
> + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
>  
>   /* Retrieve the common input handling substructure. */
>   rcvr = asoc ? >base : >base;
> @@ -770,16 +773,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
>   local_bh_enable();
>  }
>  
> +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
> + const union sctp_addr *paddr, __u32 seed)
> +{
> + __u32 addr;
> +
> + if (paddr->sa.sa_family == AF_INET6)
> + addr = jhash(&paddr->v6.sin6_addr, 16, seed);
> + else
> + addr = (__force __u32)paddr->v4.sin_addr.s_addr;
> +
> + return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> +  (__force __u32)lport, net_hash_mix(net), seed);
> +}
> +
>  /* Look up an endpoint. */
> -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> - const union sctp_addr *laddr)
> +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> + struct net *net, struct sk_buff *skb,
> + const union sctp_addr *laddr,
> + const union sctp_addr *paddr)
>  {
>   struct sctp_hashbucket *head;
>   struct sctp_ep_common *epb;
>   struct sctp_endpoint *ep;
> + struct sock *sk;
> + __be32 lport;

This could be a __be16 one.

>   int hash;
>  
> - hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
> + lport = laddr->v4.sin_port;
> + hash = sctp_ep_hashfn(net, ntohs(lport));
>   head = &sctp_ep_hashtable[hash];
>   read_lock(&head->lock);
>   sctp_for_each_hentry(epb, &head->chain) {
> @@ -791,6 +813,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
>   ep = sctp_sk(net->sctp.ctl_sock)->ep;
>  
>  hit:
> + sk = ep->base.sk;
> + if (sk->sk_reuseport) {
> + __u32 phash = sctp_hashfn(net, lport, paddr, 0);
> +
> + sk = reuseport_select_sock(sk, phash, skb,
> +sizeof(struct sctphdr));
> + if (sk)
> + ep = sctp_sk(sk)->ep;
> + }
>   sctp_endpoint_hold(ep);
>   read_unlock(&head->lock);
>   return ep;
> @@ -829,35 +860,17 @@ static inline int sctp_hash_cmp(struct 
> rhashtable_compare_arg *arg,
>  static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
>  {
>   const struct sctp_transport *t = data;
> - const union sctp_addr *paddr = &t->ipaddr;
> - const struct net *net = sock_net(t->asoc->base.sk);
> - __be16 lport = htons(t->asoc->base.bind_addr.port);
> - __u32 addr;
> -
> - if (paddr->sa.sa_family == AF_INET6)
> - addr = jhash(&paddr->v6.sin6_addr, 16, seed);
> - else
> - addr = (__force __u32)paddr->v4.sin_addr.s_addr;
>  
> - return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> -  (__force __u32)lport, net_hash_mix(net), seed);
> + return sctp_hashfn(sock_net(t->asoc->base.sk),
> +

Re: [PATCH net-next 2/3] sctp: add sock_reuseport for the sock in __sctp_hash_endpoint

2018-10-22 Thread Marcelo Ricardo Leitner
On Sun, Oct 21, 2018 at 12:43:37PM +0800, Xin Long wrote:
> This is a part of sk_reuseport support for sctp. It defines a helper
> sctp_bind_addrs_check() to check if the bind_addrs in two socks are
> matched. It will add sock_reuseport if they are completely matched,
> and return err if they are partly matched, and alloc sock_reuseport
> if all socks are not matched at all.
> 
> It will work until sk_reuseport support is added in
> sctp_get_port_local() in the next patch.
> 
> Signed-off-by: Xin Long 
> ---
>  include/net/sctp/sctp.h|  2 +-
>  include/net/sctp/structs.h |  2 ++
>  net/core/sock_reuseport.c  |  1 +
>  net/sctp/bind_addr.c   | 28 ++
>  net/sctp/input.c   | 60 +++++++++++++++++++++++++++++++++++++++++++++++---
>  net/sctp/socket.c  |  3 +--
>  6 files changed, 85 insertions(+), 11 deletions(-)
> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 8c2caa3..b8cd58d 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct 
> sctp_association *asoc,
>   */
>  int sctp_rcv(struct sk_buff *skb);
>  void sctp_v4_err(struct sk_buff *skb, u32 info);
> -void sctp_hash_endpoint(struct sctp_endpoint *);
> +int sctp_hash_endpoint(struct sctp_endpoint *ep);
>  void sctp_unhash_endpoint(struct sctp_endpoint *);
>  struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
>struct sctphdr *, struct sctp_association **,
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index a11f937..15d017f 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -1190,6 +1190,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *,
>struct sctp_sock *, struct sctp_sock *);
>  int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
>const union sctp_addr *addr);
> +int sctp_bind_addrs_check(struct sctp_sock *sp,
> +   struct sctp_sock *sp2, int cnt2);
>  union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr*bp,
>   const union sctp_addr   *addrs,
>   int addrcnt,
> diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> index ba5cba5..d8fe3e5 100644
> --- a/net/core/sock_reuseport.c
> +++ b/net/core/sock_reuseport.c
> @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
>   call_rcu(&old_reuse->rcu, reuseport_free_rcu);
>   return 0;
>  }
> +EXPORT_SYMBOL(reuseport_add_sock);
>  
>  void reuseport_detach_sock(struct sock *sk)
>  {
> diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
> index 7df3704..78d0d93 100644
> --- a/net/sctp/bind_addr.c
> +++ b/net/sctp/bind_addr.c
> @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
>   return match;
>  }
>  
> +int sctp_bind_addrs_check(struct sctp_sock *sp,
> +   struct sctp_sock *sp2, int cnt2)
> +{
> + struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr;
> + struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
> + struct sctp_sockaddr_entry *laddr, *laddr2;
> + bool exist = false;
> + int cnt = 0;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> + list_for_each_entry_rcu(laddr2, &bp2->address_list, list) {
> + if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
> + laddr->valid == laddr2->valid) {

I think by here, in the normal run, laddr2->valid will always be true,
but as is it gives the impression that it accepts 0 == 0 too, which
would be bad.  Maybe on a fast BINDX_REM/BINDX_ADD it could trigger
laddr2->valid = 0 in there, not sure.

Anyway, maybe '... laddr->valid && laddr2->valid' instead, or do you
really want to allow the 0 == 0 case?
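
I.e., something like (untested):

		if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
		    laddr->valid && laddr2->valid) {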

> + exist = true;
> + goto next;
> + }
> + }
> + cnt = 0;
> + break;
> +next:
> + cnt++;
> + }
> + rcu_read_unlock();
> +
> + return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1);
> +}
> +
>  /* Does the address 'addr' conflict with any addresses in
>   * the bp.
>   */
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index 60ede89..6bfeb10 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -723,43 +723,87 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
>  }
>  
>  /* Insert endpoint into the hash table.  */
> -static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
> +static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
>  {
> - struct net *net = sock_net(ep->base.sk);
> - struct sctp_ep_common *epb;
> + struct sock *sk = ep->base.sk;
> + struct net *net = sock_net(sk);
>   struct sctp_hashbucket *head;
> + 

Re: [PATCH net] sctp: fix the data size calculation in sctp_data_size

2018-10-17 Thread Marcelo Ricardo Leitner
On Wed, Oct 17, 2018 at 09:11:27PM +0800, Xin Long wrote:
> sctp data size should be calculated by subtracting data chunk header's
> length from chunk_hdr->length, not just data header.
> 
> Fixes: 668c9beb9020 ("sctp: implement assign_number for 
> sctp_stream_interleave")
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  include/net/sctp/sm.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
> index 5ef1bad..9e3d327 100644
> --- a/include/net/sctp/sm.h
> +++ b/include/net/sctp/sm.h
> @@ -347,7 +347,7 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk)
>   __u16 size;
>  
>   size = ntohs(chunk->chunk_hdr->length);
> - size -= sctp_datahdr_len(>asoc->stream);
> + size -= sctp_datachk_len(>asoc->stream);
>  
>   return size;
>  }
> -- 
> 2.1.0
> 


Re: [PATCH net] sctp: not free the new asoc when sctp_wait_for_connect returns err

2018-10-17 Thread Marcelo Ricardo Leitner
On Wed, Oct 17, 2018 at 03:06:12AM +0800, Xin Long wrote:
> When sctp_wait_for_connect is called to wait for connect ready
> for sp->strm_interleave in sctp_sendmsg_to_asoc, a panic could
> be triggered if cpu is scheduled out and the new asoc is freed
> elsewhere, as it will return err and later the asoc gets freed
> again in sctp_sendmsg.
> 
> [  285.840764] list_del corruption, ffff9f0f7b284078->next is LIST_POISON1 (dead000000000100)
> [  285.843590] WARNING: CPU: 1 PID: 8861 at lib/list_debug.c:47 __list_del_entry_valid+0x50/0xa0
> [  285.846193] Kernel panic - not syncing: panic_on_warn set ...
> [  285.846193]
> [  285.848206] CPU: 1 PID: 8861 Comm: sctp_ndata Kdump: loaded Not tainted 4.19.0-rc7.label #584
> [  285.850559] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  285.852164] Call Trace:
> ...
> [  285.872210]  ? __list_del_entry_valid+0x50/0xa0
> [  285.872894]  sctp_association_free+0x42/0x2d0 [sctp]
> [  285.873612]  sctp_sendmsg+0x5a4/0x6b0 [sctp]
> [  285.874236]  sock_sendmsg+0x30/0x40
> [  285.874741]  ___sys_sendmsg+0x27a/0x290
> [  285.875304]  ? __switch_to_asm+0x34/0x70
> [  285.875872]  ? __switch_to_asm+0x40/0x70
> [  285.876438]  ? ptep_set_access_flags+0x2a/0x30
> [  285.877083]  ? do_wp_page+0x151/0x540
> [  285.877614]  __sys_sendmsg+0x58/0xa0
> [  285.878138]  do_syscall_64+0x55/0x180
> [  285.878669]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> This is a similar issue with the one fixed in Commit ca3af4dd28cf
> ("sctp: do not free asoc when it is already dead in sctp_sendmsg").
> But this one can't be fixed by returning -ESRCH for the dead asoc
> in sctp_wait_for_connect, as it will break sctp_connect's return
> value to users.
> 
> This patch is to simply set err to -ESRCH before it returns to
> sctp_sendmsg when any err is returned by sctp_wait_for_connect
> for sp->strm_interleave, so that no asoc would be freed due to
> this.
> 
> When users see this error, they will know the packet hasn't been
> sent. And it also makes sense to not free asoc because waiting
> connect fails, like the second call for sctp_wait_for_connect in
> sctp_sendmsg_to_asoc.
> 
> Fixes: 668c9beb9020 ("sctp: implement assign_number for 
> sctp_stream_interleave")
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/socket.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index e25a20f..1baa9d9 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1946,8 +1946,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
>   if (sp->strm_interleave) {
>   timeo = sock_sndtimeo(sk, 0);
>   err = sctp_wait_for_connect(asoc, &timeo);
> - if (err)
> + if (err) {
> + err = -ESRCH;
>   goto err;
> + }
>   } else {
>   wait_connect = true;
>   }
> -- 
> 2.1.0
> 


[PATCH net] sctp: fix race on sctp_id2asoc

2018-10-16 Thread Marcelo Ricardo Leitner
syzbot reported a use-after-free involving sctp_id2asoc.  Dmitry Vyukov
helped to root cause it and it is because of reading the asoc after it
was freed:

    CPU 1                           CPU 2
    (working on socket 1)           (working on socket 2)
                                    sctp_association_destroy
    sctp_id2asoc
     spin lock
      grab the asoc from idr
     spin unlock
                                    spin lock
                                     remove asoc from idr
                                    spin unlock
                                    free(asoc)
    if asoc->base.sk != sk ... [*]

This can only be hit if trying to fetch asocs from different sockets. As
we have a single IDR for all asocs, in all SCTP sockets, their id is
unique on the system. An application can try to send stuff on an id
that matches on another socket, and the if in [*] will protect from such
usage. But it didn't consider that, as that asoc may belong to another
socket, it may be freed in parallel (read: under another socket lock).

We fix it by moving the checks in [*] into the protected region. This
fixes it because the asoc cannot be freed while the lock is held.

Reported-by: syzbot+c7dd55d7aec49d48e...@syzkaller.appspotmail.com
Acked-by: Dmitry Vyukov 
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/socket.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d38d5ba734d7ee3347e4015fd30d355bbfa..a7722f43aa69801c31409d4914c99946ee5533f5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -271,11 +271,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
 
	spin_lock_bh(&sctp_assocs_id_lock);
	asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id);
+   if (asoc && (asoc->base.sk != sk || asoc->base.dead))
+   asoc = NULL;
	spin_unlock_bh(&sctp_assocs_id_lock);
 
-   if (!asoc || (asoc->base.sk != sk) || asoc->base.dead)
-   return NULL;
-
return asoc;
 }
 
-- 
2.17.1



Re: [PATCH net] sctp: use the pmtu from the icmp packet to update transport pathmtu

2018-10-15 Thread Marcelo Ricardo Leitner
On Mon, Oct 15, 2018 at 07:58:29PM +0800, Xin Long wrote:
> Other than asoc pmtu sync from all transports, sctp_assoc_sync_pmtu
> is also processing transport pmtu_pending by icmp packets. But it's
> meaningless to use sctp_dst_mtu(t->dst) as new pmtu for a transport.
> 
> The right pmtu value should come from the icmp packet, and it would
> be saved into transport->mtu_info in this patch and used later when
> the pmtu sync happens in sctp_sendmsg_to_asoc or sctp_packet_config.
> 
> Besides, without this patch, as pmtu can only be updated correctly
> when receiving a icmp packet and no place is holding sock lock, it
> will take long time if the sock is busy with sending packets.
> 
> Note that it doesn't process transport->mtu_info in .release_cb(),
> as there is no enough information for pmtu update, like for which
> asoc or transport. It is not worth traversing all asocs to check
> pmtu_pending. So unlike tcp, sctp does this in tx path, for which
> mtu_info needs to be atomic_t.
> 
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  include/net/sctp/structs.h | 2 ++
>  net/sctp/associola.c   | 3 ++-
>  net/sctp/input.c   | 1 +
>  net/sctp/output.c  | 6 ++
>  4 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 28a7c8e..a11f937 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -876,6 +876,8 @@ struct sctp_transport {
>   unsigned long sackdelay;
>   __u32 sackfreq;
>  
> + atomic_t mtu_info;
> +
>   /* When was the last time that we heard from this transport? We use
>* this to pick new active and retran paths.
>*/
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 297d9cf..a827a1f 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1450,7 +1450,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
>   /* Get the lowest pmtu of all the transports. */
>   list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
>   if (t->pmtu_pending && t->dst) {
> - sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
> + sctp_transport_update_pmtu(t,
> +                                    atomic_read(&t->mtu_info));
>   t->pmtu_pending = 0;
>   }
>   if (!pmtu || (t->pathmtu < pmtu))
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index 9bbc5f9..5c36a99 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -395,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
>   return;
>  
>   if (sock_owned_by_user(sk)) {
> + atomic_set(&t->mtu_info, pmtu);
>   asoc->pmtu_pending = 1;
>   t->pmtu_pending = 1;
>   return;
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index 7f849b0..67939ad 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
>   sctp_assoc_sync_pmtu(asoc);
>   }
>  
> + if (asoc->pmtu_pending) {
> + if (asoc->param_flags & SPP_PMTUD_ENABLE)
> + sctp_assoc_sync_pmtu(asoc);
> + asoc->pmtu_pending = 0;
> + }
> +
>   /* If there a is a prepend chunk stick it on the list before
>* any other chunks get appended.
>*/
> -- 
> 2.1.0
> 


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-20 Thread Marcelo Ricardo Leitner
On Thu, Sep 20, 2018 at 10:14:10AM +0200, Johannes Berg wrote:
> Anyway - we got into this discussion because of all the extra recursion
> stuff I was adding. With the change suggested by David we don't need
> that now at all, so I guess it'd be better to propose a patch if you (or
> perhaps I will see a need later) need such a facility for multiple
> messages or multiple message levels?

Yep! 

Thanks,
  Marcelo


Re: [PATCH net] sctp: update dst pmtu with the correct daddr

2018-09-20 Thread Marcelo Ricardo Leitner
On Thu, Sep 20, 2018 at 05:27:28PM +0800, Xin Long wrote:
> When processing pmtu update from an icmp packet, it calls .update_pmtu
> with sk instead of skb in sctp_transport_update_pmtu.
> 
> However for sctp, the daddr in the transport might be different from
> inet_sock->inet_daddr or sk->sk_v6_daddr, which is used to update or
> create the route cache. The incorrect daddr will cause a different
> route cache created for the path.
> 
> So before calling .update_pmtu, inet_sock->inet_daddr/sk->sk_v6_daddr
> should be updated with the daddr in the transport, and update it back
> after it's done.
> 
> The issue has existed since route exceptions introduction.
> 
> Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
> Reported-by: ian.per...@dialogic.com
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/transport.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index 12cac85..033696e 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport 
> *transport, struct sock *sk)
>  bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
>  {
>   struct dst_entry *dst = sctp_transport_dst_check(t);
> + struct sock *sk = t->asoc->base.sk;
>   bool change = true;
>  
>   if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
> @@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
>   pmtu = SCTP_TRUNC4(pmtu);
>  
>   if (dst) {
> - dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
> + struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family);
> + union sctp_addr addr;
> +
> + pf->af->from_sk(&addr, sk);
> + pf->to_sk_daddr(&t->ipaddr, sk);
> + dst->ops->update_pmtu(dst, sk, NULL, pmtu);
> + pf->to_sk_daddr(&addr, sk);
> +
>   dst = sctp_transport_dst_check(t);
>   }
>  
>   if (!dst) {
> - t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
> + t->af_specific->get_dst(t, >saddr, >fl, sk);
>   dst = t->dst;
>   }
>  
> -- 
> 2.1.0
> 


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Marcelo Ricardo Leitner
On Wed, Sep 19, 2018 at 09:19:31PM +0200, Johannes Berg wrote:
> On Wed, 2018-09-19 at 15:46 -0300, Marcelo Ricardo Leitner wrote:
> 
> > >   NL_SET_ERR_MSG(extack, "warning: deprecated command");
> > >   err = nla_parse(..., extack);
> > >   if (err)
> > >   return err;
> > >   /* do something */
> > >   return 0;
> > > 
> > > Here you could consider the message there a warning that's transported
> > > out even if we return 0, but if we return with a failure from
> > > nla_parse() (or nla_validate instead if you wish), then that failure
> > > message "wins".
> > 
> > Agree. This is the core issue here, IMHO. Once out of the context that
> > set the message, we have no way of knowing whether we can or should
> > overwrite the message that is already in there.
> 
> True.
> 
> I'm not really sure I'd go as far as calling it an issue though.

Ok, s/issue/matter in question/.

...
> > > I suppose we could - technically - make that generic, in that we could
> > > have both
> > > 
> > >   NLA_SET_WARN_MSG(extack, "...");
> > >   NLA_SET_ERR_MSG(extack, "...");
> > 
> > I like this.
> 
> I'm not really sure what for though :-)

Hehe :-)

> 
> FWIW, if you do think that there's a need for distinguishing this, then
> I'd argue that perhaps the right way to address this would be to extend
> this all the way to userspace and have two separate attributes for
> errors and warnings in the extended ACK message?

Likely, yes. And perhaps even support multiple messages. That way we
could, for example, parse all attributes and return a list of all the
offending ones, instead of returning one at a time. Net benefit? Not
clear... Over-engineering? Possibly.

I agree with you that in general we should not need this.

> > While for the situation you are describing here, it will set a generic
> > error message in case the inner code didn't do it.
> 
> Yes, but that's not a change in semantics as far as the caller of
> nla_parse/nla_validate is concerned - it'll continue to unconditionally
> set a message if an error occurred, only the internal behaviour as to
> which message seems more relevant is at issue, and the whole recursion
> thing and avoiding an outer one overwriting an inner one is IMHO more
> useful because that's the more specific error.

That's fine. My point here was only to clarify the use case, which
would be used in other places too (like in the example below).

> 
> > Using the semantics of NLA_SET_WARN_MSG and ERR, then WARN would
> > replace other WARNs but not ERRs, and ERR would replace other WARNs
> > too but not other ERRs. All we need to do handle this is a bit in
> > extack saying if the message is considered a warning or not, or an
> > error/fatal message or not.
> 
> I'm still not really sure what the use case for a warning is, so not
> sure I can really comment on this.

Good point. From an iproute POV, a warning is a non-fatal message.
From a user perspective, that probably translates to nothing, because
in the end the command actually worked. :-)

Seriously, I do think it's more of a hint for developers than anyone
else.

> 
> > Okay but we have split parsing of netlink messages and this could be
> > useful in there too:
> > In cls_api.c, tc_new_tfilter() calls nmmsg_parse() and do some
> > processing, and then handle it to a classifier. cls_flower, for
> > example, will then do another parsing. If, for whatever reason, flower
> > failed and did not set an extack msg, tc_new_tfilter() could set a
> > default error message, but at this moment we can't tell if the msg
> > already set was just a warning from the first parsing (or any other
> > code before engaging flower) (which we can overwrite) or if it is a
> > message that flower set (which we should not overwrite).
> > 
> > Hopefully my description is clear.. 8-)
> > 
> > I think this is the same situation as with the nested parsing you're
> > proposing.
> 
> Yes, I admit that's the same, just not part of pure policy checking, but
> open-coded (in a way).
> 
> > Currently it (tc_new_tfilter) doesn't set any default error message,
> > so this issue is/was not noticed.
> 
> True.
> 
> Except that I'd still say that tc_new_tfilter() can very well assume
> that nothing has set a message before it ran, it's invoked directly by
> the core netlink code after all.
> 
> So IMHO we don't have an issue here because there aren't arbitrary
> callers of this and it can't know what the state is; it does in fact
> know very well what the state is when it's called.

Good point.

> 
>

Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Marcelo Ricardo Leitner
On Wed, Sep 19, 2018 at 11:25:17AM +0200, Johannes Berg wrote:
> On Wed, 2018-09-19 at 00:37 -0300, Marcelo Ricardo Leitner wrote:
> 
> > Did you consider indicating the message level, and only overwrite the
> > message that is already in there if the new message level is higher
> > than the current one?
> 
> Hmm, no, I guess I didn't - I'm not even sure I understand what you're
> saying.
> 
> This code in itself generates no "warning" messages; that was just a
> construct we discussed in the NLA_REJECT thread, e.g. if you say (like I
> just also wrote in my reply to Jiri):
> 
>   NL_SET_ERR_MSG(extack, "warning: deprecated command");
>   err = nla_parse(..., extack);
>   if (err)
>   return err;
>   /* do something */
>   return 0;
> 
> Here you could consider the message there a warning that's transported
> out even if we return 0, but if we return with a failure from
> nla_parse() (or nla_validate instead if you wish), then that failure
> message "wins".

Agree. This is the core issue here, IMHO. Once out of the context that
set the message, we have no way of knowing whether we can or should
overwrite the message that is already in there.

> 
> > This way the first to set an Error message will have it, and Warning
> > messages would be overwritten by such if needed. But it also would
> > cause the first warning to be held, and not the last one, as it does
> > today. We want the first error, but the last warning otherwise.
> > 
> > It would not be possible to overwrite if new_msglvl >= cur_msglvl
> > because then it would trigger the initial issue again, so some extra
> > logic would be needed to solve this.
> 
> That sounds way more complex than what I'm doing now?

Yes, but hopefully it would be a clearer way of expressing how it
is/should be handled.

> 
> Note, like I said above, this isn't *generic* in any way. This code here
> will only ever set error messages that should "win".
> 
> I suppose we could - technically - make that generic, in that we could
> have both
> 
>   NLA_SET_WARN_MSG(extack, "...");
>   NLA_SET_ERR_MSG(extack, "...");

I like this.

> 
> and keep track of warning vs. error; however, just like my first version
> of the NLA_REJECT patch, that would break existing code.

Hm, I may have missed something but I think the discussion in there
was for a different context. For an extack msg to already be set by the
time the validate_nla() call returns in nla_parse(), the previous message
had to be a "warning", because otherwise the parsing wouldn't even be
attempted. So in that case, we are safe to simply overwrite it.

While for the situation you are describing here, it will set a generic
error message in case the inner code didn't do it.

Using the semantics of NLA_SET_WARN_MSG and ERR, then WARN would
replace other WARNs but not ERRs, and ERR would replace other WARNs
too but not other ERRs. All we need to handle this is a bit in
extack saying whether the message is considered a warning or an
error/fatal message.
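
A minimal sketch of that idea, assuming a spare level field in struct
netlink_ext_ack (the msg_level field and the macro names below are
hypothetical, not existing kernel API):

#define NL_MSG_LVL_WARN	1
#define NL_MSG_LVL_ERR	2

/* overwriting is allowed as long as the stored message is not an
 * error: WARN replaces WARN, ERR replaces WARN, nothing replaces ERR
 */
#define NL_SET_MSG_LVL(extack, msg, lvl)			\
	do {							\
		if ((extack) &&					\
		    (extack)->msg_level < NL_MSG_LVL_ERR) {	\
			(extack)->_msg = (msg);			\
			(extack)->msg_level = (lvl);		\
		}						\
	} while (0)

#define NLA_SET_WARN_MSG(extack, msg) \
	NL_SET_MSG_LVL(extack, msg, NL_MSG_LVL_WARN)
#define NLA_SET_ERR_MSG(extack, msg) \
	NL_SET_MSG_LVL(extack, msg, NL_MSG_LVL_ERR)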

> 
> I also don't think that we actually *need* this complexity in general.
> It should almost always be possible (and realistically, pretty easy) to
> structure your code in a way that warning messages only go out if no
> error message overwrites them. The only reason we were ever even
> discussing this was that in NLA_REJECT I missed the fact that somebody
> could've set a message before and thus would keep it rather than
> overwrite it, which was a change in behaviour.

Okay but we have split parsing of netlink messages and this could be
useful in there too:
In cls_api.c, tc_new_tfilter() calls nlmsg_parse() and does some
processing, and then hands it to a classifier. cls_flower, for
example, will then do another parsing. If, for whatever reason, flower
failed and did not set an extack msg, tc_new_tfilter() could set a
default error message, but at this moment we can't tell if the msg
already set was just a warning from the first parsing (or any other
code before engaging flower) (which we can overwrite) or if it is a
message that flower set (which we should not overwrite).

Hopefully my description is clear.. 8-)

I think this is the same situation as with the nested parsing you're
proposing.

Currently it (tc_new_tfilter) doesn't set any default error message,
so this issue is/was not noticed.

> 
> Now, with this patch, all I'm doing is changing the internal behaviour
> of nla_parse/nla_validate - externally, it still overwrites any existing
> message if an error occurs, but internally it keeps the inner-most
> error.
> 
> Why is this? Consider this:
> 
> static const struct nla_policy inner_policy[] = {
>   [INNER_FLAG] = { .type = NLA_REJECT,
>   

Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-18 Thread Marcelo Ricardo Leitner
On Tue, Sep 18, 2018 at 03:12:11PM +0200, Johannes Berg wrote:
> From: Johannes Berg 
> 
> In one of my previous patches in this area I introduced code
> to pass out just the error message to store in the extack, for
> use in NLA_REJECT.
> 
> Change this code now to set both the error message and the bad
> attribute pointer, and carry around a boolean indicating that
> the values have been set.
> 
> This will be used in the next patch to allow recursive validation
> of nested policies, while preserving the innermost error message
> rather than overwriting it with a generic out-level message.

Did you consider indicating the message level, and only overwriting the
message that is already in there if the new message level is higher
than the current one?

This way the first to set an Error message will have it, and Warning
messages would be overwritten by such if needed. But it also would
cause the first warning to be held, and not the last one, as it does
today. We want the first error, but the last warning otherwise.

It would not be possible to simply overwrite whenever new_msglvl >= cur_msglvl
because then it would trigger the initial issue again, so some extra
logic would be needed to solve this.

  Marcelo


Re: [PATCH v2 2/2] netlink: add ethernet address policy types

2018-09-17 Thread Marcelo Ricardo Leitner
On Mon, Sep 17, 2018 at 11:57:29AM +0200, Johannes Berg wrote:
> From: Johannes Berg 
> 
> Commonly, ethernet addresses are just using a policy of
>   { .len = ETH_ALEN }
> which leaves userspace free to send more data than it should,
> which may hide bugs.
> 
> Introduce NLA_EXACT_LEN which checks for exact size, rejecting
> the attribute if it's not exactly that length. Also add
> NLA_EXACT_LEN_WARN which requires the minimum length and will
> warn on longer attributes, for backward compatibility.
> 
> Use these to define NLA_POLICY_ETH_ADDR (new strict policy) and
> NLA_POLICY_ETH_ADDR_COMPAT (compatible policy with warning);
> these are used like this:
> 
> static const struct nla_policy [...] = {
> [NL_ATTR_NAME] = NLA_POLICY_ETH_ADDR,
> ...
> };
> 
> Signed-off-by: Johannes Berg 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> v2: add only NLA_EXACT_LEN/NLA_EXACT_LEN_WARN and build on top
> of that for ethernet address validation, so it can be extended
> for other types (e.g. IPv6 addresses)
> ---
>  include/net/netlink.h | 13 +
>  lib/nlattr.c  |  8 +++-
>  2 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index b318b0a9f6c3..318b1ded3833 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -181,6 +181,8 @@ enum {
>   NLA_S64,
>   NLA_BITFIELD32,
>   NLA_REJECT,
> + NLA_EXACT_LEN,
> + NLA_EXACT_LEN_WARN,
>   __NLA_TYPE_MAX,
>  };
>  
> @@ -211,6 +213,10 @@ enum {
>   * just like "All other"
>   *NLA_BITFIELD32   Unused
>   *NLA_REJECT   Unused
> + *NLA_EXACT_LEN   Attribute must have exactly this length, otherwise
> + * it is rejected.
> + *NLA_EXACT_LEN_WARN   Attribute should have exactly this length, a warning
> + * is logged if it is longer, shorter is rejected.
>   *All other   Minimum length of attribute payload
>   *
>   * Meaning of `validation_data' field:
> @@ -236,6 +242,13 @@ struct nla_policy {
>   void*validation_data;
>  };
>  
> +#define NLA_POLICY_EXACT_LEN(_len)   { .type = NLA_EXACT_LEN, .len = _len }
> +#define NLA_POLICY_EXACT_LEN_WARN(_len)  { .type = NLA_EXACT_LEN_WARN, \
> +   .len = _len }
> +
> +#define NLA_POLICY_ETH_ADDR  NLA_POLICY_EXACT_LEN(ETH_ALEN)
> +#define NLA_POLICY_ETH_ADDR_COMPAT   NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
> +
>  /**
>   * struct nl_info - netlink source information
>   * @nlh: Netlink message header of original request
> diff --git a/lib/nlattr.c b/lib/nlattr.c
> index 36d74b079151..bb6fe5ed4ecf 100644
> --- a/lib/nlattr.c
> +++ b/lib/nlattr.c
> @@ -82,12 +82,18 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
>  
>   BUG_ON(pt->type > NLA_TYPE_MAX);
>  
> - if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
> + if ((nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) ||
> + (pt->type == NLA_EXACT_LEN_WARN && attrlen != pt->len)) {
>   pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
>   current->comm, type);
>   }
>  
>   switch (pt->type) {
> + case NLA_EXACT_LEN:
> + if (attrlen != pt->len)
> + return -ERANGE;
> + break;
> +
>   case NLA_REJECT:
>   if (pt->validation_data && error_msg)
>   *error_msg = pt->validation_data;
> -- 
> 2.14.4
> 
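
For reference, the changelog note about extending this to other
fixed-size types could look roughly like this sketch (the IPv6 macro
and the attribute names are illustrative, not part of the patch):

#define NLA_POLICY_IN6_ADDR	NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr))

static const struct nla_policy my_policy[ATTR_MAX + 1] = {
	[ATTR_MAC]	= NLA_POLICY_ETH_ADDR,
	[ATTR_ADDR6]	= NLA_POLICY_IN6_ADDR,
};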


Re: [PATCH v2 1/2] netlink: add NLA_REJECT policy type

2018-09-17 Thread Marcelo Ricardo Leitner
On Mon, Sep 17, 2018 at 11:57:28AM +0200, Johannes Berg wrote:
> From: Johannes Berg 
> 
> In some situations some netlink attributes may be used for output
> only (kernel->userspace) or may be reserved for future use. It's
> then helpful to be able to prevent userspace from using them in
> messages sent to the kernel, since they'd otherwise be ignored and
> any future use will become impossible if this happens.
> 
> Add NLA_REJECT to the policy which does nothing but reject (with
> EINVAL) validation of any messages containing this attribute.
> Allow for returning a specific extended ACK error message in the
> validation_data pointer.
> 
> While at it clear up the documentation a bit - the NLA_BITFIELD32
> documentation was added to the list of len field descriptions.
> 
> Also, use NL_SET_BAD_ATTR() in one place where it's open-coded.
> 
> The specific case I have in mind now is a shared nested attribute
> containing request/response data, and it would be pointless and
> potentially confusing to have userspace include response data in
> the messages that actually contain a request.
> 
> Signed-off-by: Johannes Berg 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> v2: preserve behaviour of overwriting the extack message, with
> either the generic or the specific one now
> ---
>  include/net/netlink.h | 13 -
>  lib/nlattr.c  | 23 ---
>  2 files changed, 28 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index 0c154f98e987..b318b0a9f6c3 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -180,6 +180,7 @@ enum {
>   NLA_S32,
>   NLA_S64,
>   NLA_BITFIELD32,
> + NLA_REJECT,
>   __NLA_TYPE_MAX,
>  };
>  
> @@ -208,9 +209,19 @@ enum {
>   *NLA_MSECS   Leaving the length field zero will verify the
>   * given type fits, using it verifies minimum length
>   * just like "All other"
> - *NLA_BITFIELD32  A 32-bit bitmap/bitselector attribute
> + *NLA_BITFIELD32   Unused
> + *NLA_REJECT   Unused
>   *All other   Minimum length of attribute payload
>   *
> + * Meaning of `validation_data' field:
> + *NLA_BITFIELD32   This is a 32-bit bitmap/bitselector attribute and
> + * validation data must point to a u32 value of valid
> + * flags
> + *NLA_REJECT   This attribute is always rejected and validation data
> + * may point to a string to report as the error instead
> + * of the generic one in extended ACK.
> + *All other   Unused
> + *
>   * Example:
>   * static const struct nla_policy my_policy[ATTR_MAX+1] = {
>   *   [ATTR_FOO] = { .type = NLA_U16 },
> diff --git a/lib/nlattr.c b/lib/nlattr.c
> index e335bcafa9e4..36d74b079151 100644
> --- a/lib/nlattr.c
> +++ b/lib/nlattr.c
> @@ -69,7 +69,8 @@ static int validate_nla_bitfield32(const struct nlattr *nla,
>  }
>  
>  static int validate_nla(const struct nlattr *nla, int maxtype,
> - const struct nla_policy *policy)
> + const struct nla_policy *policy,
> + const char **error_msg)
>  {
>   const struct nla_policy *pt;
>   int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
> @@ -87,6 +88,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
>   }
>  
>   switch (pt->type) {
> + case NLA_REJECT:
> + if (pt->validation_data && error_msg)
> + *error_msg = pt->validation_data;
> + return -EINVAL;
> +
>   case NLA_FLAG:
>   if (attrlen > 0)
>   return -ERANGE;
> @@ -180,11 +186,10 @@ int nla_validate(const struct nlattr *head, int len, int maxtype,
>   int rem;
>  
>   nla_for_each_attr(nla, head, len, rem) {
> - int err = validate_nla(nla, maxtype, policy);
> + int err = validate_nla(nla, maxtype, policy, NULL);
>  
>   if (err < 0) {
> - if (extack)
> - extack->bad_attr = nla;
> + NL_SET_BAD_ATTR(extack, nla);
>   return err;
>   }
>   }
> @@ -250,11 +255,15 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
>   u16 type = nla_type(nla);
>  
>   if (type > 0 && type <= maxtype) {
> + static const char _msg[] = &
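
For reference, using NLA_REJECT with a custom extack message in a
policy looks roughly like this sketch (the attribute names are
hypothetical):

static const struct nla_policy my_policy[ATTR_MAX + 1] = {
	[ATTR_REQUEST]	= { .type = NLA_U32 },
	/* output-only attribute: reject it on input and report a
	 * specific message instead of the generic extack one
	 */
	[ATTR_RESPONSE]	= { .type = NLA_REJECT,
			    .validation_data = "can't set response data" },
};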

Re: [PATCH net 0/2] sctp: two fixes for spp_ipv6_flowlabel and spp_dscp sockopts

2018-09-03 Thread Marcelo Ricardo Leitner
On Mon, Sep 03, 2018 at 03:47:09PM +0800, Xin Long wrote:
> This patchset fixes two problems in sctp_apply_peer_addr_params()
> when setting spp_ipv6_flowlabel or spp_dscp.
> 
> Xin Long (2):
>   sctp: fix invalid reference to the index variable of the iterator
>   sctp: not traverse asoc trans list if non-ipv6 trans exists for
> ipv6_flowlabel
> 
>  net/sctp/socket.c | 34 +++---
>  1 file changed, 19 insertions(+), 15 deletions(-)

Series
Acked-by: Marcelo Ricardo Leitner 


Re: phys_port_id in switchdev mode?

2018-09-03 Thread Marcelo Ricardo Leitner
On Sat, Sep 01, 2018 at 01:34:12PM +0200, Jakub Kicinski wrote:
> On Fri, 31 Aug 2018 17:13:22 -0300, Marcelo Ricardo Leitner wrote:
> > On Tue, Aug 28, 2018 at 08:43:51PM +0200, Jakub Kicinski wrote:
> > > Ugh, CC: netdev..
> > > 
> > > On Tue, 28 Aug 2018 20:05:39 +0200, Jakub Kicinski wrote:  
> > > > Hi!
> > > > 
> > > > I wonder if we can use phys_port_id in switchdev to group together
> > > > interfaces of a single PCI PF?  Here is the problem:  
> > 
> > On Mellanox cards, this is already possible via phys_switch_id, as
> > each PF has its own phys_switch_id. So all VFs with a given
> > phys_switch_id belong to the PF with that same phys_switch_id.
> 
> You mean Connect-X4 and on, Connect-X3 also shares PF between ports.

Yes ConnectX-3 shares PF between ports but doesn't support switchdev
mode.

I see the issue now. I was still considering the external ports as
uplink representors.


Re: phys_port_id in switchdev mode?

2018-08-31 Thread Marcelo Ricardo Leitner
On Tue, Aug 28, 2018 at 08:43:51PM +0200, Jakub Kicinski wrote:
> Ugh, CC: netdev..
> 
> On Tue, 28 Aug 2018 20:05:39 +0200, Jakub Kicinski wrote:
> > Hi!
> > 
> > I wonder if we can use phys_port_id in switchdev to group together
> > interfaces of a single PCI PF?  Here is the problem:

On Mellanox cards, this is already possible via phys_switch_id, as
each PF has its own phys_switch_id. So all VFs with a given
phys_switch_id belong to the PF with that same phys_switch_id.

I understand this is a vendor-specific design, but if you have the
same phys_switch_id across PFs, does it really matter which PF the
VF was created on?

  Marcelo


Re: [PATCH net] sctp: remove useless start_fail from sctp_ht_iter in proc

2018-08-27 Thread Marcelo Ricardo Leitner
On Mon, Aug 27, 2018 at 06:40:18PM +0800, Xin Long wrote:
> After changing rhashtable_walk_start to return void, start_fail would
> never be set other value than 0, and the checking for start_fail is
> pointless, so remove it.
> 
> Fixes: 97a6ec4ac021 ("rhashtable: Change rhashtable_walk_start to return void")
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/proc.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> index 4d6f1c8..a644292 100644
> --- a/net/sctp/proc.c
> +++ b/net/sctp/proc.c
> @@ -215,7 +215,6 @@ static const struct seq_operations sctp_eps_ops = {
>  struct sctp_ht_iter {
>   struct seq_net_private p;
>   struct rhashtable_iter hti;
> - int start_fail;
>  };
>  
>  static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
> @@ -224,7 +223,6 @@ static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
>  
>   sctp_transport_walk_start(&iter->hti);
>  
> - iter->start_fail = 0;
>   return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
>  }
>  
> @@ -232,8 +230,6 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
>  {
>   struct sctp_ht_iter *iter = seq->private;
>  
> - if (iter->start_fail)
> - return;
>   sctp_transport_walk_stop(&iter->hti);
>  }
>  
> -- 
> 2.1.0
> 


Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_transport_get_next

2018-08-27 Thread Marcelo Ricardo Leitner
On Mon, Aug 27, 2018 at 06:38:31PM +0800, Xin Long wrote:
> As Marcelo noticed, in sctp_transport_get_next, it is iterating over
> transports but then also accessing the association directly, without
> checking any refcnts before that, which can cause a use-after-free
> Read.
> 
> So fix it by holding transport before accessing the association. With
> that, sctp_transport_hold calls can be removed in the later places.
> 
> Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and reuse some for proc")
> Reported-by: syzbot+fe62a0c9aa6a85c6d...@syzkaller.appspotmail.com
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/proc.c   |  4 
>  net/sctp/socket.c | 22 +++---
>  2 files changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> index ef5c9a8..4d6f1c8 100644
> --- a/net/sctp/proc.c
> +++ b/net/sctp/proc.c
> @@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
>   }
>  
>   transport = (struct sctp_transport *)v;
> - if (!sctp_transport_hold(transport))
> - return 0;
>   assoc = transport->asoc;
>   epb = &assoc->base;
>   sk = epb->sk;
> @@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
>   }
>  
>   transport = (struct sctp_transport *)v;
> - if (!sctp_transport_hold(transport))
> - return 0;
>   assoc = transport->asoc;
>  
>   list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index e96b15a..aa76586 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -5005,9 +5005,14 @@ struct sctp_transport *sctp_transport_get_next(struct net *net,
>   break;
>   }
>  
> + if (!sctp_transport_hold(t))
> + continue;
> +
>   if (net_eq(sock_net(t->asoc->base.sk), net) &&
>   t->asoc->peer.primary_path == t)
>   break;
> +
> + sctp_transport_put(t);
>   }
>  
>   return t;
> @@ -5017,13 +5022,18 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net,
> struct rhashtable_iter *iter,
> int pos)
>  {
> - void *obj = SEQ_START_TOKEN;
> + struct sctp_transport *t;
>  
> - while (pos && (obj = sctp_transport_get_next(net, iter)) &&
> -!IS_ERR(obj))
> - pos--;
> + if (!pos)
> + return SEQ_START_TOKEN;
>  
> - return obj;
> + while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
> + if (!--pos)
> + break;
> + sctp_transport_put(t);
> + }
> +
> + return t;
>  }
>  
>  int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
> @@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
>  
>   tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
>   for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
> - if (!sctp_transport_hold(tsp))
> - continue;
>   ret = cb(tsp, p);
>   if (ret)
>   break;
> -- 
> 2.1.0
> 


Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()

2018-08-10 Thread Marcelo Ricardo Leitner
On Fri, Aug 10, 2018 at 08:03:51PM +0300, Konstantin Khorenko wrote:
> On 08/09/2018 11:43 AM, Konstantin Khorenko wrote:
> > On 08/04/2018 02:36 AM, Marcelo Ricardo Leitner wrote:
> > > On Fri, Aug 03, 2018 at 07:21:00PM +0300, Konstantin Khorenko wrote:
> > > ...
> > > > Performance results:
> > > > 
> > > >   * Kernel: v4.18-rc6 - stock and with 2 patches from Oleg (earlier in 
> > > > this thread)
> > > >   * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz
> > > >   RAM: 32 Gb
> > > > 
> > > >   * netperf: taken from https://github.com/HewlettPackard/netperf.git,
> > > >  compiled from sources with sctp support
> > > >   * netperf server and client are run on the same node
> > > >   * ip link set lo mtu 1500
> > > > 
> > > > The script used to run tests:
> > > >  # cat run_tests.sh
> > > >  #!/bin/bash
> > > > 
> > > > for test in SCTP_STREAM SCTP_STREAM_MANY SCTP_RR SCTP_RR_MANY; do
> > > >   echo "TEST: $test";
> > > >   for i in `seq 1 3`; do
> > > > echo "Iteration: $i";
> > > > set -x
> > > > netperf -t $test -H localhost -p 2 -S 20,20 -s 
> > > > 20,20 \
> > > > -l 60 -- -m 1452;
> > > > set +x
> > > >   done
> > > > done
> > > > 
> > > > 
> > > > Results (a bit reformatted to be more readable):
> > > ...
> > > 
> > > Nice, good numbers.
> > > 
> > > I'm missing some test that actually uses more than 1 stream. All tests
> > > in netperf use only 1 stream. They can use 1 or Many associations on
> > > a socket, but not multiple streams. That means the numbers here show
> > > that we shouldn't see any regression on the more traditional uses, per
> > > Michael's reply on the other email, but it is not testing how it will
> > > behave if we go crazy and use the 64k streams (worst case).
> > > 
> > > You'll need some other tool to test it. One idea is sctp_test, from
> > > lksctp-tools. Something like:
> > > 
> > > Server side:
> > >   ./sctp_test -H 172.0.0.1 -P 2 -l -d 0
> > > Client side:
> > >   time ./sctp_test -H 172.0.0.1 -P 1 \
> > >   -h 172.0.0.1 -p 2 -s \
> > >   -c 1 -M 65535 -T -t 1 -x 10 -d 0
> > > 
> > > And then measure the difference on how long each test took. Can you
> > > get these too?
> > > 
> > > Interesting that on my laptop just starting this test for the first
> > > time can take some *seconds*. Seems the kernel had a hard time
> > > defragmenting the memory here. :)
> 
> Hi Marcelo,
> 
> got 3 of 4 results, please take a look, but i failed to measure running
> the test on stock kernel when memory is fragmented, test fails with
> *** connect:  Cannot allocate memory ***

Hah, okay.

> 
> 
> Performance results:
> 
>   * Kernel: v4.18-rc8 - stock and with 2 patches v3
>   * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz
>   RAM: 32 Gb
> 
>   * sctp_test: https://github.com/sctp/lksctp-tools
>   * both server and client are run on the same node
>   * ip link set lo mtu 1500
>   * sysctl -w vm.max_map_count=6553 (need it to make memory fragmented)
> 
> The script used to run tests:
> =
> # cat run_sctp_test.sh
> #!/bin/bash
> 
> set -x
> 
> uname -r
> ip link set lo mtu 1500
> swapoff -a
> 
> free
> cat /proc/buddyinfo
> 
> ./src/apps/sctp_test -H 127.0.0.1 -P 2 -l -d 0 &
> sleep 3
> 
> time ./src/apps/sctp_test -H 127.0.0.1 -P 1 -h 127.0.0.1 -p 2 \
> -s -c 1 -M 65535 -T -t 1 -x 10 -d 0 1>/dev/null
> 
> killall -9 lt-sctp_test
> ===
> 
> Results (a bit reformatted to be more readable):
> 
> 1) ms stock kernel v4.18-rc8, no memory fragmentation
> Info about memory - more or less same to iterations:
> # free
>                total        used        free      shared  buff/cache   available
> Mem:        32906008      213156    32178184         764      514668    32260968
> Swap:              0           0           0
> 
> cat /proc/buddyinfo
> Node 0, zone      DMA      0      1      1      0      2      1      1      0      1      1      3
> Node 0, zone  

Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()

2018-08-08 Thread Marcelo Ricardo Leitner
On Mon, Aug 06, 2018 at 09:34:05AM +, David Laight wrote:
> From: Michael Tuexen
> > Sent: 03 August 2018 21:57
> ...
> > >> Given how useless SCTP streams are, does anything actually use
> > >> more than about 4?
> > >
> > > Maybe Michael can help us with that. I'm also curious now.
> > In the context of SIGTRAN I have seen 17 streams...
> 
> Ok, I've seen 17 there as well, 5 is probably more common.
> 
> > In the context of WebRTC I have seen more streams. In general,
> > the streams concept seems to be useful. QUIC has lots of streams.

That means the migration to flex_array should not have a noticeable
impact, as for a small number of streams it will behave just the same
as it does now. (yes, considering the app won't use high-order
stream id's just because)
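
For reference, a rough sketch of what allocation and lookup become with
the flex_array API available at the time (linux/flex_array.h; the
SCTP_MAX_STREAM bound and the error handling here are illustrative):

#include <linux/flex_array.h>

	struct flex_array *out;
	struct sctp_stream_out *o;

	/* no physically contiguous allocation needed; backing pages
	 * are allocated per element range instead
	 */
	out = flex_array_alloc(sizeof(struct sctp_stream_out),
			       SCTP_MAX_STREAM, GFP_KERNEL);
	if (!out)
		return -ENOMEM;
	if (flex_array_prealloc(out, 0, SCTP_MAX_STREAM, GFP_KERNEL)) {
		flex_array_free(out);
		return -ENOMEM;
	}

	/* accesses then go through a lookup instead of plain indexing */
	o = flex_array_get(out, sid);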

...

> 
> Thought
> Could we let the application set large stream-ids, but actually mask them
> down to (say) 32 for the protocol code?

This would require both peers to know about the mapping, as stream ids
must be the same on both sides. It seems better to just adjust
the application and make use of low numbers.

  Marcelo


Re: [PATCH net-next 13/14] net: core: add new/replace rate estimator lock parameter

2018-08-08 Thread Marcelo Ricardo Leitner
On Wed, Aug 08, 2018 at 03:30:41PM +0300, Vlad Buslov wrote:
> On Wed 08 Aug 2018 at 01:37, Marcelo Ricardo Leitner 
>  wrote:
> > On Mon, Aug 06, 2018 at 09:54:24AM +0300, Vlad Buslov wrote:
> >> Extend rate estimator 'new' and 'replace' APIs with additional spinlock
> >> parameter to be used by rtnl-unlocked actions to protect rate_est pointer
> >> from concurrent modification.
> >
> > I'm wondering if this additional parameter is really needed. So far,
> > the only case in which it is not NULL, the same lock that is used to
> > protect the stats is also used in this new parameter.
> >
> > ...
> >
> >> --- a/net/sched/act_police.c
> >> +++ b/net/sched/act_police.c
> >> @@ -138,7 +138,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
> >>  
> >>if (est) {
> >>err = gen_replace_estimator(&police->tcf_bstats, NULL,
> >> -  &police->tcf_rate_est,
> >> +  &police->tcf_rate_est, NULL,
> >>&police->tcf_lock,
> >>NULL, est);
> >
> > Which is here, and this new NULL arg is replaced by &police->tcf_lock
> > in the next patch.
> >
> > Do you foresee a case in which a different lock will be used?
> 
> Not in my use-case, no.
> 
> > Or maybe it is because the current one is explicitly aimed towards the
> > stats?
> 
> Yes, stats lock is only taken when fetching counters. Do you think a
> better approach would be to rely on the fact that, in case of police
> action, the same lock is already passed as stats lock? Having it as a standalone

And the fact that we have no foreseeable user of two different locks.

> argument looked like a cleaner approach to me. If you think this change is
> too much code for very little benefit, I can reuse stats lock.

That's my current thinking, yes. Especially considering the amount of
parameters this function already has, I would refrain from adding yet
another unless really needed.

Maybe s/stats_lock/lock/ in function parameter (struct member doesn't
need to be changed) and doctext:
 * @lock: lock for statistics and control path.

wdyt?

> 
> >
> >   Marcelo
> 
> Thank you for reviewing my code!


Re: [PATCH net-next 13/14] net: core: add new/replace rate estimator lock parameter

2018-08-07 Thread Marcelo Ricardo Leitner
On Mon, Aug 06, 2018 at 09:54:24AM +0300, Vlad Buslov wrote:
> Extend rate estimator 'new' and 'replace' APIs with additional spinlock
> parameter to be used by rtnl-unlocked actions to protect rate_est pointer
> from concurrent modification.

I'm wondering if this additional parameter is really needed. So far,
the only case in which it is not NULL, the same lock that is used to
protect the stats is also used in this new parameter.

...

> --- a/net/sched/act_police.c
> +++ b/net/sched/act_police.c
> @@ -138,7 +138,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
>  
>   if (est) {
>   err = gen_replace_estimator(&police->tcf_bstats, NULL,
> - &police->tcf_rate_est,
> + &police->tcf_rate_est, NULL,
>   &police->tcf_lock,
>   NULL, est);

Which is here, and this new NULL arg is replaced by &police->tcf_lock
in the next patch.

Do you foresee a case in which a different lock will be used?
Or maybe it is because the current one is explicitly aimed towards the
stats?

  Marcelo


Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()

2018-08-03 Thread Marcelo Ricardo Leitner
On Fri, Aug 03, 2018 at 07:21:00PM +0300, Konstantin Khorenko wrote:
...
> Performance results:
> 
>   * Kernel: v4.18-rc6 - stock and with 2 patches from Oleg (earlier in this 
> thread)
>   * Node: CPU (8 cores): Intel(R) Xeon(R) CPU E31230 @ 3.20GHz
>   RAM: 32 Gb
> 
>   * netperf: taken from https://github.com/HewlettPackard/netperf.git,
>compiled from sources with sctp support
>   * netperf server and client are run on the same node
>   * ip link set lo mtu 1500
> 
> The script used to run tests:
>  # cat run_tests.sh
>  #!/bin/bash
> 
> for test in SCTP_STREAM SCTP_STREAM_MANY SCTP_RR SCTP_RR_MANY; do
>   echo "TEST: $test";
>   for i in `seq 1 3`; do
> echo "Iteration: $i";
> set -x
> netperf -t $test -H localhost -p 2 -S 20,20 -s 20,20 \
> -l 60 -- -m 1452;
> set +x
>   done
> done
> 
> 
> Results (a bit reformatted to be more readable):
...

Nice, good numbers.

I'm missing some test that actually uses more than 1 stream. All tests
in netperf use only 1 stream. They can use 1 or Many associations on
a socket, but not multiple streams. That means the numbers here show
that we shouldn't see any regression on the more traditional uses, per
Michael's reply on the other email, but it is not testing how it will
behave if we go crazy and use the 64k streams (worst case).

You'll need some other tool to test it. One idea is sctp_test, from
lksctp-tools. Something like:

Server side: 
./sctp_test -H 172.0.0.1 -P 2 -l -d 0
Client side: 
time ./sctp_test -H 172.0.0.1 -P 1 \
-h 172.0.0.1 -p 2 -s \
-c 1 -M 65535 -T -t 1 -x 10 -d 0

And then measure the difference on how long each test took. Can you
get these too?

Interesting that on my laptop just starting this test for the first
time can take some *seconds*. Seems the kernel had a hard time
defragmenting the memory here. :)

Thanks,
Marcelo


Re: [PATCH v2 1/2] net/sctp: Make wrappers for accessing in/out streams

2018-08-03 Thread Marcelo Ricardo Leitner
On Fri, Aug 03, 2018 at 07:21:01PM +0300, Konstantin Khorenko wrote:
> This patch introduces wrappers for accessing in/out streams indirectly.
> This will enable replacing physically contiguous memory arrays
> of streams with flexible arrays (or maybe any other appropriate
> mechanism) which do memory allocation on a per-page basis.
> 
> Signed-off-by: Oleg Babin 
> Signed-off-by: Konstantin Khorenko 
> 
> ---
> v2 changes:
>  sctp_stream_in() users are updated to provide stream as an argument,
>  sctp_stream_{in,out}_ptr() are now just sctp_stream_{in,out}().
> ---

...

>  
>  struct sctp_stream {
> - struct sctp_stream_out *out;
> - struct sctp_stream_in *in;
> + struct flex_array *out;
> + struct flex_array *in;

If this patch was meant to be a preparation, shouldn't this belong to
the next patch instead?

  Marcelo


Re: [PATCH v2 0/2] net/sctp: Avoid allocating high order memory with kmalloc()

2018-08-03 Thread Marcelo Ricardo Leitner
On Fri, Aug 03, 2018 at 04:43:28PM +, David Laight wrote:
> From: Konstantin Khorenko
> > Sent: 03 August 2018 17:21
> > 
> > Each SCTP association can have up to 65535 input and output streams.
> > For each stream type an array of sctp_stream_in or sctp_stream_out
> > structures is allocated using kmalloc_array() function. This function
> > allocates physically contiguous memory regions, so this can lead
> > to allocation of memory regions of very high order, i.e.:
> ...
> 
> Given how useless SCTP streams are, does anything actually use
> more than about 4?

Maybe Michael can help us with that. I'm also curious now.

  Marcelo


Re: [PATCH net-next v4 1/4] net/sched: user-space can't set unknown tcfa_action values

2018-07-26 Thread Marcelo Ricardo Leitner
Hi,

On Thu, Jul 26, 2018 at 04:34:57PM +0200, Paolo Abeni wrote:
...
> @@ -895,6 +904,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
>   }
>   }
>  
> + if (!tcf_action_valid(a->tcfa_action)) {
> + net_warn_ratelimited("invalid %d action value, using "
> +  "TC_ACT_UNSPEC instead", a->tcfa_action);

Now that it is reporting the error via extack, do we really need this
net_warn?
extack will be shown as a warning by iproute2 even if the command
succeeds.

> + NL_SET_ERR_MSG(extack, "invalid action value, using "
> +"TC_ACT_UNSPEC instead");

Quoted strings shouldn't be broken down into multiple lines..

> + a->tcfa_action = TC_ACT_UNSPEC;
> + }
> +
>   return a;
>  
>  err_mod:
> -- 
> 2.17.1
> 


Re: [PATCH net-next v3 3/5] tc/act: remove unneeded RCU lock in action callback

2018-07-25 Thread Marcelo Ricardo Leitner
On Wed, Jul 25, 2018 at 07:59:48AM -0400, Jamal Hadi Salim wrote:
> On 24/07/18 04:06 PM, Paolo Abeni wrote:
> > Each lockless action currently does its own RCU locking in ->act().
> > This allows using plain RCU accessors, even if the context
> > is really RCU BH.
> > 
> > This change drops the per-action RCU lock, replaces the accessors
> > with the _bh variant, cleans up the surrounding code a bit and documents
> > the RCU status in the relevant header.
> > No functional nor performance change is intended.
> > 
> > The goal of this patch is clarifying that the RCU critical section
> > used by the tc actions extends up to the classifier's caller.
> > 
> 
> This and 2/5 seems to stand on their own merit.

So does 1/5, I think.

> 
> cheers,
> jamal


Re: [PATCH net-next v3 4/5] net/tc: introduce TC_ACT_REINJECT.

2018-07-25 Thread Marcelo Ricardo Leitner
On Wed, Jul 25, 2018 at 09:48:16AM -0700, Cong Wang wrote:
> On Wed, Jul 25, 2018 at 5:27 AM Jamal Hadi Salim  wrote:
> >
> > Those changes were there from the beginning (above patch did
> > not introduce them).
> > IIRC, the reason was to distinguish between policy intended
> > drops and drops because of errors.
> 
> There must be a limit for "overlimit" to make sense. There is
> no limit in mirred action's context, probably there is only
> such a limit in act_police. So, all rest should not touch overlimit.

+1



Re: [PATCH net-next 0/2] net/sctp: Avoid allocating high order memory with kmalloc()

2018-07-24 Thread Marcelo Ricardo Leitner
On Tue, Jul 24, 2018 at 06:35:35PM +0300, Konstantin Khorenko wrote:
> Hi Marcelo,
> 
> pity to abandon Oleg's attempt to avoid high order allocations and use
> flex_array instead, so i tried to do the performance measurements with
> options you kindly suggested.

Nice, thanks!

...
> As we can see single stream tests do not show any noticeable degradation,
> and SCTP_*_MANY tests spread decreased significantly when -S/-s options are 
> used,
> but still too big to consider the performance test pass or fail.
> 
> Can you please advise anything else to try - to decrease the dispersion rate -

In addition, you can also try using a veth tunnel or reducing lo mtu
down to 1500, and also make use of the sctp tests' -m 1452 option
(needs to be after the --).  These will alleviate issues with cwnd handling
that happen on loopback due to the big MTU and minimize issues with
rwnd/buffer size too.

Even with -S, -s, -m and the lower MTU, it is usual to see some
fluctuation, but not that much.

> or can we just consider the values fine and i'm reworking the patch according
> to your comment about sctp_stream_in(asoc, sid)/sctp_stream_in_ptr(stream, sid)
> and that's it?

Ok, thanks. It seems so, yes.

  Marcelo


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-13 Thread Marcelo Ricardo Leitner
On Fri, Jul 13, 2018 at 11:17:18AM -0700, Cong Wang wrote:
...
> > > > > Isn't the whole point of sch_prio offloading the queueing to
> > > > > each class? If you need a limit, there is one for each child
> > > > > qdisc if you use for example pfifo or bfifo (depending on you
> > > > > want to limit bytes or packets).
> > > >
> > > > Yes, but Michel wants to drop from other lower priorities if needed,
> > > > and that's not possible if you handle the limit already in a child
> > > > qdisc as they don't know about their siblings. The idea in the example
> > > > above is to discard it from whatever lower priority is needed, then
> > > > queue it. (ok, the example missed to check the priority level)
> > >
> > > So it disproves your point of adding a flag to sch_prio, right?
> >
> > I don't see how?
> 
> Interesting, you said "Michel wants to drop from other lower
> priorities if needed", but sch_prio has no knowledge of this,
> you confirmed with "...if you handle the limit already in a child
> qdisc as they don't know about their siblings."
> 
> The if clause is true as the limit is indeed handled by its child
> qdiscs as designed.
> 
> Therefore, a simple of adding a flag to sch_prio, as you
> suggested and demonstrated above, doesn't work, as
> confirmed by your own words.

Well, it would help if you didn't cut out key parts of my words.

> 
> What am I missing here?
> 
> Are you go further by suggesting moving the limit out of prio?
> Or are you going to expand your definition of "adding a flag"?
> Perhaps two flags? :)
> 
> I am very open for discussion to see how far we can go.

I am not keen on continuing this discussion if you keep twisting my
words just for fun.


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-13 Thread Marcelo Ricardo Leitner
On Fri, Jul 13, 2018 at 11:26:28AM -0700, Cong Wang wrote:
> On Fri, Jul 13, 2018 at 6:04 AM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Thu, Jul 12, 2018 at 11:05:45PM -0700, Cong Wang wrote:
> > > On Wed, Jul 11, 2018 at 12:33 PM Marcelo Ricardo Leitner
> > >  wrote:
> > > >
> > > > On Tue, Jul 10, 2018 at 07:25:53PM -0700, Cong Wang wrote:
> > > > > On Mon, Jul 9, 2018 at 2:40 PM Marcelo Ricardo Leitner
> > > > >  wrote:
> > > > > >
> > > > > > On Mon, Jul 09, 2018 at 05:03:31PM -0400, Michel Machado wrote:
> > > > > > >Changing TC_PRIO_MAX from 15 to 63 risks breaking backward 
> > > > > > > compatibility
> > > > > > > with applications.
> > > > > >
> > > > > > If done, it needs to be done carefully, indeed. I don't know if it's
> > > > > > doable, neither I know how hard is your requirement for 64 different
> > > > > > priorities.
> > > > >
> > > > > struct tc_prio_qopt {
> > > > > int bands;  /* Number of bands */
> > > > > __u8    priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
> > > > > };
> > > > >
> > > > > How would you do it carefully?
> > > >
> > > > quick shot, multiplex v1 and v2 formats based on bands and sizeof():
> > > >
> > > > #define TCQ_PRIO_BANDS_V1   16
> > > > #define TCQ_PRIO_BANDS_V2   64
> > > > #define TC_PRIO_MAX_V2  64
> > > >
> > > > struct tc_prio_qopt_v2 {
> > > > int bands;  /* Number of bands */
> > > > __u8priomap[TC_PRIO_MAX_V2+1]; /* Map: logical priority -> 
> > > > __u8    priomap[TC_PRIO_MAX_V2+1]; /* Map: logical priority -> PRIO band */
> > > >
> > >
> > > Good try, but:
> > >
> > > 1. You don't take padding into account, although the difference
> > > between 16 and 64 is big here. If it were 16 and 20, almost certainly
> > > wouldn't work.
> >
> > It still would work, no matter how much padding you have, as currently
> > you can't use more than 3 bands.
> 
> I am lost.
> 
> With your proposal above, you have 16 bands for V1 and 64 bands
> for V2, where does 3 come from???

My bad. s/3/16/

> 
> 
> >
> > >
> > > 2. What if I compile a new iproute2 on an old kernel? The iproute2
> > > will use V2, while old kernel has no knowledge of V2, so it only
> > > copies a part of V2 in the end
> >
> > Yes, and that's not a problem:
> > - Either bands is > 3 and it will return EINVAL, protecting from
> >   reading beyond the buffer.
> > - Or 2 <= bands <= 3 and it will handle it as a _v1 struct, and use
> >   only the original size.
> 
> Again why 3 not 16 or 64 ??

Again, s/3/16/

> 
> Also, why does an old kernel have the logic in its binary to determine
> this?

It won't, and it doesn't need to. If you use bands > 16 with an old
kernel, it will reject per current code (that I already pasted):

if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;

Simple as that. If you try to use more bands than it supports, it will
reject it.

> 
> >
> > iproute2 (or other app) may still use _v1 if it wants, btw.
> 
> Yes, old iproute2 must still have v1, what's the point? Are you

??

> suggesting new iproute2 should still have v1 after you propose
> v1 and v2 for kernel?

I'm only saying that both versions will be accepted by a new kernel.

> 
> I must seriously miss something. Please help.
> 
> Thanks!


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-13 Thread Marcelo Ricardo Leitner
On Thu, Jul 12, 2018 at 11:05:45PM -0700, Cong Wang wrote:
> On Wed, Jul 11, 2018 at 12:33 PM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Tue, Jul 10, 2018 at 07:25:53PM -0700, Cong Wang wrote:
> > > On Mon, Jul 9, 2018 at 2:40 PM Marcelo Ricardo Leitner
> > >  wrote:
> > > >
> > > > On Mon, Jul 09, 2018 at 05:03:31PM -0400, Michel Machado wrote:
> > > > >Changing TC_PRIO_MAX from 15 to 63 risks breaking backward 
> > > > > compatibility
> > > > > with applications.
> > > >
> > > > If done, it needs to be done carefully, indeed. I don't know if it's
> > > > doable, neither I know how hard is your requirement for 64 different
> > > > priorities.
> > >
> > > struct tc_prio_qopt {
> > > int bands;  /* Number of bands */
> > > __u8    priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
> > > };
> > >
> > > How would you do it carefully?
> >
> > quick shot, multiplex v1 and v2 formats based on bands and sizeof():
> >
> > #define TCQ_PRIO_BANDS_V1   16
> > #define TCQ_PRIO_BANDS_V2   64
> > #define TC_PRIO_MAX_V2  64
> >
> > struct tc_prio_qopt_v2 {
> > int bands;  /* Number of bands */
> > __u8    priomap[TC_PRIO_MAX_V2+1]; /* Map: logical priority -> PRIO band */
> > };
> >
> 
> Good try, but:
> 
> 1. You don't take padding into account, although the difference
> between 16 and 64 is big here. If it were 16 and 20, almost certainly
> wouldn't work.

It still would work, no matter how much padding you have, as currently
you can't use more than 3 bands.

> 
> 2. What if I compile a new iproute2 on an old kernel? The iproute2
> will use V2, while old kernel has no knowledge of V2, so it only
> copies a part of V2 in the end

Yes, and that's not a problem:
- Either bands is > 3 and it will return EINVAL, protecting from
  reading beyond the buffer.
- Or 2 <= bands <= 3 and it will handle it as a _v1 struct, and use
  only the original size.

iproute2 (or other app) may still use _v1 if it wants, btw.



Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-13 Thread Marcelo Ricardo Leitner
On Thu, Jul 12, 2018 at 10:07:30PM -0700, Cong Wang wrote:
> On Wed, Jul 11, 2018 at 11:37 AM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Tue, Jul 10, 2018 at 07:32:43PM -0700, Cong Wang wrote:
> > > On Mon, Jul 9, 2018 at 12:53 PM Marcelo Ricardo Leitner
> > >  wrote:
> > > >
> > > > On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> > > > >
> > > > >2. sch_prio.c does not have a global limit on the number of 
> > > > > packets on
> > > > > all its queues, only a limit per queue.
> > > >
> > > > It can be useful to sch_prio.c as well, why not?
> > > > prio_enqueue()
> > > > {
> > > > ...
> > > > +   if (count > sch->global_limit)
> > > > +   prio_tail_drop(sch);   /* to be implemented */
> > > > ret = qdisc_enqueue(skb, qdisc, to_free);
> > > >
> > >
> > > Isn't the whole point of sch_prio offloading the queueing to
> > > each class? If you need a limit, there is one for each child
> > > qdisc if you use for example pfifo or bfifo (depending on you
> > > want to limit bytes or packets).
> >
> > Yes, but Michel wants to drop from other lower priorities if needed,
> > and that's not possible if you handle the limit already in a child
> > qdisc as they don't know about their siblings. The idea in the example
> > above is to discard it from whatever lower priority is needed, then
> > queue it. (ok, the example missed to check the priority level)
> 
> So it disproves your point of adding a flag to sch_prio, right?

I don't see how?

> 
> Also, you have to re-introduce qdisc->ops->drop() if you really want
> to go this direction.

Again, yes. What's the deal with it?

> 
> >
> > As for the different units, sch_prio holds a count of how many packets
> > are queued on its children, and that's what would be used for the limit.
> >
> > >
> > > Also, what's your plan for backward compatibility here?
> >
> > say:
> >   if (sch->global_limit && count > sch->global_limit)
> > as in, only do the limit check/enforcing if needed.
> 
> Obviously doesn't work, users could pass 0 to effectively
> disable the qdisc from enqueue'ing any packet.

If you only had considered the right 'limit' variable, you would be
right here.



Re: [PATCH iproute2-next v2] net:sched: add action inheritdsfield to skbedit

2018-07-12 Thread Marcelo Ricardo Leitner
On Thu, Jul 12, 2018 at 12:09:26PM -0400, Qiaobin Fu wrote:
> @@ -156,6 +162,9 @@ parse_skbedit(struct action_util *a, int *argc_p, char 
> ***argv_p, int tca_id,
>   if (flags & SKBEDIT_F_PTYPE)
>   addattr_l(n, MAX_MSG, TCA_SKBEDIT_PTYPE,
> , sizeof(ptype));
> + if (pure_flags != 0)
> + addattr_l(n, MAX_MSG, TCA_SKBEDIT_FLAGS,
> + &pure_flags, sizeof(pure_flags));

It is missing 2 spaces  ^--- here, to make the indentation right. (as
in the block above)

  Marcelo


Re: [PATCH net-next] net: sched: fix unprotected access to rcu cookie pointer

2018-07-11 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 11:44:38PM +0300, Vlad Buslov wrote:
> 
> On Mon 09 Jul 2018 at 20:34, Marcelo Ricardo Leitner 
>  wrote:
> > On Mon, Jul 09, 2018 at 08:26:47PM +0300, Vlad Buslov wrote:
> >> Fix action attribute size calculation function to take rcu read lock and
> >> access act_cookie pointer with rcu dereference.
> >> 
> >> Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update")
> >> Reported-by: Marcelo Ricardo Leitner 
> >> Signed-off-by: Vlad Buslov 
> >> ---
> >>  net/sched/act_api.c | 9 +++--
> >>  1 file changed, 7 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> >> index 66dc19746c63..148a89ab789b 100644
> >> --- a/net/sched/act_api.c
> >> +++ b/net/sched/act_api.c
> >> @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
> >>  
> >>  static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
> >>  {
> >> +  struct tc_cookie *act_cookie;
> >>u32 cookie_len = 0;
> >>  
> >> -  if (act->act_cookie)
> >> -  cookie_len = nla_total_size(act->act_cookie->len);
> >> +  rcu_read_lock();
> >> +  act_cookie = rcu_dereference(act->act_cookie);
> >> +
> >> +  if (act_cookie)
> >> +  cookie_len = nla_total_size(act_cookie->len);
> >> +  rcu_read_unlock();
> >
> > I am not sure if this is enough to fix the entire issue. Now it will
> > fetch the length correctly but, what guarantees that when it tries to
> > actually copy the key (tcf_action_dump_1), the same act_cookie pointer
> > will be used? As in, can't the new re-fetch be different/smaller than
> > the object used here?
> 
> I checked the code of nlmsg_put() and similar functions, and they check
> that there is enough free space at skb tailroom. If not, they fail
> gracefully and return error. Am I missing something?

Talked offline with Vlad and I agree that this is fine as is.

Reviewed-by: Marcelo Ricardo Leitner 

Thanks,
Marcelo


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-11 Thread Marcelo Ricardo Leitner
On Tue, Jul 10, 2018 at 07:25:53PM -0700, Cong Wang wrote:
> On Mon, Jul 9, 2018 at 2:40 PM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Mon, Jul 09, 2018 at 05:03:31PM -0400, Michel Machado wrote:
> > >Changing TC_PRIO_MAX from 15 to 63 risks breaking backward 
> > > compatibility
> > > with applications.
> >
> > If done, it needs to be done carefully, indeed. I don't know if it's
> > doable, neither I know how hard is your requirement for 64 different
> > priorities.
> 
> struct tc_prio_qopt {
> int bands;  /* Number of bands */
> __u8    priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
> };
> 
> How would you do it carefully?

quick shot, multiplex v1 and v2 formats based on bands and sizeof():

#define TCQ_PRIO_BANDS_V1   16
#define TCQ_PRIO_BANDS_V2   64
#define TC_PRIO_MAX_V2  64

struct tc_prio_qopt_v2 {
int bands;  /* Number of bands */
__u8    priomap[TC_PRIO_MAX_V2+1]; /* Map: logical priority -> PRIO band */
};

static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 struct netlink_ext_ack *extack)
{
struct prio_sched_data *q = qdisc_priv(sch);
struct Qdisc *queues[TCQ_PRIO_BANDS_V2];
int oldbands = q->bands, i;
struct tc_prio_qopt_v2 *qopt;

if (nla_len(opt) < sizeof(int))
return -EINVAL;
qopt = nla_data(opt);

if (qopt->bands <= TCQ_PRIO_BANDS_V1 &&
nla_len(opt) < sizeof(struct tc_prio_qopt))
return -EINVAL;

if (qopt->bands <= TCQ_PRIO_BANDS_V2 &&
nla_len(opt) < sizeof(*qopt))
return -EINVAL;

/* By here, if it has up to 3 bands, we can assume it is using the _v1
 * layout, while if it has up to TCQ_PRIO_BANDS_V2 it is using the _v2
 * format.
 */

if (qopt->bands > TCQ_PRIO_BANDS_V2 || qopt->bands < 2)
return -EINVAL;
...

With something like this I think it can keep compatibility with old
software while also allowing the new usage.

> Also, it is not only used by prio but also pfifo_fast.

Yes. More is needed, indeed. prio2band would also need to be expanded,
etc. Yet, I still don't see any blocker.



Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-11 Thread Marcelo Ricardo Leitner
On Tue, Jul 10, 2018 at 07:32:43PM -0700, Cong Wang wrote:
> On Mon, Jul 9, 2018 at 12:53 PM Marcelo Ricardo Leitner
>  wrote:
> >
> > On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> > >
> > >2. sch_prio.c does not have a global limit on the number of packets on
> > > all its queues, only a limit per queue.
> >
> > It can be useful to sch_prio.c as well, why not?
> > prio_enqueue()
> > {
> > ...
> > +   if (count > sch->global_limit)
> > +   prio_tail_drop(sch);   /* to be implemented */
> > ret = qdisc_enqueue(skb, qdisc, to_free);
> >
> 
> Isn't the whole point of sch_prio offloading the queueing to
> each class? If you need a limit, there is one for each child
> qdisc if you use for example pfifo or bfifo (depending on you
> want to limit bytes or packets).

Yes, but Michel wants to drop from other lower priorities if needed,
and that's not possible if you handle the limit already in a child
qdisc as they don't know about their siblings. The idea in the example
above is to discard it from whatever lower priority is needed, then
queue it. (ok, the example missed to check the priority level)

As for the different units, sch_prio holds a count of how many packets
are queued on its children, and that's what would be used for the limit.

> 
> Also, what's your plan for backward compatibility here?

say:
  if (sch->global_limit && count > sch->global_limit)
as in, only do the limit check/enforcing if needed.



Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-10 Thread Marcelo Ricardo Leitner
On Tue, Jul 10, 2018 at 10:03:22AM -0400, Michel Machado wrote:
...
> > You can get 64 different priorities by stacking sch_prio, btw. And if
> > you implement drop_from_tail() as part of Qdisc, you can even get it
> > working for this cascading case too.
> 
>A solution would be to add another flag to switch between the current
> prio_classify() and a new one to just use skb->priority as in skbprio. This

Sounds promising.

> way we don't risk breaking applications that rely on tcf_classify() and this
> odd behavior that I found in prio_classify():
> 
> band = TC_H_MIN(band) - 1;
> if (band >= q->bands)
> return q->queues[q->prio2band[0]];
> return q->queues[band];
> 
>When band is zero, it returns q->queues[q->prio2band[0]] instead of
> q->queues[band] as it would for other bands less than q->bands.

Agreed, this looks odd. It came from 1d8ae3fdeb00 ("pkt_sched: Remove
RR scheduler."):

band = TC_H_MIN(band) - 1;
if (band >= q->bands)
-   band = q->prio2band[0];
-out:
-   if (q->mq)
-   skb_set_queue_mapping(skb, band);
+   return q->queues[q->prio2band[0]];
+
return q->queues[band];
 }

I can see how it made sense before the change, but not after.

> 
> > > > >  3. The queues of sch_prio.c are struct Qdisc, which don't have a 
> > > > > method
> > > > > to drop at its tail.
> > > > 
> > > > That can be implemented, most likely as prio_tail_drop() as above.
> > > 
> > > struct Qdisc represents *all* qdiscs. My knowledge of the other 
> > > qdiscs is
> > > limited, but not all qdiscs may have a meaningful method to drop at the
> > > tail. For example: a qdisc that works over flows may not know with flow is
> > 
> > True, but it doesn't mean you have to implement it for all available qdiscs.
> 
>If it is not implemented for all available qdiscs and the flag to drop at
> the tail is on, sch_prio.c would need to issue a log message whenever a
> packet goes into one of the subqueues that don't drop at the tail and have a
> failsafe behavior.

That's fine. pr_warn_ratelimit() is probably what we need for logging
the error, so it a) doesn't flood kernel log and b) gets activated
even if the sysadmin later try again with another qdisc (as opposed to
pr_warn_once).

For the failsafe behavior, it probably can then just drop the incoming
packet. It is not what you want, yes, but it's an easy way out of
a non-expected situation and that works well enough.
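
Something along these lines in prio_enqueue(), say (a sketch only:
qdisc_drop() is the existing drop helper, while the global_limit field,
the prio_lowest_nonempty_band() helper and the drop_tail op are all
hypothetical):

	if (sch->q.qlen >= sch->global_limit) {
		/* pick the lowest-priority band that has packets */
		struct Qdisc *child = prio_lowest_nonempty_band(sch);

		if (!child || !child->ops->drop_tail) {
			pr_warn_ratelimited("sch_prio: child can't tail-drop, dropping incoming packet\n");
			return qdisc_drop(skb, sch, to_free);
		}
		child->ops->drop_tail(child);
		sch->q.qlen--;
	}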

> 
> > > the tail. Not to mention that this would be a widespread patch to only
> > > support this new prio qdisc. It would be prudent to wait for the 
> > > production
> > > success of the proposed, self-contained qdisc before making this 
> > > commitment.
> > 
> > On the other hand, by adding another qdisc you're adding more work
> > that one needs to do when dealing with qdisc infrastructure, such as
> > updating enqueue() prototype, for example.
> > 
> > Once this new qdisc is in, it won't be easy to deprecate it.
> 
>We need to choose between (1) having skbprio that has some duplicate code
> with sch_prio.c and (2) adding flags to sch_prio.c and making a major
> refactoring of the scheduler subsystem to add drop at the tail to qdiscs.

Yes,

> 
>I mean major because we are not just talking about adding the method
> dequeue_tail() to struct Qdisc and adding dequeue_tail() to all qdiscs. One

I think it is. :-)

> will need to come up with definitions of dequeue_tail() for qdiscs that
> don't naturally have it and even rewrite the data structures of qdiscs. To
> substantiate this last point, consider sch_fifo.c, one of the simplest
> qdiscs available. sch_fifo.c keeps its packets in sch->q, which is of type
> struct qdisc_skb_head. struct qdisc_skb_head doesn't set skb->prev, so it
> cannot drop at the tail without walking through its list.

Yes but this would only be needed for the qdiscs that you want to
support with this flag. Nobody said you need to implement it on all
qdiscs that we have...

> 
>I do understand the motivation for minimizing duplicate code. But the
> small amount of duplicate code that skbprio adds is cheaper than refactoring
> the scheduler system to only support this new sch_prio.c.

I'm afraid that without the code for option (2) above, this discussion
will become subjective. I'll wait for other opinions here.

Cheers,
  Marcelo


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 05:03:31PM -0400, Michel Machado wrote:
> On 07/09/2018 03:53 PM, Marcelo Ricardo Leitner wrote:
> > On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> > > On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:
> > > > On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:
> > > > > net/sched: add skbprio scheduler
> > > > > 
> > > > > Skbprio (SKB Priority Queue) is a queueing discipline that 
> > > > > prioritizes packets
> > > > > according to their skb->priority field. Under congestion, 
> > > > > already-enqueued lower
> > > > > priority packets will be dropped to make space available for higher 
> > > > > priority
> > > > > packets. Skbprio was conceived as a solution for denial-of-service 
> > > > > defenses that
> > > > > need to route packets with different priorities as a means to 
> > > > > overcome DoS
> > > > > attacks.
> > > > 
> > > > Why can't we implement this as a new flag for sch_prio.c?
> > > > 
> > > > I don't see why this duplication is needed, especially because it will
> > > > only be "slower" (as in, it will do more work) when qdisc is already
> > > > full and dropping packets anyway.
> > > 
> > > sch_prio.c and skbprio diverge on a number of aspects:
> > > 
> > > 1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This is
> > > not just a matter of changing a constant since sch_prio.c doesn't use
> > > skb->priority.
> > 
> > Yes it does use skb->priority for classifying into a band:
> > 
> > prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
> > {
> >  struct prio_sched_data *q = qdisc_priv(sch);
> >  u32 band = skb->priority;
> > ...
> 
>Changing TC_PRIO_MAX from 15 to 63 risks breaking backward compatibility
> with applications.

If done, it needs to be done carefully, indeed. I don't know if it's
doable, neither I know how hard is your requirement for 64 different
priorities.

You can get 64 different priorities by stacking sch_prio, btw. And if
you implement drop_from_tail() as part of Qdisc, you can even get it
working for this cascading case too.

> 
> > > 3. The queues of sch_prio.c are struct Qdisc, which don't have a 
> > > method
> > > to drop at its tail.
> > 
> > That can be implemented, most likely as prio_tail_drop() as above.
> 
>struct Qdisc represents *all* qdiscs. My knowledge of the other qdiscs is
> limited, but not all qdiscs may have a meaningful method to drop at the
> tail. For example: a qdisc that works over flows may not know which flow is

True, but it doesn't mean you have to implement it for all available qdiscs.

> the tail. Not to mention that this would be a widespread patch to only
> support this new prio qdisc. It would be prudent to wait for the production
> success of the proposed, self-contained qdisc before making this commitment.

On the other hand, by adding another qdisc you're adding more work
for everyone dealing with the qdisc infrastructure, such as updating
the enqueue() prototype, for example.

Once this new qdisc is in, it won't be easy to deprecate it.

  Marcelo


Re: [PATCH net-next] net: sched: fix unprotected access to rcu cookie pointer

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 08:26:47PM +0300, Vlad Buslov wrote:
> Fix action attribute size calculation function to take rcu read lock and
> access act_cookie pointer with rcu dereference.
> 
> Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update")
> Reported-by: Marcelo Ricardo Leitner 
> Signed-off-by: Vlad Buslov 
> ---
>  net/sched/act_api.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 66dc19746c63..148a89ab789b 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
>  
>  static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
>  {
> + struct tc_cookie *act_cookie;
>   u32 cookie_len = 0;
>  
> - if (act->act_cookie)
> - cookie_len = nla_total_size(act->act_cookie->len);
> + rcu_read_lock();
> + act_cookie = rcu_dereference(act->act_cookie);
> +
> + if (act_cookie)
> + cookie_len = nla_total_size(act_cookie->len);
> + rcu_read_unlock();

I am not sure if this is enough to fix the entire issue. Now it will
fetch the length correctly but, what guarantees that when it tries to
actually copy the key (tcf_action_dump_1), the same act_cookie pointer
will be used? As in, can't the new re-fetch be different/smaller than
the object used here?
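
To illustrate the concern (hypothetical sketch, not code from the
patch):

	rcu_read_lock();
	act_cookie = rcu_dereference(act->act_cookie);	/* sizing pass */
	if (act_cookie)
		cookie_len = nla_total_size(act_cookie->len);
	rcu_read_unlock();

	/* ... later, tcf_action_dump_1() does its own rcu_dereference();
	 * if a writer replaced act->act_cookie in between, the cookie
	 * dumped there can be larger than the one measured above.
	 */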

>  
>   return  nla_total_size(0) /* action number nested */
>   + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
> -- 
> 2.7.5
> 


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:
> > On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:
> > > net/sched: add skbprio scheduler
> > > 
> > > Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes 
> > > packets
> > > according to their skb->priority field. Under congestion, 
> > > already-enqueued lower
> > > priority packets will be dropped to make space available for higher 
> > > priority
> > > packets. Skbprio was conceived as a solution for denial-of-service 
> > > defenses that
> > > need to route packets with different priorities as a means to overcome DoS
> > > attacks.
> > 
> > Why can't we implement this as a new flag for sch_prio.c?
> > 
> > I don't see why this duplication is needed, especially because it will
> > only be "slower" (as in, it will do more work) when qdisc is already
> > full and dropping packets anyway.
> 
> sch_prio.c and skbprio diverge on a number of aspects:
> 
> 1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This is
> not just a matter of changing a constant since sch_prio.c doesn't use
> skb->priority.

Yes it does use skb->priority for classifying into a band:

prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
struct prio_sched_data *q = qdisc_priv(sch);
u32 band = skb->priority;
...

> 
> 2. sch_prio.c does not have a global limit on the number of packets on
> all its queues, only a limit per queue.

It can be useful to sch_prio.c as well, why not?
prio_enqueue()
{
...
+   if (count > sch->global_limit)
+   prio_tail_drop(sch);   /* to be implemented */
ret = qdisc_enqueue(skb, qdisc, to_free);
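
Spelled out a bit more (hypothetical sketch: neither prio_tail_drop()
nor q->global_limit exist today):

	static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
	{
		struct prio_sched_data *q = qdisc_priv(sch);
		struct Qdisc *qdisc;
		/* ...classify skb into the band's child qdisc... */

		/* over the global limit: make room by dropping from the
		 * tail of the lowest-priority non-empty band, instead of
		 * dropping the (possibly high-priority) incoming packet
		 */
		if (sch->q.qlen >= q->global_limit)
			prio_tail_drop(sch);	/* hypothetical helper */

		return qdisc_enqueue(skb, qdisc, to_free);
	}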

> 
> 3. The queues of sch_prio.c are struct Qdisc, which don't have a method
> to drop at its tail.

That can be implemented, most likely as prio_tail_drop() as above.

> 
> Given the divergences, adding flags to sch_prio.c will essentially keep
> both implementations together instead of being isolated as being proposed.

I don't agree. There aren't that many flags. I see only 2: one that
already makes sense for sch_prio as it is (the global limit), and one
for where it should drop from, the overflowing packet or the tail.

All other code will be reused: stats handling, netlink handling,
enqueue and dequeue at least.

If we add this other qdisc, named as it is, it will be very confusing
to sysadmins: both are named very closely and work essentially in the
same way, but one drops from the tail and the other drops the incoming
packet.

> 
> On the speed point, there may not be a noticeable difference between both
> qdiscs because the enqueueing and dequeueing costs of both qdiscs are O(1).
> Notice that the "extra work" (i.e. dropping lower priority packets) is a key
> aspect of skbprio since it gives routers a cheap way to choose which packets
> to drop during a DoS.

On that I agree. I was more referring to something like: "let's not make
sch_prio slow and implement a new one instead", which I don't think is
valid because the extra "cost" is only visible when it's already
dropping packets. Hopefully it's clearer now :)

[]s
Marcelo


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Marcelo Ricardo Leitner
On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:
> net/sched: add skbprio scheduler
> 
> Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
> according to their skb->priority field. Under congestion, already-enqueued 
> lower
> priority packets will be dropped to make space available for higher priority
> packets. Skbprio was conceived as a solution for denial-of-service defenses 
> that
> need to route packets with different priorities as a means to overcome DoS
> attacks.

Why can't we implement this as a new flag for sch_prio.c?

I don't see why this duplication is needed, especially because it will
only be "slower" (as in, it will do more work) when qdisc is already
full and dropping packets anyway.

  Marcelo


Re: [net-next:master 464/475] net/sched/act_api.c:71:15: sparse: incorrect type in initializer (different address spaces)

2018-07-09 Thread Marcelo Ricardo Leitner
As I pointed out in the other thread, Dave fixed these:

> >> net/sched/act_api.c:71:15: sparse: incorrect type in initializer
> >> (different address spaces) @@ expected struct tc_cookie [noderef]
> >> *__ret @@ got [noderef] *__ret @@
>    net/sched/act_api.c:71:15: expected struct tc_cookie [noderef]
>    *__ret
>    net/sched/act_api.c:71:15: got struct tc_cookie *new_cookie
> >> net/sched/act_api.c:71:13: sparse: incorrect type in assignment (different
> >> address spaces) @@ expected struct tc_cookie *old @@ got struct
> >> tc_cookie [noderef]
>    net/sched/act_api.c:71:13: expected struct tc_cookie *old
>    net/sched/act_api.c:71:13: got struct tc_cookie [noderef]
>    *[assigned] __ret

in 0dbc81eab4d1 ("net: sched: Fix warnings from xchg() on RCU'd cookie
pointer.")
But this one is still there:

> >> net/sched/act_api.c:132:48: sparse: dereference of noderef expression
...
>    127  static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
>    128  {
>    129          u32 cookie_len = 0;
>    130
>    131          if (act->act_cookie)
>  > 132                  cookie_len = nla_total_size(act->act_cookie->len);

It can't be done this way, as act_cookie now is a __rcu var.

>    133
>    134          return  nla_total_size(0) /* action number nested */
>    135                  + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
>    136                  + cookie_len /* TCA_ACT_COOKIE */
>    137                  + nla_total_size(0) /* TCA_ACT_STATS nested */
>    138                  /* TCA_STATS_BASIC */
>    139                  + nla_total_size_64bit(sizeof(struct gnet_stats_basic))
>    140                  /* TCA_STATS_QUEUE */
>    141                  + nla_total_size_64bit(sizeof(struct gnet_stats_queue))
>    142                  + nla_total_size(0) /* TCA_OPTIONS nested */
>    143                  + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */
>    144  }
>    145


Re: [PATCH] net: sched: Fix warnings from xchg() on RCU'd cookie pointer.

2018-07-09 Thread Marcelo Ricardo Leitner
On Sun, Jul 08, 2018 at 05:03:58PM +0900, David Miller wrote:
> 
> The kbuild test robot reports:
> 
> >> net/sched/act_api.c:71:15: sparse: incorrect type in initializer
> >> (different address spaces) @@ expected struct tc_cookie [noderef]
> >> *__ret @@ got [noderef] *__ret @@
>    net/sched/act_api.c:71:15: expected struct tc_cookie [noderef]
>    *__ret
>    net/sched/act_api.c:71:15: got struct tc_cookie *new_cookie
> >> net/sched/act_api.c:71:13: sparse: incorrect type in assignment (different
> >> address spaces) @@ expected struct tc_cookie *old @@ got struct
> >> tc_cookie [noderef]
>    net/sched/act_api.c:71:13: expected struct tc_cookie *old
>    net/sched/act_api.c:71:13: got struct tc_cookie [noderef]
>    *[assigned] __ret

This one:

> >> net/sched/act_api.c:132:48: sparse: dereference of noderef expression

Actually belongs to a different issue, one that was reported in the
same email but wasn't handled in this patch.

  Marcelo


Re: [PATCHv2 net] sctp: fix the issue that pathmtu may be set lower than MINSEGMENT

2018-07-03 Thread Marcelo Ricardo Leitner
On Tue, Jul 03, 2018 at 04:30:47PM +0800, Xin Long wrote:
> After commit b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed
> for too small MTUs"), sctp_transport_update_pmtu would refetch pathmtu
> from the dst and set it to transport's pathmtu without any check.
> 
> The new pathmtu may be lower than MINSEGMENT if the dst is obsolete and
> updated by .get_dst() in sctp_transport_update_pmtu. In this case, it
> could have a smaller MTU as well, and thus we should validate it
> against MINSEGMENT instead.
> 
> Syzbot reported a warning in sctp_mtu_payload caused by this.
> 
> This patch refetches the pathmtu by calling sctp_dst_mtu where it does
> the check against MINSEGMENT.
> 
> v1->v2:
>   - refetch the pathmtu by calling sctp_dst_mtu instead as Marcelo's
> suggestion.
> 
> Fixes: b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too 
> small MTUs")
> Reported-by: syzbot+f0d9d7cba052f9344...@syzkaller.appspotmail.com
> Suggested-by: Marcelo Ricardo Leitner 
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/transport.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> index 445b7ef..12cac85 100644
> --- a/net/sctp/transport.c
> +++ b/net/sctp/transport.c
> @@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, 
> u32 pmtu)
>  
>   if (dst) {
>   /* Re-fetch, as under layers may have a higher minimum size */
> - pmtu = SCTP_TRUNC4(dst_mtu(dst));
> + pmtu = sctp_dst_mtu(dst);
>   change = t->pathmtu != pmtu;
>   }
>   t->pathmtu = pmtu;
> -- 
> 2.1.0
> 


Re: [PATCHv2 net-next 0/5] sctp: fully support for dscp and flowlabel per transport

2018-07-02 Thread Marcelo Ricardo Leitner
On Mon, Jul 02, 2018 at 06:21:10PM +0800, Xin Long wrote:
> Now dscp and flowlabel are set from sock when sending the packets,
> but being multi-homing, sctp also supports for dscp and flowlabel
> per transport, which is described in section 8.1.12 in RFC6458.
> 
> v1->v2:
>   - define ip_queue_xmit as inline in net/ip.h, instead of exporting
> it in Patch 1/5 according to David's suggestion.
>   - fix the param len check in sctp_s/getsockopt_peer_addr_params()
> in Patch 3/5 to guarantee that an old app built with old kernel
> headers could work on the newer kernel per Marcelo's point.
> 
> Xin Long (5):
>   ipv4: add __ip_queue_xmit() that supports tos param
>   sctp: add support for dscp and flowlabel per transport
>   sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
>   sctp: add support for setting flowlabel when adding a transport
>   sctp: check for ipv6_pinfo legal sndflow with flowlabel in
> sctp_v6_get_dst

Series
Acked-by: Marcelo Ricardo Leitner 


Re: [PATCH net] sctp: fix the issue that pathmtu may be set lower than MINSEGMENT

2018-07-02 Thread Marcelo Ricardo Leitner
On Mon, Jul 02, 2018 at 07:45:12AM -0400, Neil Horman wrote:
> On Mon, Jul 02, 2018 at 02:51:16PM +0800, Xin Long wrote:
> > After commit b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed
> > for too small MTUs"), sctp_transport_update_pmtu would refetch pathmtu
> > from the dst and set it to transport's pathmtu without any check.
> > 
> > The new pathmtu may be lower than MINSEGMENT if the dst is obsolete and
> > updated by .get_dst() in sctp_transport_update_pmtu.

In this case, it could have a smaller MTU as well, and thus we should
validate it against MINSEGMENT instead.

> > 
> > Syzbot reported a warning in sctp_mtu_payload caused by this.
> > 
> > This fix uses the refetched pathmtu only when it's greater than the
> > frag_needed pmtu.
> > 
> > Fixes: b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too 
> > small MTUs")
> > Reported-by: syzbot+f0d9d7cba052f9344...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/transport.c | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> > index 445b7ef..ddfb687 100644
> > --- a/net/sctp/transport.c
> > +++ b/net/sctp/transport.c
> > @@ -282,7 +282,10 @@ bool sctp_transport_update_pmtu(struct sctp_transport 
> > *t, u32 pmtu)
> >  
> > if (dst) {
> > /* Re-fetch, as under layers may have a higher minimum size */
> > -   pmtu = SCTP_TRUNC4(dst_mtu(dst));
> > +   u32 mtu = SCTP_TRUNC4(dst_mtu(dst));
> > +
> > +   if (pmtu < mtu)
> > +   pmtu = mtu;
> nit, but why not u32 mtu = min(pmtu, SCTP_TRUNC4(dst_mtu(dst))) here ?

sctp_dst_mtu() is wrapping all that for us :)

-   pmtu = SCTP_TRUNC4(dst_mtu(dst));
+   pmtu = sctp_dst_mtu(dst);
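
For reference, sctp_dst_mtu() is roughly this helper from
include/net/sctp/sctp.h, which applies the MINSEGMENT floor:

	static inline __u32 sctp_dst_mtu(const struct dst_entry *dst)
	{
		return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst),
					 SCTP_DEFAULT_MINSEGMENT));
	}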

> 
> Neil
> 
> > change = t->pathmtu != pmtu;
> > }
> > t->pathmtu = pmtu;
> > -- 
> > 2.1.0
> > 


Re: [PATCH net-next 3/5] sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams

2018-06-25 Thread Marcelo Ricardo Leitner
Hi,

On Tue, Jun 26, 2018 at 01:12:00AM +0900, 吉藤英明 wrote:
> Hi,
> 
> 2018-06-25 22:03 GMT+09:00 Marcelo Ricardo Leitner 
> :
> > On Mon, Jun 25, 2018 at 07:28:47AM -0400, Neil Horman wrote:
> >> On Mon, Jun 25, 2018 at 04:31:26PM +0900, David Miller wrote:
> >> > From: Xin Long 
> >> > Date: Mon, 25 Jun 2018 10:14:35 +0800
> >> >
> >> > >  struct sctp_paddrparams {
> >> > > @@ -773,6 +775,8 @@ struct sctp_paddrparams {
> >> > >   __u32   spp_pathmtu;
> >> > >   __u32   spp_sackdelay;
> >> > >   __u32   spp_flags;
> >> > > + __u32   spp_ipv6_flowlabel;
> >> > > + __u8    spp_dscp;
> >> > >  } __attribute__((packed, aligned(4)));
> >> >
> >> > I don't think you can change the size of this structure like this.
> >> >
> >> > This check in sctp_setsockopt_peer_addr_params():
> >> >
> >> > if (optlen != sizeof(struct sctp_paddrparams))
> >> > return -EINVAL;
> >> >
> >> > is going to trigger in old kernels when executing programs
> >> > built against the new struct definition.
> >
> > That will happen, yes, but do we really care about being future-proof
> > here? I mean: if we also update such check(s) to support dealing with
> > smaller-than-supported structs, newer kernels will be able to run
> > programs built against the old struct, and the new one; while building
> > using newer headers and running on older kernel may fool the
> > application in other ways too (like enabling support for something
> > that is available on newer kernel and that is not present in the older
> > one).
> 
> We should not break existing apps.
> We still accept apps of pre-2.4 era without sin6_scope_id
> (e.g., net/ipv6/af_inet6.c:inet6_bind()).

Yes. That's what I tried to say. That is supporting an old app built
with old kernel headers and running on a newer kernel, and not the
other way around (an app built with fresh headers and running on an
old kernel).
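
A sketch of the kind of check that would keep old binaries working
(illustrative only; the exact form, e.g. using ALIGN() on the offset,
would be up to the actual patch):

	if (optlen == offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel))
		; /* old app: the new trailing fields are simply not used */
	else if (optlen != sizeof(struct sctp_paddrparams))
		return -EINVAL;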

> 
> >
> >> >
> >> I think that's also the reason it's a packed aligned attribute, it can't be
> >> changed, or older kernels won't be able to fill it out properly.
> >> Neil
> >
> > It's more for supporting running 32-bits apps on 64-bit kernels
> > (according to 20c9c825b12fc).
> >
> >   Marcelo


Re: [PATCHv2 net-next] sctp: add support for SCTP_REUSE_PORT sockopt

2018-06-25 Thread Marcelo Ricardo Leitner
On Mon, Jun 25, 2018 at 10:06:46AM +0800, Xin Long wrote:
> This feature is actually already supported by sk->sk_reuse which can be
> set by socket level opt SO_REUSEADDR. But it's not working exactly as
> RFC6458 demands in section 8.1.27, like:
> 
>   - This option only supports one-to-one style SCTP sockets
>   - This socket option must not be used after calling bind()
> or sctp_bindx().
> 
> Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
> Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
> work in linux.
> 
> To separate it from the socket level version, this patch adds 'reuse' in
> sctp_sock and it works pretty much as sk->sk_reuse, but with some extra
> setup limitations that are needed when it is being enabled.
> 
> "It should be noted that the behavior of the socket-level socket option
> to reuse ports and/or addresses for SCTP sockets is unspecified", so it
> leaves SO_REUSEADDR as is for the compatibility.
> 
> Note that the name SCTP_REUSE_PORT is kind of confusing, it is identical
> to SO_REUSEADDR with some extra restriction, so here it uses 'reuse' in
> sctp_sock instead of 'reuseport'. As for sk->sk_reuseport support for
> SCTP, it will be added in another patch.

To help changelog readers later, please update to something like:

"""\
Note that the name SCTP_REUSE_PORT is somewhat confusing, as its
functionality is nearly identical to SO_REUSEADDR, but with some
extra restrictions. Here it uses 'reuse' in sctp_sock instead of
'reuseport'. As for sk->sk_reuseport support for SCTP, it will be
added in another patch.
"""

Makes sense, can you note the difference?

> 
> Thanks to Neil for making this clear.
> 
> v1->v2:
>   - add sctp_sk->reuse to separate it from the socket level version.
> 
> Acked-by: Neil Horman 
> Signed-off-by: Xin Long 
> ---
>  include/net/sctp/structs.h |  1 +
>  include/uapi/linux/sctp.h  |  1 +
>  net/sctp/socket.c  | 62 
> --
>  3 files changed, 57 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index e0f962d..701a517 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -220,6 +220,7 @@ struct sctp_sock {
>   __u32 adaptation_ind;
>   __u32 pd_point;
>   __u16   nodelay:1,
> + reuse:1,
>   disable_fragments:1,
>   v4mapped:1,
>   frag_interleave:1,
> diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
> index b64d583..c02986a 100644
> --- a/include/uapi/linux/sctp.h
> +++ b/include/uapi/linux/sctp.h
> @@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t;
>  #define SCTP_RECVNXTINFO 33
>  #define SCTP_DEFAULT_SNDINFO 34
>  #define SCTP_AUTH_DEACTIVATE_KEY 35
> +#define SCTP_REUSE_PORT  36
>  
>  /* Internal Socket Options. Some of the sctp library functions are
>   * implemented using these socket options.
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 0e91e83..bf11f9c 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4170,6 +4170,28 @@ static int 
> sctp_setsockopt_interleaving_supported(struct sock *sk,
>   return retval;
>  }
>  
> +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
> +   unsigned int optlen)
> +{
> + int val;
> +
> + if (!sctp_style(sk, TCP))
> + return -EOPNOTSUPP;
> +
> + if (sctp_sk(sk)->ep->base.bind_addr.port)
> + return -EFAULT;
> +
> + if (optlen < sizeof(int))
> + return -EINVAL;
> +
> + if (get_user(val, (int __user *)optval))
> + return -EFAULT;
> +
> + sctp_sk(sk)->reuse = !!val;
> +
> + return 0;
> +}
> +
>  /* API 6.2 setsockopt(), getsockopt()
>   *
>   * Applications use setsockopt() and getsockopt() to set or retrieve
> @@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
> int optname,
>   retval = sctp_setsockopt_interleaving_supported(sk, optval,
>   optlen);
>   break;
> + case SCTP_REUSE_PORT:
> + retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
> + break;
>   default:
>   retval = -ENOPROTOOPT;
>   break;
> @@ -7197,6 +7222,26 @@ static int 
> sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
>   return retval;
>  }
>  
> +static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
> +   char __user *optval,
> +   int __user *optlen)
> +{
> + int val;
> +
> + if (len < sizeof(int))
> + return -EINVAL;
> +
> + len = sizeof(int);
> + val = sctp_sk(sk)->reuse;
> + if (put_user(len, optlen))
> + return -EFAULT;
> +
> + if (copy_to_user(optval, , len))
> + return -EFAULT;
> +
> + 

Re: [PATCH net-next 3/5] sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams

2018-06-25 Thread Marcelo Ricardo Leitner
On Mon, Jun 25, 2018 at 07:28:47AM -0400, Neil Horman wrote:
> On Mon, Jun 25, 2018 at 04:31:26PM +0900, David Miller wrote:
> > From: Xin Long 
> > Date: Mon, 25 Jun 2018 10:14:35 +0800
> > 
> > >  struct sctp_paddrparams {
> > > @@ -773,6 +775,8 @@ struct sctp_paddrparams {
> > >   __u32   spp_pathmtu;
> > >   __u32   spp_sackdelay;
> > >   __u32   spp_flags;
> > > + __u32   spp_ipv6_flowlabel;
> > > + __u8    spp_dscp;
> > >  } __attribute__((packed, aligned(4)));
> > 
> > I don't think you can change the size of this structure like this.
> > 
> > This check in sctp_setsockopt_peer_addr_params():
> > 
> > if (optlen != sizeof(struct sctp_paddrparams))
> > return -EINVAL;
> > 
> > is going to trigger in old kernels when executing programs
> > built against the new struct definition.

That will happen, yes, but do we really care about being future-proof
here? I mean: if we also update such check(s) to support dealing with
smaller-than-supported structs, newer kernels will be able to run
programs built against the old struct, and the new one; while building
using newer headers and running on older kernel may fool the
application in other ways too (like enabling support for something
that is available on newer kernel and that is not present in the older
one).

> > 
> I think that's also the reason it's a packed aligned attribute, it can't be
> changed, or older kernels won't be able to fill it out properly.
> Neil

It's more for supporting running 32-bits apps on 64-bit kernels
(according to 20c9c825b12fc).

  Marcelo


Re: [PATCH v4 net-next] net:sched: add action inheritdsfield to skbedit

2018-06-20 Thread Marcelo Ricardo Leitner
On Wed, Jun 20, 2018 at 07:02:41PM +0200, Davide Caratti wrote:
...
> > I agree that we should update the drop counter, but given that 
> > you're already converting the stats to be per-cpu counters, whatever we 
> > add now will be just symbolic since you're going to change it anyway.

It wouldn't be symbolic. It's one thing to convert a given increment
into something else; it's another to start incrementing it for some
(new) reason.

> 
> that's ok for me also, as I can use the current v4 code for the rebase
> (and not wait for another respin) _ but let's hear what reviewers think.
> 
> >  If 
> > reviewers think that Qiaobin's patch must add the update line, could you 
> > provide the exact line and location so we avoid going to v6 of this patch?
> 
> In case, I was thinking of something like: 
> 
> https://elixir.bootlin.com/linux/v4.18-rc1/source/net/sched/act_ipt.c#L249
> 
> so, between 'err:' and 'spin_unlock(>tcf_lock)', insert a line like:
> 
> d->tcf_qstats.drop++;

I prefer the more complete version. Then it will have a more complete
(git) history and help people when troubleshooting.

Thanks,
Marcelo


[PATCH net] sctp: fix erroneous inc of snmp SctpFragUsrMsgs

2018-06-20 Thread Marcelo Ricardo Leitner
Currently it is incrementing SctpFragUsrMsgs when the user message size
is exactly the same as the maximum fragment size, which is wrong.

The fix is to increment it only when the user message is bigger than
the maximum fragment size.

Fixes: bfd2e4b8734d ("sctp: refactor sctp_datamsg_from_user")
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/chunk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 79daa98208c391c780440144d69bc7be875c3476..bfb9f812e2ef9fa605b08dc1f534781573c3abf8 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -237,7 +237,9 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
/* Account for a different sized first fragment */
if (msg_len >= first_len) {
msg->can_delay = 0;
-   SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+   if (msg_len > first_len)
+   SCTP_INC_STATS(sock_net(asoc->base.sk),
+  SCTP_MIB_FRAGUSRMSGS);
} else {
/* Which may be the only one... */
first_len = msg_len;
-- 
2.14.4



Re: [PATCH v4 net-next] net:sched: add action inheritdsfield to skbedit

2018-06-20 Thread Marcelo Ricardo Leitner
On Tue, Jun 12, 2018 at 03:42:55PM +, Fu, Qiaobin wrote:
> The new action inheritdsfield copies the field DS of
> IPv4 and IPv6 packets into skb->priority. This enables
> later classification of packets based on the DS field.
> 
> v4:
> *Not allow setting flags other than the expected ones.
> 
> *Allow dumping the pure flags.
> 
> Original idea by Jamal Hadi Salim 
> 
> Signed-off-by: Qiaobin Fu 
> Reviewed-by: Michel Machado 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> 
> Note that the motivation for this patch is found in the following discussion:
> https://www.spinics.net/lists/netdev/msg501061.html
> ---
> diff --git a/include/uapi/linux/tc_act/tc_skbedit.h 
> b/include/uapi/linux/tc_act/tc_skbedit.h
> index fbcfe27a4e6c..6de6071ebed6 100644
> --- a/include/uapi/linux/tc_act/tc_skbedit.h
> +++ b/include/uapi/linux/tc_act/tc_skbedit.h
> @@ -30,6 +30,7 @@
>  #define SKBEDIT_F_MARK   0x4
>  #define SKBEDIT_F_PTYPE  0x8
>  #define SKBEDIT_F_MASK   0x10
> +#define SKBEDIT_F_INHERITDSFIELD 0x20
>  
>  struct tc_skbedit {
>   tc_gen;
> @@ -45,6 +46,7 @@ enum {
>   TCA_SKBEDIT_PAD,
>   TCA_SKBEDIT_PTYPE,
>   TCA_SKBEDIT_MASK,
> + TCA_SKBEDIT_FLAGS,
>   __TCA_SKBEDIT_MAX
>  };
>  #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
> diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
> index 6138d1d71900..9adbcfa3f5fe 100644
> --- a/net/sched/act_skbedit.c
> +++ b/net/sched/act_skbedit.c
> @@ -23,6 +23,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  
>  #include 
>  #include 
> @@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct 
> tc_action *a,
>  
>   if (d->flags & SKBEDIT_F_PRIORITY)
>   skb->priority = d->priority;
> + if (d->flags & SKBEDIT_F_INHERITDSFIELD) {
> + int wlen = skb_network_offset(skb);
> +
> + switch (tc_skb_protocol(skb)) {
> + case htons(ETH_P_IP):
> + wlen += sizeof(struct iphdr);
> + if (!pskb_may_pull(skb, wlen))
> + goto err;
> + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
> + break;
> +
> + case htons(ETH_P_IPV6):
> + wlen += sizeof(struct ipv6hdr);
> + if (!pskb_may_pull(skb, wlen))
> + goto err;
> + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
> + break;
> + }
> + }
>   if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
>   skb->dev->real_num_tx_queues > d->queue_mapping)
>   skb_set_queue_mapping(skb, d->queue_mapping);
> @@ -53,6 +75,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct 
> tc_action *a,
>  
>   spin_unlock(>tcf_lock);
>   return d->tcf_action;
> +
> +err:
> + spin_unlock(>tcf_lock);
> + return TC_ACT_SHOT;
>  }
>  
>  static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
> @@ -62,6 +88,7 @@ static const struct nla_policy 
> skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
>   [TCA_SKBEDIT_MARK]  = { .len = sizeof(u32) },
>   [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
>   [TCA_SKBEDIT_MASK]  = { .len = sizeof(u32) },
> + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
>  };
>  
>  static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
> @@ -73,6 +100,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr 
> *nla,
>   struct tc_skbedit *parm;
>   struct tcf_skbedit *d;
>   u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
> + u64 *pure_flags = NULL;
>   u16 *queue_mapping = NULL, *ptype = NULL;
>   bool exists = false;
>   int ret = 0, err;
> @@ -114,6 +142,12 @@ static int tcf_skbedit_init(struct net *net, struct 
> nlattr *nla,
>   mask = nla_data(tb[TCA_SKBEDIT_MASK]);
>   }
>  
> + if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
> + pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
> + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
> + flags |= SKBEDIT_F_INHERITDSFIELD;
> + }
> +
>   parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
>  
>   exists = tcf_idr_check(tn, parm->index, a, bind);
> @@ -178,6 +212,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct 
> tc_action *a,
>   .act

Re: [PATCH net-next] sctp: define sctp_packet_gso_append to build GSO frames

2018-06-13 Thread Marcelo Ricardo Leitner
On Thu, Jun 14, 2018 at 07:37:02AM +0800, Xin Long wrote:
> Now sctp GSO uses skb_gro_receive() to append the data into head
> skb frag_list. However it actually only needs very few code from
> skb_gro_receive(). Besides, NAPI_GRO_CB has to be set while most
> of its members are not needed here.
> 
> This patch is to add sctp_packet_gso_append() to build GSO frames
> instead of skb_gro_receive(), and it would avoid many unnecessary
> checks and make the code clearer.
> 
> Note that sctp will use page frags instead of frag_list to build
> GSO frames in another patch. But it may take time, as sctp's GSO
> frames may have different size. skb_segment() can only split it
> into the frags with the same size, which would break the border
> of sctp chunks.
> 
> Signed-off-by: Xin Long 

I cannot test it, but it looks good to me. Thanks, Xin.

Reviewed-by: Marcelo Ricardo Leitner 

> ---
>  include/net/sctp/structs.h |  5 +
>  net/sctp/output.c  | 28 ++--
>  2 files changed, 23 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index ebf809e..dbe1b91 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -1133,6 +1133,11 @@ struct sctp_input_cb {
>  };
>  #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
>  
> +struct sctp_output_cb {
> + struct sk_buff *last;
> +};
> +#define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0]))
> +
>  static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff 
> *skb)
>  {
>   const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index e672dee..7f849b0 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -409,6 +409,21 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, 
> struct sock *sk)
>   refcount_inc(>sk_wmem_alloc);
>  }
>  
> +static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
> +{
> + if (SCTP_OUTPUT_CB(head)->last == head)
> + skb_shinfo(head)->frag_list = skb;
> + else
> + SCTP_OUTPUT_CB(head)->last->next = skb;
> + SCTP_OUTPUT_CB(head)->last = skb;
> +
> + head->truesize += skb->truesize;
> + head->data_len += skb->len;
> + head->len += skb->len;
> +
> + __skb_header_release(skb);
> +}
> +
>  static int sctp_packet_pack(struct sctp_packet *packet,
>   struct sk_buff *head, int gso, gfp_t gfp)
>  {
> @@ -422,7 +437,7 @@ static int sctp_packet_pack(struct sctp_packet *packet,
>  
>   if (gso) {
>   skb_shinfo(head)->gso_type = sk->sk_gso_type;
> - NAPI_GRO_CB(head)->last = head;
> + SCTP_OUTPUT_CB(head)->last = head;
>   } else {
>   nskb = head;
>   pkt_size = packet->size;
> @@ -503,15 +518,8 @@ static int sctp_packet_pack(struct sctp_packet *packet,
>>chunk_list);
>   }
>  
> - if (gso) {
> - if (skb_gro_receive(, nskb)) {
> - kfree_skb(nskb);
> - return 0;
> - }
> - if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >=
> -  sk->sk_gso_max_segs))
> - return 0;
> - }
> + if (gso)
> + sctp_packet_gso_append(head, nskb);
>  
>   pkt_count++;
>   } while (!list_empty(>chunk_list));
> -- 
> 2.1.0
>


Re: problems with SCTP GSO

2018-06-12 Thread Marcelo Ricardo Leitner
On Tue, Jun 12, 2018 at 02:05:06PM -0300, Marcelo Ricardo Leitner wrote:
> On Mon, Jun 11, 2018 at 08:29:05PM -0700, David Miller wrote:
> > 
> > I would like to bring up some problems with the current GSO
> > implementation in SCTP.
> > 
> > The most important for me right now is that SCTP uses
> > "skb_gro_receive()" to build "GSO" frames :-(
> > 
> > Really it just ends up using the slow path (basically, label 'merge'
> > and onwards).
> > 
> > So, using a GRO helper to build GSO packets is not great.
> 
> Okay.
> 
> > 
> > I want to make major surgery here and the only way I can is if
> > it is exactly the GRO demuxing path that uses skb_gro_receive().
> > 
> > Those paths pass in the list head from the NAPI struct that initiated
> > the GRO code paths.  That makes it easy for me to change this to use a
> > list_head or a hash chain.
> > 
> > Probably in the short term SCTP should just have a private helper that
> > builds the frag list, appending 'skb' to 'head'.
> > 
> > In the long term, SCTP should use the page frags just like TCP to
> > append the data when building GSO frames.  Then it could actually be
> > offloaded and passed into drivers without linearizing.
> 
> Sounds like a plan. Shouldn't be too hard to do it.
> (I'm out on PTO, btw)

Xin will work on this, mean while at least. Thanks Xin.

> 
> Thanks,
> Marcelo
> 


Re: problems with SCTP GSO

2018-06-12 Thread Marcelo Ricardo Leitner
On Mon, Jun 11, 2018 at 08:29:05PM -0700, David Miller wrote:
> 
> I would like to bring up some problems with the current GSO
> implementation in SCTP.
> 
> The most important for me right now is that SCTP uses
> "skb_gro_receive()" to build "GSO" frames :-(
> 
> Really it just ends up using the slow path (basically, label 'merge'
> and onwards).
> 
> So, using a GRO helper to build GSO packets is not great.

Okay.

> 
> I want to make major surgery here and the only way I can is if
> it is exactly the GRO demuxing path that uses skb_gro_receive().
> 
> Those paths pass in the list head from the NAPI struct that initiated
> the GRO code paths.  That makes it easy for me to change this to use a
> list_head or a hash chain.
> 
> Probably in the short term SCTP should just have a private helper that
> builds the frag list, appending 'skb' to 'head'.
> 
> In the long term, SCTP should use the page frags just like TCP to
> append the data when building GSO frames.  Then it could actually be
> offloaded and passed into drivers without linearizing.

Sounds like a plan. Shouldn't be too hard to do it.
(I'm out on PTO, btw)

Thanks,
Marcelo


Re: [PATCH v3 net-next] net:sched: add action inheritdsfield to skbedit

2018-06-09 Thread Marcelo Ricardo Leitner
On Sat, Jun 09, 2018 at 10:35:48PM +, Fu, Qiaobin wrote:
...
> @@ -73,6 +100,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr 
> *nla,
>   struct tc_skbedit *parm;
>   struct tcf_skbedit *d;
>   u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL;
> + u64 *pure_flags = NULL;
>   u16 *queue_mapping = NULL, *ptype = NULL;
>   bool exists = false;
>   int ret = 0, err;
> @@ -114,6 +142,11 @@ static int tcf_skbedit_init(struct net *net, struct 
> nlattr *nla,
>   mask = nla_data(tb[TCA_SKBEDIT_MASK]);
>   }
>  
> + if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
> + pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
> + flags |= *pure_flags;

It shouldn't allow setting flags other than the expected ones (see the
sketch after the quoted hunk below).

> + }
> +
>   parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
>  
>   exists = tcf_idr_check(tn, parm->index, a, bind);
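
One possible shape for the stricter check (a sketch only; v4 of the
patch addresses this in its own way):

	if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
		u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);

		/* reject unknown bits instead of blindly OR-ing them in */
		if (*pure_flags & ~SKBEDIT_F_INHERITDSFIELD)
			return -EINVAL;
		flags |= *pure_flags;
	}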


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Wed, May 30, 2018 at 01:45:08AM +0800, Xin Long wrote:
> If we're counting on max_t to fix this CPU stall, it should not
> matter if min rto < the value causing the stall.

Yes, but putting a floor on rto_{min,max} now is to protect the rtx
timer, not the heartbeat one.

> 
> >
> > Anyway, what about adding a floor to rto_max too, so that RTO can
> > actually grow into something bigger that doesn't hog the CPU? Like:
> > rto_min floor = 5ms
> > rto_max floor = 50ms
> >
> >   Marcelo


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Tue, May 29, 2018 at 12:03:46PM -0400, Neal Cardwell wrote:
> On Tue, May 29, 2018 at 11:45 AM Marcelo Ricardo Leitner <
> marcelo.leit...@gmail.com> wrote:
> > - patch2 - fix rtx attack vector
> >- Add the floor value to rto_min to HZ/20 (which fits the values
> >  that Michael shared on the other email)
> 
> I would encourage allowing minimum RTO values down to 5ms, if the ACK
> policy in the receiver makes this feasible. Our experience is that in
> datacenter environments it can be advantageous to allow timer-based loss
> recoveries using timeout values as low as 5ms, e.g.:

Thanks Neal. On Xin's tests, the heartbeat timer becomes an issue at
~25ms already. Xin, can you share more details on the hw, which CPU
was used?

Anyway, what about adding a floor to rto_max too, so that RTO can
actually grow into something bigger that doesn't hog the CPU? Like:
rto_min floor = 5ms
rto_max floor = 50ms
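
As a sketch (SCTP_RTO_MIN_FLOOR/SCTP_RTO_MAX_FLOOR are hypothetical
constants, values in msecs as above):

	#define SCTP_RTO_MIN_FLOOR	5	/* lowest allowed rto_min */
	#define SCTP_RTO_MAX_FLOOR	50	/* lowest allowed rto_max */

	rto_min = max_t(__u32, rto_min, SCTP_RTO_MIN_FLOOR);
	rto_max = max_t(__u32, rto_max, SCTP_RTO_MAX_FLOOR);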

  Marcelo


Re: [PATCH net] sctp: not allow to set rto_min with a value below 200 msecs

2018-05-29 Thread Marcelo Ricardo Leitner
On Tue, May 29, 2018 at 03:06:06PM +0200, Michael Tuexen wrote:
> > On 29. May 2018, at 13:41, Neil Horman  wrote:
> > 
> > On Mon, May 28, 2018 at 04:43:15PM -0300, Marcelo Ricardo Leitner wrote:
> >> On Sat, May 26, 2018 at 09:01:00PM -0400, Neil Horman wrote:
> >>> On Sat, May 26, 2018 at 05:50:39PM +0200, Dmitry Vyukov wrote:
> >>>> On Sat, May 26, 2018 at 5:42 PM, Michael Tuexen
> >>>>  wrote:
> >>>>>> On 25. May 2018, at 21:13, Neil Horman  wrote:
> >>>>>> 
> >>>>>> On Sat, May 26, 2018 at 01:41:02AM +0800, Xin Long wrote:
> >>>>>>> syzbot reported a rcu_sched self-detected stall on CPU which is caused
> >>>>>>> by too small value set on rto_min with SCTP_RTOINFO sockopt. With this
> >>>>>>> value, hb_timer will get stuck there, as in its timer handler it 
> >>>>>>> starts
> >>>>>>> this timer again with this value, then goes to the timer handler 
> >>>>>>> again.
> >>>>>>> 
> >>>>>>> This problem is there since very beginning, and thanks to Eric for the
> >>>>>>> reproducer shared from a syzbot mail.
> >>>>>>> 
> >>>>>>> This patch fixes it by not allowing to set rto_min with a value below
> >>>>>>> 200 msecs, which is based on TCP's, by either setsockopt or sysctl.
> >>>>>>> 
> >>>>>>> Reported-by: syzbot+3dcd59a1f907245f8...@syzkaller.appspotmail.com
> >>>>>>> Suggested-by: Marcelo Ricardo Leitner 
> >>>>>>> Signed-off-by: Xin Long 
> >>>>>>> ---
> >>>>>>> include/net/sctp/constants.h |  1 +
> >>>>>>> net/sctp/socket.c| 10 +++---
> >>>>>>> net/sctp/sysctl.c|  3 ++-
> >>>>>>> 3 files changed, 10 insertions(+), 4 deletions(-)
> >>>>>>> 
> >>>>>>> diff --git a/include/net/sctp/constants.h 
> >>>>>>> b/include/net/sctp/constants.h
> >>>>>>> index 20ff237..2ee7a7b 100644
> >>>>>>> --- a/include/net/sctp/constants.h
> >>>>>>> +++ b/include/net/sctp/constants.h
> >>>>>>> @@ -277,6 +277,7 @@ enum { SCTP_MAX_GABS = 16 };
> >>>>>>> #define SCTP_RTO_INITIAL (3 * 1000)
> >>>>>>> #define SCTP_RTO_MIN (1 * 1000)
> >>>>>>> #define SCTP_RTO_MAX (60 * 1000)
> >>>>>>> +#define SCTP_RTO_HARD_MIN   200
> >>>>>>> 
> >>>>>>> #define SCTP_RTO_ALPHA  3   /* 1/8 when converted to right 
> >>>>>>> shifts. */
> >>>>>>> #define SCTP_RTO_BETA   2   /* 1/4 when converted to right 
> >>>>>>> shifts. */
> >>>>>>> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> >>>>>>> index ae7e7c6..6ef12c7 100644
> >>>>>>> --- a/net/sctp/socket.c
> >>>>>>> +++ b/net/sctp/socket.c
> >>>>>>> @@ -3029,7 +3029,8 @@ static int sctp_setsockopt_nodelay(struct sock 
> >>>>>>> *sk, char __user *optval,
> >>>>>>> * be changed.
> >>>>>>> *
> >>>>>>> */
> >>>>>>> -static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user 
> >>>>>>> *optval, unsigned int optlen)
> >>>>>>> +static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user 
> >>>>>>> *optval,
> >>>>>>> +   unsigned int optlen)
> >>>>>>> {
> >>>>>>> struct sctp_rtoinfo rtoinfo;
> >>>>>>> struct sctp_association *asoc;
> >>>>>>> @@ -3056,10 +3057,13 @@ static int sctp_setsockopt_rtoinfo(struct 
> >>>>>>> sock *sk, char __user *optval, unsigne
> >>>>>>> else
> >>>>>>> rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
> >>>>>>> 
> >>>>>>> -if (rto_min)
> >>>>>>> +if (rto_min) {
> >>>>>>> +if (rto_min < SCTP_RTO_HARD_MIN)
> >>>>>>&

Re: [PATCH v3 02/11] net: sched: change type of reference and bind counters

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:20AM +0300, Vlad Buslov wrote:
> Change type of action reference counter to refcount_t.
> 
> Change type of action bind counter to atomic_t.
> This type is used to allow decrementing bind counter without testing
> for 0 result.
> 
> Signed-off-by: Vlad Buslov 
> Signed-off-by: Jiri Pirko 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
>  include/net/act_api.h  |  5 +++--
>  net/sched/act_api.c| 32 ++--
>  net/sched/act_bpf.c|  4 ++--
>  net/sched/act_connmark.c   |  4 ++--
>  net/sched/act_csum.c   |  4 ++--
>  net/sched/act_gact.c   |  4 ++--
>  net/sched/act_ife.c|  4 ++--
>  net/sched/act_ipt.c|  4 ++--
>  net/sched/act_mirred.c |  4 ++--
>  net/sched/act_nat.c|  4 ++--
>  net/sched/act_pedit.c  |  4 ++--
>  net/sched/act_police.c |  4 ++--
>  net/sched/act_sample.c |  4 ++--
>  net/sched/act_simple.c |  4 ++--
>  net/sched/act_skbedit.c|  4 ++--
>  net/sched/act_skbmod.c |  4 ++--
>  net/sched/act_tunnel_key.c |  4 ++--
>  net/sched/act_vlan.c   |  4 ++--
>  18 files changed, 57 insertions(+), 44 deletions(-)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index f7b59ef7303d..e634014605cb 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -6,6 +6,7 @@
>   * Public action API for classifiers/qdiscs
>  */
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -26,8 +27,8 @@ struct tc_action {
>   struct tcf_idrinfo  *idrinfo;
>  
>   u32 tcfa_index;
> - int tcfa_refcnt;
> - int tcfa_bindcnt;
> + refcount_t  tcfa_refcnt;
> + atomic_ttcfa_bindcnt;
>   u32 tcfa_capab;
>   int tcfa_action;
>   struct tcf_ttcfa_tm;
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 02670c7489e3..4f064ecab882 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, 
> bool strict)
>  
>   ASSERT_RTNL();
>  
> + /* Release with strict==1 and bind==0 is only called through act API
> +  * interface (classifiers always bind). Only case when action with
> +  * positive reference count and zero bind count can exist is when it was
> +  * also created with act API (unbinding last classifier will destroy the
> +  * action if it was created by classifier). So only case when bind count
> +  * can be changed after initial check is when unbound action is
> +  * destroyed by act API while classifier binds to action with same id
> +  * concurrently. This result either creation of new action(same behavior
> +  * as before), or reusing existing action if concurrent process
> +  * increments reference count before action is deleted. Both scenarios
> +  * are acceptable.
> +  */
>   if (p) {
>   if (bind)
> - p->tcfa_bindcnt--;
> - else if (strict && p->tcfa_bindcnt > 0)
> + atomic_dec(>tcfa_bindcnt);
> + else if (strict && atomic_read(>tcfa_bindcnt) > 0)
>   return -EPERM;
>  
> - p->tcfa_refcnt--;
> - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
> + if (atomic_read(>tcfa_bindcnt) <= 0 &&
> + refcount_dec_and_test(>tcfa_refcnt)) {
>   if (p->ops->cleanup)
>   p->ops->cleanup(p);
>   tcf_idr_remove(p->idrinfo, p);
> @@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
> struct tc_action **a,
>  
>   if (index && p) {
>   if (bind)
> - p->tcfa_bindcnt++;
> - p->tcfa_refcnt++;
> + atomic_inc(>tcfa_bindcnt);
> + refcount_inc(>tcfa_refcnt);
>   *a = p;
>   return true;
>   }
> @@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>  
>   if (unlikely(!p))
>   return -ENOMEM;
> - p->tcfa_refcnt = 1;
> + refcount_set(>tcfa_refcnt, 1);
>   if (bind)
> - p->tcfa_bindcnt = 1;
> + atomic_set(>tcfa_bindcnt, 1);
>  
>   if (cpustats) {
>   p->cpu_bstats = ne

Re: [PATCH v3 05/11] net: sched: implement action API that deletes action by index

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:23AM +0300, Vlad Buslov wrote:
> Implement new action API function that atomically finds and deletes action
> from idr by index. Intended to be used by lockless actions that do not rely
> on rtnl lock.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> Changes from V1 to V2:
> - Rename tcf_idr_find_delete to tcf_idr_delete_index.
> 
>  include/net/act_api.h |  1 +
>  net/sched/act_api.c   | 39 +++
>  2 files changed, 40 insertions(+)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index 888ff471bbf6..d94ec6400673 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -153,6 +153,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>  int bind, bool cpustats);
>  void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>  
> +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index);
>  int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
>  
>  static inline int tcf_idr_release(struct tc_action *a, bool bind)
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index aa304d36fee0..0f31f09946ab 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -319,6 +319,45 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, 
> struct tc_action **a,
>  }
>  EXPORT_SYMBOL(tcf_idr_check);
>  
> +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index)
> +{
> + struct tcf_idrinfo *idrinfo = tn->idrinfo;
> + struct tc_action *p;
> + int ret = 0;
> +
> + spin_lock(>lock);
> + p = idr_find(>action_idr, index);
> + if (!p) {
> + spin_unlock(>lock);
> + return -ENOENT;
> + }
> +
> + if (!atomic_read(>tcfa_bindcnt)) {
> + if (refcount_dec_and_test(>tcfa_refcnt)) {
> + struct module *owner = p->ops->owner;
> +
> + WARN_ON(p != idr_remove(>action_idr,
> + p->tcfa_index));
> + spin_unlock(>lock);
> +
> + if (p->ops->cleanup)
> + p->ops->cleanup(p);
> +
> + gen_kill_estimator(>tcfa_rate_est);
> + free_tcf(p);
> + module_put(owner);
> + return 0;
> + }
> + ret = 0;
> + } else {
> + ret = -EPERM;
> + }
> +
> + spin_unlock(>lock);
> + return ret;
> +}
> +EXPORT_SYMBOL(tcf_idr_delete_index);
> +
>  int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
>  struct tc_action **a, const struct tc_action_ops *ops,
>  int bind, bool cpustats)
> -- 
> 2.7.5
> 


Re: [PATCH v3 04/11] net: sched: always take reference to action

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:22AM +0300, Vlad Buslov wrote:
> Without rtnl lock protection it is no longer safe to use pointer to tc
> action without holding reference to it. (it can be destroyed concurrently)
> 
> Remove unsafe action idr lookup function. Instead of it, implement safe tcf
> idr check function that atomically looks up action in idr and increments
> its reference and bind counters. Implement both action search and check
> using new safe function
> 
> The reference taken by idr check is temporary and should not be
> accounted for by userspace clients (both logically and to preserve
> current API behavior). Subtract the temporary reference when dumping
> the action to userspace, using existing tca_get_fill function arguments.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> Changes from V1 to V2:
> - Make __tcf_idr_check function static
> - Merge changes that take reference to action when performing lookup and
>   changes that account for this additional reference when dumping action
>   to user space into single patch.
> 
>  net/sched/act_api.c | 46 --
>  1 file changed, 20 insertions(+), 26 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 256b0c93916c..aa304d36fee0 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct 
> sk_buff *skb,
>  }
>  EXPORT_SYMBOL(tcf_generic_walker);
>  
> -static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo 
> *idrinfo)
> +static bool __tcf_idr_check(struct tc_action_net *tn, u32 index,
> + struct tc_action **a, int bind)
>  {
> - struct tc_action *p = NULL;
> + struct tcf_idrinfo *idrinfo = tn->idrinfo;
> + struct tc_action *p;
>  
>   spin_lock(>lock);
>   p = idr_find(>action_idr, index);
> + if (p) {
> + refcount_inc(>tcfa_refcnt);
> + if (bind)
> + atomic_inc(>tcfa_bindcnt);
> + }
>   spin_unlock(>lock);
>  
> - return p;
> + if (p) {
> + *a = p;
> + return true;
> + }
> + return false;
>  }
>  
>  int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
>  {
> - struct tcf_idrinfo *idrinfo = tn->idrinfo;
> - struct tc_action *p = tcf_idr_lookup(index, idrinfo);
> -
> - if (p) {
> - *a = p;
> - return 1;
> - }
> - return 0;
> + return __tcf_idr_check(tn, index, a, 0);
>  }
>  EXPORT_SYMBOL(tcf_idr_search);
>  
>  bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
>  int bind)
>  {
> - struct tcf_idrinfo *idrinfo = tn->idrinfo;
> - struct tc_action *p = tcf_idr_lookup(index, idrinfo);
> -
> - if (index && p) {
> - if (bind)
> - atomic_inc(>tcfa_bindcnt);
> - refcount_inc(>tcfa_refcnt);
> - *a = p;
> - return true;
> - }
> - return false;
> + return __tcf_idr_check(tn, index, a, bind);
>  }
>  EXPORT_SYMBOL(tcf_idr_check);
>  
> @@ -932,7 +926,7 @@ tcf_get_notify(struct net *net, u32 portid, struct 
> nlmsghdr *n,
>   if (!skb)
>   return -ENOBUFS;
>   if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
> -  0, 0) <= 0) {
> +  0, 1) <= 0) {
>   NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while 
> adding TC action");
>   kfree_skb(skb);
>   return -EINVAL;
> @@ -1072,7 +1066,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, 
> struct list_head *actions,
>   return -ENOBUFS;
>  
>   if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
> -  0, 1) <= 0) {
> +  0, 2) <= 0) {
>   NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action 
> attributes");
>   kfree_skb(skb);
>   return -EINVAL;
> @@ -1131,14 +1125,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, 
> struct nlmsghdr *n,
>   if (event == RTM_GETACTION)
>   ret = tcf_get_notify(net, portid, n, , event, extack);
>   else { /* delete */
> + cleanup_a(, 1); /* lookup took reference */
>   ret = tcf_del_notify(net, n, , portid, attr_size, 
> extack);
>   if (ret)
>   goto err;
>   return ret;
>   }
>  err:
> - if (event != RTM_GETACTION)
> - tcf_action_destroy(, 0);
> + tcf_action_destroy(, 0);
>   return ret;
>  }
>  
> -- 
> 2.7.5
> 


Re: [PATCH v3 07/11] net: sched: implement reference counted action release

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:25AM +0300, Vlad Buslov wrote:
> Implement helper delete function that uses new action ops 'delete', instead
> of destroying action directly. This is required so act API could delete
> actions by index, without holding any references to action that is being
> deleted.
> 
> Implement function __tcf_action_put() that releases reference to action and
> frees it, if necessary. Refactor action deletion code to use new put
> function and not to rely on rtnl lock. Remove rtnl lock assertions that are
> no longer needed.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> Changes from V1 to V2:
> - Removed redundant actions ops lookup during delete.
> - Assume all actions have delete implemented and don't check for it
>   explicitly.
> - Rearrange variable definitions in tcf_action_delete.
> 
>  net/sched/act_api.c | 84 
> +++--
>  net/sched/cls_api.c |  1 -
>  2 files changed, 62 insertions(+), 23 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 0f31f09946ab..a023873db713 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -90,21 +90,39 @@ static void free_tcf(struct tc_action *p)
>   kfree(p);
>  }
>  
> -static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
> +static void tcf_action_cleanup(struct tc_action *p)
>  {
> - spin_lock(>lock);
> - idr_remove(>action_idr, p->tcfa_index);
> - spin_unlock(>lock);
> + if (p->ops->cleanup)
> + p->ops->cleanup(p);
> +
>   gen_kill_estimator(>tcfa_rate_est);
>   free_tcf(p);
>  }
>  
> +static int __tcf_action_put(struct tc_action *p, bool bind)
> +{
> + struct tcf_idrinfo *idrinfo = p->idrinfo;
> +
> + if (refcount_dec_and_lock(>tcfa_refcnt, >lock)) {
> + if (bind)
> + atomic_dec(>tcfa_bindcnt);
> + idr_remove(>action_idr, p->tcfa_index);
> + spin_unlock(>lock);
> +
> + tcf_action_cleanup(p);
> + return 1;
> + }
> +
> + if (bind)
> + atomic_dec(>tcfa_bindcnt);
> +
> + return 0;
> +}
> +
>  int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
>  {
>   int ret = 0;
>  
> - ASSERT_RTNL();
> -
>   /* Release with strict==1 and bind==0 is only called through act API
>* interface (classifiers always bind). Only case when action with
>* positive reference count and zero bind count can exist is when it was
> @@ -118,18 +136,11 @@ int __tcf_idr_release(struct tc_action *p, bool bind, 
> bool strict)
>* are acceptable.
>*/
>   if (p) {
> - if (bind)
> - atomic_dec(>tcfa_bindcnt);
> - else if (strict && atomic_read(>tcfa_bindcnt) > 0)
> + if (!bind && strict && atomic_read(>tcfa_bindcnt) > 0)
>   return -EPERM;
>  
> - if (atomic_read(>tcfa_bindcnt) <= 0 &&
> - refcount_dec_and_test(>tcfa_refcnt)) {
> - if (p->ops->cleanup)
> - p->ops->cleanup(p);
> - tcf_idr_remove(p->idrinfo, p);
> + if (__tcf_action_put(p, bind))
>   ret = ACT_P_DELETED;
> - }
>   }
>  
>   return ret;
> @@ -340,11 +351,7 @@ int tcf_idr_delete_index(struct tc_action_net *tn, u32 
> index)
>   p->tcfa_index));
>   spin_unlock(>lock);
>  
> - if (p->ops->cleanup)
> - p->ops->cleanup(p);
> -
> - gen_kill_estimator(>tcfa_rate_est);
> - free_tcf(p);
> + tcf_action_cleanup(p);
>   module_put(owner);
>   return 0;
>   }
> @@ -615,6 +622,11 @@ int tcf_action_destroy(struct list_head *actions, int 
> bind)
>   return ret;
>  }
>  
> +static int tcf_action_put(struct tc_action *p)
> +{
> + return __tcf_action_put(p, false);
> +}
> +
>  int
>  tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int 
> ref)
>  {
> @@ -1092,6 +1104,35 @@ static int tca_action_flush(struct net *net, struct 
> nlattr *nla,
>   return err;
>  }
>  
> +static int tcf_action_delete(struct net *net, struct list_head *actions,
> +

Re: [PATCH v3 03/11] net: sched: implement unlocked action init API

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:21AM +0300, Vlad Buslov wrote:
> Add additional 'rtnl_held' argument to act API init functions. It is
> required to implement actions that need to release rtnl lock before loading
> kernel module and reacquire if afterwards.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 

> ---
> Changes from V1 to V2:
> - Rename "unlocked" to "rtnl_held" for clarity.
> 
>  include/net/act_api.h  |  6 --
>  net/sched/act_api.c| 18 +++---
>  net/sched/act_bpf.c|  3 ++-
>  net/sched/act_connmark.c   |  2 +-
>  net/sched/act_csum.c   |  3 ++-
>  net/sched/act_gact.c   |  3 ++-
>  net/sched/act_ife.c|  3 ++-
>  net/sched/act_ipt.c|  6 --
>  net/sched/act_mirred.c |  5 +++--
>  net/sched/act_nat.c|  2 +-
>  net/sched/act_pedit.c  |  3 ++-
>  net/sched/act_police.c |  2 +-
>  net/sched/act_sample.c |  3 ++-
>  net/sched/act_simple.c |  3 ++-
>  net/sched/act_skbedit.c|  3 ++-
>  net/sched/act_skbmod.c |  3 ++-
>  net/sched/act_tunnel_key.c |  3 ++-
>  net/sched/act_vlan.c   |  3 ++-
>  net/sched/cls_api.c|  5 +++--
>  19 files changed, 50 insertions(+), 29 deletions(-)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index e634014605cb..888ff471bbf6 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -92,7 +92,8 @@ struct tc_action_ops {
> struct netlink_ext_ack *extack);
>   int (*init)(struct net *net, struct nlattr *nla,
>   struct nlattr *est, struct tc_action **act, int ovr,
> - int bind, struct netlink_ext_ack *extack);
> + int bind, bool rtnl_held,
> + struct netlink_ext_ack *extack);
>   int (*walk)(struct net *, struct sk_buff *,
>   struct netlink_callback *, int,
>   const struct tc_action_ops *,
> @@ -168,10 +169,11 @@ int tcf_action_exec(struct sk_buff *skb, struct 
> tc_action **actions,
>  int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr 
> *nla,
>   struct nlattr *est, char *name, int ovr, int bind,
>   struct list_head *actions, size_t *attr_size,
> - struct netlink_ext_ack *extack);
> + bool rtnl_held, struct netlink_ext_ack *extack);
>  struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
>   struct nlattr *nla, struct nlattr *est,
>   char *name, int ovr, int bind,
> + bool rtnl_held,
>   struct netlink_ext_ack *extack);
>  int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
>  int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 4f064ecab882..256b0c93916c 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr 
> **tb)
>  struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
>   struct nlattr *nla, struct nlattr *est,
>   char *name, int ovr, int bind,
> + bool rtnl_held,
>   struct netlink_ext_ack *extack)
>  {
>   struct tc_action *a;
> @@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, 
> struct tcf_proto *tp,
>   a_o = tc_lookup_action_n(act_name);
>   if (a_o == NULL) {
>  #ifdef CONFIG_MODULES
> - rtnl_unlock();
> + if (rtnl_held)
> + rtnl_unlock();
>   request_module("act_%s", act_name);
> - rtnl_lock();
> + if (rtnl_held)
> + rtnl_lock();
>  
>   a_o = tc_lookup_action_n(act_name);
>  
> @@ -746,9 +749,10 @@ struct tc_action *tcf_action_init_1(struct net *net, 
> struct tcf_proto *tp,
>   /* backward compatibility for policer */
>   if (name == NULL)
>   err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
> - extack);
> + rtnl_held, extack);
>   else
> - err = a_o->init(net, nla, est, &a, ovr, bind, extack);
> + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
> + extack);
>   if (err < 0)
>   goto err_mod;
>  
> @@

Re: [PATCH v3 08/11] net: sched: don't release reference on action overwrite

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:26AM +0300, Vlad Buslov wrote:
> Return from action init function with reference to action taken,
> even when overwriting existing action.
> 
> Action init API initializes its fourth argument (pointer to pointer to tc
> action) to either an existing action with the same index or a newly
> created action. In case of an existing index (and a zero bind argument),
> the init function returns without incrementing the action reference
> counter. The caller of action init then proceeds working with the action
> without actually holding a reference to it. This means that the action
> could be deleted concurrently.
> 
> Change action init behavior to always take reference to action before
> returning successfully, in order to protect from concurrent deletion.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 
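
The practical effect is on the caller side: after a successful init(), the
caller now always owns one reference, whether the action was newly created
or an existing one was overwritten, so the pointer cannot be freed
underneath it by a concurrent delete. A hedged sketch of the resulting
caller pattern (use_action() is hypothetical; tcf_action_put() is the
helper added earlier in the series):

	struct tc_action *a;
	int err;

	err = ops->init(net, nla, est, &a, ovr, bind, rtnl_held, extack);
	if (err < 0)
		return err;
	/* 'a' is pinned by the reference init() took for us */
	use_action(a);
	tcf_action_put(a);   /* drop our reference when done */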

> ---
> Changes from V1 to V2:
> - Resplit action lookup/release code to prevent memory leaks in
>   individual patches.
> - Change convoluted commit message.
> 
>  net/sched/act_api.c|  2 --
>  net/sched/act_bpf.c|  8 
>  net/sched/act_connmark.c   |  5 +++--
>  net/sched/act_csum.c   |  8 
>  net/sched/act_gact.c   |  5 +++--
>  net/sched/act_ife.c| 12 +---
>  net/sched/act_ipt.c|  5 +++--
>  net/sched/act_mirred.c |  5 ++---
>  net/sched/act_nat.c|  5 +++--
>  net/sched/act_pedit.c  |  5 +++--
>  net/sched/act_police.c |  8 +++-
>  net/sched/act_sample.c |  8 +++-
>  net/sched/act_simple.c |  5 +++--
>  net/sched/act_skbedit.c|  5 +++--
>  net/sched/act_skbmod.c |  8 +++-
>  net/sched/act_tunnel_key.c |  8 +++-
>  net/sched/act_vlan.c   |  8 +++-
>  17 files changed, 51 insertions(+), 59 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index a023873db713..f019f0464cec 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -870,8 +870,6 @@ int tcf_action_init(struct net *net, struct tcf_proto 
> *tp, struct nlattr *nla,
>   }
>   act->order = i;
>   sz += tcf_action_fill_size(act);
> - if (ovr)
> - refcount_inc(&act->tcfa_refcnt);
>   list_add_tail(&act->list, actions);
>   }
>  
> diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
> index 7941dd66ff83..d3f4ac6f2c4b 100644
> --- a/net/sched/act_bpf.c
> +++ b/net/sched/act_bpf.c
> @@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr 
> *nla,
>   if (bind)
>   return 0;
>  
> - tcf_idr_release(*act, bind);
> - if (!replace)
> + if (!replace) {
> + tcf_idr_release(*act, bind);
>   return -EEXIST;
> + }
>   }
>  
>   is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
> @@ -356,8 +357,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr 
> *nla,
>  
>   return res;
>  out:
> - if (res == ACT_P_CREATED)
> - tcf_idr_release(*act, bind);
> + tcf_idr_release(*act, bind);
>  
>   return ret;
>  }
> diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
> index 143c2d3de723..701e90244eff 100644
> --- a/net/sched/act_connmark.c
> +++ b/net/sched/act_connmark.c
> @@ -135,9 +135,10 @@ static int tcf_connmark_init(struct net *net, struct 
> nlattr *nla,
>   ci = to_connmark(*a);
>   if (bind)
>   return 0;
> - tcf_idr_release(*a, bind);
> - if (!ovr)
> + if (!ovr) {
> + tcf_idr_release(*a, bind);
>   return -EEXIST;
> + }
>   /* replacing action and zone */
>   ci->tcf_action = parm->action;
>   ci->zone = parm->zone;
> diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
> index 3768539340e0..5dbee136b0a1 100644
> --- a/net/sched/act_csum.c
> +++ b/net/sched/act_csum.c
> @@ -76,9 +76,10 @@ static int tcf_csum_init(struct net *net, struct nlattr 
> *nla,
>   } else {
>   if (bind)/* dont override defaults */
>   return 0;
> - tcf_idr_release(*a, bind);
> - if (!ovr)
> + if (!ovr) {
> + tcf_idr_release(*a, bind);
>   return -EEXIST;
> + }
>   }
>  
>   p = to_tcf_csum(*a);
> @@ -86,8 +87,7 @@ static int tcf_csum_init(struct net *net, struct nlattr 
> *nla,
>  
>   params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
>   

Re: [PATCH v3 01/11] net: sched: use rcu for action cookie update

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:19AM +0300, Vlad Buslov wrote:
> Implement functions to atomically update and free action cookie
> using the rcu mechanism.
> 
> Signed-off-by: Vlad Buslov 
> Signed-off-by: Jiri Pirko 

Reviewed-by: Marcelo Ricardo Leitner 
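
The pattern here is the classic RCU publish/retire: xchg() atomically swaps
the published cookie pointer, and call_rcu() defers freeing the old one
until every reader that might still hold it under rcu_read_lock() has left
its critical section. A minimal sketch of both sides, following the hunks
below (consume() is hypothetical):

	/* writer: publish the new cookie, retire the old one */
	old = xchg(&a->act_cookie, new_cookie);
	if (old)
		call_rcu(&old->rcu, tcf_free_cookie_rcu);

	/* reader: pointer stays valid for the whole read section */
	rcu_read_lock();
	cookie = rcu_dereference(a->act_cookie);
	if (cookie)
		consume(cookie->data, cookie->len);
	rcu_read_unlock();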

> ---
>  include/net/act_api.h |  2 +-
>  include/net/pkt_cls.h |  1 +
>  net/sched/act_api.c   | 44 ++--
>  3 files changed, 32 insertions(+), 15 deletions(-)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index 9e59ebfded62..f7b59ef7303d 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -37,7 +37,7 @@ struct tc_action {
>   spinlock_t  tcfa_lock;
>   struct gnet_stats_basic_cpu __percpu *cpu_bstats;
>   struct gnet_stats_queue __percpu *cpu_qstats;
> - struct tc_cookie *act_cookie;
> + struct tc_cookie __rcu *act_cookie;
>   struct tcf_chain*goto_chain;
>  };
>  #define tcf_index common.tcfa_index
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index e828d31be5da..3068cc8aa0f1 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -769,6 +769,7 @@ struct tc_mqprio_qopt_offload {
>  struct tc_cookie {
>   u8  *data;
>   u32 len;
> + struct rcu_head rcu;
>  };
>  
>  struct tc_qopt_offload_stats {
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 3f4cf930f809..02670c7489e3 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct 
> tc_action *a,
>   res->goto_tp = rcu_dereference_bh(chain->filter_chain);
>  }
>  
> +static void tcf_free_cookie_rcu(struct rcu_head *p)
> +{
> + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
> +
> + kfree(cookie->data);
> + kfree(cookie);
> +}
> +
> +static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
> +   struct tc_cookie *new_cookie)
> +{
> + struct tc_cookie *old;
> +
> + old = xchg(old_cookie, new_cookie);
> + if (old)
> + call_rcu(&old->rcu, tcf_free_cookie_rcu);
> +}
> +
>  /* XXX: For standalone actions, we don't need a RCU grace period either, 
> because
>   * actions are always connected to filters and filters are already destroyed 
> in
>   * RCU callbacks, so after a RCU grace period actions are already 
> disconnected
> @@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p)
>   free_percpu(p->cpu_bstats);
>   free_percpu(p->cpu_qstats);
>  
> - if (p->act_cookie) {
> - kfree(p->act_cookie->data);
> - kfree(p->act_cookie);
> - }
> + tcf_set_action_cookie(&p->act_cookie, NULL);
>   if (p->goto_chain)
>   tcf_action_goto_chain_fini(p);
>  
> @@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action 
> *a, int bind, int ref)
>   int err = -EINVAL;
>   unsigned char *b = skb_tail_pointer(skb);
>   struct nlattr *nest;
> + struct tc_cookie *cookie;
>  
>   if (nla_put_string(skb, TCA_KIND, a->ops->kind))
>   goto nla_put_failure;
>   if (tcf_action_copy_stats(skb, a, 0))
>   goto nla_put_failure;
> - if (a->act_cookie) {
> - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
> - a->act_cookie->data))
> +
> + rcu_read_lock();
> + cookie = rcu_dereference(a->act_cookie);
> + if (cookie) {
> + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
> + rcu_read_unlock();
>   goto nla_put_failure;
> + }
>   }
> + rcu_read_unlock();
>  
>   nest = nla_nest_start(skb, TCA_OPTIONS);
>   if (nest == NULL)
> @@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, 
> struct tcf_proto *tp,
>   if (err < 0)
>   goto err_mod;
>  
> - if (name == NULL && tb[TCA_ACT_COOKIE]) {
> - if (a->act_cookie) {
> - kfree(a->act_cookie->data);
> - kfree(a->act_cookie);
> - }
> - a->act_cookie = cookie;
> - }
> + if (!name && tb[TCA_ACT_COOKIE])
> + tcf_set_action_cookie(&a->act_cookie, cookie);
>  
>   /* module count goes up only when brand new policy is created
>* if it exists and is only bound to in a_o->init() then
> -- 
> 2.7.5
> 


Re: [PATCH v3 06/11] net: sched: add 'delete' function to action ops

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:24AM +0300, Vlad Buslov wrote:
> Extend action ops with a 'delete' function. Each action type implements
> its own delete function that doesn't depend on the rtnl lock.
> 
> Implement delete function that is required to delete actions without
> holding rtnl lock. Use action API function that atomically deletes action
> only if it is still in action idr. This implementation prevents concurrent
> threads from deleting the same action twice.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 
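
Each per-type delete callback below is the same three lines because
tcf_idr_delete_index() needs the per-netns action table, and only the
action module knows its own net_generic() id. Sketch of the shape every
module adds (example_net_id is hypothetical):

	static int tcf_example_delete(struct net *net, u32 index)
	{
		/* resolve this module's per-netns action table */
		struct tc_action_net *tn = net_generic(net, example_net_id);

		return tcf_idr_delete_index(tn, index);
	}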

> ---
> Changes from V1 to V2:
> - Merge action ops delete definition and implementation.
> 
>  include/net/act_api.h  |  1 +
>  net/sched/act_bpf.c|  8 
>  net/sched/act_connmark.c   |  8 
>  net/sched/act_csum.c   |  8 
>  net/sched/act_gact.c   |  8 
>  net/sched/act_ife.c|  8 
>  net/sched/act_ipt.c| 16 
>  net/sched/act_mirred.c |  8 
>  net/sched/act_nat.c|  8 
>  net/sched/act_pedit.c  |  8 
>  net/sched/act_police.c |  8 
>  net/sched/act_sample.c |  8 
>  net/sched/act_simple.c |  8 
>  net/sched/act_skbedit.c|  8 
>  net/sched/act_skbmod.c |  8 
>  net/sched/act_tunnel_key.c |  8 
>  net/sched/act_vlan.c   |  8 
>  17 files changed, 137 insertions(+)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index d94ec6400673..d256e20507b9 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -101,6 +101,7 @@ struct tc_action_ops {
>   void(*stats_update)(struct tc_action *, u64, u32, u64);
>   size_t  (*get_fill_size)(const struct tc_action *act);
>   struct net_device *(*get_dev)(const struct tc_action *a);
> + int (*delete)(struct net *net, u32 index);
>  };
>  
>  struct tc_action_net {
> diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
> index 8ebf40a3506c..7941dd66ff83 100644
> --- a/net/sched/act_bpf.c
> +++ b/net/sched/act_bpf.c
> @@ -388,6 +388,13 @@ static int tcf_bpf_search(struct net *net, struct 
> tc_action **a, u32 index,
>   return tcf_idr_search(tn, a, index);
>  }
>  
> +static int tcf_bpf_delete(struct net *net, u32 index)
> +{
> + struct tc_action_net *tn = net_generic(net, bpf_net_id);
> +
> + return tcf_idr_delete_index(tn, index);
> +}
> +
>  static struct tc_action_ops act_bpf_ops __read_mostly = {
>   .kind   =   "bpf",
>   .type   =   TCA_ACT_BPF,
> @@ -398,6 +405,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
>   .init   =   tcf_bpf_init,
>   .walk   =   tcf_bpf_walker,
>   .lookup =   tcf_bpf_search,
> + .delete =   tcf_bpf_delete,
>   .size   =   sizeof(struct tcf_bpf),
>  };
>  
> diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
> index e3787aa0025a..143c2d3de723 100644
> --- a/net/sched/act_connmark.c
> +++ b/net/sched/act_connmark.c
> @@ -193,6 +193,13 @@ static int tcf_connmark_search(struct net *net, struct 
> tc_action **a, u32 index,
>   return tcf_idr_search(tn, a, index);
>  }
>  
> +static int tcf_connmark_delete(struct net *net, u32 index)
> +{
> + struct tc_action_net *tn = net_generic(net, connmark_net_id);
> +
> + return tcf_idr_delete_index(tn, index);
> +}
> +
>  static struct tc_action_ops act_connmark_ops = {
>   .kind   =   "connmark",
>   .type   =   TCA_ACT_CONNMARK,
> @@ -202,6 +209,7 @@ static struct tc_action_ops act_connmark_ops = {
>   .init   =   tcf_connmark_init,
>   .walk   =   tcf_connmark_walker,
>   .lookup =   tcf_connmark_search,
> + .delete =   tcf_connmark_delete,
>   .size   =   sizeof(struct tcf_connmark_info),
>  };
>  
> diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
> index 334261943f9f..3768539340e0 100644
> --- a/net/sched/act_csum.c
> +++ b/net/sched/act_csum.c
> @@ -654,6 +654,13 @@ static size_t tcf_csum_get_fill_size(const struct 
> tc_action *act)
>   return nla_total_size(sizeof(struct tc_csum));
>  }
>  
> +static int tcf_csum_delete(struct net *net, u32 index)
> +{
> + struct tc_action_net *tn = net_generic(net, csum_net_id);
> +
> + return tcf_idr_delete_index(tn, index);
> +}
> +
>  static struct tc_action_ops act_csum_ops = {
>   .kind   = "csum",
>   .type   = TCA_ACT_CSUM,
> @@ -665,6 +672,7 @@ static struct tc_ac

Re: [PATCH v3 09/11] net: sched: use reference counting action init

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:27AM +0300, Vlad Buslov wrote:
> Change action API to assume that action init function always takes
> reference to action, even when overwriting existing action. This is
> necessary because action API continues to use action pointer after init
> function is done. At this point action becomes accessible for concurrent
> modifications, so user must always hold reference to it.
> 
> Implement a helper put-list function to atomically release a list of
> actions after the action API init code is done using them.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 
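
Note the ownership flow this creates in tcf_action_add(): init returns the
actions referenced, the notify is sent, and only on overwrite are the extra
references dropped afterwards. A simplified sketch of that flow (condensed
from the hunks below):

	LIST_HEAD(actions);

	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
			      &actions, &attr_size, true, extack);
	if (ret)
		return ret;
	ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack);
	if (ovr)
		tcf_action_put_lst(&actions); /* drop init's extra refs */
	return ret;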

> ---
> Changes from V1 to V2:
> - Resplit action lookup/release code to prevent memory leaks in
>   individual patches.
> 
>  net/sched/act_api.c | 35 +--
>  1 file changed, 17 insertions(+), 18 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index f019f0464cec..eefe8c2fe667 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -627,6 +627,18 @@ static int tcf_action_put(struct tc_action *p)
>   return __tcf_action_put(p, false);
>  }
>  
> +static void tcf_action_put_lst(struct list_head *actions)
> +{
> + struct tc_action *a, *tmp;
> +
> + list_for_each_entry_safe(a, tmp, actions, list) {
> + const struct tc_action_ops *ops = a->ops;
> +
> + if (tcf_action_put(a))
> + module_put(ops->owner);
> + }
> +}
> +
>  int
>  tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int 
> ref)
>  {
> @@ -835,17 +847,6 @@ struct tc_action *tcf_action_init_1(struct net *net, 
> struct tcf_proto *tp,
>   return ERR_PTR(err);
>  }
>  
> -static void cleanup_a(struct list_head *actions, int ovr)
> -{
> - struct tc_action *a;
> -
> - if (!ovr)
> - return;
> -
> - list_for_each_entry(a, actions, list)
> - refcount_dec(&a->tcfa_refcnt);
> -}
> -
>  int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr 
> *nla,
>   struct nlattr *est, char *name, int ovr, int bind,
>   struct list_head *actions, size_t *attr_size,
> @@ -874,11 +875,6 @@ int tcf_action_init(struct net *net, struct tcf_proto 
> *tp, struct nlattr *nla,
>   }
>  
>   *attr_size = tcf_action_full_attrs_size(sz);
> -
> - /* Remove the temp refcnt which was necessary to protect against
> -  * destroying an existing action which was being replaced
> -  */
> - cleanup_a(actions, ovr);
>   return 0;
>  
>  err:
> @@ -1209,7 +1205,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, 
> struct nlmsghdr *n,
>   return ret;
>   }
>  err:
> - tcf_action_destroy(&actions, 0);
> + tcf_action_put_lst(&actions);
>   return ret;
>  }
>  
> @@ -1251,8 +1247,11 @@ static int tcf_action_add(struct net *net, struct 
> nlattr *nla,
> &attr_size, true, extack);
>   if (ret)
>   return ret;
> + ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack);
> + if (ovr)
> + tcf_action_put_lst(&actions);
>  
> - return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
> + return ret;
>  }
>  
>  static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
> -- 
> 2.7.5
> 


Re: [PATCH v3 10/11] net: sched: atomically check-allocate action

2018-05-28 Thread Marcelo Ricardo Leitner
On Mon, May 28, 2018 at 12:17:28AM +0300, Vlad Buslov wrote:
> Implement function that atomically checks if action exists and either takes
> reference to it, or allocates idr slot for action index to prevent
> concurrent allocations of actions with the same index. Use an EBUSY error
> pointer to indicate that the idr slot is reserved.
> 
> Implement a cleanup helper function that removes the temporary error
> pointer from the idr (in case of an error between idr allocation and
> insertion of the newly created action at the specified index).
> 
> Refactor all action init functions to insert the new action into the idr
> using this API.
> 
> Signed-off-by: Vlad Buslov 

Reviewed-by: Marcelo Ricardo Leitner 
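
The trick is reserving the index with an error-pointer placeholder:
ERR_PTR(-EBUSY) occupies the idr slot, so a concurrent creator can neither
allocate the same index nor mistake the reservation for a live action, and
lookups must treat IS_ERR() entries as "not found". A hedged sketch of the
check-alloc flow (simplified from the patch):

	spin_lock(&idrinfo->lock);
	p = idr_find(&idrinfo->action_idr, index);
	if (IS_ERR(p)) {
		ret = -EBUSY;          /* reserved by a concurrent init */
	} else if (p) {
		refcount_inc(&p->tcfa_refcnt);  /* exists: take a ref */
		ret = 1;
	} else {
		/* reserve the slot until the new action is inserted */
		ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY),
				    &index, index, GFP_ATOMIC);
	}
	spin_unlock(&idrinfo->lock);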

> ---
> Changes from V1 to V2:
> - Remove unique idr insertion function. Change original idr insert to do
>   the same thing.
> - Refactor action check-alloc code into standalone function.
> 
>  include/net/act_api.h  |  3 ++
>  net/sched/act_api.c| 92 
> --
>  net/sched/act_bpf.c| 11 --
>  net/sched/act_connmark.c   | 10 +++--
>  net/sched/act_csum.c   | 11 --
>  net/sched/act_gact.c   | 11 --
>  net/sched/act_ife.c|  6 ++-
>  net/sched/act_ipt.c| 13 ++-
>  net/sched/act_mirred.c | 16 ++--
>  net/sched/act_nat.c| 11 --
>  net/sched/act_pedit.c  | 15 ++--
>  net/sched/act_police.c |  9 -
>  net/sched/act_sample.c | 11 --
>  net/sched/act_simple.c | 11 +-
>  net/sched/act_skbedit.c| 11 +-
>  net/sched/act_skbmod.c | 11 +-
>  net/sched/act_tunnel_key.c |  9 -
>  net/sched/act_vlan.c   | 17 -
>  18 files changed, 218 insertions(+), 60 deletions(-)
> 
> diff --git a/include/net/act_api.h b/include/net/act_api.h
> index d256e20507b9..cd4547476074 100644
> --- a/include/net/act_api.h
> +++ b/include/net/act_api.h
> @@ -154,6 +154,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>  int bind, bool cpustats);
>  void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>  
> +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
> +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
> + struct tc_action **a, int bind);
>  int tcf_idr_delete_index(struct tc_action_net *tn, u32 index);
>  int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
>  
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index eefe8c2fe667..9511502e1cbb 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -303,7 +303,9 @@ static bool __tcf_idr_check(struct tc_action_net *tn, u32 
> index,
>  
>   spin_lock(&idrinfo->lock);
>   p = idr_find(&idrinfo->action_idr, index);
> - if (p) {
> + if (IS_ERR(p)) {
> + p = NULL;
> + } else if (p) {
>   refcount_inc(&p->tcfa_refcnt);
>   if (bind)
>   atomic_inc(&p->tcfa_bindcnt);
> @@ -371,7 +373,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>  {
>   struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
>   struct tcf_idrinfo *idrinfo = tn->idrinfo;
> - struct idr *idr = &idrinfo->action_idr;
>   int err = -ENOMEM;
>  
>   if (unlikely(!p))
> @@ -389,20 +390,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>   goto err2;
>   }
>   spin_lock_init(>tcfa_lock);
> - idr_preload(GFP_KERNEL);
> - spin_lock(&idrinfo->lock);
> - /* user doesn't specify an index */
> - if (!index) {
> - index = 1;
> - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
> - } else {
> - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
> - }
> - spin_unlock(&idrinfo->lock);
> - idr_preload_end();
> - if (err)
> - goto err3;
> -
>   p->tcfa_index = index;
>   p->tcfa_tm.install = jiffies;
>   p->tcfa_tm.lastuse = jiffies;
> @@ -412,7 +399,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>   &p->tcfa_rate_est,
>   &p->tcfa_lock, NULL, est);
>   if (err)
> - goto err4;
> + goto err3;
>   }
>  
>   p->idrinfo = idrinfo;
> @@ -420,8 +407,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, 
> struct nlattr *est,
>   INIT_LIST_HEAD(&p->list);
>   *a = p;
>   return 0;
> -err4:
> - idr_remove(idr, index);
>  err3:
>   free_percp
