Re: [PATCH net 1/3] flex_array: make FLEX_ARRAY_BASE_SIZE the same value of FLEX_ARRAY_PART_SIZE

2018-12-05 Thread Xin Long
On Thu, Dec 6, 2018 at 1:38 PM David Miller  wrote:
>
> From: Xin Long 
> Date: Wed,  5 Dec 2018 14:49:40 +0800
>
> > This patch is to separate the base data memory from struct flex_array and
> > save it into a page. With this change, total_nr_elements of a flex_array
> > can grow or shrink without having the old element's memory changed when
> > the new size of the flex_array crosses FLEX_ARRAY_BASE_SIZE, which will
> > be added in the next patch.
> >
> > Suggested-by: Neil Horman 
> > Signed-off-by: Xin Long 
>
> This needs to be reviewed by the flex array hackers and lkml.
>
> It can't just get reviewed on netdev alone.
Will repost with CCing lkml and
the author:
  "Dave Hansen "
and two contributors:
  "David Rientjes ", "Eric Paris "

Thanks.


[PATCH net 2/3] flex_array: support flex_array_resize

2018-12-04 Thread Xin Long
This function dynamically changes total_nr_elements of a flex_array while
keeping the existing elements at the same memory. It returns 0 on success.

Note that it won't do any memory allocation or shrinking for elements,
which should be only done by flex_array_prealloc and flex_array_shrink.

Suggested-by: Neil Horman 
Signed-off-by: Xin Long 
---
 include/linux/flex_array.h | 11 +
 lib/flex_array.c   | 58 ++
 2 files changed, 69 insertions(+)

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 29ad65f..19ff58d 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -130,6 +130,17 @@ void *flex_array_get(struct flex_array *fa, unsigned int 
element_nr);
  */
 int flex_array_shrink(struct flex_array *fa);
 
+/**
+ * flex_array_resize() - Resize without the old elements memory changed
+ * @fa:array to resize
+ * @total: total number of elements that this would change to
+ * @flags: page allocation flags to use for base array
+ *
+ * Return: Returns 0 if it succeeds.
+ *
+ */
+int flex_array_resize(struct flex_array *fa, unsigned int total, gfp_t flags);
+
 #define flex_array_put_ptr(fa, nr, src, gfp) \
flex_array_put(fa, nr, (void *)&(src), gfp)
 
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 8c0b9b6..2f913e7 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -405,3 +405,61 @@ int flex_array_shrink(struct flex_array *fa)
return ret;
 }
 EXPORT_SYMBOL(flex_array_shrink);
+
+/**
+ * flex_array_resize - resize without the old elements memory changed
+ * @fa:the flex array to resize
+ * @total: total number of elements that this would change to
+ * @flags: page allocation flags to use for base array
+ *
+ * This function can dynamically change total_nr_elements of a flex_array,
+ * and keep the old elements of the same memory. Returns 0 if it succeeds.
+ * Note that it won't do any memory allocation or shrinking for elements,
+ * which should be only done by flex_array_prealloc and flex_array_shrink.
+ *
+ * Locking must be provided by the caller.
+ */
+int flex_array_resize(struct flex_array *fa, unsigned int total, gfp_t flags)
+{
+   int nr;
+
+   if (total > FLEX_ARRAY_NR_BASE_PTRS * fa->elems_per_part)
+   return -EINVAL;
+
+   if (elements_fit_in_base(fa)) {
+   struct flex_array_part_p *part_p;
+
+   nr = fa->total_nr_elements;
+   fa->total_nr_elements = total;
+   if (elements_fit_in_base(fa))
+   return 0;
+
+   part_p = kzalloc(sizeof(*part_p), flags);
+   if (!part_p) {
+   fa->total_nr_elements = nr;
+   return -ENOMEM;
+   }
+
+   part_p->p_part[0] = (struct flex_array_part *)&fa->parts[0];
+   fa->part_p = part_p;
+   } else {
+   struct flex_array_part *part;
+
+   fa->total_nr_elements = total;
+   if (!elements_fit_in_base(fa))
+   return 0;
+
+   for (nr = 1; nr < FLEX_ARRAY_NR_BASE_PTRS; nr++) {
+   part = fa->parts[nr];
+   if (part) {
+   fa->parts[nr] = NULL;
+   kfree(part);
+   }
+   }
+
+   fa->part_p = (struct flex_array_part_p *)fa->parts[0];
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL(flex_array_resize);
-- 
2.1.0



[PATCH net 3/3] sctp: fa_resize sctp stream instead of redo fa_alloc

2018-12-04 Thread Xin Long
Currently, during the 4-way handshake or when adding new streams, sctp has
to allocate new memory for asoc->stream and copy the old stream's
information from the old asoc->stream to the new one. This also causes the
stream pointers to change, which has even caused a panic because of the
change to stream->out_curr.

To fix this, flex_array_resize() is used in sctp_stream_alloc_out/in()
when asoc->stream has been allocated. Besides, with this asoc->stream
will only be allocated once, and grow or shrink dynamically later.

Note that flex_array_prealloc() is needed before growing as fa_alloc
does, while flex_array_clear() and flex_array_shrink() are called to
free the unused memory before shrinking.

Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Ying Xu 
Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
Suggested-by: Neil Horman 
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 87 +--
 1 file changed, 40 insertions(+), 47 deletions(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 3892e76..aff30b2 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,6 +37,17 @@
 #include 
 #include 
 
+static void fa_zero(struct flex_array *fa, size_t index, size_t count)
+{
+   void *elem;
+
+   while (count--) {
+   elem = flex_array_get(fa, index);
+   memset(elem, 0, fa->element_size);
+   index++;
+   }
+}
+
 static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
   gfp_t gfp)
 {
@@ -48,8 +59,9 @@ static struct flex_array *fa_alloc(size_t elem_size, size_t 
elem_count,
err = flex_array_prealloc(result, 0, elem_count, gfp);
if (err) {
flex_array_free(result);
-   result = NULL;
+   return NULL;
}
+   fa_zero(result, 0, elem_count);
}
 
return result;
@@ -61,27 +73,28 @@ static void fa_free(struct flex_array *fa)
flex_array_free(fa);
 }
 
-static void fa_copy(struct flex_array *fa, struct flex_array *from,
-   size_t index, size_t count)
+static int fa_resize(struct flex_array *fa, size_t count, gfp_t gfp)
 {
-   void *elem;
+   int nr = fa->total_nr_elements, n;
 
-   while (count--) {
-   elem = flex_array_get(from, index);
-   flex_array_put(fa, index, elem, 0);
-   index++;
+   if (count > nr) {
+   if (flex_array_resize(fa, count, gfp))
+   return -ENOMEM;
+   if (flex_array_prealloc(fa, nr, count - nr, gfp))
+   return -ENOMEM;
+   fa_zero(fa, nr, count - nr);
+
+   return 0;
}
-}
 
-static void fa_zero(struct flex_array *fa, size_t index, size_t count)
-{
-   void *elem;
+   /* Shrink the unused memory,
+* FLEX_ARRAY_FREE check is safe for sctp stream.
+*/
+   for (n = count; n < nr; n++)
+   flex_array_clear(fa, n);
+   flex_array_shrink(fa);
 
-   while (count--) {
-   elem = flex_array_get(fa, index);
-   memset(elem, 0, fa->element_size);
-   index++;
-   }
+   return flex_array_resize(fa, count, gfp);
 }
 
 /* Migrates chunks from stream queues to new stream queues if needed,
@@ -138,47 +151,27 @@ static void sctp_stream_outq_migrate(struct sctp_stream 
*stream,
 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 gfp_t gfp)
 {
-   struct flex_array *out;
-   size_t elem_size = sizeof(struct sctp_stream_out);
-
-   out = fa_alloc(elem_size, outcnt, gfp);
-   if (!out)
-   return -ENOMEM;
+   if (!stream->out) {
+   stream->out = fa_alloc(sizeof(struct sctp_stream_out),
+  outcnt, gfp);
 
-   if (stream->out) {
-   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
-   fa_free(stream->out);
+   return stream->out ? 0 : -ENOMEM;
}
 
-   if (outcnt > stream->outcnt)
-   fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
-
-   stream->out = out;
-
-   return 0;
+   return fa_resize(stream->out, outcnt, gfp);
 }
 
 static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
gfp_t gfp)
 {
-   struct flex_array *in;
-   size_t elem_size = sizeof(struct sctp_stream_in);
+   if (!stream->in) {
+   stream->in = fa_alloc(sizeof(struct sctp_stream_in),
+ incnt, gfp);
 
-   in = fa_alloc(elem_size, incnt, gfp);
-   if (!in)
-   return -ENOMEM;
-
-   if (stream->in) {
-

[PATCH net 1/3] flex_array: make FLEX_ARRAY_BASE_SIZE the same value of FLEX_ARRAY_PART_SIZE

2018-12-04 Thread Xin Long
This patch is to separate the base data memory from struct flex_array and
save it into a page. With this change, total_nr_elements of a flex_array
can grow or shrink without having the old element's memory changed when
the new size of the flex_array crosses FLEX_ARRAY_BASE_SIZE, which will
be added in the next patch.

Suggested-by: Neil Horman 
Signed-off-by: Xin Long 
---
 include/linux/flex_array.h | 29 +
 lib/flex_array.c   | 15 ---
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index b94fa61..29ad65f 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -7,9 +7,10 @@
 #include 
 
 #define FLEX_ARRAY_PART_SIZE PAGE_SIZE
-#define FLEX_ARRAY_BASE_SIZE PAGE_SIZE
+#define FLEX_ARRAY_BASE_SIZE FLEX_ARRAY_PART_SIZE
 
 struct flex_array_part;
+struct flex_array_part_p;
 
 /*
  * This is meant to replace cases where an array-like
@@ -19,29 +20,17 @@ struct flex_array_part;
  */
 
 struct flex_array {
-   union {
-   struct {
-   int element_size;
-   int total_nr_elements;
-   int elems_per_part;
-   struct reciprocal_value reciprocal_elems;
-   struct flex_array_part *parts[];
-   };
-   /*
-* This little trick makes sure that
-* sizeof(flex_array) == PAGE_SIZE
-*/
-   char padding[FLEX_ARRAY_BASE_SIZE];
-   };
+   int element_size;
+   int total_nr_elements;
+   int elems_per_part;
+   struct reciprocal_value reciprocal_elems;
+   struct flex_array_part_p *part_p;
+#define parts part_p->p_part
 };
 
-/* Number of bytes left in base struct flex_array, excluding metadata */
-#define FLEX_ARRAY_BASE_BYTES_LEFT \
-   (FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts))
-
 /* Number of pointers in base to struct flex_array_part pages */
 #define FLEX_ARRAY_NR_BASE_PTRS
\
-   (FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *))
+   (FLEX_ARRAY_BASE_SIZE / sizeof(struct flex_array_part *))
 
 /* Number of elements of size that fit in struct flex_array_part */
 #define FLEX_ARRAY_ELEMENTS_PER_PART(size) \
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 2eed22f..8c0b9b6 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -30,6 +30,10 @@ struct flex_array_part {
char elements[FLEX_ARRAY_PART_SIZE];
 };
 
+struct flex_array_part_p {
+   struct flex_array_part *p_part[FLEX_ARRAY_NR_BASE_PTRS];
+};
+
 /*
  * If a user requests an allocation which is small
  * enough, we may simply use the space in the
@@ -39,7 +43,7 @@ struct flex_array_part {
 static inline int elements_fit_in_base(struct flex_array *fa)
 {
int data_size = fa->element_size * fa->total_nr_elements;
-   if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT)
+   if (data_size <= FLEX_ARRAY_BASE_SIZE)
return 1;
return 0;
 }
@@ -105,13 +109,17 @@ struct flex_array *flex_array_alloc(int element_size, 
unsigned int total,
ret = kzalloc(sizeof(struct flex_array), flags);
if (!ret)
return NULL;
+   ret->part_p = kzalloc(sizeof(struct flex_array_part_p), flags);
+   if (!ret->part_p) {
+   kfree(ret);
+   return NULL;
+   }
ret->element_size = element_size;
ret->total_nr_elements = total;
ret->elems_per_part = elems_per_part;
ret->reciprocal_elems = reciprocal_elems;
if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
-   memset(&ret->parts[0], FLEX_ARRAY_FREE,
-   FLEX_ARRAY_BASE_BYTES_LEFT);
+   memset(&ret->parts[0], FLEX_ARRAY_FREE, FLEX_ARRAY_BASE_SIZE);
return ret;
 }
 EXPORT_SYMBOL(flex_array_alloc);
@@ -148,6 +156,7 @@ EXPORT_SYMBOL(flex_array_free_parts);
 void flex_array_free(struct flex_array *fa)
 {
flex_array_free_parts(fa);
+   kfree(fa->part_p);
kfree(fa);
 }
 EXPORT_SYMBOL(flex_array_free);
-- 
2.1.0



[PATCH net 0/3] net: add support for flex_array_resize in flex_array

2018-12-04 Thread Xin Long
Without support for growing or shrinking total_nr_elements dynamically,
flex_array is not that 'flexible'. For example, when users want to change
the size, they have to redo flex_array_alloc and copy all the elements
from the old array to the new one.  Worse, the memory of every element
changes.

To implement flex_array_resize based on current code, the difficult
thing is to process the size border of FLEX_ARRAY_BASE_BYTES_LEFT,
where the base data memory may change to an array for the 2nd level
data memory for growing, likewise for shrinking.

To make this part easier, we separate the base data memory and define
FLEX_ARRAY_BASE_SIZE as a same value of FLEX_ARRAY_PART_SIZE, as Neil
suggested.  When new size is crossing the border, the base memory is
allocated as the array for the 2nd level data memory and its part[0]
is pointed to the old base memory, and do the opposite for shrinking.

Note that flex_array_resize itself does not allocate or free any memory
for elements; that should be done by users calling flex_array_prealloc or
flex_array_shrink.  No memory leaks can be caused by that.

SCTP has benefited a lot from flex_array_resize() for managing its
stream memory so far.

Xin Long (3):
  flex_array: make FLEX_ARRAY_BASE_SIZE the same value of
FLEX_ARRAY_PART_SIZE
  flex_array: support flex_array_resize
  sctp: fa_resize sctp stream instead of redo fa_alloc

 include/linux/flex_array.h | 40 ++---
 lib/flex_array.c   | 73 --
 net/sctp/stream.c  | 87 +-
 3 files changed, 130 insertions(+), 70 deletions(-)

-- 
2.1.0



Re: [PATCHv2 net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-30 Thread Xin Long
On Sat, Dec 1, 2018 at 12:23 AM Neil Horman  wrote:
>
> On Fri, Nov 30, 2018 at 10:48:10PM +0900, Xin Long wrote:
> > On Fri, Nov 30, 2018 at 9:21 PM Neil Horman  wrote:
> > >
> > > On Fri, Nov 30, 2018 at 03:22:39PM +0900, Xin Long wrote:
> > > > On Thu, Nov 29, 2018 at 11:39 PM Neil Horman  
> > > > wrote:
> > > > >
> > > > > On Thu, Nov 29, 2018 at 02:42:56PM +0800, Xin Long wrote:
> > > > > > Now when using stream reconfig to add out streams, stream->out
> > > > > > will get re-allocated, and all old streams' information will
> > > > > > be copied to the new ones and the old ones will be freed.
> > > > > >
> > > > > > So without stream->out_curr updated, next time when trying to
> > > > > > send from stream->out_curr stream, a panic would be caused.
> > > > > >
> > > > > > This patch is to check and update stream->out_curr when
> > > > > > allocating stream_out.
> > > > > >
> > > > > > v1->v2:
> > > > > >   - define fa_index() to get elem index from stream->out_curr.
> > > > > >
> > > > > > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > > > > > Reported-by: Ying Xu 
> > > > > > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > > > > > Signed-off-by: Xin Long 
> > > > > > ---
> > > > > >  net/sctp/stream.c | 20 
> > > > > >  1 file changed, 20 insertions(+)
> > > > > >
> > > > > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > > > > index 3892e76..30e7809 100644
> > > > > > --- a/net/sctp/stream.c
> > > > > > +++ b/net/sctp/stream.c
> > > > > > @@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, 
> > > > > > size_t index, size_t count)
> > > > > >   }
> > > > > >  }
> > > > > >
> > > > > > +static size_t fa_index(struct flex_array *fa, void *elem, size_t 
> > > > > > count)
> > > > > > +{
> > > > > > + size_t index = 0;
> > > > > > +
> > > > > > + while (count--) {
> > > > > > + if (elem == flex_array_get(fa, index))
> > > > > > + break;
> > > > > > + index++;
> > > > > > + }
> > > > > > +
> > > > > > + return index;
> > > > > > +}
> > > > > > +
> > > > > >  /* Migrates chunks from stream queues to new stream queues if 
> > > > > > needed,
> > > > > >   * but not across associations. Also, removes those chunks to 
> > > > > > streams
> > > > > >   * higher than the new max.
> > > > > > @@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct 
> > > > > > sctp_stream *stream, __u16 outcnt,
> > > > > >
> > > > > >   if (stream->out) {
> > > > > >   fa_copy(out, stream->out, 0, min(outcnt, 
> > > > > > stream->outcnt));
> > > > > > + if (stream->out_curr) {
> > > > > > + size_t index = fa_index(stream->out, 
> > > > > > stream->out_curr,
> > > > > > + stream->outcnt);
> > > > > > +
> > > > > > + BUG_ON(index == stream->outcnt);
> > > > > > + stream->out_curr = flex_array_get(out, index);
> > > > > > + }
> > > > > >   fa_free(stream->out);
> > > > > >   }
> > > > > >
> > > > > > --
> > > > > > 2.1.0
> > > > > >
> > > > > >
> > > > >
> > > > > > This is the sort of thing I'm talking about. It's a little more code, 
> > > > > > but if you
> > > > > > augment the flex_array api like this, you can perform a resize 
> > > > > > operation on your
> > > > > existing flex array, and you can avoid all the copying, and need to 
> > > > > 

[PATCH net] sctp: kfree_rcu asoc

2018-11-30 Thread Xin Long
sctp_hash_transport and sctp_epaddr_lookup_transport dereference a
transport's asoc under rcu_read_lock, but the asoc is freed without waiting
for a grace period, which leads to a use-after-free panic.

This patch fixes it by calling kfree_rcu to make asoc be freed after
a grace period.

Note that only the asoc's memory is delayed to free in the patch, it
won't cause sk to linger longer.

Thanks to Neil and Marcelo for making this clear.

Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
rhashtable")
Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
transport")
Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
Suggested-by: Neil Horman 
Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h | 2 ++
 net/sctp/associola.c   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a11f937..feada35 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2075,6 +2075,8 @@ struct sctp_association {
 
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
+
+   struct rcu_head rcu;
 };
 
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96..3702f48 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -434,7 +434,7 @@ static void sctp_association_destroy(struct 
sctp_association *asoc)
 
WARN_ON(atomic_read(&asoc->rmem_alloc));
 
-   kfree(asoc);
+   kfree_rcu(asoc, rcu);
SCTP_DBG_OBJCNT_DEC(assoc);
 }
 
-- 
2.1.0



Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-30 Thread Xin Long
On Fri, Nov 30, 2018 at 11:27 PM Neil Horman  wrote:
>
> On Fri, Nov 30, 2018 at 11:15:50PM +0900, Xin Long wrote:
> > On Fri, Nov 30, 2018 at 10:33 PM Marcelo Ricardo Leitner
> >  wrote:
> > >
> > > On Fri, Nov 30, 2018 at 07:32:36AM -0500, Neil Horman wrote:
> > > > On Fri, Nov 30, 2018 at 02:04:16PM +0900, Xin Long wrote:
> > > > > On Fri, Nov 30, 2018 at 5:52 AM Neil Horman  
> > > > > wrote:
> > > > > >
> > > > > > On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> > > > > > > Without holding transport to dereference its asoc, a use after
> > > > > > > free panic can be caused in sctp_epaddr_lookup_transport. Note
> > > > > > > that a sock lock can't protect these transports that belong to
> > > > > > > other socks.
> > > > > > >
> > > > > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport
> > > > > > > before accessing its asoc in sctp_transport_get_next") is
> > > > > > > needed to hold the transport before accessing its asoc in
> > > > > > > sctp_epaddr_lookup_transport.
> > > > > > >
> > > > > > > Note that this extra atomic operation is on the datapath,
> > > > > > > but as rhlist keeps the lists to a small size, it won't
> > > > > > > see a noticeable performance hurt.
> > > > > > >
> > > > > > > v1->v2:
> > > > > > >   - improve the changelog.
> > > > > > >
> > > > > > > Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp 
> > > > > > > transport rhashtable")
> > > > > > > Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> > > > > > > Signed-off-by: Xin Long 
> > > > > > > ---
> > > > > > >  net/sctp/input.c | 10 --
> > > > > > >  1 file changed, 8 insertions(+), 2 deletions(-)
> > > > > > >
> > > > > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > > > > index 5c36a99..ce7351c 100644
> > > > > > > --- a/net/sctp/input.c
> > > > > > > +++ b/net/sctp/input.c
> > > > > > > @@ -967,9 +967,15 @@ struct sctp_transport 
> > > > > > > *sctp_epaddr_lookup_transport(
> > > > > > >   list = rhltable_lookup(_transport_hashtable, ,
> > > > > > >  sctp_hash_params);
> > > > > > >
> > > > > > > - rhl_for_each_entry_rcu(t, tmp, list, node)
> > > > > > > - if (ep == t->asoc->ep)
> > > > > > > + rhl_for_each_entry_rcu(t, tmp, list, node) {
> > > > > > > + if (!sctp_transport_hold(t))
> > > > > > > + continue;
> > > > > > > + if (ep == t->asoc->ep) {
> > > > > > > + sctp_transport_put(t);
> > > > > > >   return t;
> > > > > > > + }
> > > > > > > + sctp_transport_put(t);
> > > > > > > + }
> > > > > > >
> > > > > > >   return NULL;
> > > > > > >  }
> > > > > >
> > > > > > Wait a second, what if we just added an rcu_head to the association 
> > > > > > structure
> > > > > > and changed the kfree call in sctp_association_destroy to a 
> > > > > > kfree_rcu call
> > > > > > instead?  That would force the actual freeing of the association to 
> > > > > > pass through
> > > > > > a grace period, during which any in flight list traversal in
> > > > > > sctp_epaddr_lookup_transport could complete safely.  Its another 
> > > > > > two pointers
> > > > > We discussed this in last thread:
> > > > > https://www.spinics.net/lists/netdev/msg535191.html
> > > > >
> > > > > It will cause closed sk to linger longer.
> > > > >
> > > > Yes, but we never really got resolution on that topic.  I don't see 
> > > > that a
> > >
> > > Fair point. We should have brought bac

Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-30 Thread Xin Long
On Fri, Nov 30, 2018 at 10:33 PM Marcelo Ricardo Leitner
 wrote:
>
> On Fri, Nov 30, 2018 at 07:32:36AM -0500, Neil Horman wrote:
> > On Fri, Nov 30, 2018 at 02:04:16PM +0900, Xin Long wrote:
> > > On Fri, Nov 30, 2018 at 5:52 AM Neil Horman  wrote:
> > > >
> > > > On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> > > > > Without holding transport to dereference its asoc, a use after
> > > > > free panic can be caused in sctp_epaddr_lookup_transport. Note
> > > > > that a sock lock can't protect these transports that belong to
> > > > > other socks.
> > > > >
> > > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport
> > > > > before accessing its asoc in sctp_transport_get_next") is
> > > > > needed to hold the transport before accessing its asoc in
> > > > > sctp_epaddr_lookup_transport.
> > > > >
> > > > > Note that this extra atomic operation is on the datapath,
> > > > > but as rhlist keeps the lists to a small size, it won't
> > > > > see a noticeable performance hurt.
> > > > >
> > > > > v1->v2:
> > > > >   - improve the changelog.
> > > > >
> > > > > Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp 
> > > > > transport rhashtable")
> > > > > Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> > > > > Signed-off-by: Xin Long 
> > > > > ---
> > > > >  net/sctp/input.c | 10 --
> > > > >  1 file changed, 8 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > > index 5c36a99..ce7351c 100644
> > > > > --- a/net/sctp/input.c
> > > > > +++ b/net/sctp/input.c
> > > > > @@ -967,9 +967,15 @@ struct sctp_transport 
> > > > > *sctp_epaddr_lookup_transport(
> > > > >   list = rhltable_lookup(_transport_hashtable, ,
> > > > >  sctp_hash_params);
> > > > >
> > > > > - rhl_for_each_entry_rcu(t, tmp, list, node)
> > > > > - if (ep == t->asoc->ep)
> > > > > + rhl_for_each_entry_rcu(t, tmp, list, node) {
> > > > > + if (!sctp_transport_hold(t))
> > > > > + continue;
> > > > > + if (ep == t->asoc->ep) {
> > > > > + sctp_transport_put(t);
> > > > >   return t;
> > > > > + }
> > > > > + sctp_transport_put(t);
> > > > > + }
> > > > >
> > > > >   return NULL;
> > > > >  }
> > > >
> > > > Wait a second, what if we just added an rcu_head to the association 
> > > > structure
> > > > and changed the kfree call in sctp_association_destroy to a kfree_rcu 
> > > > call
> > > > instead?  That would force the actual freeing of the association to 
> > > > pass through
> > > > a grace period, during which any in flight list traversal in
> > > > sctp_epaddr_lookup_transport could complete safely.  Its another two 
> > > > pointers
> > > We discussed this in last thread:
> > > https://www.spinics.net/lists/netdev/msg535191.html
> > >
> > > It will cause closed sk to linger longer.
> > >
> > Yes, but we never really got resolution on that topic.  I don't see that a
>
> Fair point. We should have brought back the discussion online.
>
> > socket lingering for an extra grace period is that big a deal.  I also 
> > don't see
>
> What we really don't want is to bring back
> 8c98653f0553 ("sctp: sctp_close: fix release of bindings for deferred 
> call_rcu's").
> (more below). That's where our fear lies.
>
> > how sending the actual kfree through a grace period is going to cause the 
> > socket
> > to linger.  If you look at sctp_association_destroy, we call sock_put prior 
> > to
> > calling kfree at the end of the function.  All I'm looking for here is for 
> > the
> > memory free to wait until any list traversal in 
> > sctp_epaddr_lookup_transport is
> > done, which is what you are trying to do with your atomics.
> >
> > As for your comment regarding sctp_transport_destroy_rcu, yes, that f

Re: [PATCHv2 net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-30 Thread Xin Long
On Fri, Nov 30, 2018 at 9:21 PM Neil Horman  wrote:
>
> On Fri, Nov 30, 2018 at 03:22:39PM +0900, Xin Long wrote:
> > On Thu, Nov 29, 2018 at 11:39 PM Neil Horman  wrote:
> > >
> > > On Thu, Nov 29, 2018 at 02:42:56PM +0800, Xin Long wrote:
> > > > Now when using stream reconfig to add out streams, stream->out
> > > > will get re-allocated, and all old streams' information will
> > > > be copied to the new ones and the old ones will be freed.
> > > >
> > > > So without stream->out_curr updated, next time when trying to
> > > > send from stream->out_curr stream, a panic would be caused.
> > > >
> > > > This patch is to check and update stream->out_curr when
> > > > allocating stream_out.
> > > >
> > > > v1->v2:
> > > >   - define fa_index() to get elem index from stream->out_curr.
> > > >
> > > > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > > > Reported-by: Ying Xu 
> > > > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/stream.c | 20 
> > > >  1 file changed, 20 insertions(+)
> > > >
> > > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > > index 3892e76..30e7809 100644
> > > > --- a/net/sctp/stream.c
> > > > +++ b/net/sctp/stream.c
> > > > @@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t 
> > > > index, size_t count)
> > > >   }
> > > >  }
> > > >
> > > > +static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
> > > > +{
> > > > + size_t index = 0;
> > > > +
> > > > + while (count--) {
> > > > + if (elem == flex_array_get(fa, index))
> > > > + break;
> > > > + index++;
> > > > + }
> > > > +
> > > > + return index;
> > > > +}
> > > > +
> > > >  /* Migrates chunks from stream queues to new stream queues if needed,
> > > >   * but not across associations. Also, removes those chunks to streams
> > > >   * higher than the new max.
> > > > @@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct 
> > > > sctp_stream *stream, __u16 outcnt,
> > > >
> > > >   if (stream->out) {
> > > >   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
> > > > + if (stream->out_curr) {
> > > > + size_t index = fa_index(stream->out, 
> > > > stream->out_curr,
> > > > + stream->outcnt);
> > > > +
> > > > + BUG_ON(index == stream->outcnt);
> > > > + stream->out_curr = flex_array_get(out, index);
> > > > + }
> > > >   fa_free(stream->out);
> > > >   }
> > > >
> > > > --
> > > > 2.1.0
> > > >
> > > >
> > >
> > > This is the sort of thing I'm talking about. It's a little more code, but 
> > > if you
> > > augment the flex_array api like this, you can perform a resize operation 
> > > on your
> > > existing flex array, and you can avoid all the copying, and need to update
> > > pointers maintained outside the array.  Note this code isn't tested at 
> > > all, but
> > > its close to what I think should work.
> > >
> > >
> > > diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
> > > index b94fa61b51fb..7fa1f27a91b5 100644
> > > --- a/include/linux/flex_array.h
> > > +++ b/include/linux/flex_array.h
> > > @@ -73,6 +73,8 @@ struct flex_array {
> > >  struct flex_array *flex_array_alloc(int element_size, unsigned int total,
> > > gfp_t flags);
> > >
> > > +struct flex_array *flex_array_resize(struct flex_array *fa, unsigned int 
> > > total, gfp_t flags);
> > > +
> > >  /**
> > >   * flex_array_prealloc() - Ensures that memory for the elements indexed 
> > > in the
> > >   * range defined by start and nr_elements has been allocated.
> > > diff --git a/lib/flex_array.c b/lib/flex_array.c
> > >

Re: [PATCHv2 net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-29 Thread Xin Long
On Thu, Nov 29, 2018 at 11:39 PM Neil Horman  wrote:
>
> On Thu, Nov 29, 2018 at 02:42:56PM +0800, Xin Long wrote:
> > Now when using stream reconfig to add out streams, stream->out
> > will get re-allocated, and all old streams' information will
> > be copied to the new ones and the old ones will be freed.
> >
> > So without stream->out_curr updated, next time when trying to
> > send from stream->out_curr stream, a panic would be caused.
> >
> > This patch is to check and update stream->out_curr when
> > allocating stream_out.
> >
> > v1->v2:
> >   - define fa_index() to get elem index from stream->out_curr.
> >
> > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > Reported-by: Ying Xu 
> > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/stream.c | 20 
> >  1 file changed, 20 insertions(+)
> >
> > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > index 3892e76..30e7809 100644
> > --- a/net/sctp/stream.c
> > +++ b/net/sctp/stream.c
> > @@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t index, 
> > size_t count)
> >   }
> >  }
> >
> > +static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
> > +{
> > + size_t index = 0;
> > +
> > + while (count--) {
> > + if (elem == flex_array_get(fa, index))
> > + break;
> > + index++;
> > + }
> > +
> > + return index;
> > +}
> > +
> >  /* Migrates chunks from stream queues to new stream queues if needed,
> >   * but not across associations. Also, removes those chunks to streams
> >   * higher than the new max.
> > @@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct sctp_stream 
> > *stream, __u16 outcnt,
> >
> >   if (stream->out) {
> >   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
> > + if (stream->out_curr) {
> > + size_t index = fa_index(stream->out, stream->out_curr,
> > + stream->outcnt);
> > +
> > + BUG_ON(index == stream->outcnt);
> > + stream->out_curr = flex_array_get(out, index);
> > + }
> >   fa_free(stream->out);
> >   }
> >
> > --
> > 2.1.0
> >
> >
>
> This is the sort of thing I'm talking about. It's a little more code, but if 
> you
> augment the flex_array api like this, you can perform a resize operation on 
> your
> existing flex array, and you can avoid all the copying, and need to update
> pointers maintained outside the array.  Note this code isn't tested at all, 
> but
> its close to what I think should work.
>
>
> diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
> index b94fa61b51fb..7fa1f27a91b5 100644
> --- a/include/linux/flex_array.h
> +++ b/include/linux/flex_array.h
> @@ -73,6 +73,8 @@ struct flex_array {
>  struct flex_array *flex_array_alloc(int element_size, unsigned int total,
> gfp_t flags);
>
> +struct flex_array *flex_array_resize(struct flex_array *fa, unsigned int 
> total, gfp_t flags);
> +
>  /**
>   * flex_array_prealloc() - Ensures that memory for the elements indexed in 
> the
>   * range defined by start and nr_elements has been allocated.
> diff --git a/lib/flex_array.c b/lib/flex_array.c
> index 2eed22fa507c..f8d54af3891b 100644
> --- a/lib/flex_array.c
> +++ b/lib/flex_array.c
> @@ -109,6 +109,7 @@ struct flex_array *flex_array_alloc(int element_size, 
> unsigned int total,
> ret->total_nr_elements = total;
> ret->elems_per_part = elems_per_part;
> ret->reciprocal_elems = reciprocal_elems;
> +   ret->elements_used = 0;
> if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO))
> memset(&ret->parts[0], FLEX_ARRAY_FREE,
> FLEX_ARRAY_BASE_BYTES_LEFT);
> @@ -116,6 +117,53 @@ struct flex_array *flex_array_alloc(int element_size, 
> unsigned int total,
>  }
>  EXPORT_SYMBOL(flex_array_alloc);
>
> +static int flex_array_last_element_index(struct flex_array *fa)
> +{
> +   struct flex_array_part *part;
> +   int part_nr;
> +   int i,j;
> +
> +   if (elements_fit_in_base(fa)) {
> +   part = (struct flex_array_part *)&fa->parts[0];
> + 

Re: [PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-29 Thread Xin Long
On Fri, Nov 30, 2018 at 5:52 AM Neil Horman  wrote:
>
> On Thu, Nov 29, 2018 at 02:44:07PM +0800, Xin Long wrote:
> > Without holding transport to dereference its asoc, a use after
> > free panic can be caused in sctp_epaddr_lookup_transport. Note
> > that a sock lock can't protect these transports that belong to
> > other socks.
> >
> > A similar fix as Commit bab1be79a516 ("sctp: hold transport
> > before accessing its asoc in sctp_transport_get_next") is
> > needed to hold the transport before accessing its asoc in
> > sctp_epaddr_lookup_transport.
> >
> > Note that this extra atomic operation is on the datapath,
> > but as rhlist keeps the lists to a small size, it won't
> > cause a noticeable performance hit.
> >
> > v1->v2:
> >   - improve the changelog.
> >
> > Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
> > rhashtable")
> > Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/input.c | 10 --
> >  1 file changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > index 5c36a99..ce7351c 100644
> > --- a/net/sctp/input.c
> > +++ b/net/sctp/input.c
> > @@ -967,9 +967,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
> >   list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> >  sctp_hash_params);
> >
> > - rhl_for_each_entry_rcu(t, tmp, list, node)
> > - if (ep == t->asoc->ep)
> > + rhl_for_each_entry_rcu(t, tmp, list, node) {
> > + if (!sctp_transport_hold(t))
> > + continue;
> > + if (ep == t->asoc->ep) {
> > + sctp_transport_put(t);
> >   return t;
> > + }
> > + sctp_transport_put(t);
> > + }
> >
> >   return NULL;
> >  }
>
> Wait a second, what if we just added an rcu_head to the association structure
> and changed the kfree call in sctp_association_destroy to a kfree_rcu call
> instead?  That would force the actual freeing of the association to pass 
> through
> a grace period, during which any in flight list traversal in
> sctp_epaddr_lookup_transport could complete safely.  Its another two pointers
We discussed this in the last thread:
https://www.spinics.net/lists/netdev/msg535191.html

It will cause closed sk to linger longer.

> worth of space in the association, but I think that would be a worthwhile
> tradeoff for not having to do N atomic adds/puts every time you wanted to
> receive or send a frame.
N is not a big value, as rhlist itself keeps the lists to a small size.

>
> Neil
>
> > --
> > 2.1.0
> >
> >


Re: [PATCHv2 net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-29 Thread Xin Long
On Thu, Nov 29, 2018 at 9:50 PM Neil Horman  wrote:
>
> On Thu, Nov 29, 2018 at 02:42:56PM +0800, Xin Long wrote:
> > Now when using stream reconfig to add out streams, stream->out
> > will get re-allocated, and all old streams' information will
> > be copied to the new ones and the old ones will be freed.
> >
> > So without stream->out_curr updated, next time when trying to
> > send from stream->out_curr stream, a panic would be caused.
> >
> > This patch is to check and update stream->out_curr when
> > allocating stream_out.
> >
> > v1->v2:
> >   - define fa_index() to get elem index from stream->out_curr.
> >
> > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > Reported-by: Ying Xu 
> > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/stream.c | 20 
> >  1 file changed, 20 insertions(+)
> >
> > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > index 3892e76..30e7809 100644
> > --- a/net/sctp/stream.c
> > +++ b/net/sctp/stream.c
> > @@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t index, 
> > size_t count)
> >   }
> >  }
> >
> > +static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
> > +{
> > + size_t index = 0;
> > +
> > + while (count--) {
> > + if (elem == flex_array_get(fa, index))
> > + break;
> > + index++;
> > + }
> > +
> > + return index;
> > +}
> > +
> >  /* Migrates chunks from stream queues to new stream queues if needed,
> >   * but not across associations. Also, removes those chunks to streams
> >   * higher than the new max.
> > @@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct sctp_stream 
> > *stream, __u16 outcnt,
> >
> >   if (stream->out) {
> >   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
> > + if (stream->out_curr) {
> > + size_t index = fa_index(stream->out, stream->out_curr,
> > + stream->outcnt);
> > +
> > + BUG_ON(index == stream->outcnt);
> > + stream->out_curr = flex_array_get(out, index);
> > + }
> >   fa_free(stream->out);
> >   }
> >
> > --
> > 2.1.0
> >
> >
>
> I'm having a hard time understanding why, as I noted earlier, you don't just
> write a function in the flex_array code that can resize the number of elements
> in your array.  If you do that, you can avoid both all the copying, and the 
> need
> to lookup the in-use pointer again
I didn't want to touch the flex_array code, but you're right,
it would avoid both the copying and the lookup. I will
give it a try tomorrow in flex_array.c, thanks.

>
> Neil
>


[PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-28 Thread Xin Long
In sctp_hash_transport, it dereferences a transport's asoc only under
rcu_read_lock. Without holding the transport, its asoc could be freed
already, which leads to a use-after-free panic.

A similar fix as Commit bab1be79a516 ("sctp: hold transport before
accessing its asoc in sctp_transport_get_next") is needed to hold
the transport before accessing its asoc in sctp_hash_transport.

Note that as rhlist keeps the lists to a small size, this extra
atomic operation won't cause noticeable latency when inserting
a transport. Besides, it's not on the datapath.

v1->v2:
  - improve the changelog.

Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
transport")
Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index ce7351c..c2c0816 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
list = rhltable_lookup(&sctp_transport_hashtable, &arg,
   sctp_hash_params);
 
-   rhl_for_each_entry_rcu(transport, tmp, list, node)
+   rhl_for_each_entry_rcu(transport, tmp, list, node) {
+   if (!sctp_transport_hold(transport))
+   continue;
if (transport->asoc->ep == t->asoc->ep) {
+   sctp_transport_put(transport);
rcu_read_unlock();
return -EEXIST;
}
+   sctp_transport_put(transport);
+   }
rcu_read_unlock();
 
err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
-- 
2.1.0



[PATCHv2 net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-28 Thread Xin Long
Without holding transport to dereference its asoc, a use after
free panic can be caused in sctp_epaddr_lookup_transport. Note
that a sock lock can't protect these transports that belong to
other socks.

A similar fix as Commit bab1be79a516 ("sctp: hold transport
before accessing its asoc in sctp_transport_get_next") is
needed to hold the transport before accessing its asoc in
sctp_epaddr_lookup_transport.

Note that this extra atomic operation is on the datapath,
but as rhlist keeps the lists to a small size, it won't
cause a noticeable performance hit.

v1->v2:
  - improve the changelog.

Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
rhashtable")
Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99..ce7351c 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -967,9 +967,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
list = rhltable_lookup(&sctp_transport_hashtable, &arg,
   sctp_hash_params);
 
-   rhl_for_each_entry_rcu(t, tmp, list, node)
-   if (ep == t->asoc->ep)
+   rhl_for_each_entry_rcu(t, tmp, list, node) {
+   if (!sctp_transport_hold(t))
+   continue;
+   if (ep == t->asoc->ep) {
+   sctp_transport_put(t);
return t;
+   }
+   sctp_transport_put(t);
+   }
 
return NULL;
 }
-- 
2.1.0



[PATCHv2 net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-28 Thread Xin Long
Now when using stream reconfig to add out streams, stream->out
will get re-allocated, and all old streams' information will
be copied to the new ones and the old ones will be freed.

So without stream->out_curr updated, next time when trying to
send from stream->out_curr stream, a panic would be caused.

This patch is to check and update stream->out_curr when
allocating stream_out.

v1->v2:
  - define fa_index() to get elem index from stream->out_curr.

Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Ying Xu 
Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 3892e76..30e7809 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t index, 
size_t count)
}
 }
 
+static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
+{
+   size_t index = 0;
+
+   while (count--) {
+   if (elem == flex_array_get(fa, index))
+   break;
+   index++;
+   }
+
+   return index;
+}
+
 /* Migrates chunks from stream queues to new stream queues if needed,
  * but not across associations. Also, removes those chunks to streams
  * higher than the new max.
@@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct sctp_stream 
*stream, __u16 outcnt,
 
if (stream->out) {
fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
+   if (stream->out_curr) {
+   size_t index = fa_index(stream->out, stream->out_curr,
+   stream->outcnt);
+
+   BUG_ON(index == stream->outcnt);
+   stream->out_curr = flex_array_get(out, index);
+   }
fa_free(stream->out);
}
 
-- 
2.1.0



Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-28 Thread Xin Long
On Thu, Nov 22, 2018 at 2:53 AM Marcelo Ricardo Leitner
 wrote:
>
> On Wed, Nov 21, 2018 at 03:47:33PM +0900, Xin Long wrote:
> > On Wed, Nov 21, 2018 at 9:46 AM Marcelo Ricardo Leitner
> >  wrote:
> > >
> > > On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> > > > On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > > > > In sctp_hash_transport, it dereferences a transport's asoc only under
> > > > > rcu_read_lock. Without holding the transport, its asoc could be freed
> > > > > already, which leads to a use-after-free panic.
> > > > >
> > > > > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > > > > accessing its asoc in sctp_transport_get_next") is needed to hold
> > > > > the transport before accessing its asoc in sctp_hash_transport.
> > > > >
> > > > > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a 
> > > > > new transport")
> > > > > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > > > > Signed-off-by: Xin Long 
> > > > > ---
> > > > >  net/sctp/input.c | 7 ++-
> > > > >  1 file changed, 6 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > > > index 5c36a99..69584e9 100644
> > > > > --- a/net/sctp/input.c
> > > > > +++ b/net/sctp/input.c
> > > > > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport 
> > > > > *t)
> > > > > list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> > > > >sctp_hash_params);
> > > > >
> > > > > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > > > > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > > > > +   if (!sctp_transport_hold(transport))
> > > > > +   continue;
> > > > > if (transport->asoc->ep == t->asoc->ep) {
> > > > > +   sctp_transport_put(transport);
> > > > > rcu_read_unlock();
> > > > > return -EEXIST;
> > > > > }
> > > > > +   sctp_transport_put(transport);
> > > > > +   }
> > > > > rcu_read_unlock();
> > > > >
> > > > > err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> > > > > --
> > > > > 2.1.0
> > > > >
> > > > >
> > > >
> > > > something doesn't feel at all right about this.  If we are inserting a 
> > > > transport
> > > > to an association, it would seem to me that we should have at least one 
> > > > user of
> > > > the association (i.e. non-zero refcount).  As such it seems something 
> > > > is wrong
> > > > with the association refcount here.  At the very least, if there is a 
> > > > case where
> > > > an association is being removed while a transport is being added, the 
> > > > better
> > > > solution would be to ensure that sctp_association_destroy goes through a
> > > > quiescent point prior to unhashing transports from the list, to ensure 
> > > > that
> > > > there is no conflict with the add operation above.
> > Changing to do call_rcu(&asoc->rcu, sctp_association_destroy) can
> > work for this case.
> > But it means asoc and socket (taking the port) will have to wait for a
> > grace period, which is not expected. We seemed to have talked about
> > this before, Marcelo?
>
> Yes. This would cause it to linger longer and cause bind conflicts
> meanwhile.
> Note that we already have sctp_transport_destroy_rcu(), so this would
> be a 2nd grace period.
>
> >
> > >
> > > Consider that the rhl_for_each_entry_rcu() is traversing the global
> > > rhashtable, and that it may operate on unrelated transports/asocs.
> > > E.g., transport->asoc in the for() is potentially different from the
> > > asoc under socket lock.
> > >
> > > The core of the fix is at:
> > > +   if (!sctp_transport_hold(transport))
> > > +   continue;
> > > If we can get a hold, the asoc will be available for dereferencing in
> > > subsequent lines. Otherwise, move on.
> > >
>

[PATCHv2 net] sctp: update frag_point when stream_interleave is set

2018-11-27 Thread Xin Long
sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
changes, but we missed one place in sctp_association_init(). This would
cause frag_point to be zero when sending data.

As shown in Jakub's reproducer, if sp->pathmtu is set by socketopt, the
new asoc->pathmtu inherits it in sctp_association_init(). Later when
transports are added and their pmtu >= asoc->pathmtu, it will never
call sctp_assoc_update_frag_point() to set frag_point.

This patch is to fix it by updating frag_point after asoc->pathmtu is
set as sp->pathmtu in sctp_association_init(). Note that it moved them
after sctp_stream_init(), as stream->si needs to be set first.

The frag_point calculation also depends on the data chunk type, so
frag_point needs to be updated when stream->si may be changed in
sctp_process_init().

v1->v2:
  - call sctp_assoc_update_frag_point() separately in sctp_process_init
and sctp_association_init, per Marcelo's suggestion.

Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
Reported-by: Jakub Audykowicz 
Signed-off-by: Xin Long 
---
 net/sctp/associola.c | 7 ---
 net/sctp/sm_make_chunk.c | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96..dd77ec3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -118,9 +118,6 @@ static struct sctp_association *sctp_association_init(
asoc->flowlabel = sp->flowlabel;
asoc->dscp = sp->dscp;
 
-   /* Initialize default path MTU. */
-   asoc->pathmtu = sp->pathmtu;
-
/* Set association default SACK delay */
asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
asoc->sackfreq = sp->sackfreq;
@@ -252,6 +249,10 @@ static struct sctp_association *sctp_association_init(
 0, gfp))
goto fail_init;
 
+   /* Initialize default path MTU. */
+   asoc->pathmtu = sp->pathmtu;
+   sctp_assoc_update_frag_point(asoc);
+
/* Assume that peer would support both address types unless we are
 * told otherwise.
 */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4a4fd19..f4ac6c5 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2462,6 +2462,9 @@ int sctp_process_init(struct sctp_association *asoc, 
struct sctp_chunk *chunk,
 asoc->c.sinit_max_instreams, gfp))
goto clean_up;
 
+   /* Update frag_point when stream_interleave may get changed. */
+   sctp_assoc_update_frag_point(asoc);
+
if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
goto clean_up;
 
-- 
2.1.0



Re: [PATCH net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-27 Thread Xin Long
On Tue, Nov 27, 2018 at 7:30 PM Xin Long  wrote:
>
> On Mon, Nov 26, 2018 at 10:59 PM Neil Horman  wrote:
> >
> > On Mon, Nov 26, 2018 at 10:46:33PM +0900, Xin Long wrote:
> > > On Mon, Nov 26, 2018 at 9:54 PM Neil Horman  wrote:
> > > >
> > > > On Mon, Nov 26, 2018 at 07:22:05PM +0800, Xin Long wrote:
> > > > > Now when using stream reconfig to add out streams, stream->out
> > > > > will get re-allocated, and all old streams' information will
> > > > > be copied to the new ones and the old ones will be freed.
> > > > >
> > > > > So without stream->out_curr updated, next time when trying to
> > > > > send from stream->out_curr stream, a panic would be caused.
> > > > >
> > > > > This patch is to define sctp_stream_out_copy used to update the
> > > > > stream->out_curr pointer to the new stream when copying the old
> > > > > streams' information.
> > > > >
> > > > > While at it, rename fa_copy to sctp_stream_in_copy.
> > > > >
> > > > > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > > > > Reported-by: Ying Xu 
> > > > > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > > > > Signed-off-by: Xin Long 
> > > > > ---
> > > > >  net/sctp/stream.c | 46 --
> > > > >  1 file changed, 32 insertions(+), 14 deletions(-)
> > > > >
> > > > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > > > index 3892e76..0687eeb 100644
> > > > > --- a/net/sctp/stream.c
> > > > > +++ b/net/sctp/stream.c
> > > > > @@ -61,18 +61,6 @@ static void fa_free(struct flex_array *fa)
> > > > >   flex_array_free(fa);
> > > > >  }
> > > > >
> > > > > -static void fa_copy(struct flex_array *fa, struct flex_array *from,
> > > > > - size_t index, size_t count)
> > > > > -{
> > > > > - void *elem;
> > > > > -
> > > > > - while (count--) {
> > > > > - elem = flex_array_get(from, index);
> > > > > - flex_array_put(fa, index, elem, 0);
> > > > > - index++;
> > > > > - }
> > > > > -}
> > > > > -
> > > > >  static void fa_zero(struct flex_array *fa, size_t index, size_t 
> > > > > count)
> > > > >  {
> > > > >   void *elem;
> > > > > @@ -135,6 +123,36 @@ static void sctp_stream_outq_migrate(struct 
> > > > > sctp_stream *stream,
> > > > >   kfree(SCTP_SO(stream, i)->ext);
> > > > >  }
> > > > >
> > > > > +static void sctp_stream_in_copy(struct flex_array *fa,
> > > > > + struct sctp_stream *stream, __u16 count)
> > > > > +{
> > > > > + size_t index = 0;
> > > > > + void *elem;
> > > > > +
> > > > > + count = min(count, stream->incnt);
> > > > > + while (count--) {
> > > > > + elem = flex_array_get(stream->in, index);
> > > > > + flex_array_put(fa, index, elem, 0);
> > > > > + index++;
> > > > > + }
> > > > > +}
> > > > > +
> > > > > +static void sctp_stream_out_copy(struct flex_array *fa,
> > > > > +  struct sctp_stream *stream, __u16 
> > > > > count)
> > > > > +{
> > > > > + size_t index = 0;
> > > > > + void *elem;
> > > > > +
> > > > > + count = min(count, stream->outcnt);
> > > > > + while (count--) {
> > > > > + elem = flex_array_get(stream->out, index);
> > > > > + flex_array_put(fa, index, elem, 0);
> > > > > + if (stream->out_curr == elem)
> > > > > + stream->out_curr = flex_array_get(fa, index);
> > > > > + index++;
> > > > > + }
> > > > > +}
> > > > > +
> > > > Seems like you are duplicating code here.  I think you would be better 
> >

Re: [PATCH net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-27 Thread Xin Long
On Mon, Nov 26, 2018 at 10:59 PM Neil Horman  wrote:
>
> On Mon, Nov 26, 2018 at 10:46:33PM +0900, Xin Long wrote:
> > On Mon, Nov 26, 2018 at 9:54 PM Neil Horman  wrote:
> > >
> > > On Mon, Nov 26, 2018 at 07:22:05PM +0800, Xin Long wrote:
> > > > Now when using stream reconfig to add out streams, stream->out
> > > > will get re-allocated, and all old streams' information will
> > > > be copied to the new ones and the old ones will be freed.
> > > >
> > > > So without stream->out_curr updated, next time when trying to
> > > > send from stream->out_curr stream, a panic would be caused.
> > > >
> > > > This patch is to define sctp_stream_out_copy used to update the
> > > > stream->out_curr pointer to the new stream when copying the old
> > > > streams' information.
> > > >
> > > > While at it, rename fa_copy to sctp_stream_in_copy.
> > > >
> > > > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > > > Reported-by: Ying Xu 
> > > > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/stream.c | 46 --
> > > >  1 file changed, 32 insertions(+), 14 deletions(-)
> > > >
> > > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > > index 3892e76..0687eeb 100644
> > > > --- a/net/sctp/stream.c
> > > > +++ b/net/sctp/stream.c
> > > > @@ -61,18 +61,6 @@ static void fa_free(struct flex_array *fa)
> > > >   flex_array_free(fa);
> > > >  }
> > > >
> > > > -static void fa_copy(struct flex_array *fa, struct flex_array *from,
> > > > - size_t index, size_t count)
> > > > -{
> > > > - void *elem;
> > > > -
> > > > - while (count--) {
> > > > - elem = flex_array_get(from, index);
> > > > - flex_array_put(fa, index, elem, 0);
> > > > - index++;
> > > > - }
> > > > -}
> > > > -
> > > >  static void fa_zero(struct flex_array *fa, size_t index, size_t count)
> > > >  {
> > > >   void *elem;
> > > > @@ -135,6 +123,36 @@ static void sctp_stream_outq_migrate(struct 
> > > > sctp_stream *stream,
> > > >   kfree(SCTP_SO(stream, i)->ext);
> > > >  }
> > > >
> > > > +static void sctp_stream_in_copy(struct flex_array *fa,
> > > > + struct sctp_stream *stream, __u16 count)
> > > > +{
> > > > + size_t index = 0;
> > > > + void *elem;
> > > > +
> > > > + count = min(count, stream->incnt);
> > > > + while (count--) {
> > > > + elem = flex_array_get(stream->in, index);
> > > > + flex_array_put(fa, index, elem, 0);
> > > > + index++;
> > > > + }
> > > > +}
> > > > +
> > > > +static void sctp_stream_out_copy(struct flex_array *fa,
> > > > +  struct sctp_stream *stream, __u16 count)
> > > > +{
> > > > + size_t index = 0;
> > > > + void *elem;
> > > > +
> > > > + count = min(count, stream->outcnt);
> > > > + while (count--) {
> > > > + elem = flex_array_get(stream->out, index);
> > > > + flex_array_put(fa, index, elem, 0);
> > > > + if (stream->out_curr == elem)
> > > > + stream->out_curr = flex_array_get(fa, index);
> > > > + index++;
> > > > + }
> > > > +}
> > > > +
> > > Seems like you are duplicating code here.  I think you would be better off
> > > moving the fa_copy routine to the flex_array api (perhaps renaming it
> > > flex_array_copy), and then coding sctp_stream_*_copy as static inlines 
> > > that just
> > > call the flex_array api to do the copy.  As for setting the out_curr 
> > > pointer,
> > > perhaps you should convert that to an index, so it can be looked up on 
> > > demand,
> > changing to use index only for this  may not worth it.
> > there is no API from flex_array to convert element to index either

Re: [PATCH net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-26 Thread Xin Long
On Mon, Nov 26, 2018 at 9:54 PM Neil Horman  wrote:
>
> On Mon, Nov 26, 2018 at 07:22:05PM +0800, Xin Long wrote:
> > Now when using stream reconfig to add out streams, stream->out
> > will get re-allocated, and all old streams' information will
> > be copied to the new ones and the old ones will be freed.
> >
> > So without stream->out_curr updated, next time when trying to
> > send from stream->out_curr stream, a panic would be caused.
> >
> > This patch is to define sctp_stream_out_copy used to update the
> > stream->out_curr pointer to the new stream when copying the old
> > streams' information.
> >
> > While at it, rename fa_copy to sctp_stream_in_copy.
> >
> > Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
> > Reported-by: Ying Xu 
> > Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/stream.c | 46 --
> >  1 file changed, 32 insertions(+), 14 deletions(-)
> >
> > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > index 3892e76..0687eeb 100644
> > --- a/net/sctp/stream.c
> > +++ b/net/sctp/stream.c
> > @@ -61,18 +61,6 @@ static void fa_free(struct flex_array *fa)
> >   flex_array_free(fa);
> >  }
> >
> > -static void fa_copy(struct flex_array *fa, struct flex_array *from,
> > - size_t index, size_t count)
> > -{
> > - void *elem;
> > -
> > - while (count--) {
> > - elem = flex_array_get(from, index);
> > - flex_array_put(fa, index, elem, 0);
> > - index++;
> > - }
> > -}
> > -
> >  static void fa_zero(struct flex_array *fa, size_t index, size_t count)
> >  {
> >   void *elem;
> > @@ -135,6 +123,36 @@ static void sctp_stream_outq_migrate(struct 
> > sctp_stream *stream,
> >   kfree(SCTP_SO(stream, i)->ext);
> >  }
> >
> > +static void sctp_stream_in_copy(struct flex_array *fa,
> > + struct sctp_stream *stream, __u16 count)
> > +{
> > + size_t index = 0;
> > + void *elem;
> > +
> > + count = min(count, stream->incnt);
> > + while (count--) {
> > + elem = flex_array_get(stream->in, index);
> > + flex_array_put(fa, index, elem, 0);
> > + index++;
> > + }
> > +}
> > +
> > +static void sctp_stream_out_copy(struct flex_array *fa,
> > +  struct sctp_stream *stream, __u16 count)
> > +{
> > + size_t index = 0;
> > + void *elem;
> > +
> > + count = min(count, stream->outcnt);
> > + while (count--) {
> > + elem = flex_array_get(stream->out, index);
> > + flex_array_put(fa, index, elem, 0);
> > + if (stream->out_curr == elem)
> > + stream->out_curr = flex_array_get(fa, index);
> > + index++;
> > + }
> > +}
> > +
> Seems like you are duplicating code here.  I think you would be better off
> moving the fa_copy routine to the flex_array api (perhaps renaming it
> flex_array_copy), and then coding sctp_stream_*_copy as static inlines that 
> just
> call the flex_array api to do the copy.  As for setting the out_curr pointer,
> perhaps you should convert that to an index, so it can be looked up on demand,
Changing to use an index only for this may not be worth it.
There is no API in flex_array to convert an element to an index.
The index is also the stream_id, but we didn't save it into stream_out
either.

> so that it doesn't have to be updated here at all, or alternatively, just set 
> it
> back to NULL here so that the selected scheduler will be forced to do the next
> lookup.
We can't set it back to NULL. Otherwise, the scheduler may go on to
send another msg while the last msg (with multiple chunks) is not yet sent
out completely, which is not allowed when it's not an I-DATA chunk.

This is not much duplication, and it can save a few params.
I'm actually ok with this.

>
> Neil
>
> >  static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
> >gfp_t gfp)
> >  {
> > @@ -146,7 +164,7 @@ static int sctp_stream_alloc_out(struct sctp_stream 
> > *stream, __u16 outcnt,
> >   return -ENOMEM;
> >
> >   if (stream->out) {
> > - fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
> > + sctp_stream_out_copy(out, stream, outcnt);
> >   fa_free(stream->out);
> >   }
> >
> > @@ -169,7 +187,7 @@ static int sctp_stream_alloc_in(struct sctp_stream 
> > *stream, __u16 incnt,
> >   return -ENOMEM;
> >
> >   if (stream->in) {
> > - fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
> > + sctp_stream_in_copy(in, stream, incnt);
> >   fa_free(stream->in);
> >   }
> >
> > --
> > 2.1.0
> >
> >


Re: [PATCH net] sctp: update frag_point when stream_interleave is set

2018-11-26 Thread Xin Long
On Mon, Nov 26, 2018 at 9:29 PM Marcelo Ricardo Leitner
 wrote:
>
> On Mon, Nov 26, 2018 at 05:02:11PM +0800, Xin Long wrote:
> > sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
> > changes, but we missed one place in sctp_association_init(). This would
> > cause frag_point to be zero when sending data.
> >
> > As shown in Jakub's reproducer, if sp->pathmtu is set by socketopt, the
> > new asoc->pathmtu inherits it in sctp_association_init(). Later when
> > transports are added and their pmtu >= asoc->pathmtu, it will never
> > call sctp_assoc_update_frag_point() to set frag_point.
> >
> > This patch is to fix it by updating frag_point when stream_interleave
> > is set in sctp_stream_interleave_init(), which is also called in
> > sctp_association_init(). We're doing this also because frag_point
> > is affected by datachunk's type, namely stream_interleave_0/1.
> >
> > Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> > Reported-by: Jakub Audykowicz 
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/stream_interleave.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
> > index 0a78cdf..19d596d 100644
> > --- a/net/sctp/stream_interleave.c
> > +++ b/net/sctp/stream_interleave.c
> > @@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct sctp_stream 
> > *stream)
> >   asoc = container_of(stream, struct sctp_association, stream);
> >   stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
> >  : &sctp_stream_interleave_0;
> > + sctp_assoc_update_frag_point(asoc);
>
> I get that by adding it here we avoid adding it twice, one in
> sctp_association_init and another in sctp_process_init, but here it is
> out of context.
>
> The decision on data chunk format is not made on this function but
> higher in the stack and we can leverage that for sctp_process_init,
> and for sctp_association_init, we should have it as close as possible
> to where it initialized pathmtu and did not update the frag point.
Okay, but both have to be after sctp_stream_init(),
even though we want sctp_assoc_update_frag_point()
called right after "asoc->pathmtu = sp->pathmtu;".

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a827a1f..a614937 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -252,6 +252,8 @@ static struct sctp_association *sctp_association_init(
 0, gfp))
goto fail_init;

+   sctp_assoc_update_frag_point(asoc);
+
/* Assume that peer would support both address types unless we are
 * told otherwise.
 */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4a4fd19..600ca0d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2462,6 +2462,8 @@ int sctp_process_init(struct sctp_association
*asoc, struct sctp_chunk *chunk,
 asoc->c.sinit_max_instreams, gfp))
goto clean_up;

+   sctp_assoc_update_frag_point(asoc);
+
if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
goto clean_up;

>
> >  }
> > --
> > 2.1.0
> >


Re: [PATCH net] sctp: increase sk_wmem_alloc when head->truesize is increased

2018-11-26 Thread Xin Long
On Mon, Nov 26, 2018 at 9:27 PM Neil Horman  wrote:
>
> On Mon, Nov 26, 2018 at 02:52:44PM +0800, Xin Long wrote:
> > I changed to count sk_wmem_alloc by skb truesize instead of 1 to
> > fix the sk_wmem_alloc leak caused by later truesize's change in
> > xfrm in Commit 02968ccf0125 ("sctp: count sk_wmem_alloc by skb
> > truesize in sctp_packet_transmit").
> >
> > But I should have also increased sk_wmem_alloc when head->truesize
> > is increased in sctp_packet_gso_append() as xfrm does. Otherwise,
> > sctp gso packet will cause sk_wmem_alloc underflow.
> >
> > Fixes: 02968ccf0125 ("sctp: count sk_wmem_alloc by skb truesize in 
> > sctp_packet_transmit")
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/output.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/net/sctp/output.c b/net/sctp/output.c
> > index b0e74a3..025f48e 100644
> > --- a/net/sctp/output.c
> > +++ b/net/sctp/output.c
> > @@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff 
> > *head, struct sk_buff *skb)
> >   head->truesize += skb->truesize;
> >   head->data_len += skb->len;
> >   head->len += skb->len;
> > > + refcount_add(skb->truesize, &head->sk->sk_wmem_alloc);
> >
> >   __skb_header_release(skb);
> >  }
> This looks to me like you are now double counting every packet that passes
> through sctp_packet_transmit, once in skb_set_owner_w and again in
> sctp_packet_pack=>sctp_packet_gso_append
For gso packet, the skb(head) in sctp_packet_transmit/skb_set_owner_w
is the head_skb, the skbs here are the fragments.

For non-gso packet, it will never come here.

>
> Neil
>
> > --
> > 2.1.0
> >
> >


[PATCH net] sctp: check and update stream->out_curr when allocating stream_out

2018-11-26 Thread Xin Long
Now when using stream reconfig to add out streams, stream->out
will get re-allocated, and all old streams' information will
be copied to the new ones and the old ones will be freed.

So if stream->out_curr is not updated, it keeps pointing at the freed
old stream, and the next attempt to send from it would cause a panic.

This patch is to define sctp_stream_out_copy used to update the
stream->out_curr pointer to the new stream when copying the old
streams' information.

While at it, rename fa_copy to sctp_stream_in_copy.

Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Ying Xu 
Reported-by: syzbot+e33a3a138267ca119...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 46 --
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 3892e76..0687eeb 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -61,18 +61,6 @@ static void fa_free(struct flex_array *fa)
flex_array_free(fa);
 }
 
-static void fa_copy(struct flex_array *fa, struct flex_array *from,
-   size_t index, size_t count)
-{
-   void *elem;
-
-   while (count--) {
-   elem = flex_array_get(from, index);
-   flex_array_put(fa, index, elem, 0);
-   index++;
-   }
-}
-
 static void fa_zero(struct flex_array *fa, size_t index, size_t count)
 {
void *elem;
@@ -135,6 +123,36 @@ static void sctp_stream_outq_migrate(struct sctp_stream 
*stream,
kfree(SCTP_SO(stream, i)->ext);
 }
 
+static void sctp_stream_in_copy(struct flex_array *fa,
+   struct sctp_stream *stream, __u16 count)
+{
+   size_t index = 0;
+   void *elem;
+
+   count = min(count, stream->incnt);
+   while (count--) {
+   elem = flex_array_get(stream->in, index);
+   flex_array_put(fa, index, elem, 0);
+   index++;
+   }
+}
+
+static void sctp_stream_out_copy(struct flex_array *fa,
+struct sctp_stream *stream, __u16 count)
+{
+   size_t index = 0;
+   void *elem;
+
+   count = min(count, stream->outcnt);
+   while (count--) {
+   elem = flex_array_get(stream->out, index);
+   flex_array_put(fa, index, elem, 0);
+   if (stream->out_curr == elem)
+   stream->out_curr = flex_array_get(fa, index);
+   index++;
+   }
+}
+
 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 gfp_t gfp)
 {
@@ -146,7 +164,7 @@ static int sctp_stream_alloc_out(struct sctp_stream 
*stream, __u16 outcnt,
return -ENOMEM;
 
if (stream->out) {
-   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
+   sctp_stream_out_copy(out, stream, outcnt);
fa_free(stream->out);
}
 
@@ -169,7 +187,7 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, 
__u16 incnt,
return -ENOMEM;
 
if (stream->in) {
-   fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
+   sctp_stream_in_copy(in, stream, incnt);
fa_free(stream->in);
}
 
-- 
2.1.0



[PATCH net] sctp: update frag_point when stream_interleave is set

2018-11-26 Thread Xin Long
sctp_assoc_update_frag_point() should be called whenever asoc->pathmtu
changes, but we missed one place in sctp_association_init(). This would
leave frag_point at zero when sending data.

As shown in Jakub's reproducer, if sp->pathmtu is set by socketopt, the
new asoc->pathmtu inherits it in sctp_association_init(). Later when
transports are added and their pmtu >= asoc->pathmtu, it will never
call sctp_assoc_update_frag_point() to set frag_point.

This patch is to fix it by updating frag_point when stream_interleave
is set in sctp_stream_interleave_init(), which is also called in
sctp_association_init(). We're doing this also because frag_point
is affected by datachunk's type, namely stream_interleave_0/1.

Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
Reported-by: Jakub Audykowicz 
Signed-off-by: Xin Long 
---
 net/sctp/stream_interleave.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 0a78cdf..19d596d 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct sctp_stream 
*stream)
asoc = container_of(stream, struct sctp_association, stream);
        stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
                                       : &sctp_stream_interleave_0;
+   sctp_assoc_update_frag_point(asoc);
 }
-- 
2.1.0



[PATCH net] sctp: increase sk_wmem_alloc when head->truesize is increased

2018-11-25 Thread Xin Long
I changed to count sk_wmem_alloc by skb truesize instead of 1 to
fix the sk_wmem_alloc leak caused by later truesize's change in
xfrm in Commit 02968ccf0125 ("sctp: count sk_wmem_alloc by skb
truesize in sctp_packet_transmit").

But I should have also increased sk_wmem_alloc when head->truesize
is increased in sctp_packet_gso_append() as xfrm does. Otherwise,
sctp gso packet will cause sk_wmem_alloc underflow.

Fixes: 02968ccf0125 ("sctp: count sk_wmem_alloc by skb truesize in 
sctp_packet_transmit")
Signed-off-by: Xin Long 
---
 net/sctp/output.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index b0e74a3..025f48e 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, 
struct sk_buff *skb)
head->truesize += skb->truesize;
head->data_len += skb->len;
head->len += skb->len;
+   refcount_add(skb->truesize, &head->sk->sk_wmem_alloc);
 
__skb_header_release(skb);
 }
-- 
2.1.0



Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-20 Thread Xin Long
On Wed, Nov 21, 2018 at 9:46 AM Marcelo Ricardo Leitner
 wrote:
>
> On Tue, Nov 20, 2018 at 07:52:48AM -0500, Neil Horman wrote:
> > On Tue, Nov 20, 2018 at 07:09:16PM +0800, Xin Long wrote:
> > > In sctp_hash_transport, it dereferences a transport's asoc only under
> > > rcu_read_lock. Without holding the transport, its asoc could be freed
> > > already, which leads to a use-after-free panic.
> > >
> > > A similar fix as Commit bab1be79a516 ("sctp: hold transport before
> > > accessing its asoc in sctp_transport_get_next") is needed to hold
> > > the transport before accessing its asoc in sctp_hash_transport.
> > >
> > > Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
> > > transport")
> > > Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
> > > Signed-off-by: Xin Long 
> > > ---
> > >  net/sctp/input.c | 7 ++-
> > >  1 file changed, 6 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > > index 5c36a99..69584e9 100644
> > > --- a/net/sctp/input.c
> > > +++ b/net/sctp/input.c
> > > @@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
> > > list = rhltable_lookup(_transport_hashtable, ,
> > >sctp_hash_params);
> > >
> > > -   rhl_for_each_entry_rcu(transport, tmp, list, node)
> > > +   rhl_for_each_entry_rcu(transport, tmp, list, node) {
> > > +   if (!sctp_transport_hold(transport))
> > > +   continue;
> > > if (transport->asoc->ep == t->asoc->ep) {
> > > +   sctp_transport_put(transport);
> > > rcu_read_unlock();
> > > return -EEXIST;
> > > }
> > > +   sctp_transport_put(transport);
> > > +   }
> > > rcu_read_unlock();
> > >
> > > err = rhltable_insert_key(_transport_hashtable, ,
> > > --
> > > 2.1.0
> > >
> > >
> >
> > something doesn't feel at all right about this.  If we are inserting a 
> > transport
> > to an association, it would seem to me that we should have at least one 
> > user of
> > the association (i.e. non-zero refcount).  As such it seems something is 
> > wrong
> > with the association refcount here.  At the very least, if there is a case 
> > where
> > an association is being removed while a transport is being added, the better
> > solution would be to ensure that sctp_association_destroy goes through a
> > quiescent point prior to unhashing transports from the list, to ensure that
> > there is no conflict with the add operation above.
Changing to do call_rcu(&asoc->rcu, sctp_association_destroy) can
work for this case.
But it means asoc and socket (taking the port) will have to wait for a
grace period, which is not expected. We seemed to have talked about
this before, Marcelo?

>
> Consider that the rhl_for_each_entry_rcu() is traversing the global
> rhashtable, and that it may operate on unrelated transports/asocs.
> E.g., transport->asoc in the for() is potentially different from the
> asoc under socket lock.
>
> The core of the fix is at:
> +   if (!sctp_transport_hold(transport))
> +   continue;
> If we can get a hold, the asoc will be available for dereferencing in
> subsequent lines. Otherwise, move on.
>
> With that, the patch makes sense to me.
>
> Although I would prefer if we come up with a better way to do this
> jump, or even avoid the jump. We are only comparing pointers here and,
> if we had asoc->ep cached on sctp_transport itself, we could avoid the
> atomics here.
Right, but it's another u64.

>
> This change, in the next patch on sctp_epaddr_lookup_transport, will
> hurt performance as that is called in datapath. Rhashtable will help
> on keeping entry lists to a size, but still.
This loop is normally not long; would just a few atomic operations
hurt performance noticeably?


[PATCH net] sctp: hold transport before accessing its asoc in sctp_epaddr_lookup_transport

2018-11-20 Thread Xin Long
Without holding transport to dereference its asoc, a use after
free panic can be caused in sctp_epaddr_lookup_transport. Note
that a sock lock can't protect these transports that belong to
other socks.

A similar fix as Commit bab1be79a516 ("sctp: hold transport
before accessing its asoc in sctp_transport_get_next") is
needed to hold the transport before accessing its asoc in
sctp_epaddr_lookup_transport.

Fixes: 7fda702f9315 ("sctp: use new rhlist interface on sctp transport 
rhashtable")
Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 69584e9..c2c0816 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -972,9 +972,15 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
   sctp_hash_params);
 
-   rhl_for_each_entry_rcu(t, tmp, list, node)
-   if (ep == t->asoc->ep)
+   rhl_for_each_entry_rcu(t, tmp, list, node) {
+   if (!sctp_transport_hold(t))
+   continue;
+   if (ep == t->asoc->ep) {
+   sctp_transport_put(t);
return t;
+   }
+   sctp_transport_put(t);
+   }
 
return NULL;
 }
-- 
2.1.0



[PATCH net] sctp: hold transport before accessing its asoc in sctp_hash_transport

2018-11-20 Thread Xin Long
In sctp_hash_transport, it dereferences a transport's asoc only under
rcu_read_lock. Without holding the transport, its asoc could be freed
already, which leads to a use-after-free panic.

A similar fix as Commit bab1be79a516 ("sctp: hold transport before
accessing its asoc in sctp_transport_get_next") is needed to hold
the transport before accessing its asoc in sctp_hash_transport.

Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new 
transport")
Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99..69584e9 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -896,11 +896,16 @@ int sctp_hash_transport(struct sctp_transport *t)
        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
   sctp_hash_params);
 
-   rhl_for_each_entry_rcu(transport, tmp, list, node)
+   rhl_for_each_entry_rcu(transport, tmp, list, node) {
+   if (!sctp_transport_hold(transport))
+   continue;
if (transport->asoc->ep == t->asoc->ep) {
+   sctp_transport_put(transport);
rcu_read_unlock();
return -EEXIST;
}
+   sctp_transport_put(transport);
+   }
rcu_read_unlock();
 
        err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
-- 
2.1.0



Re: KASAN: use-after-free Read in __lock_sock

2018-11-19 Thread Xin Long
On Sat, Nov 17, 2018 at 4:18 PM syzbot
 wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:ccda4af0f4b9 Linux 4.20-rc2
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=156cd53340
> kernel config:  https://syzkaller.appspot.com/x/.config?x=4a0a89f12ca9b0f5
> dashboard link: https://syzkaller.appspot.com/bug?extid=9276d76e83e3bcde6c99
> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+9276d76e83e3bcde6...@syzkaller.appspotmail.com
>
> netlink: 5 bytes leftover after parsing attributes in process
> `syz-executor5'.
> ==
> BUG: KASAN: use-after-free in __lock_acquire+0x36d9/0x4c20
> kernel/locking/lockdep.c:3218
> Read of size 8 at addr 8881d26d60e0 by task syz-executor1/13725
>
> CPU: 0 PID: 13725 Comm: syz-executor1 Not tainted 4.20.0-rc2+ #333
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x244/0x39d lib/dump_stack.c:113
>   print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
>   kasan_report_error mm/kasan/report.c:354 [inline]
>   kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
>   __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
>   __lock_acquire+0x36d9/0x4c20 kernel/locking/lockdep.c:3218
>   lock_acquire+0x1ed/0x520 kernel/locking/lockdep.c:3844
>   __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
>   _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
>   spin_lock_bh include/linux/spinlock.h:334 [inline]
>   __lock_sock+0x203/0x350 net/core/sock.c:2253
>   lock_sock_nested+0xfe/0x120 net/core/sock.c:2774
>   lock_sock include/net/sock.h:1492 [inline]
>   sctp_sock_dump+0x122/0xb20 net/sctp/diag.c:324

static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
{
struct sctp_endpoint *ep = tsp->asoc->ep;
struct sctp_comm_param *commp = p;
struct sock *sk = ep->base.sk; <--- [1]
...
int err = 0;

lock_sock(sk);  <--- [2]

Between [1] and [2], an asoc peeloff may happen; I'm still thinking
about how to avoid this.



>   sctp_for_each_transport+0x2b5/0x370 net/sctp/socket.c:5091
>   sctp_diag_dump+0x3ac/0x660 net/sctp/diag.c:527
>   __inet_diag_dump+0xa8/0x140 net/ipv4/inet_diag.c:1049
>   inet_diag_dump+0x9b/0x110 net/ipv4/inet_diag.c:1065
>   netlink_dump+0x606/0x1080 net/netlink/af_netlink.c:2244
>   __netlink_dump_start+0x59a/0x7c0 net/netlink/af_netlink.c:2352
>   netlink_dump_start include/linux/netlink.h:216 [inline]
>   inet_diag_handler_cmd+0x2ce/0x3f0 net/ipv4/inet_diag.c:1170
>   __sock_diag_cmd net/core/sock_diag.c:232 [inline]
>   sock_diag_rcv_msg+0x31d/0x410 net/core/sock_diag.c:263
>   netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2477
>   sock_diag_rcv+0x2a/0x40 net/core/sock_diag.c:274
>   netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
>   netlink_unicast+0x5a5/0x760 net/netlink/af_netlink.c:1336
>   netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1917
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xd5/0x120 net/socket.c:631
>   sock_write_iter+0x35e/0x5c0 net/socket.c:900
>   call_write_iter include/linux/fs.h:1857 [inline]
>   do_iter_readv_writev+0x8b0/0xa80 fs/read_write.c:680
>   do_iter_write+0x185/0x5f0 fs/read_write.c:959
>   vfs_writev+0x1f1/0x360 fs/read_write.c:1004
>   do_writev+0x11a/0x310 fs/read_write.c:1039
>   __do_sys_writev fs/read_write.c:1112 [inline]
>   __se_sys_writev fs/read_write.c:1109 [inline]
>   __x64_sys_writev+0x75/0xb0 fs/read_write.c:1109
>   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x457569
> Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:7f2cdabbac78 EFLAGS: 0246 ORIG_RAX: 0014
> RAX: ffda RBX: 0003 RCX: 00457569
> RDX: 0001 RSI: 2051c000 RDI: 000e
> RBP: 0072c0e0 R08:  R09: 
> R10:  R11: 0246 R12: 7f2cdabbb6d4
> R13: 004c33b1 R14: 004d97c8 R15: 
>
> Allocated by task 13672:
>   save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>   set_track mm/kasan/kasan.c:460 [inline]
>   kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
>   kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
>   kmem_cache_alloc+0x12e/0x730 mm/slab.c:3554
>   sk_prot_alloc+0x69/0x2e0 net/core/sock.c:1463
>   sk_alloc+0x10d/0x1690 net/core/sock.c:1523
>   inet_create+0x509/0x1070 net/ipv4/af_inet.c:321
>   

Re: KASAN: use-after-free Read in sctp_epaddr_lookup_transport

2018-11-19 Thread Xin Long
On Sat, Nov 17, 2018 at 10:59 AM syzbot
 wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:a97b95653383 drivers/net/ethernet/qlogic/qed/qed_rdma.h: f..
> git tree:   net
> console output: https://syzkaller.appspot.com/x/log.txt?x=1217d26d40
> kernel config:  https://syzkaller.appspot.com/x/.config?x=d86f24333880b605
> dashboard link: https://syzkaller.appspot.com/bug?extid=aad231d51b1923158444
> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=13b5bb0b40
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+aad231d51b1923158...@syzkaller.appspotmail.com
>
> ==
> BUG: KASAN: use-after-free in sctp_epaddr_lookup_transport+0xacb/0xb20
> net/sctp/input.c:971
> Read of size 8 at addr 8881cde426b0 by task syz-executor3/18110
>
The same fix is needed in sctp_epaddr_lookup_transport() as:
commit bab1be79a5169ac748d8292b20c86d874022d7ba
Author: Xin Long 
Date:   Mon Aug 27 18:38:31 2018 +0800

sctp: hold transport before accessing its asoc in sctp_transport_get_next

> CPU: 1 PID: 18110 Comm: syz-executor3 Not tainted 4.20.0-rc2+ #187
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x244/0x39d lib/dump_stack.c:113
>   print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
>   kasan_report_error mm/kasan/report.c:354 [inline]
>   kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
>   __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
>   sctp_epaddr_lookup_transport+0xacb/0xb20 net/sctp/input.c:971
>   sctp_endpoint_lookup_assoc+0xe0/0x290 net/sctp/endpointola.c:338
>   sctp_addr_id2transport+0x1f8/0x370 net/sctp/socket.c:279
>   sctp_getsockopt_peer_addr_params+0x17c/0x1260 net/sctp/socket.c:5613
>   sctp_getsockopt+0x44f9/0x7d32 net/sctp/socket.c:7462
>   sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2937
>   __sys_getsockopt+0x1ad/0x390 net/socket.c:1939
>   __do_sys_getsockopt net/socket.c:1950 [inline]
>   __se_sys_getsockopt net/socket.c:1947 [inline]
>   __x64_sys_getsockopt+0xbe/0x150 net/socket.c:1947
>   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x457569
> Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:7f177b561c78 EFLAGS: 0246 ORIG_RAX: 0037
> RAX: ffda RBX: 0005 RCX: 00457569
> RDX: 0009 RSI: 0084 RDI: 0006
> RBP: 0072c180 R08: 2044fffc R09: 
> R10: 20a68000 R11: 0246 R12: 7f177b5626d4
> R13: 004c8318 R14: 004ce200 R15: 
>
> Allocated by task 18068:
>   save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>   set_track mm/kasan/kasan.c:460 [inline]
>   kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
>   kmem_cache_alloc_trace+0x152/0x750 mm/slab.c:3620
>   kmalloc include/linux/slab.h:546 [inline]
>   kzalloc include/linux/slab.h:741 [inline]
>   sctp_association_new+0x14e/0x2290 net/sctp/associola.c:311
>   sctp_sendmsg_new_asoc+0x39c/0x11f0 net/sctp/socket.c:1723
>   sctp_sendmsg+0x18a5/0x1da0 net/sctp/socket.c:2086
>   inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xd5/0x120 net/socket.c:631
>   __sys_sendto+0x3d7/0x670 net/socket.c:1788
>   __do_sys_sendto net/socket.c:1800 [inline]
>   __se_sys_sendto net/socket.c:1796 [inline]
>   __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1796
>   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 18110:
>   save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>   set_track mm/kasan/kasan.c:460 [inline]
>   __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
>   kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>   __cache_free mm/slab.c:3498 [inline]
>   kfree+0xcf/0x230 mm/slab.c:3817
>   sctp_association_destroy net/sctp/associola.c:437 [inline]
>   sctp_association_put+0x264/0x350 net/sctp/associola.c:889
>   sctp_transport_destroy net/sctp/transport.c:180 [inline]
>   sctp_transport_put+0x186/0x1f0 net/sctp/transport.c:340
>   sctp_hash_cmp+0x1ef/0x260 net/sctp/input.c:825
>   __rhashtable_lookup include/linux/rhashtable.h:483 [inline]
>   rhltable_lookup include/linux/rhashtable.h:566 [inline]
>   sctp_epaddr_lookup

Re: KASAN: use-after-free Read in sctp_hash_transport

2018-11-19 Thread Xin Long
On Mon, Nov 19, 2018 at 4:23 AM syzbot
 wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:e119a369b0f1 Merge branch 'SMSC95xx-driver-updates'
> git tree:   net-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=124f5f7b40
> kernel config:  https://syzkaller.appspot.com/x/.config?x=d86f24333880b605
> dashboard link: https://syzkaller.appspot.com/bug?extid=0b05d8aa7cb185107483
> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+0b05d8aa7cb185107...@syzkaller.appspotmail.com
>
> ==
> BUG: KASAN: use-after-free in sctp_hash_transport+0x803/0x810
> net/sctp/input.c:958
> Read of size 8 at addr 8881c6b98cb0 by task syz-executor5/3552
>
> CPU: 0 PID: 3552 Comm: syz-executor5 Not tainted 4.20.0-rc2+ #299
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x244/0x39d lib/dump_stack.c:113
>   print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
>   kasan_report_error mm/kasan/report.c:354 [inline]
>   kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
>   __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
>   sctp_hash_transport+0x803/0x810 net/sctp/input.c:958
Caused by:
commit cd2b708750582e327789d8fb07c6eb5f79f7759f
Author: Xin Long 
Date:   Fri Feb 17 16:35:24 2017 +0800

sctp: check duplicate node before inserting a new transport

The same fix is needed as in:
commit bab1be79a5169ac748d8292b20c86d874022d7ba
Author: Xin Long 
Date:   Mon Aug 27 18:38:31 2018 +0800

sctp: hold transport before accessing its asoc in sctp_transport_get_next


>   sctp_assoc_add_peer+0xa21/0x10d0 net/sctp/associola.c:724
>   sctp_sendmsg_new_asoc+0x5da/0x11f0 net/sctp/socket.c:1757
>   sctp_sendmsg+0x18a5/0x1da0 net/sctp/socket.c:2086
>   inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xd5/0x120 net/socket.c:631
>   ___sys_sendmsg+0x7fd/0x930 net/socket.c:2116
>   __sys_sendmsg+0x11d/0x280 net/socket.c:2154
>   __do_sys_sendmsg net/socket.c:2163 [inline]
>   __se_sys_sendmsg net/socket.c:2161 [inline]
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2161
>   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x457569
> Code: fd b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 cb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:7f45462c7c78 EFLAGS: 0246 ORIG_RAX: 002e
> RAX: ffda RBX: 0003 RCX: 00457569
> RDX:  RSI: 2001afc8 RDI: 0005
> RBP: 0072c0e0 R08:  R09: 
> R10:  R11: 0246 R12: 7f45462c86d4
> R13: 004c381e R14: 004d59e8 R15: 
>
> Allocated by task 3509:
>   save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>   set_track mm/kasan/kasan.c:460 [inline]
>   kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
>   kmem_cache_alloc_trace+0x152/0x750 mm/slab.c:3620
>   kmalloc include/linux/slab.h:546 [inline]
>   kzalloc include/linux/slab.h:741 [inline]
>   sctp_association_new+0x14e/0x2290 net/sctp/associola.c:311
>   sctp_sendmsg_new_asoc+0x39c/0x11f0 net/sctp/socket.c:1723
>   sctp_sendmsg+0x18a5/0x1da0 net/sctp/socket.c:2086
>   inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xd5/0x120 net/socket.c:631
>   ___sys_sendmsg+0x7fd/0x930 net/socket.c:2116
>   __sys_sendmsg+0x11d/0x280 net/socket.c:2154
>   __do_sys_sendmsg net/socket.c:2163 [inline]
>   __se_sys_sendmsg net/socket.c:2161 [inline]
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2161
>   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 3552:
>   save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>   set_track mm/kasan/kasan.c:460 [inline]
>   __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
>   kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>   __cache_free mm/slab.c:3498 [inline]
>   kfree+0xcf/0x230 mm/slab.c:3817
>   sctp_association_destroy net/sctp/associola.c:437 [inline]
>   sctp_association_put+0x264/0x350 net/sctp/associola.c:889
>   sctp_transport_destroy net/sctp/transport.c:180 [inline]
>   sctp_transport_put+0x186/0x1f0 net/sctp/

Re: [PATCH net] sctp: always set frag_point on pmtu change

2018-11-18 Thread Xin Long
On Mon, Nov 19, 2018 at 5:49 AM Jakub Audykowicz
 wrote:
>
> Calling send on a connected SCTP socket results in kernel panic if
> spp_pathmtu was configured manually before an association is established
> and it was not reconfigured to another value once the association is
> established.
>
> Steps to reproduce:
> 1. Set up a listening SCTP server socket.
> 2. Set up an SCTP client socket.
> 3. Configure client socket using setsockopt SCTP_PEER_ADDR_PARAMS with
> spp_pathmtu set to a legal value (e.g. 1000) and
> SPP_PMTUD_DISABLE set in spp_flags.
> 4. Connect client to server.
> 5. Send message from client to server.
>
> At this point oom-killer is invoked, which will eventually lead to:
> [5.197262] Out of memory and no killable processes...
> [5.198107] Kernel panic - not syncing: System is deadlocked on memory
>
> Commit 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> introduces sctp_assoc_update_frag_point, but this function is not called
> in this case, causing frag_point to be zero:
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu)
> +   if (asoc->pathmtu != pmtu) {
> asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
> +   }
>
> In this scenario, on association establishment, asoc->pathmtu is already
> 1000 and pmtu will be as well. Before this commit the frag_point was being
> correctly set in the scenario described. Moving the call outside the if
> block fixes the issue.
>
> I will be providing a separate patch to lksctp-tools with a simple test
> reproducing this problem ("func_tests: frag_point should never be zero").
>
> I have also taken the liberty to introduce a sanity check in chunk.c to
> set the frag_point to a positive value in order to avoid chunking
> endlessly (which is the reason for the eventual panic).
>
> Fixes: 2f5e3c9df693 ("sctp: introduce sctp_assoc_update_frag_point")
> Signed-off-by: Jakub Audykowicz 
> ---
>  include/net/sctp/constants.h |  3 +++
>  net/sctp/associola.c | 13 +++--
>  net/sctp/chunk.c |  6 ++
>  3 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
> index 8dadc74c22e7..90316fab6f04 100644
> --- a/include/net/sctp/constants.h
> +++ b/include/net/sctp/constants.h
> @@ -293,6 +293,9 @@ enum { SCTP_MAX_GABS = 16 };
>  */
>  #define SCTP_DEFAULT_MINSEGMENT 512/* MTU size ... if no mtu disc */
>
> +/* An association's fragmentation point should never be non-positive */
> +#define SCTP_FRAG_POINT_MIN 1
> +
>  #define SCTP_SECRET_SIZE 32/* Number of octets in a 256 bits. */
>
>  #define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 6a28b96e779e..44d71a1af62e 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -1431,13 +1431,14 @@ void sctp_assoc_update_frag_point(struct 
> sctp_association *asoc)
>
>  void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
>  {
> -   if (asoc->pathmtu != pmtu) {
> -   asoc->pathmtu = pmtu;
> -   sctp_assoc_update_frag_point(asoc);
> -   }
> +   pr_debug("%s: before asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
> +
> +   asoc->pathmtu = pmtu;
> +   sctp_assoc_update_frag_point(asoc);
>
> -   pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
> -asoc->pathmtu, asoc->frag_point);
> +   pr_debug("%s: after asoc:%p, pmtu:%d, frag_point:%d\n",
> +   __func__, asoc, asoc->pathmtu, asoc->frag_point);
>  }
The idea was whenever asoc->pathmtu changes,  frag_point should
be updated, but we missed one place in sctp_association_init().

Another issue is that after the 4-way handshake, the client's
asoc->intl_enable may be changed from 0 to 1, which means the
frag_point should also be updated, since [1]:

void sctp_assoc_update_frag_point(struct sctp_association *asoc)
{
int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
sctp_datachk_len(&asoc->stream)); <--- [1]

So one fix for both issues is:

diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 0a78cdf..19d596d 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1327,4 +1327,5 @@ void sctp_stream_interleave_init(struct
sctp_stream *stream)
asoc = container_of(stream, struct sctp_association, stream);
stream->si = asoc->intl_enable ? &sctp_stream_interleave_1
   : &sctp_stream_interleave_0;
+   sctp_assoc_update_frag_point(asoc);
 }


>
>  /* Update the association's pmtu and frag_point by going through all the
> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
> index 

[PATCH net] sctp: not increase stream's incnt before sending addstrm_in request

2018-11-18 Thread Xin Long
Different from processing the addstrm_out request, the receiver handles
an addstrm_in request by sending back an addstrm_out request to the
sender, who will increase its stream's in and incnt later.

Currently stream->incnt is already increased when the addstrm_in
request is sent out in sctp_send_add_streams(). The wrong stream->incnt
can even cause a crash when copying stream info from the old stream's in
to the new one's in sctp_process_strreset_addstrm_out().

This patch is to fix it by simply removing the stream->incnt change
from sctp_send_add_streams().

Fixes: 242bd2d519d7 ("sctp: implement sender-side procedures for Add 
Incoming/Outgoing Streams Request Parameter")
Reported-by: Jianwen Ji 
Signed-off-by: Xin Long 
---
 net/sctp/stream.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ffb940d..3892e76 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -535,7 +535,6 @@ int sctp_send_add_streams(struct sctp_association *asoc,
goto out;
}
 
-   stream->incnt = incnt;
stream->outcnt = outcnt;
 
asoc->strreset_outstanding = !!out + !!in;
-- 
2.1.0



[PATCH net] Revert "sctp: remove sctp_transport_pmtu_check"

2018-11-18 Thread Xin Long
This reverts commit 22d7be267eaa8114dcc28d66c1c347f667d7878a.

The dst's MTU in a transport can be updated from outside sctp, e.g.
in xfrm, where the MTU information doesn't get synced between asoc,
transport and dst, so the pmtu check in sctp_packet_config is still
needed.
---
 include/net/sctp/sctp.h | 12 
 net/sctp/output.c   |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa3..ab9242e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -608,4 +608,16 @@ static inline __u32 sctp_dst_mtu(const struct dst_entry 
*dst)
 SCTP_DEFAULT_MINSEGMENT));
 }
 
+static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
+{
+   __u32 pmtu = sctp_dst_mtu(t->dst);
+
+   if (t->pathmtu == pmtu)
+   return true;
+
+   t->pathmtu = pmtu;
+
+   return false;
+}
+
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 67939ad..0860122 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -118,6 +118,9 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 
vtag,
sctp_transport_route(tp, NULL, sp);
if (asoc->param_flags & SPP_PMTUD_ENABLE)
sctp_assoc_sync_pmtu(asoc);
+   } else if (!sctp_transport_pmtu_check(tp)) {
+   if (asoc->param_flags & SPP_PMTUD_ENABLE)
+   sctp_assoc_sync_pmtu(asoc);
}
 
if (asoc->pmtu_pending) {
-- 
2.1.0



[PATCHv3 net-next 1/4] sctp: define subscribe in sctp_sock as __u16

2018-11-18 Thread Xin Long
The member subscribe in sctp_sock is used to indicate to which of
the events it is subscribed, more like a group of flags. So it's
better to be defined as __u16 (2 bytes), instead of struct
sctp_event_subscribe (13 bytes).

Note that sctp_event_subscribe is an UAPI struct, used on sockopt
calls, and thus it will not be removed. This patch only changes
the internal storage of the flags.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   |  2 +-
 include/net/sctp/ulpevent.h  | 39 ---
 include/uapi/linux/sctp.h|  6 +-
 net/sctp/chunk.c |  4 ++--
 net/sctp/socket.c| 35 ++-
 net/sctp/stream_interleave.c | 11 ++-
 net/sctp/ulpqueue.c  |  8 
 7 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index af9d494..bc7808a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,7 +217,7 @@ struct sctp_sock {
 * These two structures must be grouped together for the usercopy
 * whitelist region.
 */
-   struct sctp_event_subscribe subscribe;
+   __u16 subscribe;
struct sctp_initmsg initmsg;
 
int user_frag;
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 51b4e06..bd922a0 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -164,30 +164,39 @@ void sctp_ulpevent_read_nxtinfo(const struct 
sctp_ulpevent *event,
 
 __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);
 
+static inline void sctp_ulpevent_type_set(__u16 *subscribe,
+ __u16 sn_type, __u8 on)
+{
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return;
+
+   if (on)
+   *subscribe |=  (1 << (sn_type - SCTP_SN_TYPE_BASE));
+   else
+   *subscribe &= ~(1 << (sn_type - SCTP_SN_TYPE_BASE));
+}
+
 /* Is this event type enabled? */
-static inline int sctp_ulpevent_type_enabled(__u16 sn_type,
-struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
 {
-   int offset = sn_type - SCTP_SN_TYPE_BASE;
-   char *amask = (char *) mask;
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return false;
 
-   if (offset >= sizeof(struct sctp_event_subscribe))
-   return 0;
-   return amask[offset];
+   return subscribe & (1 << (sn_type - SCTP_SN_TYPE_BASE));
 }
 
 /* Given an event subscription, is this event enabled? */
-static inline int sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
-  struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
+   __u16 subscribe)
 {
__u16 sn_type;
-   int enabled = 1;
 
-   if (sctp_ulpevent_is_notification(event)) {
-   sn_type = sctp_ulpevent_get_notification_type(event);
-   enabled = sctp_ulpevent_type_enabled(sn_type, mask);
-   }
-   return enabled;
+   if (!sctp_ulpevent_is_notification(event))
+   return true;
+
+   sn_type = sctp_ulpevent_get_notification_type(event);
+
+   return sctp_ulpevent_type_enabled(subscribe, sn_type);
 }
 
 #endif /* __sctp_ulpevent_h__ */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c81feb3..66afa5b 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -632,7 +632,9 @@ union sctp_notification {
  */
 
 enum sctp_sn_type {
-   SCTP_SN_TYPE_BASE = (1<<15),
+   SCTP_SN_TYPE_BASE   = (1<<15),
+   SCTP_DATA_IO_EVENT  = SCTP_SN_TYPE_BASE,
+#define SCTP_DATA_IO_EVENT SCTP_DATA_IO_EVENT
SCTP_ASSOC_CHANGE,
 #define SCTP_ASSOC_CHANGE  SCTP_ASSOC_CHANGE
SCTP_PEER_ADDR_CHANGE,
@@ -657,6 +659,8 @@ enum sctp_sn_type {
 #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT
SCTP_STREAM_CHANGE_EVENT,
 #define SCTP_STREAM_CHANGE_EVENT   SCTP_STREAM_CHANGE_EVENT
+   SCTP_SN_TYPE_MAX= SCTP_STREAM_CHANGE_EVENT,
+#define SCTP_SN_TYPE_MAX   SCTP_SN_TYPE_MAX
 };
 
 /* Notification error codes used to fill up the error fields in some
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce80878..6c761af 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -109,8 +109,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
error = asoc->outqueue.error;
 
sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
-   >subscribe);
+

[PATCHv3 net-next 3/4] sctp: rename enum sctp_event to sctp_event_type

2018-11-18 Thread Xin Long
sctp_event is a structure name defined in RFC for sockopt
SCTP_EVENT. To avoid the conflict, rename it.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  2 +-
 include/net/sctp/sm.h|  4 ++--
 net/sctp/primitive.c |  2 +-
 net/sctp/sm_sideeffect.c | 12 ++--
 net/sctp/sm_statetable.c |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8dadc74..4588bdc 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -71,7 +71,7 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM };
 SCTP_NUM_AUTH_CHUNK_TYPES)
 
 /* These are the different flavours of event.  */
-enum sctp_event {
+enum sctp_event_type {
SCTP_EVENT_T_CHUNK = 1,
SCTP_EVENT_T_TIMEOUT,
SCTP_EVENT_T_OTHER,
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 9e3d327..24825a8 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -173,7 +173,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 __u8 sctp_get_chunk_type(struct sctp_chunk *chunk);
 const struct sctp_sm_table_entry *sctp_sm_lookup_event(
struct net *net,
-   enum sctp_event event_type,
+   enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype);
 int sctp_chunk_iif(const struct sctp_chunk *);
@@ -313,7 +313,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 /* Prototypes for statetable processing. */
 
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp);
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index c0817f7a..a8c4c33 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -53,7 +53,7 @@
 int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
void *arg) { \
int error = 0; \
-   enum sctp_event event_type; union sctp_subtype subtype; \
+   enum sctp_event_type event_type; union sctp_subtype subtype; \
enum sctp_state state; \
struct sctp_endpoint *ep; \
\
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 85d3930..1d143bc 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -52,7 +52,7 @@
 #include 
 #include 
 
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
enum sctp_disposition status,
struct sctp_cmd_seq *commands,
gfp_t gfp);
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
 union sctp_subtype subtype,
 enum sctp_state state,
 struct sctp_endpoint *ep,
@@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq 
*commands,
 /* Worker routine to handle SCTP_CMD_ASSOC_FAILED.  */
 static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
  struct sctp_association *asoc,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
  union sctp_subtype subtype,
  struct sctp_chunk *chunk,
  unsigned int error)
@@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association 
*asoc)
  * If you want to understand all of lksctp, this is a
  * good place to start.
  */
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp)
@@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event 
event_type,
 /*
  * This the master state function side effect processing function.
  */
-static int sctp_side_effects(enum sctp_event event_type,
+static int

[PATCHv3 net-next 4/4] sctp: add sockopt SCTP_EVENT

2018-11-18 Thread Xin Long
This patch adds sockopt SCTP_EVENT described in rfc6525#section-6.2.
With this sockopt users can subscribe to an event from a specified
asoc.

Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h |  7 
 net/sctp/socket.c | 88 +++
 2 files changed, 95 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66afa5b..d584073 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_STREAM_SCHEDULER_VALUE124
 #define SCTP_INTERLEAVING_SUPPORTED125
 #define SCTP_SENDMSG_CONNECT   126
+#define SCTP_EVENT 127
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -1154,6 +1155,12 @@ struct sctp_add_streams {
uint16_t sas_outstrms;
 };
 
+struct sctp_event {
+   sctp_assoc_t se_assoc_id;
+   uint16_t se_type;
+   uint8_t se_on;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c771827..e16c090 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, 
char __user *optval,
return 0;
 }
 
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+unsigned int optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_ulpevent *event;
+   struct sctp_event param;
+   int retval = 0;
+
+   if (optlen < sizeof(param)) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   optlen = sizeof(param);
+   if (copy_from_user(&param, optval, optlen)) {
+   retval = -EFAULT;
+   goto out;
+   }
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   if (!asoc) {
+   sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe,
+  param.se_type, param.se_on);
+   goto out;
+   }
+
+   sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on);
+
+   if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+   if (sctp_outq_is_empty(&asoc->outqueue)) {
+   event = sctp_ulpevent_make_sender_dry_event(asoc,
+   GFP_USER | __GFP_NOWARN);
+   if (!event) {
+   retval = -ENOMEM;
+   goto out;
+   }
+
+   asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+   }
+   }
+
+out:
+   return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4485,6 +4536,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_setsockopt_event(sk, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7430,6 +7484,37 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, 
int len,
return 0;
 }
 
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+int __user *optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_event param;
+   __u16 subscribe;
+
+   if (len < sizeof(param))
+   return -EINVAL;
+
+   len = sizeof(param);
+   if (copy_from_user(&param, optval, len))
+   return -EFAULT;
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX)
+   return -EINVAL;
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+   param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+   if (put_user(len, optlen))
+   return -EFAULT;
+
+   if (copy_to_user(optval, &param, len))
+   return -EFAULT;
+
+   return 0;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
   char __user *optval, int __user *optlen)
 {
@@ -7628,6 +7713,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_getsockopt_event(sk, len, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
-- 
2.1.0



[PATCHv3 net-next 0/4] sctp: add subscribe per asoc and sockopt SCTP_EVENT

2018-11-18 Thread Xin Long
This patchset mainly adds the Event Subscription sockopt described in
rfc6525#section-6.2:

"Subscribing to events as described in [RFC6458] uses a setsockopt()
call with the SCTP_EVENT socket option.  This option takes the
following structure, which specifies the association, the event type
(using the same value found in the event type field), and an on/off
boolean.

  struct sctp_event {
sctp_assoc_t se_assoc_id;
uint16_t se_type;
uint8_t  se_on;
  };

The user fills in the se_type field with the same value found in the
strreset_type field, i.e., SCTP_STREAM_RESET_EVENT.  The user will
also fill in the se_assoc_id field with either the association to set
this event on (this field is ignored for one-to-one style sockets) or
one of the reserved constant values defined in [RFC6458].  Finally,
the se_on field is set with a 1 to enable the event or a 0 to disable
the event."

As for the old SCTP_EVENTS Option with struct sctp_event_subscribe,
it's being DEPRECATED.

v1->v2:
  - fix some keywords in the changelog that triggered the filters at
vger.kernel.org.
v2->v3:
  - fix an array out of bounds noticed by Neil in patch 1/4.

Xin Long (4):
  sctp: define subscribe in sctp_sock as __u16
  sctp: add subscribe per asoc
  sctp: rename enum sctp_event to sctp_event_type
  sctp: add sockopt SCTP_EVENT

 include/net/sctp/constants.h |   2 +-
 include/net/sctp/sm.h|   4 +-
 include/net/sctp/structs.h   |   4 +-
 include/net/sctp/ulpevent.h  |  39 --
 include/uapi/linux/sctp.h|  13 -
 net/sctp/associola.c |   2 +
 net/sctp/chunk.c |   8 ++-
 net/sctp/primitive.c |   2 +-
 net/sctp/sm_sideeffect.c |  12 ++---
 net/sctp/sm_statetable.c |   2 +-
 net/sctp/socket.c| 125 ---
 net/sctp/stream_interleave.c |  12 +++--
 net/sctp/ulpqueue.c  |   8 +--
 13 files changed, 183 insertions(+), 50 deletions(-)

-- 
2.1.0



[PATCHv3 net-next 2/4] sctp: add subscribe per asoc

2018-11-18 Thread Xin Long
The member subscribe should be per asoc, so that sockopt SCTP_EVENT
in the next patch can subscribe to an event from one asoc only.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   | 2 ++
 net/sctp/associola.c | 2 ++
 net/sctp/chunk.c | 6 ++
 net/sctp/socket.c| 6 +-
 net/sctp/stream_interleave.c | 7 ---
 net/sctp/ulpqueue.c  | 4 ++--
 6 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index bc7808a..7eaa294 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2077,6 +2077,8 @@ struct sctp_association {
 
int sent_cnt_removable;
 
+   __u16 subscribe;
+
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 };
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96..685c7ef 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -135,6 +135,8 @@ static struct sctp_association *sctp_association_init(
 */
asoc->max_burst = sp->max_burst;
 
+   asoc->subscribe = sp->subscribe;
+
/* initialize association timers */
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 6c761af..0b203b8 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg)
 /* Final destructruction of datamsg memory. */
 static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 {
+   struct sctp_association *asoc = NULL;
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
-   struct sctp_sock *sp;
struct sctp_ulpevent *ev;
-   struct sctp_association *asoc = NULL;
int error = 0, notify;
 
/* If we failed, we may need to notify. */
@@ -108,8 +107,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
else
error = asoc->outqueue.error;
 
-   sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(sp->subscribe,
+   notify = sctp_ulpevent_type_enabled(asoc->subscribe,
SCTP_SEND_FAILED);
}
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9d75129..c771827 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2307,6 +2307,7 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
struct sctp_event_subscribe subscribe;
__u8 *sn_type = (__u8 *)
struct sctp_sock *sp = sctp_sk(sk);
+   struct sctp_association *asoc;
int i;
 
if (optlen > sizeof(struct sctp_event_subscribe))
@@ -2319,14 +2320,17 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
sctp_ulpevent_type_set(>subscribe, SCTP_SN_TYPE_BASE + i,
   sn_type[i]);
 
+   list_for_each_entry(asoc, >ep->asocs, asocs)
+   asoc->subscribe = sctp_sk(sk)->subscribe;
+
/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
 * if there is no data to be sent or retransmit, the stack will
 * immediately send up this notification.
 */
if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) {
-   struct sctp_association *asoc = sctp_id2assoc(sk, 0);
struct sctp_ulpevent *event;
 
+   asoc = sctp_id2assoc(sk, 0);
if (asoc && sctp_outq_is_empty(>outqueue)) {
event = sctp_ulpevent_make_sender_dry_event(asoc,
GFP_USER | __GFP_NOWARN);
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index ceef5a3..a6bf215 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -503,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
sk_incoming_cpu_update(sk);
}
 
-   if (!sctp_ulpevent_is_enabled(event, sp->subscribe))
+   if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
 
if (skb_list)
@@ -992,16 +992,17 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq 
*ulpq, __u16 sid,
  __u32 mid, __u16 flags, gfp_t gfp)
 {
struct sock *sk = ulpq->asoc->base.sk;
-   struct sctp_sock *sp = sctp_sk(sk);
struct sctp_ulpevent *ev = NULL;
 
-   if (!sctp_ulpevent_type_enabled(sp->subscribe,
+   if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
SCTP_PARTIAL_DELIVERY_EVENT))
 

[PATCHv2 net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-17 Thread Xin Long
As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:

   This socket option allows the enabling or disabling of the
   negotiation of PR-SCTP support for future associations.  For existing
   associations, it allows one to query whether or not PR-SCTP support
   was negotiated on a particular association.

It means only sctp sock's prsctp_enable can be set.

Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
sctp in another patchset.

v1->v2:
  - drop the params.assoc_id check as Neil suggested.

Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt")
Reported-by: Ying Xu 
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 26 +-
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 739f3e5..bf618d1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3940,32 +3940,16 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
unsigned int optlen)
 {
struct sctp_assoc_value params;
-   struct sctp_association *asoc;
-   int retval = -EINVAL;
 
if (optlen != sizeof(params))
-   goto out;
-
-   if (copy_from_user(, optval, optlen)) {
-   retval = -EFAULT;
-   goto out;
-   }
-
-   asoc = sctp_id2assoc(sk, params.assoc_id);
-   if (asoc) {
-   asoc->prsctp_enable = !!params.assoc_value;
-   } else if (!params.assoc_id) {
-   struct sctp_sock *sp = sctp_sk(sk);
+   return -EINVAL;
 
-   sp->ep->prsctp_enable = !!params.assoc_value;
-   } else {
-   goto out;
-   }
+   if (copy_from_user(, optval, optlen))
+   return -EFAULT;
 
-   retval = 0;
+   sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value;
 
-out:
-   return retval;
+   return 0;
 }
 
 static int sctp_setsockopt_default_prinfo(struct sock *sk,
-- 
2.1.0



[PATCH net] sctp: count sk_wmem_alloc by skb truesize in sctp_packet_transmit

2018-11-17 Thread Xin Long
Now sctp increases sk_wmem_alloc by 1 when doing set_owner_w for the
skb allocated in sctp_packet_transmit and decreases by 1 when freeing
this skb.

But when this skb goes through the networking stack, some subcomponents
might change skb->truesize and add the same amount to sk_wmem_alloc.
However, sctp doesn't know the amount to decrease by, so it would cause
a leak on sk->sk_wmem_alloc and the sock can never be freed.

Xiumei found this issue when it hit esp_output_head() by using sctp
over ipsec, where skb->truesize is added and so is sk->sk_wmem_alloc.

Since sctp has used sk_wmem_queued to count for writable space since
Commit cd305c74b0f8 ("sctp: use sk_wmem_queued to check for writable
space"), it's ok to fix it by counting sk_wmem_alloc by skb truesize
in sctp_packet_transmit.

Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible")
Reported-by: Xiumei Mu 
Signed-off-by: Xin Long 
---
 net/sctp/output.c | 21 +
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 67939ad..88dfa6a 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -396,25 +396,6 @@ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet 
*packet,
return retval;
 }
 
-static void sctp_packet_release_owner(struct sk_buff *skb)
-{
-   sk_free(skb->sk);
-}
-
-static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
-{
-   skb_orphan(skb);
-   skb->sk = sk;
-   skb->destructor = sctp_packet_release_owner;
-
-   /*
-* The data chunks have already been accounted for in sctp_sendmsg(),
-* therefore only reserve a single byte to keep socket around until
-* the packet has been transmitted.
-*/
-   refcount_inc(>sk_wmem_alloc);
-}
-
 static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
 {
if (SCTP_OUTPUT_CB(head)->last == head)
@@ -601,7 +582,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t 
gfp)
if (!head)
goto out;
skb_reserve(head, packet->overhead + MAX_HEADER);
-   sctp_packet_set_owner_w(head, sk);
+   skb_set_owner_w(head, sk);
 
/* set sctp header */
sh = skb_push(head, sizeof(struct sctphdr));
-- 
2.1.0



Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-17 Thread Xin Long
On Sat, Nov 17, 2018 at 12:12 AM Neil Horman  wrote:
>
> On Thu, Nov 15, 2018 at 09:41:01PM -0200, Marcelo Ricardo Leitner wrote:
> > [ re-sending, without html this time ]
> >
> > On Thu, Nov 15, 2018, 15:26 Neil Horman  >
> > > On Thu, Nov 15, 2018 at 08:25:36PM -0200, Marcelo Ricardo Leitner wrote:
> > > > On Thu, Nov 15, 2018 at 04:43:10PM -0500, Neil Horman wrote:
> > > > > On Thu, Nov 15, 2018 at 03:22:21PM -0200, Marcelo Ricardo Leitner
> > > wrote:
> > > > > > On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> > > > > > > As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> > > > > > >
> > > > > > >This socket option allows the enabling or disabling of the
> > > > > > >negotiation of PR-SCTP support for future associations.  For
> > > existing
> > > > > > >associations, it allows one to query whether or not PR-SCTP
> > > support
> > > > > > >was negotiated on a particular association.
> > > > > > >
> > > > > > > It means only sctp sock's prsctp_enable can be set.
> > > > > > >
> > > > > > > Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
> > > > > > > add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
> > > > > > > sctp in another patchset.
> > > > > > >
> > > > > > > Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp 
> > > > > > > sockopt")
> > > > > > > Reported-by: Ying Xu 
> > > > > > > Signed-off-by: Xin Long 
> > > > > > > ---
> > > > > > >  net/sctp/socket.c | 13 +++--
> > > > > > >  1 file changed, 3 insertions(+), 10 deletions(-)
> > > > > > >
> > > > > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > > > > index 739f3e5..e9b8232 100644
> > > > > > > --- a/net/sctp/socket.c
> > > > > > > +++ b/net/sctp/socket.c
> > > > > > > @@ -3940,7 +3940,6 @@ static int
> > > sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > unsigned int optlen)
> > > > > > >  {
> > > > > > > struct sctp_assoc_value params;
> > > > > > > -   struct sctp_association *asoc;
> > > > > > > int retval = -EINVAL;
> > > > > > >
> > > > > > > if (optlen != sizeof(params))
> > > > > > > @@ -3951,16 +3950,10 @@ static int
> > > sctp_setsockopt_pr_supported(struct sock *sk,
> > > > > > > goto out;
> > > > > > > }
> > > > > > >
> > > > > > > -   asoc = sctp_id2assoc(sk, params.assoc_id);
> > > > > > > -   if (asoc) {
> > > > > > > -   asoc->prsctp_enable = !!params.assoc_value;
> > > > > > > -   } else if (!params.assoc_id) {
> > > > > > > -   struct sctp_sock *sp = sctp_sk(sk);
> > > > > > > -
> > > > > > > -   sp->ep->prsctp_enable = !!params.assoc_value;
> > > > > > > -   } else {
> > > > > > > +   if (sctp_style(sk, UDP) && sctp_id2assoc(sk,
> > > params.assoc_id))
> > > > > >
> > > > > > This would allow using a non-existent assoc id on UDP-style sockets
> > > to
> > > > > > set it at the socket, which is not expected. It should be more like:
> > > > > >
> > > > > > + if (sctp_style(sk, UDP) && params.assoc_id)
> > > > > How do you see that to be the case? sctp_id2assoc will return NULL if
> > > an
> > > > > association isn't found, so the use of sctp_id2assoc should work just
> > > fine.
> > > >
> > > > Right, it will return NULL, and because of that it won't bail out as
> > > > it should and will adjust the socket config instead.
> > > >
> > >
> > > Oh, duh, you're absolutely right, NULL will evaluate to false there, and
> > > skip
> > > the conditional goto out;
> > >
> > > that said, It would make more sense

Re: [PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-15 Thread Xin Long
On Fri, Nov 16, 2018 at 2:22 AM Marcelo Ricardo Leitner
 wrote:
>
> On Thu, Nov 15, 2018 at 07:14:28PM +0800, Xin Long wrote:
> > As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:
> >
> >This socket option allows the enabling or disabling of the
> >negotiation of PR-SCTP support for future associations.  For existing
> >associations, it allows one to query whether or not PR-SCTP support
> >was negotiated on a particular association.
> >
> > It means only sctp sock's prsctp_enable can be set.
> >
> > Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
> > add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
> > sctp in another patchset.
> >
> > Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt")
> > Reported-by: Ying Xu 
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/socket.c | 13 +++--
> >  1 file changed, 3 insertions(+), 10 deletions(-)
> >
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index 739f3e5..e9b8232 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -3940,7 +3940,6 @@ static int sctp_setsockopt_pr_supported(struct sock 
> > *sk,
> >   unsigned int optlen)
> >  {
> >   struct sctp_assoc_value params;
> > - struct sctp_association *asoc;
> >   int retval = -EINVAL;
> >
> >   if (optlen != sizeof(params))
> > @@ -3951,16 +3950,10 @@ static int sctp_setsockopt_pr_supported(struct sock 
> > *sk,
> >   goto out;
> >   }
> >
> > - asoc = sctp_id2assoc(sk, params.assoc_id);
> > - if (asoc) {
> > - asoc->prsctp_enable = !!params.assoc_value;
> > - } else if (!params.assoc_id) {
> > - struct sctp_sock *sp = sctp_sk(sk);
> > -
> > - sp->ep->prsctp_enable = !!params.assoc_value;
> > - } else {
> > + if (sctp_style(sk, UDP) && sctp_id2assoc(sk, params.assoc_id))
I got this semantic from BSD's SCTP_PR_SUPPORTED sockopt:
SCTP_FIND_STCB(inp, stcb, av->assoc_id);

if (stcb) {
SCTP_LTRACE_ERR_RET(...);
error = EINVAL;
SCTP_TCB_UNLOCK(stcb);
} else {
...
}

>
> This would allow using a non-existent assoc id on UDP-style sockets to
> set it at the socket, which is not expected. It should be more like:
>
> +   if (sctp_style(sk, UDP) && params.assoc_id)
This way is more strict, but it seems reasonable.

When a user sets params.assoc_id for a UDP-style socket, it should be
treated as the user wanting to apply this to an asoc, which is not
allowed here.


[PATCH net] sctp: not allow to set asoc prsctp_enable by sockopt

2018-11-15 Thread Xin Long
As rfc7496#section4.5 says about SCTP_PR_SUPPORTED:

   This socket option allows the enabling or disabling of the
   negotiation of PR-SCTP support for future associations.  For existing
   associations, it allows one to query whether or not PR-SCTP support
   was negotiated on a particular association.

It means only sctp sock's prsctp_enable can be set.

Note that for the limitation of SCTP_{CURRENT|ALL}_ASSOC, we will
add it when introducing SCTP_{FUTURE|CURRENT|ALL}_ASSOC for linux
sctp in another patchset.

Fixes: 28aa4c26fce2 ("sctp: add SCTP_PR_SUPPORTED on sctp sockopt")
Reported-by: Ying Xu 
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 739f3e5..e9b8232 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3940,7 +3940,6 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
unsigned int optlen)
 {
struct sctp_assoc_value params;
-   struct sctp_association *asoc;
int retval = -EINVAL;
 
if (optlen != sizeof(params))
@@ -3951,16 +3950,10 @@ static int sctp_setsockopt_pr_supported(struct sock *sk,
goto out;
}
 
-   asoc = sctp_id2assoc(sk, params.assoc_id);
-   if (asoc) {
-   asoc->prsctp_enable = !!params.assoc_value;
-   } else if (!params.assoc_id) {
-   struct sctp_sock *sp = sctp_sk(sk);
-
-   sp->ep->prsctp_enable = !!params.assoc_value;
-   } else {
+   if (sctp_style(sk, UDP) && sctp_id2assoc(sk, params.assoc_id))
goto out;
-   }
+
+   sctp_sk(sk)->ep->prsctp_enable = !!params.assoc_value;
 
retval = 0;
 
-- 
2.1.0



Re: [PATCH net] ipv6: fix a dst leak when removing its exception

2018-11-14 Thread Xin Long
On Thu, Nov 15, 2018 at 3:33 PM David Ahern  wrote:
>
> On 11/14/18 11:03 AM, David Ahern wrote:
> > On 11/13/18 8:48 AM, Xin Long wrote:
> >> There is no need to hold dst before calling rt6_remove_exception_rt().
> >> The call to dst_hold_safe() in ip6_link_failure() was for ip6_del_rt(),
> >> which has been removed in Commit 93531c674315 ("net/ipv6: separate
> >> handling of FIB entries from dst based routes"). Otherwise, it will
> >> cause a dst leak.
> >>
> >> This patch is to simply remove the dst_hold_safe() call before calling
> >> rt6_remove_exception_rt() and also do the same in ip6_del_cached_rt().
> >> It's safe, because the removal of the exception that holds its dst's
> >> refcnt is protected by rt6_exception_lock.
> >>
> >> Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst 
> >> based routes")
> >> Fixes: 23fb93a4d3f1 ("net/ipv6: Cleanup exception and cache route 
> >> handling")
> >> Reported-by: Li Shuang 
> >> Signed-off-by: Xin Long 
> >> ---
> >>  net/ipv6/route.c | 7 +++
> >>  1 file changed, 3 insertions(+), 4 deletions(-)
> >
> > was this problem actually hit or is this patch based on a code analysis?
> >
>
> I ask because I have not been able to reproduce the leak using existing
> tests (e.g., pmtu) that I know create exceptions.
>
> If this problem was hit, it would be good to get a test case for it.
The attachment is the ip6_dst.sh with IPVS.

# sh ip6_dst.sh

But this one triggers the kernel warnings caused by 2 places:
   unregister_netdevice: waiting for br0 to become free. Usage count = 3

1. one is IPVS, I just posted the fix:
https://patchwork.ozlabs.org/patch/998123/  [1]
2. the other one is IPv6,
ip6_link_failure() will be hit.

So to reproduce this cleanly, you may want to apply
patch [1] first.


ip6_dst.sh
Description: Bourne shell script


Re: [PATCHv2 net-next 1/4] sctp: define subscribe in sctp_sock as __u16

2018-11-14 Thread Xin Long
On Wed, Nov 14, 2018 at 2:16 AM Neil Horman  wrote:
>
> On Tue, Nov 13, 2018 at 02:24:53PM +0800, Xin Long wrote:
> >
> >   /* Default Peer Address Parameters.  These defaults can
> >* be modified via SCTP_PEER_ADDR_PARAMS
> > @@ -5267,14 +5274,24 @@ static int sctp_getsockopt_disable_fragments(struct 
> > sock *sk, int len,
> >  static int sctp_getsockopt_events(struct sock *sk, int len, char __user 
> > *optval,
> > int __user *optlen)
> >  {
> > + struct sctp_event_subscribe subscribe;
> > + __u8 *sn_type = (__u8 *)
> > + int i;
> > +
> >   if (len == 0)
> >   return -EINVAL;
> >   if (len > sizeof(struct sctp_event_subscribe))
> >   len = sizeof(struct sctp_event_subscribe);
> >   if (put_user(len, optlen))
> >   return -EFAULT;
> > - if (copy_to_user(optval, _sk(sk)->subscribe, len))
> > +
> > + for (i = 0; i <= len; i++)
> > + sn_type[i] = 
> > sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe,
> > + SCTP_SN_TYPE_BASE + 
> > i);
> > +
> This seems like an off-by-one error.  sctp_event_subscribe has N bytes in it
> (1 byte for each event), meaning that events 0-(N-1) are subscribable.
> Iterating this loop implies that you are going to check N events, overrunning
> the sctp_event_subscribe struct.
you're right, thanks.

>
> Neil
>
> >


[PATCH net] ipv6: fix a dst leak when removing its exception

2018-11-13 Thread Xin Long
There is no need to hold dst before calling rt6_remove_exception_rt().
The call to dst_hold_safe() in ip6_link_failure() was for ip6_del_rt(),
which has been removed in Commit 93531c674315 ("net/ipv6: separate
handling of FIB entries from dst based routes"). Otherwise, it will
cause a dst leak.

This patch is to simply remove the dst_hold_safe() call before calling
rt6_remove_exception_rt() and also do the same in ip6_del_cached_rt().
It's safe, because the removal of the exception that holds its dst's
refcnt is protected by rt6_exception_lock.

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Fixes: 23fb93a4d3f1 ("net/ipv6: Cleanup exception and cache route handling")
Reported-by: Li Shuang 
Signed-off-by: Xin Long 
---
 net/ipv6/route.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a7423c..14b422f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2232,8 +2232,7 @@ static void ip6_link_failure(struct sk_buff *skb)
if (rt) {
rcu_read_lock();
if (rt->rt6i_flags & RTF_CACHE) {
-   if (dst_hold_safe(>dst))
-   rt6_remove_exception_rt(rt);
+   rt6_remove_exception_rt(rt);
} else {
struct fib6_info *from;
struct fib6_node *fn;
@@ -3214,8 +3213,8 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct 
fib6_config *cfg)
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(>fc_gateway, >rt6i_gateway))
goto out;
-   if (dst_hold_safe(>dst))
-   rc = rt6_remove_exception_rt(rt);
+
+   rc = rt6_remove_exception_rt(rt);
 out:
return rc;
 }
-- 
2.1.0



Re: [PATCH net-next 0/4] sctp: add subscribe per asoc and sockopt SCTP_EVENT

2018-11-12 Thread Xin Long
On Tue, Nov 13, 2018 at 1:26 AM Xin Long  wrote:
>
> This patchset mainly adds the Event Subscription sockopt described in
> rfc6525#section-6.2:
>
> Subscribing to events as described in [RFC6458] uses a setsockopt()
> call with the SCTP_EVENT socket option.  This option takes the
> following structure, which specifies the association, the event type
> (using the same value found in the event type field), and an on/off
> boolean.
>
>   struct sctp_event {
> sctp_assoc_t se_assoc_id;
> uint16_t se_type;
> uint8_t  se_on;
>   };
>
> The user fills in the se_type field with the same value found in the
> strreset_type field, i.e., SCTP_STREAM_RESET_EVENT.  The user will
> also fill in the se_assoc_id field with either the association to set
> this event on (this field is ignored for one-to-one style sockets) or
> one of the reserved constant values defined in [RFC6458].  Finally,
> the se_on field is set with a 1 to enable the event or a 0 to disable
> the event.
>
> As for the old SCTP_EVENTS Option with struct sctp_event_subscribe,
> it's being DEPRECATED.
>
> Xin Long (4):
>   sctp: define subscribe in sctp_sock as __u16
>   sctp: add subscribe per asoc
>   sctp: rename enum sctp_event to sctp_event_type
>   sctp: add sockopt SCTP_EVENT
>
>  include/net/sctp/constants.h |   2 +-
>  include/net/sctp/sm.h|   4 +-
>  include/net/sctp/structs.h   |   4 +-
>  include/net/sctp/ulpevent.h  |  39 --
>  include/uapi/linux/sctp.h|  13 -
>  net/sctp/associola.c |   2 +
>  net/sctp/chunk.c |   8 ++-
>  net/sctp/primitive.c |   2 +-
>  net/sctp/sm_sideeffect.c |  12 ++---
>  net/sctp/sm_statetable.c |   2 +-
>  net/sctp/socket.c| 126 
> ---
>  net/sctp/stream_interleave.c |  12 +++--
>  net/sctp/ulpqueue.c  |   8 +--
>  13 files changed, 184 insertions(+), 50 deletions(-)
>
> --
> 2.1.0
>
Because some keyword in the changelog triggered the filters at vger.kernel.org,
v2 has been posted.


[PATCHv2 net-next 2/4] sctp: add subscribe per asoc

2018-11-12 Thread Xin Long
The member subscribe should be per asoc, so that sockopt SCTP_EVENT
in the next patch can subscribe to an event on one asoc only.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   | 2 ++
 net/sctp/associola.c | 2 ++
 net/sctp/chunk.c | 6 ++
 net/sctp/socket.c| 6 +-
 net/sctp/stream_interleave.c | 7 ---
 net/sctp/ulpqueue.c  | 4 ++--
 6 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index bc7808a..7eaa294 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2077,6 +2077,8 @@ struct sctp_association {
 
int sent_cnt_removable;
 
+   __u16 subscribe;
+
__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
 };
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 6a28b96..685c7ef 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -135,6 +135,8 @@ static struct sctp_association *sctp_association_init(
 */
asoc->max_burst = sp->max_burst;
 
+   asoc->subscribe = sp->subscribe;
+
/* initialize association timers */
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 6c761af..0b203b8 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -86,11 +86,10 @@ void sctp_datamsg_free(struct sctp_datamsg *msg)
 /* Final destructruction of datamsg memory. */
 static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 {
+   struct sctp_association *asoc = NULL;
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
-   struct sctp_sock *sp;
struct sctp_ulpevent *ev;
-   struct sctp_association *asoc = NULL;
int error = 0, notify;
 
/* If we failed, we may need to notify. */
@@ -108,8 +107,7 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
else
error = asoc->outqueue.error;
 
-   sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(sp->subscribe,
+   notify = sctp_ulpevent_type_enabled(asoc->subscribe,
SCTP_SEND_FAILED);
}
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 48e0b45..789008d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2307,6 +2307,7 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
struct sctp_event_subscribe subscribe;
__u8 *sn_type = (__u8 *)
struct sctp_sock *sp = sctp_sk(sk);
+   struct sctp_association *asoc;
int i;
 
if (optlen > sizeof(struct sctp_event_subscribe))
@@ -2319,14 +2320,17 @@ static int sctp_setsockopt_events(struct sock *sk, char 
__user *optval,
sctp_ulpevent_type_set(>subscribe, SCTP_SN_TYPE_BASE + i,
   sn_type[i]);
 
+   list_for_each_entry(asoc, >ep->asocs, asocs)
+   asoc->subscribe = sctp_sk(sk)->subscribe;
+
/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
 * if there is no data to be sent or retransmit, the stack will
 * immediately send up this notification.
 */
if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) {
-   struct sctp_association *asoc = sctp_id2assoc(sk, 0);
struct sctp_ulpevent *event;
 
+   asoc = sctp_id2assoc(sk, 0);
if (asoc && sctp_outq_is_empty(>outqueue)) {
event = sctp_ulpevent_make_sender_dry_event(asoc,
GFP_USER | __GFP_NOWARN);
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index ceef5a3..a6bf215 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -503,7 +503,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
sk_incoming_cpu_update(sk);
}
 
-   if (!sctp_ulpevent_is_enabled(event, sp->subscribe))
+   if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
goto out_free;
 
if (skb_list)
@@ -992,16 +992,17 @@ static void sctp_intl_stream_abort_pd(struct sctp_ulpq 
*ulpq, __u16 sid,
  __u32 mid, __u16 flags, gfp_t gfp)
 {
struct sock *sk = ulpq->asoc->base.sk;
-   struct sctp_sock *sp = sctp_sk(sk);
struct sctp_ulpevent *ev = NULL;
 
-   if (!sctp_ulpevent_type_enabled(sp->subscribe,
+   if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
SCTP_PARTIAL_DELIVERY_EVENT))
 

[PATCHv2 net-next 4/4] sctp: add sockopt SCTP_EVENT

2018-11-12 Thread Xin Long
This patch adds sockopt SCTP_EVENT described in rfc6525#section-6.2.
With this sockopt users can subscribe to an event from a specified
asoc.

Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h |  7 
 net/sctp/socket.c | 89 +++
 2 files changed, 96 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66afa5b..d584073 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_STREAM_SCHEDULER_VALUE124
 #define SCTP_INTERLEAVING_SUPPORTED125
 #define SCTP_SENDMSG_CONNECT   126
+#define SCTP_EVENT 127
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -1154,6 +1155,12 @@ struct sctp_add_streams {
uint16_t sas_outstrms;
 };
 
+struct sctp_event {
+   sctp_assoc_t se_assoc_id;
+   uint16_t se_type;
+   uint8_t se_on;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 789008d..1451211 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, 
char __user *optval,
return 0;
 }
 
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+unsigned int optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_ulpevent *event;
+   struct sctp_event param;
+   int retval = 0;
+
+   if (optlen < sizeof(param)) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   optlen = sizeof(param);
+   if (copy_from_user(, optval, optlen)) {
+   retval = -EFAULT;
+   goto out;
+   }
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   if (!asoc) {
+   sctp_ulpevent_type_set(_sk(sk)->subscribe,
+  param.se_type, param.se_on);
+   goto out;
+   }
+
+   sctp_ulpevent_type_set(>subscribe, param.se_type, param.se_on);
+
+   if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+   if (sctp_outq_is_empty(>outqueue)) {
+   event = sctp_ulpevent_make_sender_dry_event(asoc,
+   GFP_USER | __GFP_NOWARN);
+   if (!event) {
+   retval = -ENOMEM;
+   goto out;
+   }
+
+   asoc->stream.si->enqueue_event(>ulpq, event);
+   }
+   }
+
+out:
+   return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4485,6 +4536,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_setsockopt_event(sk, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7430,6 +7484,38 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, 
int len,
return 0;
 }
 
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+int __user *optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_event param;
+   __u16 subscribe;
+
+   if (len < sizeof(param))
+   return -EINVAL;
+
+   len = sizeof(param);
+   if (copy_from_user(, optval, len))
+   return -EFAULT;
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX)
+   return -EINVAL;
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+   param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+   if (put_user(len, optlen))
+   return -EFAULT;
+
+   if (copy_to_user(optval, , len))
+   return -EFAULT;
+
+   return 0;
+}
+
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
   char __user *optval, int __user *optlen)
 {
@@ -7628,6 +7714,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_getsockopt_event(sk, len, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
-- 
2.1.0



[PATCHv2 net-next 1/4] sctp: define subscribe in sctp_sock as __u16

2018-11-12 Thread Xin Long
The member subscribe in sctp_sock is used to indicate to which of
the events it is subscribed, more like a group of flags. So it's
better to be defined as __u16 (2 bytes), instead of struct
sctp_event_subscribe (13 bytes).

Note that sctp_event_subscribe is an UAPI struct, used on sockopt
calls, and thus it will not be removed. This patch only changes
the internal storage of the flags.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h   |  2 +-
 include/net/sctp/ulpevent.h  | 39 ---
 include/uapi/linux/sctp.h|  6 +-
 net/sctp/chunk.c |  4 ++--
 net/sctp/socket.c| 35 ++-
 net/sctp/stream_interleave.c | 11 ++-
 net/sctp/ulpqueue.c  |  8 
 7 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index af9d494..bc7808a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,7 +217,7 @@ struct sctp_sock {
 * These two structures must be grouped together for the usercopy
 * whitelist region.
 */
-   struct sctp_event_subscribe subscribe;
+   __u16 subscribe;
struct sctp_initmsg initmsg;
 
int user_frag;
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 51b4e06..bd922a0 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -164,30 +164,39 @@ void sctp_ulpevent_read_nxtinfo(const struct 
sctp_ulpevent *event,
 
 __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);
 
+static inline void sctp_ulpevent_type_set(__u16 *subscribe,
+ __u16 sn_type, __u8 on)
+{
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return;
+
+   if (on)
+   *subscribe |=  (1 << (sn_type - SCTP_SN_TYPE_BASE));
+   else
+   *subscribe &= ~(1 << (sn_type - SCTP_SN_TYPE_BASE));
+}
+
 /* Is this event type enabled? */
-static inline int sctp_ulpevent_type_enabled(__u16 sn_type,
-struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
 {
-   int offset = sn_type - SCTP_SN_TYPE_BASE;
-   char *amask = (char *) mask;
+   if (sn_type > SCTP_SN_TYPE_MAX)
+   return false;
 
-   if (offset >= sizeof(struct sctp_event_subscribe))
-   return 0;
-   return amask[offset];
+   return subscribe & (1 << (sn_type - SCTP_SN_TYPE_BASE));
 }
 
 /* Given an event subscription, is this event enabled? */
-static inline int sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
-  struct sctp_event_subscribe *mask)
+static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
+   __u16 subscribe)
 {
__u16 sn_type;
-   int enabled = 1;
 
-   if (sctp_ulpevent_is_notification(event)) {
-   sn_type = sctp_ulpevent_get_notification_type(event);
-   enabled = sctp_ulpevent_type_enabled(sn_type, mask);
-   }
-   return enabled;
+   if (!sctp_ulpevent_is_notification(event))
+   return true;
+
+   sn_type = sctp_ulpevent_get_notification_type(event);
+
+   return sctp_ulpevent_type_enabled(subscribe, sn_type);
 }
 
 #endif /* __sctp_ulpevent_h__ */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c81feb3..66afa5b 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -632,7 +632,9 @@ union sctp_notification {
  */
 
 enum sctp_sn_type {
-   SCTP_SN_TYPE_BASE = (1<<15),
+   SCTP_SN_TYPE_BASE   = (1<<15),
+   SCTP_DATA_IO_EVENT  = SCTP_SN_TYPE_BASE,
+#define SCTP_DATA_IO_EVENT SCTP_DATA_IO_EVENT
SCTP_ASSOC_CHANGE,
 #define SCTP_ASSOC_CHANGE  SCTP_ASSOC_CHANGE
SCTP_PEER_ADDR_CHANGE,
@@ -657,6 +659,8 @@ enum sctp_sn_type {
 #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT
SCTP_STREAM_CHANGE_EVENT,
 #define SCTP_STREAM_CHANGE_EVENT   SCTP_STREAM_CHANGE_EVENT
+   SCTP_SN_TYPE_MAX= SCTP_STREAM_CHANGE_EVENT,
+#define SCTP_SN_TYPE_MAX   SCTP_SN_TYPE_MAX
 };
 
 /* Notification error codes used to fill up the error fields in some
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index ce80878..6c761af 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -109,8 +109,8 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
error = asoc->outqueue.error;
 
sp = sctp_sk(asoc->base.sk);
-   notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
-   >subscribe);
+

[PATCHv2 net-next 3/4] sctp: rename enum sctp_event to sctp_event_type

2018-11-12 Thread Xin Long
sctp_event is a structure name defined in RFC for sockopt
SCTP_EVENT. To avoid the conflict, rename it.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  2 +-
 include/net/sctp/sm.h|  4 ++--
 net/sctp/primitive.c |  2 +-
 net/sctp/sm_sideeffect.c | 12 ++--
 net/sctp/sm_statetable.c |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8dadc74..4588bdc 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -71,7 +71,7 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM };
 SCTP_NUM_AUTH_CHUNK_TYPES)
 
 /* These are the different flavours of event.  */
-enum sctp_event {
+enum sctp_event_type {
SCTP_EVENT_T_CHUNK = 1,
SCTP_EVENT_T_TIMEOUT,
SCTP_EVENT_T_OTHER,
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 9e3d327..24825a8 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -173,7 +173,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 __u8 sctp_get_chunk_type(struct sctp_chunk *chunk);
 const struct sctp_sm_table_entry *sctp_sm_lookup_event(
struct net *net,
-   enum sctp_event event_type,
+   enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype);
 int sctp_chunk_iif(const struct sctp_chunk *);
@@ -313,7 +313,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 /* Prototypes for statetable processing. */
 
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp);
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index c0817f7a..a8c4c33 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -53,7 +53,7 @@
 int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
void *arg) { \
int error = 0; \
-   enum sctp_event event_type; union sctp_subtype subtype; \
+   enum sctp_event_type event_type; union sctp_subtype subtype; \
enum sctp_state state; \
struct sctp_endpoint *ep; \
\
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 85d3930..1d143bc 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -52,7 +52,7 @@
 #include 
 #include 
 
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
enum sctp_disposition status,
struct sctp_cmd_seq *commands,
gfp_t gfp);
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
 union sctp_subtype subtype,
 enum sctp_state state,
 struct sctp_endpoint *ep,
@@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq 
*commands,
 /* Worker routine to handle SCTP_CMD_ASSOC_FAILED.  */
 static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
  struct sctp_association *asoc,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
  union sctp_subtype subtype,
  struct sctp_chunk *chunk,
  unsigned int error)
@@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association 
*asoc)
  * If you want to understand all of lksctp, this is a
  * good place to start.
  */
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp)
@@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event 
event_type,
 /*
  * This the master state function side effect processing function.
  */
-static int sctp_side_effects(enum sctp_event event_type,
+static int

[PATCHv2 net-next 0/4] sctp: add subscribe per asoc and sockopt SCTP_EVENT

2018-11-12 Thread Xin Long
This patchset mainly adds the Event Subscription sockopt described in
rfc6525#section-6.2:

"Subscribing to events as described in [RFC6458] uses a setsockopt()
call with the SCTP_EVENT socket option.  This option takes the
following structure, which specifies the association, the event type
(using the same value found in the event type field), and an on/off
boolean.

  struct sctp_event {
sctp_assoc_t se_assoc_id;
uint16_t se_type;
uint8_t  se_on;
  };

The user fills in the se_type field with the same value found in the
strreset_type field, i.e., SCTP_STREAM_RESET_EVENT.  The user will
also fill in the se_assoc_id field with either the association to set
this event on (this field is ignored for one-to-one style sockets) or
one of the reserved constant values defined in [RFC6458].  Finally,
the se_on field is set with a 1 to enable the event or a 0 to disable
the event."

As for the old SCTP_EVENTS Option with struct sctp_event_subscribe,
it's being DEPRECATED.

Xin Long (4):
  sctp: define subscribe in sctp_sock as __u16
  sctp: add subscribe per asoc
  sctp: rename enum sctp_event to sctp_event_type
  sctp: add sockopt SCTP_EVENT

 include/net/sctp/constants.h |   2 +-
 include/net/sctp/sm.h|   4 +-
 include/net/sctp/structs.h   |   4 +-
 include/net/sctp/ulpevent.h  |  39 --
 include/uapi/linux/sctp.h|  13 -
 net/sctp/associola.c |   2 +
 net/sctp/chunk.c |   8 ++-
 net/sctp/primitive.c |   2 +-
 net/sctp/sm_sideeffect.c |  12 ++---
 net/sctp/sm_statetable.c |   2 +-
 net/sctp/socket.c| 126 ---
 net/sctp/stream_interleave.c |  12 +++--
 net/sctp/ulpqueue.c  |   8 +--
 13 files changed, 184 insertions(+), 50 deletions(-)

-- 
2.1.0



[PATCH net] l2tp: fix a sock refcnt leak in l2tp_tunnel_register

2018-11-12 Thread Xin Long
This issue happens when trying to add an already existing tunnel.
The sock refcnt is taken by sock_hold() before the existence check,
but sock_put() is not called to release it before returning
-EEXIST.

This patch is to fix it by holding the sock after doing the
existence check.

Fixes: f6cd651b056f ("l2tp: fix race in duplicate tunnel detection")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/l2tp/l2tp_core.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 82cdf90..26f1d43 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1490,12 +1490,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, 
struct net *net,
goto err_sock;
}
 
-   sk = sock->sk;
-
-   sock_hold(sk);
-   tunnel->sock = sk;
tunnel->l2tp_net = net;
-
pn = l2tp_pernet(net);
 
spin_lock_bh(>l2tp_tunnel_list_lock);
@@ -1510,6 +1505,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, 
struct net *net,
list_add_rcu(>list, >l2tp_tunnel_list);
spin_unlock_bh(>l2tp_tunnel_list_lock);
 
+   sk = sock->sk;
+   sock_hold(sk);
+   tunnel->sock = sk;
+
if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
struct udp_tunnel_sock_cfg udp_cfg = {
.sk_user_data = tunnel,
-- 
2.1.0



[PATCH net-next 3/4] sctp: rename enum sctp_event to sctp_event_type

2018-11-12 Thread Xin Long
sctp_event is a structure name defined in RFC for sockopt
SCTP_EVENT. To avoid the conflict, rename it.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  2 +-
 include/net/sctp/sm.h|  4 ++--
 net/sctp/primitive.c |  2 +-
 net/sctp/sm_sideeffect.c | 12 ++--
 net/sctp/sm_statetable.c |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8dadc74..4588bdc 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -71,7 +71,7 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM };
 SCTP_NUM_AUTH_CHUNK_TYPES)
 
 /* These are the different flavours of event.  */
-enum sctp_event {
+enum sctp_event_type {
SCTP_EVENT_T_CHUNK = 1,
SCTP_EVENT_T_TIMEOUT,
SCTP_EVENT_T_OTHER,
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 9e3d327..24825a8 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -173,7 +173,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 __u8 sctp_get_chunk_type(struct sctp_chunk *chunk);
 const struct sctp_sm_table_entry *sctp_sm_lookup_event(
struct net *net,
-   enum sctp_event event_type,
+   enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype);
 int sctp_chunk_iif(const struct sctp_chunk *);
@@ -313,7 +313,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 /* Prototypes for statetable processing. */
 
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp);
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index c0817f7a..a8c4c33 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -53,7 +53,7 @@
 int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
void *arg) { \
int error = 0; \
-   enum sctp_event event_type; union sctp_subtype subtype; \
+   enum sctp_event_type event_type; union sctp_subtype subtype; \
enum sctp_state state; \
struct sctp_endpoint *ep; \
\
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 85d3930..1d143bc 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -52,7 +52,7 @@
 #include 
 #include 
 
-static int sctp_cmd_interpreter(enum sctp_event event_type,
+static int sctp_cmd_interpreter(enum sctp_event_type event_type,
union sctp_subtype subtype,
enum sctp_state state,
struct sctp_endpoint *ep,
@@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
enum sctp_disposition status,
struct sctp_cmd_seq *commands,
gfp_t gfp);
-static int sctp_side_effects(enum sctp_event event_type,
+static int sctp_side_effects(enum sctp_event_type event_type,
 union sctp_subtype subtype,
 enum sctp_state state,
 struct sctp_endpoint *ep,
@@ -623,7 +623,7 @@ static void sctp_cmd_init_failed(struct sctp_cmd_seq 
*commands,
 /* Worker routine to handle SCTP_CMD_ASSOC_FAILED.  */
 static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
  struct sctp_association *asoc,
- enum sctp_event event_type,
+ enum sctp_event_type event_type,
  union sctp_subtype subtype,
  struct sctp_chunk *chunk,
  unsigned int error)
@@ -1162,7 +1162,7 @@ static void sctp_cmd_send_asconf(struct sctp_association 
*asoc)
  * If you want to understand all of lksctp, this is a
  * good place to start.
  */
-int sctp_do_sm(struct net *net, enum sctp_event event_type,
+int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
   union sctp_subtype subtype, enum sctp_state state,
   struct sctp_endpoint *ep, struct sctp_association *asoc,
   void *event_arg, gfp_t gfp)
@@ -1199,7 +1199,7 @@ int sctp_do_sm(struct net *net, enum sctp_event 
event_type,
 /*
  * This the master state function side effect processing function.
  */
-static int sctp_side_effects(enum sctp_event event_type,
+static int

[PATCH net-next 4/4] sctp: add sockopt SCTP_EVENT

2018-11-12 Thread Xin Long
This patch adds sockopt SCTP_EVENT described in rfc6525#section-6.2.
With this sockopt users can subscribe to an event from a specified
asoc.

Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h |  7 
 net/sctp/socket.c | 89 +++
 2 files changed, 96 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66afa5b..d584073 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -129,6 +129,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_STREAM_SCHEDULER_VALUE124
 #define SCTP_INTERLEAVING_SUPPORTED125
 #define SCTP_SENDMSG_CONNECT   126
+#define SCTP_EVENT 127
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x0000
@@ -1154,6 +1155,12 @@ struct sctp_add_streams {
uint16_t sas_outstrms;
 };
 
+struct sctp_event {
+   sctp_assoc_t se_assoc_id;
+   uint16_t se_type;
+   uint8_t se_on;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 789008d..1451211 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4288,6 +4288,57 @@ static int sctp_setsockopt_reuse_port(struct sock *sk, 
char __user *optval,
return 0;
 }
 
+static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
+unsigned int optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_ulpevent *event;
+   struct sctp_event param;
+   int retval = 0;
+
+   if (optlen < sizeof(param)) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   optlen = sizeof(param);
+   if (copy_from_user(&param, optval, optlen)) {
+   retval = -EFAULT;
+   goto out;
+   }
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   if (!asoc) {
+   sctp_ulpevent_type_set(&sctp_sk(sk)->subscribe,
+  param.se_type, param.se_on);
+   goto out;
+   }
+
+   sctp_ulpevent_type_set(&asoc->subscribe, param.se_type, param.se_on);
+
+   if (param.se_type == SCTP_SENDER_DRY_EVENT && param.se_on) {
+   if (sctp_outq_is_empty(&asoc->outqueue)) {
+   event = sctp_ulpevent_make_sender_dry_event(asoc,
+   GFP_USER | __GFP_NOWARN);
+   if (!event) {
+   retval = -ENOMEM;
+   goto out;
+   }
+
+   asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+   }
+   }
+
+out:
+   return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4485,6 +4536,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_setsockopt_event(sk, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7430,6 +7484,38 @@ static int sctp_getsockopt_reuse_port(struct sock *sk, 
int len,
return 0;
 }
 
+static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
+int __user *optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_event param;
+   __u16 subscribe;
+
+   if (len < sizeof(param))
+   return -EINVAL;
+
+   len = sizeof(param);
+   if (copy_from_user(&param, optval, len))
+   return -EFAULT;
+
+   if (param.se_type < SCTP_SN_TYPE_BASE ||
+   param.se_type > SCTP_SN_TYPE_MAX)
+   return -EINVAL;
+
+   asoc = sctp_id2assoc(sk, param.se_assoc_id);
+   subscribe = asoc ? asoc->subscribe : sctp_sk(sk)->subscribe;
+   param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);
+
+   if (put_user(len, optlen))
+   return -EFAULT;
+
+   if (copy_to_user(optval, &param, len))
+   return -EFAULT;
+
+   return 0;
+}
+
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
   char __user *optval, int __user *optlen)
 {
@@ -7628,6 +7714,9 @@ static int sctp_getsockopt(struct sock *sk, int level, 
int optname,
case SCTP_REUSE_PORT:
retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
break;
+   case SCTP_EVENT:
+   retval = sctp_getsockopt_event(sk, len, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
-- 
2.1.0



[PATCH net-next 0/4] sctp: add subscribe per asoc and sockopt SCTP_EVENT

2018-11-12 Thread Xin Long
This patchset mainly adds the Event Subscription sockopt described in
rfc6525#section-6.2:

Subscribing to events as described in [RFC6458] uses a setsockopt()
call with the SCTP_EVENT socket option.  This option takes the
following structure, which specifies the association, the event type
(using the same value found in the event type field), and an on/off
boolean.

  struct sctp_event {
sctp_assoc_t se_assoc_id;
uint16_t se_type;
uint8_t  se_on;
  };

The user fills in the se_type field with the same value found in the
strreset_type field, i.e., SCTP_STREAM_RESET_EVENT.  The user will
also fill in the se_assoc_id field with either the association to set
this event on (this field is ignored for one-to-one style sockets) or
one of the reserved constant values defined in [RFC6458].  Finally,
the se_on field is set with a 1 to enable the event or a 0 to disable
the event.

As for the old SCTP_EVENTS Option with struct sctp_event_subscribe,
it's being DEPRECATED.

Xin Long (4):
  sctp: define subscribe in sctp_sock as __u16
  sctp: add subscribe per asoc
  sctp: rename enum sctp_event to sctp_event_type
  sctp: add sockopt SCTP_EVENT

 include/net/sctp/constants.h |   2 +-
 include/net/sctp/sm.h|   4 +-
 include/net/sctp/structs.h   |   4 +-
 include/net/sctp/ulpevent.h  |  39 --
 include/uapi/linux/sctp.h|  13 -
 net/sctp/associola.c |   2 +
 net/sctp/chunk.c |   8 ++-
 net/sctp/primitive.c |   2 +-
 net/sctp/sm_sideeffect.c |  12 ++---
 net/sctp/sm_statetable.c |   2 +-
 net/sctp/socket.c| 126 ---
 net/sctp/stream_interleave.c |  12 +++--
 net/sctp/ulpqueue.c  |   8 +--
 13 files changed, 184 insertions(+), 50 deletions(-)

-- 
2.1.0



[PATCHv2 net-next 2/3] sctp: add sock_reuseport for the sock in __sctp_hash_endpoint

2018-11-12 Thread Xin Long
This is a part of sk_reuseport support for sctp. It defines a helper
sctp_bind_addrs_check() to check if the bind_addrs in two socks are
matched. It will add sock_reuseport if they are completely matched,
and return err if they are partly matched, and alloc sock_reuseport
if all socks are not matched at all.

It will not work until sk_reuseport support is added in
sctp_get_port_local() in the next patch.

v1->v2:
  - use 'laddr->valid && laddr2->valid' check instead as Marcelo
pointed in sctp_bind_addrs_check().

Acked-by: Neil Horman 
Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h|  2 +-
 include/net/sctp/structs.h |  2 ++
 net/core/sock_reuseport.c  |  1 +
 net/sctp/bind_addr.c   | 28 ++
 net/sctp/input.c   | 60 +++---
 net/sctp/socket.c  |  3 +--
 6 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 9a3b48a..cdf2e80 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct 
sctp_association *asoc,
  */
 int sctp_rcv(struct sk_buff *skb);
 int sctp_v4_err(struct sk_buff *skb, u32 info);
-void sctp_hash_endpoint(struct sctp_endpoint *);
+int sctp_hash_endpoint(struct sctp_endpoint *ep);
 void sctp_unhash_endpoint(struct sctp_endpoint *);
 struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
 struct sctphdr *, struct sctp_association **,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a11f937..15d017f 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1190,6 +1190,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, 
const union sctp_addr *,
 struct sctp_sock *, struct sctp_sock *);
 int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
 const union sctp_addr *addr);
+int sctp_bind_addrs_check(struct sctp_sock *sp,
+ struct sctp_sock *sp2, int cnt2);
 union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr  *bp,
const union sctp_addr   *addrs,
int addrcnt,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index ba5cba5..d8fe3e5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
 }
+EXPORT_SYMBOL(reuseport_add_sock);
 
 void reuseport_detach_sock(struct sock *sk)
 {
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 7df3704..ebf28ad 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
return match;
 }
 
+int sctp_bind_addrs_check(struct sctp_sock *sp,
+ struct sctp_sock *sp2, int cnt2)
+{
+   struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr;
+   struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
+   struct sctp_sockaddr_entry *laddr, *laddr2;
+   bool exist = false;
+   int cnt = 0;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+   list_for_each_entry_rcu(laddr2, &bp2->address_list, list) {
+   if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
+   laddr->valid && laddr2->valid) {
+   exist = true;
+   goto next;
+   }
+   }
+   cnt = 0;
+   break;
+next:
+   cnt++;
+   }
+   rcu_read_unlock();
+
+   return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1);
+}
+
 /* Does the address 'addr' conflict with any addresses in
  * the bp.
  */
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 00f995e..d7a649d 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -724,43 +724,87 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
 }
 
 /* Insert endpoint into the hash table.  */
-static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
+static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
 {
-   struct net *net = sock_net(ep->base.sk);
-   struct sctp_ep_common *epb;
+   struct sock *sk = ep->base.sk;
+   struct net *net = sock_net(sk);
struct sctp_hashbucket *head;
+   struct sctp_ep_common *epb;
 
epb = &ep->base;
-
epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
head = &sctp_ep_hashtable[epb->hashent];
 
+   if (sk->sk_reuseport) {
+   bool any = sctp_is_ep_boundall(sk);
+   struct sctp_ep_common *epb2;
+   struct list_head *list;
+   

[PATCHv2 net-next 3/3] sctp: process sk_reuseport in sctp_get_port_local

2018-11-12 Thread Xin Long
When socks' sk_reuseport is set, the same port and address are allowed
to be bound into these socks who have the same uid.

Note that the difference from sk_reuse is that it allows multiple socks
to listen on the same port and address.

Acked-by: Neil Horman 
Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h |  4 +++-
 net/sctp/socket.c  | 46 +-
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 15d017f..af9d494 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -96,7 +96,9 @@ struct sctp_stream;
 
 struct sctp_bind_bucket {
unsigned short  port;
-   unsigned short  fastreuse;
+   signed char fastreuse;
+   signed char fastreuseport;
+   kuid_t  fastuid;
struct hlist_node   node;
struct hlist_head   owner;
struct net  *net;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2e955f1..5299add 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7644,8 +7644,10 @@ static struct sctp_bind_bucket *sctp_bucket_create(
 
 static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 {
-   bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
+   struct sctp_sock *sp = sctp_sk(sk);
+   bool reuse = (sk->sk_reuse || sp->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
+   kuid_t uid = sock_i_uid(sk);
struct sctp_bind_bucket *pp;
unsigned short snum;
int ret;
@@ -7721,7 +7723,10 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
 
pr_debug("%s: found a possible match\n", __func__);
 
-   if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
+   if ((pp->fastreuse && reuse &&
+sk->sk_state != SCTP_SS_LISTENING) ||
+   (pp->fastreuseport && sk->sk_reuseport &&
+uid_eq(pp->fastuid, uid)))
goto success;
 
/* Run through the list of sockets bound to the port
@@ -7735,16 +7740,18 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
 * in an endpoint.
 */
sk_for_each_bound(sk2, &pp->owner) {
-   struct sctp_endpoint *ep2;
-   ep2 = sctp_sk(sk2)->ep;
+   struct sctp_sock *sp2 = sctp_sk(sk2);
+   struct sctp_endpoint *ep2 = sp2->ep;
 
if (sk == sk2 ||
-   (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
-sk2->sk_state != SCTP_SS_LISTENING))
+   (reuse && (sk2->sk_reuse || sp2->reuse) &&
+sk2->sk_state != SCTP_SS_LISTENING) ||
+   (sk->sk_reuseport && sk2->sk_reuseport &&
+uid_eq(uid, sock_i_uid(sk2))))
continue;
 
-   if (sctp_bind_addr_conflict(>base.bind_addr, addr,
-sctp_sk(sk2), sctp_sk(sk))) {
+   if (sctp_bind_addr_conflict(>base.bind_addr,
+   addr, sp2, sp)) {
ret = (long)sk2;
goto fail_unlock;
}
@@ -7767,19 +7774,32 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
-   } else if (pp->fastreuse &&
-  (!reuse || sk->sk_state == SCTP_SS_LISTENING))
-   pp->fastreuse = 0;
+
+   if (sk->sk_reuseport) {
+   pp->fastreuseport = 1;
+   pp->fastuid = uid;
+   } else {
+   pp->fastreuseport = 0;
+   }
+   } else {
+   if (pp->fastreuse &&
+   (!reuse || sk->sk_state == SCTP_SS_LISTENING))
+   pp->fastreuse = 0;
+
+   if (pp->fastreuseport &&
+   (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid)))
+   pp->fastreuseport = 0;
+   }
 
/* We are set, so fill up all the data in the hash table
 * entry, tie the socket list information with the rest of the
 * sockets FIXME: Blurry, NPI (ipg).
 */
 success:
-   if (!sctp_sk(sk)->bind_hash) {
+   if (!sp->bind_hash) {
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, >owner);
-   sctp_sk(sk)->bind_hash = pp;
+   sp->bind_hash = pp;
}
ret = 0;
 
-- 
2.1.0



[PATCHv2 net-next 1/3] sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint

2018-11-12 Thread Xin Long
This is a part of sk_reuseport support for sctp, and it selects a
sock by the hashkey of lport, paddr and dport by default. It will
not work until sk_reuseport support is added in sctp_get_port_local()
in the next patch.

v1->v2:
  - define lport as __be16 instead of __be32 as Marcelo pointed in
__sctp_rcv_lookup_endpoint().

Acked-by: Neil Horman 
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 69 +---
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 7ab08a5..00f995e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Forward declarations for internal helpers. */
 static int sctp_rcv_ootb(struct sk_buff *);
@@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net 
*net,
  const union sctp_addr *paddr,
  const union sctp_addr *laddr,
  struct sctp_transport **transportp);
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
-   const union sctp_addr *laddr);
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+   struct net *net, struct sk_buff *skb,
+   const union sctp_addr *laddr,
+   const union sctp_addr *daddr);
 static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
@@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
 
if (!asoc)
-   ep = __sctp_rcv_lookup_endpoint(net, &dest);
+   ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);
 
/* Retrieve the common input handling substructure. */
rcvr = asoc ? &asoc->base : &ep->base;
@@ -771,16 +774,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
local_bh_enable();
 }
 
+static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
+   const union sctp_addr *paddr, __u32 seed)
+{
+   __u32 addr;
+
+   if (paddr->sa.sa_family == AF_INET6)
+   addr = jhash(&paddr->v6.sin6_addr, 16, seed);
+   else
+   addr = (__force __u32)paddr->v4.sin_addr.s_addr;
+
+   return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
+(__force __u32)lport, net_hash_mix(net), seed);
+}
+
 /* Look up an endpoint. */
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
-   const union sctp_addr *laddr)
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+   struct net *net, struct sk_buff *skb,
+   const union sctp_addr *laddr,
+   const union sctp_addr *paddr)
 {
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
struct sctp_endpoint *ep;
+   struct sock *sk;
+   __be16 lport;
int hash;
 
-   hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
+   lport = laddr->v4.sin_port;
+   hash = sctp_ep_hashfn(net, ntohs(lport));
head = &sctp_ep_hashtable[hash];
read_lock(&head->lock);
sctp_for_each_hentry(epb, &head->chain) {
@@ -792,6 +814,15 @@ static struct sctp_endpoint 
*__sctp_rcv_lookup_endpoint(struct net *net,
ep = sctp_sk(net->sctp.ctl_sock)->ep;
 
 hit:
+   sk = ep->base.sk;
+   if (sk->sk_reuseport) {
+   __u32 phash = sctp_hashfn(net, lport, paddr, 0);
+
+   sk = reuseport_select_sock(sk, phash, skb,
+  sizeof(struct sctphdr));
+   if (sk)
+   ep = sctp_sk(sk)->ep;
+   }
sctp_endpoint_hold(ep);
read_unlock(&head->lock);
return ep;
@@ -830,35 +861,17 @@ static inline int sctp_hash_cmp(struct 
rhashtable_compare_arg *arg,
 static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
 {
const struct sctp_transport *t = data;
-   const union sctp_addr *paddr = >ipaddr;
-   const struct net *net = sock_net(t->asoc->base.sk);
-   __be16 lport = htons(t->asoc->base.bind_addr.port);
-   __u32 addr;
-
-   if (paddr->sa.sa_family == AF_INET6)
-   addr = jhash(>v6.sin6_addr, 16, seed);
-   else
-   addr = (__force __u32)paddr->v4.sin_addr.s_addr;
 
-   return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
-(__force __u32)lport, net_hash_mix(net), seed);
+   return sctp_hashfn

[PATCHv2 net-next 0/3] sctp: add support for sk_reuseport

2018-11-12 Thread Xin Long
sctp sk_reuseport allows multiple socks to listen on the same port and
addresses, as long as these socks have the same uid. This works pretty
much as TCP/UDP does, the only difference is that sctp is multi-homing
and all the bind_addrs in these socks will have to be completely matched,
otherwise listen() will return err.

The below is when 5 sockets are listening on 172.16.254.254:6400 on a
server, 26 sockets on a client connect to 172.16.254.254:6400 and each
may be processed by a different socket on the server which is selected
by hash(lport, pport, paddr) in reuseport_select_sock():

 # ss --sctp -nn
   State  Recv-Q Send-QLocal Address:Port Peer Address:Port
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.3:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.4:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.253.253:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.5:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.1:1234

Xin Long (3):
  sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint
  sctp: add sock_reuseport for the sock in __sctp_hash_endpoint
  sctp: process sk_reuseport in sctp_get_port_local

 include/net/sctp/sctp.h|   2 +-
 include/net/sctp/structs.h |   6 ++-
 net/core/sock_reuseport.c  |   1 +
 net/sctp/bind_addr.c   |  28 ++
 net/sctp/input.c   | 129 -
 net/sctp/socket.c  |  49 +++--
 6 files changed, 162 insertions(+), 53 deletions(-)

-- 
2.1.0



Re: [PATCH net-next 2/3] sctp: add sock_reuseport for the sock in __sctp_hash_endpoint

2018-11-12 Thread Xin Long
On Mon, Oct 22, 2018 at 11:15 PM Marcelo Ricardo Leitner
 wrote:
>
> On Sun, Oct 21, 2018 at 12:43:37PM +0800, Xin Long wrote:
> > This is a part of sk_reuseport support for sctp. It defines a helper
> > sctp_bind_addrs_check() to check if the bind_addrs in two socks are
> > matched. It will add sock_reuseport if they are completely matched,
> > and return err if they are partly matched, and alloc sock_reuseport
> > if all socks are not matched at all.
> >
> > It will work until sk_reuseport support is added in
> > sctp_get_port_local() in the next patch.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  include/net/sctp/sctp.h|  2 +-
> >  include/net/sctp/structs.h |  2 ++
> >  net/core/sock_reuseport.c  |  1 +
> >  net/sctp/bind_addr.c   | 28 ++
> >  net/sctp/input.c   | 60 
> > +++---
> >  net/sctp/socket.c  |  3 +--
> >  6 files changed, 85 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> > index 8c2caa3..b8cd58d 100644
> > --- a/include/net/sctp/sctp.h
> > +++ b/include/net/sctp/sctp.h
> > @@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct 
> > sctp_association *asoc,
> >   */
> >  int sctp_rcv(struct sk_buff *skb);
> >  void sctp_v4_err(struct sk_buff *skb, u32 info);
> > -void sctp_hash_endpoint(struct sctp_endpoint *);
> > +int sctp_hash_endpoint(struct sctp_endpoint *ep);
> >  void sctp_unhash_endpoint(struct sctp_endpoint *);
> >  struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
> >struct sctphdr *, struct sctp_association **,
> > diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> > index a11f937..15d017f 100644
> > --- a/include/net/sctp/structs.h
> > +++ b/include/net/sctp/structs.h
> > @@ -1190,6 +1190,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, 
> > const union sctp_addr *,
> >struct sctp_sock *, struct sctp_sock *);
> >  int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
> >const union sctp_addr *addr);
> > +int sctp_bind_addrs_check(struct sctp_sock *sp,
> > +   struct sctp_sock *sp2, int cnt2);
> >  union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr*bp,
> >   const union sctp_addr   *addrs,
> >   int addrcnt,
> > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > index ba5cba5..d8fe3e5 100644
> > --- a/net/core/sock_reuseport.c
> > +++ b/net/core/sock_reuseport.c
> > @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock 
> > *sk2, bool bind_inany)
> >   call_rcu(_reuse->rcu, reuseport_free_rcu);
> >   return 0;
> >  }
> > +EXPORT_SYMBOL(reuseport_add_sock);
> >
> >  void reuseport_detach_sock(struct sock *sk)
> >  {
> > diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
> > index 7df3704..78d0d93 100644
> > --- a/net/sctp/bind_addr.c
> > +++ b/net/sctp/bind_addr.c
> > @@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
> >   return match;
> >  }
> >
> > +int sctp_bind_addrs_check(struct sctp_sock *sp,
> > +   struct sctp_sock *sp2, int cnt2)
> > +{
> > + struct sctp_bind_addr *bp2 = >ep->base.bind_addr;
> > + struct sctp_bind_addr *bp = >ep->base.bind_addr;
> > + struct sctp_sockaddr_entry *laddr, *laddr2;
> > + bool exist = false;
> > + int cnt = 0;
> > +
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(laddr, >address_list, list) {
> > + list_for_each_entry_rcu(laddr2, >address_list, list) {
> > + if (sp->pf->af->cmp_addr(>a, >a) &&
> > + laddr->valid == laddr2->valid) {
>
> I think by here in the normal run laddr2->valid will always be true,
> but as is it gives the impression that it accepts 0 == 0 too, which
> would be bad.  May be on a fast BINDX_REM/BINDX_ADD it could trigger
> laddr2->valid = 0 in there, not sure.
>
> Anyway, may be '... laddr->valid && laddr2->valid' instead or you
> really want to allow the 0 == 0 case?
>
will improve it in v2. thanks.

> > + exist = true;
> > + goto next;
> > + 

Re: [PATCH net-next 1/3] sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint

2018-11-12 Thread Xin Long
On Mon, Oct 22, 2018 at 11:18 PM Marcelo Ricardo Leitner
 wrote:
>
> On Sun, Oct 21, 2018 at 12:43:36PM +0800, Xin Long wrote:
> > This is a part of sk_reuseport support for sctp, and it selects a
> > sock by the hashkey of lport, paddr and dport by default. It will
> > work until sk_reuseport support is added in sctp_get_port_local()
> > in the next patch.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/input.c | 69 
> > +---
> >  1 file changed, 41 insertions(+), 28 deletions(-)
> >
> > diff --git a/net/sctp/input.c b/net/sctp/input.c
> > index 5c36a99..60ede89 100644
> > --- a/net/sctp/input.c
> > +++ b/net/sctp/input.c
> > @@ -57,6 +57,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  /* Forward declarations for internal helpers. */
> >  static int sctp_rcv_ootb(struct sk_buff *);
> > @@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct 
> > net *net,
> > const union sctp_addr *paddr,
> > const union sctp_addr *laddr,
> > struct sctp_transport **transportp);
> > -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> > - const union sctp_addr *laddr);
> > +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> > + struct net *net, struct sk_buff *skb,
> > + const union sctp_addr *laddr,
> > + const union sctp_addr *daddr);
> >  static struct sctp_association *__sctp_lookup_association(
> >   struct net *net,
> >   const union sctp_addr *local,
> > @@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
> >   asoc = __sctp_rcv_lookup(net, skb, , , );
> >
> >   if (!asoc)
> > - ep = __sctp_rcv_lookup_endpoint(net, );
> > + ep = __sctp_rcv_lookup_endpoint(net, skb, , );
> >
> >   /* Retrieve the common input handling substructure. */
> >   rcvr = asoc ? >base : >base;
> > @@ -770,16 +773,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
> >   local_bh_enable();
> >  }
> >
> > +static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
> > + const union sctp_addr *paddr, __u32 seed)
> > +{
> > + __u32 addr;
> > +
> > + if (paddr->sa.sa_family == AF_INET6)
> > + addr = jhash(>v6.sin6_addr, 16, seed);
> > + else
> > + addr = (__force __u32)paddr->v4.sin_addr.s_addr;
> > +
> > + return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
> > +  (__force __u32)lport, net_hash_mix(net), seed);
> > +}
> > +
> >  /* Look up an endpoint. */
> > -static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
> > - const union sctp_addr *laddr)
> > +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
> > + struct net *net, struct sk_buff *skb,
> > + const union sctp_addr *laddr,
> > + const union sctp_addr *paddr)
> >  {
> >   struct sctp_hashbucket *head;
> >   struct sctp_ep_common *epb;
> >   struct sctp_endpoint *ep;
> > + struct sock *sk;
> > + __be32 lport;
>
> This could be a __be16 one.
right, will correct it in v2.

>
> >   int hash;
> >
> > - hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
> > + lport = laddr->v4.sin_port;
> > + hash = sctp_ep_hashfn(net, ntohs(lport));
> >   head = _ep_hashtable[hash];
> >   read_lock(>lock);
> >   sctp_for_each_hentry(epb, >chain) {
> > @@ -791,6 +813,15 @@ static struct sctp_endpoint 
> > *__sctp_rcv_lookup_endpoint(struct net *net,
> >   ep = sctp_sk(net->sctp.ctl_sock)->ep;
> >
> >  hit:
> > + sk = ep->base.sk;
> > + if (sk->sk_reuseport) {
> > + __u32 phash = sctp_hashfn(net, lport, paddr, 0);
> > +
> > + sk = reuseport_select_sock(sk, phash, skb,
> > +sizeof(struct sctphdr));
> > + if (sk)
> > + 

[PATCH net] sctp: define SCTP_SS_DEFAULT for Stream schedulers

2018-11-03 Thread Xin Long
According to rfc8260#section-4.3.2, SCTP_SS_DEFAULT is required to
be defined as SCTP_SS_FCFS or SCTP_SS_RR.

SCTP_SS_FCFS is used for SCTP_SS_DEFAULT's value in this patch.

Fixes: 5e32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Jianwen Ji 
Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h | 1 +
 net/sctp/outqueue.c   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 680ecc3..c81feb3 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1153,6 +1153,7 @@ struct sctp_add_streams {
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
+   SCTP_SS_DEFAULT = SCTP_SS_FCFS,
SCTP_SS_PRIO,
SCTP_SS_RR,
SCTP_SS_MAX = SCTP_SS_RR
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 9cb854b..c37e1c2 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -212,7 +212,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct 
sctp_outq *q)
INIT_LIST_HEAD(&q->retransmit);
INIT_LIST_HEAD(&q->sacked);
INIT_LIST_HEAD(&q->abandoned);
-   sctp_sched_set_sched(asoc, SCTP_SS_FCFS);
+   sctp_sched_set_sched(asoc, SCTP_SS_DEFAULT);
 }
 
 /* Free the outqueue structure and any related pending chunks.
-- 
2.1.0



[PATCH net] sctp: fix strchange_flags name for Stream Change Event

2018-11-03 Thread Xin Long
As defined in rfc6525#section-6.1.3, SCTP_STREAM_CHANGE_DENIED
and SCTP_STREAM_CHANGE_FAILED should be used instead of
SCTP_ASSOC_CHANGE_DENIED and SCTP_ASSOC_CHANGE_FAILED.

To keep the compatibility, fix it by adding two macros.

Fixes: b444153fb5a6 ("sctp: add support for generating add stream change event 
notification")
Reported-by: Jianwen Ji 
Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 34dd3d4..680ecc3 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -568,6 +568,8 @@ struct sctp_assoc_reset_event {
 
 #define SCTP_ASSOC_CHANGE_DENIED   0x0004
 #define SCTP_ASSOC_CHANGE_FAILED   0x0008
+#define SCTP_STREAM_CHANGE_DENIED  SCTP_ASSOC_CHANGE_DENIED
+#define SCTP_STREAM_CHANGE_FAILED  SCTP_ASSOC_CHANGE_FAILED
 struct sctp_stream_change_event {
__u16 strchange_type;
__u16 strchange_flags;
-- 
2.1.0



[PATCH net] sctp: check policy more carefully when getting pr status

2018-10-29 Thread Xin Long
When getting pr_assocstatus and pr_streamstatus by sctp_getsockopt,
it doesn't correctly process the case when policy is set with
SCTP_PR_SCTP_ALL | SCTP_PR_SCTP_MASK. It even causes a
slab-out-of-bounds in sctp_getsockopt_pr_streamstatus().

This patch fixes it by returning -EINVAL for this case.

Fixes: 0ac1077e3a54 ("sctp: get pr_assoc and pr_stream all status with 
SCTP_PR_SCTP_ALL")
Reported-by: syzbot+5da0d0a72a9e7d791...@syzkaller.appspotmail.com
Suggested-by: Marcelo Ricardo Leitner 
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index fc0386e..739f3e5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7083,14 +7083,15 @@ static int sctp_getsockopt_pr_assocstatus(struct sock 
*sk, int len,
}
 
policy = params.sprstat_policy;
-   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
+   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) ||
+   ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK)))
goto out;
 
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
if (!asoc)
goto out;
 
-   if (policy & SCTP_PR_SCTP_ALL) {
+   if (policy == SCTP_PR_SCTP_ALL) {
params.sprstat_abandoned_unsent = 0;
params.sprstat_abandoned_sent = 0;
for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
@@ -7142,7 +7143,8 @@ static int sctp_getsockopt_pr_streamstatus(struct sock 
*sk, int len,
}
 
policy = params.sprstat_policy;
-   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
+   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) ||
+   ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK)))
goto out;
 
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
-- 
2.1.0



[PATCH net] sctp: clear the transport of some out_chunk_list chunks in sctp_assoc_rm_peer

2018-10-29 Thread Xin Long
If a transport is removed by asconf but there are still some chunks with
this transport queued on out_chunk_list, a use-after-free issue will
later be caused when accessing this transport from these chunks in
sctp_outq_flush().

This is an old bug, we fix it by clearing the transport of these chunks
in out_chunk_list when removing a transport in sctp_assoc_rm_peer().

Reported-by: syzbot+56a40ceee5fb35932...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/associola.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a827a1f..6a28b96 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -499,8 +499,9 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
 void sctp_assoc_rm_peer(struct sctp_association *asoc,
struct sctp_transport *peer)
 {
-   struct list_head*pos;
-   struct sctp_transport   *transport;
+   struct sctp_transport *transport;
+   struct list_head *pos;
+   struct sctp_chunk *ch;
 
pr_debug("%s: association:%p addr:%pISpc\n",
 __func__, asoc, >ipaddr.sa);
@@ -564,7 +565,6 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
 */
if (!list_empty(>transmitted)) {
struct sctp_transport *active = asoc->peer.active_path;
-   struct sctp_chunk *ch;
 
/* Reset the transport of each chunk on this list */
list_for_each_entry(ch, >transmitted,
@@ -586,6 +586,10 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
sctp_transport_hold(active);
}
 
+   list_for_each_entry(ch, >outqueue.out_chunk_list, list)
+   if (ch->transport == peer)
+   ch->transport = NULL;
+
asoc->peer.transport_count--;
 
sctp_transport_free(peer);
-- 
2.1.0



Re: [PATCH net-next 0/3] sctp: add support for sk_reuseport

2018-10-21 Thread Xin Long
On Sun, Oct 21, 2018 at 1:43 PM Xin Long  wrote:
>
> sctp sk_reuseport allows multiple socks to listen on the same port and
> addresses, as long as these socks have the same uid. This works pretty
> much as TCP/UDP does, the only difference is that sctp is multi-homing
> and all the bind_addrs in these socks will have to match completely,
> otherwise listen() will return err.
>
> The below is when 5 sockets are listening on 172.16.254.254:6400 on a
> server, 26 sockets on a client connect to 172.16.254.254:6400 and each
> may be processed by a different socket on the server which is selected
> by hash(lport, pport, paddr) in reuseport_select_sock():
>
>  # ss --sctp -nn
>State  Recv-Q Send-QLocal Address:Port Peer Address:Port
>LISTEN 0  10   172.16.254.254:6400*:*
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.1:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.4:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.3:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.4:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.2:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.3:1234
>LISTEN 0  10   172.16.254.254:6400*:*
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.3:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.4:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.2:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.1:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.2:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.3:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.4:1234
>LISTEN 0  10   172.16.254.254:6400*:*
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.2:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.5:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.5:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.253.253:1234
>LISTEN 0  10   172.16.254.254:6400*:*
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.2:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.3:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.4:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.5:1234
>LISTEN 0  10   172.16.254.254:6400*:*
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.1:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.5:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.5:1234
>`- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.1:1234
>    `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.1:1234
Attached is the testcase based on sctp-tests.git.

>
> Xin Long (3):
>   sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint
>   sctp: add sock_reuseport for the sock in __sctp_hash_endpoint
>   sctp: process sk_reuseport in sctp_get_port_local
>
>  include/net/sctp/sctp.h|   2 +-
>  include/net/sctp/structs.h |   6 ++-
>  net/core/sock_reuseport.c  |   1 +
>  net/sctp/bind_addr.c   |  28 ++
>  net/sctp/input.c   | 129 
> -
>  net/sctp/socket.c  |  49 +++--
>  6 files changed, 162 insertions(+), 53 deletions(-)
>
> --
> 2.1.0
>


reuseport.tar.gz
Description: GNU Zip compressed data


[PATCH net-next 3/3] sctp: process sk_reuseport in sctp_get_port_local

2018-10-20 Thread Xin Long
When socks' sk_reuseport is set, the same port and address are allowed
to be bound into these socks, provided they have the same uid.

Note that the difference from sk_reuse is that it allows multiple socks
to listen on the same port and address.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h |  4 +++-
 net/sctp/socket.c  | 46 +-
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 15d017f..af9d494 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -96,7 +96,9 @@ struct sctp_stream;
 
 struct sctp_bind_bucket {
unsigned short  port;
-   unsigned short  fastreuse;
+   signed char fastreuse;
+   signed char fastreuseport;
+   kuid_t  fastuid;
struct hlist_node   node;
struct hlist_head   owner;
struct net  *net;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 44e7d8c..8605705 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7642,8 +7642,10 @@ static struct sctp_bind_bucket *sctp_bucket_create(
 
 static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 {
-   bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
+   struct sctp_sock *sp = sctp_sk(sk);
+   bool reuse = (sk->sk_reuse || sp->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
+   kuid_t uid = sock_i_uid(sk);
struct sctp_bind_bucket *pp;
unsigned short snum;
int ret;
@@ -7719,7 +7721,10 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
 
pr_debug("%s: found a possible match\n", __func__);
 
-   if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
+   if ((pp->fastreuse && reuse &&
+sk->sk_state != SCTP_SS_LISTENING) ||
+   (pp->fastreuseport && sk->sk_reuseport &&
+uid_eq(pp->fastuid, uid)))
goto success;
 
/* Run through the list of sockets bound to the port
@@ -7733,16 +7738,18 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
 * in an endpoint.
 */
sk_for_each_bound(sk2, >owner) {
-   struct sctp_endpoint *ep2;
-   ep2 = sctp_sk(sk2)->ep;
+   struct sctp_sock *sp2 = sctp_sk(sk2);
+   struct sctp_endpoint *ep2 = sp2->ep;
 
if (sk == sk2 ||
-   (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
-sk2->sk_state != SCTP_SS_LISTENING))
+   (reuse && (sk2->sk_reuse || sp2->reuse) &&
+sk2->sk_state != SCTP_SS_LISTENING) ||
+   (sk->sk_reuseport && sk2->sk_reuseport &&
+uid_eq(uid, sock_i_uid(sk2
continue;
 
-   if (sctp_bind_addr_conflict(>base.bind_addr, addr,
-sctp_sk(sk2), sctp_sk(sk))) {
+   if (sctp_bind_addr_conflict(>base.bind_addr,
+   addr, sp2, sp)) {
ret = (long)sk2;
goto fail_unlock;
}
@@ -7765,19 +7772,32 @@ static long sctp_get_port_local(struct sock *sk, union 
sctp_addr *addr)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
-   } else if (pp->fastreuse &&
-  (!reuse || sk->sk_state == SCTP_SS_LISTENING))
-   pp->fastreuse = 0;
+
+   if (sk->sk_reuseport) {
+   pp->fastreuseport = 1;
+   pp->fastuid = uid;
+   } else {
+   pp->fastreuseport = 0;
+   }
+   } else {
+   if (pp->fastreuse &&
+   (!reuse || sk->sk_state == SCTP_SS_LISTENING))
+   pp->fastreuse = 0;
+
+   if (pp->fastreuseport &&
+   (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid)))
+   pp->fastreuseport = 0;
+   }
 
/* We are set, so fill up all the data in the hash table
 * entry, tie the socket list information with the rest of the
 * sockets FIXME: Blurry, NPI (ipg).
 */
 success:
-   if (!sctp_sk(sk)->bind_hash) {
+   if (!sp->bind_hash) {
inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, >owner);
-   sctp_sk(sk)->bind_hash = pp;
+   sp->bind_hash = pp;
}
ret = 0;
 
-- 
2.1.0



[PATCH net-next 0/3] sctp: add support for sk_reuseport

2018-10-20 Thread Xin Long
sctp sk_reuseport allows multiple socks to listen on the same port and
addresses, as long as these socks have the same uid. This works pretty
much as TCP/UDP does, the only difference is that sctp is multi-homing
and all the bind_addrs in these socks will have to match completely,
otherwise listen() will return err.

The below is when 5 sockets are listening on 172.16.254.254:6400 on a
server, 26 sockets on a client connect to 172.16.254.254:6400 and each
may be processed by a different socket on the server which is selected
by hash(lport, pport, paddr) in reuseport_select_sock():

 # ss --sctp -nn
   State  Recv-Q Send-QLocal Address:Port Peer Address:Port
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.3:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.4:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.4.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.253.253:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.2:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.3:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.4:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.5:1234
   LISTEN 0  10   172.16.254.254:6400*:*
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.1.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.2.5:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.3.1:1234
   `- ESTAB   0  0   172.16.254.254%eth1:6400   172.16.5.1:1234

Xin Long (3):
  sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint
  sctp: add sock_reuseport for the sock in __sctp_hash_endpoint
  sctp: process sk_reuseport in sctp_get_port_local

 include/net/sctp/sctp.h|   2 +-
 include/net/sctp/structs.h |   6 ++-
 net/core/sock_reuseport.c  |   1 +
 net/sctp/bind_addr.c   |  28 ++
 net/sctp/input.c   | 129 -
 net/sctp/socket.c  |  49 +++--
 6 files changed, 162 insertions(+), 53 deletions(-)

-- 
2.1.0



[PATCH net-next 1/3] sctp: do reuseport_select_sock in __sctp_rcv_lookup_endpoint

2018-10-20 Thread Xin Long
This is a part of sk_reuseport support for sctp, and it selects a
sock by the hashkey of lport, paddr and dport by default. It will
not take effect until sk_reuseport support is added in
sctp_get_port_local() in the next patch.

Signed-off-by: Xin Long 
---
 net/sctp/input.c | 69 +---
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99..60ede89 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Forward declarations for internal helpers. */
 static int sctp_rcv_ootb(struct sk_buff *);
@@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net 
*net,
  const union sctp_addr *paddr,
  const union sctp_addr *laddr,
  struct sctp_transport **transportp);
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
-   const union sctp_addr *laddr);
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+   struct net *net, struct sk_buff *skb,
+   const union sctp_addr *laddr,
+   const union sctp_addr *daddr);
 static struct sctp_association *__sctp_lookup_association(
struct net *net,
const union sctp_addr *local,
@@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
asoc = __sctp_rcv_lookup(net, skb, , , );
 
if (!asoc)
-   ep = __sctp_rcv_lookup_endpoint(net, );
+   ep = __sctp_rcv_lookup_endpoint(net, skb, , );
 
/* Retrieve the common input handling substructure. */
rcvr = asoc ? >base : >base;
@@ -770,16 +773,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
local_bh_enable();
 }
 
+static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
+   const union sctp_addr *paddr, __u32 seed)
+{
+   __u32 addr;
+
+   if (paddr->sa.sa_family == AF_INET6)
+   addr = jhash(>v6.sin6_addr, 16, seed);
+   else
+   addr = (__force __u32)paddr->v4.sin_addr.s_addr;
+
+   return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
+(__force __u32)lport, net_hash_mix(net), seed);
+}
+
 /* Look up an endpoint. */
-static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
-   const union sctp_addr *laddr)
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
+   struct net *net, struct sk_buff *skb,
+   const union sctp_addr *laddr,
+   const union sctp_addr *paddr)
 {
struct sctp_hashbucket *head;
struct sctp_ep_common *epb;
struct sctp_endpoint *ep;
+   struct sock *sk;
+   __be32 lport;
int hash;
 
-   hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
+   lport = laddr->v4.sin_port;
+   hash = sctp_ep_hashfn(net, ntohs(lport));
head = _ep_hashtable[hash];
read_lock(>lock);
sctp_for_each_hentry(epb, >chain) {
@@ -791,6 +813,15 @@ static struct sctp_endpoint 
*__sctp_rcv_lookup_endpoint(struct net *net,
ep = sctp_sk(net->sctp.ctl_sock)->ep;
 
 hit:
+   sk = ep->base.sk;
+   if (sk->sk_reuseport) {
+   __u32 phash = sctp_hashfn(net, lport, paddr, 0);
+
+   sk = reuseport_select_sock(sk, phash, skb,
+  sizeof(struct sctphdr));
+   if (sk)
+   ep = sctp_sk(sk)->ep;
+   }
sctp_endpoint_hold(ep);
read_unlock(>lock);
return ep;
@@ -829,35 +860,17 @@ static inline int sctp_hash_cmp(struct 
rhashtable_compare_arg *arg,
 static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
 {
const struct sctp_transport *t = data;
-   const union sctp_addr *paddr = >ipaddr;
-   const struct net *net = sock_net(t->asoc->base.sk);
-   __be16 lport = htons(t->asoc->base.bind_addr.port);
-   __u32 addr;
-
-   if (paddr->sa.sa_family == AF_INET6)
-   addr = jhash(>v6.sin6_addr, 16, seed);
-   else
-   addr = (__force __u32)paddr->v4.sin_addr.s_addr;
 
-   return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
-(__force __u32)lport, net_hash_mix(net), seed);
+   return sctp_hashfn(sock_net(t->asoc->base.sk),
+  htons(t->asoc->base.bind_addr.port),
+  >ipaddr, seed)

[PATCH net-next 2/3] sctp: add sock_reuseport for the sock in __sctp_hash_endpoint

2018-10-20 Thread Xin Long
This is a part of sk_reuseport support for sctp. It defines a helper
sctp_bind_addrs_check() to check if the bind_addrs in two socks
match. It will add sock_reuseport if they match completely, return
err if they match only partly, and alloc sock_reuseport if none of
the socks match at all.

It will not take effect until sk_reuseport support is added in
sctp_get_port_local() in the next patch.

Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h|  2 +-
 include/net/sctp/structs.h |  2 ++
 net/core/sock_reuseport.c  |  1 +
 net/sctp/bind_addr.c   | 28 ++
 net/sctp/input.c   | 60 +++---
 net/sctp/socket.c  |  3 +--
 6 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa3..b8cd58d 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct 
sctp_association *asoc,
  */
 int sctp_rcv(struct sk_buff *skb);
 void sctp_v4_err(struct sk_buff *skb, u32 info);
-void sctp_hash_endpoint(struct sctp_endpoint *);
+int sctp_hash_endpoint(struct sctp_endpoint *ep);
 void sctp_unhash_endpoint(struct sctp_endpoint *);
 struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
 struct sctphdr *, struct sctp_association **,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a11f937..15d017f 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1190,6 +1190,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, 
const union sctp_addr *,
 struct sctp_sock *, struct sctp_sock *);
 int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
 const union sctp_addr *addr);
+int sctp_bind_addrs_check(struct sctp_sock *sp,
+ struct sctp_sock *sp2, int cnt2);
 union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr  *bp,
const union sctp_addr   *addrs,
int addrcnt,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index ba5cba5..d8fe3e5 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
call_rcu(_reuse->rcu, reuseport_free_rcu);
return 0;
 }
+EXPORT_SYMBOL(reuseport_add_sock);
 
 void reuseport_detach_sock(struct sock *sk)
 {
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 7df3704..78d0d93 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
return match;
 }
 
+int sctp_bind_addrs_check(struct sctp_sock *sp,
+ struct sctp_sock *sp2, int cnt2)
+{
+   struct sctp_bind_addr *bp2 = >ep->base.bind_addr;
+   struct sctp_bind_addr *bp = >ep->base.bind_addr;
+   struct sctp_sockaddr_entry *laddr, *laddr2;
+   bool exist = false;
+   int cnt = 0;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(laddr, >address_list, list) {
+   list_for_each_entry_rcu(laddr2, >address_list, list) {
+   if (sp->pf->af->cmp_addr(>a, >a) &&
+   laddr->valid == laddr2->valid) {
+   exist = true;
+   goto next;
+   }
+   }
+   cnt = 0;
+   break;
+next:
+   cnt++;
+   }
+   rcu_read_unlock();
+
+   return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1);
+}
+
 /* Does the address 'addr' conflict with any addresses in
  * the bp.
  */
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 60ede89..6bfeb10 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -723,43 +723,87 @@ static int sctp_rcv_ootb(struct sk_buff *skb)
 }
 
 /* Insert endpoint into the hash table.  */
-static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
+static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
 {
-   struct net *net = sock_net(ep->base.sk);
-   struct sctp_ep_common *epb;
+   struct sock *sk = ep->base.sk;
+   struct net *net = sock_net(sk);
struct sctp_hashbucket *head;
+   struct sctp_ep_common *epb;
 
epb = >base;
-
epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
head = _ep_hashtable[epb->hashent];
 
+   if (sk->sk_reuseport) {
+   bool any = sctp_is_ep_boundall(sk);
+   struct sctp_ep_common *epb2;
+   struct list_head *list;
+   int cnt = 0, err = 1;
+
+   list_for_each(list, >base.bind_addr.address_list)
+   cnt++;
+
+   sctp_for_each_

[PATCH net] sctp: fix the data size calculation in sctp_data_size

2018-10-17 Thread Xin Long
sctp data size should be calculated by subtracting data chunk header's
length from chunk_hdr->length, not just data header.

Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave")
Signed-off-by: Xin Long 
---
 include/net/sctp/sm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 5ef1bad..9e3d327 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -347,7 +347,7 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk)
__u16 size;
 
size = ntohs(chunk->chunk_hdr->length);
-   size -= sctp_datahdr_len(>asoc->stream);
+   size -= sctp_datachk_len(>asoc->stream);
 
return size;
 }
-- 
2.1.0



[PATCH net-next 2/2] sctp: use sk_wmem_queued to check for writable space

2018-10-16 Thread Xin Long
sk->sk_wmem_queued is used to count the size of chunks in out queue
while sk->sk_wmem_alloc is for counting the size of chunks that have
been sent. sctp is increasing both of them before enqueuing the chunks,
and using sk->sk_wmem_alloc to check for writable space.

However, sk_wmem_alloc is also increased by 1 for the skb allocated
for sending in sctp_packet_transmit() but it will not wake up the
waiters when sk_wmem_alloc is decreased in this skb's destructor.

If msg size is equal to sk_sndbuf and sendmsg is waiting for sndbuf,
the check 'msg_len <= sctp_wspace(asoc)' in sctp_wait_for_sndbuf()
will keep waiting if there's a skb allocated in sctp_packet_transmit,
and later even if this skb gets freed, the waiting thread will never
get woken up.

This issue has been there since the very beginning, so we change to use
sk->sk_wmem_queued to check for writable space as sk_wmem_queued is
not increased for the skb allocated for sending, just as TCP does.

SOCK_SNDBUF_LOCK check is also removed here as it's for tx buf auto
tuning which I will add in another patch.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 38 +-
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c6f2950..111ebd8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -83,7 +83,7 @@
 #include 
 
 /* Forward declarations for internal helper functions. */
-static int sctp_writeable(struct sock *sk);
+static bool sctp_writeable(struct sock *sk);
 static void sctp_wfree(struct sk_buff *skb);
 static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
size_t msg_len);
@@ -119,25 +119,10 @@ static void sctp_enter_memory_pressure(struct sock *sk)
 /* Get the sndbuf space available at the time on the association.  */
 static inline int sctp_wspace(struct sctp_association *asoc)
 {
-   int amt;
+   struct sock *sk = asoc->base.sk;
 
-   if (asoc->ep->sndbuf_policy)
-   amt = asoc->sndbuf_used;
-   else
-   amt = sk_wmem_alloc_get(asoc->base.sk);
-
-   if (amt >= asoc->base.sk->sk_sndbuf) {
-   if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK)
-   amt = 0;
-   else {
-   amt = sk_stream_wspace(asoc->base.sk);
-   if (amt < 0)
-   amt = 0;
-   }
-   } else {
-   amt = asoc->base.sk->sk_sndbuf - amt;
-   }
-   return amt;
+   return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used
+  : sk_stream_wspace(sk);
 }
 
 /* Increment the used sndbuf space count of the corresponding association by
@@ -1925,10 +1910,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association 
*asoc,
asoc->pmtu_pending = 0;
}
 
-   if (sctp_wspace(asoc) < msg_len)
+   if (sctp_wspace(asoc) < (int)msg_len)
sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
 
-   if (!sctp_wspace(asoc)) {
+   if (sctp_wspace(asoc) <= 0) {
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
err = sctp_wait_for_sndbuf(asoc, , msg_len);
if (err)
@@ -8535,7 +8520,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association 
*asoc, long *timeo_p,
goto do_error;
if (signal_pending(current))
goto do_interrupted;
-   if (msg_len <= sctp_wspace(asoc))
+   if ((int)msg_len <= sctp_wspace(asoc))
break;
 
/* Let another process have a go.  Since we are going
@@ -8610,14 +8595,9 @@ void sctp_write_space(struct sock *sk)
  * UDP-style sockets or TCP-style sockets, this code should work.
  *  - Daisy
  */
-static int sctp_writeable(struct sock *sk)
+static bool sctp_writeable(struct sock *sk)
 {
-   int amt = 0;
-
-   amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
-   if (amt < 0)
-   amt = 0;
-   return amt;
+   return sk->sk_sndbuf > sk->sk_wmem_queued;
 }
 
 /* Wait for an association to go into ESTABLISHED state. If timeout is 0,
-- 
2.1.0



[PATCH net-next 1/2] sctp: count both sk and asoc sndbuf with skb truesize and sctp_chunk size

2018-10-16 Thread Xin Long
Now it's confusing that asoc sndbuf_used is doing memory accounting with
SCTP_DATA_SNDSIZE(chunk) + sizeof(sk_buff) + sizeof(sctp_chunk) while sk
sk_wmem_alloc is doing that with skb->truesize + sizeof(sctp_chunk).

It also causes sctp_prsctp_prune to miscount the freed memory when
sndbuf_policy is not set.

To make this right and also keep consistent between asoc sndbuf_used, sk
sk_wmem_alloc and sk_wmem_queued, use skb->truesize + sizeof(sctp_chunk)
for them.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  5 -
 net/sctp/outqueue.c  |  8 ++--
 net/sctp/socket.c| 21 ++---
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 86f034b..8dadc74 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -148,11 +148,6 @@ SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE,enum 
sctp_event_primitive, primitive)
 #define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA || \
   a->chunk_hdr->type == SCTP_CID_I_DATA)
 
-/* Calculate the actual data size in a data chunk */
-#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end) - \
-   (unsigned long)(c->chunk_hdr) - \
-   sctp_datachk_len(>asoc->stream)))
-
 /* Internal error codes */
 enum sctp_ierror {
SCTP_IERROR_NO_ERROR= 0,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 42191ed..9cb854b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association 
*asoc,
asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
}
 
-   msg_len -= SCTP_DATA_SNDSIZE(chk) +
-  sizeof(struct sk_buff) +
-  sizeof(struct sctp_chunk);
+   msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
if (msg_len <= 0)
break;
}
@@ -421,9 +419,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association 
*asoc,
streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
}
 
-   msg_len -= SCTP_DATA_SNDSIZE(chk) +
-  sizeof(struct sk_buff) +
-  sizeof(struct sctp_chunk);
+   msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
sctp_chunk_free(chk);
if (msg_len <= 0)
break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d3..c6f2950 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -166,12 +166,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk 
*chunk)
/* Save the chunk pointer in skb for sctp_wfree to use later.  */
skb_shinfo(chunk->skb)->destructor_arg = chunk;
 
-   asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
-   sizeof(struct sk_buff) +
-   sizeof(struct sctp_chunk);
-
refcount_add(sizeof(struct sctp_chunk), >sk_wmem_alloc);
-   sk->sk_wmem_queued += chunk->skb->truesize;
+   asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
+   sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk);
sk_mem_charge(sk, chunk->skb->truesize);
 }
 
@@ -8460,17 +8457,11 @@ static void sctp_wfree(struct sk_buff *skb)
struct sctp_association *asoc = chunk->asoc;
struct sock *sk = asoc->base.sk;
 
-   asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
-   sizeof(struct sk_buff) +
-   sizeof(struct sctp_chunk);
-
-   WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), 
>sk_wmem_alloc));
-
-   /*
-* This undoes what is done via sctp_set_owner_w and sk_mem_charge
-*/
-   sk->sk_wmem_queued   -= skb->truesize;
sk_mem_uncharge(sk, skb->truesize);
+   sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk);
+   asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
+   WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
+ >sk_wmem_alloc));
 
if (chunk->shkey) {
struct sctp_shared_key *shkey = chunk->shkey;
-- 
2.1.0



[PATCH net-next 0/2] sctp: fix sk_wmem_queued and use it to check for writable space

2018-10-16 Thread Xin Long
sctp doesn't count and use asoc sndbuf_used, sk sk_wmem_alloc and
sk_wmem_queued properly, which also causes some problems.

This patchset is to improve it.

Xin Long (2):
  sctp: count both sk and asoc sndbuf with skb truesize and sctp_chunk
size
  sctp: use sk_wmem_queued to check for writable space

 include/net/sctp/constants.h |  5 
 net/sctp/outqueue.c  |  8 ++
 net/sctp/socket.c| 59 +++-
 3 files changed, 17 insertions(+), 55 deletions(-)

-- 
2.1.0



[PATCH net] sctp: not free the new asoc when sctp_wait_for_connect returns err

2018-10-16 Thread Xin Long
When sctp_wait_for_connect is called to wait for connect ready
for sp->strm_interleave in sctp_sendmsg_to_asoc, a panic could
be triggered if cpu is scheduled out and the new asoc is freed
elsewhere, as it will return err and later the asoc gets freed
again in sctp_sendmsg.

[  285.840764] list_del corruption, 9f0f7b284078->next is LIST_POISON1 
(dead0100)
[  285.843590] WARNING: CPU: 1 PID: 8861 at lib/list_debug.c:47 
__list_del_entry_valid+0x50/0xa0
[  285.846193] Kernel panic - not syncing: panic_on_warn set ...
[  285.846193]
[  285.848206] CPU: 1 PID: 8861 Comm: sctp_ndata Kdump: loaded Not tainted 
4.19.0-rc7.label #584
[  285.850559] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  285.852164] Call Trace:
...
[  285.872210]  ? __list_del_entry_valid+0x50/0xa0
[  285.872894]  sctp_association_free+0x42/0x2d0 [sctp]
[  285.873612]  sctp_sendmsg+0x5a4/0x6b0 [sctp]
[  285.874236]  sock_sendmsg+0x30/0x40
[  285.874741]  ___sys_sendmsg+0x27a/0x290
[  285.875304]  ? __switch_to_asm+0x34/0x70
[  285.875872]  ? __switch_to_asm+0x40/0x70
[  285.876438]  ? ptep_set_access_flags+0x2a/0x30
[  285.877083]  ? do_wp_page+0x151/0x540
[  285.877614]  __sys_sendmsg+0x58/0xa0
[  285.878138]  do_syscall_64+0x55/0x180
[  285.878669]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

This is a similar issue to the one fixed in Commit ca3af4dd28cf
("sctp: do not free asoc when it is already dead in sctp_sendmsg").
But this one can't be fixed by returning -ESRCH for the dead asoc
in sctp_wait_for_connect, as it will break sctp_connect's return
value to users.

This patch is to simply set err to -ESRCH before it returns to
sctp_sendmsg when any err is returned by sctp_wait_for_connect
for sp->strm_interleave, so that no asoc would be freed due to
this.

When users see this error, they will know the packet hasn't been
sent. And it also makes sense to not free asoc just because waiting
for connect fails, like the second call to sctp_wait_for_connect in
sctp_sendmsg_to_asoc.

Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave")
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e25a20f..1baa9d9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1946,8 +1946,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association 
*asoc,
if (sp->strm_interleave) {
timeo = sock_sndtimeo(sk, 0);
err = sctp_wait_for_connect(asoc, &timeo);
-   if (err)
+   if (err) {
+   err = -ESRCH;
goto err;
+   }
} else {
wait_connect = true;
}
-- 
2.1.0



[PATCH net] sctp: get pr_assoc and pr_stream all status with SCTP_PR_SCTP_ALL instead

2018-10-16 Thread Xin Long
According to rfc7496 section 4.3 or 4.4:

   sprstat_policy:  This parameter indicates for which PR-SCTP policy
  the user wants the information.  It is an error to use
  SCTP_PR_SCTP_NONE in sprstat_policy.  If SCTP_PR_SCTP_ALL is used,
  the counters provided are aggregated over all supported policies.

So change to dump the aggregated pr_assoc and pr_stream status when
SCTP_PR_SCTP_ALL is used instead, and return an error for
SCTP_PR_SCTP_NONE, as the RFC also says "It is an error to use
SCTP_PR_SCTP_NONE in sprstat_policy."

Fixes: 826d253d57b1 ("sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt")
Fixes: d229d48d183f ("sctp: add SCTP_PR_STREAM_STATUS sockopt for prsctp")
Reported-by: Ying Xu 
Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h | 1 +
 net/sctp/socket.c | 8 
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index b479db5..34dd3d4 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -301,6 +301,7 @@ enum sctp_sinfo_flags {
SCTP_SACK_IMMEDIATELY   = (1 << 3), /* SACK should be sent without 
delay. */
/* 2 bits here have been used by SCTP_PR_SCTP_MASK */
SCTP_SENDALL= (1 << 6),
+   SCTP_PR_SCTP_ALL= (1 << 7),
SCTP_NOTIFICATION   = MSG_NOTIFICATION, /* Next message is not user 
msg but notification. */
SCTP_EOF= MSG_FIN,  /* Initiate graceful shutdown 
process. */
 };
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d3..e25a20f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7100,14 +7100,14 @@ static int sctp_getsockopt_pr_assocstatus(struct sock 
*sk, int len,
}
 
policy = params.sprstat_policy;
-   if (policy & ~SCTP_PR_SCTP_MASK)
+   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
goto out;
 
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
if (!asoc)
goto out;
 
-   if (policy == SCTP_PR_SCTP_NONE) {
+   if (policy & SCTP_PR_SCTP_ALL) {
params.sprstat_abandoned_unsent = 0;
params.sprstat_abandoned_sent = 0;
for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
@@ -7159,7 +7159,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock 
*sk, int len,
}
 
policy = params.sprstat_policy;
-   if (policy & ~SCTP_PR_SCTP_MASK)
+   if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
goto out;
 
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
@@ -7175,7 +7175,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock 
*sk, int len,
goto out;
}
 
-   if (policy == SCTP_PR_SCTP_NONE) {
+   if (policy == SCTP_PR_SCTP_ALL) {
params.sprstat_abandoned_unsent = 0;
params.sprstat_abandoned_sent = 0;
for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
-- 
2.1.0



[PATCH net] sctp: use the pmtu from the icmp packet to update transport pathmtu

2018-10-15 Thread Xin Long
Besides syncing the asoc pmtu from all transports, sctp_assoc_sync_pmtu
also processes the transport pmtu_pending set by icmp packets. But it's
meaningless to use sctp_dst_mtu(t->dst) as the new pmtu for a transport.

The right pmtu value should come from the icmp packet, and it would
be saved into transport->mtu_info in this patch and used later when
the pmtu sync happens in sctp_sendmsg_to_asoc or sctp_packet_config.

Besides, without this patch, as pmtu can only be updated correctly
when an icmp packet is received while nothing is holding the sock lock,
it will take a long time if the sock is busy sending packets.

Note that it doesn't process transport->mtu_info in .release_cb(),
as there is not enough information there for the pmtu update, such as
which asoc or transport it is for. It is not worth traversing all asocs
to check pmtu_pending. So unlike tcp, sctp does this in the tx path,
which is why mtu_info needs to be atomic_t.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h | 2 ++
 net/sctp/associola.c   | 3 ++-
 net/sctp/input.c   | 1 +
 net/sctp/output.c  | 6 ++
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 28a7c8e..a11f937 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -876,6 +876,8 @@ struct sctp_transport {
unsigned long sackdelay;
__u32 sackfreq;
 
+   atomic_t mtu_info;
+
/* When was the last time that we heard from this transport? We use
 * this to pick new active and retran paths.
 */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 297d9cf..a827a1f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1450,7 +1450,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
/* Get the lowest pmtu of all the transports. */
list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
if (t->pmtu_pending && t->dst) {
-   sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
+   sctp_transport_update_pmtu(t,
+  atomic_read(&t->mtu_info));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 9bbc5f9..5c36a99 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -395,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct 
sctp_association *asoc,
return;
 
if (sock_owned_by_user(sk)) {
+   atomic_set(&t->mtu_info, pmtu);
asoc->pmtu_pending = 1;
t->pmtu_pending = 1;
return;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 7f849b0..67939ad 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 
vtag,
sctp_assoc_sync_pmtu(asoc);
}
 
+   if (asoc->pmtu_pending) {
+   if (asoc->param_flags & SPP_PMTUD_ENABLE)
+   sctp_assoc_sync_pmtu(asoc);
+   asoc->pmtu_pending = 0;
+   }
+
/* If there a is a prepend chunk stick it on the list before
 * any other chunks get appended.
 */
-- 
2.1.0



Re: [PATCH net 0/2] geneve, vxlan: Don't set exceptions if skb->len < mtu

2018-10-15 Thread Xin Long
On Sat, Oct 13, 2018 at 6:54 AM Stefano Brivio  wrote:
>
> This series fixes the exception abuse described in 2/2, and 1/2
> is just a preparatory change to make 2/2 less ugly.
>
> Stefano Brivio (2):
>   geneve, vxlan: Don't check skb_dst() twice
>   geneve, vxlan: Don't set exceptions if skb->len < mtu
>
>  drivers/net/geneve.c | 14 +++---
>  drivers/net/vxlan.c  | 12 ++--
>  include/net/dst.h| 10 ++
>  3 files changed, 15 insertions(+), 21 deletions(-)
>
> --
> 2.19.1
>
Series Reviewed-by: Xin Long 


Re: [PATCH net 2/2] geneve, vxlan: Don't set exceptions if skb->len < mtu

2018-10-15 Thread Xin Long
On Sat, Oct 13, 2018 at 6:54 AM Stefano Brivio  wrote:
>
> We shouldn't abuse exceptions: if the destination MTU is already higher
> than what we're transmitting, no exception should be created.
Makes sense. Shouldn't ip(6) tunnels also do this?

>
> Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path")
> Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path")
> Signed-off-by: Stefano Brivio 
> Reviewed-by: Sabrina Dubroca 
> ---
>  drivers/net/geneve.c |  7 +++
>  drivers/net/vxlan.c  |  4 ++--
>  include/net/dst.h| 10 ++
>  3 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> index 61c4bfbeb41c..493cd382b8aa 100644
> --- a/drivers/net/geneve.c
> +++ b/drivers/net/geneve.c
> @@ -830,8 +830,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct 
> net_device *dev,
> if (IS_ERR(rt))
> return PTR_ERR(rt);
>
> -   skb_dst_update_pmtu(skb, dst_mtu(&rt->dst) -
> -GENEVE_IPV4_HLEN - info->options_len);
> +   skb_tunnel_check_pmtu(skb, &rt->dst,
> + GENEVE_IPV4_HLEN + info->options_len);
>
> sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
> if (geneve->collect_md) {
> @@ -872,8 +872,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct 
> net_device *dev,
> if (IS_ERR(dst))
> return PTR_ERR(dst);
>
> -   skb_dst_update_pmtu(skb, dst_mtu(dst) -
> -GENEVE_IPV6_HLEN - info->options_len);
> +   skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len);
>
> sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
> if (geneve->collect_md) {
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 22e0ce592e07..27bd586b94b0 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -2194,7 +2194,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
> net_device *dev,
> }
>
> ndst = &rt->dst;
> -   skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN_HEADROOM);
> +   skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
>
> tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
> ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
> @@ -2231,7 +2231,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
> net_device *dev,
> goto out_unlock;
> }
>
> -   skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN6_HEADROOM);
> +   skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
>
> tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
> ttl = ttl ? : ip6_dst_hoplimit(ndst);
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 7f735e76ca73..6cf0870414c7 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -527,4 +527,14 @@ static inline void skb_dst_update_pmtu(struct sk_buff 
> *skb, u32 mtu)
> dst->ops->update_pmtu(dst, NULL, skb, mtu);
>  }
>
> +static inline void skb_tunnel_check_pmtu(struct sk_buff *skb,
> +struct dst_entry *encap_dst,
> +int headroom)
> +{
> +   u32 encap_mtu = dst_mtu(encap_dst);
> +
> +   if (skb->len > encap_mtu - headroom)
> +   skb_dst_update_pmtu(skb, encap_mtu - headroom);
> +}
> +
>  #endif /* _NET_DST_H */
> --
> 2.19.1
>


Re: [PATCH net] sctp: update dst pmtu with the correct daddr

2018-09-21 Thread Xin Long
On Fri, Sep 21, 2018 at 2:31 AM David Miller  wrote:
>
> From: Xin Long 
> Date: Thu, 20 Sep 2018 17:27:28 +0800
>
> > When processing pmtu update from an icmp packet, it calls .update_pmtu
> > with sk instead of skb in sctp_transport_update_pmtu.
> >
> > However for sctp, the daddr in the transport might be different from
> > inet_sock->inet_daddr or sk->sk_v6_daddr, which is used to update or
> > create the route cache. The incorrect daddr will cause a different
> > route cache created for the path.
> >
> > So before calling .update_pmtu, inet_sock->inet_daddr/sk->sk_v6_daddr
> > should be updated with the daddr in the transport, and update it back
> > after it's done.
> >
> > The issue has existed since route exceptions introduction.
> >
> > Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
> > Reported-by: ian.per...@dialogic.com
> > Signed-off-by: Xin Long 
>
> Applied and queued up for -stable.
>
> Although are you sure it's OK to temporarily change the sockets address
> like this?  What if an asynchronous context looks at the socket state
> and sees the temporarily set address?
It's under the protection of the sock lock; I think any other place that
wants to access the address also needs to acquire the sock lock first.


[PATCH net] sctp: update dst pmtu with the correct daddr

2018-09-20 Thread Xin Long
When processing pmtu update from an icmp packet, it calls .update_pmtu
with sk instead of skb in sctp_transport_update_pmtu.

However, for sctp, the daddr in the transport might be different from
inet_sock->inet_daddr or sk->sk_v6_daddr, which is used to update or
create the route cache. The incorrect daddr will cause a different
route cache to be created for the path.

So before calling .update_pmtu, inet_sock->inet_daddr/sk->sk_v6_daddr
should be updated with the daddr in the transport, and restored
after it's done.

The issue has existed since the introduction of route exceptions.

Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Reported-by: ian.per...@dialogic.com
Signed-off-by: Xin Long 
---
 net/sctp/transport.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 12cac85..033696e 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, 
struct sock *sk)
 bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 {
struct dst_entry *dst = sctp_transport_dst_check(t);
+   struct sock *sk = t->asoc->base.sk;
bool change = true;
 
if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
@@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, 
u32 pmtu)
pmtu = SCTP_TRUNC4(pmtu);
 
if (dst) {
-   dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
+   struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family);
+   union sctp_addr addr;
+
+   pf->af->from_sk(&addr, sk);
+   pf->to_sk_daddr(&t->ipaddr, sk);
+   dst->ops->update_pmtu(dst, sk, NULL, pmtu);
+   pf->to_sk_daddr(&addr, sk);
+
dst = sctp_transport_dst_check(t);
}
 
if (!dst) {
-   t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
+   t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
dst = t->dst;
}
 
-- 
2.1.0



[PATCHv2 net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-11 Thread Xin Long
In inet6_rtm_getroute, since Commit 93531c674315 ("net/ipv6: separate
handling of FIB entries from dst based routes"), it has used rt->from
to dump route info instead of rt.

However, for some routes, such as cached ones, some of the information
like flags or gateway is not the same as that of the 'from' one. This
caused 'ip route get' to dump the wrong route information.

In Jianlin's testing, the output information even lost the expiration
time for a pmtu route cache due to the wrong fib6_flags.

So change to use rt6_info members for dst addr, src addr, flags and
gateway when it tries to dump a route entry without fibmatch set.

v1->v2:
  - not use rt6i_prefsrc.
  - also fix the gw dump issue.

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based 
routes")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/ipv6/route.c | 42 ++
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e00ce..3eed045 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4670,20 +4670,31 @@ static int rt6_fill_node(struct net *net, struct 
sk_buff *skb,
 int iif, int type, u32 portid, u32 seq,
 unsigned int flags)
 {
-   struct rtmsg *rtm;
+   struct rt6_info *rt6 = (struct rt6_info *)dst;
+   struct rt6key *rt6_dst, *rt6_src;
+   u32 *pmetrics, table, rt6_flags;
struct nlmsghdr *nlh;
+   struct rtmsg *rtm;
long expires = 0;
-   u32 *pmetrics;
-   u32 table;
 
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
if (!nlh)
return -EMSGSIZE;
 
+   if (rt6) {
+   rt6_dst = &rt6->rt6i_dst;
+   rt6_src = &rt6->rt6i_src;
+   rt6_flags = rt6->rt6i_flags;
+   } else {
+   rt6_dst = &rt->fib6_dst;
+   rt6_src = &rt->fib6_src;
+   rt6_flags = rt->fib6_flags;
+   }
+
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
-   rtm->rtm_dst_len = rt->fib6_dst.plen;
-   rtm->rtm_src_len = rt->fib6_src.plen;
+   rtm->rtm_dst_len = rt6_dst->plen;
+   rtm->rtm_src_len = rt6_src->plen;
rtm->rtm_tos = 0;
if (rt->fib6_table)
table = rt->fib6_table->tb6_id;
@@ -4698,7 +4709,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->fib6_protocol;
 
-   if (rt->fib6_flags & RTF_CACHE)
+   if (rt6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
 
if (dest) {
@@ -4706,7 +4717,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
goto nla_put_failure;
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
-   if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
+   if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
goto nla_put_failure;
 #ifdef CONFIG_IPV6_SUBTREES
if (src) {
@@ -4714,12 +4725,12 @@ static int rt6_fill_node(struct net *net, struct 
sk_buff *skb,
goto nla_put_failure;
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len &&
-  nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
+  nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
goto nla_put_failure;
 #endif
if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
-   if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
+   if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
int err = ip6mr_get_route(net, skb, rtm, portid);
 
if (err == 0)
@@ -4754,7 +4765,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
/* For multipath routes, walk the siblings list and add
 * each as a nexthop within RTA_MULTIPATH.
 */
-   if (rt->fib6_nsiblings) {
+   if (rt6) {
+   if (rt6_flags & RTF_GATEWAY &&
+   nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
+   goto nla_put_failure;
+
+   if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+   goto nla_put_failure;
+   } else if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
 
@@ -4777,7 +4795,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
goto nla_put_failure;
}
 
-   if (rt->fib6_flags & RTF_EXPIRES) {
+   if (rt6_flags & RTF_EXPIRES) {
expires = dst ? dst->expires : rt->expires;
expires -= jiffies;
}
@@ 

Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread Xin Long
On Tue, Sep 11, 2018 at 12:13 AM David Ahern  wrote:
>
> On 9/9/18 12:29 AM, Xin Long wrote:
> >>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >>> index 18e00ce..e554922 100644
> >>> --- a/net/ipv6/route.c
> >>> +++ b/net/ipv6/route.c
> >>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
> >>> sk_buff *skb,
> >>>int iif, int type, u32 portid, u32 seq,
> >>>unsigned int flags)
> >>>  {
> >>> - struct rtmsg *rtm;
> >>> + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> >>> + struct rt6_info *rt6 = (struct rt6_info *)dst;
> >>> + u32 *pmetrics, table, fib6_flags;
> >>>   struct nlmsghdr *nlh;
> >>> + struct rtmsg *rtm;
> >>>   long expires = 0;
> >>> - u32 *pmetrics;
> >>> - u32 table;
> >>>
> >>>   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >>>   if (!nlh)
> >>>   return -EMSGSIZE;
> >>>
> >>> + if (rt6) {
> >>> + fib6_dst = >rt6i_dst;
> >>> + fib6_src = >rt6i_src;
> >>> + fib6_flags = rt6->rt6i_flags;
> >>> + fib6_prefsrc = >rt6i_prefsrc;
> >>> + } else {
> >>> + fib6_dst = >fib6_dst;
> >>> + fib6_src = >fib6_src;
> >>> + fib6_flags = rt->fib6_flags;
> >>> + fib6_prefsrc = >fib6_prefsrc;
> >>> + }
> >>
> >> Unless I am missing something at the moment, an rt6_info can only have
> >> the same dst, src and prefsrc as the fib6_info on which it is based.
> >> Thus, only the flags is needed above. That simplifies this patch a lot.
> > If dst, src and prefsrc in rt6_info are always the same as these in 
> > fib6_info,
> > why do we need them in rt6_info? we could just get it by 'from'.
> >
>
> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
> that can be converted.
>
> rt6i_src is checked against the fib6_info to invalidate a dst if the src
> has changed, so a valid rt will always have the same rt6i_src as the
> rt->from.
>
> rt6i_dst is set to the dest address / 128 in cases, so it should be used
> for rt6_info cases above.
So that means I will use rt6i_dst and rt6i_flags when dst is set?
How about I use rt6i_src there as well, just to make it look clear?
Plus the gw/nh dump fix in rt6_fill_node():
-if (rt->fib6_nsiblings) {
+if (rt6) {
+if (fib6_flags & RTF_GATEWAY)
+if (nla_put_in6_addr(skb, RTA_GATEWAY,
+ &rt6->rt6i_gateway) < 0)
+goto nla_put_failure;
+
+if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+goto nla_put_failure;
+} else if (rt->fib6_nsiblings) {
 struct fib6_info *sibling, *next_sibling;
 struct nlattr *mp;

Does that look good to you?


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-09 Thread Xin Long
On Sun, Sep 9, 2018 at 9:45 AM David Ahern  wrote:
>
> On 9/8/18 3:24 AM, Xin Long wrote:
> > In inet6_rtm_getroute, since Commit 93531c674315 ("net/ipv6: separate
> > handling of FIB entries from dst based routes"), it has used rt->from
> > to dump route info instead of rt.
> >
> > However for some route like cache, its information is not the same as
> > that of the 'from' one. It caused 'ip -6 route get' to dump the wrong
> > route information.
> >
> > In Jianlin's testing, the output information even lost the expiration
> > time for a pmtu route cache due to the wrong fib6_flags.
>
> you are right about the flags ...
>
> >
> > So change to use rt6_info members when it tries to dump a route entry
> > without fibmatch set.
>
> but not the src, dst and prefsrc.
>
> >
> > Note that we will fix the gw/nh dump in another patch.
>
> And only the gateway can change due to a redirect, and redirects do not
> change the device - only the gateway.
>
> Let's do both changes in a single patch.
>
> >
> > Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst 
> > based routes")
> > Reported-by: Jianlin Shi 
> > Signed-off-by: Xin Long 
> > ---
> >  net/ipv6/route.c | 39 ++-
> >  1 file changed, 26 insertions(+), 13 deletions(-)
> >
> > diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> > index 18e00ce..e554922 100644
> > --- a/net/ipv6/route.c
> > +++ b/net/ipv6/route.c
> > @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
> > sk_buff *skb,
> >int iif, int type, u32 portid, u32 seq,
> >unsigned int flags)
> >  {
> > - struct rtmsg *rtm;
> > + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> > + struct rt6_info *rt6 = (struct rt6_info *)dst;
> > + u32 *pmetrics, table, fib6_flags;
> >   struct nlmsghdr *nlh;
> > + struct rtmsg *rtm;
> >   long expires = 0;
> > - u32 *pmetrics;
> > - u32 table;
> >
> >   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >   if (!nlh)
> >   return -EMSGSIZE;
> >
> > + if (rt6) {
> > + fib6_dst = >rt6i_dst;
> > + fib6_src = >rt6i_src;
> > + fib6_flags = rt6->rt6i_flags;
> > + fib6_prefsrc = >rt6i_prefsrc;
> > + } else {
> > + fib6_dst = >fib6_dst;
> > + fib6_src = >fib6_src;
> > + fib6_flags = rt->fib6_flags;
> > + fib6_prefsrc = >fib6_prefsrc;
> > + }
>
> Unless I am missing something at the moment, an rt6_info can only have
> the same dst, src and prefsrc as the fib6_info on which it is based.
> Thus, only the flags is needed above. That simplifies this patch a lot.
If dst, src and prefsrc in rt6_info are always the same as those in fib6_info,
why do we need them in rt6_info? We could just get them via 'from'.


[PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-08 Thread Xin Long
In inet6_rtm_getroute, since Commit 93531c674315 ("net/ipv6: separate
handling of FIB entries from dst based routes"), it has used rt->from
to dump route info instead of rt.

However, for some routes, such as cached ones, the information is not
the same as that of the 'from' one. This caused 'ip -6 route get' to
dump the wrong route information.

In Jianlin's testing, the output information even lost the expiration
time for a pmtu route cache due to the wrong fib6_flags.

So change to use rt6_info members when it tries to dump a route entry
without fibmatch set.

Note that we will fix the gw/nh dump in another patch.

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based 
routes")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/ipv6/route.c | 39 ++-
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e00ce..e554922 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
sk_buff *skb,
 int iif, int type, u32 portid, u32 seq,
 unsigned int flags)
 {
-   struct rtmsg *rtm;
+   struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
+   struct rt6_info *rt6 = (struct rt6_info *)dst;
+   u32 *pmetrics, table, fib6_flags;
struct nlmsghdr *nlh;
+   struct rtmsg *rtm;
long expires = 0;
-   u32 *pmetrics;
-   u32 table;
 
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
if (!nlh)
return -EMSGSIZE;
 
+   if (rt6) {
+   fib6_dst = >rt6i_dst;
+   fib6_src = >rt6i_src;
+   fib6_flags = rt6->rt6i_flags;
+   fib6_prefsrc = >rt6i_prefsrc;
+   } else {
+   fib6_dst = >fib6_dst;
+   fib6_src = >fib6_src;
+   fib6_flags = rt->fib6_flags;
+   fib6_prefsrc = >fib6_prefsrc;
+   }
+
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
-   rtm->rtm_dst_len = rt->fib6_dst.plen;
-   rtm->rtm_src_len = rt->fib6_src.plen;
+   rtm->rtm_dst_len = fib6_dst->plen;
+   rtm->rtm_src_len = fib6_src->plen;
rtm->rtm_tos = 0;
if (rt->fib6_table)
table = rt->fib6_table->tb6_id;
@@ -4698,7 +4711,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->fib6_protocol;
 
-   if (rt->fib6_flags & RTF_CACHE)
+   if (fib6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
 
if (dest) {
@@ -4706,7 +4719,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
goto nla_put_failure;
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
-   if (nla_put_in6_addr(skb, RTA_DST, >fib6_dst.addr))
+   if (nla_put_in6_addr(skb, RTA_DST, _dst->addr))
goto nla_put_failure;
 #ifdef CONFIG_IPV6_SUBTREES
if (src) {
@@ -4714,12 +4727,12 @@ static int rt6_fill_node(struct net *net, struct 
sk_buff *skb,
goto nla_put_failure;
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len &&
-  nla_put_in6_addr(skb, RTA_SRC, >fib6_src.addr))
+  nla_put_in6_addr(skb, RTA_SRC, _src->addr))
goto nla_put_failure;
 #endif
if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
-   if (ipv6_addr_is_multicast(>fib6_dst.addr)) {
+   if (ipv6_addr_is_multicast(_dst->addr)) {
int err = ip6mr_get_route(net, skb, rtm, portid);
 
if (err == 0)
@@ -4737,9 +4750,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
goto nla_put_failure;
}
 
-   if (rt->fib6_prefsrc.plen) {
+   if (fib6_prefsrc->plen) {
struct in6_addr saddr_buf;
-   saddr_buf = rt->fib6_prefsrc.addr;
+   saddr_buf = fib6_prefsrc->addr;
if (nla_put_in6_addr(skb, RTA_PREFSRC, _buf))
goto nla_put_failure;
}
@@ -4777,7 +4790,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
goto nla_put_failure;
}
 
-   if (rt->fib6_flags & RTF_EXPIRES) {
+   if (fib6_flags & RTF_EXPIRES) {
expires = dst ? dst->expires : rt->expires;
expires -= jiffies;
}
@@ -4785,7 +4798,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff 
*skb,
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;

[PATCH net 2/2] sctp: not traverse asoc trans list if non-ipv6 trans exists for ipv6_flowlabel

2018-09-03 Thread Xin Long
When users set params.spp_address and get a trans, the ipv6_flowlabel
flag should be applied to this trans. But if this trans is not an ipv6
one, it should simply be ignored, rather than going on to apply the
flag to all the other transports of the asoc.

Fixes: 0b0dce7a36fb ("sctp: add spp_ipv6_flowlabel and spp_dscp for 
sctp_paddrparams")
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a0ccfa4..f73e9d3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2658,10 +2658,12 @@ static int sctp_apply_peer_addr_params(struct 
sctp_paddrparams *params,
}
 
if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
-   if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
-   trans->flowlabel = params->spp_ipv6_flowlabel &
-  SCTP_FLOWLABEL_VAL_MASK;
-   trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+   if (trans) {
+   if (trans->ipaddr.sa.sa_family == AF_INET6) {
+   trans->flowlabel = params->spp_ipv6_flowlabel &
+  SCTP_FLOWLABEL_VAL_MASK;
+   trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+   }
} else if (asoc) {
struct sctp_transport *t;
 
-- 
2.1.0



[PATCH net 0/2] sctp: two fixes for spp_ipv6_flowlabel and spp_dscp sockopts

2018-09-03 Thread Xin Long
This patchset fixes two problems in sctp_apply_peer_addr_params()
when setting spp_ipv6_flowlabel or spp_dscp.

Xin Long (2):
  sctp: fix invalid reference to the index variable of the iterator
  sctp: not traverse asoc trans list if non-ipv6 trans exists for
ipv6_flowlabel

 net/sctp/socket.c | 34 +++---
 1 file changed, 19 insertions(+), 15 deletions(-)

-- 
2.1.0



[PATCH net 1/2] sctp: fix invalid reference to the index variable of the iterator

2018-09-03 Thread Xin Long
Now in sctp_apply_peer_addr_params(), if the SPP_IPV6_FLOWLABEL flag is
set and trans is NULL, it uses trans as the index variable to traverse
transport_addr_list, and trans is then left set as the last transport
of that list.

Later, if SPP_DSCP flag is set, it would enter into the wrong branch as
trans is actually an invalid reference.

So fix it by using a new index variable to traverse transport_addr_list
when processing both the SPP_DSCP and SPP_IPV6_FLOWLABEL flags.

Fixes: 0b0dce7a36fb ("sctp: add spp_ipv6_flowlabel and spp_dscp for 
sctp_paddrparams")
Reported-by: Julia Lawall 
Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 24 +---
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aa76586..a0ccfa4 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2663,14 +2663,15 @@ static int sctp_apply_peer_addr_params(struct 
sctp_paddrparams *params,
   SCTP_FLOWLABEL_VAL_MASK;
trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
} else if (asoc) {
-   list_for_each_entry(trans,
-   &asoc->peer.transport_addr_list,
+   struct sctp_transport *t;
+
+   list_for_each_entry(t, &asoc->peer.transport_addr_list,
transports) {
-   if (trans->ipaddr.sa.sa_family != AF_INET6)
+   if (t->ipaddr.sa.sa_family != AF_INET6)
continue;
-   trans->flowlabel = params->spp_ipv6_flowlabel &
-  SCTP_FLOWLABEL_VAL_MASK;
-   trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+   t->flowlabel = params->spp_ipv6_flowlabel &
+  SCTP_FLOWLABEL_VAL_MASK;
+   t->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
}
asoc->flowlabel = params->spp_ipv6_flowlabel &
  SCTP_FLOWLABEL_VAL_MASK;
@@ -2687,12 +2688,13 @@ static int sctp_apply_peer_addr_params(struct 
sctp_paddrparams *params,
trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
trans->dscp |= SCTP_DSCP_SET_MASK;
} else if (asoc) {
-   list_for_each_entry(trans,
-   &asoc->peer.transport_addr_list,
+   struct sctp_transport *t;
+
+   list_for_each_entry(t, &asoc->peer.transport_addr_list,
transports) {
-   trans->dscp = params->spp_dscp &
- SCTP_DSCP_VAL_MASK;
-   trans->dscp |= SCTP_DSCP_SET_MASK;
+   t->dscp = params->spp_dscp &
+ SCTP_DSCP_VAL_MASK;
+   t->dscp |= SCTP_DSCP_SET_MASK;
}
asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
asoc->dscp |= SCTP_DSCP_SET_MASK;
-- 
2.1.0



Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_transport_get_next

2018-08-31 Thread Xin Long
On Wed, Aug 29, 2018 at 7:36 PM Neil Horman  wrote:
>
> On Wed, Aug 29, 2018 at 12:08:40AM +0800, Xin Long wrote:
> > On Mon, Aug 27, 2018 at 9:08 PM Neil Horman  wrote:
> > >
> > > On Mon, Aug 27, 2018 at 06:38:31PM +0800, Xin Long wrote:
> > > > As Marcelo noticed, in sctp_transport_get_next, it is iterating over
> > > > transports but then also accessing the association directly, without
> > > > checking any refcnts before that, which can cause an use-after-free
> > > > Read.
> > > >
> > > > So fix it by holding transport before accessing the association. With
> > > > that, sctp_transport_hold calls can be removed in the later places.
> > > >
> > > > Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag 
> > > > and reuse some for proc")
> > > > Reported-by: syzbot+fe62a0c9aa6a85c6d...@syzkaller.appspotmail.com
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/sctp/proc.c   |  4 
> > > >  net/sctp/socket.c | 22 +++---
> > > >  2 files changed, 15 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> > > > index ef5c9a8..4d6f1c8 100644
> > > > --- a/net/sctp/proc.c
> > > > +++ b/net/sctp/proc.c
> > > > @@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file 
> > > > *seq, void *v)
> > > >   }
> > > >
> > > >   transport = (struct sctp_transport *)v;
> > > > - if (!sctp_transport_hold(transport))
> > > > - return 0;
> > > >   assoc = transport->asoc;
> > > >   epb = >base;
> > > >   sk = epb->sk;
> > > > @@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file 
> > > > *seq, void *v)
> > > >   }
> > > >
> > > >   transport = (struct sctp_transport *)v;
> > > > - if (!sctp_transport_hold(transport))
> > > > - return 0;
> > > >   assoc = transport->asoc;
> > > >
> > > >   list_for_each_entry_rcu(tsp, >peer.transport_addr_list,
> > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > index e96b15a..aa76586 100644
> > > > --- a/net/sctp/socket.c
> > > > +++ b/net/sctp/socket.c
> > > > @@ -5005,9 +5005,14 @@ struct sctp_transport 
> > > > *sctp_transport_get_next(struct net *net,
> > > >   break;
> > > >   }
> > > >
> > > > + if (!sctp_transport_hold(t))
> > > > + continue;
> > > > +
> > > >   if (net_eq(sock_net(t->asoc->base.sk), net) &&
> > > >   t->asoc->peer.primary_path == t)
> > > >   break;
> > > > +
> > > > + sctp_transport_put(t);
> > > >   }
> > > >
> > > >   return t;
> > > > @@ -5017,13 +5022,18 @@ struct sctp_transport 
> > > > *sctp_transport_get_idx(struct net *net,
> > > > struct rhashtable_iter 
> > > > *iter,
> > > > int pos)
> > > >  {
> > > > - void *obj = SEQ_START_TOKEN;
> > > > + struct sctp_transport *t;
> > > >
> > > > - while (pos && (obj = sctp_transport_get_next(net, iter)) &&
> > > > -!IS_ERR(obj))
> > > > - pos--;
> > > > + if (!pos)
> > > > + return SEQ_START_TOKEN;
> > > >
> > > > - return obj;
> > > > + while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
> > > > + if (!--pos)
> > > > + break;
> > > > + sctp_transport_put(t);
> > > > + }
> > > > +
> > > > + return t;
> > > >  }
> > > >
> > > >  int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
> > > > @@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct 
> > > > sctp_transport *, void *),
> > > >
> > > >   tsp = sctp_transport_get_idx(net, , *pos + 1);
> > > >   for (; !IS_ERR_OR

Re: [PATCH net] sctp: hold transport before accessing its asoc in sctp_transport_get_next

2018-08-28 Thread Xin Long
On Mon, Aug 27, 2018 at 9:08 PM Neil Horman  wrote:
>
> On Mon, Aug 27, 2018 at 06:38:31PM +0800, Xin Long wrote:
> > As Marcelo noticed, in sctp_transport_get_next, it is iterating over
> > transports but then also accessing the association directly, without
> > checking any refcnts before that, which can cause an use-after-free
> > Read.
> >
> > So fix it by holding transport before accessing the association. With
> > that, sctp_transport_hold calls can be removed in the later places.
> >
> > Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and 
> > reuse some for proc")
> > Reported-by: syzbot+fe62a0c9aa6a85c6d...@syzkaller.appspotmail.com
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/proc.c   |  4 
> >  net/sctp/socket.c | 22 +++---
> >  2 files changed, 15 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> > index ef5c9a8..4d6f1c8 100644
> > --- a/net/sctp/proc.c
> > +++ b/net/sctp/proc.c
> > @@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, 
> > void *v)
> >   }
> >
> >   transport = (struct sctp_transport *)v;
> > - if (!sctp_transport_hold(transport))
> > - return 0;
> >   assoc = transport->asoc;
> >   epb = >base;
> >   sk = epb->sk;
> > @@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, 
> > void *v)
> >   }
> >
> >   transport = (struct sctp_transport *)v;
> > - if (!sctp_transport_hold(transport))
> > - return 0;
> >   assoc = transport->asoc;
> >
> >   list_for_each_entry_rcu(tsp, >peer.transport_addr_list,
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index e96b15a..aa76586 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -5005,9 +5005,14 @@ struct sctp_transport 
> > *sctp_transport_get_next(struct net *net,
> >   break;
> >   }
> >
> > + if (!sctp_transport_hold(t))
> > + continue;
> > +
> >   if (net_eq(sock_net(t->asoc->base.sk), net) &&
> >   t->asoc->peer.primary_path == t)
> >   break;
> > +
> > + sctp_transport_put(t);
> >   }
> >
> >   return t;
> > @@ -5017,13 +5022,18 @@ struct sctp_transport 
> > *sctp_transport_get_idx(struct net *net,
> > struct rhashtable_iter *iter,
> > int pos)
> >  {
> > - void *obj = SEQ_START_TOKEN;
> > + struct sctp_transport *t;
> >
> > - while (pos && (obj = sctp_transport_get_next(net, iter)) &&
> > -!IS_ERR(obj))
> > - pos--;
> > + if (!pos)
> > + return SEQ_START_TOKEN;
> >
> > - return obj;
> > + while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
> > + if (!--pos)
> > + break;
> > + sctp_transport_put(t);
> > + }
> > +
> > + return t;
> >  }
> >
> >  int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
> > @@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct 
> > sctp_transport *, void *),
> >
> >   tsp = sctp_transport_get_idx(net, , *pos + 1);
> >   for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, 
> > )) {
> > - if (!sctp_transport_hold(tsp))
> > - continue;
> >   ret = cb(tsp, p);
> >   if (ret)
> >   break;
> > --
> > 2.1.0
> >
> >
> Acked-by: Neil Horman 
>
> Additionally, it's not germane to this particular fix, but why are we still
> using that pos variable in sctp_transport_get_idx?  With the conversion to
> rhashtables, it doesn't seem particularly useful anymore.
For proc, it seems so: hti is saved into seq->private.
But for diag, "hti" in sctp_for_each_transport() is a local variable.
Where do you think we could save it?


Re: [PATCH net 0/3] ipv6: fix error path of inet6_init()

2018-08-28 Thread Xin Long



- Original Message -
> The error path of inet6_init() can trigger multiple kernel panics,
> mostly due to wrong ordering of cleanups. This series fixes those
> issues.
> 
> Sabrina Dubroca (3):
>   ipv6: fix cleanup ordering for ip6_mr failure
>   ipv6: fix cleanup ordering for pingv6 registration
>   net: rtnl: return early from rtnl_unregister_all when protocol isn't
> registered
> 
>  net/core/rtnetlink.c |  4 
>  net/ipv6/af_inet6.c  | 10 +-
>  2 files changed, 9 insertions(+), 5 deletions(-)
> 
> --
> 2.18.0
> 
> 
Series Reviewed-by: Xin Long 


[PATCH net] erspan: set erspan_ver to 1 by default when adding an erspan dev

2018-08-27 Thread Xin Long
After erspan_ver is introduced, if erspan_ver is not set in iproute, its
value will be left 0 by default. Since Commit 02f99df1875c ("erspan: fix
invalid erspan version."), it has broken the traffic due to the version
check in erspan_xmit if users are not aware of 'erspan_ver' param, like
using an old version of iproute.

To fix this compatibility problem, it sets erspan_ver to 1 by default
when adding an erspan dev in erspan_setup. Note that we can't do it in
ipgre_netlink_parms, as this function is also used by ipgre_changelink.

Fixes: 02f99df1875c ("erspan: fix invalid erspan version.")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/ipv4/ip_gre.c  | 3 +++
 net/ipv6/ip6_gre.c | 1 +
 2 files changed, 4 insertions(+)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 51a5d06..ae714ae 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1508,11 +1508,14 @@ static int ipgre_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
 
 static void erspan_setup(struct net_device *dev)
 {
+   struct ip_tunnel *t = netdev_priv(dev);
+
ether_setup(dev);
dev->netdev_ops = &erspan_netdev_ops;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip_tunnel_setup(dev, erspan_net_id);
+   t->erspan_ver = 1;
 }
 
 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 18a3794..e493b04 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1778,6 +1778,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
if (data[IFLA_GRE_COLLECT_METADATA])
parms->collect_md = true;
 
+   parms->erspan_ver = 1;
if (data[IFLA_GRE_ERSPAN_VER])
parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
 
-- 
2.1.0



[PATCH net] sctp: remove useless start_fail from sctp_ht_iter in proc

2018-08-27 Thread Xin Long
After changing rhashtable_walk_start to return void, start_fail is
never set to any value other than 0, and the check for start_fail is
pointless, so remove it.

Fixes: 97a6ec4ac021 ("rhashtable: Change rhashtable_walk_start to return void")
Signed-off-by: Xin Long 
---
 net/sctp/proc.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 4d6f1c8..a644292 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -215,7 +215,6 @@ static const struct seq_operations sctp_eps_ops = {
 struct sctp_ht_iter {
struct seq_net_private p;
struct rhashtable_iter hti;
-   int start_fail;
 };
 
 static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
@@ -224,7 +223,6 @@ static void *sctp_transport_seq_start(struct seq_file *seq, 
loff_t *pos)
 
sctp_transport_walk_start(&iter->hti);
 
-   iter->start_fail = 0;
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
@@ -232,8 +230,6 @@ static void sctp_transport_seq_stop(struct seq_file *seq, 
void *v)
 {
struct sctp_ht_iter *iter = seq->private;
 
-   if (iter->start_fail)
-   return;
sctp_transport_walk_stop(&iter->hti);
 }
 
-- 
2.1.0



[PATCH net] sctp: hold transport before accessing its asoc in sctp_transport_get_next

2018-08-27 Thread Xin Long
As Marcelo noticed, sctp_transport_get_next iterates over transports
but also accesses the association directly, without checking any
refcnts before that, which can cause a use-after-free read.

So fix it by holding transport before accessing the association. With
that, sctp_transport_hold calls can be removed in the later places.

Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and 
reuse some for proc")
Reported-by: syzbot+fe62a0c9aa6a85c6d...@syzkaller.appspotmail.com
Signed-off-by: Xin Long 
---
 net/sctp/proc.c   |  4 
 net/sctp/socket.c | 22 +++---
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ef5c9a8..4d6f1c8 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -264,8 +264,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void 
*v)
}
 
transport = (struct sctp_transport *)v;
-   if (!sctp_transport_hold(transport))
-   return 0;
assoc = transport->asoc;
epb = &assoc->base;
sk = epb->sk;
@@ -322,8 +320,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void 
*v)
}
 
transport = (struct sctp_transport *)v;
-   if (!sctp_transport_hold(transport))
-   return 0;
assoc = transport->asoc;
 
list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e96b15a..aa76586 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5005,9 +5005,14 @@ struct sctp_transport *sctp_transport_get_next(struct 
net *net,
break;
}
 
+   if (!sctp_transport_hold(t))
+   continue;
+
if (net_eq(sock_net(t->asoc->base.sk), net) &&
t->asoc->peer.primary_path == t)
break;
+
+   sctp_transport_put(t);
}
 
return t;
@@ -5017,13 +5022,18 @@ struct sctp_transport *sctp_transport_get_idx(struct 
net *net,
  struct rhashtable_iter *iter,
  int pos)
 {
-   void *obj = SEQ_START_TOKEN;
+   struct sctp_transport *t;
 
-   while (pos && (obj = sctp_transport_get_next(net, iter)) &&
-  !IS_ERR(obj))
-   pos--;
+   if (!pos)
+   return SEQ_START_TOKEN;
 
-   return obj;
+   while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
+   if (!--pos)
+   break;
+   sctp_transport_put(t);
+   }
+
+   return t;
 }
 
 int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
@@ -5082,8 +5092,6 @@ int sctp_for_each_transport(int (*cb)(struct 
sctp_transport *, void *),
 
tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
-   if (!sctp_transport_hold(tsp))
-   continue;
ret = cb(tsp, p);
if (ret)
break;
-- 
2.1.0



[PATCH net] ip6_tunnel: use the right value for ipv4 min mtu check in ip6_tnl_xmit

2018-08-05 Thread Xin Long
According to RFC791, 68 bytes is the minimum size of IPv4 datagram every
device must be able to forward without further fragmentation while 576
bytes is the minimum size of IPv4 datagram every device has to be able
to receive, so 68 (IPV4_MIN_MTU) is the right value for the IPv4 min
mtu check in ip6_tnl_xmit().

While at it, change to use max() instead of if statement.

Fixes: c9fefa08190f ("ip6_tunnel: get the min mtu properly in ip6_tnl_xmit")
Reported-by: Sabrina Dubroca 
Signed-off-by: Xin Long 
---
 net/ipv6/ip6_tunnel.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 00e138a..1cc9650 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1133,12 +1133,8 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
max_headroom += 8;
mtu -= 8;
}
-   if (skb->protocol == htons(ETH_P_IPV6)) {
-   if (mtu < IPV6_MIN_MTU)
-   mtu = IPV6_MIN_MTU;
-   } else if (mtu < 576) {
-   mtu = 576;
-   }
+   mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
+  IPV6_MIN_MTU : IPV4_MIN_MTU);
 
skb_dst_update_pmtu(skb, mtu);
if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
-- 
2.1.0



[PATCHv4 net-next 1/2] route: add support for directed broadcast forwarding

2018-07-27 Thread Xin Long
This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.

Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
  - target TEE's gateway param has to be set with a specific address,
and it's not flexible, especially when the router wants to forward all
directed broadcasts.
  - this duplicates the directed broadcasts so this may cause side
effects to applications.

Besides, to keep consistent with other OS routers like BSD, it's also
necessary to implement it in the route rx path.

Note that route cache needs to be flushed when bc_forwarding is
changed.

Signed-off-by: Xin Long 
---
 include/linux/inetdevice.h   |  1 +
 include/uapi/linux/ip.h  |  1 +
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c   | 11 +++
 net/ipv4/route.c |  6 +-
 5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1..c759d1c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device 
*in_dev)
 
 #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)
 #define IN_DEV_MFORWARD(in_dev)IN_DEV_ANDCONF((in_dev), 
MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev)IN_DEV_ANDCONF((in_dev), 
BC_FORWARDING)
 #define IN_DEV_RPFILTER(in_dev)IN_DEV_MAXCONF((in_dev), 
RP_FILTER)
 #define IN_DEV_SRC_VMARK(in_dev)   IN_DEV_ORCONF((in_dev), SRC_VMARK)
 #define IN_DEV_SOURCE_ROUTE(in_dev)IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742..e42d13b 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -168,6 +168,7 @@ enum
IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
+   IPV4_DEVCONF_BC_FORWARDING,
__IPV4_DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdf..fac4edd 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -18,6 +18,7 @@ enum {
NETCONFA_PROXY_NEIGH,
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_INPUT,
+   NETCONFA_BC_FORWARDING,
__NETCONFA_MAX
 };
 #define NETCONFA_MAX   (__NETCONFA_MAX - 1)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab..ea4bd8a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+   if (all || type == NETCONFA_BC_FORWARDING)
+   size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff 
*skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+   if ((all || type == NETCONFA_BC_FORWARDING) &&
+   nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+   IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+   goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int 
write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
 
+   if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+   new_value != old_value)
+   rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
 devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+   DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
 
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97..b678466 100644
--- a/net/ipv4/route.c
+++ b/net/ip

[PATCHv4 net-next 2/2] selftests: add a selftest for directed broadcast forwarding

2018-07-27 Thread Xin Long
As Ido suggested, this patch adds a selftest for directed
broadcast forwarding with vrf. It does the assertion by checking
the src IP of the echo-reply packet in ping_test_from.

Signed-off-by: Xin Long 
---
 .../selftests/net/forwarding/router_broadcast.sh   | 233 +
 1 file changed, 233 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/router_broadcast.sh

diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh 
b/tools/testing/selftests/net/forwarding/router_broadcast.sh
new file mode 100755
index 000..7bd2ebb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4"
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+   vrf_create "vrf-h1"
+   ip link set dev $h1 master vrf-h1
+
+   ip link set dev vrf-h1 up
+   ip link set dev $h1 up
+
+   ip address add 192.0.2.2/24 dev $h1
+
+   ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+   ip route add 198.51.200.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+}
+
+h1_destroy()
+{
+   ip route del 198.51.200.0/24 vrf vrf-h1
+   ip route del 198.51.100.0/24 vrf vrf-h1
+
+   ip address del 192.0.2.2/24 dev $h1
+
+   ip link set dev $h1 down
+   vrf_destroy "vrf-h1"
+}
+
+h2_create()
+{
+   vrf_create "vrf-h2"
+   ip link set dev $h2 master vrf-h2
+
+   ip link set dev vrf-h2 up
+   ip link set dev $h2 up
+
+   ip address add 198.51.100.2/24 dev $h2
+
+   ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+   ip route add 198.51.200.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+}
+
+h2_destroy()
+{
+   ip route del 198.51.200.0/24 vrf vrf-h2
+   ip route del 192.0.2.0/24 vrf vrf-h2
+
+   ip address del 198.51.100.2/24 dev $h2
+
+   ip link set dev $h2 down
+   vrf_destroy "vrf-h2"
+}
+
+h3_create()
+{
+   vrf_create "vrf-h3"
+   ip link set dev $h3 master vrf-h3
+
+   ip link set dev vrf-h3 up
+   ip link set dev $h3 up
+
+   ip address add 198.51.200.2/24 dev $h3
+
+   ip route add 192.0.2.0/24 vrf vrf-h3 nexthop via 198.51.200.1
+   ip route add 198.51.100.0/24 vrf vrf-h3 nexthop via 198.51.200.1
+}
+
+h3_destroy()
+{
+   ip route del 198.51.100.0/24 vrf vrf-h3
+   ip route del 192.0.2.0/24 vrf vrf-h3
+
+   ip address del 198.51.200.2/24 dev $h3
+
+   ip link set dev $h3 down
+   vrf_destroy "vrf-h3"
+}
+
+router_create()
+{
+   ip link set dev $rp1 up
+   ip link set dev $rp2 up
+   ip link set dev $rp3 up
+
+   ip address add 192.0.2.1/24 dev $rp1
+
+   ip address add 198.51.100.1/24 dev $rp2
+   ip address add 198.51.200.1/24 dev $rp3
+}
+
+router_destroy()
+{
+   ip address del 198.51.200.1/24 dev $rp3
+   ip address del 198.51.100.1/24 dev $rp2
+
+   ip address del 192.0.2.1/24 dev $rp1
+
+   ip link set dev $rp3 down
+   ip link set dev $rp2 down
+   ip link set dev $rp1 down
+}
+
+setup_prepare()
+{
+   h1=${NETIFS[p1]}
+   rp1=${NETIFS[p2]}
+
+   rp2=${NETIFS[p3]}
+   h2=${NETIFS[p4]}
+
+   rp3=${NETIFS[p5]}
+   h3=${NETIFS[p6]}
+
+   vrf_prepare
+
+   h1_create
+   h2_create
+   h3_create
+
+   router_create
+
+   forwarding_enable
+}
+
+cleanup()
+{
+   pre_cleanup
+
+   forwarding_restore
+
+   router_destroy
+
+   h3_destroy
+   h2_destroy
+   h1_destroy
+
+   vrf_cleanup
+}
+
+bc_forwarding_disable()
+{
+   sysctl_set net.ipv4.conf.all.bc_forwarding 0
+   sysctl_set net.ipv4.conf.$rp1.bc_forwarding 0
+}
+
+bc_forwarding_enable()
+{
+   sysctl_set net.ipv4.conf.all.bc_forwarding 1
+   sysctl_set net.ipv4.conf.$rp1.bc_forwarding 1
+}
+
+bc_forwarding_restore()
+{
+   sysctl_restore net.ipv4.conf.$rp1.bc_forwarding
+   sysctl_restore net.ipv4.conf.all.bc_forwarding
+}
+
+ping_test_from()
+{
+   local oif=$1
+   local dip=$2
+   local from=$3
+   local fail=${4:-0}
+
+   RET=0
+
+   log_info "ping $dip, expected reply from $from"
+   ip vrf exec $(master_name_get $oif) \
+   $PING -I $oif $dip -c 10 -i 0.1 -w 2 -b 2>&1 | grep $from &> /dev/null
+   check_err_fail $fail $?
+}
+
+ping_ipv4()
+{
+   sysctl_set net.ipv4.icmp_echo_ignore_broadcasts 0
+
+   bc_forwarding_disable
+   log_info "bc_forwarding disabled on r1 =>"
+   ping_test_from $h1 198.51.100.255 192.0.2.1
+   log_test "h1 -> net2: reply from r1 (not forwarding)"
+   ping_test_from $h1 198.51.200.255 192.0.2.1
+   log_test "h1 -> net3: reply from r1 (not forwarding)"
+   ping_test_from $h1 192.0.2.255 192.0.2.1
+   log_test "h1 -> net1: reply from r1 (not dropping)"
+   ping_test_from $h1 255.25

  1   2   3   4   5   6   7   8   9   10   >