[PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb

2011-05-26 Thread Shirley Ma
This patch adds userspace buffers support in skb shared info. A new 
struct skb_ubuf_info is needed to maintain the userspace buffers
argument and index, a callback is used to notify userspace to release
the buffers once lower device has done DMA (Last reference to that skb
has gone).

If there is any userspace apps to reference these userspace buffers,
then these userspaces buffers will be copied into kernel. This way we
can prevent userspace apps to hold these userspace buffers too long.

Signed-off-by: Shirley Ma x...@us.ibm.com
---

 include/linux/skbuff.h |   26 +++
 net/core/skbuff.c  |   80 ++-
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0ae90a..025de5c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -189,6 +189,18 @@ enum {
SKBTX_DRV_NEEDS_SK_REF = 1  3,
 };
 
+/*
+ * The callback notifies userspace to release buffers when skb DMA is done in
+ * lower device, the skb last reference should be 0 when calling this.
+ * The desc is used to track userspace buffer index.
+ */
+struct skb_ubuf_info {
+   /* support buffers allocation from userspace */
+   void(*callback)(struct sk_buff *);
+   void*arg;
+   size_t  desc;
+};
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb-end.
  */
@@ -211,6 +223,10 @@ struct skb_shared_info {
/* Intermediate layers must ensure that destructor_arg
 * remains valid until skb destructor */
void *  destructor_arg;
+
+   /* DMA mapping from/to userspace buffers */
+   struct skb_ubuf_info ubuf;
+
/* must be last field, see pskb_expand_head() */
skb_frag_t  frags[MAX_SKB_FRAGS];
 };
@@ -2261,5 +2277,15 @@ static inline void skb_checksum_none_assert(struct 
sk_buff *skb)
 }
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+/*
+ * skb_ubuf - is the buffer from userspace
+ * @skb: buffer to check
+ */
+static inline int skb_ubuf(const struct sk_buff *skb)
+{
+   return (skb_shinfo(skb)-ubuf.callback != NULL);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ebeed0..890447c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(shinfo-dataref, 1);
+   shinfo-ubuf.callback = NULL;
+   shinfo-ubuf.arg = NULL;
kmemcheck_annotate_variable(shinfo-destructor_arg);
 
if (fclone) {
@@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff *skb)
put_page(skb_shinfo(skb)-frags[i].page);
}
 
+   /*
+* if skb buf is from userspace, we need to notify the caller
+* the lower device DMA has done;
+*/
+   if (skb_ubuf(skb)) {
+   skb_shinfo(skb)-ubuf.callback(skb);
+   skb_shinfo(skb)-ubuf.callback = NULL;
+   }
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
 
@@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
if (irqs_disabled())
return false;
 
+   if (skb_ubuf(skb))
+   return false;
+
if (skb_is_nonlinear(skb) || skb-fclone != SKB_FCLONE_UNAVAILABLE)
return false;
 
@@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, 
struct sk_buff *skb)
atomic_set(n-users, 1);
 
atomic_inc((skb_shinfo(skb)-dataref));
+   skb_shinfo(skb)-ubuf.callback = NULL;
skb-cloned = 1;
 
return n;
@@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct 
sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
+/* skb frags copy userspace buffers to kernel */
+static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+   int i;
+   int num_frags = skb_shinfo(skb)-nr_frags;
+   struct page *page, *head = NULL;
+
+   for (i = 0; i  num_frags; i++) {
+   u8 *vaddr;
+   skb_frag_t *f = skb_shinfo(skb)-frags[i];
+
+   page = alloc_page(GFP_ATOMIC);
+   if (!page) {
+   while (head) {
+   put_page(head);
+   head = (struct page *)head-private;
+   }
+   return -ENOMEM;
+   }
+   vaddr = kmap_skb_frag(skb_shinfo(skb)-frags[i]);
+   memcpy(page_address(page), vaddr + f-page_offset, f-size);
+   kunmap_skb_frag(vaddr);
+   page-private = (unsigned long)head;
+ 

Re: [PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb

2011-05-26 Thread Eric Dumazet
Le jeudi 26 mai 2011 à 12:36 -0700, Shirley Ma a écrit :
 This patch adds userspace buffers support in skb shared info. A new 
 struct skb_ubuf_info is needed to maintain the userspace buffers
 argument and index, a callback is used to notify userspace to release
 the buffers once lower device has done DMA (Last reference to that skb
 has gone).
 
 If there is any userspace apps to reference these userspace buffers,
 then these userspaces buffers will be copied into kernel. This way we
 can prevent userspace apps to hold these userspace buffers too long.
 
 Signed-off-by: Shirley Ma x...@us.ibm.com
 ---
 
  include/linux/skbuff.h |   26 +++
  net/core/skbuff.c  |   80 ++-
  2 files changed, 104 insertions(+), 2 deletions(-)
 
 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
 index d0ae90a..025de5c 100644
 --- a/include/linux/skbuff.h
 +++ b/include/linux/skbuff.h
 @@ -189,6 +189,18 @@ enum {
   SKBTX_DRV_NEEDS_SK_REF = 1  3,
  };
  
 +/*
 + * The callback notifies userspace to release buffers when skb DMA is done in
 + * lower device, the skb last reference should be 0 when calling this.
 + * The desc is used to track userspace buffer index.
 + */
 +struct skb_ubuf_info {
 + /* support buffers allocation from userspace */
 + void(*callback)(struct sk_buff *);
 + void*arg;
 + size_t  desc;
 +};

Thats 24 bytes on each skb...   desc for example is not used in this
patch (yes, its used later in patch 4/4)

But still... could you instead use one pointer only in skb ?



 +
  /* This data is invariant across clones and lives at
   * the end of the header data, ie. at skb-end.
   */
 @@ -211,6 +223,10 @@ struct skb_shared_info {
   /* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
   void *  destructor_arg;
 +
 + /* DMA mapping from/to userspace buffers */
 + struct skb_ubuf_info ubuf;
 +
   /* must be last field, see pskb_expand_head() */
   skb_frag_t  frags[MAX_SKB_FRAGS];
  };
 @@ -2261,5 +2277,15 @@ static inline void skb_checksum_none_assert(struct 
 sk_buff *skb)
  }
  
  bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 +
 +/*
 + *   skb_ubuf - is the buffer from userspace
 + *   @skb: buffer to check
 + */
 +static inline int skb_ubuf(const struct sk_buff *skb)
 +{
 + return (skb_shinfo(skb)-ubuf.callback != NULL);

return skb_shinfo(skb)-ubuf.callback != NULL;

 +}
 +
  #endif   /* __KERNEL__ */
  #endif   /* _LINUX_SKBUFF_H */
 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
 index 7ebeed0..890447c 100644
 --- a/net/core/skbuff.c
 +++ b/net/core/skbuff.c
 @@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
 gfp_mask,
   shinfo = skb_shinfo(skb);
   memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
   atomic_set(shinfo-dataref, 1);
 + shinfo-ubuf.callback = NULL;
 + shinfo-ubuf.arg = NULL;

if you put ubuf ptr before dataref, no need to add this (the memset()
clear all shared_info up to dataref)

   kmemcheck_annotate_variable(shinfo-destructor_arg);
  
   if (fclone) {
 @@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff *skb)
   put_page(skb_shinfo(skb)-frags[i].page);
   }
  
 + /*
 +  * if skb buf is from userspace, we need to notify the caller
 +  * the lower device DMA has done;
 +  */
 + if (skb_ubuf(skb)) {
 + skb_shinfo(skb)-ubuf.callback(skb);
 + skb_shinfo(skb)-ubuf.callback = NULL;
 + }
   if (skb_has_frag_list(skb))
   skb_drop_fraglist(skb);
  
 @@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
   if (irqs_disabled())
   return false;
  
 + if (skb_ubuf(skb))
 + return false;
 +
   if (skb_is_nonlinear(skb) || skb-fclone != SKB_FCLONE_UNAVAILABLE)
   return false;
  
 @@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, 
 struct sk_buff *skb)
   atomic_set(n-users, 1);
  
   atomic_inc((skb_shinfo(skb)-dataref));
 + skb_shinfo(skb)-ubuf.callback = NULL;
   skb-cloned = 1;
  
   return n;
 @@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct 
 sk_buff *src)
  }
  EXPORT_SYMBOL_GPL(skb_morph);
  
 +/* skb frags copy userspace buffers to kernel */
 +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 +{
 + int i;
 + int num_frags = skb_shinfo(skb)-nr_frags;
 + struct page *page, *head = NULL;
 +
 + for (i = 0; i  num_frags; i++) {
 + u8 *vaddr;
 + skb_frag_t *f = skb_shinfo(skb)-frags[i];
 +
 + page = alloc_page(GFP_ATOMIC);
 + if (!page) {
 + while 

Re: [PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb

2011-05-26 Thread Shirley Ma
On Thu, 2011-05-26 at 22:04 +0200, Eric Dumazet wrote:
 Le jeudi 26 mai 2011 à 12:36 -0700, Shirley Ma a écrit :
  This patch adds userspace buffers support in skb shared info. A new 
  struct skb_ubuf_info is needed to maintain the userspace buffers
  argument and index, a callback is used to notify userspace to
 release
  the buffers once lower device has done DMA (Last reference to that
 skb
  has gone).
  
  If there is any userspace apps to reference these userspace buffers,
  then these userspaces buffers will be copied into kernel. This way
 we
  can prevent userspace apps to hold these userspace buffers too long.
  
  Signed-off-by: Shirley Ma x...@us.ibm.com
  ---
  
   include/linux/skbuff.h |   26 +++
   net/core/skbuff.c  |   80
 ++-
   2 files changed, 104 insertions(+), 2 deletions(-)
  
  diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
  index d0ae90a..025de5c 100644
  --- a/include/linux/skbuff.h
  +++ b/include/linux/skbuff.h
  @@ -189,6 +189,18 @@ enum {
SKBTX_DRV_NEEDS_SK_REF = 1  3,
   };
   
  +/*
  + * The callback notifies userspace to release buffers when skb DMA
 is done in
  + * lower device, the skb last reference should be 0 when calling
 this.
  + * The desc is used to track userspace buffer index.
  + */
  +struct skb_ubuf_info {
  + /* support buffers allocation from userspace */
  + void(*callback)(struct sk_buff *);
  + void*arg;
  + size_t  desc;
  +};
 
 Thats 24 bytes on each skb...   desc for example is not used in this
 patch (yes, its used later in patch 4/4)
 
 But still... could you instead use one pointer only in skb ?

I could reduce callback pointer by moving it to *arg, but not desc, this
indicates that which buffer DMA hasn't done yet in *arg.

 
  +
   /* This data is invariant across clones and lives at
* the end of the header data, ie. at skb-end.
*/
  @@ -211,6 +223,10 @@ struct skb_shared_info {
/* Intermediate layers must ensure that destructor_arg
 * remains valid until skb destructor */
void *  destructor_arg;
  +
  + /* DMA mapping from/to userspace buffers */
  + struct skb_ubuf_info ubuf;
  +
/* must be last field, see pskb_expand_head() */
skb_frag_t  frags[MAX_SKB_FRAGS];
   };
  @@ -2261,5 +2277,15 @@ static inline void
 skb_checksum_none_assert(struct sk_buff *skb)
   }
   
   bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
  +
  +/*
  + *   skb_ubuf - is the buffer from userspace
  + *   @skb: buffer to check
  + */
  +static inline int skb_ubuf(const struct sk_buff *skb)
  +{
  + return (skb_shinfo(skb)-ubuf.callback != NULL);
 
 return skb_shinfo(skb)-ubuf.callback != NULL;
 
  +}
  +
   #endif   /* __KERNEL__ */
   #endif   /* _LINUX_SKBUFF_H */
  diff --git a/net/core/skbuff.c b/net/core/skbuff.c
  index 7ebeed0..890447c 100644
  --- a/net/core/skbuff.c
  +++ b/net/core/skbuff.c
  @@ -210,6 +210,8 @@ struct sk_buff *__alloc_skb(unsigned int size,
 gfp_t gfp_mask,
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(shinfo-dataref, 1);
  + shinfo-ubuf.callback = NULL;
  + shinfo-ubuf.arg = NULL;
 
 if you put ubuf ptr before dataref, no need to add this (the memset()
 clear all shared_info up to dataref)

Ok, will remove it.

kmemcheck_annotate_variable(shinfo-destructor_arg);
   
if (fclone) {
  @@ -328,6 +330,14 @@ static void skb_release_data(struct sk_buff
 *skb)
put_page(skb_shinfo(skb)-frags[i].page);
}
   
  + /*
  +  * if skb buf is from userspace, we need to notify the
 caller
  +  * the lower device DMA has done;
  +  */
  + if (skb_ubuf(skb)) {
  + skb_shinfo(skb)-ubuf.callback(skb);
  + skb_shinfo(skb)-ubuf.callback = NULL;
  + }
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
   
  @@ -480,6 +490,9 @@ bool skb_recycle_check(struct sk_buff *skb, int
 skb_size)
if (irqs_disabled())
return false;
   
  + if (skb_ubuf(skb))
  + return false;
  +
if (skb_is_nonlinear(skb) || skb-fclone !=
 SKB_FCLONE_UNAVAILABLE)
return false;
   
  @@ -572,6 +585,7 @@ static struct sk_buff *__skb_clone(struct
 sk_buff *n, struct sk_buff *skb)
atomic_set(n-users, 1);
   
atomic_inc((skb_shinfo(skb)-dataref));
  + skb_shinfo(skb)-ubuf.callback = NULL;
skb-cloned = 1;
   
return n;
  @@ -595,6 +609,48 @@ struct sk_buff *skb_morph(struct sk_buff *dst,
 struct sk_buff *src)
   }
   EXPORT_SYMBOL_GPL(skb_morph);
   
  +/* skb frags copy userspace buffers to kernel */
  +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 

Re: [PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb

2011-05-26 Thread Eric Dumazet
Le jeudi 26 mai 2011 à 13:24 -0700, Shirley Ma a écrit :

 I could reduce callback pointer by moving it to *arg, but not desc, this
 indicates that which buffer DMA hasn't done yet in *arg.


I guess you dont need to use skb itself to hold all your states ?

I understand its convenient for you, but I believe its worth the pain to
use only one pointer to a (small) object where you put all your stuff.

Some machines alloc/free millions of skbs per second. 

If/when most skb uses are for userspace zero-copy buffers we can embbed
your small object in skb itself ;)


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V6 2/4 net-next] skbuff: Add userspace zero-copy buffers in skb

2011-05-26 Thread Shirley Ma
On Thu, 2011-05-26 at 22:55 +0200, Eric Dumazet wrote:
 Le jeudi 26 mai 2011 à 13:24 -0700, Shirley Ma a écrit :
 
  I could reduce callback pointer by moving it to *arg, but not desc,
 this
  indicates that which buffer DMA hasn't done yet in *arg.
 
 
 I guess you dont need to use skb itself to hold all your states ?
 
 I understand its convenient for you, but I believe its worth the pain
 to
 use only one pointer to a (small) object where you put all your stuff.
 
 Some machines alloc/free millions of skbs per second. 
 
 If/when most skb uses are for userspace zero-copy buffers we can
 embbed
 your small object in skb itself ;)

You are right, w/o this desc, there will be lots of alloc/dealloc in the
caller side. To have better performance, maybe I should have changed
most skbs to use zero-copy buffers. :)

Ok, for now let me try to move it to caller, just leave one pointer
*uarg here.

Thanks
Shirley

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html