While playing with SDP code in OFED 1.3.1 (latest stable), I have
encountered a number of bugs in the zero-copy send code:
* sdp_bz_setup() code does not handle the case of kernel data segment
correctly (kernel sockets)
* sdp_bz_setup() does not pass ENOMEM, EFAULT or other errors to
sendmsg(). In fact, a negative possible return from get_user_pages() is
nor handled.
* the deallocation of bz descriptor in sendmsg() is not handled properly
-- it is allocated many times, but freed once.
* sdp_bzcopy_get() code does not raise reference count for all pages in
the bz descriptor (only the "partial" pages will get the count raised).
However, the send completion code will call put_page() on all
entries, leading to a crash for page-aligned transfers.
Attached, please find a patch that solves these problems. With this
patch, I can use SDP and send page-aligned kernel buffers even for
zero-copy case.
Still, I do not see any performance benefit when using the zero-copy
method. I have tried various thresholds (32K, 64K, 128K), and zero-copy
was always slower.
It seems that the penalty of memcpy() is negligible compared to the
penalty of reconfiguring the card to use different addresses.
Also, looking at the sndmsg() code, I can say that allocation and
deallocation of bz descriptor for each iov element is not optimal.
Instead, an existing bz descriptor can be re-used if it fits.
--
----------------------------------------
Constantine Gavrilov
Kernel Developer
Platform Group
XIV, an IBM global brand
1 Azrieli Center, Tel-Aviv
Phone: +972-3-6074672
Fax: +972-3-6959749
----------------------------------------
--- drivers/infiniband/ulp/sdp/sdp_main.c.orig 2008-11-16 14:15:26.000000000
+0200
+++ drivers/infiniband/ulp/sdp/sdp_main.c 2008-11-16 14:37:53.000000000
+0200
@@ -1254,14 +1254,14 @@
{
struct bzcopy_state *bz;
unsigned long addr;
- int done_pages;
+ int done_pages = 0;
int thresh;
+ mm_segment_t cur_fs;
- thresh = ssk->zcopy_thresh ? : sdp_zcopy_thresh;
- if (thresh == 0 || len < thresh)
- return NULL;
+ cur_fs = get_fs();
- if (!can_do_mlock())
+ thresh = ssk->zcopy_thresh ? : sdp_zcopy_thresh;
+ if (thresh == 0 || len < thresh || !capable(CAP_IPC_LOCK))
return NULL;
/*
@@ -1275,7 +1275,7 @@
bz = kzalloc(sizeof(*bz), GFP_KERNEL);
if (!bz)
- return NULL;
+ return ERR_PTR(-ENOMEM);
addr = (unsigned long)base;
@@ -1288,34 +1288,41 @@
bz->page_cnt = PAGE_ALIGN(len + bz->cur_offset) >> PAGE_SHIFT;
bz->pages = kcalloc(bz->page_cnt, sizeof(struct page *),
GFP_KERNEL);
- if (!bz->pages)
- goto out_1;
-
- down_write(¤t->mm->mmap_sem);
+ if (!bz->pages) {
+ kfree(bz);
+ return ERR_PTR(-ENOMEM);
+ }
- if (!capable(CAP_IPC_LOCK))
- goto out_2;
addr &= PAGE_MASK;
-
- done_pages = get_user_pages(current, current->mm, addr, bz->page_cnt,
+ if(segment_eq(cur_fs, KERNEL_DS)) {
+ for(done_pages = 0; done_pages < bz->page_cnt; done_pages++) {
+ bz->pages[done_pages] = virt_to_page(addr);
+ if(!bz->pages[done_pages])
+ break;
+ get_page(bz->pages[done_pages]);
+ addr += PAGE_SIZE;
+ }
+ } else {
+ if(current->mm) {
+ down_write(¤t->mm->mmap_sem);
+ done_pages = get_user_pages(current, current->mm, addr,
bz->page_cnt,
0, 0, bz->pages, NULL);
+ up_write(¤t->mm->mmap_sem);
+ }
+ }
if (unlikely(done_pages != bz->page_cnt)){
- bz->page_cnt = done_pages;
- goto out_2;
+ int i;
+ if(done_pages > 0) {
+ for(i = 0; i < done_pages; i++)
+ put_page(bz->pages[i]);
+ }
+ kfree(bz->pages);
+ kfree(bz);
+ bz = ERR_PTR(-EFAULT);
}
- up_write(¤t->mm->mmap_sem);
-
return bz;
-
-out_2:
- up_write(¤t->mm->mmap_sem);
- kfree(bz->pages);
-out_1:
- kfree(bz);
-
- return NULL;
}
@@ -1441,11 +1448,14 @@
if (!sk_stream_wmem_schedule(sk, copy))
return SDP_DO_WAIT_MEM;
+ get_page(bz->pages[bz->cur_page]);
skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
bz->pages[bz->cur_page], bz->cur_offset,
this_page);
BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
+ BUG_ON(bz->cur_offset > PAGE_SIZE);
+
bz->cur_offset += this_page;
if (bz->cur_offset == PAGE_SIZE) {
@@ -1453,11 +1463,6 @@
bz->cur_page++;
BUG_ON(bz->cur_page > bz->page_cnt);
- } else {
- BUG_ON(bz->cur_offset > PAGE_SIZE);
-
- if (bz->cur_page != bz->page_cnt || left != this_page)
- get_page(bz->pages[bz->cur_page]);
}
left -= this_page;
@@ -1619,7 +1624,14 @@
iov++;
+ if(bz)
+ sdp_bz_cleanup(bz);
bz = sdp_bz_setup(ssk, from, seglen, size_goal);
+ if(IS_ERR(bz)) {
+ bz = NULL;
+ err = PTR_ERR(bz);
+ goto do_error;
+ }
while (seglen > 0) {
int copy;
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general