Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hi Marek, Ricardo, On 01/08/2014 03:07 PM, Marek Szyprowski wrote: Hello All, On 2014-01-03 16:51, Ricardo Ribalda Delgado wrote: Hello Hans What if we move the dma_map_sg and dma_unmap_sg to the vb2 interface, and there do something like: n_sg= dma_map_sg() if (n_sg=-ENOMEM){ split_table() //Breaks down the sg_table into monopages sg n_sg= dma_map_sg() This is not a good approach. Remember that if swiotbl needs to allocate e.g. 17 contiguous pages it will round up to the next power of two, so it allocates 32 pages. So even if dma_map_sg succeeds, it might waste a lot of memory. } if (n_sg=-ENOMEM) return -ENOMEM dma_map_sg/dma_unmap_sg should be moved to vb2-dma-sg memory allocator. The best place for calling them is buf_prepare() and buf_finish() callbacks. I think that I've already pointed this some time ago, but unfortunately I didn't find enough time to convert existing code. That would be nice, but this is a separate issue. For solving the problem described by Hans, I think that vb2-dma-sg memory allocator should check dma mask of the client device and add appropriate GFP_DMA or GFP_DMA32 flags to alloc_pages(). This should fix the issues with failed dma_map_sg due to lack of bouncing buffers. Those GFP flags are for the scatterlist itself, and that can be placed anywhere in memory (frankly, I'm not sure why sg_alloc_table_from_pages has a gfp_flags argument at all and I think it is used incorrectly in videobuf2-dma-sg.c as well). I see two options. The first is the patch I included below: this adds a bool to sg_alloc_table_from_pages() that tells it whether or not page combining should be enabled. It also adds the vb2 queue's gfp_flags as an argument to the get_userptr operation. In videobuf2-dma-sg.c that is checked to see whether or not sg_alloc_table_from_pages() should enable page-combining. The alternative would be to have vb2_queue_init check if the use of V4L2_MEMORY_USERPTR would lead to dma bouncing based on the q-io_modes and q-gfp_flags and if so, remove USERPTR support from io_modes. Do we really want to have page bouncing for video capture? Feedback would be welcome as I am not sure what the best solution is. Regards, Hans PS: for a final version this patch would be split up in 2 or 3 patches. Signed-off-by: Hans Verkuil hans.verk...@cisco.com diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 79f8b39..404716f 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1468,7 +1468,7 @@ static int arm_iommu_get_sgtable(struct device *dev, struct sg_table *sgt, return -ENXIO; return sg_alloc_table_from_pages(sgt, pages, count, 0, size, -GFP_KERNEL); +true, GFP_KERNEL); } static int __dma_direction_to_prot(enum dma_data_direction dir) diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 56805c3..4c56338 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -617,7 +617,7 @@ struct sg_table *drm_prime_pages_to_sg(struct page **pages, int nr_pages) } ret = sg_alloc_table_from_pages(sg, pages, nr_pages, 0, - nr_pages PAGE_SHIFT, GFP_KERNEL); + nr_pages PAGE_SHIFT, true, GFP_KERNEL); if (ret) goto out; diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 7bccedc..0236f9b 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -505,7 +505,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, } ret = sg_alloc_table_from_pages(sgt, pages, npages, offset, - size, GFP_KERNEL); + size, true, GFP_KERNEL); if (ret 0) { DRM_ERROR(failed to get sgt from pages.\n); goto err_free_sgt; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c index 7776e6f..7ee560c 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c @@ -348,6 +348,7 @@ static int vmw_ttm_map_dma(struct vmw_ttm_tt *vmw_tt) vsgt-num_pages, 0, (unsigned long) vsgt-num_pages PAGE_SHIFT, + true, GFP_KERNEL); if (unlikely(ret != 0)) goto out_sg_alloc_fail; diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c index c203b9c..9dd2149 100644 --- a/drivers/media/v4l2-core/videobuf2-core.c +++ b/drivers/media/v4l2-core/videobuf2-core.c @@
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hello, On 2014-01-13 10:54, Hans Verkuil wrote: Hi Marek, Ricardo, On 01/08/2014 03:07 PM, Marek Szyprowski wrote: Hello All, On 2014-01-03 16:51, Ricardo Ribalda Delgado wrote: Hello Hans What if we move the dma_map_sg and dma_unmap_sg to the vb2 interface, and there do something like: n_sg= dma_map_sg() if (n_sg=-ENOMEM){ split_table() //Breaks down the sg_table into monopages sg n_sg= dma_map_sg() This is not a good approach. Remember that if swiotbl needs to allocate e.g. 17 contiguous pages it will round up to the next power of two, so it allocates 32 pages. So even if dma_map_sg succeeds, it might waste a lot of memory. } if (n_sg=-ENOMEM) return -ENOMEM dma_map_sg/dma_unmap_sg should be moved to vb2-dma-sg memory allocator. The best place for calling them is buf_prepare() and buf_finish() callbacks. I think that I've already pointed this some time ago, but unfortunately I didn't find enough time to convert existing code. That would be nice, but this is a separate issue. For solving the problem described by Hans, I think that vb2-dma-sg memory allocator should check dma mask of the client device and add appropriate GFP_DMA or GFP_DMA32 flags to alloc_pages(). This should fix the issues with failed dma_map_sg due to lack of bouncing buffers. Those GFP flags are for the scatterlist itself, and that can be placed anywhere in memory (frankly, I'm not sure why sg_alloc_table_from_pages has a gfp_flags argument at all and I think it is used incorrectly in videobuf2-dma-sg.c as well). I was talking about GFP flags passed to alloc_pages in vb2_dma_sg allocator, not the sg_alloc_table_from_pages(). IMHO the following changes should fix your problem: 1. add client struct device pointer to vb2_dma_sg_desc, so vb2_dma_sg allocator will be able to check dma parameters of the client device. 2. add following check to vb2_dma_sg_alloc_compacted(): if (dev-dma_mask) { if (dev-dma_mask DMA_BIT_MASK(32)) gfp_mask |= GFP_DMA; else if (dev-dev_mask == DMA_BIT_MASK(32) gfp_mask |= GFP_DMA32; } I see two options. The first is the patch I included below: this adds a bool to sg_alloc_table_from_pages() that tells it whether or not page combining should be enabled. It also adds the vb2 queue's gfp_flags as an argument to the get_userptr operation. In videobuf2-dma-sg.c that is checked to see whether or not sg_alloc_table_from_pages() should enable page-combining. The alternative would be to have vb2_queue_init check if the use of V4L2_MEMORY_USERPTR would lead to dma bouncing based on the q-io_modes and q-gfp_flags and if so, remove USERPTR support from io_modes. Do we really want to have page bouncing for video capture? So the main problem is about user ptr modes? This once again shows that the current user pointer API is too limited and should be replaced by something more reliable. Feedback would be welcome as I am not sure what the best solution is. Best regards -- Marek Szyprowski, PhD Samsung RD Institute Poland -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
On 01/13/2014 02:02 PM, Marek Szyprowski wrote: Hello, On 2014-01-13 10:54, Hans Verkuil wrote: Hi Marek, Ricardo, On 01/08/2014 03:07 PM, Marek Szyprowski wrote: Hello All, On 2014-01-03 16:51, Ricardo Ribalda Delgado wrote: Hello Hans What if we move the dma_map_sg and dma_unmap_sg to the vb2 interface, and there do something like: n_sg= dma_map_sg() if (n_sg=-ENOMEM){ split_table() //Breaks down the sg_table into monopages sg n_sg= dma_map_sg() This is not a good approach. Remember that if swiotbl needs to allocate e.g. 17 contiguous pages it will round up to the next power of two, so it allocates 32 pages. So even if dma_map_sg succeeds, it might waste a lot of memory. } if (n_sg=-ENOMEM) return -ENOMEM dma_map_sg/dma_unmap_sg should be moved to vb2-dma-sg memory allocator. The best place for calling them is buf_prepare() and buf_finish() callbacks. I think that I've already pointed this some time ago, but unfortunately I didn't find enough time to convert existing code. That would be nice, but this is a separate issue. For solving the problem described by Hans, I think that vb2-dma-sg memory allocator should check dma mask of the client device and add appropriate GFP_DMA or GFP_DMA32 flags to alloc_pages(). This should fix the issues with failed dma_map_sg due to lack of bouncing buffers. Those GFP flags are for the scatterlist itself, and that can be placed anywhere in memory (frankly, I'm not sure why sg_alloc_table_from_pages has a gfp_flags argument at all and I think it is used incorrectly in videobuf2-dma-sg.c as well). I was talking about GFP flags passed to alloc_pages in vb2_dma_sg allocator, not the sg_alloc_table_from_pages(). IMHO the following changes should fix your problem: 1. add client struct device pointer to vb2_dma_sg_desc, so vb2_dma_sg allocator will be able to check dma parameters of the client device. 2. add following check to vb2_dma_sg_alloc_compacted(): if (dev-dma_mask) { if (dev-dma_mask DMA_BIT_MASK(32)) gfp_mask |= GFP_DMA; else if (dev-dev_mask == DMA_BIT_MASK(32) gfp_mask |= GFP_DMA32; } No, it doesn't. This concerns the USERPTR memory model, so the memory for the buffer is allocated by userspace, not by the kernel. The MMAP memory model works fine, that's not where the problem is. I see two options. The first is the patch I included below: this adds a bool to sg_alloc_table_from_pages() that tells it whether or not page combining should be enabled. It also adds the vb2 queue's gfp_flags as an argument to the get_userptr operation. In videobuf2-dma-sg.c that is checked to see whether or not sg_alloc_table_from_pages() should enable page-combining. The alternative would be to have vb2_queue_init check if the use of V4L2_MEMORY_USERPTR would lead to dma bouncing based on the q-io_modes and q-gfp_flags and if so, remove USERPTR support from io_modes. Do we really want to have page bouncing for video capture? So the main problem is about user ptr modes? This once again shows that the current user pointer API is too limited and should be replaced by something more reliable. The userptr model worked perfectly fine before the memory compaction was added. This is a pure regression. Regards, Hans -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hello All, On 2014-01-03 16:51, Ricardo Ribalda Delgado wrote: Hello Hans What if we move the dma_map_sg and dma_unmap_sg to the vb2 interface, and there do something like: n_sg= dma_map_sg() if (n_sg=-ENOMEM){ split_table() //Breaks down the sg_table into monopages sg n_sg= dma_map_sg() } if (n_sg=-ENOMEM) return -ENOMEM dma_map_sg/dma_unmap_sg should be moved to vb2-dma-sg memory allocator. The best place for calling them is buf_prepare() and buf_finish() callbacks. I think that I've already pointed this some time ago, but unfortunately I didn't find enough time to convert existing code. For solving the problem described by Hans, I think that vb2-dma-sg memory allocator should check dma mask of the client device and add appropriate GFP_DMA or GFP_DMA32 flags to alloc_pages(). This should fix the issues with failed dma_map_sg due to lack of bouncing buffers. Best regards -- Marek Szyprowski, PhD Samsung RD Institute Poland -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hi Ricardo, I've run into a problem that is caused by this patch: On 08/02/2013 04:20 PM, Ricardo Ribalda Delgado wrote: Replace the private struct vb2_dma_sg_desc with the struct sg_table so we can benefit from all the helping functions in lib/scatterlist.c for things like allocating the sg or compacting the descriptor marvel-ccic and solo6x10 drivers, that uses this api has been updated Acked-by: Marek Szyprowski m.szyprow...@samsung.com Reviewed-by: Andre Heider a.hei...@gmail.com Signed-off-by: Ricardo Ribalda Delgado ricardo.riba...@gmail.com --- drivers/media/platform/marvell-ccic/mcam-core.c| 14 +-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 103 drivers/staging/media/solo6x10/solo6x10-v4l2-enc.c | 20 ++-- include/media/videobuf2-dma-sg.h | 10 +- 4 files changed, 63 insertions(+), 84 deletions(-) snip diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 4999c48..2f86054 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c snip @@ -99,17 +98,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla buf-vaddr = NULL; buf-write = 0; buf-offset = 0; - buf-sg_desc.size = size; + buf-size = size; /* size is already page aligned */ - buf-sg_desc.num_pages = size PAGE_SHIFT; - - buf-sg_desc.sglist = vzalloc(buf-sg_desc.num_pages * - sizeof(*buf-sg_desc.sglist)); - if (!buf-sg_desc.sglist) - goto fail_sglist_alloc; - sg_init_table(buf-sg_desc.sglist, buf-sg_desc.num_pages); + buf-num_pages = size PAGE_SHIFT; - buf-pages = kzalloc(buf-sg_desc.num_pages * sizeof(struct page *), + buf-pages = kzalloc(buf-num_pages * sizeof(struct page *), GFP_KERNEL); if (!buf-pages) goto fail_pages_array_alloc; @@ -118,6 +111,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla if (ret) goto fail_pages_alloc; + ret = sg_alloc_table_from_pages(buf-sg_table, buf-pages, + buf-num_pages, 0, size, gfp_flags); + if (ret) + goto fail_table_alloc; + buf-handler.refcount = buf-refcount; buf-handler.put = vb2_dma_sg_put; buf-handler.arg = buf; The problem here is the switch from sg_init_table to sg_alloc_table_from_pages. If the PCI hardware only accepts 32-bit DMA transfers, but it is used on a 64-bit OS with 4GB physical memory, then the kernel will allocate DMA bounce buffers for you. With sg_init_table that works fine since each page in the scatterlist maps to a bounce buffer that is also just one page, but with sg_alloc_table_from_pages the DMA bounce buffers can be multiple pages. This is in turn rounded up to the next power of 2 and allocated in the 32-bit address space. Unfortunately, due to memory fragmentation this very quickly fails with -ENOMEM. I discovered this while converting saa7134 to vb2. I think that when DMA bounce buffers are needed, then it should revert to sg_init_table. I don't know whether this bug also affects non-v4l drivers. For now at least I won't try to fix this myself as I have discovered that dma-sg doesn't work anyway for saa7134 due to a hardware limitation so I will switch to dma-contig for that driver. But at the very least I thought I should write this down so others know about this subtle problem and perhaps someone else wants to tackle this. I actually think that the solo driver is affected by this (I haven't tested it yet). And at some point we need to convert bttv and cx88 to vb2 as well, and I expect that they will hit the same problem. If someone knows a better solution than switching to sg_init_table if bounce buffers are needed, then let me know. Regards, Hans -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hello Hans Thank you very much for your mail. For what I understand sg_alloc_table_from_pages does not allocate any page or bounce buffer, it just take a set of N pages and makes a sg_table from it, on the process it finds out if page A and A+1are on the same pfn and if it is true they will share the sg. So it is a later function that produces the error. As I see it, before this patch we were reimplementing sg_alloc_table_from_pages. Which function is returning -ENOMEM? Regards! On Fri, Jan 3, 2014 at 3:52 PM, Hans Verkuil hverk...@xs4all.nl wrote: Hi Ricardo, I've run into a problem that is caused by this patch: On 08/02/2013 04:20 PM, Ricardo Ribalda Delgado wrote: Replace the private struct vb2_dma_sg_desc with the struct sg_table so we can benefit from all the helping functions in lib/scatterlist.c for things like allocating the sg or compacting the descriptor marvel-ccic and solo6x10 drivers, that uses this api has been updated Acked-by: Marek Szyprowski m.szyprow...@samsung.com Reviewed-by: Andre Heider a.hei...@gmail.com Signed-off-by: Ricardo Ribalda Delgado ricardo.riba...@gmail.com --- drivers/media/platform/marvell-ccic/mcam-core.c| 14 +-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 103 drivers/staging/media/solo6x10/solo6x10-v4l2-enc.c | 20 ++-- include/media/videobuf2-dma-sg.h | 10 +- 4 files changed, 63 insertions(+), 84 deletions(-) snip diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 4999c48..2f86054 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c snip @@ -99,17 +98,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla buf-vaddr = NULL; buf-write = 0; buf-offset = 0; - buf-sg_desc.size = size; + buf-size = size; /* size is already page aligned */ - buf-sg_desc.num_pages = size PAGE_SHIFT; - - buf-sg_desc.sglist = vzalloc(buf-sg_desc.num_pages * - sizeof(*buf-sg_desc.sglist)); - if (!buf-sg_desc.sglist) - goto fail_sglist_alloc; - sg_init_table(buf-sg_desc.sglist, buf-sg_desc.num_pages); + buf-num_pages = size PAGE_SHIFT; - buf-pages = kzalloc(buf-sg_desc.num_pages * sizeof(struct page *), + buf-pages = kzalloc(buf-num_pages * sizeof(struct page *), GFP_KERNEL); if (!buf-pages) goto fail_pages_array_alloc; @@ -118,6 +111,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla if (ret) goto fail_pages_alloc; + ret = sg_alloc_table_from_pages(buf-sg_table, buf-pages, + buf-num_pages, 0, size, gfp_flags); + if (ret) + goto fail_table_alloc; + buf-handler.refcount = buf-refcount; buf-handler.put = vb2_dma_sg_put; buf-handler.arg = buf; The problem here is the switch from sg_init_table to sg_alloc_table_from_pages. If the PCI hardware only accepts 32-bit DMA transfers, but it is used on a 64-bit OS with 4GB physical memory, then the kernel will allocate DMA bounce buffers for you. With sg_init_table that works fine since each page in the scatterlist maps to a bounce buffer that is also just one page, but with sg_alloc_table_from_pages the DMA bounce buffers can be multiple pages. This is in turn rounded up to the next power of 2 and allocated in the 32-bit address space. Unfortunately, due to memory fragmentation this very quickly fails with -ENOMEM. I discovered this while converting saa7134 to vb2. I think that when DMA bounce buffers are needed, then it should revert to sg_init_table. I don't know whether this bug also affects non-v4l drivers. For now at least I won't try to fix this myself as I have discovered that dma-sg doesn't work anyway for saa7134 due to a hardware limitation so I will switch to dma-contig for that driver. But at the very least I thought I should write this down so others know about this subtle problem and perhaps someone else wants to tackle this. I actually think that the solo driver is affected by this (I haven't tested it yet). And at some point we need to convert bttv and cx88 to vb2 as well, and I expect that they will hit the same problem. If someone knows a better solution than switching to sg_init_table if bounce buffers are needed, then let me know. Regards, Hans -- Ricardo Ribalda -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
On 01/03/2014 04:17 PM, Ricardo Ribalda Delgado wrote: Hello Hans Thank you very much for your mail. For what I understand sg_alloc_table_from_pages does not allocate any page or bounce buffer, it just take a set of N pages and makes a sg_table from it, on the process it finds out if page A and A+1are on the same pfn and if it is true they will share the sg. So it is a later function that produces the error. As I see it, before this patch we were reimplementing sg_alloc_table_from_pages. Which function is returning -ENOMEM? That's dma_map_sg(), which uses the scatter list constructed by sg_alloc_table_from_pages(). For x86 that ends up in lib/swiotlb.c, swiotlb_map_sg_attrs(). Regards, Hans Regards! On Fri, Jan 3, 2014 at 3:52 PM, Hans Verkuil hverk...@xs4all.nl wrote: Hi Ricardo, I've run into a problem that is caused by this patch: On 08/02/2013 04:20 PM, Ricardo Ribalda Delgado wrote: Replace the private struct vb2_dma_sg_desc with the struct sg_table so we can benefit from all the helping functions in lib/scatterlist.c for things like allocating the sg or compacting the descriptor marvel-ccic and solo6x10 drivers, that uses this api has been updated Acked-by: Marek Szyprowski m.szyprow...@samsung.com Reviewed-by: Andre Heider a.hei...@gmail.com Signed-off-by: Ricardo Ribalda Delgado ricardo.riba...@gmail.com --- drivers/media/platform/marvell-ccic/mcam-core.c| 14 +-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 103 drivers/staging/media/solo6x10/solo6x10-v4l2-enc.c | 20 ++-- include/media/videobuf2-dma-sg.h | 10 +- 4 files changed, 63 insertions(+), 84 deletions(-) snip diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 4999c48..2f86054 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c snip @@ -99,17 +98,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla buf-vaddr = NULL; buf-write = 0; buf-offset = 0; - buf-sg_desc.size = size; + buf-size = size; /* size is already page aligned */ - buf-sg_desc.num_pages = size PAGE_SHIFT; - - buf-sg_desc.sglist = vzalloc(buf-sg_desc.num_pages * - sizeof(*buf-sg_desc.sglist)); - if (!buf-sg_desc.sglist) - goto fail_sglist_alloc; - sg_init_table(buf-sg_desc.sglist, buf-sg_desc.num_pages); + buf-num_pages = size PAGE_SHIFT; - buf-pages = kzalloc(buf-sg_desc.num_pages * sizeof(struct page *), + buf-pages = kzalloc(buf-num_pages * sizeof(struct page *), GFP_KERNEL); if (!buf-pages) goto fail_pages_array_alloc; @@ -118,6 +111,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla if (ret) goto fail_pages_alloc; + ret = sg_alloc_table_from_pages(buf-sg_table, buf-pages, + buf-num_pages, 0, size, gfp_flags); + if (ret) + goto fail_table_alloc; + buf-handler.refcount = buf-refcount; buf-handler.put = vb2_dma_sg_put; buf-handler.arg = buf; The problem here is the switch from sg_init_table to sg_alloc_table_from_pages. If the PCI hardware only accepts 32-bit DMA transfers, but it is used on a 64-bit OS with 4GB physical memory, then the kernel will allocate DMA bounce buffers for you. With sg_init_table that works fine since each page in the scatterlist maps to a bounce buffer that is also just one page, but with sg_alloc_table_from_pages the DMA bounce buffers can be multiple pages. This is in turn rounded up to the next power of 2 and allocated in the 32-bit address space. Unfortunately, due to memory fragmentation this very quickly fails with -ENOMEM. I discovered this while converting saa7134 to vb2. I think that when DMA bounce buffers are needed, then it should revert to sg_init_table. I don't know whether this bug also affects non-v4l drivers. For now at least I won't try to fix this myself as I have discovered that dma-sg doesn't work anyway for saa7134 due to a hardware limitation so I will switch to dma-contig for that driver. But at the very least I thought I should write this down so others know about this subtle problem and perhaps someone else wants to tackle this. I actually think that the solo driver is affected by this (I haven't tested it yet). And at some point we need to convert bttv and cx88 to vb2 as well, and I expect that they will hit the same problem. If someone knows a better solution than switching to sg_init_table if bounce buffers are needed, then let me know. Regards, Hans -- To unsubscribe from this list: send the line unsubscribe linux-media in the body of a message to
Re: [PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Hello Hans What if we move the dma_map_sg and dma_unmap_sg to the vb2 interface, and there do something like: n_sg= dma_map_sg() if (n_sg=-ENOMEM){ split_table() //Breaks down the sg_table into monopages sg n_sg= dma_map_sg() } if (n_sg=-ENOMEM) return -ENOMEM Regards On Fri, Jan 3, 2014 at 4:36 PM, Hans Verkuil hverk...@xs4all.nl wrote: On 01/03/2014 04:17 PM, Ricardo Ribalda Delgado wrote: Hello Hans Thank you very much for your mail. For what I understand sg_alloc_table_from_pages does not allocate any page or bounce buffer, it just take a set of N pages and makes a sg_table from it, on the process it finds out if page A and A+1are on the same pfn and if it is true they will share the sg. So it is a later function that produces the error. As I see it, before this patch we were reimplementing sg_alloc_table_from_pages. Which function is returning -ENOMEM? That's dma_map_sg(), which uses the scatter list constructed by sg_alloc_table_from_pages(). For x86 that ends up in lib/swiotlb.c, swiotlb_map_sg_attrs(). Regards, Hans Regards! On Fri, Jan 3, 2014 at 3:52 PM, Hans Verkuil hverk...@xs4all.nl wrote: Hi Ricardo, I've run into a problem that is caused by this patch: On 08/02/2013 04:20 PM, Ricardo Ribalda Delgado wrote: Replace the private struct vb2_dma_sg_desc with the struct sg_table so we can benefit from all the helping functions in lib/scatterlist.c for things like allocating the sg or compacting the descriptor marvel-ccic and solo6x10 drivers, that uses this api has been updated Acked-by: Marek Szyprowski m.szyprow...@samsung.com Reviewed-by: Andre Heider a.hei...@gmail.com Signed-off-by: Ricardo Ribalda Delgado ricardo.riba...@gmail.com --- drivers/media/platform/marvell-ccic/mcam-core.c| 14 +-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 103 drivers/staging/media/solo6x10/solo6x10-v4l2-enc.c | 20 ++-- include/media/videobuf2-dma-sg.h | 10 +- 4 files changed, 63 insertions(+), 84 deletions(-) snip diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 4999c48..2f86054 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c snip @@ -99,17 +98,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla buf-vaddr = NULL; buf-write = 0; buf-offset = 0; - buf-sg_desc.size = size; + buf-size = size; /* size is already page aligned */ - buf-sg_desc.num_pages = size PAGE_SHIFT; - - buf-sg_desc.sglist = vzalloc(buf-sg_desc.num_pages * - sizeof(*buf-sg_desc.sglist)); - if (!buf-sg_desc.sglist) - goto fail_sglist_alloc; - sg_init_table(buf-sg_desc.sglist, buf-sg_desc.num_pages); + buf-num_pages = size PAGE_SHIFT; - buf-pages = kzalloc(buf-sg_desc.num_pages * sizeof(struct page *), + buf-pages = kzalloc(buf-num_pages * sizeof(struct page *), GFP_KERNEL); if (!buf-pages) goto fail_pages_array_alloc; @@ -118,6 +111,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla if (ret) goto fail_pages_alloc; + ret = sg_alloc_table_from_pages(buf-sg_table, buf-pages, + buf-num_pages, 0, size, gfp_flags); + if (ret) + goto fail_table_alloc; + buf-handler.refcount = buf-refcount; buf-handler.put = vb2_dma_sg_put; buf-handler.arg = buf; The problem here is the switch from sg_init_table to sg_alloc_table_from_pages. If the PCI hardware only accepts 32-bit DMA transfers, but it is used on a 64-bit OS with 4GB physical memory, then the kernel will allocate DMA bounce buffers for you. With sg_init_table that works fine since each page in the scatterlist maps to a bounce buffer that is also just one page, but with sg_alloc_table_from_pages the DMA bounce buffers can be multiple pages. This is in turn rounded up to the next power of 2 and allocated in the 32-bit address space. Unfortunately, due to memory fragmentation this very quickly fails with -ENOMEM. I discovered this while converting saa7134 to vb2. I think that when DMA bounce buffers are needed, then it should revert to sg_init_table. I don't know whether this bug also affects non-v4l drivers. For now at least I won't try to fix this myself as I have discovered that dma-sg doesn't work anyway for saa7134 due to a hardware limitation so I will switch to dma-contig for that driver. But at the very least I thought I should write this down so others know about this subtle problem and perhaps someone else wants to tackle this. I actually think that the solo driver is affected by this (I haven't tested it yet). And at some point we need to convert bttv and
[PATCH v4 2/2] videobuf2-dma-sg: Replace vb2_dma_sg_desc with sg_table
Replace the private struct vb2_dma_sg_desc with the struct sg_table so we can benefit from all the helping functions in lib/scatterlist.c for things like allocating the sg or compacting the descriptor marvel-ccic and solo6x10 drivers, that uses this api has been updated Acked-by: Marek Szyprowski m.szyprow...@samsung.com Reviewed-by: Andre Heider a.hei...@gmail.com Signed-off-by: Ricardo Ribalda Delgado ricardo.riba...@gmail.com --- drivers/media/platform/marvell-ccic/mcam-core.c| 14 +-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 103 drivers/staging/media/solo6x10/solo6x10-v4l2-enc.c | 20 ++-- include/media/videobuf2-dma-sg.h | 10 +- 4 files changed, 63 insertions(+), 84 deletions(-) diff --git a/drivers/media/platform/marvell-ccic/mcam-core.c b/drivers/media/platform/marvell-ccic/mcam-core.c index 64ab91e..0ac51bd 100644 --- a/drivers/media/platform/marvell-ccic/mcam-core.c +++ b/drivers/media/platform/marvell-ccic/mcam-core.c @@ -1040,16 +1040,16 @@ static int mcam_vb_sg_buf_prepare(struct vb2_buffer *vb) { struct mcam_vb_buffer *mvb = vb_to_mvb(vb); struct mcam_camera *cam = vb2_get_drv_priv(vb-vb2_queue); - struct vb2_dma_sg_desc *sgd = vb2_dma_sg_plane_desc(vb, 0); + struct sg_table *sg_table = vb2_dma_sg_plane_desc(vb, 0); struct mcam_dma_desc *desc = mvb-dma_desc; struct scatterlist *sg; int i; - mvb-dma_desc_nent = dma_map_sg(cam-dev, sgd-sglist, sgd-num_pages, - DMA_FROM_DEVICE); + mvb-dma_desc_nent = dma_map_sg(cam-dev, sg_table-sgl, + sg_table-nents, DMA_FROM_DEVICE); if (mvb-dma_desc_nent = 0) return -EIO; /* Not sure what's right here */ - for_each_sg(sgd-sglist, sg, mvb-dma_desc_nent, i) { + for_each_sg(sg_table-sgl, sg, mvb-dma_desc_nent, i) { desc-dma_addr = sg_dma_address(sg); desc-segment_len = sg_dma_len(sg); desc++; @@ -1060,9 +1060,11 @@ static int mcam_vb_sg_buf_prepare(struct vb2_buffer *vb) static int mcam_vb_sg_buf_finish(struct vb2_buffer *vb) { struct mcam_camera *cam = vb2_get_drv_priv(vb-vb2_queue); - struct vb2_dma_sg_desc *sgd = vb2_dma_sg_plane_desc(vb, 0); + struct sg_table *sg_table = vb2_dma_sg_plane_desc(vb, 0); - dma_unmap_sg(cam-dev, sgd-sglist, sgd-num_pages, DMA_FROM_DEVICE); + if (sg_table) + dma_unmap_sg(cam-dev, sg_table-sgl, + sg_table-nents, DMA_FROM_DEVICE); return 0; } diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 4999c48..2f86054 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c @@ -35,7 +35,9 @@ struct vb2_dma_sg_buf { struct page **pages; int write; int offset; - struct vb2_dma_sg_desc sg_desc; + struct sg_table sg_table; + size_t size; + unsigned intnum_pages; atomic_trefcount; struct vb2_vmarea_handler handler; }; @@ -46,7 +48,7 @@ static int vb2_dma_sg_alloc_compacted(struct vb2_dma_sg_buf *buf, gfp_t gfp_flags) { unsigned int last_page = 0; - int size = buf-sg_desc.size; + int size = buf-size; while (size 0) { struct page *pages; @@ -74,12 +76,8 @@ static int vb2_dma_sg_alloc_compacted(struct vb2_dma_sg_buf *buf, } split_page(pages, order); - for (i = 0; i (1 order); i++) { - buf-pages[last_page] = pages[i]; - sg_set_page(buf-sg_desc.sglist[last_page], - buf-pages[last_page], PAGE_SIZE, 0); - last_page++; - } + for (i = 0; i (1 order); i++) + buf-pages[last_page++] = pages[i]; size -= PAGE_SIZE order; } @@ -91,6 +89,7 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla { struct vb2_dma_sg_buf *buf; int ret; + int num_pages; buf = kzalloc(sizeof *buf, GFP_KERNEL); if (!buf) @@ -99,17 +98,11 @@ static void *vb2_dma_sg_alloc(void *alloc_ctx, unsigned long size, gfp_t gfp_fla buf-vaddr = NULL; buf-write = 0; buf-offset = 0; - buf-sg_desc.size = size; + buf-size = size; /* size is already page aligned */ - buf-sg_desc.num_pages = size PAGE_SHIFT; - - buf-sg_desc.sglist = vzalloc(buf-sg_desc.num_pages * - sizeof(*buf-sg_desc.sglist)); - if (!buf-sg_desc.sglist) - goto