From: Tomer Tayar <tta...@habana.ai>

In some cases the calculated number of required entries for the dma-buf
SG table is wrong. For example, if the page size is larger than both the
dma max segment size of the importer device and from the exported side,
or if the exported size is part of a phys_pg_pack that is composed of
several pages.
In these cases, redundant entries will be added to the SG table.

Modify the method that the number of entries is calculated, and the way
they are prepared.

Signed-off-by: Tomer Tayar <tta...@habana.ai>
Reviewed-by: Oded Gabbay <ogab...@kernel.org>
Signed-off-by: Oded Gabbay <ogab...@kernel.org>
---
 drivers/accel/habanalabs/common/memory.c | 199 ++++++++++++-----------
 1 file changed, 104 insertions(+), 95 deletions(-)

diff --git a/drivers/accel/habanalabs/common/memory.c 
b/drivers/accel/habanalabs/common/memory.c
index d0edbe4b4210..5c1e98e73a47 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -1535,21 +1535,17 @@ static struct sg_table 
*alloc_sgt_from_device_pages(struct hl_device *hdev, u64
                                                u64 page_size, u64 
exported_size,
                                                struct device *dev, enum 
dma_data_direction dir)
 {
-       u64 chunk_size, bar_address, dma_max_seg_size, cur_size_to_export, 
cur_npages;
-       struct asic_fixed_properties *prop;
-       int rc, i, j, nents, cur_page;
+       u64 dma_max_seg_size, curr_page, size, chunk_size, left_size_to_export, 
left_size_in_page,
+               left_size_in_dma_seg, device_address, bar_address;
+       struct asic_fixed_properties *prop = &hdev->asic_prop;
        struct scatterlist *sg;
+       unsigned int nents, i;
        struct sg_table *sgt;
+       bool next_sg_entry;
+       int rc;
 
-       prop = &hdev->asic_prop;
-
-       dma_max_seg_size = dma_get_max_seg_size(dev);
-
-       /* We would like to align the max segment size to PAGE_SIZE, so the
-        * SGL will contain aligned addresses that can be easily mapped to
-        * an MMU
-        */
-       dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE);
+       /* Align max segment size to PAGE_SIZE to fit the minimal IOMMU mapping 
granularity */
+       dma_max_seg_size = ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
        if (dma_max_seg_size < PAGE_SIZE) {
                dev_err_ratelimited(hdev->dev,
                                "dma_max_seg_size %llu can't be smaller than 
PAGE_SIZE\n",
@@ -1561,120 +1557,133 @@ static struct sg_table 
*alloc_sgt_from_device_pages(struct hl_device *hdev, u64
        if (!sgt)
                return ERR_PTR(-ENOMEM);
 
-       cur_size_to_export = exported_size;
+       /* Calculate the required number of entries for the SG table */
+       curr_page = 0;
+       nents = 1;
+       left_size_to_export = exported_size;
+       left_size_in_page = page_size;
+       left_size_in_dma_seg = dma_max_seg_size;
+       next_sg_entry = false;
+
+       while (true) {
+               size = min3(left_size_to_export, left_size_in_page, 
left_size_in_dma_seg);
+               left_size_to_export -= size;
+               left_size_in_page -= size;
+               left_size_in_dma_seg -= size;
+
+               if (!left_size_to_export)
+                       break;
 
-       /* If the size of each page is larger than the dma max segment size,
-        * then we can't combine pages and the number of entries in the SGL
-        * will just be the
-        * <number of pages> * <chunks of max segment size in each page>
-        */
-       if (page_size > dma_max_seg_size) {
-               /* we should limit number of pages according to the exported 
size */
-               cur_npages = DIV_ROUND_UP_SECTOR_T(cur_size_to_export, 
page_size);
-               nents = cur_npages * DIV_ROUND_UP_SECTOR_T(page_size, 
dma_max_seg_size);
-       } else {
-               cur_npages = npages;
-
-               /* Get number of non-contiguous chunks */
-               for (i = 1, nents = 1, chunk_size = page_size ; i < cur_npages 
; i++) {
-                       if (pages[i - 1] + page_size != pages[i] ||
-                                       chunk_size + page_size > 
dma_max_seg_size) {
-                               nents++;
-                               chunk_size = page_size;
-                               continue;
-                       }
+               if (!left_size_in_page) {
+                       /* left_size_to_export is not zero so there must be 
another page */
+                       if (pages[curr_page] + page_size != pages[curr_page + 
1])
+                               next_sg_entry = true;
+
+                       ++curr_page;
+                       left_size_in_page = page_size;
+               }
 
-                       chunk_size += page_size;
+               if (!left_size_in_dma_seg) {
+                       next_sg_entry = true;
+                       left_size_in_dma_seg = dma_max_seg_size;
+               }
+
+               if (next_sg_entry) {
+                       ++nents;
+                       next_sg_entry = false;
                }
        }
 
        rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
        if (rc)
-               goto error_free;
-
-       cur_page = 0;
-
-       if (page_size > dma_max_seg_size) {
-               u64 size_left, cur_device_address = 0;
+               goto err_free_sgt;
 
-               size_left = page_size;
+       /* Prepare the SG table entries */
+       curr_page = 0;
+       device_address = pages[curr_page];
+       left_size_to_export = exported_size;
+       left_size_in_page = page_size;
+       left_size_in_dma_seg = dma_max_seg_size;
+       next_sg_entry = false;
 
-               /* Need to split each page into the number of chunks of
-                * dma_max_seg_size
-                */
-               for_each_sgtable_dma_sg(sgt, sg, i) {
-                       if (size_left == page_size)
-                               cur_device_address =
-                                       pages[cur_page] - 
prop->dram_base_address;
-                       else
-                               cur_device_address += dma_max_seg_size;
-
-                       /* make sure not to export over exported size */
-                       chunk_size = min3(size_left, dma_max_seg_size, 
cur_size_to_export);
-
-                       bar_address = hdev->dram_pci_bar_start + 
cur_device_address;
-
-                       rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
-                       if (rc)
-                               goto error_unmap;
+       for_each_sgtable_dma_sg(sgt, sg, i) {
+               bar_address = hdev->dram_pci_bar_start + (device_address - 
prop->dram_base_address);
+               chunk_size = 0;
+
+               for ( ; curr_page < npages ; ++curr_page) {
+                       size = min3(left_size_to_export, left_size_in_page, 
left_size_in_dma_seg);
+                       chunk_size += size;
+                       left_size_to_export -= size;
+                       left_size_in_page -= size;
+                       left_size_in_dma_seg -= size;
+
+                       if (!left_size_to_export)
+                               break;
+
+                       if (!left_size_in_page) {
+                               /* left_size_to_export is not zero so there 
must be another page */
+                               if (pages[curr_page] + page_size != 
pages[curr_page + 1]) {
+                                       device_address = pages[curr_page + 1];
+                                       next_sg_entry = true;
+                               }
+
+                               left_size_in_page = page_size;
+                       }
 
-                       cur_size_to_export -= chunk_size;
+                       if (!left_size_in_dma_seg) {
+                               /*
+                                * Skip setting a new device address if already 
moving to a page
+                                * which is not contiguous with the current 
page.
+                                */
+                               if (!next_sg_entry) {
+                                       device_address += chunk_size;
+                                       next_sg_entry = true;
+                               }
+
+                               left_size_in_dma_seg = dma_max_seg_size;
+                       }
 
-                       if (size_left > dma_max_seg_size) {
-                               size_left -= dma_max_seg_size;
-                       } else {
-                               cur_page++;
-                               size_left = page_size;
+                       if (next_sg_entry) {
+                               next_sg_entry = false;
+                               break;
                        }
                }
-       } else {
-               /* Merge pages and put them into the scatterlist */
-               for_each_sgtable_dma_sg(sgt, sg, i) {
-                       chunk_size = page_size;
-                       for (j = cur_page + 1 ; j < cur_npages ; j++) {
-                               if (pages[j - 1] + page_size != pages[j] ||
-                                               chunk_size + page_size > 
dma_max_seg_size)
-                                       break;
-
-                               chunk_size += page_size;
-                       }
-
-                       bar_address = hdev->dram_pci_bar_start +
-                                       (pages[cur_page] - 
prop->dram_base_address);
 
-                       /* make sure not to export over exported size */
-                       chunk_size = min(chunk_size, cur_size_to_export);
-                       rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
-                       if (rc)
-                               goto error_unmap;
+               rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
+               if (rc)
+                       goto err_unmap;
+       }
 
-                       cur_size_to_export -= chunk_size;
-                       cur_page = j;
-               }
+       /* There should be nothing left to export exactly after looping over 
all SG elements */
+       if (left_size_to_export) {
+               dev_err(hdev->dev,
+                       "left size to export %#llx after initializing %u SG 
elements\n",
+                       left_size_to_export, sgt->nents);
+               rc = -ENOMEM;
+               goto err_unmap;
        }
 
-       /* Because we are not going to include a CPU list we want to have some
-        * chance that other users will detect this by setting the orig_nents
-        * to 0 and using only nents (length of DMA list) when going over the
-        * sgl
+       /*
+        * Because we are not going to include a CPU list, we want to have some 
chance that other
+        * users will detect this when going over SG table, by setting the 
orig_nents to 0 and using
+        * only nents (length of DMA list).
         */
        sgt->orig_nents = 0;
 
        return sgt;
 
-error_unmap:
+err_unmap:
        for_each_sgtable_dma_sg(sgt, sg, i) {
                if (!sg_dma_len(sg))
                        continue;
 
-               dma_unmap_resource(dev, sg_dma_address(sg),
-                                       sg_dma_len(sg), dir,
+               dma_unmap_resource(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
                                        DMA_ATTR_SKIP_CPU_SYNC);
        }
 
        sg_free_table(sgt);
 
-error_free:
+err_free_sgt:
        kfree(sgt);
        return ERR_PTR(rc);
 }
-- 
2.34.1

Reply via email to