Re: [PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-06-29 Thread Christoph Hellwig
On Mon, Jun 27, 2022 at 11:31:49AM -0400, Tianyu Lan wrote:
> +/**
> + * struct io_tlb_area - IO TLB memory area descriptor
> + *
> + * This is a single area with a single lock.
> + *
> + * @used:	The number of used IO TLB block.
> + * @index:	The slot index to start searching in this area for next round.
> + * @lock:	The lock to protect the above data structures in the map and
> + *		unmap calls.
> + */
> +struct io_tlb_area {
> + unsigned long used;
> + unsigned int index;
> + spinlock_t lock;
> +};

As already mentioned last time, please move this into swiotlb.c,
swiotlb.h only uses a pointer to this structure.

>  static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
> -		unsigned long nslabs, unsigned int flags, bool late_alloc)
> +				    unsigned long nslabs, unsigned int flags,
> +				    bool late_alloc, unsigned int nareas)

Nit: the two tab indentation for prototype continuations is a lot easier
to maintain, so don't gratuitously switch away from it.
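
For reference, two-tab continuation indentation for this prototype would
look like the following (whitespace illustrative):

static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
		unsigned long nslabs, unsigned int flags,
		bool late_alloc, unsigned int nareas)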

> +			alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT));

Overly long line here.



[PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-06-27 Thread Tianyu Lan
From: Tianyu Lan 

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significant lock contention on the swiotlb lock.

This patch splits the swiotlb bounce buffer pool into individual areas
which have their own lock. Each CPU tries to allocate in its own area
first. Only if that fails does it search other areas. On freeing, the
allocation is returned to the area from which the memory was originally
allocated.
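
A minimal sketch of the allocation path described above (illustrative only;
area_alloc() is a hypothetical helper rather than the patch's actual code,
and nareas is assumed to be a power of two, as the kernel parameter requires):

static int swiotlb_area_find_slots(struct io_tlb_mem *mem, size_t alloc_size)
{
	unsigned int start = raw_smp_processor_id() & (mem->nareas - 1);
	unsigned int i;

	for (i = 0; i < mem->nareas; i++) {
		/* The CPU's own area first, then the others round-robin. */
		unsigned int aindex = (start + i) & (mem->nareas - 1);
		int slot = area_alloc(mem, &mem->areas[aindex], alloc_size);

		if (slot != -1)
			return slot;
	}
	return -1;
}

On free, the owning area is recovered from the slot index
(slot / mem->area_nslabs), so memory always returns to the area it was
allocated from.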

The number of areas can be set via swiotlb_adjust_nareas() and the swiotlb
kernel parameter.
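
For example (illustrative values), booting with swiotlb=65536,8 requests
65536 I/O TLB slabs (128MB at the 2KB slab size) split into 8 areas, each
with its own lock.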

This idea is from Andi Kleen's patch
(https://github.com/intel/tdx/commit/4529b5784c141782c72ec9bd9a92df2b68cb7d45).

Based-on-idea-by: Andi Kleen 
Signed-off-by: Tianyu Lan 
---
 .../admin-guide/kernel-parameters.txt |   4 +-
 include/linux/swiotlb.h   |  27 +++
 kernel/dma/swiotlb.c  | 202 ++
 3 files changed, 194 insertions(+), 39 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..4a6ad177d4b8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5904,8 +5904,10 @@
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
-			Format: { <int> | force | noforce }
+			Format: { <int> [,<int>] | force | noforce }
			<int> -- Number of I/O TLB slabs
+			<int> -- Second integer after comma. Number of swiotlb
+				 areas with their own lock. Must be power of 2.
force -- force using of bounce buffers even if they
 wouldn't be automatically used by the kernel
noforce -- Never use bounce buffers (for debugging)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..7157428cf3ac 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,6 +62,22 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
 
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:  The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock:  The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+   unsigned long used;
+   unsigned int index;
+   spinlock_t lock;
+};
+
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
  *
@@ -89,6 +105,8 @@ extern enum swiotlb_force swiotlb_force;
  * @late_alloc:%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
  * @for_alloc:  %true if the pool is used for memory allocation
+ * @nareas:  The area number in the pool.
+ * @area_nslabs: The slot number in the area.
  */
 struct io_tlb_mem {
phys_addr_t start;
@@ -102,6 +120,9 @@ struct io_tlb_mem {
bool late_alloc;
bool force_bounce;
bool for_alloc;
+   unsigned int nareas;
+   unsigned int area_nslabs;
+   struct io_tlb_area *areas;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
@@ -130,6 +151,7 @@ unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
 bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
+void __init swiotlb_adjust_nareas(unsigned int nareas);
 #else
 static inline void swiotlb_init(bool addressing_limited, unsigned int flags)
 {
@@ -162,6 +184,11 @@ static inline bool is_swiotlb_active(struct device *dev)
 static inline void swiotlb_adjust_size(unsigned long size)
 {
 }
+
+static inline void swiotlb_adjust_nareas(unsigned int nareas)
+{
+}
+
 #endif /* CONFIG_SWIOTLB */
 
 extern void swiotlb_print_info(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index cb50f8d38360..17154abdfb34 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -70,6 +70,7 @@ struct io_tlb_mem io_tlb_default_mem;
 phys_addr_t swiotlb_unencrypted_base;
 
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas = 1;
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -79,6 +80,10 @@ setup_io_tlb_npages(char *str)
default_nslabs =
ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
}
+   if (*str == ',')
+   ++str;
+   if (isdigit(*str))
+   

Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Robin Murphy

On 2022-04-28 17:02, Andi Kleen wrote:


On 4/28/2022 8:07 AM, Robin Murphy wrote:

On 2022-04-28 15:55, Andi Kleen wrote:


On 4/28/2022 7:45 AM, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:
Rather than introduce this extra level of allocator complexity, how 
about
just dividing up the initial SWIOTLB allocation into multiple 
io_tlb_mem

instances?
Yeah.  We're almost done removing all knowledge of swiotlb from 
drivers,

so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


At least for TDX, we need parallelism with a single device for performance.

So if you split up the io tlb mems for a device then you would need a 
new mechanism to load balance the requests for a single device over 
those. I doubt it would be any simpler.


Eh, I think it would be, since the round-robin retry loop can then 
just sit around the existing io_tlb_mem-based allocator, vs. the churn 
of inserting it in the middle, plus it's then really easy to 
statically distribute different starting points across different 
devices via dev->dma_io_tlb_mem if we wanted to.


Admittedly the overall patch probably ends up about the same size, 
since it likely pushes a bit more complexity into swiotlb_init to 
compensate, but that's still a trade-off I like.


Unless you completely break the external API this will require a new 
mechanism to search a list of io_tlb_mems for the right area to free into.


If the memory area is not contiguous (like in the original patch) this will 
be an O(n) operation on the number of io_tlb_mems, so it would get more 
and more expensive on larger systems. Or you merge them all together (so 
that the simple address arithmetic to look up the area works again), 
which will require even more changes in the setup. Or you add hashing or 
similar, which will be even more complicated.


In the end doing it with a single io_tlb_mem is significantly simpler 
and also more natural.


Sorry if "dividing up the initial SWIOTLB allocation" somehow sounded 
like "making multiple separate SWIOTLB allocations all over the place"?


I don't see there being any *functional* difference in whether a slice 
of the overall SWIOTLB memory is represented by 
"io_tlb_default_mem->areas[i]->blah" or "io_tlb_default_mem[i]->blah", 
I'm simply advocating for not churning the already-complex allocator 
internals by pushing the new complexity out to the margins instead.


Thanks,
Robin.

Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Andi Kleen


On 4/28/2022 8:07 AM, Robin Murphy wrote:

On 2022-04-28 15:55, Andi Kleen wrote:


On 4/28/2022 7:45 AM, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:
Rather than introduce this extra level of allocator complexity, how 
about
just dividing up the initial SWIOTLB allocation into multiple 
io_tlb_mem

instances?
Yeah.  We're almost done removing all knowledge of swiotlb from 
drivers,

so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


At least for TDX, we need parallelism with a single device for performance.

So if you split up the io tlb mems for a device then you would need a 
new mechanism to load balance the requests for a single device over 
those. I doubt it would be any simpler.


Eh, I think it would be, since the round-robin retry loop can then 
just sit around the existing io_tlb_mem-based allocator, vs. the churn 
of inserting it in the middle, plus it's then really easy to 
statically distribute different starting points across different 
devices via dev->dma_io_tlb_mem if we wanted to.


Admittedly the overall patch probably ends up about the same size, 
since it likely pushes a bit more complexity into swiotlb_init to 
compensate, but that's still a trade-off I like.


Unless you completely break the external API this will require a new 
mechanism to search a list of io_tlb_mems for the right area to free into.


If the memory area is not contiguous (like in the original patch) this will 
be an O(n) operation on the number of io_tlb_mems, so it would get more 
and more expensive on larger systems. Or you merge them all together (so 
that the simple address arithmetic to look up the area works again), 
which will require even more changes in the setup. Or you add hashing or 
similar, which will be even more complicated.
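
For comparison, the address arithmetic mentioned above is a constant-time
lookup in the single-io_tlb_mem layout; roughly (field names follow the RFC
patch, the helper itself is only illustrative):

static struct io_tlb_area *io_tlb_area_of(struct io_tlb_mem *mem,
					  phys_addr_t paddr)
{
	unsigned long slot = (paddr - mem->start) >> IO_TLB_SHIFT;

	/* Assumes nslabs is an exact multiple of area_nslabs. */
	return &mem->areas[slot / mem->area_nslabs];
}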


In the end doing it with a single io_tlb_mem is significantly simpler 
and also more natural.


-Andi



Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Tianyu Lan

On 4/28/2022 10:44 PM, Robin Murphy wrote:

On 2022-04-28 15:14, Tianyu Lan wrote:

From: Tianyu Lan 

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significant lock contention on the swiotlb lock.

This patch splits the swiotlb into individual areas which have their
own lock. When there is a swiotlb map/allocate request, an io tlb
buffer is allocated from the areas evenly and the allocation is freed
back to the associated area. This prepares for resolving the overhead
of the single spinlock among a device's queues. Each device may have
its own io tlb mem and bounce buffer pool.

This idea is from Andi Kleen's patch
(https://github.com/intel/tdx/commit/4529b5784c141782c72ec9bd9a92df2b68cb7d45).
It is reworked so that it may work for an individual device's io tlb
mem. The device driver may determine the number of areas according to
the device queue number.


Rather than introduce this extra level of allocator complexity, how 
about just dividing up the initial SWIOTLB allocation into multiple 
io_tlb_mem instances?


Robin.


Agree. Thanks for suggestion. That will be more generic and will update
in the next version.

Thanks.



Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Andi Kleen



On 4/28/2022 8:05 AM, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 07:55:39AM -0700, Andi Kleen wrote:

At least for TDX need parallelism with a single device for performance.

So find a way to make it happen without exposing details to random
drivers.



That's what the original patch (that this one is derived from) did.

It was completely transparent to everyone outside swiotlb.c

-Andi



Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Robin Murphy

On 2022-04-28 15:55, Andi Kleen wrote:


On 4/28/2022 7:45 AM, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:
Rather than introduce this extra level of allocator complexity, how 
about

just dividing up the initial SWIOTLB allocation into multiple io_tlb_mem
instances?

Yeah.  We're almost done removing all knowledge of swiotlb from drivers,
so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


At least for TDX, we need parallelism with a single device for performance.

So if you split up the io tlb mems for a device then you would need a 
new mechanism to load balance the requests for a single device over those. 
I doubt it would be any simpler.


Eh, I think it would be, since the round-robin retry loop can then just 
sit around the existing io_tlb_mem-based allocator, vs. the churn of 
inserting it in the middle, plus it's then really easy to statically 
distribute different starting points across different devices via 
dev->dma_io_tlb_mem if we wanted to.
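
A rough sketch of that shape, purely illustrative (default_mems[],
nr_default_mems, dev_start_index() and find_slots_in() are placeholders,
not existing kernel symbols):

static int swiotlb_find_slots_rr(struct device *dev, phys_addr_t orig_addr,
		size_t alloc_size)
{
	unsigned int start = dev_start_index(dev);	/* per-device hint */
	unsigned int i;

	for (i = 0; i < nr_default_mems; i++) {
		struct io_tlb_mem *mem = default_mems[(start + i) % nr_default_mems];
		int slot = find_slots_in(mem, dev, orig_addr, alloc_size);

		if (slot >= 0)
			return slot;
	}
	return -1;
}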


Admittedly the overall patch probably ends up about the same size, since 
it likely pushes a bit more complexity into swiotlb_init to compensate, 
but that's still a trade-off I like.
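
And the corresponding init-time split, again only a sketch (the existing
swiotlb_init_io_tlb_mem() helper is real, everything else is a placeholder):

	/* Divide one contiguous bounce buffer among N io_tlb_mem instances. */
	unsigned long slabs_per_mem = default_nslabs / nr_default_mems;

	for (i = 0; i < nr_default_mems; i++)
		swiotlb_init_io_tlb_mem(&default_mems[i],
				tlb_start + ((phys_addr_t)i * slabs_per_mem << IO_TLB_SHIFT),
				slabs_per_mem, false);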


Thanks,
Robin.

Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Christoph Hellwig
On Thu, Apr 28, 2022 at 07:55:39AM -0700, Andi Kleen wrote:
> At least for TDX, we need parallelism with a single device for performance.

So find a way to make it happen without exposing details to random
drivers.


Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Robin Murphy

On 2022-04-28 15:45, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:

Rather than introduce this extra level of allocator complexity, how about
just dividing up the initial SWIOTLB allocation into multiple io_tlb_mem
instances?


Yeah.  We're almost done removing all knowledge of swiotlb from drivers,
so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


FWIW I'd already started thinking about having a distinct io_tlb_mem for 
non-coherent devices where vaddr is made non-cacheable to avoid the 
hassle of keeping the arch_dma_sync_* calls lined up, so I'm certainly 
in favour of bringing in a bit more flexibility at this level :)


Robin.


Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Andi Kleen



On 4/28/2022 7:45 AM, Christoph Hellwig wrote:

On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:

Rather than introduce this extra level of allocator complexity, how about
just dividing up the initial SWIOTLB allocation into multiple io_tlb_mem
instances?

Yeah.  We're almost done removing all knowledge of swiotlb from drivers,
so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


At least for TDX, we need parallelism with a single device for performance.

So if you split up the io tlb mems for a device then you would need a 
new mechanism to load balance the requests for a single device over those. 
I doubt it would be any simpler.



-Andi




Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Christoph Hellwig
On Thu, Apr 28, 2022 at 03:44:36PM +0100, Robin Murphy wrote:
> Rather than introduce this extra level of allocator complexity, how about
> just dividing up the initial SWIOTLB allocation into multiple io_tlb_mem
> instances?

Yeah.  We're almost done removing all knowledge of swiotlb from drivers,
so the very last thing I want is an interface that allows a driver to
allocate a per-device buffer.


Re: [RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Robin Murphy

On 2022-04-28 15:14, Tianyu Lan wrote:

From: Tianyu Lan 

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significant lock contention on the swiotlb lock.

This patch splits the swiotlb into individual areas which have their
own lock. When there is a swiotlb map/allocate request, an io tlb
buffer is allocated from the areas evenly and the allocation is freed
back to the associated area. This prepares for resolving the overhead
of the single spinlock among a device's queues. Each device may have
its own io tlb mem and bounce buffer pool.

This idea is from Andi Kleen's patch
(https://github.com/intel/tdx/commit/4529b5784c141782c72ec9bd9a92df2b68cb7d45).
It is reworked so that it may work for an individual device's io tlb
mem. The device driver may determine the number of areas according to
the device queue number.


Rather than introduce this extra level of allocator complexity, how 
about just dividing up the initial SWIOTLB allocation into multiple 
io_tlb_mem instances?


Robin.


Based-on-idea-by: Andi Kleen 
Signed-off-by: Tianyu Lan 
---
  include/linux/swiotlb.h |  25 ++
  kernel/dma/swiotlb.c| 173 +++-
  2 files changed, 162 insertions(+), 36 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..489c249da434 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,6 +62,24 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  #ifdef CONFIG_SWIOTLB
  extern enum swiotlb_force swiotlb_force;
  
+/**

+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:  The number of used IO TLB block.
+ * @area_index: The index of to tlb area.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock:  The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+   unsigned long used;
+   unsigned int area_index;
+   unsigned int index;
+   spinlock_t lock;
+};
+
  /**
   * struct io_tlb_mem - IO TLB Memory Pool Descriptor
   *
@@ -89,6 +107,9 @@ extern enum swiotlb_force swiotlb_force;
   * @late_alloc:   %true if allocated using the page allocator
   * @force_bounce: %true if swiotlb bouncing is forced
   * @for_alloc:  %true if the pool is used for memory allocation
+ * @num_areas:  The area number in the pool.
+ * @area_start: The area index to start searching in the next round.
+ * @area_nslabs: The slot number in the area.
   */
  struct io_tlb_mem {
phys_addr_t start;
@@ -102,6 +123,10 @@ struct io_tlb_mem {
bool late_alloc;
bool force_bounce;
bool for_alloc;
+   unsigned int num_areas;
+   unsigned int area_start;
+   unsigned int area_nslabs;
+   struct io_tlb_area *areas;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e2ef0864eb1e..00a16f540f20 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,8 @@
  
  #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
  
+#define NUM_AREAS_DEFAULT 1

+
  static bool swiotlb_force_bounce;
  static bool swiotlb_force_disable;
  
@@ -70,6 +72,25 @@ struct io_tlb_mem io_tlb_default_mem;

  phys_addr_t swiotlb_unencrypted_base;
  
  static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;

+static unsigned long default_area_num = NUM_AREAS_DEFAULT;
+
+static int swiotlb_setup_areas(struct io_tlb_mem *mem,
+   unsigned int num_areas, unsigned long nslabs)
+{
+   if (nslabs < 1 || !is_power_of_2(num_areas)) {
+   pr_err("swiotlb: Invalid areas parameter %d.\n", num_areas);
+   return -EINVAL;
+   }
+
+   /* Round up number of slabs to the next power of 2.
+* The last area is going be smaller than the rest if default_nslabs is
+* not power of two.
+*/
+   mem->area_start = 0;
+   mem->num_areas = num_areas;
+   mem->area_nslabs = nslabs / num_areas;
+   return 0;
+}
  
  static int __init

  setup_io_tlb_npages(char *str)
@@ -114,6 +135,8 @@ void __init swiotlb_adjust_size(unsigned long size)
return;
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	swiotlb_setup_areas(&io_tlb_default_mem, default_area_num,
+   default_nslabs);
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
  }
  
@@ -195,7 +218,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,

unsigned long nslabs, bool late_alloc)
  {
void *vaddr = 

[RFC PATCH 1/2] swiotlb: Split up single swiotlb lock

2022-04-28 Thread Tianyu Lan
From: Tianyu Lan 

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significant lock contention on the swiotlb lock.

This patch splits the swiotlb into individual areas which have their
own lock. When there is a swiotlb map/allocate request, an io tlb
buffer is allocated from the areas evenly and the allocation is freed
back to the associated area. This prepares for resolving the overhead
of the single spinlock among a device's queues. Each device may have
its own io tlb mem and bounce buffer pool.

This idea is from Andi Kleen's patch
(https://github.com/intel/tdx/commit/4529b5784c141782c72ec9bd9a92df2b68cb7d45).
It is reworked so that it may work for an individual device's io tlb
mem. The device driver may determine the number of areas according to
the device queue number.
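
As a concrete illustration of the split done by swiotlb_setup_areas() in
the diff below (the area count of 8 is picked arbitrarily): with the default
64MB pool, default_nslabs is 32768, so eight areas give area_nslabs = 4096,
i.e. eight independently locked 8MB regions.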

Based-on-idea-by: Andi Kleen 
Signed-off-by: Tianyu Lan 
---
 include/linux/swiotlb.h |  25 ++
 kernel/dma/swiotlb.c| 173 +++-
 2 files changed, 162 insertions(+), 36 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..489c249da434 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,6 +62,24 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
 
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:  The number of used IO TLB block.
+ * @area_index: The index of to tlb area.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock:  The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+   unsigned long used;
+   unsigned int area_index;
+   unsigned int index;
+   spinlock_t lock;
+};
+
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
  *
@@ -89,6 +107,9 @@ extern enum swiotlb_force swiotlb_force;
  * @late_alloc:%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
  * @for_alloc:  %true if the pool is used for memory allocation
+ * @num_areas:  The area number in the pool.
+ * @area_start: The area index to start searching in the next round.
+ * @area_nslabs: The slot number in the area.
  */
 struct io_tlb_mem {
phys_addr_t start;
@@ -102,6 +123,10 @@ struct io_tlb_mem {
bool late_alloc;
bool force_bounce;
bool for_alloc;
+   unsigned int num_areas;
+   unsigned int area_start;
+   unsigned int area_nslabs;
+   struct io_tlb_area *areas;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e2ef0864eb1e..00a16f540f20 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,8 @@
 
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
+#define NUM_AREAS_DEFAULT 1
+
 static bool swiotlb_force_bounce;
 static bool swiotlb_force_disable;
 
@@ -70,6 +72,25 @@ struct io_tlb_mem io_tlb_default_mem;
 phys_addr_t swiotlb_unencrypted_base;
 
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_area_num = NUM_AREAS_DEFAULT;
+
+static int swiotlb_setup_areas(struct io_tlb_mem *mem,
+   unsigned int num_areas, unsigned long nslabs)
+{
+   if (nslabs < 1 || !is_power_of_2(num_areas)) {
+   pr_err("swiotlb: Invalid areas parameter %d.\n", num_areas);
+   return -EINVAL;
+   }
+
+   /* Round up number of slabs to the next power of 2.
+* The last area is going be smaller than the rest if default_nslabs is
+* not power of two.
+*/
+   mem->area_start = 0;
+   mem->num_areas = num_areas;
+   mem->area_nslabs = nslabs / num_areas;
+   return 0;
+}
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -114,6 +135,8 @@ void __init swiotlb_adjust_size(unsigned long size)
return;
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	swiotlb_setup_areas(&io_tlb_default_mem, default_area_num,
+   default_nslabs);
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 }
 
@@ -195,7 +218,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, 
phys_addr_t start,
unsigned long nslabs, bool late_alloc)
 {
void *vaddr = phys_to_virt(start);
-   unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+   unsigned long bytes = nslabs << IO_TLB_SHIFT, i, j;
+   unsigned int block_list;
 
mem->nslabs = nslabs;
mem->start = start;
@@ -206,8 +230,13