[PATCH 6/6] ACPI/OSL: Fix performance issue in system memory lockings.

2014-10-22 Thread Lv Zheng
It is reported that the synchronize_rcu() used in the hot path is not
performance friendly:
 <6>[3.998532] acpi_ut_update_object_reference: obj:88003c305f78 type:10
 <6>[4.006137] acpi_ut_update_ref_count:locked count:1 action:1 flags:286
 <6>[4.013450] acpi_ut_delete_internal_obj: obj:88003c305f78 flag:2
 <6>[4.020569] acpi_ut_delete_internal_obj:second_desc 
handler_desc:88003c0784c8 flags:1
 <6>[4.029729] acpi_ut_delete_internal_obj: going to setup @ 
81489299
 <6>[4.037431] acpi_ev_system_memory_region_setup: addr:c90b0070, 
length:16
*<6>[4.045716] acpi_os_unmap_memory: locked
*<6>[4.050111] acpi_os_unmap_memory: found map
*<6>[4.054798] acpi_os_unmap_memory: map dropped
*<6>[4.059678] acpi_os_unmap_memory: unlocked
*<6>[  122.761255] acpi_os_map_cleanup: after synchronize_rcu
*<6>[  122.767027] acpi_os_unmap_memory: end
*<6>[  122.771134] After unmap
*<6>[  122.773877] After ACPI_FREE
 <6>[  122.777010] acpi_ut_delete_internal_obj: after setup
 <6>[  122.782576] acpi_ut_update_object_reference: obj:88003c0784c8 type:24
 <6>[  122.790183] acpi_ut_update_ref_count:locked count:1c action:1 flags:282
 <6>[  122.797595] acpi_ut_delete_internal_obj: handler_desc removed
 <6>[  122.804034] acpi_ut_delete_internal_obj: second_desc deleted
 <6>[  122.810376] acpi_ut_delete_internal_obj: object deleted
 <6>[  122.816231] acpi_ns_detach_object: reference removed
Note: The "*" marked acpi_os_unmap_memory() instrumentation messages are
  added by the reporter.

The patch converts RCU synchronization into the deferred executed work
queue item to eliminate the consumed time from the hot path. By doing so,
acpi_os_read_memory()/acpi_os_write_memory() need to hold the reference
count and the RCU locks will then be unnecessary. The original
acpi_ioremap_lock (renamed to acpi_map_lock) must be converted into
spinlock to be used for both the IRQ and the task contexts.
But it is still needed that the map operations are serialized because the
spinlock cannot be held around ioremap(), so that the parallel map
operations can wait for each other instead of inserting duplicated entries
into the acpi_ioremaps. This is achieved by introducing another
acpi_serial_map_mutex. Note that the unmap serialization is not required
due to the reference count protected list removal, thus
acpi_os_unmap_iomem() and acpi_map_relinquish() are not protected by the
acpi_serial_map_mutex.
The test feedback from the reporter:
 "The google reboot stability test passed with your 6 patches. Thanks for
  the help."

Known issues:
1. It is better to use a seperate work queue other than the ACPICA
   dedicated GPE work queue to perform mapping relinquishment. Until a real
   case can be seen on bugzilla.kernel.org complaining this, no further
   improvements are done in this patch.

Signed-off-by: Lv Zheng 
Reported-and-tested-by: Fei Yang 
---
 drivers/acpi/mem.c |   87 ++--
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/drivers/acpi/mem.c b/drivers/acpi/mem.c
index c7c35f7..b631007 100644
--- a/drivers/acpi/mem.c
+++ b/drivers/acpi/mem.c
@@ -30,13 +30,13 @@ struct acpi_ioremap {
unsigned long refcount;
 };
 
+/* ioremap() serialization, no need to serialize iounmap() operations */
+static DEFINE_MUTEX(acpi_serial_map_mutex);
+
 static LIST_HEAD(acpi_ioremaps);
-static DEFINE_MUTEX(acpi_ioremap_lock);
+static DEFINE_SPINLOCK(acpi_map_lock);
 
-/*
- * The following functions must be called with 'acpi_ioremap_lock' or RCU
- * read lock held.
- */
+/* The following functions must be called with 'acpi_map_lock' held. */
 static inline void acpi_map_get(struct acpi_ioremap *map)
 {
map->refcount++;
@@ -45,7 +45,7 @@ static inline void acpi_map_get(struct acpi_ioremap *map)
 static inline void acpi_map_put(struct acpi_ioremap *map)
 {
if (!--map->refcount)
-   list_del_rcu(>list);
+   list_del(>list);
 }
 
 static struct acpi_ioremap *
@@ -53,7 +53,7 @@ acpi_map_lookup_phys(acpi_physical_address phys, acpi_size 
size)
 {
struct acpi_ioremap *map;
 
-   list_for_each_entry_rcu(map, _ioremaps, list)
+   list_for_each_entry(map, _ioremaps, list)
if (map->phys <= phys &&
phys + size <= map->phys + map->size)
return map;
@@ -66,7 +66,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 {
struct acpi_ioremap *map;
 
-   list_for_each_entry_rcu(map, _ioremaps, list)
+   list_for_each_entry(map, _ioremaps, list)
if (map->virt <= virt &&
virt + size <= map->virt + map->size)
return map;
@@ -80,6 +80,7 @@ acpi_map2virt(struct acpi_ioremap *map, acpi_physical_address 
phys)
return map ? map->virt + (phys - map->phys) : NULL;
 }
 
+/* The following functions must be called without 'acpi_map_lock' held. */
 #ifndef 

[PATCH 6/6] ACPI/OSL: Fix performance issue in system memory lockings.

2014-10-22 Thread Lv Zheng
It is reported that the synchronize_rcu() used in the hot path is not
performance friendly:
 6[3.998532] acpi_ut_update_object_reference: obj:88003c305f78 type:10
 6[4.006137] acpi_ut_update_ref_count:locked count:1 action:1 flags:286
 6[4.013450] acpi_ut_delete_internal_obj: obj:88003c305f78 flag:2
 6[4.020569] acpi_ut_delete_internal_obj:second_desc 
handler_desc:88003c0784c8 flags:1
 6[4.029729] acpi_ut_delete_internal_obj: going to setup @ 
81489299
 6[4.037431] acpi_ev_system_memory_region_setup: addr:c90b0070, 
length:16
*6[4.045716] acpi_os_unmap_memory: locked
*6[4.050111] acpi_os_unmap_memory: found map
*6[4.054798] acpi_os_unmap_memory: map dropped
*6[4.059678] acpi_os_unmap_memory: unlocked
*6[  122.761255] acpi_os_map_cleanup: after synchronize_rcu
*6[  122.767027] acpi_os_unmap_memory: end
*6[  122.771134] After unmap
*6[  122.773877] After ACPI_FREE
 6[  122.777010] acpi_ut_delete_internal_obj: after setup
 6[  122.782576] acpi_ut_update_object_reference: obj:88003c0784c8 type:24
 6[  122.790183] acpi_ut_update_ref_count:locked count:1c action:1 flags:282
 6[  122.797595] acpi_ut_delete_internal_obj: handler_desc removed
 6[  122.804034] acpi_ut_delete_internal_obj: second_desc deleted
 6[  122.810376] acpi_ut_delete_internal_obj: object deleted
 6[  122.816231] acpi_ns_detach_object: reference removed
Note: The * marked acpi_os_unmap_memory() instrumentation messages are
  added by the reporter.

The patch converts RCU synchronization into the deferred executed work
queue item to eliminate the consumed time from the hot path. By doing so,
acpi_os_read_memory()/acpi_os_write_memory() need to hold the reference
count and the RCU locks will then be unnecessary. The original
acpi_ioremap_lock (renamed to acpi_map_lock) must be converted into
spinlock to be used for both the IRQ and the task contexts.
But it is still needed that the map operations are serialized because the
spinlock cannot be held around ioremap(), so that the parallel map
operations can wait for each other instead of inserting duplicated entries
into the acpi_ioremaps. This is achieved by introducing another
acpi_serial_map_mutex. Note that the unmap serialization is not required
due to the reference count protected list removal, thus
acpi_os_unmap_iomem() and acpi_map_relinquish() are not protected by the
acpi_serial_map_mutex.
The test feedback from the reporter:
 The google reboot stability test passed with your 6 patches. Thanks for
  the help.

Known issues:
1. It is better to use a seperate work queue other than the ACPICA
   dedicated GPE work queue to perform mapping relinquishment. Until a real
   case can be seen on bugzilla.kernel.org complaining this, no further
   improvements are done in this patch.

Signed-off-by: Lv Zheng lv.zh...@intel.com
Reported-and-tested-by: Fei Yang fei.y...@intel.com
---
 drivers/acpi/mem.c |   87 ++--
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/drivers/acpi/mem.c b/drivers/acpi/mem.c
index c7c35f7..b631007 100644
--- a/drivers/acpi/mem.c
+++ b/drivers/acpi/mem.c
@@ -30,13 +30,13 @@ struct acpi_ioremap {
unsigned long refcount;
 };
 
+/* ioremap() serialization, no need to serialize iounmap() operations */
+static DEFINE_MUTEX(acpi_serial_map_mutex);
+
 static LIST_HEAD(acpi_ioremaps);
-static DEFINE_MUTEX(acpi_ioremap_lock);
+static DEFINE_SPINLOCK(acpi_map_lock);
 
-/*
- * The following functions must be called with 'acpi_ioremap_lock' or RCU
- * read lock held.
- */
+/* The following functions must be called with 'acpi_map_lock' held. */
 static inline void acpi_map_get(struct acpi_ioremap *map)
 {
map-refcount++;
@@ -45,7 +45,7 @@ static inline void acpi_map_get(struct acpi_ioremap *map)
 static inline void acpi_map_put(struct acpi_ioremap *map)
 {
if (!--map-refcount)
-   list_del_rcu(map-list);
+   list_del(map-list);
 }
 
 static struct acpi_ioremap *
@@ -53,7 +53,7 @@ acpi_map_lookup_phys(acpi_physical_address phys, acpi_size 
size)
 {
struct acpi_ioremap *map;
 
-   list_for_each_entry_rcu(map, acpi_ioremaps, list)
+   list_for_each_entry(map, acpi_ioremaps, list)
if (map-phys = phys 
phys + size = map-phys + map-size)
return map;
@@ -66,7 +66,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 {
struct acpi_ioremap *map;
 
-   list_for_each_entry_rcu(map, acpi_ioremaps, list)
+   list_for_each_entry(map, acpi_ioremaps, list)
if (map-virt = virt 
virt + size = map-virt + map-size)
return map;
@@ -80,6 +80,7 @@ acpi_map2virt(struct acpi_ioremap *map, acpi_physical_address 
phys)
return map ? map-virt + (phys - map-phys) : NULL;
 }
 
+/* The following functions must be called without 'acpi_map_lock' held. */
 #ifndef