From: Richard Woodruff <r-woodru...@ti.com>

Analysis of the TI kernel with ETM showed that using the kernel's
cache-mapped flush routine instead of the SO (strongly ordered) mapped
flush in SRAM cuts the cost of clean_l2, which is used during sleep
sequences, by 65% (3.39 ms down to 1.17 ms).
Overall:
        - speed up
        - unfortunately there isn't a good alternative flush method today
        - code reduction, less maintenance, and one less potential bug in
          unmaintained code

This also fixes the reported bug in the clean_l2 function usage.

Reported-by: Tony Lindgren <t...@atomide.com>

Cc: Kevin Hilman <khil...@deeprootsystems.com>
Cc: Tony Lindgren <t...@atomide.com>

Acked-by: Santosh Shilimkar <santosh.shilim...@ti.com>
Acked-by: Jean Pihet <j-pi...@ti.com>

[...@ti.com: ported rkw's proposal to 2.6.37-rc2]
Signed-off-by: Nishanth Menon <n...@ti.com>
Signed-off-by: Richard Woodruff <r-woodru...@ti.com>
---
v3: modified the comment to note that lr is used because we are currently
    running in SRAM - this is to help developers eventually move this code
    to SDRAM. A sketch of the call pattern is included below.
v2: https://patchwork.kernel.org/patch/365222/
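
For reference, here is the call pattern from the hunk below, isolated as
a minimal sketch (the names kernel_flush and v7_flush_dcache_all are
taken from the patch; the surrounding code is assumed to have been
copied to SRAM at runtime):

	/*
	 * The suspend code runs from a copy in SRAM, so a PC-relative
	 * 'bl v7_flush_dcache_all' would resolve against the SRAM copy
	 * and miss the kernel text. The absolute address is therefore
	 * kept in a literal word next to the code and called indirectly,
	 * with the return address set by hand in lr.
	 */
	ldr	r1, kernel_flush	@ load absolute kernel-text address
	mov	lr, pc			@ lr := address of the insn after 'bx'
	bx	r1			@ call v7_flush_dcache_all, return via lr
	...
kernel_flush:
	.word	v7_flush_dcache_all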

 arch/arm/mach-omap2/sleep34xx.S |   80 +++++++--------------------------------
 1 files changed, 14 insertions(+), 66 deletions(-)

diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S
index 2fb205a..aa43da5 100644
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -520,72 +520,18 @@ clean_caches:
        cmp     r9, #1 /* Check whether L2 inval is required or not*/
        bne     skip_l2_inval
 clean_l2:
-       /* read clidr */
-       mrc     p15, 1, r0, c0, c0, 1
-       /* extract loc from clidr */
-       ands    r3, r0, #0x7000000
-       /* left align loc bit field */
-       mov     r3, r3, lsr #23
-       /* if loc is 0, then no need to clean */
-       beq     finished
-       /* start clean at cache level 0 */
-       mov     r10, #0
-loop1:
-       /* work out 3x current cache level */
-       add     r2, r10, r10, lsr #1
-       /* extract cache type bits from clidr*/
-       mov     r1, r0, lsr r2
-       /* mask of the bits for current cache only */
-       and     r1, r1, #7
-       /* see what cache we have at this level */
-       cmp     r1, #2
-       /* skip if no cache, or just i-cache */
-       blt     skip
-       /* select current cache level in cssr */
-       mcr     p15, 2, r10, c0, c0, 0
-       /* isb to sych the new cssr&csidr */
-       isb
-       /* read the new csidr */
-       mrc     p15, 1, r1, c0, c0, 0
-       /* extract the length of the cache lines */
-       and     r2, r1, #7
-       /* add 4 (line length offset) */
-       add     r2, r2, #4
-       ldr     r4, assoc_mask
-       /* find maximum number on the way size */
-       ands    r4, r4, r1, lsr #3
-       /* find bit position of way size increment */
-       clz     r5, r4
-       ldr     r7, numset_mask
-       /* extract max number of the index size*/
-       ands    r7, r7, r1, lsr #13
-loop2:
-       mov     r9, r4
-       /* create working copy of max way size*/
-loop3:
-       /* factor way and cache number into r11 */
-       orr     r11, r10, r9, lsl r5
-       /* factor index number into r11 */
-       orr     r11, r11, r7, lsl r2
-       /*clean & invalidate by set/way */
-       mcr     p15, 0, r11, c7, c10, 2
-       /* decrement the way*/
-       subs    r9, r9, #1
-       bge     loop3
-       /*decrement the index */
-       subs    r7, r7, #1
-       bge     loop2
-skip:
-       add     r10, r10, #2
-       /* increment cache number */
-       cmp     r3, r10
-       bgt     loop1
-finished:
-       /*swith back to cache level 0 */
-       mov     r10, #0
-       /* select current cache level in cssr */
-       mcr     p15, 2, r10, c0, c0, 0
-       isb
+       /*
+        * Jump out to the kernel flush routine
+        *  - reusing that code is better
+        *  - it executes in cached space, so it is faster than a refetch
+        *    per block through the SO mapping, and tracks kernel changes
+        *  - we 'might' have to copy the address, load it and jump to it
+        *  - lr is set by hand since we are currently running in SRAM
+        */
+       ldr     r1, kernel_flush        @ absolute address of the flush routine
+       mov     lr, pc                  @ set the return address by hand
+       bx      r1                      @ call v7_flush_dcache_all
+
 skip_l2_inval:
        /* Data memory barrier and Data sync barrier */
        mov     r1, #0
@@ -668,5 +614,7 @@ cache_pred_disable_mask:
        .word   0xFFFFE7FB
 control_stat:
        .word   CONTROL_STAT
+kernel_flush:
+       .word v7_flush_dcache_all
 ENTRY(omap34xx_cpu_suspend_sz)
        .word   . - omap34xx_cpu_suspend
-- 
1.6.3.3
