Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-11 Thread Christophe Leroy


On 09/03/2023 at 19:02, Hari Bathini wrote:
> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce function patch_instructions()
> that patches multiple instructions at one go while setting up the pte,
> clearing the pte and flushing the tlb only once per page range of
> instructions. Observed ~5X improvement in speed of execution using
> patch_instructions() over patch_instruction(), when more instructions
> are to be patched.
> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/include/asm/code-patching.h |   1 +
>   arch/powerpc/lib/code-patching.c | 151 ---
>   2 files changed, 106 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
> index 3f881548fb61..059fc4fe700e 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
>   int patch_branch(u32 *addr, unsigned long target, int flags);
>   int patch_instruction(u32 *addr, ppc_inst_t instr);
>   int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
> +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len);
>   
>   static inline unsigned long patch_site_addr(s32 *site)
>   {
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index b00112d7ad46..33857b9b53de 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr)
>   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>   }
>   
> -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, size_t len)

Some time ago we did a lot of work to clean up the use of u32 as code 
versus ppc_inst_t.

Please carefully audit all the places where you use u32 instead of 
ppc_inst_t. If you deliberately keep u32, then stop using the word 
'instruction' in the function name.
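
For illustration only, a minimal sketch (not from the patch; 'start', 
'end' and consume() are hypothetical) of why the distinction matters: 
on ISA v3.1, prefixed instructions are 8 bytes wide, so a walker over a 
bare u32 * buffer must still advance via ppc_inst_read()/ppc_inst_len():

	u32 *p = start;				/* hypothetical bounds */

	while (p < end) {
		ppc_inst_t inst = ppc_inst_read(p);	/* decodes prefixed forms too */

		consume(inst);				/* hypothetical consumer */
		p = (void *)p + ppc_inst_len(inst);	/* advance by 4 or 8 bytes */
	}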

>   {
> - int err;
> - u32 *patch_addr;
> - unsigned long text_poke_addr;
> - pte_t *pte;
> - unsigned long pfn = get_patch_pfn(addr);
> - struct mm_struct *patching_mm;
> - struct mm_struct *orig_mm;
> + struct mm_struct *patching_mm, *orig_mm;

This change is cosmetic and not functionally linked to the patch.

> + unsigned long text_poke_addr, pfn;
> + u32 *patch_addr, *end, *pend;
> + ppc_inst_t instr;
>   spinlock_t *ptl;
> + int ilen, err;
> + pte_t *pte;

Why move this declaration?

>   
>   patching_mm = __this_cpu_read(cpu_patching_context.mm);
>   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
> - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
>   
> pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
>   if (!pte)
>   return -ENOMEM;
>   
> - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
> + end = (void *)addr + len;
> + do {
> + pfn = get_patch_pfn(addr);
> + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
>   
> - /* order PTE update before use, also serves as the hwsync */
> - asm volatile("ptesync": : :"memory");
> -
> - /* order context switch after arbitrary prior code */
> - isync();
> -
> - orig_mm = start_using_temp_mm(patching_mm);
> -
> - err = __patch_instruction(addr, instr, patch_addr);
> + /* order PTE update before use, also serves as the hwsync */
> + asm volatile("ptesync": : :"memory");
>   
> - /* hwsync performed by __patch_instruction (sync) if successful */
> - if (err)
> - mb();  /* sync */
> + /* order context switch after arbitrary prior code */
> + isync();
> +
> + orig_mm = start_using_temp_mm(patching_mm);
> +
> + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr);
> + if (end < pend)
> + pend = end;
> +
> + while (addr < pend) {
> + instr = ppc_inst_read(code);
> + ilen = ppc_inst_len(instr);
> + err = __patch_instruction(addr, instr, patch_addr);
> + /* hwsync performed by __patch_instruction (sync) if successful */

As you aim at optimising the loops, you should move the cache flushes 
outside this loop, using flush_dcache_range() and invalidate_dcache_range().
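
A rough sketch of that shape (untested; it reuses the function's locals, 
and error handling and prefixed instructions are ignored for brevity):

	unsigned long start = (unsigned long)patch_addr;

	while (addr < pend) {
		*patch_addr++ = *code;	/* plain store, no per-word dcbst/icbi */
		addr++;
		if (!fill_inst)
			code++;
	}
	/* flush the whole written range once, outside the loop */
	flush_dcache_range(start, (unsigned long)patch_addr);
	/* icache maintenance for the real text range is still needed */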

> + if (err) {
> + mb();  /* sync */
> + break;
> +

Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-10 Thread Christophe Leroy


On 10/03/2023 at 19:26, Christophe Leroy wrote:
> 
> 
> On 09/03/2023 at 19:02, Hari Bathini wrote:
>> patch_instruction() entails setting up pte, patching the instruction,
>> clearing the pte and flushing the tlb. If multiple instructions need
>> to be patched, every instruction would have to go through the above
>> drill unnecessarily. Instead, introduce function patch_instructions()
>> that patches multiple instructions at one go while setting up the pte,
>> clearing the pte and flushing the tlb only once per page range of
>> instructions. Observed ~5X improvement in speed of execution using
>> patch_instructions() over patch_instruction(), when more instructions
>> are to be patched.
> 
> I get a 13% degradation on the time needed to activate ftrace on a 
> powerpc 8xx.
> 
> Before your patch, activating ftrace takes 550k timebase ticks. After 
> your patch it takes 620k timebase ticks.
> 

More details about the problem:

Before your patch, patch_instruction() is a simple, stackless function 
(note that the first branch is nop'ed out after startup).

0254 <patch_instruction>:
  254:  48 00 00 6c b   2c0 <patch_instruction+0x6c>
  258:  7c e0 00 a6 mfmsr   r7
  25c:  7c 51 13 a6 mtspr   81,r2
  260:  3d 40 00 00 lis r10,0
262: R_PPC_ADDR16_HA .data
  264:  39 4a 00 00 addi    r10,r10,0
266: R_PPC_ADDR16_LO .data
  268:  7c 69 1b 78 mr  r9,r3
  26c:  3d 29 40 00 addis   r9,r9,16384
  270:  81 0a 00 08 lwz r8,8(r10)
  274:  55 29 00 26 rlwinm  r9,r9,0,0,19
  278:  81 4a 00 04 lwz r10,4(r10)
  27c:  61 29 01 25 ori r9,r9,293
  280:  91 28 00 00 stw r9,0(r8)
  284:  55 49 00 26 rlwinm  r9,r10,0,0,19
  288:  50 6a 05 3e rlwimi  r10,r3,0,20,31
  28c:  90 8a 00 00 stw r4,0(r10)
  290:  7c 00 50 6c dcbst   0,r10
  294:  7c 00 04 ac hwsync
  298:  7c 00 1f ac icbi    0,r3
  29c:  7c 00 04 ac hwsync
  2a0:  4c 00 01 2c isync
  2a4:  38 60 00 00 li  r3,0
  2a8:  39 40 00 00 li  r10,0
  2ac:  91 48 00 00 stw r10,0(r8)
  2b0:  7c 00 4a 64 tlbie   r9,r0
  2b4:  7c 00 04 ac hwsync
  2b8:  7c e0 01 24 mtmsr   r7
  2bc:  4e 80 00 20 blr

  2c0:  90 83 00 00 stw r4,0(r3)
  2c4:  7c 00 18 6c dcbst   0,r3
  2c8:  7c 00 04 ac hwsync
  2cc:  7c 00 1f ac icbi    0,r3
  2d0:  7c 00 04 ac hwsync
  2d4:  4c 00 01 2c isync
  2d8:  38 60 00 00 li  r3,0
  2dc:  4e 80 00 20 blr
  2e0:  38 60 ff ff li  r3,-1
  2e4:  4b ff ff c4 b   2a8 <patch_instruction+0x54>
  2e8:  38 60 ff ff li  r3,-1
  2ec:  4e 80 00 20 blr


Once your patch is there, patch_instruction() becomes a function that 
has to set up a stack frame in order to call __do_patch_instructions(). 
And __do_patch_instructions() is quite a big function.

022c <__do_patch_instructions>:
  22c:  3d 20 00 00 lis r9,0
22e: R_PPC_ADDR16_HA .data
  230:  39 29 00 00 addi    r9,r9,0
232: R_PPC_ADDR16_LO .data
  234:  81 69 00 04 lwz r11,4(r9)
  238:  2c 05 00 00 cmpwi   r5,0
  23c:  81 89 00 08 lwz r12,8(r9)
  240:  7c c3 32 14 add r6,r3,r6
  244:  55 6b 00 26 rlwinm  r11,r11,0,0,19
  248:  38 00 00 00 li  r0,0
  24c:  54 6a 05 3e clrlwi  r10,r3,20
  250:  21 0a 10 00 subfic  r8,r10,4096
  254:  7d 03 42 14 add r8,r3,r8
  258:  7c 69 1b 78 mr  r9,r3
  25c:  7f 88 30 40 cmplw   cr7,r8,r6
  260:  3d 29 40 00 addis   r9,r9,16384
  264:  55 29 00 26 rlwinm  r9,r9,0,0,19
  268:  61 29 01 25 ori r9,r9,293
  26c:  91 2c 00 00 stw r9,0(r12)
  270:  7d 4a 5b 78 or  r10,r10,r11
  274:  40 9d 00 08 ble cr7,27c <__do_patch_instructions+0x50>
  278:  7c c8 33 78 mr  r8,r6
  27c:  7f 83 40 40 cmplw   cr7,r3,r8
  280:  40 9c 01 2c bge cr7,3ac <__do_patch_instructions+0x180>
  284:  7c 69 18 f8 not r9,r3
  288:  7d 28 4a 14 add r9,r8,r9
  28c:  55 29 f7 fe rlwinm  r9,r9,30,31,31
  290:  7c e3 50 50 subf    r7,r3,r10
  294:  80 a4 00 00 lwz r5,0(r4)
  298:  90 aa 00 00 stw r5,0(r10)
  29c:  7c 00 50 6c dcbst   0,r10
  2a0:  7c 00 04 ac hwsync
  2a4:  7c 00 1f ac icbi    0,r3
  2a8:  7c 00 04 ac hwsync
  2ac:  4c 00 01 2c isync
  2b0:  38 63 00 04 addi    r3,r3,4
  2b4:  40 82 00 08 bne 2bc <__do_patch_instructions+0x90>
  2b8:  38 84 00 04 addi    r4,r4,4
  2bc:  7f 83 40 40 cmplw   cr7,r3,r8
  2c0:  40 9c 00 a4 bge cr7,364 <__do_patch_instructions+0x138>
  2c4:  2f 89 00 00 cmpwi   cr7,r9,0
  2c8:  41 9e 00 38 beq cr7,300 <__do_patch_instructions+0xd4>
  2cc:  7d 23 3a 14 add r9,r3,r7
  2d0:  81 44 00 00 lwz r10,0(r4)
  2d4:  91 49 00 00 stw r10,0(r9)
  2d8:  7c 00 48 6c dcbst   0,r9
  2dc:  7c 00 04 ac hwsync
  2e0:  7c 00 1f ac icbi    0,r3
  2e4:  7c 00 04 ac hwsync

Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-10 Thread Christophe Leroy


On 09/03/2023 at 19:02, Hari Bathini wrote:
> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce function patch_instructions()
> that patches multiple instructions at one go while setting up the pte,
> clearing the pte and flushing the tlb only once per page range of
> instructions. Observed ~5X improvement in speed of execution using
> patch_instructions() over patch_instruction(), when more instructions
> are to be patched.

I get a 13% degradation on the time needed to activate ftrace on a 
powerpc 8xx.

Before your patch, activating ftrace takes 550k timebase ticks. After 
your patch it takes 620k timebase ticks.
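
For reference, such numbers can be taken from the timebase register; a 
minimal sketch of the measurement (the measured action is left 
hypothetical):

	u64 t0, t1;

	t0 = get_tb();			/* read the timebase */
	/* ... activate ftrace here ... */
	t1 = get_tb();
	pr_info("ftrace activation: %llu timebase ticks\n", t1 - t0);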

Christophe

> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/include/asm/code-patching.h |   1 +
>   arch/powerpc/lib/code-patching.c | 151 ---
>   2 files changed, 106 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
> index 3f881548fb61..059fc4fe700e 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
>   int patch_branch(u32 *addr, unsigned long target, int flags);
>   int patch_instruction(u32 *addr, ppc_inst_t instr);
>   int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
> +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len);
>   
>   static inline unsigned long patch_site_addr(s32 *site)
>   {
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index b00112d7ad46..33857b9b53de 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr)
>   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>   }
>   
> -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, size_t len)
>   {
> - int err;
> - u32 *patch_addr;
> - unsigned long text_poke_addr;
> - pte_t *pte;
> - unsigned long pfn = get_patch_pfn(addr);
> - struct mm_struct *patching_mm;
> - struct mm_struct *orig_mm;
> + struct mm_struct *patching_mm, *orig_mm;
> + unsigned long text_poke_addr, pfn;
> + u32 *patch_addr, *end, *pend;
> + ppc_inst_t instr;
>   spinlock_t *ptl;
> + int ilen, err;
> + pte_t *pte;
>   
>   patching_mm = __this_cpu_read(cpu_patching_context.mm);
>   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
> - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
>   
> pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
>   if (!pte)
>   return -ENOMEM;
>   
> - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
> + end = (void *)addr + len;
> + do {
> + pfn = get_patch_pfn(addr);
> + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
>   
> - /* order PTE update before use, also serves as the hwsync */
> - asm volatile("ptesync": : :"memory");
> -
> - /* order context switch after arbitrary prior code */
> - isync();
> -
> - orig_mm = start_using_temp_mm(patching_mm);
> -
> - err = __patch_instruction(addr, instr, patch_addr);
> + /* order PTE update before use, also serves as the hwsync */
> + asm volatile("ptesync": : :"memory");
>   
> - /* hwsync performed by __patch_instruction (sync) if successful */
> - if (err)
> - mb();  /* sync */
> + /* order context switch after arbitrary prior code */
> + isync();
> +
> + orig_mm = start_using_temp_mm(patching_mm);
> +
> + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr);
> + if (end < pend)
> + pend = end;
> +
> + while (addr < pend) {
> + instr = ppc_inst_read(code);
> + ilen = ppc_inst_len(instr);
> + err = __patch_instruction(addr, instr, patch_addr);
> + /* hwsync performed by __patch_instruction (sync) if successful */
> + if (err) {
> + mb();  /* sync */
> + break;
> + }
> +
> + patch_addr = (void *)patch_addr + ilen;
> + addr = (void *)addr + ilen;
> + if (!fill_inst)
> + code = (void *)code + ilen;
> + }
>   
> - /* context 

[PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-09 Thread Hari Bathini
patch_instruction() entails setting up pte, patching the instruction,
clearing the pte and flushing the tlb. If multiple instructions need
to be patched, every instruction would have to go through the above
drill unnecessarily. Instead, introduce function patch_instructions()
that patches multiple instructions at one go while setting up the pte,
clearing the pte and flushing the tlb only once per page range of
instructions. Observed ~5X improvement in speed of execution using
patch_instructions() over patch_instruction(), when more instructions
are to be patched.
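
[Not part of the patch, but for illustration: a hedged sketch of a 
caller using the new API to fill a region with trap instructions, with 
the fill_inst semantics as described above; 'dst' and 'len' are 
hypothetical.]

	u32 insn = PPC_RAW_TRAP();	/* single 32-bit fill pattern */
	int err;

	/* fill_inst = true repeats one instruction across 'len' bytes */
	err = patch_instructions(dst, &insn, true, len);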

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c | 151 ---
 2 files changed, 106 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..059fc4fe700e 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
 int patch_branch(u32 *addr, unsigned long target, int flags);
 int patch_instruction(u32 *addr, ppc_inst_t instr);
 int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len);
 
 static inline unsigned long patch_site_addr(s32 *site)
 {
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad46..33857b9b53de 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr)
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 }
 
-static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
+static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, size_t len)
 {
-   int err;
-   u32 *patch_addr;
-   unsigned long text_poke_addr;
-   pte_t *pte;
-   unsigned long pfn = get_patch_pfn(addr);
-   struct mm_struct *patching_mm;
-   struct mm_struct *orig_mm;
+   struct mm_struct *patching_mm, *orig_mm;
+   unsigned long text_poke_addr, pfn;
+   u32 *patch_addr, *end, *pend;
+   ppc_inst_t instr;
spinlock_t *ptl;
+   int ilen, err;
+   pte_t *pte;
 
patching_mm = __this_cpu_read(cpu_patching_context.mm);
text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
-   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
 
	pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
if (!pte)
return -ENOMEM;
 
-   __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
+   end = (void *)addr + len;
+   do {
+   pfn = get_patch_pfn(addr);
+   __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0);
 
-   /* order PTE update before use, also serves as the hwsync */
-   asm volatile("ptesync": : :"memory");
-
-   /* order context switch after arbitrary prior code */
-   isync();
-
-   orig_mm = start_using_temp_mm(patching_mm);
-
-   err = __patch_instruction(addr, instr, patch_addr);
+   /* order PTE update before use, also serves as the hwsync */
+   asm volatile("ptesync": : :"memory");
 
-   /* hwsync performed by __patch_instruction (sync) if successful */
-   if (err)
-   mb();  /* sync */
+   /* order context switch after arbitrary prior code */
+   isync();
+
+   orig_mm = start_using_temp_mm(patching_mm);
+
+   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+   pend = (void *)addr + PAGE_SIZE - offset_in_page(addr);
+   if (end < pend)
+   pend = end;
+
+   while (addr < pend) {
+   instr = ppc_inst_read(code);
+   ilen = ppc_inst_len(instr);
+   err = __patch_instruction(addr, instr, patch_addr);
+   /* hwsync performed by __patch_instruction (sync) if successful */
+   if (err) {
+   mb();  /* sync */
+   break;
+   }
+
+   patch_addr = (void *)patch_addr + ilen;
+   addr = (void *)addr + ilen;
+   if (!fill_inst)
+   code = (void *)code + ilen;
+   }
 
-   /* context synchronisation performed by __patch_instruction (isync or exception) */
-   stop_using_temp_mm(patching_mm, orig_mm);
+   /* context synchronisation performed by __patch_instruction (isync or exception) */
+   stop_using_temp_mm(patching_mm, orig_mm);
 
-   pte_clear(patching_mm, text_poke_addr, pte);
-   /*
-* ptesync to order PTE update before TLB 
