[PATCH v3] powerpc32: rearrange instructions order in ip_fast_csum()

2015-08-05 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes
2 cycles. On some other powerpc, lwz has 3 cycles.

As the size of the header is minimum 5 words, we can unroll the loop
for the first words to reduce number of branching, and we can re-order
the instructions to limit loading latency.

Signed-off-by: Christophe Leroy 
---
 v3: Only use lwzu for the last load as lwzu has undocumented
additional latency

 arch/powerpc/lib/checksum_32.S | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c48ee0..3ef6e3f 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,14 +26,17 @@
  */
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
-   lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   lwz     r5,4(r3)
+   lwz     r6,8(r3)
+   lwzu    r7,12(r3)
+   addi    r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
-   blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   adde    r0,r0,r6
+1: adde    r0,r0,r7
+   lwzu    r7,4(r3)
    bdnz    1b
+   adde    r0,r0,r7
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3] powerpc32: rearrange instructions order in ip_fast_csum()

2015-08-05 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes
2 cycles. On some other powerpc, lwz has 3 cycles.

As the size of the header is minimum 5 words, we can unroll the loop
for the first words to reduce number of branching, and we can re-order
the instructions to limit loading latency.

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 v3: Only use lwzu for the last load as lwzu has undocumented
additional latency

 arch/powerpc/lib/checksum_32.S | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c48ee0..3ef6e3f 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,14 +26,17 @@
  */
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
-   lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   lwz     r5,4(r3)
+   lwz     r6,8(r3)
+   lwzu    r7,12(r3)
+   addi    r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
-   blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   adde    r0,r0,r6
+1: adde    r0,r0,r7
+   lwzu    r7,4(r3)
    bdnz    1b
+   adde    r0,r0,r7
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] powerpc32: rearrange instructions order in ip_fast_csum()

2015-06-30 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes
2 cycles. On some other powerpc, lwz has 3 cycles.

As the size of the header is minimum 5 words, we can unroll the loop
for the first words to reduce number of branching, and we can re-order
the instructions to limit loading latency.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/checksum_32.S | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c48ee0..6fbadfe 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -27,13 +27,16 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   lwzu    r6,4(r3)
+   lwzu    r7,4(r3)
+   addi    r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
-   blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   adde    r0,r0,r6
+1: adde    r0,r0,r7
+   lwzu    r7,4(r3)
    bdnz    1b
+   adde    r0,r0,r7
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] powerpc32: rearrange instructions order in ip_fast_csum()

2015-06-30 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes
2 cycles. On some other powerpc, lwz has 3 cycles.

As the size of the header is minimum 5 words, we can unroll the loop
for the first words to reduce number of branching, and we can re-order
the instructions to limit loading latency.

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 arch/powerpc/lib/checksum_32.S | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c48ee0..6fbadfe 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -27,13 +27,16 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   lwzu    r6,4(r3)
+   lwzu    r7,4(r3)
+   addi    r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
-   blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   adde    r0,r0,r6
+1: adde    r0,r0,r7
+   lwzu    r7,4(r3)
    bdnz    1b
+   adde    r0,r0,r7
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc32: rearrange instructions order in ip_fast_csum()

2015-04-28 Thread christophe leroy



Le 25/03/2015 02:22, Scott Wood a écrit :

On Tue, Feb 03, 2015 at 12:39:27PM +0100, LEROY Christophe wrote:

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/lib/checksum_32.S | 10 +++++++---
  1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
  _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)

The blelr is pointless since len is guaranteed to be >= 5 (assuming that
comment is accurate), but now it's both pointless and in the wrong place,
since you haven't yet finished the four words that you subtracted from
r4.
The blelr is just there to protect the function against a negative value
of r4, and hence of ctr.
In any case, the returned result in that case is not correct, as we do
not touch r3.


How about keeping the blelr, without the -, moving it after the initial
words, and changing the number of initial words to 5?

We can't just do blelr, we would need to fold the result first.
But indeed, this would be useless because I quickly checked and it seems
that all functions calling ip_fast_csum() check that the length is not
lower than 5.
So I will just remove the blelr.

Also maybe do all
the loads up front, since many PPC chips have a three cycle load latency
rather than two.

ok

Christophe

---
L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel 
antivirus Avast.
http://www.avast.com

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc32: rearrange instructions order in ip_fast_csum()

2015-04-28 Thread christophe leroy



Le 25/03/2015 02:22, Scott Wood a écrit :

On Tue, Feb 03, 2015 at 12:39:27PM +0100, LEROY Christophe wrote:

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
  arch/powerpc/lib/checksum_32.S | 10 +++++++---
  1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
  _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)

The blelr is pointless since len is guaranteed to be >= 5 (assuming that
comment is accurate), but now it's both pointless and in the wrong place,
since you haven't yet finished the four words that you subtracted from
r4.
The blelr is just there to protect the function against a negative value
of r4, and hence of ctr.
In any case, the returned result in that case is not correct, as we do
not touch r3.


How about keeping the blelr, without the -, moving it after the initial
words, and changing the number of initial words to 5?

We can't just do blelr, we would need to fold the result first.
But indeed, this would be useless because I quickly checked and it seems
that all functions calling ip_fast_csum() check that the length is not
lower than 5.
So I will just remove the blelr.

Also maybe do all
the loads up front, since many PPC chips have a three cycle load latency
rather than two.

ok

Christophe

---
L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel 
antivirus Avast.
http://www.avast.com

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc32: rearrange instructions order in ip_fast_csum()

2015-03-24 Thread Scott Wood
On Tue, Feb 03, 2015 at 12:39:27PM +0100, LEROY Christophe wrote:
> On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
> As the size of the header is minimum 5 words, we can unroll the loop for the
> first words to reduce number of branching, and we can re-order the 
> instructions
> to limit loading latency.

Please wrap commit messages at around 70 characters.

> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/lib/checksum_32.S | 10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
> index 6d67e05..5500704 100644
> --- a/arch/powerpc/lib/checksum_32.S
> +++ b/arch/powerpc/lib/checksum_32.S
> @@ -26,13 +26,17 @@
>  _GLOBAL(ip_fast_csum)
>   lwz r0,0(r3)
>   lwzur5,4(r3)
> - addic.  r4,r4,-2
> + addic.  r4,r4,-4
>   addcr0,r0,r5
>   mtctr   r4
>   blelr-
> -1:   lwzur4,4(r3)
> - adder0,r0,r4
> + lwzur5,4(r3)
> + lwzur4,4(r3)

The blelr is pointless since len is guaranteed to be >= 5 (assuming that
comment is accurate), but now it's both pointless and in the wrong place,
since you haven't yet finished the four words that you subtracted from
r4.

How about keeping the blelr, without the -, moving it after the initial
words, and changing the number of initial words to 5?  Also maybe do all
the loads up front, since many PPC chips have a three cycle load latency
rather than two.

-Scott
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc32: rearrange instructions order in ip_fast_csum()

2015-03-24 Thread Scott Wood
On Tue, Feb 03, 2015 at 12:39:27PM +0100, LEROY Christophe wrote:
 On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
 As the size of the header is minimum 5 words, we can unroll the loop for the
 first words to reduce number of branching, and we can re-order the 
 instructions
 to limit loading latency.

Please wrap commit messages at around 70 characters.

 Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
 ---
  arch/powerpc/lib/checksum_32.S | 10 +++++++---
  1 file changed, 7 insertions(+), 3 deletions(-)
 
 diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
 index 6d67e05..5500704 100644
 --- a/arch/powerpc/lib/checksum_32.S
 +++ b/arch/powerpc/lib/checksum_32.S
 @@ -26,13 +26,17 @@
  _GLOBAL(ip_fast_csum)
       lwz     r0,0(r3)
       lwzu    r5,4(r3)
 -     addic.  r4,r4,-2
 +     addic.  r4,r4,-4
       addc    r0,r0,r5
       mtctr   r4
       blelr-
 -1:   lwzu    r4,4(r3)
 -     adde    r0,r0,r4
 +     lwzu    r5,4(r3)
 +     lwzu    r4,4(r3)

The blelr is pointless since len is guaranteed to be >= 5 (assuming that
comment is accurate), but now it's both pointless and in the wrong place,
since you haven't yet finished the four words that you subtracted from
r4.

How about keeping the blelr, without the -, moving it after the initial
words, and changing the number of initial words to 5?  Also maybe do all
the loads up front, since many PPC chips have a three cycle load latency
rather than two.

-Scott
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] powerpc32: rearrange instructions order in ip_fast_csum()

2015-02-03 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
As the size of the header is minimum 5 words, we can unroll the loop for the
first words to reduce number of branching, and we can re-order the instructions
to limit loading latency.

Signed-off-by: Christophe Leroy 

---
 arch/powerpc/lib/checksum_32.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)
+   adde    r0,r0,r5
+1: adde    r0,r0,r4
+   lwzu    r4,4(r3)
    bdnz    1b
+   adde    r0,r0,r4
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] powerpc32: rearrange instructions order in ip_fast_csum()

2015-02-03 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
As the size of the header is minimum 5 words, we can unroll the loop for the
first words to reduce number of branching, and we can re-order the instructions
to limit loading latency.

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr

---
 arch/powerpc/lib/checksum_32.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)
+   adde    r0,r0,r5
+1: adde    r0,r0,r4
+   lwzu    r4,4(r3)
    bdnz    1b
+   adde    r0,r0,r4
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


powerpc32: rearrange instructions order in ip_fast_csum()

2014-09-19 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
As the size of the header is minimum 5 words, we can unroll the loop for the
first words to reduce number of branching, and we can re-order the instructions
to limit loading latency.

Signed-off-by: Christophe Leroy 

---
 arch/powerpc/lib/checksum_32.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)
+   adde    r0,r0,r5
+1: adde    r0,r0,r4
+   lwzu    r4,4(r3)
    bdnz    1b
+   adde    r0,r0,r4
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


powerpc32: rearrange instructions order in ip_fast_csum()

2014-09-19 Thread Christophe Leroy
On PPC_8xx, lwz has a 2 cycles latency, and branching also takes 2 cycles.
As the size of the header is minimum 5 words, we can unroll the loop for the
first words to reduce number of branching, and we can re-order the instructions
to limit loading latency.

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr

---
 arch/powerpc/lib/checksum_32.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
 _GLOBAL(ip_fast_csum)
    lwz     r0,0(r3)
    lwzu    r5,4(r3)
-   addic.  r4,r4,-2
+   addic.  r4,r4,-4
    addc    r0,r0,r5
    mtctr   r4
    blelr-
-1: lwzu    r4,4(r3)
-   adde    r0,r0,r4
+   lwzu    r5,4(r3)
+   lwzu    r4,4(r3)
+   adde    r0,r0,r5
+1: adde    r0,r0,r4
+   lwzu    r4,4(r3)
    bdnz    1b
+   adde    r0,r0,r4
    addze   r0,r0   /* add in final carry */
    rlwinm  r3,r0,16,0,31   /* fold two halves together */
    add     r3,r0,r3
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/