Re: expmed costs and i386.c cost for widening mul (PR81444)

2017-07-17 Thread Georg-Johann Lay

On 17.07.2017 10:53, Georg-Johann Lay wrote:

Hi, while testing a patch to fix PR81444, I came across a new FAIL due
to the patch in i386.c/pr71321.c

PR81444 is about wrong modes used by expmed.c as it computes costs for
widening operations like widening mul.  It uses  GET_MODE_WIDER_MODE
for the wider mode where is should use GET_MODE_2XWIDER_MODE (the
internals specify a mode 2 times as wide for the wider mode).

Applying the patch from below then leads to the following change in
code generated for PR71321.c:

Without patch:



cvt_to_2digit_ascii:
 movzbl%dil, %eax# 7*zero_extendqihi2/1[length = 4]
 leal(%rax,%rax,4), %edx# 46*leasi[length = 3]
 leal(%rax,%rdx,8), %edx# 47*leasi[length = 3]
 leal(%rdx,%rdx,4), %edx# 48*leasi[length = 3]
 shrw$11, %dx# 16*lshrhi3_1[length = 4]
 leal(%rdx,%rdx,4), %eax# 49*leasi[length = 3]
 movzbl%dl, %edx# 35*zero_extendqisi2/1[length = 3]
 addl%eax, %eax# 50*ashlsi3_1/1[length = 2]
 subl%eax, %edi# 51*subsi_1/1[length = 2]
 movzbl%dil, %eax# 23*zero_extendqisi2/1[length = 4]
 sall$8, %eax# 24*ashlsi3_1/1[length = 3]
 orl%edx, %eax# 36*iorsi_1/1[length = 2]
 addl$667696, %eax# 37*addsi_1/1[length = 5]
 ret# 55simple_return_internal[length = 1]


With patch applied:

cvt_to_2digit_ascii:
 movl$-51, %edx# 7*movqi_internal/2[length = 5]
 movl%edx, %eax# 35*movqi_internal/1[length = 2]
 mulb%dil# 8*umulqihi3_1[length = 3]
 movl%eax, %edx# 36*movhi_internal/1[length = 2]

(Following code is the same)

 shrw$11, %dx# 10*lshrhi3_1[length = 4]
 leal(%rdx,%rdx,4), %eax# 37*leasi[length = 3]
 movzbl%dl, %edx# 23*zero_extendqisi2/1[length = 3]
 addl%eax, %eax# 38*ashlsi3_1/1[length = 2]
 subl%eax, %edi# 39*subsi_1/1[length = 2]
 movzbl%dil, %eax# 17*zero_extendqisi2/1[length = 4]
 sall$8, %eax# 18*ashlsi3_1/1[length = 3]
 orl%edx, %eax# 24*iorsi_1/1[length = 2]
 addl$667696, %eax# 25*addsi_1/1[length = 5]
 ret# 43simple_return_internal[length = 1]

Is this actually a performance regression or did the code improve 
actually? (compiled with -O2 on x86_64).


What would be the proposed fix:  Adjust the test case or is this some
problem in the i386.c cost computation?



...some more information. Looking into what ix86_rtx_costs is coming
up with. The unchanged version sees the strange modes an computes,
for example:

cost 4 (speed=1, outer=set) =
(zero_extend:HI (reg:XI 87))

cost 4 (speed=1, outer=set) =
(zero_extend:HI (reg:XI 87))

cost 24 (speed=1, outer=set) =
(mult:HI (zero_extend:HI (reg:XI 87))
(zero_extend:HI (reg:XI 87)))

cost 4 (speed=1, outer=truncate) =
(lshiftrt:HI (mult:HI (zero_extend:HI (reg:XI 87))
(zero_extend:HI (reg:XI 87)))
(const_int 8 [0x8]))

cost 0 (speed=1, outer=lshiftrt) =
(const_int 8 [0x8])

cost 4 (speed=1, outer=lshiftrt) =
(zero_extend:HI (reg:XI 87))

cost 4 (speed=1, outer=lshiftrt) =
(zero_extend:HI (reg:XI 87))

cost 24 (speed=1, outer=lshiftrt) =
(mult:HI (zero_extend:HI (reg:XI 87))
(zero_extend:HI (reg:XI 87)))


The patched version doesn't go into the sub-expressions and comes up
with smaller costs of 12 / 12 instead of the 24 / 24 from above:

cost 12 (speed=1, outer=set) =
(mult:HI (zero_extend:HI (reg:QI 87))
(zero_extend:HI (reg:QI 87)))

cost 4 (speed=1, outer=truncate) =
(lshiftrt:HI (mult:HI (zero_extend:HI (reg:QI 87))
(zero_extend:HI (reg:QI 87)))
(const_int 8 [0x8]))

cost 0 (speed=1, outer=lshiftrt) =
(const_int 8 [0x8])

cost 12 (speed=1, outer=lshiftrt) =
(mult:HI (zero_extend:HI (reg:QI 87))
(zero_extend:HI (reg:QI 87)))

Hence, with the proper expressions, costs for widening mul are cheaper
than with the XImode.  XImode is artefact of clobbering mode of
all->reg in expmed.c::init_expmed_one_conv().



Johann



Index: expmed.c
===
--- expmed.c(revision 250090)
+++ expmed.c(working copy)
@@ -119,6 +119,7 @@ init_expmed_one_conv (struct init_expmed
  {
int to_size, from_size;
rtx which;
+  machine_mode orig_mode = GET_MODE (all->reg);

to_size = GET_MODE_PRECISION (to_mode);
from_size = GET_MODE_PRECISION (from_mode);
@@ -140,6 +141,8 @@ init_expmed_one_conv (struct init_expmed
PUT_MODE (all->reg, from_mode);
set_convert_cost (to_mode, from_mode, speed,
  set_src_cost (which, to_mode, speed));
+  // Unclobber the mode, callers still use it.
+  PUT_MODE (all->reg, orig_mode);
  }

  static void
@@ -210,7 +213,7 @@ init_expmed_one_mode (struc

expmed costs and i386.c cost for widening mul (PR81444)

2017-07-17 Thread Georg-Johann Lay

Hi, while testing a patch to fix PR81444, I came across a new FAIL due
to the patch in i386.c/pr71321.c

PR81444 is about wrong modes used by expmed.c as it computes costs for
widening operations like widening mul.  It uses  GET_MODE_WIDER_MODE
for the wider mode where is should use GET_MODE_2XWIDER_MODE (the
internals specify a mode 2 times as wide for the wider mode).

Applying the patch from below then leads to the following change in
code generated for PR71321.c:

Without patch:



cvt_to_2digit_ascii:
movzbl  %dil, %eax  # 7 *zero_extendqihi2/1 [length = 4]
leal(%rax,%rax,4), %edx # 46*leasi  [length = 3]
leal(%rax,%rdx,8), %edx # 47*leasi  [length = 3]
leal(%rdx,%rdx,4), %edx # 48*leasi  [length = 3]
shrw$11, %dx# 16*lshrhi3_1  [length = 4]
leal(%rdx,%rdx,4), %eax # 49*leasi  [length = 3]
movzbl  %dl, %edx   # 35*zero_extendqisi2/1 [length = 3]
addl%eax, %eax  # 50*ashlsi3_1/1[length = 2]
subl%eax, %edi  # 51*subsi_1/1  [length = 2]
movzbl  %dil, %eax  # 23*zero_extendqisi2/1 [length = 4]
sall$8, %eax# 24*ashlsi3_1/1[length = 3]
orl %edx, %eax  # 36*iorsi_1/1  [length = 2]
addl$667696, %eax   # 37*addsi_1/1  [length = 5]
ret # 55simple_return_internal  [length = 1]


With patch applied:

cvt_to_2digit_ascii:
movl$-51, %edx  # 7 *movqi_internal/2   [length = 5]
movl%edx, %eax  # 35*movqi_internal/1   [length = 2]
mulb%dil# 8 *umulqihi3_1[length = 3]
movl%eax, %edx  # 36*movhi_internal/1   [length = 2]

(Following code is the same)

shrw$11, %dx# 10*lshrhi3_1  [length = 4]
leal(%rdx,%rdx,4), %eax # 37*leasi  [length = 3]
movzbl  %dl, %edx   # 23*zero_extendqisi2/1 [length = 3]
addl%eax, %eax  # 38*ashlsi3_1/1[length = 2]
subl%eax, %edi  # 39*subsi_1/1  [length = 2]
movzbl  %dil, %eax  # 17*zero_extendqisi2/1 [length = 4]
sall$8, %eax# 18*ashlsi3_1/1[length = 3]
orl %edx, %eax  # 24*iorsi_1/1  [length = 2]
addl$667696, %eax   # 25*addsi_1/1  [length = 5]
ret # 43simple_return_internal  [length = 1]

Is this actually a performance regression or did the code improve 
actually? (compiled with -O2 on x86_64).


What would be the proposed fix:  Adjust the test case or is this some
problem in the i386.c cost computation?


Johann



Index: expmed.c
===
--- expmed.c(revision 250090)
+++ expmed.c(working copy)
@@ -119,6 +119,7 @@ init_expmed_one_conv (struct init_expmed
 {
   int to_size, from_size;
   rtx which;
+  machine_mode orig_mode = GET_MODE (all->reg);

   to_size = GET_MODE_PRECISION (to_mode);
   from_size = GET_MODE_PRECISION (from_mode);
@@ -140,6 +141,8 @@ init_expmed_one_conv (struct init_expmed
   PUT_MODE (all->reg, from_mode);
   set_convert_cost (to_mode, from_mode, speed,
set_src_cost (which, to_mode, speed));
+  // Unclobber the mode, callers still use it.
+  PUT_MODE (all->reg, orig_mode);
 }

 static void
@@ -210,7 +213,7 @@ init_expmed_one_mode (struct init_expmed
 }
   if (GET_MODE_CLASS (mode) == MODE_INT)
 {
-  machine_mode  wider_mode = GET_MODE_WIDER_MODE (mode);
+  machine_mode wider_mode = GET_MODE_2XWIDER_MODE (mode);
   if (wider_mode != VOIDmode)
{
  PUT_MODE (all->zext, wider_mode);