Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-11-14 Thread Tamar Christina
On 11/14/2017 17:48, James Greenhalgh wrote:
> On Tue, Nov 14, 2017 at 04:05:12PM +, Tamar Christina wrote:
> > Hi James,
> > 
> > I have split off the aarch64 bit off from the generic parts and processed 
> > your feedback.
> > 
> > Attached is the reworked patch.
> > 
> > Ok for trunk?
> 
> Thanks for the respin. I'm a bit confused by this comment.

Sorry, the comment was a bit nonsensical. I updated the last part but didn't
re-read the comment as a whole.

I've respun the patch again.

Thanks,
Tamar
> 
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index 
> > e89c8156976cecf200cd67c1e938c8156c1240c4..bc197cc973a2bbc2c263809c7e8ccfbc86309c26
> >  100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -13974,6 +13974,28 @@ aarch64_expand_movmem (rtx *operands)
> >base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> >src = adjust_automodify_address (src, VOIDmode, base, 0);
> >  
> > +  /* Optimize routines for MEM to REG copies.
> > + This particular function only handles copying to two
> > + destination types: 1) a regular register and 2) the stack.
> > + When writing to a regular register we may end up writing too much in cases
> > + where the register already contains a live value or when no data padding is
> > + happening so we disallow regular registers to use this new code path.  */
> 
> I'm struggling to understand when you'd end up with a struct held in
> a partial register going through this code path. Maybe the restriction
> makes sense, can you write a testcase, or show some sample code where
> this goes wrong (or simplify the comment).
> 
> Thanks,
> James
> 

-- 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e89c8156976cecf200cd67c1e938c8156c1240c4..2597ec70ff2c49c8276622d3f9d6fe394a1f80c1 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13974,6 +13974,26 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+ This particular function only handles copying to two
+ destination types: 1) a regular register and 2) the stack and only when
+ the amount to copy fits in less than 8 bytes.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+{
+   machine_mode dest_mode
+	 = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+   rtx result = gen_reg_rtx (dest_mode);
+   emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+   src = adjust_address (src, dest_mode, 0);
+   emit_insn (gen_move_insn (result, src));
+   src = aarch64_progress_pointer (src);
+
+   dst = adjust_address (dst, dest_mode, 0);
+   emit_insn (gen_move_insn (dst, result));
+   return true;
+}
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
  1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/testsuite/gcc.target/aarch64/structs.c b/gcc/testsuite/gcc.target/aarch64/structs.c
new file mode 100644
index 0000000000000000000000000000000000000000..2be8fae9479500cdbd5f22c6ab4f6adcdd7fcbce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/structs.c
@@ -0,0 +1,226 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* Copyright 1996, 1999, 2007 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.
+
+   Please email any bugs, comments, and/or additions to this file to:
+   bug-...@prep.ai.mit.edu  */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'},  L1;
+struct struct2 foo2 = { 'a', 'b'},  L2;
+struct struct3 foo3 = { 'A', 'B', 'C'},  L3;
+struct struct4 foo4 = {'1', '2', '3', 

Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-11-14 Thread James Greenhalgh
On Tue, Nov 14, 2017 at 04:05:12PM +, Tamar Christina wrote:
> Hi James,
> 
> I have split off the aarch64 bit off from the generic parts and processed 
> your feedback.
> 
> Attached is the reworked patch.
> 
> Ok for trunk?

Thanks for the respin. I'm a bit confused by this comment.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> e89c8156976cecf200cd67c1e938c8156c1240c4..bc197cc973a2bbc2c263809c7e8ccfbc86309c26
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13974,6 +13974,28 @@ aarch64_expand_movmem (rtx *operands)
>base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.
> + This particular function only handles copying to two
> + destination types: 1) a regular register and 2) the stack.
> + When writing to a regular register we may end up writing too much in cases
> + where the register already contains a live value or when no data padding is
> + happening so we disallow regular registers to use this new code path.  */

I'm struggling to understand when you'd end up with a struct held in
a partial register going through this code path. Maybe the restriction
makes sense, can you write a testcase, or show some sample code where
this goes wrong (or simplify the comment).

Thanks,
James



RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-11-14 Thread Tamar Christina
Hi James,

I have split off the aarch64 bit off from the generic parts and processed your 
feedback.

Attached is the reworked patch.

Ok for trunk?

Thanks,
Tamar


gcc/
2017-11-14  Tamar Christina  <tamar.christ...@arm.com>

* config/aarch64/aarch64.c (aarch64_expand_movmem):
Add MEM to REG optimized case.

gcc/testsuite/
2017-11-14  Tamar Christina  <tamar.christ...@arm.com>

* gcc.target/aarch64/structs.c: New test.

> -----Original Message-----
> From: James Greenhalgh [mailto:james.greenha...@arm.com]
> Sent: Wednesday, August 30, 2017 16:56
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: Richard Sandiford <richard.sandif...@linaro.org>; GCC Patches <gcc-patc...@gcc.gnu.org>; nd <n...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; Marcus Shawcroft
> <marcus.shawcr...@arm.com>
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for
> structure/block/unaligned memory access
> 
> Hi Tamar,
> 
> I think the AArch64 parts of this patch can be substantially simplified.
> 
> On Mon, Jul 03, 2017 at 07:17:51AM +0100, Tamar Christina wrote:
> > diff --git a/gcc/config/aarch64/aarch64.c
> > b/gcc/config/aarch64/aarch64.c index
> >
> ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd0
> 37
> > 54bbba9264a6 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -13466,7 +13466,6 @@
> aarch64_copy_one_block_and_progress_pointers
> > (rtx *src, rtx *dst,
> >
> >  /* Expand movmem, as if from a __builtin_memcpy.  Return true if
> > we succeed, otherwise return false.  */
> > -
> >  bool
> >  aarch64_expand_movmem (rtx *operands)  { @@ -13498,16 +13497,55
> @@
> > aarch64_expand_movmem (rtx *operands)
> >base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> >src = adjust_automodify_address (src, VOIDmode, base, 0);
> >
> > +  /* Optimize routines for MEM to REG copies.
> > + This particular function only handles copying to two
> > + destination types: 1) a regular register and 2) the stack.
> > + When writing to a regular register we may end up writing too much in cases
> > + where the register already contains a live value or when no data padding is
> > + happening so we disallow it.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> I don't understand how this comment lines up with the condition on this
> code path. On the one hand you say you permit regular registers, but on the
> other hand you say you disallow regular registers because you may overwrite.
> 
> 
> > +   {
> > + machine_mode mov_mode, dest_mode
> > +   = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
> > + rtx result = gen_reg_rtx (dest_mode);
> > + emit_insn (gen_move_insn (result, GEN_INT (0)));
> > +
> > + unsigned int shift_cnt = 0;
> 
> Any reason not to just initialise the shift_cnt in place here?
> 
> > + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> 
> 
>  for (unsigned int shift_cnt = 0;
>   shift_cnt <=  n;
>   shift_cnt += GET_MODE_SIZE (mov_mode))
> 
> Having the loop counter first in the comparison is personal preference.
> 
> mov_mode looks uninitialised, but I guess by the time you check the loop
> condition you have actually initialized it.
> 
> > +   {
> > +int nearest = 0;
> 
> This isn't required, we could do the loop below with one
> 
> > +/* Find the mode to use.  */
> > +for (unsigned max = 1; max <= (n - shift_cnt); max *= 2)
> > + nearest = max;
> 
> Wrong formatting.
> 
> So, when this loop terminates for the first time (shift_cnt == 0), nearest is
> the first power of two greater than n.
> 
> > +
> > + mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT,
> > +MODE_INT);
> 
> That means this is always a mode of at least the right size, which in turn
> means that we can't execute round the loop again (GET_MODE_SIZE
> (mov_mode) will always be greater than n). So, we can be sure this loop only
> executes once, and we can fold mov_mode and dest_mode to be equal.
> 
> > + rtx reg = gen_reg_rtx (mov_mode);
> > +
> > + src = adjust_address (src, mov_mode, 0);
> > + emit_insn (gen_move_insn (reg, src));
> > + src = aarch64_progress_pointer (src);
> > +
> > + reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
> > + reg = gen_rtx_ASHIFT (dest_mode, reg,
> > +   GEN_INT (shift_cnt * BITS_PER_UNIT));
> > + result = gen

Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-30 Thread James Greenhalgh
Hi Tamar,

I think the AArch64 parts of this patch can be substantially simplified.

On Mon, Jul 03, 2017 at 07:17:51AM +0100, Tamar Christina wrote:
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx 
> *src, rtx *dst,
>  
>  /* Expand movmem, as if from a __builtin_memcpy.  Return true if
> we succeed, otherwise return false.  */
> -
>  bool
>  aarch64_expand_movmem (rtx *operands)
>  {
> @@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
>base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.
> + This particular function only handles copying to two
> + destination types: 1) a regular register and 2) the stack.
> + When writing to a regular register we may end up writing too much in cases
> + where the register already contains a live value or when no data padding is
> + happening so we disallow it.  */
> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))

I don't understand how this comment lines up with the condition on this
code path. On the one hand you say you permit regular registers, but on the
other hand you say you disallow regular registers because you may overwrite.


> +   {
> + machine_mode mov_mode, dest_mode
> +   = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
> + rtx result = gen_reg_rtx (dest_mode);
> + emit_insn (gen_move_insn (result, GEN_INT (0)));
> +
> + unsigned int shift_cnt = 0;

Any reason not to just initialise the shift_cnt in place here?

> + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))


 for (unsigned int shift_cnt = 0;
  shift_cnt <=  n;
  shift_cnt += GET_MODE_SIZE (mov_mode))

Having the loop counter first in the comparison is personal preference.

mov_mode looks uninitialised, but I guess by the time you check the loop
condition you have actually initialized it.

> +   {
> +  int nearest = 0;

This isn't required, we could do the loop below with one 

> +  /* Find the mode to use.  */
> +  for (unsigned max = 1; max <= (n - shift_cnt); max *= 2)
> +   nearest = max;

Wrong formatting.

So, when this loop terminates for the first time (shift_cnt == 0) nearest
is the first power of two greater than n.

> +
> +   mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);

That means this is always a mode of at least the right size, which in turn
means that we can't execute round the loop again (GET_MODE_SIZE (mov_mode)
will always be greater than n). So, we can be sure this loop only executes
once, and we can fold mov_mode and dest_mode to be equal.

> +   rtx reg = gen_reg_rtx (mov_mode);
> +
> +   src = adjust_address (src, mov_mode, 0);
> +   emit_insn (gen_move_insn (reg, src));
> +   src = aarch64_progress_pointer (src);
> +
> +   reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
> +   reg = gen_rtx_ASHIFT (dest_mode, reg,
> + GEN_INT (shift_cnt * BITS_PER_UNIT));
> +   result = gen_rtx_IOR (dest_mode, reg, result);
> +   }
> +
> +dst = adjust_address (dst, dest_mode, 0);
> +emit_insn (gen_move_insn (dst, result));
> +return true;
> +  }
> +
>/* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
>   1-byte chunk.  */
>if (n < 4)
>  {
>if (n >= 2)
> - {
> -   aarch64_copy_one_block_and_progress_pointers (, , HImode);
> -   n -= 2;
> - }
>  
> +  {
> + aarch64_copy_one_block_and_progress_pointers (, , HImode);
> + n -= 2;
> +  }

These formatting changes leave a blank newline between if (n >= 2) and the
remainder of this block. Why?

>if (n == 1)
>   aarch64_copy_one_block_and_progress_pointers (, , QImode);
>  

Thanks,
James


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-30 Thread Tamar Christina
Sure, I'll split mid-end parts off and add a test.
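
Roughly something along these lines -- the exact scan patterns are a guess
until the final codegen settles, and the struct name is hypothetical:

/* { dg-do compile } */
/* { dg-options "-O2" } */

struct foo3 { char a, b, c; };
struct foo3 g3;

struct foo3
load3 (void)
{
  return g3;   /* small MEM to REG copy */
}

/* Before the patch this gets rebuilt piecewise with bfi; afterwards it
   should be a couple of plain loads.  Pattern is a placeholder only.  */
/* { dg-final { scan-assembler-not "bfi" } } */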

Thanks,
Tamar

From: H.J. Lu <hjl.to...@gmail.com>
Sent: Wednesday, August 30, 2017 2:16:12 PM
To: Tamar Christina
Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; 
Marcus Shawcroft
Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
memory access

On Wed, Aug 30, 2017 at 2:21 AM, Tamar Christina
<tamar.christ...@arm.com> wrote:
> Hi,
>
> The test I used was testsuite/gcc.c-torture/compile/structs.c

Results look very nice on x86-64.  Can we add a testcase to scan the
compiler output
to verify this optimization?

Thanks.

> Thanks,
> Tamar
> 
> From: H.J. Lu <hjl.to...@gmail.com>
> Sent: Friday, August 25, 2017 8:10:22 PM
> To: Tamar Christina
> Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; 
> Marcus Shawcroft
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
> memory access
>
> On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
> <tamar.christ...@arm.com> wrote:
>> Hi All,
>>
>> Sorry I just realized I never actually sent the updated patch...
>>
>> So here it is :)
>>
>
> Are there any testcases?
>
> --
> H.J.



--
H.J.


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-30 Thread H.J. Lu
On Wed, Aug 30, 2017 at 2:21 AM, Tamar Christina
<tamar.christ...@arm.com> wrote:
> Hi,
>
> The test I used was testsuite/gcc.c-torture/compile/structs.c

Results look very nice on x86-64.  Can we add a testcase to scan the
compiler output
to verify this optimization?

Thanks.

> Thanks,
> Tamar
> 
> From: H.J. Lu <hjl.to...@gmail.com>
> Sent: Friday, August 25, 2017 8:10:22 PM
> To: Tamar Christina
> Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; 
> Marcus Shawcroft
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
> memory access
>
> On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
> <tamar.christ...@arm.com> wrote:
>> Hi All,
>>
>> Sorry I just realized I never actually sent the updated patch...
>>
>> So here it is :)
>>
>
> Are there any testcases?
>
> --
> H.J.



-- 
H.J.


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-30 Thread Tamar Christina
Hi,

The test I used was testsuite/gcc.c-torture/compile/structs.c

Thanks,
Tamar

From: H.J. Lu <hjl.to...@gmail.com>
Sent: Friday, August 25, 2017 8:10:22 PM
To: Tamar Christina
Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; 
Marcus Shawcroft
Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
memory access

On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
<tamar.christ...@arm.com> wrote:
> Hi All,
>
> Sorry I just realized I never actually sent the updated patch...
>
> So here it is :)
>

Are there any testcases?

--
H.J.


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-25 Thread H.J. Lu
On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
 wrote:
> Hi All,
>
> Sorry I just realized I never actually sent the updated patch...
>
> So here it is :)
>

Are there any testcases?

-- 
H.J.


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-08-25 Thread Jeff Law
On 07/03/2017 12:17 AM, Tamar Christina wrote:
> Hi All,
> 
> Sorry I just realized I never actually sent the updated patch...
AFAICT this never got resolved.

I'll let the aarch64 folks own those changes.  I'm going to focus just
on the small change to expr.c where you change the computation of
bitsize in copy_blkmode_to_reg.

My biggest worry is that this could end up breaking STRICT_ALIGNMENT
targets.  Thankfully, the default definition of SLOW_UNALIGNED_ACCESS is
STRICT_ALIGNMENT.  So on a STRICT_ALIGNMENT target that does not define
SLOW_UNALIGNED_ACCESS we should be OK.  I also did a quick scan of
STRICT_ALIGNMENT targets that override SLOW_UNALIGNED_ACCESS and they
seem reasonable.
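
(For reference, the fallback definition lives in defaults.h and is roughly the
following -- quoting from memory, so worth double-checking against the tree:

  #ifndef SLOW_UNALIGNED_ACCESS
  #define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) STRICT_ALIGNMENT
  #endif

so a target that leaves SLOW_UNALIGNED_ACCESS undefined falls back to its
STRICT_ALIGNMENT setting.)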

So I'll ack the expr.c change.  You'll need to chase down an ack on the
aarch64 specific changes.


jeff

> 
> So here it is :)
> 
> Regards,
> Tamar
> 
> From: gcc-patches-ow...@gcc.gnu.org <gcc-patches-ow...@gcc.gnu.org> on behalf 
> of Tamar Christina <tamar.christ...@arm.com>
> Sent: Friday, June 9, 2017 4:51:52 PM
> To: Richard Sandiford
> Cc: GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
> Subject: RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
> memory access
> 
>>> +  /* Optimize routines for MEM to REG copies.  */  if (n < 8 &&
>>> + !REG_P (XEXP (operands[0], 0)))
>>
>> This seems to be checking that the address of the original destination
>> memory isn't a plain base register.  Why's it important to reject that case 
>> but
>> allow e.g. base+offset?
> 
> this function is (as far as I could tell) only being called with two types of 
> destinations:
> a location on the stack or a plain register.
> 
> When the destination is a register such as with
> 
> void Fun3(struct struct3 foo3)
> {
>   L3 = foo3;
> }
> 
> You run into the issue you had pointed to below where we might write too 
> much. Ideally the constraint
> I would like is to check if the destination is either a new register (no 
> data) or that the structure was padded.
> 
> I couldn't figure out how to do this and the gains over the existing code for 
> this case was quite small.
> So I just disallowed it leaving it to the existing code, which isn't so bad, 
> only 1 extra instruction.
> 
>>
>>> +   {
>>> + unsigned int max_align = UINTVAL (operands[2]);
>>> + max_align = n < max_align ? max_align : n;
>>
>> Might be misunderstanding, but isn't max_align always equal to n here, since
>> n was set by:
> 
> Correct, previously this patch was made to allow n < 16. These were left over 
> from the cleanup.
> I'll correct.
> 
>>
>>> + result = gen_rtx_IOR (dest_mode, reg, result);
>>> +  }
>>> +
>>> +dst = adjust_address (dst, dest_mode, 0);
>>> +emit_insn (gen_move_insn (dst, result));
>>
>> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
>> Doesn't that mean that we'll write beyond the end of the copy region when
>> n is an awkward number?
> 
> Yes, see my answer above. For the other case, when we write onto a location 
> on the stack, this is fine
> due to the alignment.
> 
>>
>>> diff --git a/gcc/expr.c b/gcc/expr.c
>>> index
>>>
>> 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce
>> 8e
>>> e5a19297ab16 100644
>>> --- a/gcc/expr.c
>>> +++ b/gcc/expr.c
>>> @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode,
>> tree
>>> src)
>>>
>>>n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
>>>dst_words = XALLOCAVEC (rtx, n_regs);
>>> -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>>> +  bitsize = BITS_PER_WORD;
>>> +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE
>> (src
>>> +bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>>
>> I think this ought to be testing word_mode instead of BLKmode.
>> (Testing BLKmode doesn't really make sense in general, because the mode
>> doesn't have a meaningful alignment.)
> 
> Ah, yes that makes sense. I'll update the patch.
> 
> New patch is validating and will submit it soon.
> 
>>
>> Thanks,
>> Richard



Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-07-03 Thread Tamar Christina
Hi All,

Sorry I just realized I never actually sent the updated patch...

So here it is :)

Regards,
Tamar

From: gcc-patches-ow...@gcc.gnu.org <gcc-patches-ow...@gcc.gnu.org> on behalf 
of Tamar Christina <tamar.christ...@arm.com>
Sent: Friday, June 9, 2017 4:51:52 PM
To: Richard Sandiford
Cc: GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
Subject: RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned 
memory access

> > +  /* Optimize routines for MEM to REG copies.  */  if (n < 8 &&
> > + !REG_P (XEXP (operands[0], 0)))
>
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case 
> but
> allow e.g. base+offset?

this function is (as far as I could tell) only being called with two types of 
destinations:
a location on the stack or a plain register.

When the destination is a register such as with

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}

You run into the issue you had pointed to below where we might write too much. 
Ideally the constraint
I would like is to check if the destination is either a new register (no data) 
or that the structure was padded.

I couldn't figure out how to do this and the gains over the existing code for 
this case was quite small.
So I just disallowed it leaving it to the existing code, which isn't so bad, 
only 1 extra instruction.

>
> > +   {
> > + unsigned int max_align = UINTVAL (operands[2]);
> > + max_align = n < max_align ? max_align : n;
>
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct, previously this patch was made to allow n < 16. These were left over 
from the cleanup.
I'll correct.

>
> > + result = gen_rtx_IOR (dest_mode, reg, result);
> > +  }
> > +
> > +dst = adjust_address (dst, dest_mode, 0);
> > +emit_insn (gen_move_insn (dst, result));
>
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above. For the other case, when we write onto a location on 
the stack, this is fine
due to the alignment.

>
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index
> >
> 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce
> 8e
> > e5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode,
> tree
> > src)
> >
> >n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE
> (src
> > +bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes that makes sense. I'll update the patch.

New patch is validating and will submit it soon.

>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 
 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
we succeed, otherwise return false.  */
-
 bool
 aarch64_expand_movmem (rtx *operands)
 {
@@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+ This particular function only handles copying to two
+ destination types: 1) a regular register and 2) the stack.
+ When writing to a regular register we may end up writing too much in cases
+ where the register already contains a live value or when no data padding is
+ happening so we disallow it.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+ machine_mode mov_mode, dest_mode
+   = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+ rtx result = gen_reg_rtx (dest_mode);
+ emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+ unsigned int shift_cnt = 0;
+ for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+   {
+	 int nearest = 0;
+	 /* Find the mode to use.  */
+	 for (unsigned max = 

RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-06-09 Thread Tamar Christina

> > +  /* Optimize routines for MEM to REG copies.  */  if (n < 8 &&
> > + !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case 
> but
> allow e.g. base+offset?

this function is (as far as I could tell) only being called with two types of 
destinations:
a location on the stack or a plain register.

When the destination is a register such as with

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}

You run into the issue you had pointed to below where we might write too much. 
Ideally the constraint
I would like is to check if the destination is either a new register (no data) 
or that the structure was padded.

I couldn't figure out how to do this and the gains over the existing code for 
this case was quite small.
So I just disallowed it leaving it to the existing code, which isn't so bad, 
only 1 extra instruction.
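
(For reference, struct3 and L3 come from that torture test, with declarations
along these lines:

  struct struct3 { char a, b, c; };
  struct struct3 foo3 = { 'A', 'B', 'C' }, L3;

so the destination L3 is a 3-byte global and the source is the incoming
argument.)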

> 
> > +   {
> > + unsigned int max_align = UINTVAL (operands[2]);
> > + max_align = n < max_align ? max_align : n;
> 
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct, previously this patch was made to allow n < 16. These were left over 
from the cleanup.
I'll correct.

> 
> > + result = gen_rtx_IOR (dest_mode, reg, result);
> > +  }
> > +
> > +dst = adjust_address (dst, dest_mode, 0);
> > +emit_insn (gen_move_insn (dst, result));
> 
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above. For the other case, when we write onto a location on 
the stack, this is fine
due to the alignment. 

> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index
> >
> 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce
> 8e
> > e5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode,
> tree
> > src)
> >
> >n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE
> (src
> > +bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes that makes sense. I'll update the patch.

New patch is validating and will submit it soon.
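
(Concretely, the test would become something like:

  bitsize = BITS_PER_WORD;
  if (SLOW_UNALIGNED_ACCESS (word_mode, TYPE_ALIGN (TREE_TYPE (src))))
    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

i.e. the same hunk with word_mode in place of BLKmode.)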

> 
> Thanks,
> Richard


RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-06-09 Thread Tamar Christina
> > +  /* Optimize routines for MEM to REG copies.  */  if (n < 8 &&
> > + !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case 
> but
> allow e.g. base+offset?
> 

In the case of e.g.

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}


> > +   {
> > + unsigned int max_align = UINTVAL (operands[2]);
> > + max_align = n < max_align ? max_align : n;
> 
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:
> 
>   n = UINTVAL (operands[2]);
> 
> Indentation of the enclosing { ... } is slightly off.
> 
> > + machine_mode mov_mode, dest_mode
> > +   = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
> > + rtx result = gen_reg_rtx (dest_mode);
> > + emit_insn (gen_move_insn (result, GEN_INT (0)));
> > +
> > + unsigned int shift_cnt = 0;
> > + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> > +   {
> > +int nearest = 0;
> > +/* Find the mode to use, but limit the max to TI mode.  */
> > +for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *=
> 2)
> > + nearest = max;
> 
> In the if statement above, you required n < 8, so can max ever be > 16 here?
> 
> > + mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT,
> MODE_INT);
> > + rtx reg = gen_reg_rtx (mov_mode);
> > +
> > + src = adjust_address (src, mov_mode, 0);
> > + emit_insn (gen_move_insn (reg, src));
> > + src = aarch64_progress_pointer (src);
> > +
> > + reg = gen_rtx_ASHIFT (dest_mode, reg,
> > +   GEN_INT (shift_cnt * BITS_PER_UNIT));
> 
> This seems to be mixing modes: reg has mode "mov_mode" but the result
> has mode "dest_mode".  That isn't well-formed: the mode of a shift result
> needs to be the same as the mode of the operand.  I think the load would
> need to be a zero-extend of "src" into "reg", with "reg" having mode
> "dest_mode".
> 
> > + result = gen_rtx_IOR (dest_mode, reg, result);
> > +  }
> > +
> > +dst = adjust_address (dst, dest_mode, 0);
> > +emit_insn (gen_move_insn (dst, result));
> 
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?
> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index
> >
> 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce
> 8e
> > e5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode,
> tree
> > src)
> >
> >n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE
> (src
> > +bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)
> 
> Thanks,
> Richard


Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-06-08 Thread Richard Sandiford
Tamar Christina  writes:
> Hi All, 
>
> This patch allows larger bitsizes to be used as copy size
> when the target does not have SLOW_UNALIGNED_ACCESS.
>
> It also provides an optimized routine for MEM to REG
> copying which avoids reconstructing the value piecewise on the stack
> and instead uses a combination of shifts and ORs.
>
> This now generates
>
>   adrp x0, .LANCHOR0
>   add x0, x0, :lo12:.LANCHOR0
>   sub sp, sp, #16
>   ldr w1, [x0, 120]
>   str w1, [sp, 8]
>   ldr x0, [x0, 112]
>   ldr x1, [sp, 8]
>   add sp, sp, 16
>
> instead of:
>
>   adrp x3, .LANCHOR0
>   add x3, x3, :lo12:.LANCHOR0
>   mov x0, 0
>   mov x1, 0
>   sub sp, sp, #16
>   ldr x2, [x3, 112]
>   ldr w3, [x3, 120]
>   add sp, sp, 16
>   ubfx x5, x2, 8, 8
>   bfi x0, x2, 0, 8
>   ubfx x4, x2, 16, 8
>   lsr w9, w2, 24
>   bfi x0, x5, 8, 8
>   ubfx x7, x2, 32, 8
>   ubfx x5, x2, 40, 8
>   ubfx x8, x3, 8, 8
>   bfi x0, x4, 16, 8
>   bfi x1, x3, 0, 8
>   ubfx x4, x2, 48, 8
>   ubfx x6, x3, 16, 8
>   bfi x0, x9, 24, 8
>   bfi x1, x8, 8, 8
>   lsr x2, x2, 56
>   lsr w3, w3, 24
>   bfi x0, x7, 32, 8
>   bfi x1, x6, 16, 8
>   bfi x0, x5, 40, 8
>   bfi x1, x3, 24, 8
>   bfi x0, x4, 48, 8
>   bfi x0, x2, 56, 8
>
> To load a 12 1-byte element struct.

Nice!

[...]

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
>base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.  */
> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))

This seems to be checking that the address of the original destination
memory isn't a plain base register.  Why's it important to reject that
case but allow e.g. base+offset?

> +   {
> + unsigned int max_align = UINTVAL (operands[2]);
> + max_align = n < max_align ? max_align : n;

Might be misunderstanding, but isn't max_align always equal to n here,
since n was set by:

  n = UINTVAL (operands[2]);

Indentation of the enclosing { ... } is slightly off.

> + machine_mode mov_mode, dest_mode
> +   = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
> + rtx result = gen_reg_rtx (dest_mode);
> + emit_insn (gen_move_insn (result, GEN_INT (0)));
> +
> + unsigned int shift_cnt = 0;
> + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> +   {
> +  int nearest = 0;
> +  /* Find the mode to use, but limit the max to TI mode.  */
> +  for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
> +   nearest = max;

In the if statement above, you required n < 8, so can max ever be > 16 here?

> +   mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
> +   rtx reg = gen_reg_rtx (mov_mode);
> +
> +   src = adjust_address (src, mov_mode, 0);
> +   emit_insn (gen_move_insn (reg, src));
> +   src = aarch64_progress_pointer (src);
> +
> +   reg = gen_rtx_ASHIFT (dest_mode, reg,
> + GEN_INT (shift_cnt * BITS_PER_UNIT));

This seems to be mixing modes: reg has mode "mov_mode" but the result has
mode "dest_mode".  That isn't well-formed: the mode of a shift result needs
to be the same as the mode of the operand.  I think the load would need
to be a zero-extend of "src" into "reg", with "reg" having mode "dest_mode".
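
One way to arrange that would be to keep the chunk load in mov_mode and
zero-extend it to dest_mode before shifting, along the lines of:

  rtx reg = gen_reg_rtx (mov_mode);
  src = adjust_address (src, mov_mode, 0);
  emit_insn (gen_move_insn (reg, src));

  rtx ext = gen_rtx_ZERO_EXTEND (dest_mode, reg);
  ext = gen_rtx_ASHIFT (dest_mode, ext,
                        GEN_INT (shift_cnt * BITS_PER_UNIT));
  result = gen_rtx_IOR (dest_mode, ext, result);

(a sketch of the shape only, not a tested change).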

> +   result = gen_rtx_IOR (dest_mode, reg, result);
> +  }
> +
> +dst = adjust_address (dst, dest_mode, 0);
> +emit_insn (gen_move_insn (dst, result));

dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
Doesn't that mean that we'll write beyond the end of the copy region
when n is an awkward number?

> diff --git a/gcc/expr.c b/gcc/expr.c
> index 
> 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16
>  100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
>  
>n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
>dst_words = XALLOCAVEC (rtx, n_regs);
> -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> +  bitsize = BITS_PER_WORD;
> +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src
> +bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

I think this ought to be testing word_mode instead of BLKmode.
(Testing BLKmode doesn't really make sense in general, because the
mode doesn't have a 

[PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

2017-06-07 Thread Tamar Christina
Hi All, 


This patch allows larger bitsizes to be used as copy size
when the target does not have SLOW_UNALIGNED_ACCESS.

It also provides an optimized routine for MEM to REG
copying which avoids reconstructing the value piecewise on the stack
and instead uses a combination of shifts and ORs.
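
In plain C terms the combine is conceptually the following (a sketch only,
assuming a little-endian target and a copy of fewer than 8 bytes; the patch
emits the equivalent RTL rather than calling anything like this):

  #include <stdint.h>
  #include <string.h>

  static uint64_t
  combine_chunks (const unsigned char *src, unsigned n)
  {
    uint64_t result = 0;
    unsigned off = 0;
    while (off < n)
      {
        unsigned chunk = 1;
        while (chunk * 2 <= n - off)   /* largest power of two that fits */
          chunk *= 2;
        uint64_t piece = 0;
        memcpy (&piece, src + off, chunk);   /* one chunk-sized load */
        result |= piece << (off * 8);        /* shift into place, OR in */
        off += chunk;
      }
    return result;
  }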

This now generates

adrp x0, .LANCHOR0
add x0, x0, :lo12:.LANCHOR0
sub sp, sp, #16
ldr w1, [x0, 120]
str w1, [sp, 8]
ldr x0, [x0, 112]
ldr x1, [sp, 8]
add sp, sp, 16

instead of:

adrp x3, .LANCHOR0
add x3, x3, :lo12:.LANCHOR0
mov x0, 0
mov x1, 0
sub sp, sp, #16
ldr x2, [x3, 112]
ldr w3, [x3, 120]
add sp, sp, 16
ubfx x5, x2, 8, 8
bfi x0, x2, 0, 8
ubfx x4, x2, 16, 8
lsr w9, w2, 24
bfi x0, x5, 8, 8
ubfx x7, x2, 32, 8
ubfx x5, x2, 40, 8
ubfx x8, x3, 8, 8
bfi x0, x4, 16, 8
bfi x1, x3, 0, 8
ubfx x4, x2, 48, 8
ubfx x6, x3, 16, 8
bfi x0, x9, 24, 8
bfi x1, x8, 8, 8
lsr x2, x2, 56
lsr w3, w3, 24
bfi x0, x7, 32, 8
bfi x1, x6, 16, 8
bfi x0, x5, 40, 8
bfi x1, x3, 24, 8
bfi x0, x4, 48, 8
bfi x0, x2, 56, 8

To load a 12 1-byte element struct.

and

adrp x0, .LANCHOR0
add x0, x0, :lo12:.LANCHOR0
sub sp, sp, #16
ldrb w1, [x0, 18]
ldrh w0, [x0, 16]
orr w0, w0, w1, lsr 16
str w0, [sp, 8]
add sp, sp, 16

instead of

adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
mov x0, 0
sub sp, sp, #16
ldrh w1, [x2, 16]
ldrb w2, [x2, 18]
add sp, sp, 16
bfi x0, x1, 0, 8
ubfxx1, x1, 8, 8
bfi x0, x1, 8, 8
bfi x0, x2, 16, 8

These changes only have an effect on structures smaller than 16 bytes.
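
(Both sequences above come from copies of small all-char structs in the style
of gcc.c-torture/compile/structs.c; a reduced, hypothetical example of the
shape involved:

  struct s12 { char a, b, c, d, e, f, g, h, i, j, k, l; };  /* 12 bytes */
  struct s3  { char a, b, c; };                              /*  3 bytes */

  extern struct s12 g12;  /* globals, hence the .LANCHOR0-relative loads */
  extern struct s3  g3;

  struct s12 get12 (void) { return g12; }
  struct s3  get3  (void) { return g3; }

returning the structs by value in registers is the kind of MEM to REG copy
that produces code of the shape shown.)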

The remaining stores come from an existing incomplete data-flow analysis
which thinks the value on the stack is being used and doesn't mark
the value as dead.

Regression tested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu with no
regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  

* expr.c (copy_blkmode_to_reg): Fix bitsize for targets
with fast unaligned access.
* config/aarch64/aarch64.c (aarch64_expand_movmem):
Add MEM to REG optimized case.diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+ unsigned int max_align = UINTVAL (operands[2]);
+ max_align = n < max_align ? max_align : n;
+ machine_mode mov_mode, dest_mode
+   = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
+ rtx result = gen_reg_rtx (dest_mode);
+ emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+ unsigned int shift_cnt = 0;
+ for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+   {
+	 int nearest = 0;
+	 /* Find the mode to use, but limit the max to TI mode.  */
+	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
+	  nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  reg = gen_rtx_ASHIFT (dest_mode, reg,
+GEN_INT (shift_cnt * BITS_PER_UNIT));
+	  result = gen_rtx_IOR (dest_mode, reg, result);
+  }
+
+dst = adjust_address (dst, dest_mode, 0);
+emit_insn (gen_move_insn (dst, result));
+return true;
+  }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
  1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/expr.c b/gcc/expr.c
index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src
+bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);