Re: [Patch, MIPS] Patch to fix MIPS optimization bug in combine

2015-10-21 Thread pinskia


> On Oct 22, 2015, at 12:44 AM, Steve Ellcey  wrote:
> 
> 
> A bug was reported against the GCC MIPS64 compiler that involves a bad combine
> and this patch fixes the bug.
> 
> When using '-fexpensive-optimizations -march=mips64r2 -mabi=64' GCC is
> combining these instructions:
> 
> (insn 13 12 14 2 (set (reg:DI 206 [ *last_3(D)+-4 ])
>(zero_extend:DI (subreg/s/u:SI (reg:DI 196 [ *last_3(D)+-4 ]) 4))) 
> x.c:21 212 {*zero_extendsidi2_dext}
> (nil))
> 
> (insn 15 14 16 2 (set (reg:DI 208)
>(and:DI (reg:DI 207)
>(const_int 4294967295 [0xffffffff]))) x.c:21 182 {*anddi3}
> (expr_list:REG_DEAD (reg:DI 207)
>(nil)))
> 
> (jump_insn 16 15 17 2 (set (pc)
>(if_then_else (ne (reg:DI 206 [ *last_3(D)+-4 ])
>(reg:DI 208))
>(label_ref:DI 35)
>(pc))) x.c:21 473 {*branch_equalitydi}
> (expr_list:REG_DEAD (reg:DI 208)
>(expr_list:REG_DEAD (reg:DI 206 [ *last_3(D)+-4 ])
>(int_list:REG_BR_PROB 8010 (nil))))
> -> 35)
> 
> Into:
> 
> (jump_insn 16 15 17 2 (set (pc)
>(if_then_else (ne (subreg:SI (reg:DI 207) 4)
>(subreg:SI (reg:DI 196 [ *last_3(D)+-4 ]) 4))
>(label_ref:DI 35)
>(pc))) x.c:21 472 {*branch_equalitysi}
> (expr_list:REG_DEAD (reg:DI 207)
>(int_list:REG_BR_PROB 8010 (nil)))
> -> 35)
> 
> 
> The problem is that the jump_insn on MIPS64 generates a beq/bne instruction
> that compares the entire 64 bit registers and there is no option to only
> look at the bottom 32 bits.  When we got rid of insn 15 we lost the AND that
> cleared the upper 32 bits of one of the registers and the program fails.
> 
> My solution was to not allow subregs in the conditional jump instruction.
> Here is the patch and a test case and I ran the GCC testsuite with no
> regressions.
> 
> OK to checkin?

No, this is the wrong approach. The problem is in combine. I had submitted a 
patch to fix it a while back, but I never got around to addressing the comments 
to make it consistent with the rest of combine. 

Let me dig up my patch in a few minutes. 

Thanks,
Andrew
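
For illustration, a minimal C sketch (not Steve's attached testcase) of the
kind of check that breaks: once the AND / zero-extension of the operands has
been combined away, the 64-bit beq/bne compares whatever happens to sit in
bits 32-63.

#include <stdint.h>

/* Sketch only: a correct low-32-bit comparison.  The bad combine
   conceptually turns this into a plain 64-bit "a == b", which is only
   valid if bits 32-63 of both operands are already known to be zero.  */
int
low32_equal (uint64_t a, uint64_t b)
{
  return (a & 0xffffffffu) == (b & 0xffffffffu);
}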

> 
> Steve Ellcey
> sell...@imgtec.com
> 
> 
> 2015-10-21  Steve Ellcey  
> 
>* mips.c (mips_legitimate_combined_insn): New function.
>(TARGET_LEGITIMATE_COMBINED_INSN): New macro.
> 
> 
> diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
> index 0b4a5fa..4338628 100644
> --- a/gcc/config/mips/mips.c
> +++ b/gcc/config/mips/mips.c
> @@ -19606,6 +19606,26 @@ mips_ira_change_pseudo_allocno_class (int regno, 
> reg_class_t allocno_class)
> return GR_REGS;
>   return allocno_class;
> }
> +
> +/* Implement TARGET_LEGITIMATE_COMBINED_INSN */
> +
> +static bool
> +mips_legitimate_combined_insn (rtx_insn* insn)
> +{
> +  /* If we do a conditional jump involving register compares do not allow
> + subregs because beq/bne will always compare the entire register.
> + This should only be an issue with N32/N64 ABI's that do a 32 bit
> + compare and jump.  */
> +
> +  if (any_condjump_p (insn))
> +{
> +  rtx cond = XEXP (SET_SRC (pc_set (insn)), 0);
> +  if (GET_RTX_CLASS (GET_CODE (cond)) == RTX_COMPARE
> +  || GET_RTX_CLASS (GET_CODE (cond)) == RTX_COMM_COMPARE)
> +return !SUBREG_P (XEXP (cond, 0)) && !SUBREG_P (XEXP (cond, 1));
> +}
> +  return true;
> +}
> 
> /* Initialize the GCC target structure.  */
> #undef TARGET_ASM_ALIGNED_HI_OP
> @@ -19868,6 +19888,9 @@ mips_ira_change_pseudo_allocno_class (int regno, 
> reg_class_t allocno_class)
> #undef TARGET_HARD_REGNO_SCRATCH_OK
> #define TARGET_HARD_REGNO_SCRATCH_OK mips_hard_regno_scratch_ok
> 
> +#undef TARGET_LEGITIMATE_COMBINED_INSN
> +#define TARGET_LEGITIMATE_COMBINED_INSN mips_legitimate_combined_insn
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
> 
> #include "gt-mips.h"
> 
> 
> 
> 
> 
> 2015-10-21  Steve Ellcey  
> 
>* gcc.dg/combine-subregs.c: New test.
> 
> 
> diff --git a/gcc/testsuite/gcc.dg/combine-subregs.c 
> b/gcc/testsuite/gcc.dg/combine-subregs.c
> index e69de29..c480f88 100644
> --- a/gcc/testsuite/gcc.dg/combine-subregs.c
> +++ b/gcc/testsuite/gcc.dg/combine-subregs.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fexpensive-optimizations" } */
> +
> +#include 
> +#include 
> +
> +void __attribute__ ((noinline))
> +foo (uint64_t state, uint32_t last)
> +{
> +  if (state == last) abort ();
> +}
> +
> +/* This function may do a bad comparison by trying to
> +   use SUBREGS during the compare on machines where comparing
> +   two registers always compares the entire register regardless
> +   of mode.  */
> +
> +int __attribute__ ((noinline))
> +compare (uint64_t state, uint32_t *last, uint8_t buf)
> +{
> +if (*last == ((state | buf) & 0xffffffff)) {
> +foo (state, *last);
> +return 0;
> +}
> +return 1;
> +}
> +
> +int
> +main(int argc, char **argv) {
> +uint64_t state = 0xF0100U;
> +   

Re: [PATCH 1/7] Libsanitizer merge from upstream r249633.

2015-10-15 Thread pinskia


> On Oct 15, 2015, at 1:42 AM, Renato Golin  wrote:
> 
>> On 15 October 2015 at 08:29, Yury Gribov  wrote:
>> Do you have any estimation for when full AArch64 support is ready in LLVM?
>> If it's still months away, I wonder if we may want to enable at least
>> current (partial) support for non-Thunder users.
> 
> Hi Yury,
> 
> Unfortunately, no. Basic support is there, better support is a few
> weeks away (fingers crossed), but full support may take years (and
> we're not promising it :).
> 
> However, I can give you an overview of what is done and what is left
> to do, so you can have your own judgement.
> 
> Last year, Christophe added ASAN to AArch64, but because of multiple
> problems with the tests on environments we didn't have (the official
> buildbot was a Juno outside of Linaro), we couldn't enable it by
> default. So, support for ASAN and all other sanitizers on AArch64 was
> put on hold. Since January this year, we've been studying TSAN, and
> realised that there were many shared problems with ASAN, so we decided
> to fix the buildbot problem and implement the other sanitizers.
> 
> Fast-forward to today, we have almost all sanitizers "working" on
> AArch64. Not all features work (Adhemerval knows more about this), and
> it doesn't work on every AArch64 hardware out there, namely those that
> we don't have access to. But there are some things that do work, and
> they are being tested on every commit since a few months ago. This
> means that you can try it out, see what works and what doesn't, and
> let us know by creating a bug in LLVM's bugzilla on Compiler-RT. It
> doesn't mean we'll fix everything, but it means we'll know what's
> broken, so we can plan better.
> 
> One hint to understand the level of support is to look at the tests.
> We use two ways of marking tests: XFAIL, as usual, says the test
> *must* fail, and normally means (merits aside), that a sanitizer
> feature is not implemented, or the test is exclusive for a different
> architecture. The other way is by marking it as "REQUIRES:
> stable-runtime", which is for tests that fail on some platforms but
> not others. This normally means an underlying (internal) sanitizer
> feature is not present or too generic, and the test is not good enough
> to detect it. None of that is good, we know, but it is what it is, and
> as you can imagine, the priority is to get things working first.
> 
> For the future, there are still a few things we have to do:
> 
> 1. Finish adding all the sanitizers. Because they share a lot of code
> and execution, enabling one at a time and fixing all the bugs would
> make us ignore the problems in all the other SANs, and we'd have to
> re-implement a lot again on every new one. Like untangling hair,
> working from inside only makes things worse. This would also allow
> other people to test the sanitizers that they care about and report
> bugs, so we can have a wider view of what's missing and what's broken.
> Adhemerval is on the last ones now and we shall have them upstream,
> tested (modulo XFAIL), in the next few weeks.
> 
> 2. The first critical issue for AArch64 is the VMA problem and is our
> current focus. Some ideas were put forward (including in this thread)
> and right now Adhemerval is investigating Jakub's proposal. It
> shouldn't take long for us to have a prototype, and we'll share our
> findings here as soon as we have it. We'll focus on that problem for
> now and only move to the next step once this is resolved and all SAN
> tests pass on both 39 and 42 VMA. We don't have 48 or 56, so we'll
> rely on you guys (Andrew?) to test the prototype and report back.
> Otherwise, we'll assume working on 39/42 is good enough. This should
> take another few weeks.

Huh? You can get 48 bit VA support on any normal aarch64 kernel without my 
help. It is a standard feature (not enabled by default though). It just 
happens to be required by thunderx but is not unique to thunderx. 52 bit 
support does not exist yet (though it might next year). 

So in summary, just enable 48 bit VA support in the upstream kernel right now; 
there is no need to test on thunderx. So please enable 48 bit VA in the kernel. 
It is supported on a kernel that supports juno, apm and amd processors. 

Note ARMv8.0 and 8.1 support up to 48 bit physical addressing, 2-node 
thunderx uses that, and that is why 48 bit VA is needed for thunderx.

Thanks,
Andrew


> 
> 3. After the VMA issue is resolved, we'll look at clearing the
> "stable-runtime" and XFAIL flags from as many tests as possible.
> Either by implementing the features, or by improving the tests, or by
> moving them into an architecture specific place where they don't get
> tested in AArch64 if they don't have to (ex. x86 specific stuff). But
> I have to warn you, this will be at a much lower priority than the VMA
> issue, and it could take years to finish. Basically, we'll do on a
> case by case basis, what people use, what is 

Re: [RFC, PATCH] Disable -fprofile-use related optimizations if corresponding .gcda file not found.

2015-10-07 Thread pinskia


> On Oct 7, 2015, at 9:28 AM, Maxim Ostapenko  
> wrote:
> 
> 
> 
>> On 07/10/15 19:18, Andrew Pinski wrote:
>> On Wed, Oct 7, 2015 at 9:11 AM, Maxim Ostapenko
>>  wrote:
>>> Hi,
>>> 
>>> when testing OpenSSL performance, I found out that sometimes PGO-built
>>> binaries can actually introduce performance regressions. We could identify
>>> affected object files and disable PGO for them by simply removing
>>> corresponding .gcda file. However, even if profile data is not presented,
>>> GCC doesn't switch back -fprofile-use dependent optimizations (e.g.
>>> -fbranch-probabilities, -fvpt, etc). This may also lead to performance
>>> degradations.
>>> 
>>> The issue was already raised quite some time ago
>>> (https://gcc.gnu.org/ml/gcc-patches/2009-09/msg02119.html), but for some
>>> reason wasn't discussed.
>>> 
>>> Here is a draft patch that disables -fprofile-use related optimizations if
>>> profile data wasn't found (perhaps it makes sense to introduce a special
>>> switch for this?). Does this look reasonable?
>> I thought if profile is not present, then branch probabilities goes
>> back to the original heuristics?
>> Which option is really causing the performance degradation here?
> 
> -fprofile-use enabled -fpeel-loops that in turn enabled -frename-registers. 
> This caused the scheduler to transform the code in sched2 pass.

Why not figure out how to improve that instead? Register renaming just renames 
register usage and does not undo any scheduling at all (well, it might cause 
some moves to be removed).  I think you really should dig deeper into why it is 
causing a performance issue. 

> 
>> Also I think your patch is very incomplete as someone could use
>> -frename-registers with -fprofile-use and then you just turned it off.
>> 
>> Thanks,
>> Andrew Pinski
> 
> Doesn't -fprofile-use enable -frename-registers transitively through 
> -fpeel-loops?

Yes. But I am saying someone could do -fprofile-use -frename-registers and 
expect -frename-registers to stay on even if there is no profile. 

Thanks,
Andrew


> 
>>> Thanks,
>>> -Maxim
> 


Re: [Patch match.pd] Add a simplify rule for x * copysign (1.0, y);

2015-10-01 Thread pinskia

> On Oct 1, 2015, at 7:51 AM, James Greenhalgh  wrote:
> 
> On Thu, Oct 01, 2015 at 03:28:22PM +0100, pins...@gmail.com wrote:
>>> 
>>> On Oct 1, 2015, at 6:57 AM, James Greenhalgh  
>>> wrote:
>>> 
>>> 
>>> Hi,
>>> 
>>> If it is cheap enough to treat a floating-point value as an integer and
>>> to do bitwise arithmetic on it (as it is for AArch64) we can rewrite:
>>> 
>>> x * copysign (1.0, y)
>>> 
>>> as:
>>> 
>>> x ^ (y & (1 << sign_bit_position))
>> 
>> Why not just convert it to copysign (x, y) instead and let expand choose
>> the better implementation?
> 
> Because that transformation is invalid :-)
> 
> let x = -1.0, y = -1.0
> 
>  x * copysign (1.0, y)
>=  -1.0 * copysign (1.0, -1.0)
>= -1.0 * -1.0
>= 1.0
> 
>  copysign (x, y)
>= copysign (-1.0, -1.0)
>= -1.0
> 
> Or have I completely lost my maths skills :-)

No, you are correct. Note I would rather see the copysign form at the tree level 
and have the integer form at the RTL level. So placing this in expand would be 
better than in match.pd. 

Thanks,
Andrew
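
For concreteness, a sketch of the bitwise form being discussed, assuming
IEEE-754 binary32 and memcpy-based bit punning (illustrative only, not code
from the patch):

#include <stdint.h>
#include <string.h>

/* x * copysign (1.0f, y) flips the sign of x exactly when y is negative,
   i.e. it xors the sign bit of y into x; the behaviour only matches for
   finite, non-trapping values, per the caveat above.  */
static float
mult_copysign_one (float x, float y)
{
  uint32_t xb, yb;
  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  xb ^= yb & 0x80000000u;
  memcpy (&x, &xb, sizeof x);
  return x;
}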

> 
>> Also I think this can only be done for finite and non trapping types. 
> 
> That may well be true; I swithered either way and went for no checks, but
> I'd happily go back on that and wrap this in something suitable restrictive
> if I need to.
> 
> Thanks,
> James
> 
> 
>>> 
>>> This patch implements that rewriting rule in match.pd, and a testcase
>>> expecting the transform.
>>> 
>>> This is worth about 6% in 481.wrf for AArch64. I don't know enough
>>> about the x86 microarchitectures to know how productive this transformation
>>> is there. In Spec2006FP I didn't see any interesting results in either
>>> direction. Looking at code generation for the testcase I add, I think the
>>> x86 code generation looks worse, but I can't understand why it doesn't use
>>> a vector-side xor and load the mask vector-side. With that fixed up I think
>>> the code generation would look better - though as I say, I'm not an expert
>>> here...
>>> 
>>> Bootstrapped on both aarch64-none-linux-gnu and x86_64 with no issues.
>>> 
>>> OK for trunk?
>>> 
>>> Thanks,
>>> James
>>> 
>>> ---
>>> gcc/
>>> 
>>> 2015-10-01  James Greenhalgh  
>>> 
>>>   * match.pd (mult (COPYSIGN:s real_onep @0) @1): New simplifier.
>>> 
>>> gcc/testsuite/
>>> 
>>> 2015-10-01  James Greenhalgh  
>>> 
>>>   * gcc.dg/tree-ssa/copysign.c: New.
>>> 
>>> <0001-Patch-match.pd-Add-a-simplify-rule-for-x-copysign-1..patch>
>> 


Re: [Patch match.pd] Add a simplify rule for x * copysign (1.0, y);

2015-10-01 Thread pinskia
> 
> On Oct 1, 2015, at 6:57 AM, James Greenhalgh  wrote:
> 
> 
> Hi,
> 
> If it is cheap enough to treat a floating-point value as an integer and
> to do bitwise arithmetic on it (as it is for AArch64) we can rewrite:
> 
>  x * copysign (1.0, y)
> 
> as:
> 
>  x ^ (y & (1 << sign_bit_position))

Why not just convert it to copysign (x, y) instead and let expand choose the 
better implementation?  Also I think this can only be done for finite and 
non-trapping types. 

Thanks,
Andrew

> 
> This patch implements that rewriting rule in match.pd, and a testcase
> expecting the transform.
> 
> This is worth about 6% in 481.wrf for AArch64. I don't know enough
> about the x86 microarchitectures to know how productive this transformation
> is there. In Spec2006FP I didn't see any interesting results in either
> direction. Looking at code generation for the testcase I add, I think the
> x86 code generation looks worse, but I can't understand why it doesn't use
> a vector-side xor and load the mask vector-side. With that fixed up I think
> the code generation would look better - though as I say, I'm not an expert
> here...
> 
> Bootstrapped on both aarch64-none-linux-gnu and x86_64 with no issues.
> 
> OK for trunk?
> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2015-10-01  James Greenhalgh  
> 
>* match.pd (mult (COPYSIGN:s real_onep @0) @1): New simplifier.
> 
> gcc/testsuite/
> 
> 2015-10-01  James Greenhalgh  
> 
>* gcc.dg/tree-ssa/copysign.c: New.
> 
> <0001-Patch-match.pd-Add-a-simplify-rule-for-x-copysign-1..patch>


Re: [PATCH] Disable -fno-reorder-blocks-and-partition if no -fprofile-use to avoid unnecessary overhead

2015-09-24 Thread pinskia


> On Sep 24, 2015, at 10:16 AM, Teresa Johnson  wrote:
> 
> This patch unsets -freorder-blocks-and-partition when -fprofile-use
> is not specified. Function splitting was not actually being performed
> in that case, as probably_never_executed_bb_p does not distinguish
> any basic blocks as being cold vs hot when there is no profile data.
> 
> Leaving it enabled, however, causes the assembly code generator to create
> (empty) cold sections and labels, leading to unnecessary size overhead.
> 
> Bootstrapped and tested on x86-64-unknown-linux-gnu. Ok for trunk?

This might be OK for now, but there is a notion to enable it for the 
non-profile case. 

Thanks,
Andrew




> 
> Thanks,
> Teresa
> 
> 2015-09-24  Teresa Johnson  
> 
>* opts.c (finish_options): Unset -freorder-blocks-and-partition
>if not using profile.
> 
> Index: opts.c
> ===
> --- opts.c  (revision 228062)
> +++ opts.c  (working copy)
> @@ -821,7 +821,17 @@ finish_options (struct gcc_options *opts, struct g
>   opts->x_flag_reorder_blocks = 1;
> }
> 
> +  /* Disable -freorder-blocks-and-partition when -fprofile-use is not in
> + effect. Function splitting was not actually being performed in that 
> case,
> + as probably_never_executed_bb_p does not distinguish any basic blocks as
> + being cold vs hot when there is no profile data. Leaving it enabled,
> + however, causes the assembly code generator to create (empty) cold
> + sections and labels, leading to unnecessary size overhead.  */
>   if (opts->x_flag_reorder_blocks_and_partition
> +  && !opts_set->x_flag_profile_use)
> +opts->x_flag_reorder_blocks_and_partition = 0;
> +
> +  if (opts->x_flag_reorder_blocks_and_partition
>   && !opts_set->x_flag_reorder_functions)
> opts->x_flag_reorder_functions = 1;
> 
> 
> -- 
> Teresa Johnson | Software Engineer | tejohn...@google.com | 408-460-2413


Re: [Patch/ccmp] Cost instruction sequences to choose better expand order

2015-09-21 Thread pinskia


> On Sep 21, 2015, at 4:39 AM, Bernd Schmidt  wrote:
> 
>> On 09/18/2015 05:21 PM, Jiong Wang wrote:
>> 
>> Current conditional compare (CCMP) support in GCC aims to optimize
>> short-circuiting for cascaded comparisons, given a simple conditional
>> compare candidate:
>> 
>>   if (a == 17 || a == 32)
> [...]
>> The problem is the current implementation always expands t0 first, then
>> t1, while the expand order needs to consider the rtx costs, because "cmp"
>> and "ccmp" may have different restrictions, so the expand order will
>> result in performance differences.
>> 
>> For example on AArch64, "ccmp" only accepts immediates within -31 ~ 31
>> while "cmp" accepts a wider range, so if we expand "a == 32" in the second
>> step, then it will use "ccmp", and thus an extra "mov reg, 32"
>> instruction is generated because "32" is out of range. Whereas if we
>> expand "a == 32" first, then it's fine as "cmp" can encode it.
> 
> I've played with this patch a bit with an aarch64 cross compiler. First of 
> all - it doesn't seem to work, I get identical costs and the swapping doesn't 
> happen. Did you forget to include a rtx_cost patch?
> 
> I was a little worried about whether this would be expensive for longer 
> sequences of conditions, but it seems like it looks only at leafs where we 
> have two comparisons, so that cost should be minimal. However, it's possible 
> there's room for improvement in code generation. I was curious and looked at 
> a slightly more complex testcase
> 
> int
> foo (int a)
> {
>  if (a == 17 || a == 32 || a == 47 || a == 53 || a == 66 || a == 72)
>return 1;
>  else
>return 2;
> }
> 
> and this doesn't generate a sequence of ccmps as might have been expected; we 
> only get pairs of comparisons merged with a bit_ior:
> 
>  D.2699 = a == 17;
>  D.2700 = a == 32;
>  D.2701 = D.2699 | D.2700;
>  if (D.2701 != 0) goto ; else goto ;
>  :
>  D.2703 = a == 47;
>  D.2704 = a == 53;
>  D.2705 = D.2703 | D.2704;
>  if (D.2705 != 0) goto ; else goto ;
> 
> and the ccmp expander doesn't see the entire thing. I found that a little 
> surprising TBH.

This is a known issue with fold-const folding too early. Replace || with | and 
add some parentheses and you should get a string of ccmp's. 

Thanks,
Andrew
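
For illustration, the source-level workaround being suggested, written as a
hypothetical rewrite of Bernd's testcase (not something posted in the thread):
with | the whole condition stays one expression, so the ccmp expander sees the
full chain instead of the pairs of comparisons merged with bit_ior.

int
foo (int a)
{
  /* Bitwise | keeps the condition as a single expression tree for the
     ccmp expander, rather than the partially folded || form above.  */
  if ((a == 17) | (a == 32) | (a == 47) | (a == 53) | (a == 66) | (a == 72))
    return 1;
  else
    return 2;
}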


> 
> 
> Bernd


Re: [PATCH 0/3] remove tm.h from libobjc/sendmsg.c

2015-09-10 Thread pinskia




> On Sep 10, 2015, at 7:57 PM, tbsaunde+...@tbsaunde.org wrote:
> 
> From: Trevor Saunders 
> 
> Hi,
> 
> I thought I'd see how hard it is to get tm.h out of libobjc/ so it wouldn't be
> necessary to check there for uses of target macros.

I once had a branch which did this and then I got busy with other things. The 
libobjc side should still apply. 

Thanks,
Andrew


> 
> each patch individually bootstrapped + regtested on x86_64-linux-gnu, ok?
> 
> Trev
> 
> 
> Trevor Saunders (3):
>  remove STRUCT_VALUE macro
>  remove unused defines from sendmsg.c
>  stop including tm.h in sendmsg.c
> 
> gcc/config/arc/arc.h   |  4 
> gcc/config/lm32/lm32.h |  2 --
> gcc/config/mep/mep.h   |  2 --
> gcc/config/visium/visium.h |  8 
> gcc/system.h   |  2 +-
> libobjc/sendmsg.c  | 14 --
> 6 files changed, 1 insertion(+), 31 deletions(-)
> 
> -- 
> 2.4.0
> 


Re: [0/7] Type promotion pass and elimination of zext/sext

2015-09-07 Thread pinskia




> On Sep 7, 2015, at 7:22 PM, Kugan  wrote:
> 
> 
> 
> On 07/09/15 20:46, Wilco Dijkstra wrote:
>>> Kugan wrote:
>>> 2. vector-compare-1.c from c-c++-common/torture fails to assemble with
>>> -O3 -g Error: unaligned opcodes detected in executable segment. It works
>>> fine if I remove the -g. I am looking into it and needs to be fixed as well.
>> 
>> This is a known assembler bug I found a while back, Renlin is looking into 
>> it.
>> Basically when debug tables are inserted at the end of a code section the 
>> assembler doesn't align to the alignment required by the debug tables.
> 
> This is precisely what seems to be happening. Renlin, could you please
> let me know when you have a patch (even if it is a prototype or a hack).


I had noticed that, but I read through the assembler code and it sounded very 
much like it was designed this way and that the compiler was not supposed to 
emit assembly like this and expect the assembler to fix up the alignment. 

Thanks,
Andrew

> 
> Thanks,
> Kugan


Re: [PATCH] 2015-07-31 Benedikt Huber <benedikt.hu...@theobroma-systems.com> Philipp Tomsich <philipp.toms...@theobroma-systems.com>

2015-09-03 Thread pinskia




> On Sep 3, 2015, at 11:58 PM, Sebastian Pop  wrote:
> 
> On Wed, Aug 26, 2015 at 11:58 AM, Benedikt Huber
>  wrote:
>> ping
>> 
>> [PATCH v4][aarch64] Implemented reciprocal square root (rsqrt) estimation in 
>> -ffast-math
>> 
>> https://gcc.gnu.org/ml/gcc-patches/2015-07/msg02698.html
>> 
>>> On 31 Jul 2015, at 19:05, Benedikt Huber 
>>>  wrote:
>>> 
>>>  * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>>>  rsqrtf.
>>>  * config/aarch64/aarch64-opts.h: -mrecip has a default value
>>>  depending on the core.
>>>  * config/aarch64/aarch64-protos.h: Declare.
>>>  * config/aarch64/aarch64-simd.md: Matching expressions for
>>>  frsqrte and frsqrts.
>>>  * config/aarch64/aarch64-tuning-flags.def: Added
>>>  MRECIP_DEFAULT_ENABLED.
>>>  * config/aarch64/aarch64.c: New functions. Emit rsqrt
>>>  estimation code in fast math mode.
>>>  * config/aarch64/aarch64.md: Added enum entries.
>>>  * config/aarch64/aarch64.opt: Added options -mrecip and
>>>  -mlow-precision-recip-sqrt.
>>>  * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>>>  for frsqrte and frsqrts
>>>  * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
> 
> The patch looks good to me.
> You will need an ARM/AArch64 maintainer to approve your patch: +Ramana

Except it is missing comments before almost all new functions.  Yes, the 
aarch64 backend does not always follow that rule, but that does not mean you 
should skip them too. 

Thanks,
Andrew

> 
> Thanks,
> Sebastian
> 
>>> 
>>> Signed-off-by: Philipp Tomsich 
>>> ---
>>> gcc/ChangeLog  |  21 
>>> gcc/config/aarch64/aarch64-builtins.c  | 104 
>>> 
>>> gcc/config/aarch64/aarch64-opts.h  |   7 ++
>>> gcc/config/aarch64/aarch64-protos.h|   2 +
>>> gcc/config/aarch64/aarch64-simd.md |  27 ++
>>> gcc/config/aarch64/aarch64-tuning-flags.def|   1 +
>>> gcc/config/aarch64/aarch64.c   | 106 
>>> +++-
>>> gcc/config/aarch64/aarch64.md  |   3 +
>>> gcc/config/aarch64/aarch64.opt |   8 ++
>>> gcc/doc/invoke.texi|  19 
>>> gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 
>>> gcc/testsuite/gcc.target/aarch64/rsqrt.c   | 107 
>>> +
>>> 12 files changed, 463 insertions(+), 5 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>>> 
>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>> index 3432adb..3bf3098 100644
>>> --- a/gcc/ChangeLog
>>> +++ b/gcc/ChangeLog
>>> @@ -1,3 +1,24 @@
>>> +2015-07-31  Benedikt Huber  
>>> + Philipp Tomsich  
>>> +
>>> + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>>> + rsqrtf.
>>> + * config/aarch64/aarch64-opts.h: -mrecip has a default value
>>> + depending on the core.
>>> + * config/aarch64/aarch64-protos.h: Declare.
>>> + * config/aarch64/aarch64-simd.md: Matching expressions for
>>> + frsqrte and frsqrts.
>>> + * config/aarch64/aarch64-tuning-flags.def: Added
>>> + MRECIP_DEFAULT_ENABLED.
>>> + * config/aarch64/aarch64.c: New functions. Emit rsqrt
>>> + estimation code in fast math mode.
>>> + * config/aarch64/aarch64.md: Added enum entries.
>>> + * config/aarch64/aarch64.opt: Added options -mrecip and
>>> + -mlow-precision-recip-sqrt.
>>> + * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>>> + for frsqrte and frsqrts
>>> + * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
>>> +
>>> 2015-07-08  Jiong Wang  
>>> 
>>>  * config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
>>> diff --git a/gcc/config/aarch64/aarch64-builtins.c 
>>> b/gcc/config/aarch64/aarch64-builtins.c
>>> index b6c89b9..b4f443c 100644
>>> --- a/gcc/config/aarch64/aarch64-builtins.c
>>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>>> @@ -335,6 +335,11 @@ enum aarch64_builtins
>>>  AARCH64_BUILTIN_GET_FPSR,
>>>  AARCH64_BUILTIN_SET_FPSR,
>>> 
>>> +  AARCH64_BUILTIN_RSQRT_DF,
>>> +  AARCH64_BUILTIN_RSQRT_SF,
>>> +  AARCH64_BUILTIN_RSQRT_V2DF,
>>> +  AARCH64_BUILTIN_RSQRT_V2SF,
>>> +  AARCH64_BUILTIN_RSQRT_V4SF,
>>>  AARCH64_SIMD_BUILTIN_BASE,
>>>  AARCH64_SIMD_BUILTIN_LANE_CHECK,
>>> #include "aarch64-simd-builtins.def"
>>> @@ -824,6 +829,42 @@ aarch64_init_crc32_builtins ()
>>> }
>>> 
>>> void
>>> +aarch64_add_builtin_rsqrt (void)

Here. 

>>> +{
>>> +  tree fndecl = NULL;
>>> +  tree ftype = NULL;
>>> +
>>> +  tree V2SF_type_node = build_vector_type 

Re: [PATCH][ARM][3/3] Implement negsicc, notsicc optabs

2015-09-01 Thread pinskia




> On Sep 1, 2015, at 11:04 PM, Kyrill Tkachov  wrote:
> 
> Hi all,
> 
> This third patch implements the new optabs for arm.
> Conveniently, we can reuse the recently refactored *if_neg_move pattern
> and extend it to cover the conditional NOT case.
> Although arm has conditional execution capabilities I have found that
> performing the transformation from patch 1/3 early on benefits arm as well.
> For example for code:
> int
> foonegsi (int a)
> {
>  return a ? 25089992 : -25089992;
> }
> 
> we currently generate:
> movw r2, #55240
> movw r3, #10296
> cmp r0, #0
> movt r2, 382
> movt r3, 65153
> movne r0, r2
> moveq r0, r3
> bx
> 
> whereas with this patch we generate:
> movw r3, #10296
> cmp r0, #0
> movt r3, 65153
> mov r0, r3
> rsbne r0, r3, #0
> bx lr
> 
> 
> In SPEC2006 this doesn't trigger very often, so there were minimal code 
> differences,
> but overall I claim this patch is an improvement.
> 
> Bootstrapped and tested on arm-none-linux-gnueabihf.
> 
> Ok for trunk if the midend changes in 1/3 are approved?

ENOPATCH

> 
> Thanks,
> Kyrill
> 
> 2015-09-01  Kyrylo Tkachov  
> 
>* config/arm/arm.md (sicc): New define_expand.
>(*if_neg_move): Rename to...
>(*if__move): ... This.  Use NOT_NEG code iterator.
>* config/arm/iterators.md (NOT_NEG): New code iterator.
>(NOT_NEG_op): New code attribute.
> 
> 2015-09-01  Kyrylo Tkachov  
> 
>* gcc.target/arm/cond_op_imm_1.c: New test.
> 


Re: [PR66776][PATCH][AARCH64] Add cmovdi_insn_uxtw pattern.

2015-08-06 Thread pinskia




 On Aug 6, 2015, at 11:40 AM, Renlin Li renlin...@arm.com wrote:
 
 Hi all,
 
 This is a simple patch to add a new cmovdi_insn_uxtw rtx pattern to aarch64 
 backend.
 
 For the following simple test case:
 
 unsigned long
 foo (unsigned int a, unsigned int b, unsigned int c)
 {
  return a ? b : c;
 }
 
 With this new pattern, the new code-generation will be:
 
cmp w0, wzr
csel x0, x2, x1, eq

Your example shows you have the wrong operand types to csel. In the AArch64 
ABI, arguments don't need to be zero-extended, and your csel will not be 
zero-extending the arguments.

Note you should also use unsigned long long in the testcase so it is ILP32 and 
LP64 friendly. 

Thanks,
Andrew


ret
 
 Without the patch, the old code-generation is like this:
uxtw x1, w1
cmp w0, wzr
uxtw x2, w2
csel x0, x2, x1, eq
ret
 
 
 aarch64-none-elf regression test Okay. Okay to commit?
 
 Regards,
 Renlin Li
 
 gcc/ChangeLog:
 
 2015-08-06  Renlin Li  renlin...@arm.com
 
PR target/66776
* config/aarch64/aarch64.md (cmovdi_insn_uxtw): New pattern.
 
 gcc/testsuite/ChangeLog:
 
 2015-08-06  Renlin Li  renlin...@arm.com
 
PR target/66776
* gcc.target/aarch64/pr66776.c: New.
 
 
 
 new-1.diff


Re: [PATCH][AArch64] Properly handle simple arith+extend ops in rtx costs

2015-07-28 Thread pinskia

 On Jul 28, 2015, at 3:25 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi all,
 
 Currently we assign the wrong rtx cost to instructions of the form
  add x0, x0, x1, sxtw
 
 that is, an arith operation plus a single extend (no shifting).
 We correctly catch the cases where the extend is inside a shift, but
 not the simple case.
 
 This patch fixes that oversight by catching the simple case in
 aarch64_rtx_arith_op_extract_p and thus making sure that it gets
 assigned the alu.extend_arith extra cost.

This patch reminds me, on thunderx the cost for add with sign extend is 
different from add with zero extend.  The zero extend case is the same as a 
normal add while sign extend is one extra cycle. So soon we need to split 
extend to zextend and sextend.  

Thanks,
Andrew

 
 Bootstrapped and tested on aarch64.
 
 Ok for trunk?
 Thanks,
 Kyrill
 
 
 2015-07-28  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* config/aarch64/aarch64.c (aarch64_rtx_arith_op_extract_p):
Handle simple SIGN_EXTEND or ZERO_EXTEND.
(aarch64_rtx_costs): Properly strip extend or extract before
passing down to rtx costs again.
 aarch64-arith-extend-costs.patch


Re: [Revert][AArch64] PR 63521 Define REG_ALLOC_ORDER/HONOR_REG_ALLOC_ORDER

2015-07-27 Thread pinskia




 On Jul 27, 2015, at 2:26 AM, Jiong Wang jiong.w...@arm.com wrote:
 
 
 Andrew Pinski writes:
 
 On Fri, Jul 24, 2015 at 2:07 AM, Jiong Wang jiong.w...@arm.com wrote:
 
 James Greenhalgh writes:
 
 On Wed, May 20, 2015 at 01:35:41PM +0100, Jiong Wang wrote:
 Current IRA still use both target macros in a few places.
 
 Tell IRA to use the order we defined rather than its own cost
 calculation. Allocate caller saved first, then callee saved.
 
 This is especially useful for LR/x30, as it's free to allocate and is
 pure caller saved when used in leaf function.
 
 Haven't noticed significant impact on benchmarks, but by grepping some
 keywords like Spilling, Push.*spill etc in ira rtl dump, the number
 is smaller.
 
 OK for trunk?
 
 OK, sorry for the delay.
 
 It might be mail client mangling, but please check that the trailing 
 slashes
 line up in the version that gets committed.
 
 Thanks,
 James
 
 2015-05-19  Jiong. Wang  jiong.w...@arm.com
 
 gcc/
  PR 63521
  * config/aarch64/aarch64.h (REG_ALLOC_ORDER): Define.
  (HONOR_REG_ALLOC_ORDER): Define.
 
 Patch reverted.
 
 I did not see a reason why this patch was reverted.  Maybe I am
 missing an email or something.
 
 There are several execution regressions in the gcc testsuite; as far as I
 can see this patch exposed hidden bugs in those testcases, but there might
 be one other issue, so to be conservative, I temporarily reverted this
 patch.

If you are talking about:
gcc.target/aarch64/aapcs64/func-ret-2.c execution
Etc.

These test cases are too dependent on the original register allocation order 
and really can be safely ignored. Really these three tests should be moved or 
written in a more sane way. 

Thanks,
Andrew

 
 
 Thanks,
 Andrew
 
 
 
 -- 
 Regards,
 Jiong
 


Re: [PATCH] Use single shared memory block pool for all pool allocators

2015-07-26 Thread pinskia




 On Jul 26, 2015, at 11:50 AM, Andi Kleen a...@firstfloor.org wrote:
 
 Mikhail Maltsev malts...@gmail.com writes:
 
 Hi, all!
 Recently I did some profiling of GCC to find hotspots and areas of possible
 performance improvement among them. glibc malloc(3) is one of (perhaps
 known)
 
 I've been compiling gcc with tcmalloc to do a similar speedup. It would be
 interesting to compare that to your patch.
 
 Another useful optimization is to adjust the allocation size to be >=
 2MB. Then modern Linux kernels often can give you a large page,
 which cuts down TLB overhead. I did similar changes some time
 ago for the garbage collector.

Unless you are running with 64k pages which I do all the time on my armv8 
system.  

Thanks,
Andrew


 
 BTW I saw big differences in larger LTO builds.
 
 -Andi
 
 -- 
 a...@linux.intel.com -- Speaking for myself only


Re: [PATCH 1/4] convert ASM_OUTPUT_ASCII to a hook

2015-07-24 Thread pinskia




 On Jul 24, 2015, at 7:36 PM, tbsaunde+...@tbsaunde.org wrote:
 
 From: Trevor Saunders tbsaunde+...@tbsaunde.org
 
 gcc/ChangeLog:
 
 2015-07-25  Trevor Saunders  tsaund...@mozilla.com
 
* defaults.h (ASM_OUTPUT_ASCII): Remove default definition.
* doc/tm.texi: Regenerate.
* doc/tm.texi.in (ASM_OUTPUT_ASCII): Remove documentation of
removed macro.
* target.def (output_ascii): New hook.
* config/arm/aout.h, config/arm/arm-protos.h, config/arm/arm.c,
config/elfos.h, config/i386/att.h, config/i386/i386-protos.h,
config/i386/i386.c, config/i386/i386elf.h, config/mips/mips.c,
config/mips/mips.h, config/mmix/mmix-protos.h, config/mmix/mmix.c,
config/mmix/mmix.h, config/nvptx/nvptx-protos.h, config/nvptx/nvptx.c,
config/nvptx/nvptx.h, config/pa/pa-protos.h, config/pa/pa.c,
config/pa/pa.h, config/pdp11/pdp11-protos.h, config/pdp11/pdp11.c,
config/pdp11/pdp11.h, config/rs6000/rs6000-protos.h,
config/rs6000/rs6000.c, config/rs6000/xcoff.h, dwarf2asm.c,
output.h, varasm.c, varasm.h, vmsdbgout.c: Adjust.
 ---
 gcc/config/arm/aout.h |  5 +--
 gcc/config/arm/arm-protos.h   |  2 +-
 gcc/config/arm/arm.c  |  7 ++--
 gcc/config/elfos.h|  7 ++--
 gcc/config/i386/att.h | 13 +--
 gcc/config/i386/i386-protos.h |  2 +
 gcc/config/i386/i386.c| 77 +++
 gcc/config/i386/i386elf.h | 51 +-
 gcc/config/mips/mips.c|  2 +-
 gcc/config/mips/mips.h|  4 +-
 gcc/config/mmix/mmix-protos.h |  2 +-
 gcc/config/mmix/mmix.c|  4 +-
 gcc/config/mmix/mmix.h|  3 +-
 gcc/config/nvptx/nvptx-protos.h   |  2 +-
 gcc/config/nvptx/nvptx.c  |  2 +-
 gcc/config/nvptx/nvptx.h  |  5 +--
 gcc/config/pa/pa-protos.h |  2 +-
 gcc/config/pa/pa.c|  7 ++--
 gcc/config/pa/pa.h|  3 +-
 gcc/config/pdp11/pdp11-protos.h   |  2 +-
 gcc/config/pdp11/pdp11.c  |  6 +--
 gcc/config/pdp11/pdp11.h  |  3 +-
 gcc/config/rs6000/rs6000-protos.h |  2 +-
 gcc/config/rs6000/rs6000.c|  6 +--
 gcc/config/rs6000/xcoff.h |  2 +-
 gcc/defaults.h| 40 
 gcc/doc/tm.texi   | 14 +++
 gcc/doc/tm.texi.in| 11 +-
 gcc/dwarf2asm.c   |  2 +-
 gcc/output.h  |  2 +-
 gcc/target.def|  9 +
 gcc/varasm.c  | 47 
 gcc/varasm.h  |  2 +
 gcc/vmsdbgout.c   |  7 ++--
 34 files changed, 179 insertions(+), 176 deletions(-)
 
 diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h
 index 6973d15..c5e63eb 100644
 --- a/gcc/config/arm/aout.h
 +++ b/gcc/config/arm/aout.h
 @@ -241,9 +241,8 @@
   while (0)
 
 
 -#undef  ASM_OUTPUT_ASCII
 -#define ASM_OUTPUT_ASCII(STREAM, PTR, LEN)  \
 -  output_ascii_pseudo_op (STREAM, (const unsigned char *) (PTR), LEN)
 +#undef  TARGET_ASM_OUTPUT_ASCII
 +#define TARGET_ASM_OUTPUT_ASCII output_ascii_pseudo_op
 
 /* Output a gap.  In fact we fill it with nulls.  */
 #undef  ASM_OUTPUT_SKIP
 diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
 index 62f91ef..6fc4787d 100644
 --- a/gcc/config/arm/arm-protos.h
 +++ b/gcc/config/arm/arm-protos.h
 @@ -144,7 +144,7 @@ extern int arm_attr_length_move_neon (rtx_insn *);
 extern int arm_address_offset_is_imm (rtx_insn *);
 extern const char *output_add_immediate (rtx *);
 extern const char *arithmetic_instr (rtx, int);
 -extern void output_ascii_pseudo_op (FILE *, const unsigned char *, int);
 +extern void output_ascii_pseudo_op (FILE *, const char *, size_t);
 extern const char *output_return_instruction (rtx, bool, bool, bool);
 extern void arm_poke_function_name (FILE *, const char *);
 extern void arm_final_prescan_insn (rtx_insn *);
 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
 index eeab8a8..f03abf7 100644
 --- a/gcc/config/arm/arm.c
 +++ b/gcc/config/arm/arm.c
 @@ -18963,14 +18963,13 @@ int_log2 (HOST_WIDE_INT power)
 #define MAX_ASCII_LEN 51
 
 void
 -output_ascii_pseudo_op (FILE *stream, const unsigned char *p, int len)
 +output_ascii_pseudo_op (FILE *stream, const char *p, size_t len)

You changed the signedness of p here for some targets. Can you confirm that 
this code does not depend on if char is signed or unsigned?  Almost all the 
rest of the target macro conversions look correct and kept the casting. Just 
this one seems missing. 

Thanks,
Andrew
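
A tiny self-contained illustration of the concern (hypothetical, not from the
patch): the value seen for a byte >= 0x80 depends on whether plain char is
signed, which is exactly what dropping the unsigned char handling can expose.

#include <stdio.h>

int
main (void)
{
  const char p[1] = { '\xe9' };
  int as_plain_char = p[0];                 /* -23 where char is signed, 233 where unsigned */
  int as_unsigned = (unsigned char) p[0];   /* always 233 */
  printf ("%d %d\n", as_plain_char, as_unsigned);
  return 0;
}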

 {
 -  int i;
   int len_so_far = 0;
 
    fputs ("\t.ascii\t\"", stream);
 
 -  for (i = 0; i < len; i++)
 +  for (size_t i = 0; i < len; i++)
 {
   int c = p[i];
 
 @@ -19586,7 +19585,7 @@ arm_poke_function_name (FILE *stream, const char 
 *name)
   length  = strlen (name) + 1;
   alignlength = ROUND_UP_WORD (length);
 
 -  ASM_OUTPUT_ASCII (stream, name, length);
 +  

Re: Fold some equal to and not equal to patterns in match.pd

2015-07-21 Thread pinskia




 On Jul 21, 2015, at 3:31 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 
 On 21/07/15 10:26, pins...@gmail.com wrote:
 
 
 
 On Jul 21, 2015, at 2:15 AM, Hurugalawadi, Naveen 
 naveen.hurugalaw...@caviumnetworks.com wrote:
 
 Hi,
 
 Please find attached the patch which performs following patterns folding
 in match.pd:-
 
 a ==/!= a p+ b to b ==/!= 0.
 a << N ==/!= 0 to a & (-1 >> N) ==/!= 0.
 a * N ==/!= 0 where N is a power of 2 to a & (-1 >> N2) ==/!= 0 where N2 is 
 log2 of N.
 
 Please review the same and let us know if its okay?
 I should note this shows up in perlbmk in spec 2006.
 
 Yes, I see it triggering there for aarch64, but I also see some undesired 
 effects,
 for example in gcc:
lsl x24, x24, 3
cbz x24, .L1194
 
 now becomes:
and x0, x24, 2305843009213693951
lsl x24, x24, 3
cbz x0, .L1194

Shouldn't the and become a tst instead and the cbz a b.eq?  Oh, I have 
another patch which does that, which is the reason the performance does not 
regress for me on thunderx (tst and branches can merge before issue). 

Thanks,
Andrew
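
For illustration, the equivalence behind the fold (my sketch, using N = 3 on a
32-bit unsigned value; the 64-bit constant above, 2305843009213693951, is just
~0ull >> 3):

/* (x << 3) == 0 holds exactly when the bits that survive the shift are
   zero, i.e. when (x & (~0u >> 3)) == 0, which a target can emit as
   tst + b.eq instead of lsl + cbz.  */
unsigned
shifted_is_zero (unsigned x)
{
  return (x & (~0u >> 3)) == 0;   /* same truth value as (x << 3) == 0 */
}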

 
 Thanks,
 Andrew
 
 Regression Tested on X86_64.
 
 On Behalf of Andrew Pinski.
 
 Thanks,
 
 gcc/testsuite/ChangeLog:
 
 2015-01-21  Andrew Pinski  apin...@cavium.com
 
* testsuite/gcc.dg/tree-ssa/compare-shiftmult-1.c: New testcase.
* testsuite/gcc.dg/tree-ssa/compare_pointers-1.c: New testcase.
 
 gcc/ChangeLog:
 
 2015-01-21  Andrew Pinski  apin...@cavium.com
 
* match.pd (define_predicates): Add integer_pow2p.
Add pattern for folding of a ==/!= a p+ b to b ==/!= 0.
(unsigned_integral_valued_p): New match.
Add pattern for folding of a<<N ==/!= 0 to a&(-1>>N) ==/!= 0.
Add pattern for folding of a*N ==/!= 0 where N is a power of 2
to a&(-1>>N2) ==/!= 0 where N2 is log2 of N.
 gcc_match.patch
 


Re: Fold some equal to and not equal to patterns in match.pd

2015-07-21 Thread pinskia




 On Jul 21, 2015, at 2:15 AM, Hurugalawadi, Naveen 
 naveen.hurugalaw...@caviumnetworks.com wrote:
 
 Hi,
 
 Please find attached the patch which performs following patterns folding
 in match.pd:-
 
 a ==/!= a p+ b to b ==/!= 0.
 a << N ==/!= 0 to a & (-1 >> N) ==/!= 0.
 a * N ==/!= 0 where N is a power of 2 to a & (-1 >> N2) ==/!= 0 where N2 is 
 log2 of N.
 
 Please review the same and let us know if its okay?

I should note this shows up in perlbmk in spec 2006. 

Thanks,
Andrew

 
 Regression Tested on X86_64.
 
 On Behalf of Andrew Pinski.
 
 Thanks,
 
 gcc/testsuite/ChangeLog:
 
 2015-01-21  Andrew Pinski  apin...@cavium.com
 
* testsuite/gcc.dg/tree-ssa/compare-shiftmult-1.c: New testcase.
* testsuite/gcc.dg/tree-ssa/compare_pointers-1.c: New testcase.
 
 gcc/ChangeLog:
 
 2015-01-21  Andrew Pinski  apin...@cavium.com
 
* match.pd (define_predicates): Add integer_pow2p.
Add pattern for folding of a ==/!= a p+ b to b ==/!= 0.
(unsigned_integral_valued_p): New match.
Add pattern for folding of a<<N ==/!= 0 to a&(-1>>N) ==/!= 0.
Add pattern for folding of a*N ==/!= 0 where N is a power of 2
to a&(-1>>N2) ==/!= 0 where N2 is log2 of N.
 gcc_match.patch


Re: [PATCH][AArch64] Improve csinc/csneg/csinv opportunities on immediates

2015-07-17 Thread pinskia




 On Jul 17, 2015, at 9:58 PM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 
 On 10/07/15 14:45, Kyrill Tkachov wrote:
 On 10/07/15 10:00, pins...@gmail.com wrote:
 
 
 On Jul 10, 2015, at 1:47 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi Andrew,
 
 On 10/07/15 09:40, pins...@gmail.com wrote:
 
 
 
 On Jul 10, 2015, at 1:34 AM, Kyrill Tkachov kyrylo.tkac...@arm.com 
 wrote:
 
 Hi all,
 
 Currently when evaluating expressions like (a ? 24 : 25) we will move 24 
 and 25 into
 registers and perform a csel on them.  This misses the opportunity to 
 instead move just 24
 into a register and then perform a csinc, saving us an instruction and a 
 register use.
 Similarly for csneg and csinv.
 
 This patch implements that idea by allowing such pairs of immediates in 
 *cmov<mode>_insn
 and adding an early splitter that performs the necessary transformation.
 
 The testcase included in the patch demonstrates the kind of 
 opportunities that are now picked up.
 
 With this patch I see about 9.6% more csinc instructions being generated 
 for SPEC2006
 and the generated code looks objectively better (i.e. fewer 
 mov-immediates and slightly
 lower register pressure).
 
 Bootstrapped and tested on aarch64.
 
 Ok for trunk?
 I think this is the wrong place for this optimization. It should happen 
 in expr.c and we should produce cond_expr on the gimple level.
 I had considered it, but I wasn't sure how general the conditional 
 increment/negate/inverse operations
 are to warrant a midend implementation. Do you mean the 
 expand_cond_expr_using_cmove function in expr.c?
 Yes and we can expand it to even have a target hook on how to expand them 
 if needed.
 I played around in that part and it seems that by the time it gets to 
 expansion the midend
 doesn't have a cond_expr of the two immediates, it's a PHI node with the 
 immediates already expanded.
 I have not been able to get it to match a cond_expr of two immediates there, 
 although that could be
 because I'm unfamiliar with that part of the codebase.
 
 So by the time we reach expansion time we don't have a COND_EXPR of two 
 immediates, so I tried getting
 the code in expr.c to do the right thing, but it didn't work out.
 This patch catches this opportunity at the RTL level and could catch such 
 cases if they were to be
 generated by any of the pre-combine RTL passes. Or do you reckon looking for 
 these patterns in RTL
 ifcvt is the way to go? I think it would be somewhat messy to express the 
 CSNEG, CSINV opportunities
 there as we don't have optabs for conditional negate and invert, but 
 conditional increment would work,
 though in the aarch64 case we can only do a conditional add by 1 rather than a 
 general conditional add.

Right, as I said, I have a patch to phiopt to produce the cond_expr when it is 
useful. That is, create the cond_expr before we even get to RTL. 

Thanks,
Andrew


 
 Kyrill
 
 
 
 Kyrill
 
 There is already a standard pattern for conditional add, so the a ? const1 : 
 const2 case can be handled in a generic way without much trouble. We should 
 handle it better in RTL ifcvt too (that should be an easier patch). The 
 neg and not cases are very target specific but can be handled by a target 
 hook that expands directly to them.
 
 
   I have patches to do both but I have not got around to cleaning them 
 up. If anyone wants them, I can send a link to my current gcc 5.1 sources 
 with them included.
 Any chance you can post them on gcc-patches even as a rough idea of what 
 needs to be done?
 I posted my expr patch a few years ago but I never got around to rth's 
 comments. This was the generic increment patch. Basically aarch64 should be 
 implementing that pattern too.
 
 
 The main reason why this should be handled in gimple is that ifcvt on the 
 rtl level is not cheap and does not catch all of the cases the simple 
 expansion of phi-opt does. I can dig that patch up and I will be doing that 
 next week anyways.
 
 Thanks,
 Andrew
 
 Thanks,
 Kyrill
 
   Thanks,
 Andrew
 
 Thanks,
 Kyrill
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
 * config/aarch64/aarch64.md (*cmov<mode>_insn): Move stricter
 check for operands 3 and 4 to pattern predicate.  Allow immediates
 that can be expressed as csinc/csneg/csinv.  New define_split.
 (*csinv3<mode>_insn): Rename to...
 (csinv3<mode>_insn): ... This.
 * config/aarch64/aarch64.h (AARCH64_IMMS_OK_FOR_CSNEG): New macro.
 (AARCH64_IMMS_OK_FOR_CSINC): Likewise.
 (AARCH64_IMMS_OK_FOR_CSINV): Likewise.
 * config/aarch64/aarch64.c (aarch64_imms_ok_for_cond_op_1):
 New function.
 (aarch64_imms_ok_for_cond_op): Likewise.
 * config/aarch64/aarch64-protos.h (aarch64_imms_ok_for_cond_op_1):
 Declare prototype.
 (aarch64_imms_ok_for_cond_op): Likewise.
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
 * gcc.target/aarch64/cond-op-imm_1.c: New test.
 aarch64-csinc-imms.patch
 


Re: [PATCH][AArch64] Handle -|x| case using a single csneg

2015-07-14 Thread pinskia




 On Jul 13, 2015, at 5:48 PM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi all,
 
 For the testcase in the patch we were generating an extra neg instruction:
cmp w0, wzr
csneg   w0, w0, w0, ge
neg w0, w0
ret
 
 instead of the optimal:
cmp w0, wzr
csneg   w0, w0, w0, lt
ret
 
 The reason is that combine tries to merge the operation into a negation of an 
 abs.
 I considered teaching combine not to do that but it would require telling it 
 that it shouldn't
 do it if there is a conditional negate instruction. There's no optab for that 
 though :(
 Also, we already advertise that we have an abs optab, even though we expand 
 to a compare and
 a csneg anyway. This patch was the cleanest way I could do this. We just 
 match the neg of an abs
 and generate the same csneg sequence as for normal abs, just with the 
 comparison condition inverted.
 
 Bootstrapped and tested on aarch64.
 
 Ok for trunk?
 Thanks,
 Kyrill
 
 2015-07-13  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* config/aarch64/aarch64.md (*absneg2mode_insn): New
 define_insn_and_split.

This pattern is incorrect as you need to say you are clobbering the flags 
register. Otherwise an optimization between combine and the splitter can move 
an instruction in between. Also it might be better to just have a define_split 
rather than a define_insn_and_split. Combine knows how to use a define_split 
without it being an insn. 

Thanks,
Andrew

 
 2015-07-13  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* gcc.target/aarch64/neg-abs_1.c: New test.
 abs-neg.patch


Re: [PATCH][AArch64] Improve csinc/csneg/csinv opportunities on immediates

2015-07-10 Thread pinskia




 On Jul 10, 2015, at 1:34 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi all,
 
 Currently when evaluating expressions like (a ? 24 : 25) we will move 24 and 
 25 into
 registers and perform a csel on them.  This misses the opportunity to instead 
 move just 24
 into a register and then perform a csinc, saving us an instruction and a 
 register use.
 Similarly for csneg and csinv.
 
 This patch implements that idea by allowing such pairs of immediates in 
 *cmov<mode>_insn
 and adding an early splitter that performs the necessary transformation.
 
 The testcase included in the patch demonstrates the kind of opportunities 
 that are now picked up.
 
 With this patch I see about 9.6% more csinc instructions being generated for 
 SPEC2006
 and the generated code looks objectively better (i.e. fewer mov-immediates 
 and slightly
 lower register pressure).
 
 Bootstrapped and tested on aarch64.
 
 Ok for trunk?

I think this is the wrong place for this optimization. It should happen in 
expr.c and we should produce cond_expr on the gimple level. 

I have patches to do both but I have not got around to cleaning them up. If 
anyone wants them, I can send a link to my current gcc 5.1 sources with them 
included. 

Thanks,
Andrew
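
For illustration (my own sketch, not from Kyrill's patch): when the two
immediates differ by one, the select collapses to a single constant plus a
conditional increment, which is exactly the csinc form.

/* a ? 24 : 25 is the same as 24 + (a == 0), so only the 24 needs a
   register; roughly: mov w1, 24; cmp w0, 0; csinc w0, w1, w1, ne.  */
int
sel (int a)
{
  return 24 + (a == 0);
}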

 
 Thanks,
 Kyrill
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* config/aarch64/aarch64.md (*cmov<mode>_insn): Move stricter
check for operands 3 and 4 to pattern predicate.  Allow immediates
that can be expressed as csinc/csneg/csinv.  New define_split.
(*csinv3<mode>_insn): Rename to...
(csinv3<mode>_insn): ... This.
* config/aarch64/aarch64.h (AARCH64_IMMS_OK_FOR_CSNEG): New macro.
(AARCH64_IMMS_OK_FOR_CSINC): Likewise.
(AARCH64_IMMS_OK_FOR_CSINV): Likewise.
* config/aarch64/aarch64.c (aarch64_imms_ok_for_cond_op_1):
New function.
(aarch64_imms_ok_for_cond_op): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_imms_ok_for_cond_op_1):
Declare prototype.
(aarch64_imms_ok_for_cond_op): Likewise.
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* gcc.target/aarch64/cond-op-imm_1.c: New test.
 aarch64-csinc-imms.patch


Re: [PATCH][AArch64] Improve csinc/csneg/csinv opportunities on immediates

2015-07-10 Thread pinskia




 On Jul 10, 2015, at 1:47 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi Andrew,
 
 On 10/07/15 09:40, pins...@gmail.com wrote:
 
 
 
 On Jul 10, 2015, at 1:34 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi all,
 
 Currently when evaluating expressions like (a ? 24 : 25) we will move 24 
 and 25 into
 registers and perform a csel on them.  This misses the opportunity to 
 instead move just 24
 into a register and then perform a csinc, saving us an instruction and a 
 register use.
 Similarly for csneg and csinv.
 
 This patch implements that idea by allowing such pairs of immediates in 
 *cmov<mode>_insn
 and adding an early splitter that performs the necessary transformation.
 
 The testcase included in the patch demonstrates the kind of opportunities 
 that are now picked up.
 
 With this patch I see about 9.6% more csinc instructions being generated 
 for SPEC2006
 and the generated code looks objectively better (i.e. fewer mov-immediates 
 and slightly
 lower register pressure).
 
 Bootstrapped and tested on aarch64.
 
 Ok for trunk?
 I think this is the wrong place for this optimization. It should happen in 
 expr.c and we should produce cond_expr on the gimple level.
 
 I had considered it, but I wasn't sure how general the conditional 
 increment/negate/inverse operations
 are to warrant a midend implementation. Do you mean the 
 expand_cond_expr_using_cmove function in expr.c?

Yes, and we can extend it to even have a target hook on how to expand them if 
needed. 

There is already a standard pattern for conditional add, so the a ? const1 : 
const2 case can be handled in a generic way without much trouble. We should 
handle it better in RTL ifcvt too (that should be an easier patch). The neg 
and not cases are very target specific but can be handled by a target hook 
that expands directly to them. 


 
  
 I have patches to do both but I have not got around to cleaning them up. If 
 anyone wants them, I can send a link to my current gcc 5.1 sources with them 
 included.
 
 Any chance you can post them on gcc-patches even as a rough idea of what 
 needs to be done?


I posted my expr patch a few years ago but I never got around to rth's 
comments. This was the generic increment patch. Basically aarch64 should be 
implementing that pattern too. 


The main reason why this should be handled in gimple is that ifcvt on the rtl 
level is not cheap and does not catch all of the cases the simple expansion of 
phi-opt does. I can dig that patch up and I will be doing that next week 
anyways. 

Thanks,
Andrew

 
 Thanks,
 Kyrill
 
  
 Thanks,
 Andrew
 
 Thanks,
 Kyrill
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* config/aarch64/aarch64.md (*cmov<mode>_insn): Move stricter
check for operands 3 and 4 to pattern predicate.  Allow immediates
that can be expressed as csinc/csneg/csinv.  New define_split.
(*csinv3<mode>_insn): Rename to...
(csinv3<mode>_insn): ... This.
* config/aarch64/aarch64.h (AARCH64_IMMS_OK_FOR_CSNEG): New macro.
(AARCH64_IMMS_OK_FOR_CSINC): Likewise.
(AARCH64_IMMS_OK_FOR_CSINV): Likewise.
* config/aarch64/aarch64.c (aarch64_imms_ok_for_cond_op_1):
New function.
(aarch64_imms_ok_for_cond_op): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_imms_ok_for_cond_op_1):
Declare prototype.
(aarch64_imms_ok_for_cond_op): Likewise.
 
 2015-07-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* gcc.target/aarch64/cond-op-imm_1.c: New test.
 aarch64-csinc-imms.patch
 


Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-06-29 Thread pinskia




 On Jun 29, 2015, at 4:44 AM, Dr. Philipp Tomsich 
 philipp.toms...@theobroma-systems.com wrote:
 
 James,
 
 On 29 Jun 2015, at 13:36, James Greenhalgh james.greenha...@arm.com wrote:
 
 On Mon, Jun 29, 2015 at 10:18:23AM +0100, Kumar, Venkataramanan wrote:
 
 -Original Message-
 From: Dr. Philipp Tomsich [mailto:philipp.toms...@theobroma-systems.com]
 Sent: Monday, June 29, 2015 2:17 PM
 To: Kumar, Venkataramanan
 Cc: pins...@gmail.com; Benedikt Huber; gcc-patches@gcc.gnu.org
 Subject: Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt)
 estimation in -ffast-math
 
 Kumar,
 
 This does not come unexpected, as the initial estimation and each iteration
 will add an architecturally-defined number of bits of precision (ARMv8
 guarantees only a minimum number of bits provided per operation… the
 exact number is specific to each micro-arch, though).
 Depending on your architecture and on the required number of precise bits
 by any given benchmark, one may see miscompares.
 
 True.  
 
 I would be very uncomfortable with this approach.
 
 Same here. The default must be safe. Always.
 Unlike other architectures, we don’t have a problem with making the proper
 defaults for “safety”, as the ARMv8 ISA guarantees a minimum number of
 precise bits per iteration.
 
 From Richard Biener's post in the thread Michael Matz linked earlier
 in the thread:
 
   It would follow existing practice of things we allow in
   -funsafe-math-optimizations.  Existing practice in that we
   want to allow -ffast-math use with common benchmarks we care
   about.
 
   https://gcc.gnu.org/ml/gcc-patches/2009-11/msg00100.html
 
 With the solution you seem to be converging on (2-steps for some
 microarchitectures, 3 for others), a binary generated for one micro-arch
 may drop below a minimum guarantee of precision when run on another. This
 seems to go against the spirit of the practice above. I would only support
 adding this optimization to -Ofast if we could keep to architectural
 guarantees of precision in the generated code (i.e. 3-steps everywhere).
 
 I don't object to adding a -mlow-precision-recip-sqrt style option,
 which would be off by default, would enable the 2-step mode, and would
 need to be explicitly enabled (i.e. not implied by -mcpu=foo) but I don't
 see what this buys you beyond the Gromacs boost (and even there you would
 be creating an Invalid Run as optimization flags must be applied across
 all workloads).
 
 Any flag that reduces precision (and thus breaks IEEE floating-point 
 semantics)
 needs to be gated with an “unsafe” flag (i.e. one that is never on by 
 default).
 As a consequence, the “peak”-tuning for SPEC will turn this on… but barely 
 anyone else would.
 
 For the 3-step optimization, it is clear to me that for generic tuning
 we don't want this to be enabled by default; experimental results and advice
 in this thread argue against it for thunderx and cortex-a57 targets.
 However, enabling it based on the CPU tuning selected seems fine to me.
 
 I do not agree on this one, as I would like to see the safe form (i.e. 3 and 5
 iterations respectively) become the default. Most “server-type” chips
 should not see a performance regression, while it will be easier to optimise
 for this in hardware than for a (potentially microcoded) sqrt instruction (and
 subsequent, dependent divide).
 
 I have not heard anyone claim a performance regression (either on thunderx
 or on cortex-a57), but merely heard a “no speed-up”.

Actually it does regress performance on thunderX; I had just assumed that when I 
said it was not going to be a win, that would be taken as a slowdown. It regresses 
gromacs by more than 10% on thunderX, though I can't remember exactly how much as I 
had someone else run it. The latency difference is also over 40%; for example, single 
precision is 29 cycles with div (12) plus sqrt (17) directly vs 42 cycles with the rsqrte 
and 2 iterations of 2 mul/rsqrts (double is 53 vs 60). That is a huge difference right 
there.  ThunderX has a fast div and a fast sqrt for 32-bit and a reasonable one 
for double.  So again this is not just not a win but rather a regression for 
thunderX, and I suspect the same is true of cortex-a57. 

Thanks,
Andrew

 
 So I am strongly in favor of defaulting to the ‘safe’ number of iterations, even
 when compiling for a generic target.
 
 Best,
 Philipp.
 


Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-06-28 Thread pinskia




 On Jun 25, 2015, at 9:44 AM, Kumar, Venkataramanan 
 venkataramanan.ku...@amd.com wrote:
 
 I got around ~12% gain with -Ofast -mcpu=cortex-a57.

I get around 11/12% on thunderX with the patch plus the change decreasing the 
iteration counts (to 1/2), compared to without the patch. 

Thanks,
Andrew


 
 Regards,
 Venkat.
 
 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Dr. Philipp Tomsich
 Sent: Thursday, June 25, 2015 9:13 PM
 To: Kumar, Venkataramanan
 Cc: Benedikt Huber; pins...@gmail.com; gcc-patches@gcc.gnu.org
 Subject: Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt)
 estimation in -ffast-math
 
 Kumar,
 
 what is the relative gain that you see on Cortex-A57?
 
 Thanks,
 Philipp.
 
 On 25 Jun 2015, at 17:35, Kumar, Venkataramanan
 venkataramanan.ku...@amd.com wrote:
 
 Changing to  1 step for float and 2 steps for double gives better gains
 now for gromacs on cortex-a57.
 
 Regards,
 Venkat.
 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Benedikt Huber
 Sent: Thursday, June 25, 2015 4:09 PM
 To: pins...@gmail.com
 Cc: gcc-patches@gcc.gnu.org; philipp.toms...@theobroma-systems.com
 Subject: Re: [PATCH] [aarch64] Implemented reciprocal square root
 (rsqrt) estimation in -ffast-math
 
 Andrew,
 
 This is NOT a win on thunderX, at least for single precision, because
 the divide and sqrt together take the same time as 5 multiplies
 (estimate and step are multiplies in the thunderX pipeline).  Doubles
 take 10 multiplies, which is just the same as what the patch does (but
 it is really slightly less than 10; I rounded up). So in the end this
 is NOT a win at all for thunderX unless we do one less step for both single
 and double.
 
 Yes, the expected benefit from rsqrt estimation is implementation
 specific. If one has a better initial rsqrte or an application that
 can trade precision for execution time, we could offer a command line
 option to do only 2 steps for doulbe and 1 step for float; similar to -
 mrecip-precision for PowerPC.
 What are your thoughts on that?
 
 Best regards,
 Benedikt
 


Re: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt) estimation in -ffast-math

2015-06-25 Thread pinskia




 On Jun 18, 2015, at 5:04 AM, Benedikt Huber 
 benedikt.hu...@theobroma-systems.com wrote:
 
 AArch64 offers the instructions frsqrte and frsqrts, for rsqrt estimation and
 a Newton-Raphson step, respectively.
 There are ARMv8 implementations where this is faster than using fdiv and
 fsqrt.
 It runs three steps for double and two steps for float to achieve the needed 
 precision.

This is NOT a win on thunderX, at least for single precision, because the divide 
and sqrt together take the same time as 5 multiplies (estimate and step are 
multiplies in the thunderX pipeline).  Doubles take 10 multiplies, which is just 
the same as what the patch does (but it is really slightly less than 10; I 
rounded up). So in the end this is NOT a win at all for thunderX unless we do 
one less step for both single and double. 

Thanks,
Andrew
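
For readers following the step-count argument above, here is a minimal C sketch of
the refinement sequence being discussed (the helper name, the constants and the
explicit loop are illustrative only; the real patch emits frsqrte/frsqrts
instructions rather than these multiplies):

/* Sketch of reciprocal-square-root estimation with Newton-Raphson
   refinement.  Each step is one correction (the frsqrts-style
   "(3 - a*x*x)/2" term) followed by a multiply, so an N-step sequence
   costs roughly 2*N multiplies on top of the initial estimate -- which
   is where the multiply counts above come from.  */
static double
rsqrt_refine (double a, double x0, int steps)
{
  double x = x0;                       /* initial frsqrte-style estimate */
  for (int i = 0; i < steps; i++)
    x = x * (1.5 - 0.5 * a * x * x);   /* one Newton-Raphson step */
  return x;
}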


 
 There is one caveat and open question.
 Since -ffast-math enables flush to zero intermediate values between 
 approximation steps
 will be flushed to zero if they are denormal.
 E.g. This happens in the case of rsqrt (DBL_MAX) and rsqrtf (FLT_MAX).
 The test cases pass, but it is unclear to me whether this is expected 
 behavior with -ffast-math.
 
 The patch applies to commit:
 svn+ssh://gcc.gnu.org/svn/gcc/trunk@224470
 
 Please consider including this patch.
 Thank you and best regards,
 Benedikt Huber
 
 Benedikt Huber (1):
  2015-06-15  Benedikt Huber  benedikt.hu...@theobroma-systems.com
 
 gcc/ChangeLog|   9 +++
 gcc/config/aarch64/aarch64-builtins.c|  60 
 gcc/config/aarch64/aarch64-protos.h  |   2 +
 gcc/config/aarch64/aarch64-simd.md   |  27 
 gcc/config/aarch64/aarch64.c |  63 +
 gcc/config/aarch64/aarch64.md|   3 +
 gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113 +++
 7 files changed, 277 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
 
 -- 
 1.9.1
 


Re: New type-based pool allocator code miscompiled due to aliasing issue?

2015-06-11 Thread pinskia




 On Jun 12, 2015, at 12:51 AM, Ulrich Weigand uweig...@de.ibm.com wrote:
 
 Hello,
 
 since the new type-based pool allocator was merged, the SPU toolchain
 automated build / regression test has been failing due to crashes of
 the compiled GCC due to random memory corruption.
 
 Debugging this, it seems this is caused by GCC being miscompiled by
 the host compiler.  As one example, var-tracking.c contains this
 code in vt_initialize:
 
  empty_shared_hash = new shared_hash_def;
   empty_shared_hash->refcount = 1;
 
 Via operator new, these bits of alloc-pool.h are inlined:
 
 template <typename T>
 inline T *
 pool_allocator<T>::allocate ()
 {
 [...]
  /* Pull the first free element from the free list, and return it.  */
  header = m_returned_free_list;
  VALGRIND_DISCARD (VALGRIND_MAKE_MEM_DEFINED (header, sizeof (*header)));
  m_returned_free_list = header->next;
  m_elts_free--;
 [...]
  /* Call default constructor.  */
  return (T *)(header);
 }
 
 Note that the read from header->next accesses the same memory location
 as the write to empty_shared_hash->refcount above.
 
 Type-based aliasing now assumes that those accesses cannot alias, since
 they use incompatible types.  The scheduler then moves the write to
 before the read, which causes m_returned_free_list to be set to 1,
 causing a crash later on.
 
 Now -- I'm only seeing this effect on my SPU toolchain regression test
 machine, which uses a RHEL 5 distribution with its default GCC 4.1.2
 system compiler.  (Using -fno-strict-aliasing fixes the symptom.)
 
 Maybe this old compiler simply has a bug in type-based aliasing.  But
 on the other hand, I'm not completely sure whether this *is* a bug --
 is the code above supposed to be valid C++, or does it in fact violate
 the aliasing rules after all?

This is just a bug in the older compiler. There was a change to fix this in the 
handling of the placement new operator. I can't find the reference right now, but 
this is the same issue as that. 

Thanks,
Andrew
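
A stripped-down sketch of the access pattern in question may help; this is not
the GCC source, just an illustration (in C, with made-up type names) of why
type-based aliasing lets the two accesses be reordered:

struct free_list { struct free_list *next; };
struct node { int refcount; };

static struct free_list *returned_free_list;

/* Pull an element off a free list typed as free_list and hand it back
   as a node.  The caller then writes n->refcount into the same bytes
   the line below read as header->next; under strict aliasing a compiler
   may assume the two accesses do not overlap and reorder them, which is
   the miscompile described above.  */
struct node *
allocate_node (void)
{
  struct free_list *header = returned_free_list;
  returned_free_list = header->next;
  return (struct node *) header;
}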


 
 Bye,
 Ulrich
 
 -- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  ulrich.weig...@de.ibm.com
 


Re: [Patch AArch64] Add cpu_defines.h for AArch64.

2015-05-19 Thread pinskia




 On May 19, 2015, at 5:54 AM, Ramana Radhakrishnan 
 ramana.radhakrish...@foss.arm.com wrote:
 
 Hi,
 
 Like the ARM port, the AArch64 port needs to set glibc_integral_traps to
 false, as integer divide instructions do not trap.
 
 Bootstrapped and regression tested on aarch64-none-linux-gnu
 
 Ok to apply ?

Not really questioning your patch, but rather libstdc++'s defaults. I wonder if 
this should be the default, as most targets don't trap and only a few do. And 
wouldn't it be the safer default to say they don't trap, too?

Thanks,
Andrew


 
 regards
 Ramana
 
 
 2015-05-17  Ramana Radhakrishnan  ramana.radhakrish...@arm.com
 
* configure.host: Define cpu_defines_dir for AArch64
* config/cpu/aarch64/cpu_defines.h: New file.
 0002-Do-the-same-for-AArch64.patch


Re: [PATCH] fixup libobjc usage of PCC_BITFIELD_TYPE_MATTERS

2015-05-01 Thread pinskia




 On May 1, 2015, at 8:30 PM, tbsaunde+...@tbsaunde.org wrote:
 
 From: Trevor Saunders tbsaunde+...@tbsaunde.org
 
 Hi,
 
 This adds a configure check to libobjc to find out if the types of bitfields
 affect their layout, and uses it to replace the rather broken usage of
 PCC_BITFIELD_TYPE_MATTERS.
 
 bootstrapped + regtested x86_64-linux-gnu, bootstrapped on ppc64le-linux-gnu
 and ran check-objc there without failures, and checked the correct part of the
 ifdef is used on a cross to m68k-linux-elf.  ok?  I'm sure I've gotten
 something wrong since this is a bunch of auto tools ;-)


This is ok.  I have been meaning to try to get rid of all uses of the target 
headers but never got around to finishing it. 

Thanks,
Andrew

 
 Trev
 
 libobjc/ChangeLog:
 
 2015-05-01  Trevor Saunders  tbsaunde+...@tbsaunde.org
 
* acinclude.m4: Include bitfields.m4.
* config.h.in: Regenerate.
* configure: Likewise.
* configure.ac: Invoke gt_BITFIELD_TYPE_MATTERS.
* encoding.c: Check HAVE_BITFIELD_TYPE_MATTERS.
 
 config/ChangeLog:
 
 2015-05-01  Trevor Saunders  tbsaunde+...@tbsaunde.org
 
* bitfields.m4: New file.
 ---
 config/bitfields.m4  | 26 ++
 libobjc/acinclude.m4 |  1 +
 libobjc/config.h.in  |  3 +++
 libobjc/configure| 38 ++
 libobjc/configure.ac |  2 ++
 libobjc/encoding.c   |  3 ++-
 6 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 config/bitfields.m4
 
 diff --git a/config/bitfields.m4 b/config/bitfields.m4
 new file mode 100644
 index 000..ee8f3b5
 --- /dev/null
 +++ b/config/bitfields.m4
 @@ -0,0 +1,26 @@
 +dnl Copyright (C) 2015 Free Software Foundation, Inc.
 +dnl This file is free software, distributed under the terms of the GNU
 +dnl General Public License.  As a special exception to the GNU General
 +dnl Public License, this file may be distributed as part of a program
 +dnl that contains a configuration script generated by Autoconf, under
 +dnl the same distribution terms as the rest of that program.
 +
 +# Define HAVE_BITFIELD_TYPE_MATTERS if the type of bitfields affects their
 +# alignment.
 +
 +AC_DEFUN([gt_BITFIELD_TYPE_MATTERS],
 +[
 +  AC_CACHE_CHECK([if the type of bitfields matters], 
 gt_cv_bitfield_type_matters,
 +  [
 +AC_TRY_COMPILE(
 +  [struct foo1 { char x; char :0; char y; };
 +struct foo2 { char x; int :0; char y; };
 +int foo1test[ sizeof (struct foo1) == 2 ? 1 : -1 ];
 +int foo2test[ sizeof (struct foo2) == 5 ? 1 : -1]; ],
 +  [], gt_cv_bitfield_type_matters=yes, gt_cv_bitfield_type_matters=no)
 +  ])
 +  if test $gt_cv_bitfield_type_matters = yes; then
 +AC_DEFINE(HAVE_BITFIELD_TYPE_MATTERS, 1,
 +  [Define if the type of bitfields affects alignment.])
 +  fi
 +])
 diff --git a/libobjc/acinclude.m4 b/libobjc/acinclude.m4
 index bf78dbe..4193018 100644
 --- a/libobjc/acinclude.m4
 +++ b/libobjc/acinclude.m4
 @@ -12,6 +12,7 @@ m4_include(../config/acx.m4)
 m4_include(../config/no-executables.m4)
 m4_include(../config/enable.m4)
 m4_include(../config/tls.m4)
 +m4_include(../config/bitfields.m4)
 
 m4_include(../libtool.m4)
 dnl The lines below arrange for aclocal not to bring an installed
 diff --git a/libobjc/config.h.in b/libobjc/config.h.in
 index c055e7c..20d1fca 100644
 --- a/libobjc/config.h.in
 +++ b/libobjc/config.h.in
 @@ -1,5 +1,8 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */
 
 +/* Define if the type of bitfields affects alignment. */
 +#undef HAVE_BITFIELD_TYPE_MATTERS
 +
 /* Define to 1 if the target assembler supports thread-local storage. */
 #undef HAVE_CC_TLS
 
 diff --git a/libobjc/configure b/libobjc/configure
 index 642eb9c..0547f91 100755
 --- a/libobjc/configure
 +++ b/libobjc/configure
 @@ -11530,6 +11530,44 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 { $as_echo $as_me:${as_lineno-$LINENO}: result: $ac_exception_model_name 5
 $as_echo $ac_exception_model_name 6; }
 
 +
 +  { $as_echo $as_me:${as_lineno-$LINENO}: checking if the type of bitfields 
 matters 5
 +$as_echo_n checking if the type of bitfields matters...  6; }
 +if test ${gt_cv_bitfield_type_matters+set} = set; then :
 +  $as_echo_n (cached)  6
 +else
 +
 +cat confdefs.h - _ACEOF conftest.$ac_ext
 +/* end confdefs.h.  */
 +struct foo1 { char x; char :0; char y; };
 +struct foo2 { char x; int :0; char y; };
 +int foo1test[ sizeof (struct foo1) == 2 ? 1 : -1 ];
 +int foo2test[ sizeof (struct foo2) == 5 ? 1 : -1];
 +int
 +main ()
 +{
 +
 +  ;
 +  return 0;
 +}
 +_ACEOF
 +if ac_fn_c_try_compile $LINENO; then :
 +  gt_cv_bitfield_type_matters=yes
 +else
 +  gt_cv_bitfield_type_matters=no
 +fi
 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 +
 +fi
 +{ $as_echo $as_me:${as_lineno-$LINENO}: result: 
 $gt_cv_bitfield_type_matters 5
 +$as_echo $gt_cv_bitfield_type_matters 6; }
 +  if test $gt_cv_bitfield_type_matters = yes; then
 +
 +$as_echo #define HAVE_BITFIELD_TYPE_MATTERS 1 confdefs.h
 +
 +  fi
 +
 +
 # --
 # Output
 # --
 diff 

Re: [rs6000] Fix compare debug failure on AIX

2015-04-29 Thread pinskia




 On Apr 29, 2015, at 3:22 AM, Eric Botcazou ebotca...@adacore.com wrote:
 
 You can easily get -fcompare-debug failures on AIX with small functions, in 
 fact you get the failure for the empty function:
 
 void foo (void) {}
 
 eric@polaris:~/build/gcc/powerpc-ibm-aix7.1> gcc/xgcc -Bgcc -S t.c -O -fcompare-debug
 xgcc: error: t.c: -fcompare-debug failure (length)
 
 Fixed thusly, tested on powerpc-ibm-aix7.1, OK for the mainline?
 
 
 2015-04-29  Eric Botcazou  ebotca...@adacore.com
 
* config/rs6000/rs6000.c (rs6000_stack_info): For XCOFF, replace test
on debug info with test on optimization to decide stack pushing.

We might want to check whether we are doing -Og as well, and not just -O0. 

Thanks,
Andrew

 
 
 2015-04-29  Eric Botcazou  ebotca...@adacore.com
 
* gcc.dg/empty3.c: New test.
 
 
 -- 
 Eric Botcazou
 p.diff
 empty3.c


Re: [PATCH 3/13] aarch64 musl support

2015-04-21 Thread pinskia




 On Apr 20, 2015, at 11:52 AM, Szabolcs Nagy szabolcs.n...@arm.com wrote:
 
 Set up dynamic linker name for aarch64.
 
 gcc/Changelog:
 
 2015-04-16  Gregor Richards  gregor.richa...@uwaterloo.ca
Szabolcs Nagy  szabolcs.n...@arm.com
 
* config/aarch64/aarch64-linux.h (MUSL_DYNAMIC_LINKER): Define.


I don't think you need to check whether we default to little- or big-endian here, 
as the specs always have one or the other passing through. 

Also, if musl does not support ILP32, you might want to error out, or even 
define the dynamic linker name now, before support goes into musl. 

Thanks,
Andrew
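
A rough sketch of what is being suggested follows; the loader path, the _be
suffix and the spec syntax here are assumptions for illustration, not the
posted patch:

/* Hypothetical aarch64-linux.h fragment: pick the endian-specific musl
   loader name in the spec itself instead of checking the configured
   default endianness.  An ILP32 multilib could similarly be rejected
   here until musl grows support for it.  */
#undef  MUSL_DYNAMIC_LINKER
#define MUSL_DYNAMIC_LINKER \
  "/lib/ld-musl-aarch64%{mbig-endian:_be}.so.1"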

 03-aarch64.patch


Re: [RFC AArch64] Implement TARGET_PROMOTE_FUNCTION_MODE for ILP32 code generation

2015-03-16 Thread pinskia




 On Mar 16, 2015, at 2:28 AM, Yangfei (Felix) felix.y...@huawei.com wrote:
 
 Hi,
 
  For this trivial testcase: 
 
 extern int bar (int , int);
 
 int foo (int *a, int *b)
 {
return bar (*a, *b);
 }
 
 I noticed that GCC generates redundant zero-extension instructions under ILP32
 (aarch64-linux-gnu-gcc -S -O2 -mabi=ilp32).
 Assembly code:
.arch armv8-a+fp+simd
.file   1.c
.text
.align  2
.p2align 3,,7
.global foo
.type   foo, %function
 foo:
	uxtw	x0, w0
	uxtw	x1, w1
ldr w0, [x0]
ldr w1, [x1]
b   bar
.size   foo, .-foo
.ident  GCC: (20140403) 5.0.0 20150312 (experimental)
 
 According to the ILP32 ABI, the two uxtw instructions here are not necessary.
 The following is a proposed patch to fix this issue, i.e. promoting pointer 
 function arguments to word_mode.
 But I don't know whether it's a good idea to do this for pointer return 
 values.
 Any comments?


Actually they are required. The ABI says the upper 32 bits are undefined for 
arguments smaller than 64 bits. I had this discussion a year or more ago about 
this case. 

A simple struct like
struct a { int * b; int c; }; 

will break the code if we don't have the zero extends.

Try 
void f(int *);
void g(struct a d)
{
  f(d.b);
}

And see that there are no zero extends inside g.  I saw this exact thing when 
working on getting gccgo working.

It also means the assembly functions in glibc are broken and need to be fixed.

Thanks,
Andrew
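
Putting the fragments above together, a compilable sketch of the failing case
(illustrative only; it mirrors the reasoning in this reply rather than a
committed testcase):

/* Under -mabi=ilp32 the 8-byte struct is passed in a single X register
   with c in the upper 32 bits.  g is allowed to pass d.b to f without
   clearing those upper bits (they are undefined for sub-64-bit
   arguments), so a callee that dereferences an incoming 32-bit pointer
   must zero-extend it first -- which is exactly what the uxtw
   instructions in foo above do.  */
struct a { int *b; int c; };

void f (int *);

void
g (struct a d)
{
  f (d.b);
}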
 
 
 Index: gcc/config/aarch64/aarch64.c
 ===
 --- gcc/config/aarch64/aarch64.c(revision 221393)
 +++ gcc/config/aarch64/aarch64.c(working copy)
 @@ -1587,7 +1587,7 @@ aarch64_function_value (const_tree type, const_tre
   machine_mode ag_mode;
 
   mode = TYPE_MODE (type);
 -  if (INTEGRAL_TYPE_P (type))
 +  if (INTEGRAL_TYPE_P (type) || POINTER_TYPE_P (type))
 mode = promote_function_mode (type, mode, unsignedp, func, 1);
 
   if (aarch64_return_in_msb (type))
 @@ -1650,6 +1650,24 @@ aarch64_function_value_regno_p (const unsigned int
   return false;
 }
 
 +/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */
 +
 +static machine_mode
 +aarch64_promote_function_mode (const_tree type, machine_mode mode,
 +   int *punsignedp, const_tree fntype,
 +   int for_return)
 +{
 +  /* Pointer function arguments and return values are promoted to word_mode. 
  */
 +  if (type != NULL_TREE && POINTER_TYPE_P (type))
 +{
 +  *punsignedp = POINTERS_EXTEND_UNSIGNED;
 +  return word_mode;
 +}
 +
 +  return default_promote_function_mode (type, mode, punsignedp, fntype,
 +for_return);
 +}
 +
 /* Implement TARGET_RETURN_IN_MEMORY.
 
If the type T of the result of a function is such that
 @@ -11329,6 +11347,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool l
 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
   aarch64_override_options_after_change
 
 +#undef TARGET_PROMOTE_FUNCTION_MODE
 +#define TARGET_PROMOTE_FUNCTION_MODE aarch64_promote_function_mode
 +
 #undef TARGET_PASS_BY_REFERENCE
 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
 aarch64-promote-v2.diff


Re: [PATCH/AARCH64] Fix 64893: ICE with vget_lane_u32 with C++ front-end at -O0

2015-03-06 Thread pinskia




 On Mar 6, 2015, at 1:45 AM, James Greenhalgh james.greenha...@arm.com wrote:
 
 On Thu, Feb 12, 2015 at 03:37:33PM +, Christophe Lyon wrote:
 On 8 February 2015 at 03:24, Andrew Pinski pins...@gmail.com wrote:
 On Fri, Feb 6, 2015 at 5:02 PM, Andrew Pinski pins...@gmail.com wrote:
PR target/64893
* config/aarch64/aarch64-builtins.c (aarch64_init_simd_builtins):
Change the first argument type to size_type_node and add another
size_type_node.
(aarch64_simd_expand_builtin): Handle the new argument to
AARCH64_SIMD_BUILTIN_LANE_CHECK and don't ICE but rather
print an out when the first two arguments are not
nonzero integer constants.
* config/aarch64/arm_neon.h (__AARCH64_LANE_CHECK):
Pass the sizeof directly to __builtin_aarch64_im_lane_boundsi.
 
 testsuite/ChangeLog:
 
* c-c++-common/torture/aarch64-vect-lane-1.c: New testcase.
* c-c++-common/torture/aarch64-vect-lane-2.c: New testcase.
 In case you haven't noticed, aarch64-vect-lane-2.c FAILs when compiled at
 -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects
 but PASSes with the other optimization levels.
 
 Hi Andrew,
 
 Did you get a chance to look at this? Given the way that your error
 checking works, the test probably just needs a dg-skip-if -flto
 directive. Did you intend to come back to this and make it fail earlier
 than link time with -flto?
 
 If I don't hear otherwise, I'll propose the patch adding dg-skip-if on
 Monday.

Yes that is the correct approach as I see it. 

Thanks,
Andrew


 
 Cheers,
 James
 


Re: [PATCH][RFA][LRA] Don't try to break down subreg expressions if insn already matches

2015-02-13 Thread pinskia




 On Feb 13, 2015, at 1:48 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 Hi all,
 
 In my tree I added a pattern to the arm backend that's supposed to match:
 (set (reg:SI r0)
 (subreg:SI
   (plus:DI
 (mult:DI (sign_extend:DI (reg:SI r1))
  (sign_extend:DI (reg:SI r2)))
    (const_int 2147483648 [0x80000000])) 4))
 
 That is, take two SImode regs, sign-extend to DImode, multiply in DImode,
 add a const_int and take the most significant SImode subreg.

It seems better to use a shift to get the most significant SImode part and a 
lowpart subreg after that. Isn't that what other targets do?

Thanks,
Andrew


 Combine matches it fine but I get an ICE in lra:
 0xa665a5 crash_signal
$SRC/gcc/toplev.c:383
 0x91ec1c lra_emit_add(rtx_def*, rtx_def*, rtx_def*)
$SRC/gcc/lra.c:418
 0x91ef8a lra_emit_move(rtx_def*, rtx_def*)
$SRC/gcc/lra.c:528
 0x9279b2 insert_move_for_subreg
$SRC/gcc/lra-constraints.c:1371
 0x931bdb simplify_operand_subreg
$SRC/gcc/lra-constraints.c:1507
 0x931bdb curr_insn_transform
$SRC/gcc/lra-constraints.c:3437
 0x934793 lra_constraints(bool)
$SRC/gcc/lra-constraints.c:4451
 0x9217d0 lra(_IO_FILE*)
$SRC/gcc/lra.c:2292
 0x8e0dba do_reload
$SRC/gcc/ira.c:5418
 0x8e0dba execute
$SRC/gcc/ira.c:5589
 
 
 I *think* the problem is that simplify_operand_subreg tries to break down
 this complex RTX involving the subreg into separate MULT/PLUS operations
 and that breaks things down the line.
 
 If I apply the attached patch, which stops trying to simplify the subreg
 expression if the whole thing already matches something, my code compiles
 successfully, and passes arm testing and x86_64 bootstrap.
 However, I'm concerned that calling recog_memoized is too heavy-handed,
 is there a better approach?
 Or is this even the right way to go about this?
 
 I'm also attaching the patch to the arm backend that can be used to
 reproduce this ICE with the following C-code at -O2:
 
 int
 smmulr (int a, int b)
 {
   return ((long long)a * b + 0x80000000) >> 32;
 }
 
 
 Thanks,
 Kyrill
 
 2015-02-10  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* lra-constraints.c (simplify_operand_subreg): Do not try to simplify
subreg if the pattern already matches.
 lra-subreg.patch
 arm-smmulr.patch


Re: [PATCH/AARCH64] Fix 64893: ICE with vget_lane_u32 with C++ front-end at -O0

2015-02-03 Thread pinskia




 On Feb 3, 2015, at 3:57 AM, Alan Lawrence alan.lawre...@arm.com wrote:
 
 
 Andrew Pinski wrote:
 While trying to build GCC 5 with GCC 5, I ran into an ICE when
 building libcpp at -O0.  The problem is the C++ front-end was not
 folding sizeof(a)/sizeof(a[0]) when passed to a function at -O0. The
 C++ front-end keeps around sizeof until the gimplifier and there is no
 way to fold the expressions that involve them.  So to work around the
 issue we need to change __builtin_aarch64_im_lane_boundsi to accept an
 extra argument and change the first two arguments to size_t type so we
 don't get an extra cast there and do the division inside the compiler
 itself.
 Also we don't want to cause an ICE on any source code so I changed the
 assert to be a sorry if either of the two arguments are not integer
 constants.
 
 TBH I think it _is_ appropriate to ICE rather than sorry, or even error, if 
 the size of the vector or vector elements are non-constant or zero. All the 
 calls to this __builtin are in gcc-specific headers, and if those are wrong, 
 then the error is in(ternal to) the compiler. (Of course if the lane index is 
 non-constant that is a programmer error.) We don't wish to support the 
 programmer writing direct calls to the builtin him/herself!


Even if we don't support direct calls to it, it still should not ICE. This has 
been GCC policy for a long time now.  ICEing on any input is bad form and not 
helpful to users. We can say in the error message that calling it directly is not 
supported and they should not do it. 


Thanks,
Andrew
 
 --Alan
 
 
 OK?  Bootstrapped and tested on aarch64-linux-gnu with no regressions
 and I was able to bootstrap without a modified libcpp.
 Thanks,
 Andrew Pinski
 ChangeLog:
* config/aarch64/aarch64-builtins.c (aarch64_init_simd_builtins):
Change the first argument type to size_type_node and add another
size_type_node.
(aarch64_simd_expand_builtin): Handle the new argument to
AARCH64_SIMD_BUILTIN_LANE_CHECK and don't ICE but rather
print sorry out when the first two arguments are not
integer constants.
* config/aarch64/arm_neon.h (__AARCH64_LANE_CHECK):
Pass the sizeof's directly to __builtin_aarch64_im_lane_boundsi.
 testsuite/ChangeLog:
* c-c++-common/torture/aarch64-vect-lane-1.c: New testcase.
 
 


Re: [PATCH, Aarch64] Add FMA steering pass for Cortex-A57

2015-01-28 Thread pinskia




On Jan 28, 2015, at 2:01 AM, Thomas Preud'homme thomas.preudho...@arm.com 
wrote:

 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Thomas Preud'homme
 
 Hi Andrew,
 
 cortex-a57-fma-steering.c is really meant to be self-sufficient, with
 aarch64_register_fma_steering being the only interface, which is why I
 thought it was not worth the trouble. Now I'm not saying I'm against it,
 as it does make this interface more explicit.
 
 I'm currently about to take a plane but I'll change it as soon as I arrive
 at work.
 
 My apologies for the delay, got busy when I arrived.
 
 Please find below (and in attachment for real this time) the updated patch and
 ChangeLog entries.
 
 *** gcc/ChangeLog ***
 
 2015-01-26 Thomas Preud'homme thomas.preudho...@arm.com
 
* config.gcc: Add cortex-a57-fma-steering.o to extra_objs for
aarch64-*-*.
* config/aarch64/t-aarch64: Add a rule for cortex-a57-fma-steering.o.
* config/aarch64/aarch64.h (AARCH64_FL_USE_FMA_STEERING_PASS): Define.
(AARCH64_TUNE_FMA_STEERING): Likewise.
* config/aarch64/aarch64-cores.def: Set
AARCH64_FL_USE_FMA_STEERING_PASS for cores with dynamic steering of
FMUL/FMADD instructions.
* config/aarch64/aarch64.c (aarch64_register_fma_steering): Declare.
(aarch64_override_options): Include cortex-a57-fma-steering.h.  Call
aarch64_register_fma_steering () if AARCH64_TUNE_FMA_STEERING is true.
* config/aarch64/cortex-a57-fma-steering.h: New file.
* config/aarch64/cortex-a57-fma-steering.c: Likewise.
 
 
 diff --git a/gcc/config.gcc b/gcc/config.gcc
 index bf67beb..1e97231 100644
 --- a/gcc/config.gcc
 +++ b/gcc/config.gcc
 @@ -302,7 +302,7 @@ m32c*-*-*)
 aarch64*-*-*)
cpu_type=aarch64
extra_headers=arm_neon.h arm_acle.h
 -extra_objs=aarch64-builtins.o aarch-common.o
 +extra_objs=aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o
target_gtfiles=\$(srcdir)/config/aarch64/aarch64-builtins.c
target_has_targetm_common=yes
;;
 diff --git a/gcc/config/aarch64/aarch64-cores.def 
 b/gcc/config/aarch64/aarch64-cores.def
 index f978eb1..1e3b1e7 100644
 --- a/gcc/config/aarch64/aarch64-cores.def
 +++ b/gcc/config/aarch64/aarch64-cores.def
 @@ -35,10 +35,10 @@
 /* V8 Architecture Processors.  */
 
 AARCH64_CORE(cortex-a53,  cortexa53, cortexa53, 8,  AARCH64_FL_FOR_ARCH8 | 
 AARCH64_FL_CRC, cortexa53)
 -AARCH64_CORE(cortex-a57,  cortexa57, cortexa57, 8,  AARCH64_FL_FOR_ARCH8 | 
 AARCH64_FL_CRC, cortexa57)
 +AARCH64_CORE(cortex-a57,  cortexa57, cortexa57, 8,  AARCH64_FL_FOR_ARCH8 | 
 AARCH64_FL_CRC | AARCH64_FL_USE_FMA_STEERING_PASS, cortexa57)
 AARCH64_CORE(thunderx,thunderx,  thunderx, 8,  AARCH64_FL_FOR_ARCH8 | 
 AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx)
 AARCH64_CORE(xgene1,  xgene1,xgene1,8,  AARCH64_FL_FOR_ARCH8, 
 xgene1)
 
 /* V8 big.LITTLE implementations.  */
 
 -AARCH64_CORE(cortex-a57.cortex-a53,  cortexa57cortexa53, cortexa53, 8,  
 AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57)
 +AARCH64_CORE(cortex-a57.cortex-a53,  cortexa57cortexa53, cortexa53, 8,  
 AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_USE_FMA_STEERING_PASS, 
 cortexa57)
 diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
 index eed86f7..f749811 100644
 --- a/gcc/config/aarch64/aarch64.h
 +++ b/gcc/config/aarch64/aarch64.h
 @@ -200,6 +200,8 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_CRYPTO     (1 << 2)  /* Has crypto.  */
 #define AARCH64_FL_SLOWMUL    (1 << 3)  /* A slow multiply core.  */
 #define AARCH64_FL_CRC        (1 << 4)  /* Has CRC.  */
 +/* Has static dispatch of FMA.  */
 +#define AARCH64_FL_USE_FMA_STEERING_PASS (1 << 5)
 
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
 @@ -220,6 +222,8 @@ extern unsigned long aarch64_isa_flags;
 /* Macros to test tuning flags.  */
 extern unsigned long aarch64_tune_flags;
 #define AARCH64_TUNE_SLOWMUL   (aarch64_tune_flags & AARCH64_FL_SLOWMUL)
 +#define AARCH64_TUNE_FMA_STEERING \
 +  (aarch64_tune_flags & AARCH64_FL_USE_FMA_STEERING_PASS)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
 diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
 index fdb0116..9bcc4bf 100644
 --- a/gcc/config/aarch64/aarch64.c
 +++ b/gcc/config/aarch64/aarch64.c
 @@ -94,6 +94,7 @@
 #include "builtins.h"
 #include "rtl-iter.h"
 #include "tm-constrs.h"
 +#include "cortex-a57-fma-steering.h"
 
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 @@ -6875,6 +6876,9 @@ aarch64_override_options (void)
align_functions = aarch64_tune_params-function_align;
 }
 
 +  if (AARCH64_TUNE_FMA_STEERING)
 +aarch64_register_fma_steering ();
 +
   aarch64_override_options_after_change ();
 }
 
 diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.h 
 b/gcc/config/aarch64/cortex-a57-fma-steering.h
 new file mode 100644
 index 

Re: [patch, libobjc] export __objc_get_forward_imp, get_imp again

2015-01-21 Thread pinskia




 On Jan 21, 2015, at 1:02 AM, Matthias Klose d...@ubuntu.com wrote:
 
 __objc_get_forward_imp and get_imp have been exported from libobjc since GCC 4.1;
 for some reason these are not exported anymore in GCC 5 (both are declared inline).
 So either export these as before, or don't export them and bump the soname.  The
 latter seems to be unwanted, and at least gnustep-base is using the get_imp
 function.  So better to keep the references in GCC 5?
 
 Is this an intended change in GCC 5 to not to export inline methods anymore?

Just remove the inline instead. 

Thanks,
Andrew

 
  Matthias
 
 libobjc-extern-inline.diff


Re: [patch libstdc++] Optimize synchronization in std::future if futexes are available.

2015-01-16 Thread pinskia




 On Jan 16, 2015, at 9:57 PM, David Edelsohn dje@gmail.com wrote:
 
 This patch has broken bootstrap on AIX
 
 May I mention that this really should have been tested on systems
 other than x86 Linux.

It also broke all newlib targets. So you could have tested one of the targets 
listed on the sim-test web page. 

Thanks,
Andrew

 
 In file included from 
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/
 future:44:0,
 from
 /nasfarm/edelsohn/src/src/libstdc++-v3/src/c++11/compatibility-thread-c++0x.cc:30:
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:223:5:
 error: 'mutex' does not name a type
 mutex _M_mutex;
 ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:224:5:
 error: 'condition_variable' does not name a type
 condition_variable _M_condvar;
 ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:
 In member function 'unsigned int
 std::__atomic_futex_unsigned_Waiter_bit::_M_load(std::memory_order)':
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:232:19:
 error: 'mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
   ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:232:24:
 error: template argument 1 is invalid
   unique_lockmutex __lock(_M_mutex);
^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:232:33:
 error: '_M_mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
 
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:
 In member function 'unsigned int
 std::__atomic_futex_unsigned_Waiter_bit::_M_load_when_not_equal(unsigned
 int, std::memory_order)':
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:239:19:
 error: 'mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
   ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:239:24:
 error: template argument 1 is invalid
   unique_lockmutex __lock(_M_mutex);
^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:239:33:
 error: '_M_mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
 ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:241:2:
 error: '_M_condvar' was not declared in this scope
  _M_condvar.wait(__lock);
  ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:
 In member function 'void
 std::__atomic_futex_unsigned_Waiter_bit::_M_load_when_equal(unsigned
 int, std::memory_order)':
 
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:248:19:
 error: 'mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
   ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:248:24:
 error: template argument 1 is invalid
   unique_lockmutex __lock(_M_mutex);
 
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:248:33:
 error: '_M_mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
 ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:250:2:
 error: '_M_condvar' was not declared in this scope
  _M_condvar.wait(__lock);
  ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:
 In member function 'bool
 std::__atomic_futex_unsigned_Waiter_bit::_M_load_when_equal_for(unsigned
 int, std::memory_order, const std::chrono::duration_Rep, _Period)':
 
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:258:19:
 error: 'mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
   ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:258:24:
 error: template argument 1 is invalid
   unique_lockmutex __lock(_M_mutex);
^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:258:33:
 error: '_M_mutex' was not declared in this scope
   unique_lockmutex __lock(_M_mutex);
 ^
 /tmp/20150117/powerpc-ibm-aix7.1.0.0/libstdc++-v3/include/bits/atomic_futex.h:259:14:
 error: '_M_condvar' was not declared in this scope
   return _M_condvar.wait_for(__lock, __rtime,
  ^
 
 etc.
 
 - David


Re: [RFC, PATCH][LRA, MIPS] ICE: in decompose_normal_address, at rtlanal.c:5817

2015-01-09 Thread pinskia




 On Jan 9, 2015, at 4:20 AM, Matthew Fortune matthew.fort...@imgtec.com 
 wrote:
 
 Robert Suchanek robert.sucha...@imgtec.com writes:
 
 gcc/
* simplify-rtx.c (simplify_replace_fn_rtx): Simplify (lo_sum (high x)
(const (plus x offset))) to (const (plus x offset)).
 
 The fix appears valid to me. Just some comments on the test case.
 
 a/gcc/testsuite/gcc.target/mips/20150108.c
 b/gcc/testsuite/gcc.target/mips/20150108.c
 new file mode 100644
 index 000..f18dbe7
 --- /dev/null
 +++ b/gcc/testsuite/gcc.target/mips/20150108.c
 @@ -0,0 +1,25 @@
 +/* { dg-do compile } */
 +/* { dg-options "-mips32r2" } */
 
 Please remove this line as there is nothing ISA dependent in the test case.

And since there is no mips specific part to the testcase (except for nomips16), 
we should place it in the generic part of the testsuite. 

Thanks,
Andrew

 
 +
 +long long a[10];
 +long long b, c, d, k, m, n, o, p, q, r, s, t, u, v, w; int e, f, g, h,
 +i, j, l, x;
 +
 
 nit, no return type specified.
 
 +NOMIPS16 fn1() {
 
 Nit, newline for the brace.
 
 +  for (; x; x++)
 +if (x & 1)
 +  s = h | g;
 +else
 +  s = f | e;
 +  l = ~0;
 +  m = 1 | k;
 +  n = i;
 +  o = j;
 +  p = f | e;
 +  q = h | g;
 +  w = d | c | a[1];
 +  t = c;
 +  v = b | c;
 +  u = v;
 +  r = b | a[4];
 +}
 --
 1.7.9.5
 
 Thanks,
 Matthew


Re: Patch RFA: Support for building Go tools

2015-01-08 Thread pinskia




 On Jan 8, 2015, at 1:35 PM, Ian Lance Taylor i...@golang.org wrote:
 
 This patch adds support to the GCC tree for building tools that are
 used with Go.  There are two externally used tools (go, gofmt) and one
 tool used internally by go (cgo).  This patch is pure machinery, with
 no source code.  The tools are not built by default, only when go is
 enabled using --enable-languages.  For the moment the tools are also
 not built when building a cross-compiler, although I hope to change
 that when I figure out what is needed.

When you add support for cross, please also make sure that Canadian cross works 
too. 

Thanks,
Andrew

 
 The tools work just as they do when built for the gc compiler, only
 they use gccgo instead of gc.  For documentation on these tools and
 what they do see http://golang.org/cmd/go ,
 http://golang.org/cmd/gofmt , http://golang.org/cmd/cgo .
 
 This patch parallels the mechanism used in the existing gnattools
 directory for building Ada tools.
 
 Could the build machinery maintainers please take a look and let me
 know if this is OK?  Thanks.
 
 Lynn, please also take a look and see if this works for you.
 
 The first attachment is the real changes for review.  The second
 attachment is a compressed patch for the generated files, for
 convenience for anybody who wants to try this out.
 
 Ian
 foo.txt
 foo1.txt.bz2


Re: [PATCH/TopLevel] Fix compiling libgo with a combined sources

2015-01-04 Thread pinskia




 On Jan 4, 2015, at 1:35 AM, Bin.Cheng amker.ch...@gmail.com wrote:
 
 On Sun, Jan 4, 2015 at 6:55 AM, Andrew Pinski pins...@gmail.com wrote:
 On Mon, Nov 24, 2014 at 1:32 PM, Jeff Law l...@redhat.com wrote:
 On 11/22/14 21:20, Andrew Pinski wrote:
 
 Hi,
   The problem here is that OBJCOPY is not being set to the
 newly built objcopy when compiling libgo.  This patch adds
 OBJCOPY_FOR_TARGET to the toplevel configure/Makefile so that when
 libgo is compiled OBJCOPY is set to OBJCOPY_FOR_TARGET.
 
 I noticed this issue when building an aarch64 cross compile on an
 older system where objcopy did not understand aarch64.
 
 OK?  Bootstrapped and tested on x86_64 with no regressions.  Also
 tested with a combined build for a cross compiler to
 aarch64-linux-gnu.
 
 Thanks,
 Andrew Pinski
 
 
 * Makefile.def (flags_to_pass): Pass OBJCOPY_FOR_TARGET also.
 * Makefile.tpl (HOST_EXPORTS): Add OBJCOPY_FOR_TARGET.
 (BASE_TARGET_EXPORTS): Add OBJCOPY.
 (OBJCOPY_FOR_TARGET): New variable.
 (EXTRA_TARGET_FLAGS): Add OBJCOPY.
 * Makefile.in: Regenerate.
 * configure.ac: Check for already installed target objcopy.
 Also GCC_TARGET_TOOL on objcopy.
 * configure: Regenerate.
 
 OK
 
 
 Committed to GCC and gdb/binutils repos now.
 
 Thanks,
 Andrew
 
 Hi Andrew,
 
 +  elif test x$target = x$host; then
 +# We can use an host tool
 +OBJCOPY_FOR_TARGET='$(OBJDUMP)'
 Is it a typo for '$(OBJCOPY)' ?

Yes it is a pasto. I think the fix for this would be obvious. 

Thanks,
Andrew

 
 Thanks,
 bin


Re: [Committed/AARCH64] Fix gcc.target/aarch64/test_frame_*.c testcases after ccmp patches

2014-12-11 Thread pinskia




 On Dec 11, 2014, at 10:06 AM, Tejas Belagod tejas.bela...@arm.com wrote:
 
 On 22/11/14 23:41, Andrew Pinski wrote:
 Hi,
   After the conditional compare patches, some of the
 gcc.target/aarch64/test_frame_*.c testcases start to fail.  This was
 due to simple_return no longer being duplicated, which used to cause the
 epilogue to be duplicated.
 
 This changes the testcases to expect the non duplicated epilogue.
 
 Committed as obvious after a test of aarch64-elf.
 
 Thanks,
 Andrew Pinski
 
 ChangeLog:
 * gcc.target/aarch64/test_frame_1.c: Expect only two loads of x30 (in
 the epilogue).
 * gcc.target/aarch64/test_frame_6.c: Likewise.
 * gcc.target/aarch64/test_frame_2.c: Expect only one pair load of x30
 and x19 (in the epilogue).
 * gcc.target/aarch64/test_frame_4.c: Likewise.
 * gcc.target/aarch64/test_frame_7.c: Likewise.
 
 Hi Andrew,
 
 I'm still seeing the original number of ldr x30 and ldp x19, x30 insns for 
 these tests. What am I missing?

The ccmp patch had to be reverted, but this testsuite patch was forgotten when 
that happened.  Just revert the testcase patch. 


Thanks,
Andrew
 
 FAIL: gcc.target/aarch64/test_frame_1.c scan-assembler-times ldr\tx30, 
 \\[sp\\], [0-9]+ 2
 FAIL: gcc.target/aarch64/test_frame_2.c scan-assembler-times ldp\tx19, x30, 
 \\[sp\\], [0-9]+ 1
 FAIL: gcc.target/aarch64/test_frame_4.c scan-assembler-times ldp\tx19, x30, 
 \\[sp\\], [0-9]+ 1
 FAIL: gcc.target/aarch64/test_frame_6.c scan-assembler-times ldr\tx30, 
 \\[sp\\], [0-9]+ 2
 FAIL: gcc.target/aarch64/test_frame_7.c scan-assembler-times ldp\tx19, x30, 
 \\[sp\\], [0-9]+ 1
 
 Thanks,
 Tejas,
 
 
 
 
 
 
 


Re: [PATCH][AArch64] Remove crypto extension from default for cortex-a53, cortex-a57

2014-11-17 Thread pinskia




 On Nov 17, 2014, at 8:59 AM, Ramana Radhakrishnan ramana@googlemail.com 
 wrote:
 
 On Mon, Nov 17, 2014 at 2:48 PM, Kyrill Tkachov kyrylo.tkac...@arm.com 
 wrote:
 Hi all,
 
 Some configurations of Cortex-A53 and Cortex-A57 don't ship with crypto,
 so enabling it by default for -mcpu=cortex-a53 and cortex-a57 is
 inappropriate.
 
 Tested aarch64-none-elf. Reminder that at the moment all the crypto
 extension does is enable the use of the ACLE crypto intrinsics in arm_neon.h
 
 Ok for trunk?
 
 I can't ok this but ...
 
 Since we've changed behaviour from 4.9 I think it warrants an entry in
 changes.html for 5.0

ThunderX should also disable crypto by default. I will submit a patch for 
that soon.

Thanks,
Andrew

 
 Ramana
 
 
 Thanks,
 Kyrill
 
 2014-11-17  Kyrylo Tkachov  kyrylo.tkac...@arm.com
 
* config/aarch64/aarch64-cores.def (cortex-a53): Remove
AARCH64_FL_CRYPTO from feature flags.
(cortex-a57): Likewise.
(cortex-a57.cortex-a53): Likewise.


Re: [AArch64, Patch] Add range-check for Symbol + offset addressing.

2014-11-14 Thread pinskia




 On Nov 14, 2014, at 12:54 AM, Marcus Shawcroft marcus.shawcr...@gmail.com 
 wrote:
 
 On 14 November 2014 08:19, Andrew Pinski pins...@gmail.com wrote:
 On Fri, Nov 14, 2014 at 12:12 AM, Tejas Belagod tejas.bela...@arm.com 
 wrote:
 
 Hi,
 
 Following the discussion here
 https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02237.html, this has been
 tracked down to a range-checking bug with symbol + offset style addressing
 with adrp where we allowed any arbitrary offset and this would cause
 truncation of final addresses when relocations were being resolved by ld.
 When we retrieve a symbol + offset address, we have to make sure the offset
 does not cause overflow of the final address.  But we have no way of knowing
 the address of symbol at compile time
 so we can't accurately say if the distance between the PC and symbol +
 offset is outside the addressable range of +/-1M in the TINY code model.  So
 we rely on images not being greater than 1M and cap the offset at 1M and
 anything beyond 1M will have to be loaded using an alternate mechanism.
 Similarly for the SMALL code model the offset has been capped at 4G.
 
 The cap value for the offset in each code model is open to debate.
 
 All testing done with Alan's workaround
 patch(https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01509.html) reversed.
 
 bootstrapped aarch64-linux.
 
 OK for trunk?
 
 This looks like a better fix than I would have come up with.
 Since you are touching this area, you might want to look into this issue:
 I notice SYMBOL_REF_WEAK (x) is true for references (decls) which are
 comdats declared in the translation unit.  So we force them to
 memory when really we know they are declared and don't have a value of
 zero, so they will fit in the medium code model.  This happens with
 vtables and we lose some performance because of this.
 
 Andrew, do you mind if we take that as a separate patch, I'd like to
 take Tejas' patch sooner rather than later since it gates building a
 variety of stuff some folks care about.

Yes, that is OK.  This was more of a "since you were looking into this area" kind of 
thing, but it can wait until later. 

Thanks,
Andrew
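
For context, a small illustration of the addressing form whose offset is now
capped (the array name and the offset are made up; the commented assembly is
the usual small-code-model expansion):

/* Taking the address of "symbol + large offset" is expanded as roughly
     adrp  x0, arr + 0x100000
     add   x0, x0, :lo12:arr + 0x100000
   and the relocations overflow (and used to be silently truncated) if
   the sum falls outside the code model's reach, hence the new range
   check on the offset accepted by this addressing form.  */
extern char arr[];

char *
far_into_arr (void)
{
  return &arr[1 << 20];
}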

 Cheers
 /Marcus


Re: [PATCH][11/n] Merge from match-and-simplify, bit patterns from forwprop

2014-11-06 Thread pinskia




 On Nov 6, 2014, at 12:55 AM, Richard Biener rguent...@suse.de wrote:
 
 
 This merges patterns implementing the bitwise patterns from 
 tree-ssa-forwprop.c.  I've removed duplicate functionality from
 fold-const.c as I found them; some may still be lurking in the
 depths.
 
 This also fixes a bug in genmatch which made user-defined predicates
 that match anything, thus
 
 (match foo
 @0
 (if ...
 
 not work (that is: ignored).
 
 Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Only one small comment. 

 
 Richard.
 
 2014-11-06  Richard Biener  rguent...@suse.de
 
* match.pd: Implement bitwise binary and unary simplifications
from tree-ssa-forwprop.c.
* fold-const.c (fold_unary_loc): Remove them here.
(fold_binary_loc): Likewise.
* tree-ssa-forwprop.c (simplify_not_neg_expr): Remove.
(truth_valued_ssa_name): Likewise.
(lookup_logical_inverted_value): Likewise.
(simplify_bitwise_binary_1): Likewise.
(hoist_conversion_for_bitop_p): Likewise.
(simplify_bitwise_binary_boolean): Likewise.
(simplify_bitwise_binary): Likewise.
(pass_forwprop::execute): Remove calls to simplify_not_neg_expr
and simplify_bitwise_binary.
* genmatch.c (dt_node::append_true_op): Use safe_as_a for parent.
(decision_tree::insert): Also insert non-expressions.
 
* gcc.dg/tree-ssa/forwprop-28.c: Adjust scanning for the
desired transform.
 
 Index: trunk/gcc/fold-const.c
 ===
 --- trunk.orig/gcc/fold-const.c2014-11-05 13:31:01.131942296 +0100
 +++ trunk/gcc/fold-const.c2014-11-05 13:35:29.362930558 +0100
 @@ -8008,8 +8008,6 @@ fold_unary_loc (location_t loc, enum tre
 case BIT_NOT_EXPR:
   if (TREE_CODE (arg0) == INTEGER_CST)
 return fold_not_const (arg0, type);
 -  else if (TREE_CODE (arg0) == BIT_NOT_EXPR)
 -return fold_convert_loc (loc, type, TREE_OPERAND (arg0, 0));
   /* Convert ~ (-A) to A - 1.  */
   else if (INTEGRAL_TYPE_P (type)  TREE_CODE (arg0) == NEGATE_EXPR)
return fold_build2_loc (loc, MINUS_EXPR, type,
 @@ -11152,26 +11150,6 @@ fold_binary_loc (location_t loc,
arg1);
}
 
 -  /* (X & Y) | Y is (X, Y).  */
 -  if (TREE_CODE (arg0) == BIT_AND_EXPR
 -   operand_equal_p (TREE_OPERAND (arg0, 1), arg1, 0))
 -return omit_one_operand_loc (loc, type, arg1, TREE_OPERAND (arg0, 0));
 -  /* (X & Y) | X is (Y, X).  */
 -  if (TREE_CODE (arg0) == BIT_AND_EXPR
 -   operand_equal_p (TREE_OPERAND (arg0, 0), arg1, 0)
 -   reorder_operands_p (TREE_OPERAND (arg0, 1), arg1))
 -return omit_one_operand_loc (loc, type, arg1, TREE_OPERAND (arg0, 1));
 -  /* X | (X & Y) is (Y, X).  */
 -  if (TREE_CODE (arg1) == BIT_AND_EXPR
 -   operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0)
 -   reorder_operands_p (arg0, TREE_OPERAND (arg1, 1)))
 -return omit_one_operand_loc (loc, type, arg0, TREE_OPERAND (arg1, 1));
 -  /* X | (Y & X) is (Y, X).  */
 -  if (TREE_CODE (arg1) == BIT_AND_EXPR
 -   operand_equal_p (arg0, TREE_OPERAND (arg1, 1), 0)
 -   reorder_operands_p (arg0, TREE_OPERAND (arg1, 0)))
 -return omit_one_operand_loc (loc, type, arg0, TREE_OPERAND (arg1, 0));
 -
 -  /* (X & ~Y) | (~X & Y) is X ^ Y */
   if (TREE_CODE (arg0) == BIT_AND_EXPR
   TREE_CODE (arg1) == BIT_AND_EXPR)
 @@ -11391,42 +11369,6 @@ fold_binary_loc (location_t loc,
   operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0))
return omit_one_operand_loc (loc, type, integer_zero_node, arg0);
 
 -  /* Canonicalize (X | C1) & C2 as (X & C2) | (C1 & C2).  */
 -  if (TREE_CODE (arg0) == BIT_IOR_EXPR
 -   TREE_CODE (arg1) == INTEGER_CST
 -   TREE_CODE (TREE_OPERAND (arg0, 1)) == INTEGER_CST)
 -{
 -  tree tmp1 = fold_convert_loc (loc, type, arg1);
 -  tree tmp2 = fold_convert_loc (loc, type, TREE_OPERAND (arg0, 0));
 -  tree tmp3 = fold_convert_loc (loc, type, TREE_OPERAND (arg0, 1));
 -  tmp2 = fold_build2_loc (loc, BIT_AND_EXPR, type, tmp2, tmp1);
 -  tmp3 = fold_build2_loc (loc, BIT_AND_EXPR, type, tmp3, tmp1);
 -  return
 -fold_convert_loc (loc, type,
 -  fold_build2_loc (loc, BIT_IOR_EXPR,
 -   type, tmp2, tmp3));
 -}
 -
 -  /* (X | Y) & Y is (X, Y).  */
 -  if (TREE_CODE (arg0) == BIT_IOR_EXPR
 -   operand_equal_p (TREE_OPERAND (arg0, 1), arg1, 0))
 -return omit_one_operand_loc (loc, type, arg1, TREE_OPERAND (arg0, 0));
 -  /* (X | Y) & X is (Y, X).  */
 -  if (TREE_CODE (arg0) == BIT_IOR_EXPR
 -   operand_equal_p (TREE_OPERAND (arg0, 0), arg1, 0)
 -   reorder_operands_p (TREE_OPERAND (arg0, 1), arg1))
 -return omit_one_operand_loc (loc, type, arg1, TREE_OPERAND (arg0, 1));
 -  /* X & (X | Y) is (Y, X).  */
 -  if (TREE_CODE (arg1) == BIT_IOR_EXPR
 -   operand_equal_p (arg0, TREE_OPERAND (arg1, 0), 0)
 -   reorder_operands_p (arg0, TREE_OPERAND 

Re: [PATCH, ifcvt] Check size cost in noce_try_store_flag_mask

2014-10-31 Thread pinskia




 On Oct 31, 2014, at 4:07 AM, Matthew Fortune matthew.fort...@imgtec.com 
 wrote:
 
 Andrew Pinski pins...@gmail.com writes:
 On Thu, Oct 30, 2014 at 11:30 PM, Zhenqiang Chen zhenqiang.c...@arm.com
 wrote:
 Thank you all for the comments. Patch is updated.
 
 Bootstrap and no make check regression on X86-64.
 No make check regression with Cortex-M0 qemu.
 No performance changes for coremark, dhrystone, spec2000 and spec2006 on
 X86-64 and Cortex-A15.
 
 For CSiBE, ARM Cortex-M0 result is a little better. A little regression
 for
 MIPS (less than 0.01%).
 
 I think I have a fix for MIPS which I need to submit too.  The problem
 is IF_THEN_ELSE is not implemented for mips_rtx_costs.
 
 Something like the attached one (though It is not updated for the new
 cores including octeon3).
 
 This looks OK in principle so I have no objection to the original patch
 from Zhengiang. The MIPS patch can follow on.
 
 Andrew: Are you setting higher costs for octeon to try and avoid the
 conversion owing to high latency for MOV[NZ] etc in octeon*?

Yes. In fact I was doing it for the higher latency on Octeon 2 compared to Octeon 1/+. 
I saw a small improvement with that, using other instructions in one or two 
cases which could be scheduled with other code. 


 Should that
 be conditional on speed vs size?

Yes, though I thought we had a variable for size too. 

Thanks,
Andrew
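
For reference, a tiny example of the shape of code this cost check governs
(illustrative only; the function name is made up):

/* noce_try_store_flag_mask turns the branchy form below into a
   branchless "store flag then mask" sequence (roughly x &= -(a >= b))
   when the target's rtx costs say that is cheaper than the branch;
   an IF_THEN_ELSE cost entry is what lets a port like MIPS price the
   conditional-move latency of a given core into that decision.  */
int
clear_if_less (int x, int a, int b)
{
  if (a < b)
    x = 0;
  return x;
}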

 
 Thanks,
 Matthew


Re: [PATCH] cleanup in c-parser

2014-10-12 Thread pinskia




 On Oct 12, 2014, at 12:37 PM, Anthony Brandon anthony.bran...@gmail.com 
 wrote:
 
 Hi,
 
 I'm a new contributor and I don't yet have a copyright assignment or
 commit access.


Thanks for your contribution.  Your new function is missing a comment before it 
saying what it does. Yes, it might be obvious what the function does, but the 
coding style requires it. 

Thanks,
Andrew

 
 This is a cleanup of code duplication in c-parser.
 I bootstrapped and tested on x86_64-linux.
 
 
 gcc/c/ChangeLog:
 
 2014-10-12  Anthony Brandon  anthony.bran...@gmail.com
 
* c-parser.c (c_parser_all_labels): New function to replace
 the duplicate code.
(c_parser_statement): Call the new function.
 cleanup.diff


Re: [Ping] [PATCH, 1/10] two hooks for conditional compare (ccmp)

2014-09-23 Thread pinskia




 On Sep 22, 2014, at 11:43 PM, Zhenqiang Chen zhenqiang.c...@arm.com wrote:
 
 Ping?
 
 Patch is attached for easy to apply.


Note I have been using an earlier version of this patch set in house and have 
not found any issues with it. 

Thanks,
Andrew
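
For anyone reviewing the hooks, a small example of the kind of source they
target (illustrative only):

/* With the ccmp hooks implemented, the two compares feeding the &&
   below can be expanded as a compare followed by a conditional compare
   (on AArch64: cmp + ccmp) producing one flags result for a single
   branch, instead of two separate compare-and-branch sequences.  */
int
both_in_range (int a, int b)
{
  return a > 0 && b < 10;
}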

 
 Thanks!
 -Zhenqiang
 
 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Zhenqiang Chen
 Sent: Monday, June 23, 2014 2:57 PM
 To: gcc-patches@gcc.gnu.org
 Subject: [PATCH, 1/10] two hooks for conditional compare (ccmp)
 
 Hi,
 
 The patch adds two hooks for backends to generate conditional compare
 instructions.
 
 * gen_ccmp_first is for the first compare.
 * gen_ccmp_next is for the following compares.
 
 The patch is separated from
 https://gcc.gnu.org/ml/gcc-patches/2014-02/msg01407.html.
 
 And the original discussion about the hooks was in thread:
 
 https://gcc.gnu.org/ml/gcc-patches/2013-10/msg02601.html
 
 OK for trunk?
 
 Thanks!
 -Zhenqiang
 
 ChangeLog:
 2014-06-23  Zhenqiang Chen  zhenqiang.c...@linaro.org
 
* doc/md.texi (ccmp): Add description about conditional compare
instruction pattern.
(TARGET_GEN_CCMP_FIRST): Define.
(TARGET_GEN_CCMP_NEXT): Define.
* doc/tm.texi.in (TARGET_GEN_CCMP_FIRST,
 TARGET_GEN_CCMP_NEXT): New.
* target.def (gen_ccmp_first, gen_ccmp_first): Add two new hooks.
 
 diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index e17ffca..988c288
 100644
 --- a/gcc/doc/md.texi
 +++ b/gcc/doc/md.texi
 @@ -6216,6 +6216,42 @@ A typical @code{ctrap} pattern looks like
   @dots{})
 @end smallexample
 
 +@cindex @code{ccmp} instruction pattern @item @samp{ccmp}
 Conditional
 +compare instruction.  Operand 2 and 5 are RTLs which perform two
 +comparisons.  Operand 1 is AND or IOR, which operates on the result of
 +operand 2 and 5.
 +It uses recursive method to support more than two compares.  e.g.
 +
 +  CC0 = CMP (a, b);
 +  CC1 = CCMP (NE (CC0, 0), CMP (e, f));  ...
 +  CCn = CCMP (NE (CCn-1, 0), CMP (...));
 +
 +Two target hooks are used to generate conditional compares.
 +GEN_CCMP_FISRT is used to generate the first CMP.  And
 GEN_CCMP_NEXT is
 +used to generate the following CCMPs.  Operand 1 is AND or IOR.
 +Operand 3 is the result of GEN_CCMP_FISRT or a previous
 GEN_CCMP_NEXT.  Operand 2 is NE.
 +Operand 4, 5 and 6 is another compare expression.
 +
 +A typical CCMP pattern looks like
 +
 +@smallexample
 +(define_insn *ccmp_and_ior
 +  [(set (match_operand 0 dominant_cc_register )
 +(compare
 + (match_operator 1
 +  (match_operator 2 comparison_operator
 +   [(match_operand 3 dominant_cc_register)
 +(const_int 0)])
 +  (match_operator 4 comparison_operator
 +   [(match_operand 5 register_operand)
 +(match_operand 6 compare_operand]))
 + (const_int 0)))]
 +  
 +  @dots{})
 +@end smallexample
 +
 @cindex @code{prefetch} instruction pattern  @item @samp{prefetch} diff
 --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index c272630..93f7c74 100644
 --- a/gcc/doc/tm.texi
 +++ b/gcc/doc/tm.texi
 @@ -11021,6 +11021,23 @@ This target hook is required only when the
 target has several different  modes and they have different conditional
 execution capability, such as ARM.
 @end deftypefn
 
 +@deftypefn {Target Hook} rtx TARGET_GEN_CCMP_FIRST (int @var{code},
 rtx @var{op0}, rtx @var{op1})
 +This function emits a comparison insn for the first of a sequence of
 +conditional comparisions.  It returns a comparison expression
 +appropriate  for passing to @code{gen_ccmp_next} or to
 @code{cbranch_optab}.
 + @code{unsignedp} is used when converting @code{op0} and
 @code{op1}'s mode.
 +@end deftypefn
 +
 +@deftypefn {Target Hook} rtx TARGET_GEN_CCMP_NEXT (rtx @var{prev},
 int @var{cmp_code}, rtx @var{op0}, rtx @var{op1}, int @var{bit_code})
 +This function emits a conditional comparison within a sequence of
 +conditional comparisons.  The @code{prev} expression is the result of a
 +prior call to @code{gen_ccmp_first} or @code{gen_ccmp_next}.  It may
 +return  @code{NULL} if the combination of @code{prev} and this
 +comparison is  not supported, otherwise the result must be appropriate
 +for passing to  @code{gen_ccmp_next} or @code{cbranch_optab}.
 +@code{bit_code}  is AND or IOR, which is the op on the two compares.
 +@end deftypefn
 +
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST
 (unsigned @var{nunroll}, struct loop *@var{loop})  This target hook
 returns a
 new value for the number of times @var{loop}  should be unrolled. The
 parameter @var{nunroll} is the number of times diff --git
 a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index dd72b98..e49f8f5 100644
 --- a/gcc/doc/tm.texi.in
 +++ b/gcc/doc/tm.texi.in
 @@ -8107,6 +8107,10 @@ build_type_attribute_variant (@var{mdecl},
 
 @hook TARGET_HAVE_CONDITIONAL_EXECUTION
 
 +@hook TARGET_GEN_CCMP_FIRST
 +
 +@hook TARGET_GEN_CCMP_NEXT
 +
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac 

Re: [PATCH AArch64]: Add constraint letter for stack_protect_test pattern)

2014-09-17 Thread pinskia


 On Sep 17, 2014, at 7:43 AM, James Greenhalgh james.greenha...@arm.com 
 wrote:
 
 
 On Wed, Sep 17, 2014 at 09:30:31AM +0100, Richard Earnshaw wrote:
 =r is correct for an early-clobbered scratch.
 
 R.
 
 In that case...
 
 How is the attached patch for trunk? I've bootstrapped it on AArch64
 with -fstack-protector-strong and -frename-registers in the BOOT_CFLAGS
 without seeing any issues.
 
 OK?
 
 Thanks,
 James
 
 ---
 gcc/
 
 2014-09-15  James Greenhalgh  james.greenha...@arm.com
 
   * config/aarch64/aarch64.md (stack_protect_test_mode): Mark
   scratch register as an output to placate register renaming.
 
 gcc/testsuite/
 
 2014-09-15  James Greenhalgh  james.greenha...@arm.com
 
   * gcc.target/aarch64/stack_protector_set_1.c: New.
   * gcc.target/aarch64/stack_protector_set_2.c: Likewise.

There is nothing AArch64-specific about these testcases, so I would place them 
under gcc.dg and add the extra marker which says a testcase requires the stack 
protector.  And maybe even use compile instead of just assemble too. 
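
Something along these lines is what I mean -- a sketch of the directives only,
assuming the effective-target keyword is fstack_protector; the options line is
illustrative and the test body stays as in the patch:

  /* { dg-do compile } */
  /* { dg-require-effective-target fstack_protector } */
  /* { dg-options "-O2 -fstack-protector-all" } */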

Thanks,
Andrew

 0001-Re-PATCH-AArch64-Add-constraint-letter-for-stack_pro.patch


Re: [PATCHv2] Vimrc config with GNU formatting

2014-09-11 Thread pinskia


 On Sep 10, 2014, at 9:47 PM, Yury Gribov tetra2...@gmail.com wrote:
 
 Segher Boessenkool segher at kernel.crashing.org writes:
 I am saying it is very anti-social to make
 people's editor behave differently from what they are used to.
 ...
 The Emacs dir-locals file simply
 configures some settings for files with certain major modes in that
 dir.
 For example, ours says that c-mode files should use GNU style.  This
 is
 quite harmless, and probably what most Emacs users want.
 
 Hm, so autoformatting in Emacs is good because that's what most users 
 want but autoformatting in Vim is bad because that's not what people are 
 used to?

I don't like auto formatting in any editor. Though I don't use emacs, I use 
vim. I think using auto formatting is cheating and shows a lack of 
understanding of why coding styles exist.  And some folks already have to deal 
with two formatting styles: Linux kernel and GNU. So if you add auto formatting 
to one, some folks are going to get confused. 


Thanks,
Andrew

 
 First, you are encouraging
 the use of a plugin that is a gaping wide security hole.
 
 I don't think so. The comment mentions that user can either install a 
 (rather widespread btw) plugin or just call config from his .vimrc.
 
 Secondly, this is a very poor imitation of the mechanism Vim has for
 dealing
 with filetypes, namely, ftplugins.
 
 I'm ready to accept technical suggestions on how to do the thing 
 properly. So what exactly do you propose?
 
 [Snipped some overly optimistic stuff about this all increasing the
 quality
 of posted patches.  Hint: the most frequently made formatting error is
 forgetting to put two spaces at the end of a sentence.
 
 Dunno, I was relying on personal experience. And searching for two|2 
 spaces on http://gcc.gnu.org/ml/gcc-patches returns 2000 results 
 whereas eight|8 spaces only 700.
 
 -Y
 


Re: [Patch, MIPS] Add .note.GNU-stack section

2014-09-10 Thread pinskia


 On Sep 10, 2014, at 9:24 AM, Steve Ellcey  sell...@mips.com wrote:
 
 Someone noticed that the MIPS GCC compiler was not putting out the
 .note.GNU-stack section.  This simple patch fixes that problem by
 calling the standard file_end_indicate_exec_stack function.
 
 Tested on mips-mti-linux-gnu, OK to checkin?

This works except you did not update the assembly files in libgcc or glibc. We 
(Cavium) have the same patch in our tree for a few released versions. 

Thanks,
Andrew

 
 Steve Ellcey
 sell...@mips.com
 
 
 
 2014-09-10  Steve Ellcey  sell...@mips.com
 
* config/mips/mips.c (TARGET_ASM_FILE_END): Define.
 
 
 diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
 index 646bb4d..bcaa9cd 100644
 --- a/gcc/config/mips/mips.c
 +++ b/gcc/config/mips/mips.c
 @@ -19146,6 +19146,9 @@ mips_lra_p (void)
 #undef TARGET_LRA_P
 #define TARGET_LRA_P mips_lra_p
 
 +#undef TARGET_ASM_FILE_END
 +#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
 +
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include gt-mips.h


Re: [Patch, AArch64] Enable Address sanitizer and UB sanitizer

2014-09-09 Thread pinskia


 On Sep 9, 2014, at 2:50 AM, Marcus Shawcroft marcus.shawcr...@gmail.com 
 wrote:
 
 +static unsigned HOST_WIDE_INT
 +aarch64_asan_shadow_offset (void)
 +{
 +  return (HOST_WIDE_INT_1 << 36);
 +}
 +
 
 Looking around various other ports I see magic numbers including 29,
 41, 44 Help me understand why 36 is the right choice for aarch64?

Also, why 36?  What is the minimum virtual address space the aarch64 Linux 
kernel supports with 4k pages and a 3-level page table?  Also, does this need 
to be conditionalized on LP64?  Since I am about to post glibc patches, turning 
on the address sanitizer breaks that. 
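
For context, the offset feeds the standard asan address mapping; a minimal
sketch (not the libsanitizer source) of what choosing 1 << 36 means:

  /* Shadow byte for an application address, with the default scale of 3.  */
  static inline unsigned long
  mem_to_shadow (unsigned long addr)
  {
    return (addr >> 3) + (1UL << 36);
  }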

Thanks,
Andrew



 
 Cheers
 /Marcus
 
 
 On 5 September 2014 15:49, Christophe Lyon christophe.l...@linaro.org 
 wrote:
 Hi,
 
 The attached patch enables the address and undefined behavior sanitizers.
 
 I have tested it on AArch64 hardware, and asan.exp tests pass, but a
 few ubsan.exp tests fail as follows:
 FAIL: c-c++-common/ubsan/float-cast-overflow-1.c   -O3 -g  execution test
 FAIL: c-c++-common/ubsan/float-cast-overflow-1.c   -O2 -flto
 -flto-partition=none  execution test
 FAIL: c-c++-common/ubsan/float-cast-overflow-2.c   -O3 -g  execution test
 FAIL: c-c++-common/ubsan/float-cast-overflow-3.c   -O3 -g  execution test
 FAIL: c-c++-common/ubsan/float-cast-overflow-4.c   -O2  execution test
 FAIL: c-c++-common/ubsan/float-cast-overflow-4.c   -O3 -g  execution test
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O0  (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O0  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O1  (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O1  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2  (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O3 -fomit-frame-pointer
 (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O3 -fomit-frame-pointer
 (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O3 -g  (internal compiler 
 error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O3 -g  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -Os  (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -Os  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2 -flto
 -flto-partition=none  (internal compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2 -flto
 -flto-partition=none  (test for excess errors)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2 -flto  (internal
 compiler error)
 FAIL: c-c++-common/ubsan/overflow-int128.c   -O2 -flto  (test for excess 
 errors)
 FAIL: gcc.dg/ubsan/float-cast-overflow-bf.c   -O3 -g  execution test
 
 I think all these failures need to be addressed separately, and should
 not prevent from adding the functionality since most of them pass.
 
 Note that an update of libsanitizer is required, to include at least
 revision 209641 (which fixes internal_fork for AArch64).
 
 OK for trunk?
 
 Christophe.
 
 2014-09-05  Christophe Lyon christophe.l...@linaro.org
gcc/
* config/aarch64/aarch64-linux.h (ASAN_CC1_SPEC): Define.
(CC1_SPEC): Define.
* config/aarch64/aarch64.c (aarch64_asan_shadow_offset): New function.
(TARGET_ASAN_SHADOW_OFFSET): Define.
 
libsanitizer/
* configure.tgt: Add AArch64 pattern.


Re: [PATCH] Force rtl templates to be inlined

2014-09-02 Thread pinskia


 On Sep 2, 2014, at 1:36 AM, Steven Bosscher stevenb@gmail.com wrote:
 
 On Tue, Sep 2, 2014 at 9:22 AM, Andrew Pinski wrote:
 On Tue, Sep 2, 2014 at 12:20 AM, Andi Kleen wrote:
 
 there have been bugs in the past in the area of always_inline too.
 
 You're arguing for my patch. It would find those bugs.
 
 
 No, I am arguing against it, since we cannot change the older versions of GCC.
 
 Should such bugs turn up, we can account for them in ansidecl.h.
 
 I think Andi's patch should go in.

It does hurt debuggability with older compilers too. So if we need to figure out 
why stage1 is being miscompiled, it is harder to figure out how to work around it.  

I think stage1 should really be -O0, even with respect to inlining.  I think we 
should never force inlining inside gcc, even at -O0, as it is just a hack (we 
know it is, as we added the attribute in the first place). 

Thanks,
Andrew

 
 Ciao!
 Steven


Re: Fix if-conversion pass for dead type-unsafe code

2014-08-18 Thread pinskia


 On Aug 18, 2014, at 1:33 AM, Tom de Vries tom_devr...@mentor.com wrote:
 
 On 14-08-14 16:34, Richard Biener wrote:
 On Sat, Aug 9, 2014 at 7:14 AM, Tom de Vries tom_devr...@mentor.com wrote:
 On 08-08-14 17:17, Tom de Vries wrote:
 
 Maybe instead of a new mem_alias_equal_p simply compare MEM_ATTRs
 with mem_attrs_eq_p?
 
 
 I propose to fix it this way (as attached) on 4.8/4.9/trunk, and maybe do
 a more
 efficient handling on trunk as a follow-up patch.
 
 I'll put this through bootstrap/test again.
 
 
 Bootstrapped and reg-tested on trunk x86_64.
 
 Re-attached patch OK for trunk, 4.8, 4.9 ?
 
 Ok.
 
 Backported to 4.8 and 4.9 as attached.
 
 The emit-rtl.{c,h} part of my patch was superfluous on trunk given that you 
 already exported mem_attrs_eq_p, something I failed to notice when rebasing. 
 So the backport contains that part of your patch. I've tested the backport 
 patch on 4.9.
 
 (did you check the effect on code-generation?  that is, how many
 opportunities compiling GCC do we lose?)
 
 I haven't done that. I can look into that (after I fix the tail-merge part of 
 pr62004).

Just FYI, this did fix the issue on MIPS that I had seen. 

Thanks,
Andrew

 
 Thanks,
 - Tom
 
 0001-Fix-if-conversion-pass-for-dead-type-unsafe-code.patch


Re: [PATCH AArch64 1/3] Don't disparage add/sub in SIMD registers

2014-08-12 Thread pinskia


 On Aug 12, 2014, at 7:40 AM, Alan Lawrence alan.lawre...@arm.com wrote:
 
 (It is no more expensive.)

Yes on some processors it could be. 

Thanks,
Andrew


 
 gcc/ChangeLog:
 
* config/aarch64/aarch64.md (subdi3, adddi3_aarch64): Don't penalize
SIMD reg variant.
 diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
 index f8eb305140e7b0aed006b33f1724a90939e48316..0a8ca4bcc7941f912c8d42200b33206d4188fa48 100644
 --- a/gcc/config/aarch64/aarch64.md
 +++ b/gcc/config/aarch64/aarch64.md
 @@ -1188,10 +1188,10 @@
  (define_insn "*adddi3_aarch64"
    [(set
 -    (match_operand:DI 0 "register_operand" "=rk,rk,rk,!w")
 +    (match_operand:DI 0 "register_operand" "=rk,rk,rk,w")
      (plus:DI
 -     (match_operand:DI 1 "register_operand" "%rk,rk,rk,!w")
 -     (match_operand:DI 2 "aarch64_plus_operand" "I,r,J,!w")))]
 +     (match_operand:DI 1 "register_operand" "%rk,rk,rk,w")
 +     (match_operand:DI 2 "aarch64_plus_operand" "I,r,J,w")))]
    ""
    "@
    add\\t%x0, %x1, %2
 @@ -1662,9 +1662,9 @@
  )
  (define_insn "subdi3"
 -  [(set (match_operand:DI 0 "register_operand" "=rk,!w")
 -        (minus:DI (match_operand:DI 1 "register_operand" "r,!w")
 -                  (match_operand:DI 2 "register_operand" "r,!w")))]
 +  [(set (match_operand:DI 0 "register_operand" "=rk,w")
 +        (minus:DI (match_operand:DI 1 "register_operand" "r,w")
 +                  (match_operand:DI 2 "register_operand" "r,w")))]
    ""
    "@
    sub\\t%x0, %x1, %x2


Re: [AArch64] Implement movmem for the benefit of inline memcpy

2014-08-01 Thread pinskia


 On Jun 6, 2014, at 1:50 AM, James Greenhalgh james.greenha...@arm.com wrote:
 
 
 Hi,
 
 The move_by_pieces infrastructure performs a copy by repeatedly trying
 the largest safe copy it can make. So for a 15-byte copy we might see:
 
  offset   amount  bytes copied
  0        8       0-7
  8        4       8-11
  12       2       12-13
  14       1       14
 
 However, we can implement a 15-byte copy as so:
 
  offset   amount  bytes copied
  0        8       0-7
  7        8       7-14
 
 Which can prove more efficient for both space and speed.
 
 In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
 implement the movmem pattern name to expand small block copy cases. Note, this
 optimization does not apply for -mstrict-align targets, which must continue
 copying byte-by-byte.

Why not change move_by_pieces instead of having target-specific code? This 
seems like a better option. You can check the "is unaligned access slow" target 
macro to see if you want to do this optimization too.  As I mentioned in the 
other email, make sure you check the volatileness of the from and to before 
doing this optimization. 
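
For reference, the overlapping-copy trick for the 15-byte case looks like this
in plain C -- a sketch that assumes unaligned 8-byte accesses are acceptable,
which is exactly the condition under discussion:

  #include <stdint.h>
  #include <string.h>

  static inline void
  copy15 (void *dst, const void *src)
  {
    uint64_t a, b;
    memcpy (&a, src, 8);                      /* bytes 0-7  */
    memcpy (&b, (const char *) src + 7, 8);   /* bytes 7-14 */
    memcpy (dst, &a, 8);
    memcpy ((char *) dst + 7, &b, 8);
  }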

Thanks,
Andrew


 
 Setting MOVE_RATIO low in this way causes a few tests to begin failing,
 both of these are documented in the test-case as expected to fail for
 low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.
 
 Bootstrapped on aarch64-unknown-linux-gnu with no issues.
 
 OK for trunk?
 
 Thanks,
 James
 
 ---
 gcc/
 
 2014-06-06  James Greenhalgh  james.greenha...@arm.com
 
* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
* config/aarch64/aarch64.c (aarch64_move_pointer): New.
(aarch64_progress_pointer): Likewise.
(aarch64_copy_one_part_and_move_pointers): Likewise.
 (aarch64_expand_movmem): Likewise.
* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
 * config/aarch64/aarch64.md (movmem<mode>): New.
 
 gcc/testsuite/
 
 2014-06-06  James Greenhalgh  james.greenha...@arm.com
 
* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
* gcc.dg/tree-ssa/sra-12.c: Likewise.
 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch


Re: [PATCH, rs6000, 4.8/4.9] Fix alignment of non-Altivec vector struct fields

2014-07-27 Thread pinskia


 On Jul 27, 2014, at 4:53 AM, Alan Modra amo...@gmail.com wrote:
 
 On Sun, Jul 27, 2014 at 07:16:07PM +0930, Alan Modra wrote:
 On Sat, Jul 26, 2014 at 01:45:12PM +0200, Matthias Klose wrote:
 Am 17.07.2014 02:41, schrieb Ulrich Weigand:
 Hello,
 
 this is the variant intended for the 4.8/4.9 branches of the patch:
 https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01072.html
 
 As discussed, it does *not* actually change ABI, but only warn when
 encountering a situation where the ABI will change in a future GCC.
 (Avoiding the specific term GCC 4.10 here since I'm not certain
 whether the next GCC release will in fact be called that ...)
 
 Tested on powerpc64-linux and powerpc64le-linux; also verified using
 the ABI compat suite (against an unpatched GCC) that this patch does
 not change the ABI.
 
 this causes PR libobjc/61920, link failures with -lobjc.
 
 Try this.
 
 Index: libobjc/encoding.c
 ===
 --- libobjc/encoding.c(revision 213090)
 +++ libobjc/encoding.c(working copy)
 @@ -192,6 +192,7 @@
? MAX (MAX (COMPUTED, SPECIFIED), 64)\
: MAX (COMPUTED, SPECIFIED));})
 
 +#define rs6000_special_adjust_field_align_p false
 
 /* Skip a variable name, enclosed in quotes ().  */
 static inline
 
 Blah, that won't work of course.  The macro needs to take two
 parameters.
 
 #define rs6000_special_adjust_field_align_p(FIELD, COMPUTED) false

This is pre-approved if it works.  I really should finish off the branch I 
started years ago :). 

Thanks,
Andrew

 
 -- 
 Alan Modra
 Australia Development Lab, IBM


Re: update address taken: don't drop clobbers

2014-07-06 Thread pinskia


 On Jul 6, 2014, at 7:23 AM, Marc Glisse marc.gli...@inria.fr wrote:
 
 On Mon, 30 Jun 2014, Jeff Law wrote:
 
 On 06/28/14 16:33, Marc Glisse wrote:
 In an earlier version of the patch, I was using
 get_or_create_ssa_default_def (cfun, sym);
 (I was reusing the same variable). This passed bootstrap+testsuite on
 all languages except for ada. Indeed, the compiler wanted to coalesce
 several SSA_NAMEs, including those new ones, in out-of-ssa, but
 couldn't.
 And that's what you need to delve deeper into.  Why did it refuse to 
 coalesce?
 
 As long as the lifetimes do not overlap, then coalescing should have worked.
 
 What is the lifetime of an SSA_NAME with a default definition? The way we 
 handle it now, we start from the uses and go back to all blocks that can 
 reach one of the uses, since there is no defining statement where we could 
 stop (intuitively we should stop where the clobber was, if not earlier). This 
 huge lifetime makes it very likely for such an SSA_NAME to conflict with 
 others. And if an abnormal phi is involved, and thus we must coalesce, there 
 is a problem.
 
 The patch attached (it should probably use ssa_undefined_value_p with a new 
 extra argument to say whether we care about partially-undefined) makes the 
 lifetime of undefined local variables empty and lets the original patch work 
 with:
  def = get_or_create_ssa_default_def (cfun, sym);
 instead of creating a new variable.
 
 However, I am not convinced reusing the same variable is necessarily the best 
 thing. For warnings, we can create a new variable with the same name (with .1 
 added by gcc at the end) and copy the location info (I am not doing that 
 yet), so little is lost. A new variable expresses more clearly that the value 
 it holds is random crap. If I reuse the same variable, the SRA patch doesn't 
 warn because SRA explicitly sets TREE_NO_WARNING (this can probably be 
 changed, but that's starting to be a lot of changes). Creating a new variable 
 is also more general. When reading *ssa_name after *ssa_name={v}{CLOBBER}; or 
 after free(ssa_name); we have no variable to reuse so we will need to create 
 a new undefined variable, and if a variable is global or a PARM_DECL, its 
 default definition is not an undefined value (though it will probably happen 
 in a different pass, so it doesn't have to behave the same).
 
 (Side note: we don't seem to have any code to notice that:
  a=phi<undef,b>
  b=phi<undef,a>
 means both phis can be replaced with undefined variables.)
 
 Do you have any advice on the right direction?

The below patch won't work for parameters. 

Thanks,
Andrew

 
 -- 
 Marc Glisse
 Index: gcc/tree-ssa-live.c
 ===
 --- gcc/tree-ssa-live.c(revision 212306)
 +++ gcc/tree-ssa-live.c(working copy)
 @@ -1086,20 +1086,28 @@ set_var_live_on_entry (tree ssa_name, tr
   if (stmt)
 {
   def_bb = gimple_bb (stmt);
   /* Mark defs in liveout bitmap temporarily.  */
   if (def_bb)
 bitmap_set_bit (live->liveout[def_bb->index], p);
 }
   else
 def_bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
 
 +  /* An undefined local variable does not need to be very alive.  */
 +  if (SSA_NAME_IS_DEFAULT_DEF (ssa_name))
 +{
 +  tree var = SSA_NAME_VAR (ssa_name);
      if (var && TREE_CODE (var) == VAR_DECL && !is_global_var (var))
 +return;
 +}
 +
   /* Visit each use of SSA_NAME and if it isn't in the same block as the def,
  add it to the list of live on entry blocks.  */
   FOR_EACH_IMM_USE_FAST (use, imm_iter, ssa_name)
 {
   gimple use_stmt = USE_STMT (use);
   basic_block add_block = NULL;
 
   if (gimple_code (use_stmt) == GIMPLE_PHI)
 {
  /* Uses in PHI's are considered to be live at exit of the SRC block
 @@ -1422,20 +1430,27 @@ verify_live_on_entry (tree_live_info_p l
   fprintf (stderr, "\n");
}
  else
 fprintf (stderr, " and there is no default def.\n");
}
}
}
  else
if (d == var)
  {
 +/* An undefined local variable does not need to be very
 +   alive.  */
 +tree real_var = SSA_NAME_VAR (var);
  +if (real_var && TREE_CODE (real_var) == VAR_DECL
  +    && !is_global_var (real_var))
 +  continue;
 +
/* The only way this var shouldn't be marked live on entry is
   if it occurs in a PHI argument of the block.  */
size_t z;
bool ok = false;
gimple_stmt_iterator gsi;
for (gsi = gsi_start_phis (e-dest);
 !gsi_end_p (gsi)  !ok;
 gsi_next (gsi))
  {
gimple phi = gsi_stmt (gsi);


Re: Make declaration of wide_int_storage match its definition

2014-07-05 Thread pinskia


 On Jul 5, 2014, at 11:42 AM, Gerald Pfeifer ger...@pfeifer.com wrote:
 
 Farther below in wide-int.h, we define wide_int_storage as a class:
 
  class GTY(()) wide_int_storage
  {
  private:
HOST_WIDE_INT val[WIDE_INT_MAX_ELTS];
unsigned int len;
unsigned int precision;
  :
 
 The patch below, which I applied as obvious after a full bootstrap on
 i386-unknown-freebsd10.0, makes the declaration match the definition.

This patch is fine, but C++ allows class and struct to be used interchangeably. 
If there is a compiler which does not, it is broken and that should be reported 
to them.  Yes, that means clang is broken. 

Thanks,
Andrew

 
 Gerald
 
 2014-07-05  Gerald Pfeifer  ger...@pfeifer.com
 
* wide-int.h (wide_int_storage): Change declaration from struct 
to class.
 
 Index: wide-int.h
 ===
 --- wide-int.h(revision 212304)
 +++ wide-int.h(working copy)
 @@ -284,7 +284,7 @@
 
 template <typename T> struct generic_wide_int;
 template <int N> struct fixed_wide_int_storage;
 -struct wide_int_storage;
 +class wide_int_storage;
 
 /* An N-bit integer.  Until we can use typedef templates, use this instead.  
 */
 #define FIXED_WIDE_INT(N) \


Re: [Committed] New testcase for conditional move with conditional compares

2014-06-24 Thread pinskia


 On Jun 24, 2014, at 2:08 AM, Kyrill Tkachov kyrylo.tkac...@arm.com wrote:
 
 
 On 23/06/14 22:12, Andrew Pinski wrote:
 Hi,
   When looking at the current conditional compare patch, I find that
 we don't have a testcase to test that we don't ICE for the case where
 we have conditional compares and conditional moves where the moves are
 of floating point types.
 
 This patch adds that testcase to the C torture compile test to make
 sure we don't ICE (which I think we do currently).
 
 FWIW, this doesn't ICE for me with aarch64-none-elf trunk.

I meant with conditional compare patches applied. 

Thanks,
Andrew

 
 Kyrill
 
 Thanks,
 Andrew Pinski
 
 2014-06-23  Andrew Pinski  apin...@cavium.com
 
 * gcc.c-torture/compile/20140723-1.c: New testcase.
 
 


Re: [PATCH, 10/10] aarch64: Handle ccmp in ifcvt to make it work with cmov

2014-06-23 Thread pinskia


 On Jun 23, 2014, at 12:37 AM, Zhenqiang Chen zhenqiang.c...@linaro.org 
 wrote:
 
 On 23 June 2014 15:09, Andrew Pinski pins...@gmail.com wrote:
 On Mon, Jun 23, 2014 at 12:01 AM, Zhenqiang Chen
 zhenqiang.c...@linaro.org wrote:
 Hi,
 
 The patch enhances ifcvt to handle conditional compare instruction
 (ccmp) to make it work with cmov. For ccmp, ALLOW_CC_MODE is set to
 TRUE when calling canonicalize_condition. And the backend does not
 need to generate additional compare (CC, 0) for it.
 
 Bootstrap and no check regression on qemu.
 
 OK for trunk?
 
 Thanks!
 -Zhenqiang
 
 ChangeLog:
 2014-06-23  Zhenqiang Chen  zhenqiang.c...@linaro.org
 
 * config/aarch64/aarch64.md (mov<mode>cc): Handle ccmp_cc.
* ifcvt.c: #include ccmp.h.
(struct noce_if_info): Add a new field ccmp_p.
(noce_emit_cmove): Allow ccmp condition.
(noce_get_alt_condition): Call canonicalize_condition with ccmp_p.
(noce_get_condition): Set ALLOW_CC_MODE to TRUE for ccmp.
(noce_process_if_block): Set ccmp_p for ccmp.
 
 testsuite/ChangeLog:
 2014-06-23  Zhenqiang Chen  zhenqiang.c...@linaro.org
 
* gcc.target/aarch64/ccmn-csel-1.c: New testcase.
* gcc.target/aarch64/ccmn-csel-2.c: New testcase.
* gcc.target/aarch64/ccmn-csel-3.c: New testcase.
* gcc.target/aarch64/ccmp-csel-1.c: New testcase.
* gcc.target/aarch64/ccmp-csel-2.c: New testcase.
* gcc.target/aarch64/ccmp-csel-3.c: New testcase.
 
 diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
 index fcc5559..82cc561 100644
 --- a/gcc/config/aarch64/aarch64.md
 +++ b/gcc/config/aarch64/aarch64.md
 @@ -2459,15 +2459,19 @@
     (match_operand:ALLI 3 "register_operand" "")))]
   ""
   {
 -rtx ccreg;
 enum rtx_code code = GET_CODE (operands[1]);
 
 if (code == UNEQ || code == LTGT)
   FAIL;
 
 -ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0),
 - XEXP (operands[1], 1));
 -operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
  +if (!ccmp_cc_register (XEXP (operands[1], 0),
  +                       GET_MODE (XEXP (operands[1], 0))))
 +  {
 +   rtx ccreg;
 +   ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0),
 +XEXP (operands[1], 1));
 +   operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
 +  }
   }
 )
 
 
 
 You should do the same thing for the FP one.  The change to aarch64.md
 is exactly the same patch which I came up with.
 
 Thanks for the comments.
 
 For AARCH64, we can mix INT and FP compares. But FP compare would be
 slower than INT compare.

One point is that this is not about fp compares but rather moving of fp 
register. The fp pattern is used for that.  So something like this would 
fail/ice:
double f(double a, double b, int c, int d)
{
   return c > 10 && d < 20 ? a : b;
}

Thanks,
Andrew



 
 CMP
 FCCMP
 
 FCMP
 CCMP
 
 FCMP
 FCCMP
 
  I do not have enough resources to collect benchmark results to show they are
  valuable. So the patches did not handle FP at all. If you consider CCMP for
  FP valuable, I will work out a separate patch to support it.
 Or you can share your patches.

I need to l

 
 Thanks!
 -Zhenqiang
 
 For the rest I actually I have a late phi-opt pass which does the
 conversion into COND_EXPR.  That is I don't change ifcvt at all.
 
 And then I needed two more patches after that to get conditional
 compares to work with cmov's.
 
 Thanks. Any patch to improve ccmp is welcome.
 
 -Zhenqiang
 
 The following patch which fixes up expand_cond_expr_using_cmove to
 handle CCmode correctly:
 --- a/gcc/expr.c
 +++ b/gcc/expr.c
 @@ -7989,7 +7989,9 @@ expand_cond_expr_using_cmove (tree treeop0
 ATTRIBUTE_UNUSED,
   op00 = expand_normal (treeop0);
   op01 = const0_rtx;
   comparison_code = NE;
 -  comparison_mode = TYPE_MODE (TREE_TYPE (treeop0));
 +  comparison_mode = GET_MODE (op00);
 +  if (comparison_mode == VOIDmode)
 +   comparison_mode = TYPE_MODE (TREE_TYPE (treeop0));
 }
 
   if (GET_MODE (op1) != mode)
 
 
 --- CUT ---
 And then this one to have ccmp to be expanded from the tree level:
 index cfc4a16..056e9b0 100644 (file)
 --- a/gcc/expr.c
 +++ b/gcc/expr.c
 @@ -9300,26 +9300,36 @@ used_in_cond_stmt_p (tree exp)
   imm_use_iterator ui;
   gimple use_stmt;
   FOR_EACH_IMM_USE_STMT (use_stmt, ui, exp)
 -if (gimple_code (use_stmt) == GIMPLE_COND)
 -  {
 -   tree op1 = gimple_cond_rhs (use_stmt);
 -   /* TBD: If we can convert all
 -   _Bool t;
 +{
 +  if (gimple_code (use_stmt) == GIMPLE_COND)
 +   {
 + tree op1 = gimple_cond_rhs (use_stmt);
 + /* TBD: If we can convert all
 + _Bool t;
 
 -   if (t == 1)
 - goto bb 3;
 -   else
 - goto bb 4;
 -  to
 -   if (t != 0)
 - goto bb 3;
 -   else
 - 

Re: [PATCH][RFC] Add phiopt in early opts (and add -fssa-phiopt option)

2014-06-17 Thread pinskia


 On Jun 17, 2014, at 6:07 AM, Richard Biener rguent...@suse.de wrote:
 
 
 First this adds a controlling option to the phiopt pass (-fssa-phiopt).
 Second, this moves the first phiopt pass from the main optimization
 pipeline into early opts (before merge-phi which confuses phiopt
 but after dce which will help it).
 
 ISTR that adding an early phiopt pass was wanted to perform CFG
 cleanups on the weird CFG that the gimplifier produces from C++
 code (but I fail to recollect the details nor remember a bug number).
 
 Generally doing a phiopt before merge-phi gets the chance to screw
 things up is good.  Also phiopt is a kind of cleanup that is
 always beneficial as it decreases code-size.
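
 As a concrete illustration of the kind of cleanup involved (a sketch, not
 from the patch), phiopt turns the PHI for this into a single MAX_EXPR:

   int
   f (int a, int b)
   {
     int r;
     if (a > b)
       r = a;
     else
       r = b;
     return r;   /* becomes: return MAX_EXPR <a, b>;  */
   }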
 
 Bootstrap and regtest running on x86_64-unknown-linux-gnu.
 
 I felt that -ftree-XXX is bad naming so I went for -fssa-XXX
 even if that is now inconsistent.  Any optinion here?  For
 RTL we simply have unsuffixed names so shall we instead go
 for -fphiopt?  PHI implies SSA anyway and 'SSA' or 'RTL' is
 an implementation detail that the user should not be interested
 in (applies to tree- as well, of course).  Now, 'phiopt' is a
 bad name when thinking of users (but they shouldn't play with
 those options anyway).
 
 So - comments on the pass move?  Comments on the flag naming?
 
 Thanks,
 Richard.
 
 2014-06-17  Richard Biener  rguent...@suse.de
 
* passes.def (pass_all_early_optimizations): Add phi-opt
after dce and before merge-phi.
(pass_all_optimizations): Remove first phi-opt pass.
* common.opt (fssa-phiopt): New option.
* opts.c (default_options_table): Enable -fssa-phiopt with -O1+
but not with -Og.
* tree-ssa-phiopt.c (pass_phiopt): Add gate method.
* doc/invoke.texi (-fssa-phiopt): Document.
 
 Index: gcc/passes.def
 ===
 --- gcc/passes.def(revision 211736)
 +++ gcc/passes.def(working copy)
 @@ -73,8 +73,12 @@ along with GCC; see the file COPYING3.
 execute TODO_rebuild_alias at this point.  */
  NEXT_PASS (pass_build_ealias);
  NEXT_PASS (pass_fre);
 -  NEXT_PASS (pass_merge_phi);
  NEXT_PASS (pass_cd_dce);
 +  NEXT_PASS (pass_phiopt);
 +  /* Do this after phiopt runs as phiopt is confused by
 + PHIs with more than two arguments.  Switch conversion
 + looks for a single PHI block though.  */
 +  NEXT_PASS (pass_merge_phi);

I had made phiopt not be confused by more than two arguments. What has changed? 
I think we should make phiopt handle more than two arguments well again. 

Thanks,
Andrew


  NEXT_PASS (pass_early_ipa_sra);
  NEXT_PASS (pass_tail_recursion);
  NEXT_PASS (pass_convert_switch);
 @@ -155,7 +159,6 @@ along with GCC; see the file COPYING3.
   NEXT_PASS (pass_cselim);
   NEXT_PASS (pass_copy_prop);
   NEXT_PASS (pass_tree_ifcombine);
 -  NEXT_PASS (pass_phiopt);
   NEXT_PASS (pass_tail_recursion);
   NEXT_PASS (pass_ch);
   NEXT_PASS (pass_stdarg);
 Index: gcc/common.opt
 ===
 --- gcc/common.opt(revision 211736)
 +++ gcc/common.opt(working copy)
 @@ -1950,6 +1950,10 @@ fsplit-wide-types
 Common Report Var(flag_split_wide_types) Optimization
 Split wide types into independent registers
 
 +fssa-phiopt
 +Common Report Var(flag_ssa_phiopt) Optimization
 +Optimize conditional patterns using SSA PHI nodes
 +
 fvariable-expansion-in-unroller
 Common Report Var(flag_variable_expansion_in_unroller) Optimization
 Apply variable expansion when loops are unrolled
 Index: gcc/opts.c
 ===
 --- gcc/opts.c(revision 211736)
 +++ gcc/opts.c(working copy)
 @@ -457,6 +457,7 @@ static const struct default_options defa
 { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fbranch_count_reg, NULL, 1 },
 { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fmove_loop_invariants, NULL, 1 },
 { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_pta, NULL, 1 },
 +{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fssa_phiopt, NULL, 1 },
 
 /* -O2 optimizations.  */
 { OPT_LEVELS_2_PLUS, OPT_finline_small_functions, NULL, 1 },
 Index: gcc/tree-ssa-phiopt.c
 ===
 --- gcc/tree-ssa-phiopt.c(revision 211736)
 +++ gcc/tree-ssa-phiopt.c(working copy)
 @@ -2332,6 +2332,7 @@ public:
 
   /* opt_pass methods: */
   opt_pass * clone () { return new pass_phiopt (m_ctxt); }
 +  virtual bool gate (function *) { return flag_ssa_phiopt; }
   virtual unsigned int execute (function *)
 {
   return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 Index: gcc/doc/invoke.texi
 ===
 --- gcc/doc/invoke.texi(revision 211736)
 +++ gcc/doc/invoke.texi(working copy)
 @@ -412,7 +412,7 @@ Objective-C and Objective-C++ Dialects}.
 -fselective-scheduling -fselective-scheduling2 @gol
 

Re: PING: PATCH [4/n]: Prepare x32: Permute the conversion and addition if one operand is a constant

2014-05-29 Thread pinskia


 On May 29, 2014, at 9:13 AM, H.J. Lu hjl.to...@gmail.com wrote:
 
 On Wed, May 28, 2014 at 9:52 PM, Andrew Pinski pins...@gmail.com wrote:
 On Wed, Jul 13, 2011 at 9:39 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Wed, Jul 13, 2011 at 9:13 AM, Paolo Bonzini bonz...@gnu.org wrote:
 On 07/11/2011 05:54 PM, H.J. Lu wrote:
 
 The key is the
 
XEXP (x, 1) == convert_memory_address_addr_space
   (to_mode, XEXP (x, 1), as)
 
 test.  It ensures basically that the constant has 31-bit precision,
 because
  otherwise the constant would change from e.g. (const_int -0x7ffffffc)
  to
  (const_int 0x80000004) when zero-extending it from SImode to DImode.
 
 But I'm not sure it's safe.  You have,
 
   (zero_extend:DI (plus:SI FOO:SI) (const_int Y))
 
 and you want to convert it to
 
   (plus:DI FOO:DI (zero_extend:DI (const_int Y)))
 
 (where the zero_extend is folded).  Ignore that FOO is a SYMBOL_REF
 (this
 piece of code does not assume anything about its shape); if FOO ==
 0xfffc and Y = 8, the result will be respectively 0x4 (valid) and
 0x10004 (invalid).
 
 This example contradicts what you said above It ensures basically that
 the
 constant has 31-bit precision.
 
 Why?  Certainly Y = 8 has 31-bit (or less) precision.  So it has the same
 representation in SImode and DImode, and the test above on XEXP (x, 1)
 succeeds.
 
 And then we permute conversion and addition, which leads to the issue you
 raised above.  In another word, the current code permutes conversion
 and addition.
  It leads to different values in case of symbol (0xfffffffc) + 8.
 Basically the current
 test for 31-bit (or less) precision is bogus.  The real question is
 for a address
 computation, A + B, if address wrap-around is supported in
 convert_memory_address_addr_space.
 
  Unless the code has already reassociated the additions.
 Like in the AARCH64 ILP32 case:
 
 (plus:SI (plus:SI (mult:SI (reg/v:SI 80 [ b ])
                              (const_int -4 [0xfffffffffffffffc]))
                     (subreg/s/u:SI (reg/v/f:DI 79 [ a ]) 0))
            (const_int -1073742592 [0xffffffffbffffd00]))
 
 The Tree level is correct in that it did not reassociate the addition
 but the RTL level ignores that.
 
 So this patch is invalid and incorrect unless you know the non
 constant part of the addition is a pointer (which is not the case
 here).
 
 There is an address overflow.  Is the address overflow behavior
 defined here?

There was no address overflow in the original code and there was no address 
overflow at the tree level. The RTL level does introduce an address overflow, 
but the semantics of plus are defined to be wrapping, so there is no overflow.  
This is blocking me from testing ILP32 under GNU/Linux, as ld.so gets 
miscompiled and stack addresses have the sign bit set. 
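
To spell out the wrap with concrete numbers (illustrative values, in the
spirit of Paolo's example above):

  SImode:  0xfffffffc + 0x8             = 0x00000004   (wraps in 32 bits)
  zero_extend to DImode afterwards      = 0x0000000000000004
  but zero_extend first, then plus:DI:
           0x00000000fffffffc + 0x8     = 0x0000000100000004

so permuting the conversion and the addition changes the computed address.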

Thanks,
Andrew


 
 
 -- 
 H.J.


Re: PING: PATCH [4/n]: Prepare x32: Permute the conversion and addition if one operand is a constant

2014-05-29 Thread pinskia


 On May 29, 2014, at 10:09 AM, H.J. Lu hjl.to...@gmail.com wrote:
 
 On Thu, May 29, 2014 at 9:23 AM,  pins...@gmail.com wrote:
 
 
 On May 29, 2014, at 9:13 AM, H.J. Lu hjl.to...@gmail.com wrote:
 
 On Wed, May 28, 2014 at 9:52 PM, Andrew Pinski pins...@gmail.com wrote:
 On Wed, Jul 13, 2011 at 9:39 AM, H.J. Lu hjl.to...@gmail.com wrote:
 On Wed, Jul 13, 2011 at 9:13 AM, Paolo Bonzini bonz...@gnu.org wrote:
 On 07/11/2011 05:54 PM, H.J. Lu wrote:
 
 The key is the
 
   XEXP (x, 1) == convert_memory_address_addr_space
  (to_mode, XEXP (x, 1), as)
 
 test.  It ensures basically that the constant has 31-bit precision,
 because
  otherwise the constant would change from e.g. (const_int -0x7ffffffc)
  to
  (const_int 0x80000004) when zero-extending it from SImode to DImode.
 
 But I'm not sure it's safe.  You have,
 
  (zero_extend:DI (plus:SI FOO:SI) (const_int Y))
 
 and you want to convert it to
 
  (plus:DI FOO:DI (zero_extend:DI (const_int Y)))
 
 (where the zero_extend is folded).  Ignore that FOO is a SYMBOL_REF
 (this
 piece of code does not assume anything about its shape); if FOO ==
  0xfffffffc and Y = 8, the result will be respectively 0x4 (valid) and
  0x100000004 (invalid).
 
 This example contradicts what you said above It ensures basically that
 the
 constant has 31-bit precision.
 
 Why?  Certainly Y = 8 has 31-bit (or less) precision.  So it has the same
 representation in SImode and DImode, and the test above on XEXP (x, 1)
 succeeds.
 
 And then we permute conversion and addition, which leads to the issue you
 raised above.  In another word, the current code permutes conversion
 and addition.
  It leads to different values in case of symbol (0xfffffffc) + 8.
 Basically the current
 test for 31-bit (or less) precision is bogus.  The real question is
 for a address
 computation, A + B, if address wrap-around is supported in
 convert_memory_address_addr_space.
 
  Unless the code has already reassociated the additions.
 Like in the AARCH64 ILP32 case:
 
 (plus:SI (plus:SI (mult:SI (reg/v:SI 80 [ b ])
                              (const_int -4 [0xfffffffffffffffc]))
                     (subreg/s/u:SI (reg/v/f:DI 79 [ a ]) 0))
            (const_int -1073742592 [0xffffffffbffffd00]))
 
 The Tree level is correct in that it did not reassociate the addition
 but the RTL level ignores that.
 
 So this patch is invalid and incorrect unless you know the non
 constant part of the addition is a pointer (which is not the case
 here).
 
 There is an address overflow.  Is the address overflow behavior
 defined here?
 
 There was no address overflow in the original code and there was no address 
 overflow in the tree level. The rtl level does introduce an address overflow 
 but the semantics of plus is defined to be wrapping so there is no overflow. 
   This is blocking me from testing ilp32 under gnu/Linux as ld.so gets 
 miscompiled and stack addresses have the sign bit set.
 
 
 What is your Pmode?

Pmode is DImode while ptr_mode is SImode.  Pointers are zero extended when 
converting between SImode and DImode. 

Thanks,
Andrew


 
 
 -- 
 H.J.


Re: [MIPS] Add basic support for MSA (SIMD)

2014-05-28 Thread pinskia


On May 28, 2014, at 1:03 AM, Matthew Fortune matthew.fort...@imgtec.com wrote:

 You shouldn't need to declare __builtin_* functions anyway.  And if a
 function can be represented directly with GNU C vector extensions, it's
 preferred to implement it that way inline in the header rather than having
 built-in functions duplicating existing GNU C functionality.  (Look at
 what AArch64 arm_neon.h does where possible, and what ARM arm_neon.h has
 been moved towards lately.  I don't now what the msa.h functions do, so I
 don't know if this actually applies to any of them - but it's something to
 consider, so that built-in functions are only defined where actually
 needed.)
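
 For illustration, the style being suggested, sketched for an imaginary msa.h
 entry (the names here are made up; the real MSA intrinsic names and types may
 differ):

   typedef int v4i32 __attribute__ ((vector_size (16)));

   __extension__ static __inline v4i32
   __msa_addv_w (v4i32 a, v4i32 b)
   {
     return a + b;   /* plain GNU C vector addition, no builtin needed */
   }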
 
 In the aarch64 arm_neon.h header there are a decent number of inline asm
 implementations too instead of builtins. It is not immediately obvious to me
 as to what the deciding factor is between adding a builtin and using inline
 asm when vector extensions do not support the operation. Do you happen to
 know why inline asm is used in places?

Most likely to simplify the implementation at the time. Inline asm is useless 
when it comes to scheduling code.  So the answer should be easy there. 


 
 This looks like a reasonable idea to use GNU extensions where available. The
 down-side to this approach is that it may be necessary to write quite
 dis-similar headers for LLVM vs GCC which I think is part of the reason why
 the header is written as it is. I don't know if that is a good reason to
 require builtins or not though.

Well, clang supports OpenCL, and OpenCL got many of its vector behaviors from 
gcc. So I doubt it would be too hard to have some ifdefs in there.

Thanks,
Andrew

 
 Regards,
 Matthew


Re: [patch] Fix over-optimization of calls to pure function

2014-05-19 Thread pinskia


 On May 19, 2014, at 2:39 AM, Richard Biener richard.guent...@gmail.com 
 wrote:
 
 On Mon, May 19, 2014 at 10:58 AM, Eric Botcazou ebotca...@adacore.com 
 wrote:
 Hi,
 
 this fixes an over-optimization of the GIMPLE optimizer, whereby two 
 otherwise
 identical calls to a pure function present in different EH regions are CSEd,
 which changes the semantics of the program because the second EH handler is
 not invoked:
 
  begin
I := F(0);
  exception
     when E => N := N + 1;
  end;
 
  begin
I := F(0);
  exception
     when E => N := N + 1;
  end;
 
 Two passes (DOM and FRE) can optimize this construct and a test modelled on
 stmt_can_throw_internal is used to coax them into not doing so.
 
 Tested on x86_64-suse-linux, OK for the mainline?
 
 Heh, interesting.
 
 Please keep the redundant test, it speeds up comparison on hash
 collisions.  As you are on it I'd use size_int (lr).
 
 I think it's ok to CSE foo (0) for
 
   try {
  foo (0);
   } catch (...) { side-effect }
   foo (0);
 
 but not
 
   foo(0);
   try {
 foo (0);
   } catch (...) { side-effect }
 
 but I don't see a clever way to preserve one but not the other
 (at least in SCCVN).  Do you think it's worth trying to preserve
 that?
 
 Otherwise the patch looks good to me.  (bonus point for a C++
 testcase as Ada is not in our testing requirements and it would
 be nice to exercise this w/o Ada)

I thought we had decided a long time ago that pure and const functions could 
not throw and that was the documented behavior. 
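
For what it's worth, a minimal C++ analogue of the Ada testcase might look like
this (a sketch; whether a pure function may legitimately throw is exactly the
question above):

  extern int f (int) __attribute__ ((pure));   /* may throw */
  extern int n;

  void
  test ()
  {
    try { n += f (0); } catch (...) { n++; }
    try { n += f (0); } catch (...) { n++; }   /* must not be CSEd away */
  }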

Thanks,
Andrew

 
 Thanks,
 Richard.
 
 
 2014-05-19  Eric Botcazou  ebotca...@adacore.com
 
* tree-ssa-dom.c (hashable_expr_equal_p) EXPR_CALL: Also compare the
EH region of calls to pure functions that can throw an exception.
* tree-ssa-sccvn.c (vn_reference_eq): Remove redundant test.
(copy_reference_ops_from_call): Also copy the EH region of the call if
it can throw an exception.
 
 
 2014-05-19  Eric Botcazou  ebotca...@adacore.com
 
* gnat.dg/opt35.adb: New test.
* gnat.dg/opt35_pkg.ad[sb]: New helper.
 
 
 --
 Eric Botcazou


Re: [PATCH] Fix PR54733 Optimize endian independent load/store

2014-05-16 Thread pinskia


 On May 16, 2014, at 4:13 AM, Richard Biener richard.guent...@gmail.com 
 wrote:
 
 On Fri, May 16, 2014 at 1:03 PM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Fri, May 16, 2014 at 12:56 PM,  pins...@gmail.com wrote:
 
 
 On May 16, 2014, at 3:48 AM, Richard Biener richard.guent...@gmail.com 
 wrote:
 
 On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
 thomas.preudho...@arm.com wrote:
 Ping?
 
 Sorry ...
 
 Best regards,
 
 Thomas Preud'homme
 
 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
 ow...@gcc.gnu.org] On Behalf Of Thomas Preud'homme
 Sent: Friday, May 09, 2014 6:26 PM
 To: GCC Patches
 Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
 
 Sorry, took longer than expected as I got distracted by some other patch.
 I merged the whole patchset in a single patch as I was told the current 
 setup
 is actually more difficult to read.
 
 Here are the updated ChangeLogs:
 
 *** gcc/ChangeLog ***
 
 2014-05-09  Thomas Preud'homme  thomas.preudho...@arm.com
 
 PR tree-optimization/54733
 * expr.c (get_inner_reference): Add a parameter to control whether
 a
 MEM_REF should be split into base + offset.
 * tree.h (get_inner_reference): Default new parameter to false.
 * tree-ssa-math-opts.c (nop_stats): New bswap_stats structure.
 (CMPNOP): Define.
 (find_bswap_or_nop_load): New.
 (find_bswap_1): Renamed to ...
 (find_bswap_or_nop_1): This. Also add support for memory source.
 (find_bswap): Renamed to ...
 (find_bswap_or_nop): This. Also add support for memory source and
 detection of bitwise operations equivalent to load in host 
 endianness.
 (execute_optimize_bswap): Likewise. Also move its leading
 comment back
 in place and split statement transformation into ...
 (bswap_replace): This. Add assert when updating bswap_stats.
 
 *** gcc/testsuite/ChangeLog ***
 
 2014-05-09  Thomas Preud'homme  thomas.preudho...@arm.com
 
 PR tree-optimization/54733
 * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
 bswap
 optimization to support memory sources and bitwise operations
 equivalent to load in host endianness.
 * gcc.dg/optimize-bswaphi-1.c: Likewise.
 * gcc.dg/optimize-bswapsi-2.c: Likewise.
 * gcc.c-torture/execute/bswap-2.c: Likewise.
 
 Ok for trunk?
 
 Ok, I now decided otherwise and dislike the new parameter to
 get_inner_reference.  Can you please revert that part and just
 deal with a MEM_REF result in your only caller?
 
 And (of course) I found another possible issue.  The way you
 compute load_type and use it here:
 
 +  /* Perform the load.  */
  +  load_offset_ptr = build_int_cst (n->alias_set, 0);
 +  val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
 + load_offset_ptr);
 
 makes the load always appear aligned according to the mode of
 load_type.  On strict-alignment targets this may cause faults.
 
 So what you have to do is either (simpler)
 
  unsigned int align = get_pointer_alignment (addr_tmp);
  tree al_load_type = load_type;
   if (align < TYPE_ALIGN (load_type))
al_load_type = build_aligned_type (load_type, align);
 ...
   val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
load_offset_ptr);
 
 or keep track of the first actual load and use
 
  unsigned int align = get_object_alignment (that_first_load);
 
 first in the one that corresponds to addr_tmp.  From that there
 is a much better chance to derive good alignment values.
 
 Of course on STRICT_ALIGNMENT targets a not aligned load
 will be decomposed again, so eventually doing the transformation
 may no longer be profitable(?).
 
  Not always decomposed. On MIPS, it should be using the load/store left/right 
  instructions for unaligned loads/stores, which is normally better than 
  decomposed loads/stores. So having a cost model would be nice.
 
 Agreed, but I am happy with doing that as a followup.  Btw,
 a very simple one would be to reject unaligned
 SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
 [of course that may be true on MIPS even for the cases where
 a reasonable fast unalgined variant exists - nearly no target
 defines that macro in a too fancy way]
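
  Something like this, presumably (a sketch with a hypothetical helper name;
  it assumes load_type and align are what the bswap pass has in hand, and it
  uses GCC-internal macros):

    static bool
    bswap_wide_load_ok_p (tree load_type, unsigned int align)
    {
      if (align < TYPE_ALIGN (load_type)
          && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
        return false;  /* The unaligned load would just be decomposed again.  */
      return true;
    }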
 
 Oh, and what happens for
 
 unsigned foo (unsigned char *x)
 {
   return x[0] << 24 | x[2] << 8 | x[3];
 }
 
 ?  We could do an unsigned int load from x and zero byte 3
 with an AND.  Enhancement for a followup, similar to also
 considering vector types for the load (also I'm not sure
 that uint64_type_node always has non-BLKmode for all
 targets).

No, we cannot: if x[4] is on a different page which is not mapped in, we get a 
fault. Not something we want. 

Thanks,
Andrew


 
 Richard.
 
 Richard.
 
 Thanks,
 Andrew
 
 
 Thanks and sorry again for the delay.
 
 Otherwise the patch looks good to me.
 
 Richard.
 
 Best regards,
 
 Thomas
 
 -Original Message-
 From: gcc-patches-ow...@gcc.gnu.org 

Re: libsanitizer merge from upstream r208536

2014-05-13 Thread pinskia


 On May 13, 2014, at 12:59 AM, Konstantin Serebryany 
 konstantin.s.serebry...@gmail.com wrote:
 
 I've committed this upstream and will include it into my next updated patch:
 
  +#if defined(__x86_64__) && !defined(_LP64)
 +  typedef long long __sanitizer_time_t;
 +#else
 +  typedef long __sanitizer_time_t;
 +#endif
 +
   struct __sanitizer_timeb {
 -long time;
 +__sanitizer_time_t time;
 unsigned short millitm;


This will also be needed for ilp32 aarch64 too. 
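
Something along these lines, presumably (a sketch -- whether _LP64 is the
right test for AArch64 ILP32 here is an assumption, not from the patch):

  #if (defined(__x86_64__) || defined(__aarch64__)) && !defined(_LP64)
    typedef long long __sanitizer_time_t;
  #else
    typedef long __sanitizer_time_t;
  #endif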
 
 
 If this is not enough, please contribute patches directly upstream --
 I can not accept them from here.


I am still against the way libsanitizer is being maintained. I really think the 
maintainers of the source in gcc should submit the patches upstream and not 
burden the target maintainers with it.  Or maybe having libsanitizer as a 
project separate from gcc and LLVM would be even better. 

Thanks,
Andrew


 
 Also, what's the story with x32 in LLVM?
 Is there any chance you can set up a public build bot for x32 asan (upstream)?
 
 On Mon, May 12, 2014 at 9:53 PM, H.J. Lu hjl.to...@gmail.com wrote:
 On Mon, May 12, 2014 at 4:20 AM, Konstantin Serebryany
 konstantin.s.serebry...@gmail.com wrote:
 This is the first libsanitizer merge in 4.10 (the last merge was in
 December 2013).
 
 Tested on Ubuntu 12.04 like this:
 rm -rf */{*/,}libsanitizer  make -j 50
 make -j 40 -C gcc check-g{cc,++}
 RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} asan.exp'  \
 make -j 40 -C gcc check-g{cc,++}
 RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} tsan.exp'  \
 make -j 40 -C gcc check
 RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} ubsan.exp'  \
 echo PASS
 
 5 months' worth of changes may break any platform we are not testing 
 ourselves
 (that includes Ubuntu 12.04, 13.10, 14.04, Mac 10.9, Windows 7, Android 
 ARM),
 please help us test this patch on your favorite platform.
 
 Expected ChangeLog entries:
 === gcc/testsuite/ChangeLog
 2014-05-XX  Kostya Serebryany  k...@google.com
 
* c-c++-common/tsan/mutexset1.c: Update the test to match
upstream r208536.
* g++.dg/asan/symbolize-callback-1.C: Delete the deprecated test.
 
 === libsanitizer/ChangeLog
 2014-05-XX  Kostya Serebryany  k...@google.com
 
* All source files: Merge from upstream r208536.
* asan/Makefile.am (asan_files): Added new files.
* asan/Makefile.in: Regenerate.
* tsan/Makefile.am (tsan_files): Added new files.
* tsan/Makefile.in: Regenerate.
* sanitizer_common/Makefile.am (sanitizer_common_files): Added
 new files.
* sanitizer_common/Makefile.in: Regenerate.
 
 --kcc
 
 sanitizer_common/sanitizer_platform_limits_posix.h has
 
  struct __sanitizer_timeb {
long time;
unsigned short millitm;
short timezone;
short dstflag;
  };
 
 On Linux, timeb is
 
 struct timeb
  {
time_t time; /* Seconds since epoch, as from `time'.  */
unsigned short int millitm; /* Additional milliseconds.  */
short int timezone; /* Minutes west of GMT.  */
short int dstflag; /* Nonzero if Daylight Savings Time used.  */
  };
 
 For x32, long is 32-bit and time_t is 64-bit.  We need something
 like
 
 struct __sanitizer_timeb {
  #if defined(__x86_64__) && !defined(_LP64)
long long time;
 #else
long time;
 #endif
 
 similar to other places.
 
 
 --
 H.J.


Re: [PATCH GCC]Add 'force-dwarf-lexical-blocks' command line option

2014-05-07 Thread pinskia


 On May 7, 2014, at 2:32 AM, Herman, Andrei andrei_her...@codesourcery.com 
 wrote:
 
 
 Hi,
 
 Currently GCC only emits DWARF debug information (DW_TAG_lexical_block DIEs)
 for compound statements containing significant local declarations.
 However, code coverage tools that process the DWARF debug information to
 implement block/path coverage need more complete lexical block information. 
 
 This patch adds the necessary functionality under the control of a new 
 command line argument: -fforce-dwarf-lexical-blocks.
 
 When this flag is set, a DW_TAG_lexical_block DIE will be emitted for every
 function body, loop body, switch body, case statement, if-then and if-else
 statement, even if the body is a single statement. 
 Likewise, a lexical block will be emitted for the first label of a labeled
 statement. This block ends at the end of the current lexical scope, or when
 a break, continue, goto or return statement is encountered at the same lexical
 scope level. 
 Consequently, any case in a switch statement that does not flow through to 
 the next case, will have its own dwarf lexical block.
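
As a small illustration (a sketch, not one of the attached testcases), with the
flag every body below gets its own DW_TAG_lexical_block even though none of
them declares locals:

  extern void g (void);

  void
  f (int x)
  {
    if (x)            /* then-body: its own block      */
      g ();
    switch (x)
      {
      case 1:         /* case body: its own block      */
        g ();
        break;        /* break ends that case's block  */
      }
  }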
 
 The complete change proposal contains 4 patches (attached first 3):
 1. Add command line option -fforce-dwarf-lexical-blocks

This option, since it is specific to the C front end, should go into c.opt 
instead of common.opt, unless you are going to extend this to Ada, Java and 
Fortran. 

Thanks,
Andrew


 2. Use of flag_force_dwarf_blocks
 3. Create label scopes
 
 A forth patch, extending the proposed functionality to C++ will be submitted 
 in a separate message.
 
 Attached are the proposed ChangeLog additions, named according to the 
 directory each one belongs to.
 
 Best regards,
 Andrei Herman
 Mentor Graphics Corporation
 Israel branch 
 
 gcc_c_ChangeLog
 gcc_c-family_ChangeLog
 gcc_ChangeLog
 0001-Add-command-line-option-fforce_dwarf_lexical_blocks.patch
 0002-Use-flag_force_dwarf_blocks.patch
 0003-Create-label-scopes.patch


Re: [AArch64] Fix integer vabs intrinsics

2014-05-02 Thread pinskia


 On May 2, 2014, at 2:21 AM, James Greenhalgh james.greenha...@arm.com wrote:
 
 On Fri, May 02, 2014 at 10:00:15AM +0100, Andrew Pinski wrote:
 On Fri, May 2, 2014 at 1:48 AM, James Greenhalgh
 james.greenha...@arm.com wrote:
 
 Hi,
 
 Unlike the mid-end's concept of an ABS_EXPR, which treats overflow as
 undefined/impossible, the neon intrinsics vabs intrinsics should behave as
 the hardware. That is to say, the pseudo-code sequence:
 
 
 Only for signed integer types.  You should be able to use an unsigned
 integer type here instead.
 
 If anything, I think that puts us in a worse position.

Not if you cast it back. 
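
A scalar sketch of the idea, with a hypothetical helper name (the real
intrinsic would do this element-wise on the vector):

  #include <stdint.h>

  static inline int8_t
  abs_s8_wrapping (int8_t a)
  {
    uint8_t t = a < 0 ? (uint8_t) -a : (uint8_t) a;
    return (int8_t) t;   /* abs (-128) comes back as -128, matching the hw */
  }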


 The issue that
 inspires this patch is that GCC will happily fold:
 
  t1 = ABS_EXPR (x)
  t2 = GE_EXPR (t1, 0)
 
 to
 
  t2 = TRUE
 
 Surely an unsigned integer type is going to suffer the same fate? Certainly I
 can imagine somewhere in the compiler there being a fold path for:

Yes, but if you add a cast from the unsigned type to the signed type, gcc does 
not optimize that. If it does, it is a bug, since the overflow is defined there. 

 
   (unsigned >= 0) == TRUE
 
 
  a = vabs_s8 (vdup_n_s8 (-128));
   assert (a >= 0);
 
 does not hold. As in hardware
 
  abs (-128) == -128
 
 Folding vabs intrinsics to an ABS_EXPR is thus a mistake, and we should 
 avoid
 it. In fact, we have to be even more careful than that, and keep the integer
 vabs intrinsics as an unspec in the back end.
 
 No it is not.  The mistake is to use signed integer types here.  Just
 add a conversion to an unsigned integer vector and it will work
 correctly.
 In fact the ABS rtl code is not undefined for the overflow.
 
  Here we are covering ourselves against a separate issue. For auto-vectorized
 code we want the SABD combine patterns to kick in whenever sensible. For
 intrinsics code, in the case where vsub_s8 (x, y) would cause an underflow:
 
  vabs_s8 (vsub_s8 (x, y)) != vabd_s8 (x, y)
 
 So in this case, the combine would be erroneous. Likewise SABA.

This sounds like it would be problematic for unsigned types and not just for 
vabs_s8 with vsub_s8. So I think you should be using an unspec for vabd_s8 
instead, since in RTL overflow and underflow are defined to be wrapping. 

Thanks,
Andrew Pinski

 
 Thanks,
 James


Re: [C PATCH] proposal to add new warning -Wsizeof-array-argument

2014-04-27 Thread pinskia


 On Apr 27, 2014, at 10:09 AM, Prathamesh Kulkarni 
 bilbotheelffri...@gmail.com wrote:
 
 On Sun, Apr 27, 2014 at 8:48 PM, Trevor Saunders tsaund...@mozilla.com 
 wrote:
 On Sun, Apr 27, 2014 at 06:21:20PM +0530, Prathamesh Kulkarni wrote:
 On Sun, Apr 27, 2014 at 5:31 PM, Trevor Saunders tsaund...@mozilla.com 
 wrote:
 On Sun, Apr 27, 2014 at 02:31:46AM +0530, Prathamesh Kulkarni wrote:
 Hi,
    Would it be a good idea to add a new warning -Wsizeof-array-argument that
  warns when sizeof is applied to a parameter declared as an array?
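
  The case being warned about, for concreteness (a sketch, not the testcase
  from the patch):

    int
    f (int a[10])
    {
      return sizeof (a);   /* a decays to int *, so this is sizeof (int *) */
    }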
 
 Seems reasonable enough.
 
 Similar to clang's -Wsizeof-array-argument:
 http://lists.cs.uiuc.edu/pipermail/cfe-commits/Week-of-Mon-20110613/042812.html
 This was also reported as PR6940:
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=6940
 
 I have attached a patch that adds the warning to C front-end.
 
 if we're doing this for C, we should probably do it for C++ too.
 
 I implemented it by adding a new member BOOL_BITFIELD is_array_parm to
 tree_parm_decl. Not sure if that's a good approach.
 
 I'm about the last one who should comment on this, but it seems pretty
 crazy you can't use data that's already stored.
 AFAIU, the information about declarator is stored in c_declarator.
 c_declarator-kind == cdk_array holds true
 if the declarator is an array. However in push_parm_decl, call to
 grokdeclarator returns decl of pointer_type, corresponding to array
 declarator if the array is parameter (TREE_TYPE (decl) is
 pointer_type). So I guess we lose that information there.
 
 I guess that sort of makes sense, so I'll shut up ;)
 
 Index: gcc/tree-core.h
 ===
 --- gcc/tree-core.h   (revision 209800)
 +++ gcc/tree-core.h   (working copy)
 @@ -1411,6 +1411,7 @@ struct GTY(()) tree_const_decl {
 struct GTY(()) tree_parm_decl {
   struct tree_decl_with_rtl common;
   rtx incoming_rtl;
 +  BOOL_BITFIELD is_array_parm;
 
  BOOL_BITFIELD only makes sense if you declare it as an actual bitfield
  with size less than that of unsigned int, otherwise you might as well
 use that directly.  On the other hand I wonder if we can't just nuke
 BOOL_BITFIELD, it seems to be legacy from a time of C and bool not being
 a builtin type?
 Thanks, I wasn't aware of that. Could we now use C++ bool type instead ?
 
 you can certainly do |bool x;| as struct fields, that's already all
 over.  However its not entirely clear to me if |bool x : 1;| will work
 everywhere and take the single bit you'd expect, istr there being
 compilers that do stupid things if you use multiple types next to each
 other in bitfields, but I'm not sure if we need to care about any of
 those.
 Changed to bool is_array_parm;
 and from macros to inline functions.

I don't like this field being part of the generic code as it increases the size 
of the struct for all front-ends and even during LTO. Is there a way to do this 
using one of the language specific bitfields that are already there (language 
flags iirc)?

Thanks,
Andrew


 
 [gcc]
 * tree-core.h (tree_parm_decl): Add new member bool is_array_parm
 * tree.h (set_parm_decl_is_array): New function.
(parm_decl_array_p): New function.
 
 [gcc/c]
 * c-decl.c (push_parm_decl): Call set_parm_decl_is_array.
 * c-typeck.c (c_expr_sizeof_expr): Add check for sizeof-array-argument 
 warning.
 
 [gcc/c-family]
 * c.opt (-Wsizeof-array-argument): New option.
 
 [gcc/testsuite/gcc.dg]
 * sizeof-array-argument.c: New test-case.
 
 Thanks and Regards,
 Prathamesh
 
 Trev
 
 
 Index: gcc/tree.h
 ===
 --- gcc/tree.h(revision 209800)
 +++ gcc/tree.h(working copy)
 @@ -1742,6 +1742,7 @@ extern void protected_set_expr_location
  #define TYPE_LANG_SPECIFIC(NODE) \
    (TYPE_CHECK (NODE)->type_with_lang_specific.lang_specific)
  
  +
  #define TYPE_VALUES(NODE) (ENUMERAL_TYPE_CHECK (NODE)->type_non_common.values)
  #define TYPE_DOMAIN(NODE) (ARRAY_TYPE_CHECK (NODE)->type_non_common.values)
  #define TYPE_FIELDS(NODE) \
  @@ -2258,6 +2259,12 @@ extern void decl_value_expr_insert (tree
  #define DECL_INCOMING_RTL(NODE) \
    (PARM_DECL_CHECK (NODE)->parm_decl.incoming_rtl)
  
  +#define SET_PARM_DECL_IS_ARRAY(NODE, val) \
  +  (PARM_DECL_CHECK (NODE)->parm_decl.is_array_parm = (val))
 
 if we're adding more stuff here, is there a reason it needs to be a macro
 and not an inline function?
 No, I shall change that to an inline function.
 
 Trev
 sizeof-array-argument.patch


Re: [PING] [PATCH, AARCH64] mov<mode>cc for fcsel

2014-04-22 Thread pinskia


 On Apr 22, 2014, at 2:36 AM, Zhenqiang Chen zhenqiang.c...@linaro.org wrote:
 
 Ping?
 
 Rebase and test. Bootstrap and no make check regression with qemu.
 
 OK for trunk?

This is the exact same patch we (Cavium) came up with for this missed 
optimization. 

Thanks,
Andrew

 
 Thanks!
 -Zhenqiang
 
 On 18 March 2014 16:16, Zhenqiang Chen zhenqiang.c...@linaro.org wrote:
 Hi,
 
 For float values, movsfcc/movdfcc is required by emit_conditional_move,
 called in the ifcvt pass, to expand if-then-else to the fcsel insn.
 
 Bootstrap and no make check regression with qemu-aarch64.
 
 Is it OK for next stage1?
 
 Thanks!
 -Zhenqiang
 
 ChangeLog:
 2014-03-18  Zhenqiang Chen  zhenqiang.c...@linaro.org
 
* config/aarch64/aarch64.md (mov<mode>cc): New for GPF.
 
 testsuite/ChangeLog:
 2014-03-18  Zhenqiang Chen  zhenqiang.c...@linaro.org
 
* gcc.target/aarch64/fcsel.c: New test case.
 
 diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
 index 99a6ac8..0f4b8ebf 100644
 --- a/gcc/config/aarch64/aarch64.md
 +++ b/gcc/config/aarch64/aarch64.md
 @@ -2344,6 +2344,25 @@
   }
 )
 
 +(define_expand "mov<mode>cc"
 +  [(set (match_operand:GPF 0 "register_operand" "")
 +(if_then_else:GPF (match_operand 1 "aarch64_comparison_operator" "")
 +  (match_operand:GPF 2 "register_operand" "")
 +  (match_operand:GPF 3 "register_operand" "")))]
 +  ""
 +  {
 +rtx ccreg;
 +enum rtx_code code = GET_CODE (operands[1]);
 +
 +if (code == UNEQ || code == LTGT)
 +  FAIL;
 +
 +ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0),
 +  XEXP (operands[1], 1));
 +operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
 +  }
 +)
 +
 (define_insn "*csinc2<mode>_insn"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 (plus:GPI (match_operator:GPI 2 "aarch64_comparison_operator"
 diff --git a/gcc/testsuite/gcc.target/aarch64/fcsel.c
 b/gcc/testsuite/gcc.target/aarch64/fcsel.c
 new file mode 100644
 index 000..9c5431a
 --- /dev/null
 +++ b/gcc/testsuite/gcc.target/aarch64/fcsel.c
 @@ -0,0 +1,20 @@
 +/* { dg-do compile } */
 +/* { dg-options "-O2" } */
 +
 +float f1 (float a, float b, float c, float d)
 +{
 +  if (a < 0.0)
 +return c;
 +  else
 +return 2.0;
 +}
 +
 +double f2 (double a, double b, double c, double d)
 +{
 +  if (a < b)
 +return c;
 +  else
 +return d;
 +}
 +
 +/* { dg-final { scan-assembler-times "\tfcsel" 2 } } */


Re: [PATCH] aarch64 support for libitm

2014-04-02 Thread pinskia


 On Apr 2, 2014, at 7:37 AM, Richard Henderson r...@redhat.com wrote:
 
 On 04/01/2014 03:41 PM, Andrew Pinski wrote:
 On Tue, Apr 1, 2014 at 3:24 PM, Richard Henderson r...@redhat.com wrote:
 Comments?  If approved, should this go in for 4.9, or wait for stage1?
 Certainly it's self-contained...
 
 On Cavium's thunder processor the cache line size is going to be
 bigger than 64 bytes, what is your solution to improve performance on
 target's like Thunder?
 
 We can expand the number reasonably.  The only thing it controls is layout of
 some of the internal data structures to attempt to put different locks on
 different lines.
 
 Is 128 big enough for Thunder?  Honestly, I may well not even have it right 
 for
 the processor we have in house.  I didn't bother trying to track down docs to
 find out.

Yes 128 should be enough. 

Thanks,
Andrew

 
 Also I think the default page size for most Linux distros is going to
 be 64k on aarch64 including Redhat Linux so it makes sense not to
 define FIXED_PAGE_SIZE.
 
 Heh.  It turns out these page size defines aren't used any more at all.  
 During
 one of the rewrites we must have deleted the bits that used it.  I'll get rid 
 of
 all of them so as to be less confusing.
 
 I will implement the ILP32 version of this patch once it goes in;
 it needs a few changes in gtm_jmpbuf due to long and pointers being
 32-bit while the assembly always stores 64 bits.
 
 I can minimize those changes now by using unsigned long long...
 
 
 r~
 


Re: [AArch64] Implement ADD in vector registers for 32-bit scalar values.

2014-03-28 Thread pinskia


 On Mar 28, 2014, at 2:12 AM, James Greenhalgh james.greenha...@arm.com 
 wrote:
 
 
 Hi,
 
 There is no way to perform scalar addition in the vector register file,
 but with the RTX costs in place we start rewriting (x << 1) to (x + x)
 on almost all cores. The code which makes this decision has no idea that we
 will end up doing this (it happens well before reload) and so we end up with
 very ugly code generation in the case where addition was selected, but
 we are operating in vector registers.
 
 This patch relies on the same gimmick we are already using to allow
 shifts on 32-bit scalars in the vector register file - Use a vector 32x2
 operation instead, knowing that we can safely ignore the top bits.
 
 This restores some normality to scalar_shift_1.c, however the test
 that we generate a left shift by one is clearly bogus, so remove that.
 
 This patch is pretty ugly, but it does generate superficially better
 looking code for this testcase.
 
 Tested on aarch64-none-elf with no issues.
 
 OK for stage 1?

It seems we should also discourage the neon alternatives as there might be 
extra movement between the two register sets which we don't want. 

Thanks,
Andrew


 
 Thanks,
 James
 
 ---
 gcc/
 
 2014-03-27  James Greenhalgh  james.greenha...@arm.com
 
* config/aarch64/aarch64.md (*addsi3_aarch64): Add alternative in
vector registers.
 
 gcc/testsuite/
 2014-03-27  James Greenhalgh  james.greenha...@arm.com
 
* gcc.target/aarch64/scalar_shift_1.c: Fix expected assembler.
 0001-AArch64-Implement-ADD-in-vector-registers-for-32-bit.patch


Re: [AArch64] Implement ADD in vector registers for 32-bit scalar values.

2014-03-28 Thread pinskia


 On Mar 28, 2014, at 7:48 AM, James Greenhalgh james.greenha...@arm.com 
 wrote:
 
 On Fri, Mar 28, 2014 at 11:11:58AM +, pins...@gmail.com wrote:
 On Mar 28, 2014, at 2:12 AM, James Greenhalgh james.greenha...@arm.com 
 wrote:
 Hi,
 
 There is no way to perform scalar addition in the vector register file,
 but with the RTX costs in place we start rewriting (x << 1) to (x + x)
 on almost all cores. The code which makes this decision has no idea that we
 will end up doing this (it happens well before reload) and so we end up with
 very ugly code generation in the case where addition was selected, but
 we are operating in vector registers.
 
 This patch relies on the same gimmick we are already using to allow
 shifts on 32-bit scalars in the vector register file - Use a vector 32x2
 operation instead, knowing that we can safely ignore the top bits.
 
 This restores some normality to scalar_shift_1.c, however the test
 that we generate a left shift by one is clearly bogus, so remove that.
 
 This patch is pretty ugly, but it does generate superficially better
 looking code for this testcase.
 
 Tested on aarch64-none-elf with no issues.
 
 OK for stage 1?
 
 It seems we should also discourage the neon alternatives as there might be
 extra movement between the two register sets which we don't want.
 
 I see your point, but we've tried to avoid doing that elsewhere in the
 AArch64 backend. Our argument has been that strictly speaking, it isn't that
 the alternative is expensive, it is the movement between the register sets. We
 do model that elsewhere, and the register allocator should already be trying 
 to
 avoid unnecessary moves between register classes.
 

What about on a specific core where that alternative is expensive, that is, the 
vector instructions are worse than the scalar ones? How are we going to handle 
this case?

Thanks,
Andrew


 If those mechanisms are broken, we should fix them - in that case fixing
 this by discouraging valid alternatives would seem to be gaffer-taping over 
 the
 real problem.
 
 Thanks,
 James
 
 
 Thanks,
 Andrew
 
 
 Thanks,
 James
 
 ---
 gcc/
 
 2014-03-27  James Greenhalgh  james.greenha...@arm.com
 
   * config/aarch64/aarch64.md (*addsi3_aarch64): Add alternative in
   vector registers.
 
 gcc/testsuite/
 2014-03-27  James Greenhalgh  james.greenha...@arm.com
 
   * gcc.target/aarch64/scalar_shift_1.c: Fix expected assembler.
 0001-AArch64-Implement-ADD-in-vector-registers-for-32-bit.patch
 


Re: Non-temporal move

2014-02-25 Thread pinskia
The original patch http://gcc.gnu.org/ml/gcc-patches/2007-04/msg01862.htm

Thanks,
Andrew Pinski 

 On Feb 25, 2014, at 12:00 AM, Uros Bizjak ubiz...@gmail.com wrote:
 
 On Tue, Feb 25, 2014 at 7:10 AM, Gopalasubramanian, Ganesh
 ganesh.gopalasubraman...@amd.com wrote:
 I could see the storent pattern in the x86 machine descriptions (in sse.md), but 
 the internals doc doesn't mention it. Should we add a description about this in 
 the internals doc?
 
 This pattern was added way back in 2007 and was not documented
 properly. It should be documented under "Standard Pattern Names For
 Generation" in md.texi.
 
 Uros.
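 
 As background (a hedged illustration, not from this thread): the storent
 pattern is aimed at write-only streaming loops like the hypothetical one
 below, where the stored data is not re-read soon and a normal store would
 only pollute the cache.
 
 /* Hypothetical example: a pure streaming store.  A non-temporal store
    writes around the cache, which is the kind of access the storent
    standard pattern name is provided for.  */
 void
 stream_fill (float *restrict dst, float v, long n)
 {
   for (long i = 0; i < n; i++)
     dst[i] = v;
 }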


Re: [Patch AArch64] Implement Vector Permute Support

2014-01-14 Thread pinskia


 On Jan 14, 2014, at 7:19 AM, Alex Velenko alex.vele...@arm.com wrote:
 
 Hi,
 
 This patch turns off the vec_perm patterns for aarch64_be, which should resolve
 the issue highlighted here 
 http://gcc.gnu.org/ml/gcc-patches/2014-01/msg00321.html
 With this patch applied, the test case provided in that link compiles without 
 an ICE.
 
 However, the Big-Endian port is still in development. This patch exposes
 another known but unrelated issue with Big-Endian Large-Int modes.
 
 The patch has been tested on aarch64-none-elf and aarch64_be-none-elf 
 resulting in five
 further regression due to the broken implementation of Big-Endian Large-Int 
 modes.
 
 Kind regards,
 Alex Velenko
 
 gcc/
 
 2014-01-14  Alex Velenko  alex.vele...@arm.com
 
* config/aarch64/aarch64-simd.md (vec_perm<mode>): Add BE check.
* config/aarch64/aarch64.c (aarch64_expand_vec_perm): Add comment.
 
 gcc/testsuite/
 
 2014-01-14  Alex Velenko  alex.vele...@arm.com
 
* lib/target-supports.exp
(check_effective_target_vect_perm): Exclude aarch64_be.
(check_effective_target_vect_perm_byte): Likewise.
(check_effective_target_vect_perm_short): Likewise.

I think you want to use a function to check if the target is effectively 
big-endian instead.  Internally at Cavium, our elf compiler has big-endian 
multi-lib. 

Thanks,
Andrew

 
 vec-perm.patch


Re: Silence class vs. struct warnings (opt_pass, ipa_opt_pass_d)

2013-12-05 Thread pinskia


 On Dec 5, 2013, at 12:48 AM, Oleg Endo oleg.e...@t-online.de wrote:
 
 Hi,
 
 When building GCC on OSX with its native XCode/Clang tools, it outputs
 quite a few "struct X was previously declared as a class" or similar
 warnings (-Wmismatched-tags is enabled by default).
 
 The attached patch fixes this for class opt_pass and class
 ipa_opt_pass_d by removing the redundant 'struct' tag which seems to be
 a leftover from the plain C days.
 
 Tested with make all-gcc.
 OK for trunk?

No, I don't think we want this at all. C++ is clear here. In fact we don't turn 
on -Werror for stage 1 for this exact reason.  Rather it might be better to 
check for the flag that turns off the warning and use that.   Also this warning is 
a bad warning for standard C++ code; clang is wrong to enable it by default. 

Thanks,
Andrew Pinski 

 
 Cheers,
 Oleg
 
 gcc/ChangeLog:
* cgraphunit.c: Remove struct tags when referring to class
ipa_opt_pass_d or class opt_pass.
* function.h: Likewise.
* lto-cgraph.c: Likewise.
* pass_manager.h: Likewise.
* passes.c: Likewise.
* tree-pass.h: Likewise.
 struct_opt_pass.patch


Re: Silence class vs. struct warnings (vec)

2013-12-05 Thread pinskia


 On Dec 5, 2013, at 1:00 AM, Oleg Endo oleg.e...@t-online.de wrote:
 
 Hi,
 
 When building GCC on OSX with its native XCode/Clang tools, it outputs
 quite a few "struct X was previously declared as a class" or similar
 warnings (-Wmismatched-tags is enabled by default).
 
 The attached patch fixes a mismatch in struct vec_prefix when referring
 to struct vec.
 
 Tested with make all-gcc.
 OK for trunk?

What is this warning trying to do really?  I think this is a very bad warning 
as it points out standard code for no reason. 

Thanks,
Andrew


 
 Cheers,
 Oleg
 
 gcc/ChangeLog:
* vec.h (struct vec_prefix): Use struct vec instead of class 
vec.
 class_vec.patch


Re: Silence class vs. struct warnings (vec)

2013-12-05 Thread pinskia


 On Dec 5, 2013, at 1:33 AM, Oleg Endo oleg.e...@t-online.de wrote:
 
 On Thu, 2013-12-05 at 01:11 -0800, pins...@gmail.com wrote:
 
 On Dec 5, 2013, at 1:00 AM, Oleg Endo oleg.e...@t-online.de wrote:
 
 Hi,
 
 When building GCC on OSX with its native XCode/Clang tools, it outputs
 quite a few "struct X was previously declared as a class" or similar
 warnings (-Wmismatched-tags is enabled by default).
 
 The attached patch fixes a mismatch in struct vec_prefix when referring
 to struct vec.
 
 Tested with make all-gcc.
 OK for trunk?
 
 What is this warning trying to do really?  I think this is a very bad 
 warning as points out standard code for no reason. 
 
 I think the answer is here:
 http://llvm.org/bugs/show_bug.cgi?id=11632

Except we don't support compiling GCC with Microsoft's broken compiler. So 
again why make this change for broken compilers that are hard to support in the 
first place?

 
 Cheers,
 Oleg
 


Re: wide-int, ada

2013-11-26 Thread pinskia


 On Nov 26, 2013, at 6:00 AM, H.J. Lu hjl.to...@gmail.com wrote:
 
 On Tue, Nov 26, 2013 at 5:55 AM, Richard Biener
 richard.guent...@gmail.com wrote:
 On Tue, Nov 26, 2013 at 2:44 PM, Richard Earnshaw rearn...@arm.com wrote:
 On 26/11/13 09:18, Eric Botcazou wrote:
 you are correct - this was an incorrect change.   I believe that the
 patch below would be correct, but it is impossible to test it because (I
 believe) GCC no longer works if HOST_BITS_PER_WIDE_INT is 32.
 I could be wrong about this, but if I am correct, what do you want me to 
 do?
 
 While you're right that most mainstream architectures now require a 64-bit
 HWI, not all of them do according to config.gcc, so I don't think that this
 path is entirely dead yet.  I'll carry out the testing once we agree on the
 final change.
 
 I'm hoping, once this patch series is in that we might be able to
 migrate the ARM port back to supporting a 32-bit HWI.  The driving
 factor behind the original switch was supporting 128-bit constants for
 Neon and these patches should resolve that.
 
 i?86 would be another candidate (if you don't build a compiler with -m64
 support).
 
 Not true for x86 since we have
 
 Variable
 HOST_WIDE_INT ix86_isa_flags = TARGET_64BIT_DEFAULT |
 TARGET_SUBTARGET_ISA_DEFAULT
 
 in i386.opt.   We need more than 32 bits for ix86_isa_flags.

Then that should be HOST_WIDEST_INT instead. 

Thanks,
Andrew

 
 -- 
 H.J.


Re: [PATCH] Enhance ifcombine to recover non short circuit branches

2013-11-08 Thread pinskia


 On Nov 8, 2013, at 9:20 AM, Steven Bosscher stevenb@gmail.com wrote:
 
 On Wed, Oct 30, 2013 at 5:03 AM, Andrew Pinski wrote:
 Here is what I applied in the end; Jeff told me just to remove the
 testcase.  I added the comment trying to explain why it was the
 opposite order of PHI-opt.
 
 Thanks,
 Andrew Pinski
 
 ChangeLog:
 * tree-ssa-ifcombine.c: Include rtl.h and tm_p.h.
 
 Eh, why???
 
 The file has this comment:
 
 25  /* rtl is needed only because arm back-end requires it for
 26     BRANCH_COST.  */
 27  #include "rtl.h"
 28  #include "tm_p.h"
 
 Can you please clarify why this is not something to be fixed in the
 ARM back end?

  Really BRANCH_COST should be a target hook rather than a macro, which would 
fix this issue.  fold-const.c has the same include for the same reason.  I 
thought I had seen a patch which changes it into a hook. Next week if I get 
time, I will do that.  I knew it was the wrong direction, which is why I added a 
comment.

Thanks,
Andrew

 
 You're taking the easy way out here, but it's a step in the wrong
 direction from modularity point of view.
 
 Ciao!
 Steven


Re: [PATCH] Final removal of mudflap

2013-10-28 Thread pinskia


 On Oct 28, 2013, at 11:24 AM, Jeff Law l...@redhat.com wrote:
 
 On 10/26/13 15:08, Joseph S. Myers wrote:
 As far as I can see, the commit left empty libmudflap directories around
 rather than removing them (SVN, unlike git, allows empty directories to be
 represented in the repository).
 I'll check out an SVN tree and remove the empty directory.

Looks like Mike Stump has already done this.

Thanks,
Andrew

 
 jeff


Re: [PATCH] Vectorizing abs(char/short/int) on x86.

2013-10-22 Thread pinskia


Sent from my iPad

 On Oct 22, 2013, at 7:23 PM, Cong Hou co...@google.com wrote:
 
 This patch aims at PR58762.
 
 Currently GCC cannot vectorize the abs() operation for integers on x86
 with only SSE2 support. For the int type, the reason is that the expand for
 abs() is not defined for vector types. This patch defines such an
 expand so that abs(int) will be vectorized with only SSE2.
 
 For abs(char/short), type conversions are needed as the current abs()
 function/operation does not accept arguments of char/short type.
 Therefore when we want to get the absolute value of a char_val using
 abs (char_val), it will be converted into abs ((int) char_val). It
 then can be vectorized, but the generated code is not efficient as
 lots of packings and unpackings are involved. But if we convert
 (char) abs ((int) char_val) to abs (char_val), the vectorizer will be
 able to generate better code. Same for short.
 
 This conversion also enables vectorizing the abs(char/short) operation
 with PABSB and PABSW instructions in SSE3.
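 
 To make the conversion concrete, here is a hypothetical source pattern (not
 from the patch's testsuite) where the promotion described above shows up;
 undoing the (char)/(int) conversions lets the vectorizer work byte-wide:
 
 #include <stdlib.h>
 
 /* After the usual integer promotions the loop body is really
    a[i] = (signed char) abs ((int) a[i]).  */
 void
 abs_bytes (signed char *a, int n)
 {
   for (int i = 0; i < n; i++)
     a[i] = abs (a[i]);
 }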
 
 With only SSE2 support, I developed three methods to expand
 abs(char/short/int) separately:
 
 1. For a 32-bit int value x, we can get abs (x) from (((signed) x >>
 (W-1)) ^ x) - ((signed) x >> (W-1)). This is better than max (x, -x),
 which needs bit masking.
 
 2. For a 16-bit int value x, we can get abs (x) from max (x, -x), as
 SSE2 provides the PMAXSW instruction.
 
 3. For an 8-bit int value x, we can get abs (x) from min ((unsigned char)
 x, (unsigned char) (-x)), as SSE2 provides the PMINUB instruction.
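 
 (A scalar C sketch of the three expansions above, not part of the patch; it
 assumes arithmetic right shift of signed values, as the RTL sequences do.)
 
 #include <limits.h>
 
 static int
 abs_s32 (int x)            /* 1: ((x >> (W-1)) ^ x) - (x >> (W-1))  */
 {
   int sign = x >> (sizeof (int) * CHAR_BIT - 1);
   return (sign ^ x) - sign;
 }
 
 static short
 abs_s16 (short x)          /* 2: max (x, -x), maps to PMAXSW  */
 {
   short neg = -x;
   return x > neg ? x : neg;
 }
 
 static signed char
 abs_s8 (signed char x)     /* 3: min ((unsigned char) x, (unsigned char) -x),
                               maps to PMINUB  */
 {
   unsigned char a = (unsigned char) x;
   unsigned char b = (unsigned char) -x;
   return (signed char) (a < b ? a : b);
 }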
 
 
 The patch is pasted below. Please point out any problems in my patch
 and analysis.
 
 
 thanks,
 Cong
 
 
 
 
 diff --git a/gcc/ChangeLog b/gcc/ChangeLog
 index 8a38316..e0f33ee 100644
 --- a/gcc/ChangeLog
 +++ b/gcc/ChangeLog
 @@ -1,3 +1,13 @@
 +2013-10-22  Cong Hou  co...@google.com
 +
 + PR target/58762
 + * convert.c (convert_to_integer): Convert (char) abs ((int) char_val)
 + into abs (char_val).  Also convert (short) abs ((int) short_val)
 + into abs (short_val).

I don't like this optimization in convert.c.  I think it should be submitted 
separately and should be done in tree-ssa-forwprop.

Also I think you should have a generic (non x86) test case for the above 
optimization. 

Thanks,
Andrew


 + * config/i386/i386-protos.h (ix86_expand_sse2_absvxsi2): New function.
 + * config/i386/i386.c (ix86_expand_sse2_absvxsi2): New function.
 + * config/i386/sse.md: Add SSE2 support to abs (char/int/short).



 +
 2013-10-14  David Malcolm  dmalc...@redhat.com
 
  * dumpfile.h (gcc::dump_manager): New class, to hold state
 diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
 index 3ab2f3a..e85f663 100644
 --- a/gcc/config/i386/i386-protos.h
 +++ b/gcc/config/i386/i386-protos.h
 @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx,
 rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 +extern void ix86_expand_sse2_absvxsi2 (rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
 index 02cbbbd..8050e02 100644
 --- a/gcc/config/i386/i386.c
 +++ b/gcc/config/i386/i386.c
 @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
gen_rtx_MULT (mode, op1, op2));
 }
 
 +void
 +ix86_expand_sse2_absvxsi2 (rtx op0, rtx op1)
 +{
 +  enum machine_mode mode = GET_MODE (op0);
 +  rtx tmp0, tmp1;
 +
 +  switch (mode)
 +{
 +  /* For 32-bit signed integer X, the best way to calculate the absolute
 + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
 +  case V4SImode:
 + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
 +GEN_INT (GET_MODE_BITSIZE
 + (GET_MODE_INNER (mode)) - 1),
 +NULL, 0, OPTAB_DIRECT);
 + if (tmp0)
 +  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
 +  NULL, 0, OPTAB_DIRECT);
 + if (tmp0  tmp1)
 +  expand_simple_binop (mode, MINUS, tmp1, tmp0,
 +   op0, 0, OPTAB_DIRECT);
 + break;
 +
 +  /* For 16-bit signed integer X, the best way to calculate the absolute
 + value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
 +  case V8HImode:
 + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
 + if (tmp0)
 +  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
 +   OPTAB_DIRECT);
 + break;
 +
 +  /* For 8-bit signed integer X, the best way to calculate the absolute
 + value of X is min ((unsigned char) X, (unsigned char) (-X)),
 + as SSE2 provides the PMINUB insn.  */
 +  case V16QImode:
 + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
 + if (tmp0)
 +  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
 +   OPTAB_DIRECT);
 + break;
 +
 +  default:
 + break;
 +}
 +}
 +
 /* Expand an insert into a vector register through pinsr insn.
Return true if successful.  */
 
 diff --git a/gcc/config/i386/sse.md 

Re: [RFC] By default if-convert only basic blocks that will be vectorized

2013-10-17 Thread pinskia

 On Oct 15, 2013, at 5:32 AM, Jakub Jelinek ja...@redhat.com wrote:
 
 Hi!
 
 Especially on i?86/x86_64 if-conversion pass seems to be often
 a pessimization, but the vectorization relies on it and without it we can't
 vectorize a lot of the loops.

I think on many other targets it actually helps.  I know for one it helps on 
Octeon even though Octeon has no vector instructions.  I think it helps most 
ARM targets too.

Thanks,
Andrew

 
 Here is a prototype of a patch that will by default (unless explicit
 -ftree-loop-if-convert) only if-convert loops internally for vectorization,
 so the COND_EXPRs actually only appear as VEC_COND_EXPRs in the vectorized
 basic blocks, but will not appear if vectorization fails, or in the
 scalar loop if vectorization is conditional, or in the prologue or epilogue
 loops around the vectorized loop.
 
 Instead of moving the ifcvt pass inside of the vectorizer, this patch
 during ifcvt performs loop versioning depending on a special internal
 call; only if the internal call returns true do we go to the if-converted
 original loop, otherwise the non-if-converted copy of the original loop
 is performed.  And the vectorizer is taught to fold this internal call
 into true resp. false depending on if the loop was vectorized or not, and
 vectorizer loop versioning, peeling for alignment and for bound are adjusted
 to also copy from the non-if-converted loop rather than if-converted one.
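 
 A rough source-level sketch of the versioning scheme (this is not GIMPLE;
 loop_vectorized_p is only a stand-in for the new internal call):
 
 extern int loop_vectorized_p (void);  /* stand-in for the internal call */
 
 void
 foo (int *a, const int *b, int n)
 {
   if (loop_vectorized_p ())
     {
       /* If-converted copy: the branch became a COND_EXPR; this is the copy
          the vectorizer works on, and the call folds to true on success.  */
       for (int i = 0; i < n; i++)
         a[i] = b[i] < 0 ? -b[i] : b[i];
     }
   else
     {
       /* Untouched copy: survives when vectorization fails and is used for
          the scalar prologue/epilogue loops; the call folds to false.  */
       for (int i = 0; i < n; i++)
         if (b[i] < 0)
           a[i] = -b[i];
         else
           a[i] = b[i];
     }
 }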
 
 Besides fixing the various PRs where if-conversion pessimizes code I'd like
 to also move forward with this with conditional loads and stores,
 http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html
 where the if-unconversion approach looked like a failure.
 
 This patch doesn't yet handle if-converted inner loop in outer loop
 vectorization, something on my todo list (so several vect-cond-*.c tests
 FAIL because they are no longer vectorized) plus I had to change two
 SLP vectorization tests that silently relied on loop if-conversion being
 performed to actually optimize the basic block (if the same thing didn't
 appear in a loop, it wouldn't be optimized at all).
 
 On the newly added testcase on x86_64, there are 18 scalar conditional
 moves before this patch, and with the patch just 2 (both in the checking
 routine).
 
 Comments?
 
 --- gcc/internal-fn.def.jj2013-10-11 14:32:57.079909782 +0200
 +++ gcc/internal-fn.def2013-10-11 17:23:58.705526840 +0200
 @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST
 DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW)
 DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW)
 DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW)
 +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW)
 --- gcc/tree-vect-loop-manip.c.jj2013-09-30 22:13:47.0 +0200
 +++ gcc/tree-vect-loop-manip.c2013-10-15 12:57:54.854970913 +0200
 @@ -374,24 +374,31 @@ LOOP-  loop1
 
 static void
 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
 +struct loop *scalar_loop,
 bool is_new_loop, basic_block 
 *new_exit_bb)
 {
 -  gimple orig_phi, new_phi;
 +  gimple orig_phi, new_phi, scalar_phi = NULL;
   gimple update_phi, update_phi2;
   tree guard_arg, loop_arg;
    basic_block new_merge_bb = guard_edge->dest;
    edge e = EDGE_SUCC (new_merge_bb, 0);
    basic_block update_bb = e->dest;
    basic_block orig_bb = loop->header;
 -  edge new_exit_e;
 +  edge new_exit_e, scalar_e = NULL;
   tree current_new_name;
 -  gimple_stmt_iterator gsi_orig, gsi_update;
 +  gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none ();
 
   /* Create new bb between loop and new_merge_bb.  */
   *new_exit_bb = split_edge (single_exit (loop));
 
   new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
 
 +  if (scalar_loop != NULL && !is_new_loop)
 +{
 +  gsi_scalar = gsi_start_phis (scalar_loop->header);
 +  scalar_e = EDGE_SUCC (scalar_loop->latch, 0);
 +}
 +
   for (gsi_orig = gsi_start_phis (orig_bb),
gsi_update = gsi_start_phis (update_bb);
 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
 @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge
   tree new_res;
   orig_phi = gsi_stmt (gsi_orig);
   update_phi = gsi_stmt (gsi_update);
 +  if (scalar_e != NULL)
 +{
 +  scalar_phi = gsi_stmt (gsi_scalar);
 +  gsi_next (&gsi_scalar);
 +}
 
   /** 1. Handle new-merge-point phis  **/
 
 @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge
 current_new_name = loop_arg;
   else
 {
 -  current_new_name = get_current_def (loop_arg);
 +  if (scalar_e)
 +{
 +  current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e);
 +  current_new_name = get_current_def (current_new_name);
 +}
 +  else
 +current_new_name = get_current_def (loop_arg);
  /* current_def is not available only if the variable does 

Re: [PATCH] Reducing number of alias checks in vectorization.

2013-10-01 Thread pinskia


 On Oct 1, 2013, at 7:12 PM, Cong Hou co...@google.com wrote:
 
 When alias exists between data refs in a loop, to vectorize it GCC
 does loop versioning and adds runtime alias checks. Basically for each
 pair of data refs with possible data dependence, there will be two
 comparisons generated to make sure there is no aliasing between them
 in each iteration of the vectorized loop. If there are many such data
 ref pairs, the number of comparisons can be very large, which is a
 big overhead.
 
 However, in some cases it is possible to reduce the number of those
 comparisons. For example, for the following loop, we can detect that
 b[0] and b[1] are two consecutive member accesses, so that we can
 combine the alias check of a[0:100] against b[0] and of a[0:100] against b[1] into
 checking a[0:100] against b[0:2]:
 
 void foo(int*a, int* b)
 {
    for (int i = 0; i < 100; ++i)
a[i] = b[0] + b[1];
 }
 
 Actually, the requirement of consecutive memory accesses is too
 strict. For the following loop, we can still combine the alias checks
 of a[0:100] against b[0] and of a[0:100] against b[100]:
 
 void foo(int*a, int* b)
 {
    for (int i = 0; i < 100; ++i)
a[i] = b[0] + b[100];
 }
 
 This is because if b[0] is not in a[0:100] and b[100] is not in
 a[0:100] then a[0:100] cannot be between b[0] and b[100]. We only need
 to check a[0:100] and b[0:101] don't overlap.
 
 More generally, consider two pairs of data refs (a, b1) and (a, b2).
 Suppose addr_b1 and addr_b2 are the basic addresses of data refs b1 and b2;
 offset_b1 and offset_b2 (offset_b1 < offset_b2) are the offsets of b1 and
 b2, and segment_length_a, segment_length_b1, and segment_length_b2 are the
 segment lengths of a, b1, and b2. Then we can combine the two
 comparisons into one if the following condition is satisfied:
 
 offset_b2 - offset_b1 - segment_length_b1 < segment_length_a
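 
 A minimal sketch of that merging test, assuming compile-time-constant offsets
 and segment lengths (the patch works on trees; this only shows the arithmetic):
 
 #include <stdbool.h>
 
 /* Checks (a, b1) and (a, b2) share the same base address for b1/b2 and
    offset_b1 < offset_b2.  They can be merged into one check of a against
    the combined b segment when the gap between the two b accesses is
    smaller than a's segment length.  */
 static bool
 can_merge_alias_checks (long offset_b1, long seg_len_b1,
                         long offset_b2, long seg_len_a)
 {
   return offset_b2 - offset_b1 - seg_len_b1 < seg_len_a;
 }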
 
 
 This patch detects those combination opportunities to reduce the
 number of alias checks. It is tested on an x86-64 machine.

I like the idea of this patch but I am not a fan of using the STL really.  It seems 
a little too much dependence on C++ features for my liking.

Thanks,
Andrew

 
 
 thanks,
 Cong
 
 
 
 Index: gcc/tree-vect-loop-manip.c
 ===
 --- gcc/tree-vect-loop-manip.c (revision 202662)
 +++ gcc/tree-vect-loop-manip.c (working copy)
 @@ -19,6 +19,10 @@ You should have received a copy of the G
 along with GCC; see the file COPYING3.  If not see
 http://www.gnu.org/licenses/.  */
 
 +#include vector
 +#include utility
 +#include algorithm
 +
 #include config.h
 #include system.h
 #include coretypes.h
 @@ -2248,6 +2252,74 @@ vect_vfa_segment_size (struct data_refer
   return segment_length;
 }
 
 +namespace
 +{
 +
 +/* struct dr_addr_with_seg_len
 +
 +   A struct storing information of a data reference, including the data
 +   ref itself, its basic address, the access offset and the segment length
 +   for aliasing checks.  */
 +
 +struct dr_addr_with_seg_len
 +{
 +  dr_addr_with_seg_len (data_reference* d, tree addr, tree off, tree len)
 +: dr (d), basic_addr (addr), offset (off), seg_len (len) {}
 +
 +  data_reference* dr;
 +  tree basic_addr;
 +  tree offset;
 +  tree seg_len;
 +};
 +
 +/* Operator == between two dr_addr_with_seg_len objects.
 +
 +   This equality operator is used to make sure two data refs
 +   are the same one so that we will consider to combine the
 +   aliasing checks of those two pairs of data dependent data
 +   refs.  */
 +
 +bool operator == (const dr_addr_with_seg_len &d1,
 +  const dr_addr_with_seg_len &d2)
 +{
 +  return operand_equal_p (d1.basic_addr, d2.basic_addr, 0)
 +  && operand_equal_p (d1.offset, d2.offset, 0)
 +  && operand_equal_p (d1.seg_len, d2.seg_len, 0);
 +}
 +
 +typedef std::pair <dr_addr_with_seg_len, dr_addr_with_seg_len>
 + dr_addr_with_seg_len_pair_t;
 +
 +
 +/* Operator < between two dr_addr_with_seg_len_pair_t objects.
 +
 +   This operator is used to sort objects of dr_addr_with_seg_len_pair_t
 +   so that we can combine aliasing checks during one scan.  */
 +
 +bool operator < (const dr_addr_with_seg_len_pair_t &p1,
 + const dr_addr_with_seg_len_pair_t &p2)
 +{
 +  const dr_addr_with_seg_len &p11 = p1.first;
 +  const dr_addr_with_seg_len &p12 = p1.second;
 +  const dr_addr_with_seg_len &p21 = p2.first;
 +  const dr_addr_with_seg_len &p22 = p2.second;
 +
 +  if (p11.basic_addr != p21.basic_addr)
 +    return p11.basic_addr < p21.basic_addr;
 +  if (p12.basic_addr != p22.basic_addr)
 +    return p12.basic_addr < p22.basic_addr;
 +  if (TREE_CODE (p11.offset) != INTEGER_CST
 +      || TREE_CODE (p21.offset) != INTEGER_CST)
 +    return p11.offset < p21.offset;
 +  if (int_cst_value (p11.offset) != int_cst_value (p21.offset))
 +    return int_cst_value (p11.offset) < int_cst_value (p21.offset);
 +  if (TREE_CODE (p12.offset) != INTEGER_CST
 +      || TREE_CODE (p22.offset) != INTEGER_CST)
 +    return p12.offset < p22.offset;
 +  return int_cst_value (p12.offset) < int_cst_value (p22.offset);
 +}
 +
 +}
 
 /* 

Re: [PATCH][RFC] Move IVOPTs closer to RTL expansion

2013-09-08 Thread pinskia
On Sep 8, 2013, at 7:01 PM, Bin.Cheng amker.ch...@gmail.com wrote:

 On Wed, Sep 4, 2013 at 5:20 PM, Richard Biener rguent...@suse.de wrote:
 
 The patch below moves IVOPTs out of the GIMPLE loop pipeline, closer
 to RTL expansion.  That's done for multiple reasons.
 
 First, the loop passes that at the moment precede IVOPTs leave
 around IL that is in desperate need of basic re-optimization
 like CSE, constant propagation and DCE.  That puts extra load
 on IVOPTs and its cost model, increasing compile-time and
 possibly confusing it.
 
 Second, IVOPTs introduces lowered memory accesses that it
 expects to stay as is, likewise it produces auto-inc/dec
 sequences that it expects to stay as is until RTL expansion.
 Passes such as DOM can break this expectation and make the
 work done by IVOPTs a waste.
 
 I remember doing this exercise in the GCC 4.3 timeframe where
 benchmarking on x86_64 showed no gains or losses (but x86_64
 isn't very sensitive to IV choices).
 
 Any help with benchmarking this on targets other than x86_64
 is appreciated (I'll re-do x86_64).
 
 Bootstrapped and tested on x86_64-unknown-linux-gnu.
 
 General comments are of course also welcome.
 
 Thanks,
 Richard.
 
 2013-09-04  Richard Biener  rguent...@suse.de
 
* passes.def: Move IVOPTs before final DCE pass.
* tree-ssa-loop.c (tree_ssa_loop_ivopts): Adjust for being
outside of the loop pipeline.
 
* gcc.dg/tree-ssa/ivopts-3.c: Scan non-details dump.
* gcc.dg/tree-ssa/reassoc-19.c: Be more permissive.
 
 Index: gcc/passes.def
 ===
 *** gcc/passes.def.orig 2013-09-04 10:57:33.0 +0200
 --- gcc/passes.def  2013-09-04 11:11:27.535952665 +0200
 *** along with GCC; see the file COPYING3.
 *** 221,227 
  NEXT_PASS (pass_complete_unroll);
  NEXT_PASS (pass_slp_vectorize);
  NEXT_PASS (pass_loop_prefetch);
 - NEXT_PASS (pass_iv_optimize);
  NEXT_PASS (pass_lim);
  NEXT_PASS (pass_tree_loop_done);
POP_INSERT_PASSES ()
 --- 221,226 
 *** along with GCC; see the file COPYING3.
 *** 237,242 
 --- 236,246 
 opportunities.  */
NEXT_PASS (pass_phi_only_cprop);
NEXT_PASS (pass_vrp);
 +   /* IVOPTs lowers memory accesses and exposes auto-inc/dec
 +  opportunities.  Run it after the above passes cleaned up
 +the loop optimized IL but before DCE as IVOPTs generates
 +quite some garbage.  */
 +   NEXT_PASS (pass_iv_optimize);
NEXT_PASS (pass_cd_dce);
NEXT_PASS (pass_tracer);
 
 Index: gcc/tree-ssa-loop.c
 ===
 *** gcc/tree-ssa-loop.c.orig2013-09-04 10:57:32.0 +0200
 --- gcc/tree-ssa-loop.c 2013-09-04 11:11:27.536952677 +0200
 *** make_pass_loop_prefetch (gcc::context *c
 *** 906,915 
  static unsigned int
  tree_ssa_loop_ivopts (void)
  {
 !   if (number_of_loops (cfun) <= 1)
 ! return 0;
 
 -   tree_ssa_iv_optimize ();
return 0;
  }
 
 --- 906,924 
  static unsigned int
  tree_ssa_loop_ivopts (void)
  {
 !   loop_optimizer_init (LOOPS_NORMAL
 !  | LOOPS_HAVE_RECORDED_EXITS);
 !
 !   if (number_of_loops (cfun) > 1)
 ! {
 !   rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
 !   scev_initialize ();
 !   tree_ssa_iv_optimize ();
 !   scev_finalize ();
 ! }
 !
 !   loop_optimizer_finalize ();
 
return 0;
  }
 
 Index: gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c
 ===
 *** gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c.orig   2013-09-04 
 10:57:33.0 +0200
 --- gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c2013-09-04 
 11:11:27.559952952 +0200
 ***
 *** 1,5 
  /* { dg-do compile } */
 ! /* { dg-options "-O2 -fdump-tree-ivopts-details" } */
 
  void main (void)
  {
 --- 1,5 
  /* { dg-do compile } */
 ! /* { dg-options "-O2 -fdump-tree-ivopts" } */
 
  void main (void)
  {
 *** void main (void)
 *** 8,12 
  f2 ();
  }
 
 ! /* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } }  */
  /* { dg-final { cleanup-tree-dump ivopts } }  */
 --- 8,12 
  f2 ();
  }
 
 ! /* { dg-final { scan-tree-dump-times "!= 0" 1 "ivopts" } }  */
  /* { dg-final { cleanup-tree-dump ivopts } }  */
 Index: gcc/testsuite/gcc.dg/tree-ssa/reassoc-19.c
 ===
 *** gcc/testsuite/gcc.dg/tree-ssa/reassoc-19.c.orig 2012-12-18 
 14:24:58.0 +0100
 --- gcc/testsuite/gcc.dg/tree-ssa/reassoc-19.c  2013-09-04 
 11:13:30.895416700 +0200
 *** void foo(char* left, char* rite, int ele
 *** 16,22 
}
  }
 
 ! /* { dg-final { scan-tree-dump-times = \\\(sizetype\\\) element 1 
 optimized } } */
  /* { dg-final { scan-tree-dump-times = - 1 optimized } } */
  /* { dg-final {