Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-10-12 Thread Eric Botcazou via Gcc-patches
> Let me know if you believe this is a good approach? I've run regression
> tests and this hasn't broken anything so far...

Small regression in Ada though, probably a missing guard somewhere:

=== gnat tests ===


Running target unix
FAIL: gnat.dg/loop_optimization23.adb 3 blank line(s) in output
FAIL: gnat.dg/loop_optimization23.adb (test for excess errors)
UNRESOLVED: gnat.dg/loop_optimization23.adb compilation failed to produce executable
FAIL: gnat.dg/loop_optimization23_pkg.adb 3 blank line(s) in output
FAIL: gnat.dg/loop_optimization23_pkg.adb (test for excess errors)

In order to reproduce, configure the compiler with Ada enabled, build it, and 
copy $(srcdir)/gcc/testsuite/gnat.dg/loop_optimization23_pkg.ad[sb] into the 
build directory, then just issue:

gcc/gnat1 -quiet loop_optimization23_pkg.adb -O2 -Igcc/ada/rts

eric@fomalhaut:~/build/gcc/native> gcc/gnat1 -quiet loop_optimization23_pkg.adb -O2 -Igcc/ada/rts
during GIMPLE pass: vect
+===GNAT BUG DETECTED==+
| 13.0.0 20221012 (experimental) [master ca7f7c3f140] (x86_64-suse-linux) GCC error: |
| in exact_div, at poly-int.h:2232 |
| Error detected around loop_optimization23_pkg.adb:5:3 |
| Compiling loop_optimization23_pkg.adb |

-- 
Eric Botcazou




Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-10-11 Thread Hongtao Liu via Gcc-patches
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107226

On Wed, Oct 12, 2022 at 9:55 AM Hongtao Liu  wrote:
>
> This commit failed tests
>
> FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
> FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
> FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
> FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
> FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
> FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
> FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
> FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
> FAIL: 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-10-11 Thread Hongtao Liu via Gcc-patches
This commit failed tests

FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
FAIL: gcc.target/i386/pr101668.c scan-assembler vpmovsxdq
FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
FAIL: gcc.target/i386/pr92645.c scan-tree-dump-times optimized "vec_unpack_" 4
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx2-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxbw 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxdq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwd 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
FAIL: gcc.target/i386/pr92658-avx2.c scan-assembler-times pmovzxwq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxbw 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxdq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwd 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx512bw-2.c scan-assembler-times pmovsxwq 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbd 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbq 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbw 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbw 2
FAIL: gcc.target/i386/pr92658-avx512bw.c scan-assembler-times pmovzxbw 2
FAIL: 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-10-07 Thread Andre Vieira (lists) via Gcc-patches

Hi,

Whilst running a bootstrap with extra options to force bitfield 
vectorization '-O2 -ftree-vectorize -ftree-loop-if-convert 
-fno-vect-cost-model' I ran into an ICE in vect-patterns where a 
bit_field_ref had a container that wasn't INTEGRAL_TYPE and had 
E_BLKmode, which meant we failed to build an integer type with the same 
size. For that reason I added a check to bail out earlier if the 
TYPE_MODE of the container is indeed E_BLKmode. The pattern for the 
bitfield inserts required no change as we currently don't support 
containers that aren't integer typed.


Also changed a testcase because on BIG-ENDIAN it was not vectorizing, due 
to a different container size that isn't supported.


This passes the same bootstrap and regressions on aarch64-none-linux and 
no regressions on aarch64_be-none-elf either.


I assume you are OK with these changes Richard, but I don't like to 
commit on Friday in case something breaks over the weekend, so I'll 
leave it until Monday.


Thanks,
Andre

On 29/09/2022 08:54, Richard Biener wrote:

On Wed, Sep 28, 2022 at 7:32 PM Andre Vieira (lists) via Gcc-patches
 wrote:

Made the change and also created the ChangeLogs.

OK if bootstrap / testing succeeds.

Thanks,
Richard.


gcc/ChangeLog:

  * tree-if-conv.cc (if_convertible_loop_p_1): Move ordering of
loop bb's from here...
  (tree_if_conversion): ... to here.  Also call bitfield lowering
when appropriate.
  (version_loop_for_if_conversion): Adapt to enable loop
versioning when we only need
  to lower bitfields.
  (ifcvt_split_critical_edges): Relax condition of expected loop
form as this is checked earlier.
  (get_bitfield_rep): New function.
  (lower_bitfield): Likewise.
  (bitfields_to_lower_p): Likewise.
  (need_to_lower_bitfields): New global boolean.
  (need_to_ifcvt): Likewise.
  * tree-vect-data-refs.cc (vect_find_stmt_data_reference):
Improve diagnostic message.
  * tree-vect-patterns.cc (vect_recog_temp_ssa_var): Add default
value for last parameter.
  (vect_recog_bitfield_ref_pattern): New.
  (vect_recog_bit_insert_pattern): New.

gcc/testsuite/ChangeLog:

  * gcc.dg/vect/vect-bitfield-read-1.c: New test.
  * gcc.dg/vect/vect-bitfield-read-2.c: New test.
  * gcc.dg/vect/vect-bitfield-read-3.c: New test.
  * gcc.dg/vect/vect-bitfield-read-4.c: New test.
  * gcc.dg/vect/vect-bitfield-read-5.c: New test.
  * gcc.dg/vect/vect-bitfield-read-6.c: New test.
  * gcc.dg/vect/vect-bitfield-write-1.c: New test.
  * gcc.dg/vect/vect-bitfield-write-2.c: New test.
  * gcc.dg/vect/vect-bitfield-write-3.c: New test.
  * gcc.dg/vect/vect-bitfield-write-4.c: New test.
  * gcc.dg/vect/vect-bitfield-write-5.c: New test.

On 28/09/2022 10:43, Andre Vieira (lists) via Gcc-patches wrote:

On 27/09/2022 13:34, Richard Biener wrote:

On Mon, 26 Sep 2022, Andre Vieira (lists) wrote:


On 08/09/2022 12:51, Richard Biener wrote:

I'm curious, why the push to redundant_ssa_names?  That could use
a comment ...

So I purposefully left a #if 0 #else #endif in there so you can see
the two
options. But the reason I used redundant_ssa_names is because ifcvt
seems to
use that as a container for all pairs of (old, new) ssa names to
replace
later. So I just piggy backed on that. I don't know if there's a
specific
reason they do the replacement at the end? Maybe some ordering
issue? Either
way both adding it to redundant_ssa_names or doing the replacement
inline work
for the bitfield lowering (or work in my testing at least).

Possibly because we (in the past?) inserted/copied stuff based on
predicates generated at analysis time after we decide to elide something
so we need to watch for later appearing uses.  But who knows ... my mind
fails me here.

If it works to replace uses immediately please do so.  But now
I wonder why we need this - the value shouldn't change so you
should get away with re-using the existing SSA name for the final value?

Yeah... good point. A quick change and minor testing seems to agree.
I'm sure I had a good reason to do it initially ;)

I'll run a full-regression on this change to make sure I didn't miss
anything.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
new file mode 100644
index 
..01cf34fb44484ca926ca5de99eef76dd99b69e92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define N 32
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-29 Thread Richard Biener via Gcc-patches
On Wed, Sep 28, 2022 at 7:32 PM Andre Vieira (lists) via Gcc-patches
 wrote:
>
> Made the change and also created the ChangeLogs.

OK if bootstrap / testing succeeds.

Thanks,
Richard.

> gcc/ChangeLog:
>
>  * tree-if-conv.cc (if_convertible_loop_p_1): Move ordering of
> loop bb's from here...
>  (tree_if_conversion): ... to here.  Also call bitfield lowering
> when appropriate.
>  (version_loop_for_if_conversion): Adapt to enable loop
> versioning when we only need
>  to lower bitfields.
>  (ifcvt_split_critical_edges): Relax condition of expected loop
> form as this is checked earlier.
>  (get_bitfield_rep): New function.
>  (lower_bitfield): Likewise.
>  (bitfields_to_lower_p): Likewise.
>  (need_to_lower_bitfields): New global boolean.
>  (need_to_ifcvt): Likewise.
>  * tree-vect-data-refs.cc (vect_find_stmt_data_reference):
> Improve diagnostic message.
>  * tree-vect-patterns.cc (vect_recog_temp_ssa_var): Add default
> value for last parameter.
>  (vect_recog_bitfield_ref_pattern): New.
>  (vect_recog_bit_insert_pattern): New.
>
> gcc/testsuite/ChangeLog:
>
>  * gcc.dg/vect/vect-bitfield-read-1.c: New test.
>  * gcc.dg/vect/vect-bitfield-read-2.c: New test.
>  * gcc.dg/vect/vect-bitfield-read-3.c: New test.
>  * gcc.dg/vect/vect-bitfield-read-4.c: New test.
>  * gcc.dg/vect/vect-bitfield-read-5.c: New test.
>  * gcc.dg/vect/vect-bitfield-read-6.c: New test.
>  * gcc.dg/vect/vect-bitfield-write-1.c: New test.
>  * gcc.dg/vect/vect-bitfield-write-2.c: New test.
>  * gcc.dg/vect/vect-bitfield-write-3.c: New test.
>  * gcc.dg/vect/vect-bitfield-write-4.c: New test.
>  * gcc.dg/vect/vect-bitfield-write-5.c: New test.
>
> On 28/09/2022 10:43, Andre Vieira (lists) via Gcc-patches wrote:
> >
> > On 27/09/2022 13:34, Richard Biener wrote:
> >> On Mon, 26 Sep 2022, Andre Vieira (lists) wrote:
> >>
> >>> On 08/09/2022 12:51, Richard Biener wrote:
>  I'm curious, why the push to redundant_ssa_names?  That could use
>  a comment ...
> >>> So I purposefully left a #if 0 #else #endif in there so you can see
> >>> the two
> >>> options. But the reason I used redundant_ssa_names is because ifcvt
> >>> seems to
> >>> use that as a container for all pairs of (old, new) ssa names to
> >>> replace
> >>> later. So I just piggy backed on that. I don't know if there's a
> >>> specific
> >>> reason they do the replacement at the end? Maybe some ordering
> >>> issue? Either
> >>> way both adding it to redundant_ssa_names or doing the replacement
> >>> inline work
> >>> for the bitfield lowering (or work in my testing at least).
> >> Possibly because we (in the past?) inserted/copied stuff based on
> >> predicates generated at analysis time after we decide to elide something
> >> so we need to watch for later appearing uses.  But who knows ... my mind
> >> fails me here.
> >>
> >> If it works to replace uses immediately please do so.  But now
> >> I wonder why we need this - the value shouldn't change so you
> >> should get away with re-using the existing SSA name for the final value?
> >
> > Yeah... good point. A quick change and minor testing seems to agree.
> > I'm sure I had a good reason to do it initially ;)
> >
> > I'll run a full-regression on this change to make sure I didn't miss
> > anything.
> >


Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-28 Thread Andre Vieira (lists) via Gcc-patches

Made the change and also created the ChangeLogs.

gcc/ChangeLog:

    * tree-if-conv.cc (if_convertible_loop_p_1): Move ordering of 
loop bb's from here...
    (tree_if_conversion): ... to here.  Also call bitfield lowering 
when appropriate.
    (version_loop_for_if_conversion): Adapt to enable loop 
versioning when we only need

    to lower bitfields.
    (ifcvt_split_critical_edges): Relax condition of expected loop 
form as this is checked earlier.

    (get_bitfield_rep): New function.
    (lower_bitfield): Likewise.
    (bitfields_to_lower_p): Likewise.
    (need_to_lower_bitfields): New global boolean.
    (need_to_ifcvt): Likewise.
    * tree-vect-data-refs.cc (vect_find_stmt_data_reference): 
Improve diagnostic message.
    * tree-vect-patterns.cc (vect_recog_temp_ssa_var): Add default 
value for last parameter.

    (vect_recog_bitfield_ref_pattern): New.
    (vect_recog_bit_insert_pattern): New.

gcc/testsuite/ChangeLog:

    * gcc.dg/vect/vect-bitfield-read-1.c: New test.
    * gcc.dg/vect/vect-bitfield-read-2.c: New test.
    * gcc.dg/vect/vect-bitfield-read-3.c: New test.
    * gcc.dg/vect/vect-bitfield-read-4.c: New test.
    * gcc.dg/vect/vect-bitfield-read-5.c: New test.
    * gcc.dg/vect/vect-bitfield-read-6.c: New test.
    * gcc.dg/vect/vect-bitfield-write-1.c: New test.
    * gcc.dg/vect/vect-bitfield-write-2.c: New test.
    * gcc.dg/vect/vect-bitfield-write-3.c: New test.
    * gcc.dg/vect/vect-bitfield-write-4.c: New test.
    * gcc.dg/vect/vect-bitfield-write-5.c: New test.

On 28/09/2022 10:43, Andre Vieira (lists) via Gcc-patches wrote:


On 27/09/2022 13:34, Richard Biener wrote:

On Mon, 26 Sep 2022, Andre Vieira (lists) wrote:


On 08/09/2022 12:51, Richard Biener wrote:

I'm curious, why the push to redundant_ssa_names?  That could use
a comment ...
So I purposefully left a #if 0 #else #endif in there so you can see 
the two
options. But the reason I used redundant_ssa_names is because ifcvt 
seems to
use that as a container for all pairs of (old, new) ssa names to 
replace
later. So I just piggy backed on that. I don't know if there's a 
specific
reason they do the replacement at the end? Maybe some ordering 
issue? Either
way both adding it to redundant_ssa_names or doing the replacement 
inline work

for the bitfield lowering (or work in my testing at least).

Possibly because we (in the past?) inserted/copied stuff based on
predicates generated at analysis time after we decide to elide something
so we need to watch for later appearing uses.  But who knows ... my mind
fails me here.

If it works to replace uses immediately please do so.  But now
I wonder why we need this - the value shouldn't change so you
should get away with re-using the existing SSA name for the final value?


Yeah... good point. A quick change and minor testing seems to agree. 
I'm sure I had a good reason to do it initially ;)


I'll run a full-regression on this change to make sure I didn't miss 
anything.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
new file mode 100644
index 
..01cf34fb44484ca926ca5de99eef76dd99b69e92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define N 32
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].i;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
new file mode 100644
index 
..1a4a1579c1478b9407ad21b19e8fbdca9f674b42
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
@@ -0,0 +1,43 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s {
+unsigned i : 31;
+char a : 4;
+};
+
+#define N 32
+#define ELT0 {0x7FFFUL, 0}
+#define ELT1 {0x7FFFUL, 1}
+#define ELT2 {0x7FFFUL, 2}
+#define ELT3 {0x7FFFUL, 3}
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-28 Thread Andre Vieira (lists) via Gcc-patches



On 27/09/2022 13:34, Richard Biener wrote:

On Mon, 26 Sep 2022, Andre Vieira (lists) wrote:


On 08/09/2022 12:51, Richard Biener wrote:

I'm curious, why the push to redundant_ssa_names?  That could use
a comment ...

So I purposefully left a #if 0 #else #endif in there so you can see the two
options. But the reason I used redundant_ssa_names is because ifcvt seems to
use that as a container for all pairs of (old, new) ssa names to replace
later. So I just piggy backed on that. I don't know if there's a specific
reason they do the replacement at the end? Maybe some ordering issue? Either
way both adding it to redundant_ssa_names or doing the replacement inline work
for the bitfield lowering (or work in my testing at least).

Possibly because we (in the past?) inserted/copied stuff based on
predicates generated at analysis time after we decide to elide something
so we need to watch for later appearing uses.  But who knows ... my mind
fails me here.

If it works to replace uses immediately please do so.  But now
I wonder why we need this - the value shouldn't change so you
should get away with re-using the existing SSA name for the final value?


Yeah... good point. A quick change and minor testing seems to agree. I'm sure I 
had a good reason to do it initially ;)

I'll run a full-regression on this change to make sure I didn't miss anything.



Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-27 Thread Richard Biener via Gcc-patches
On Mon, 26 Sep 2022, Andre Vieira (lists) wrote:

> 
> On 08/09/2022 12:51, Richard Biener wrote:
> >
> > I'm curious, why the push to redundant_ssa_names?  That could use
> > a comment ...
> So I purposefully left a #if 0 #else #endif in there so you can see the two
> options. But the reason I used redundant_ssa_names is because ifcvt seems to
> use that as a container for all pairs of (old, new) ssa names to replace
> later. So I just piggy backed on that. I don't know if there's a specific
> reason they do the replacement at the end? Maybe some ordering issue? Either
> way both adding it to redundant_ssa_names or doing the replacement inline work
> for the bitfield lowering (or work in my testing at least).

Possibly because we (in the past?) inserted/copied stuff based on
predicates generated at analysis time after we decide to elide something
so we need to watch for later appearing uses.  But who knows ... my mind
fails me here.

If it works to replace uses immediately please do so.  But now
I wonder why we need this - the value shouldn't change so you
should get away with re-using the existing SSA name for the final value?

> > Note I fear we will have endianess issues when translating
> > bit-field accesses to BIT_FIELD_REF/INSERT and then to shifts.  Rules
> > for memory and register operations do not match up (IIRC, I repeatedly
> > run into issues here myself).  The testcases all look like they
> > won't catch this - I think an example would be sth like
> > struct X { unsigned a : 23; unsigned b : 9; }, can you see to do
> > testing on a big-endian target?
> I've done some testing and you were right, it did fall apart on big-endian. I
> fixed it by changing the way we compute the 'shift' value and added two extra
> testcases for read and write each.
> >
> > Sorry for the delay in reviewing.
> No worries, apologies myself for the delay in reworking this, had a nice
> little week holiday in between :)
> 
> I'll write the ChangeLogs once the patch has stabilized.

Thanks,
Richard.


Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-26 Thread Andre Vieira (lists) via Gcc-patches


On 08/09/2022 12:51, Richard Biener wrote:


I'm curious, why the push to redundant_ssa_names?  That could use
a comment ...
So I purposefully left a #if 0 #else #endif in there so you can see the 
two options. But the reason I used redundant_ssa_names is because ifcvt 
seems to use that as a container for all pairs of (old, new) ssa names 
to replace later. So I just piggy backed on that. I don't know if 
there's a specific reason they do the replacement at the end? Maybe some 
ordering issue? Either way both adding it to redundant_ssa_names or 
doing the replacement inline work for the bitfield lowering (or work in 
my testing at least).

Note I fear we will have endianess issues when translating
bit-field accesses to BIT_FIELD_REF/INSERT and then to shifts.  Rules
for memory and register operations do not match up (IIRC, I repeatedly
run into issues here myself).  The testcases all look like they
won't catch this - I think an example would be sth like
struct X { unsigned a : 23; unsigned b : 9; }, can you see to do
testing on a big-endian target?
I've done some testing and you were right, it did fall apart on 
big-endian. I fixed it by changing the way we compute the 'shift' value 
and added two extra testcases for read and write each.
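
For illustration, here is a minimal C sketch of how the extraction shift
differs between endiannesses for a layout like Richard's
struct X { unsigned a : 23; unsigned b : 9; }.  It assumes the usual GCC
layout where both fields share one 32-bit representative; it is a model of
the idea, not the patch's code, and the names are made up:

#include <stdint.h>
#include <stdio.h>

/* Extract a bitfield of FIELD_BITS bits that starts FIELD_POS bits into a
   REP_BITS-wide representative loaded as an integer.  On little-endian the
   shift is simply FIELD_POS; on big-endian the field sits at the other end
   of the container, so the shift is REP_BITS - FIELD_POS - FIELD_BITS.  */
static uint32_t
extract_field (uint32_t rep, unsigned field_pos, unsigned field_bits,
               unsigned rep_bits, int big_endian)
{
  unsigned shift = big_endian ? rep_bits - field_pos - field_bits : field_pos;
  uint32_t mask = field_bits == 32 ? ~0u : (1u << field_bits) - 1;
  return (rep >> shift) & mask;
}

int
main (void)
{
  /* struct X { unsigned a : 23; unsigned b : 9; }: 'b' starts at bit 23
     of a 32-bit representative.  */
  printf ("little-endian shift for b: %u\n", 23u);
  printf ("big-endian shift for b:    %u\n", 32u - 23u - 9u);
  /* A representative holding a == 0, b == 1 on a little-endian target.  */
  printf ("b = %u\n", extract_field (1u << 23, 23, 9, 32, 0));
  return 0;
}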


Sorry for the delay in reviewing.
No worries, apologies myself for the delay in reworking this, had a nice 
little week holiday in between :)


I'll write the ChangeLogs once the patch has stabilized.

Thanks,
Andre
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
new file mode 100644
index 
..01cf34fb44484ca926ca5de99eef76dd99b69e92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define N 32
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].i;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
new file mode 100644
index 
..1a4a1579c1478b9407ad21b19e8fbdca9f674b42
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
@@ -0,0 +1,43 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s {
+unsigned i : 31;
+char a : 4;
+};
+
+#define N 32
+#define ELT0 {0x7FFFUL, 0}
+#define ELT1 {0x7FFFUL, 1}
+#define ELT2 {0x7FFFUL, 2}
+#define ELT3 {0x7FFFUL, 3}
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].a;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
new file mode 100644
index 
..216611a29fd8bbfbafdbdb79d790e520f44ba672
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
@@ -0,0 +1,43 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+#include 
+
+extern void abort(void);
+
+typedef struct {
+int  c;
+int  b;
+bool a : 1;
+} struct_t;
+
+#define N 16
+#define ELT_F { 0x, 0x, 0 }
+#define ELT_T { 0x, 0x, 1 }
+
+struct_t vect_false[N] = { ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, 
ELT_F,
+  ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, ELT_F, 
ELT_F  };
+struct_t vect_true[N]  = { ELT_F, ELT_F, ELT_T, ELT_F, ELT_F, ELT_F, ELT_F, 
ELT_F,
+  ELT_F, ELT_F, ELT_T, ELT_F, ELT_F, ELT_F, ELT_F, 
ELT_F  };
+int main (void)
+{
+  unsigned ret = 0;
+  for (unsigned i = 0; i < N; i++)
+  {
+  ret |= vect_false[i].a;
+  }
+  if (ret)
+abort ();
+
+  for (unsigned i = 0; i < N; i++)
+  {
+  ret |= 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-08 Thread Richard Biener via Gcc-patches
On Thu, 25 Aug 2022, Andre Vieira (lists) wrote:

> 
> On 17/08/2022 13:49, Richard Biener wrote:
> > Yes, of course.  What you need to do is subtract DECL_FIELD_BIT_OFFSET
> > of the representative from DECL_FIELD_BIT_OFFSET of the original bitfield
> > access - that's the offset within the representative (by construction
> > both fields share DECL_FIELD_OFFSET).
> Doh! That makes sense...
> >> So instead I change bitpos such that:
> >> align_of_representative = TYPE_ALIGN (TREE_TYPE (representative));
> >> bitpos -= bitpos.to_constant () / align_of_representative *
> >> align_of_representative;
> > ?  Not sure why alignment comes into play here?
> Yeah just forget about this... it was my ill attempt at basically doing what
> you described above.
> > Not sure what you are saying but "yes", all shifting and masking should
> > happen in the type of the representative.
> >
> > +  tree bitpos_tree = build_int_cst (bitsizetype, bitpos);
> >
> > for your convenience there's bitsize_int (bitpos) you can use.
> >
> > I don't think you are using the correct bitpos though, you fail to
> > adjust it for the BIT_FIELD_REF/BIT_INSERT_EXPR.
> Not sure I understand what you mean? I do adjust it, I've changed it now so it
> should hopefully be clearer.
> >
> > +build_int_cst (bitsizetype, TYPE_PRECISION
> > (bf_type)),
> >
> > the size of the bitfield reference is DECL_SIZE of the original
> > FIELD_DECL - it might be bigger than the precision of its type.
> > You probably want to double-check it's equal to the precision
> > (because of the insert but also because of all the masking) and
> > refuse to lower if not.
> I added a check for this but out of curiosity, how can the DECL_SIZE of a
> bitfield FIELD_DECL be different than its type's precision?

It's probably not possible to create a C testcase, but I don't see
what makes it impossible in general to have padding in a bitfield
object.

> >
> > +/* Return TRUE if there are bitfields to lower in this LOOP.  Fill
> > TO_LOWER
> > +   with data structures representing these bitfields.  */
> > +
> > +static bool
> > +bitfields_to_lower_p (class loop *loop,
> > + vec  _to_lower,
> > + vec  _to_lower)
> > +{
> > +  basic_block *bbs = get_loop_body (loop);
> > +  gimple_stmt_iterator gsi;
> >
> > as said I'd prefer to do this walk as part of the other walks we
> > already do - if and if only because get_loop_body () is a DFS
> > walk over the loop body (you should at least share that).
> I'm now sharing the use of ifc_bbs. The reason why I'd rather not share the
> walk over them is because it becomes quite complex to split out the decision
> to not lower if's because there are none, for which we will still want to
> lower bitfields, versus not lowering if's when they are there but aren't
> lowerable at which point we will forego lowering bitfields since we will not
> vectorize this loop anyway.
> >
> > +  value = fold_build1 (NOP_EXPR, load_type, value);
> >
> > fold_convert (load_type, value)
> >
> > +  if (!CONSTANT_CLASS_P (value))
> > +   {
> > + pattern_stmt
> > +   = gimple_build_assign (vect_recog_temp_ssa_var (load_type,
> > NULL),
> > +  value);
> > + value = gimple_get_lhs (pattern_stmt);
> >
> > there's in principle
> >
> >   gimple_seq stmts = NULL;
>   value = gimple_convert (&stmts, load_type, value);
> >   if (!gimple_seq_empty_p (stmts))
> > {
> >   pattern_stmt = gimple_seq_first_stmt (stmts);
> >   append_pattern_def_seq (vinfo, stmt_info, pattern_stmt);
> > }
> >
> > though a append_pattern_def_seq helper to add a convenience sequence
> > would be nice to have here.
> Ended up using the existing 'vect_convert_input', seems to do nicely here.
> > You probably want to double-check your lowering code by
> > bootstrapping / testing with -ftree-loop-if-convert.
> Done, this led me to find a new failure mode, where the type of the first
> operand of BIT_FIELD_REF was a FP type (TF mode), which then led to failures
> when constructing the masking and shifting. I ended up adding a nop-conversion
> to an INTEGER type of the same width first if necessary.

You want a VIEW_CONVERT (aka bit-cast) here.

> Also did a follow-up
> bootstrap with the addition of `-ftree-vectorize` and `-fno-vect-cost-model`
> to further test the codegen. All seems to be working on aarch64-linux-gnu.

+static tree
+get_bitfield_rep (gassign *stmt, bool write, tree *bitpos,
+ tree *struct_expr)
...
+  /* Bail out if the DECL_SIZE of the field_decl isn't the same as the BF's
+     precision.  */
+  unsigned HOST_WIDE_INT decl_size = tree_to_uhwi (DECL_SIZE (field_decl));
+  if (TYPE_PRECISION (TREE_TYPE (gimple_assign_lhs (stmt))) != decl_size)
+    return NULL_TREE;

you can use

  compare_tree_int (DECL_SIZE (field_decl), TYPE_PRECISION (...)) != 0

which avoids caring for the case the size isn't a 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-09-08 Thread Andre Vieira (lists) via Gcc-patches

Ping.

On 25/08/2022 10:09, Andre Vieira (lists) via Gcc-patches wrote:


On 17/08/2022 13:49, Richard Biener wrote:

Yes, of course.  What you need to do is subtract DECL_FIELD_BIT_OFFSET
of the representative from DECL_FIELD_BIT_OFFSET of the original 
bitfield

access - that's the offset within the representative (by construction
both fields share DECL_FIELD_OFFSET).

Doh! That makes sense...

So instead I change bitpos such that:
align_of_representative = TYPE_ALIGN (TREE_TYPE (representative));
bitpos -= bitpos.to_constant () / align_of_representative *
align_of_representative;

?  Not sure why alignment comes into play here?
Yeah just forget about this... it was my ill attempt at basically 
doing what you described above.

Not sure what you are saying but "yes", all shifting and masking should
happen in the type of the representative.

+  tree bitpos_tree = build_int_cst (bitsizetype, bitpos);

for your convenience there's bitsize_int (bitpos) you can use.

I don't think you are using the correct bitpos though, you fail to
adjust it for the BIT_FIELD_REF/BIT_INSERT_EXPR.
Not sure I understand what you mean? I do adjust it, I've changed it 
now so it should hopefully be clearer.


+    build_int_cst (bitsizetype, TYPE_PRECISION
(bf_type)),

the size of the bitfield reference is DECL_SIZE of the original
FIELD_DECL - it might be bigger than the precision of its type.
You probably want to double-check it's equal to the precision
(because of the insert but also because of all the masking) and
refuse to lower if not.
I added a check for this but out of curiosity, how can the DECL_SIZE 
of a bitfield FIELD_DECL be different than its type's precision?


+/* Return TRUE if there are bitfields to lower in this LOOP. Fill
TO_LOWER
+   with data structures representing these bitfields.  */
+
+static bool
+bitfields_to_lower_p (class loop *loop,
+ vec  _to_lower,
+ vec  _to_lower)
+{
+  basic_block *bbs = get_loop_body (loop);
+  gimple_stmt_iterator gsi;

as said I'd prefer to do this walk as part of the other walks we
already do - if and if only because get_loop_body () is a DFS
walk over the loop body (you should at least share that).
I'm now sharing the use of ifc_bbs. The reason why I'd rather not 
share the walk over them is that it becomes quite complex to split 
out the two decisions: not lowering ifs because there are none (in 
which case we still want to lower bitfields), versus not lowering 
ifs because they are there but aren't lowerable (in which case we 
forego lowering bitfields, since we will not vectorize this loop 
anyway).


+  value = fold_build1 (NOP_EXPR, load_type, value);

fold_convert (load_type, value)

+  if (!CONSTANT_CLASS_P (value))
+   {
+ pattern_stmt
+   = gimple_build_assign (vect_recog_temp_ssa_var (load_type,
NULL),
+  value);
+ value = gimple_get_lhs (pattern_stmt);

there's in principle

  gimple_seq stmts = NULL;
  value = gimple_convert (&stmts, load_type, value);
  if (!gimple_seq_empty_p (stmts))
    {
  pattern_stmt = gimple_seq_first_stmt (stmts);
  append_pattern_def_seq (vinfo, stmt_info, pattern_stmt);
    }

though a append_pattern_def_seq helper to add a convenience sequence
would be nice to have here.
Ended up using the existing 'vect_convert_input', seems to do nicely 
here.

You probably want to double-check your lowering code by
bootstrapping / testing with -ftree-loop-if-convert.
Done, this led me to find a new failure mode, where the type of the 
first operand of BIT_FIELD_REF was a FP type (TF mode), which then 
led to failures when constructing the masking and shifting. I ended 
up adding a nop-conversion to an INTEGER type of the same width first 
if necessary. Also did a follow-up bootstrap with the addition of 
`-ftree-vectorize` and `-fno-vect-cost-model` to further test the 
codegen. All seems to be working on aarch64-linux-gnu.


Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-08-25 Thread Andre Vieira (lists) via Gcc-patches


On 17/08/2022 13:49, Richard Biener wrote:

Yes, of course.  What you need to do is subtract DECL_FIELD_BIT_OFFSET
of the representative from DECL_FIELD_BIT_OFFSET of the original bitfield
access - that's the offset within the representative (by construction
both fields share DECL_FIELD_OFFSET).

Doh! That makes sense...

So instead I change bitpos such that:
align_of_representative = TYPE_ALIGN (TREE_TYPE (representative));
bitpos -= bitpos.to_constant () / align_of_representative *
align_of_representative;

?  Not sure why alignment comes into play here?
Yeah just forget about this... it was my ill attempt at basically doing 
what you described above.

Not sure what you are saying but "yes", all shifting and masking should
happen in the type of the representative.

+  tree bitpos_tree = build_int_cst (bitsizetype, bitpos);

for your convenience there's bitsize_int (bitpos) you can use.

I don't think you are using the correct bitpos though, you fail to
adjust it for the BIT_FIELD_REF/BIT_INSERT_EXPR.
Not sure I understand what you mean? I do adjust it, I've changed it now 
so it should hopefully be clearer.


+build_int_cst (bitsizetype, TYPE_PRECISION
(bf_type)),

the size of the bitfield reference is DECL_SIZE of the original
FIELD_DECL - it might be bigger than the precision of its type.
You probably want to double-check it's equal to the precision
(because of the insert but also because of all the masking) and
refuse to lower if not.
I added a check for this but out of curiosity, how can the DECL_SIZE of 
a bitfield FIELD_DECL be different than its type's precision?


+/* Return TRUE if there are bitfields to lower in this LOOP.  Fill
TO_LOWER
+   with data structures representing these bitfields.  */
+
+static bool
+bitfields_to_lower_p (class loop *loop,
+ vec  _to_lower,
+ vec  _to_lower)
+{
+  basic_block *bbs = get_loop_body (loop);
+  gimple_stmt_iterator gsi;

as said I'd prefer to do this walk as part of the other walks we
already do - if and if only because get_loop_body () is a DFS
walk over the loop body (you should at least share that).
I'm now sharing the use of ifc_bbs. The reason why I'd rather not share 
the walk over them is that it becomes quite complex to split out the two 
decisions: not lowering ifs because there are none (in which case we will 
still want to lower bitfields), versus not lowering ifs because they are 
there but aren't lowerable (in which case we forego lowering bitfields, 
since we will not vectorize this loop anyway).


+  value = fold_build1 (NOP_EXPR, load_type, value);

fold_convert (load_type, value)

+  if (!CONSTANT_CLASS_P (value))
+   {
+ pattern_stmt
+   = gimple_build_assign (vect_recog_temp_ssa_var (load_type,
NULL),
+  value);
+ value = gimple_get_lhs (pattern_stmt);

there's in principle

  gimple_seq stmts = NULL;
  value = gimple_convert (&stmts, load_type, value);
  if (!gimple_seq_empty_p (stmts))
{
  pattern_stmt = gimple_seq_first_stmt (stmts);
  append_pattern_def_seq (vinfo, stmt_info, pattern_stmt);
}

though a append_pattern_def_seq helper to add a convenience sequence
would be nice to have here.

Ended up using the existing 'vect_convert_input', seems to do nicely here.

You probably want to double-check your lowering code by
bootstrapping / testing with -ftree-loop-if-convert.
Done, this led me to find a new failure mode, where the type of the 
first operand of BIT_FIELD_REF was a FP type (TF mode), which then led 
to failures when constructing the masking and shifting. I ended up 
adding a nop-conversion to an INTEGER type of the same width first if 
necessary. Also did a follow-up bootstrap with the addition of 
`-ftree-vectorize` and `-fno-vect-cost-model` to further test the 
codegen. All seems to be working on aarch64-linux-gnu.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
new file mode 100644
index 
..01cf34fb44484ca926ca5de99eef76dd99b69e92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include 
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define N 32
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].i;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-08-17 Thread Richard Biener via Gcc-patches
On Tue, 16 Aug 2022, Andre Vieira (lists) wrote:

> Hi,
> 
> New version of the patch attached, but haven't recreated the ChangeLog yet,
> just waiting to see if this is what you had in mind. See also some replies to
> your comments in-line below:
> 
> On 09/08/2022 15:34, Richard Biener wrote:
> 
> > @@ -2998,7 +3013,7 @@ ifcvt_split_critical_edges (class loop *loop, bool
> > aggressive_if_conv)
> > auto_vec critical_edges;
> >
> > /* Loop is not well formed.  */
> > -  if (num <= 2 || loop->inner || !single_exit (loop))
> > +  if (num <= 2 || loop->inner)
> >   return false;
> >
> > body = get_loop_body (loop);
> >
> > this doesn't appear in the ChangeLog nor is it clear to me why it's
> > needed?  Likewise
> So both these and...
> >
> > -  /* Save BB->aux around loop_version as that uses the same field.  */
> > -  save_length = loop->inner ? loop->inner->num_nodes : loop->num_nodes;
> > -  void **saved_preds = XALLOCAVEC (void *, save_length);
> > -  for (unsigned i = 0; i < save_length; i++)
> > -saved_preds[i] = ifc_bbs[i]->aux;
> > +  void **saved_preds = NULL;
> > +  if (any_complicated_phi || need_to_predicate)
> > +{
> > +  /* Save BB->aux around loop_version as that uses the same field.
> > */
> > +  save_length = loop->inner ? loop->inner->num_nodes :
> > loop->num_nodes;
> > +  saved_preds = XALLOCAVEC (void *, save_length);
> > +  for (unsigned i = 0; i < save_length; i++)
> > +   saved_preds[i] = ifc_bbs[i]->aux;
> > +}
> >
> > is that just premature optimization?
> 
> .. these changes are to make sure we can still use the loop versioning code
> even for cases where there are bitfields to lower but no ifcvts (i.e. num of
> BBs <= 2).
> I wasn't sure about the loop-inner condition and the small examples I tried it
> seemed to work, that is loop version seems to be able to handle nested loops.
> 
> The single_exit condition is still required for both, because the code to
> create the loop versions depends on it. It does look like I missed this in the
> ChangeLog...
> 
> > +  /* BITSTART and BITEND describe the region we can safely load from
> > inside the
> > + structure.  BITPOS is the bit position of the value inside the
> > + representative that we will end up loading OFFSET bytes from the
> > start
> > + of the struct.  BEST_MODE is the mode describing the optimal size of
> > the
> > + representative chunk we load.  If this is a write we will store the
> > same
> > + sized representative back, after we have changed the appropriate
> > bits.  */
> > +  get_bit_range (&bitstart, &bitend, comp_ref, &bitpos, &offset);
> >
> > I think you need to give up when get_bit_range sets bitstart = bitend to
> > zero
> >
> > +  if (get_best_mode (bitsize, bitpos.to_constant (), bitstart, bitend,
> > +TYPE_ALIGN (TREE_TYPE (struct_expr)),
> > +INT_MAX, false, &best_mode))
> >
> > +  tree rep_decl = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
> > + NULL_TREE, rep_type);
> > +  /* Load from the start of 'offset + bitpos % alignment'.  */
> > +  uint64_t extra_offset = bitpos.to_constant ();
> >
> > you shouldn't build a new FIELD_DECL.  Either you use
> > DECL_BIT_FIELD_REPRESENTATIVE directly or you use a
> > BIT_FIELD_REF accessing the "representative".
> > DECL_BIT_FIELD_REPRESENTATIVE exists so it can maintain
> > a variable field offset, you can also subset that with an
> > intermediate BIT_FIELD_REF if DECL_BIT_FIELD_REPRESENTATIVE is
> > too large for your taste.
> >
> > I'm not sure all the offset calculation you do is correct, but
> > since you shouldn't invent a new FIELD_DECL it probably needs
> > to change anyway ...
> I can use the DECL_BIT_FIELD_REPRESENTATIVE, but I'll still need some offset
> calculation/extraction. It's easier to explain with an example:
> 
> In vect-bitfield-read-3.c the struct:
> typedef struct {
>     int  c;
>     int  b;
>     bool a : 1;
> } struct_t;
> 
> and field access 'vect_false[i].a' or 'vect_true[i].a' will lead to a
> DECL_BIT_FIELD_REPRESENTATIVE of TYPE_SIZE of 8 (and TYPE_PRECISION is also 8
> as expected). However, the DECL_FIELD_OFFSET of either the original field
> decl, the actual bitfield member, or the DECL_BIT_FIELD_REPRESENTATIVE is 0
> and the DECL_FIELD_BIT_OFFSET is 64. These will lead to the correct load:
> _1 = vect_false[i].D;
> 
> D here being the representative is an 8-bit load from vect_false[i] + 64bits.
> So all good there. However, when we construct BIT_FIELD_REF we can't simply
> use DECL_FIELD_BIT_OFFSET (field_decl) as the BIT_FIELD_REF's bitpos.  During
> `verify_gimple` it checks that bitpos + bitsize < TYPE_SIZE (TREE_TYPE (load))
> where BIT_FIELD_REF (load, bitsize, bitpos).

Yes, of course.  What you need to do is subtract DECL_FIELD_BIT_OFFSET
of the representative from DECL_FIELD_BIT_OFFSET of the original bitfield
access - that's the offset within the representative (by construction
both fields share DECL_FIELD_OFFSET).

> So 

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-08-16 Thread Andre Vieira (lists) via Gcc-patches

Hi,

New version of the patch attached, but haven't recreated the ChangeLog 
yet, just waiting to see if this is what you had in mind. See also some 
replies to your comments in-line below:


On 09/08/2022 15:34, Richard Biener wrote:


@@ -2998,7 +3013,7 @@ ifcvt_split_critical_edges (class loop *loop, bool
aggressive_if_conv)
auto_vec critical_edges;

/* Loop is not well formed.  */
-  if (num <= 2 || loop->inner || !single_exit (loop))
+  if (num <= 2 || loop->inner)
  return false;

body = get_loop_body (loop);

this doesn't appear in the ChangeLog nor is it clear to me why it's
needed?  Likewise

So both these and...


-  /* Save BB->aux around loop_version as that uses the same field.  */
-  save_length = loop->inner ? loop->inner->num_nodes : loop->num_nodes;
-  void **saved_preds = XALLOCAVEC (void *, save_length);
-  for (unsigned i = 0; i < save_length; i++)
-saved_preds[i] = ifc_bbs[i]->aux;
+  void **saved_preds = NULL;
+  if (any_complicated_phi || need_to_predicate)
+{
+  /* Save BB->aux around loop_version as that uses the same field.
*/
+  save_length = loop->inner ? loop->inner->num_nodes :
loop->num_nodes;
+  saved_preds = XALLOCAVEC (void *, save_length);
+  for (unsigned i = 0; i < save_length; i++)
+   saved_preds[i] = ifc_bbs[i]->aux;
+}

is that just premature optimization?


.. these changes are to make sure we can still use the loop versioning 
code even for cases where there are bitfields to lower but no ifcvts 
(i.e. num of BBs <= 2).
I wasn't sure about the loop-inner condition, but in the small examples I 
tried it seemed to work; that is, loop versioning seems to be able to handle 
nested loops.


The single_exit condition is still required for both, because the code 
to create the loop versions depends on it. It does look like I missed 
this in the ChangeLog...



+  /* BITSTART and BITEND describe the region we can safely load from
inside the
+ structure.  BITPOS is the bit position of the value inside the
+ representative that we will end up loading OFFSET bytes from the
start
+ of the struct.  BEST_MODE is the mode describing the optimal size of
the
+ representative chunk we load.  If this is a write we will store the
same
+ sized representative back, after we have changed the appropriate
bits.  */
+  get_bit_range (&bitstart, &bitend, comp_ref, &bitpos, &offset);

I think you need to give up when get_bit_range sets bitstart = bitend to
zero

+  if (get_best_mode (bitsize, bitpos.to_constant (), bitstart, bitend,
+TYPE_ALIGN (TREE_TYPE (struct_expr)),
+INT_MAX, false, &best_mode))

+  tree rep_decl = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ NULL_TREE, rep_type);
+  /* Load from the start of 'offset + bitpos % alignment'.  */
+  uint64_t extra_offset = bitpos.to_constant ();

you shouldn't build a new FIELD_DECL.  Either you use
DECL_BIT_FIELD_REPRESENTATIVE directly or you use a
BIT_FIELD_REF accessing the "representative".
DECL_BIT_FIELD_REPRESENTATIVE exists so it can maintain
a variable field offset, you can also subset that with an
intermediate BIT_FIELD_REF if DECL_BIT_FIELD_REPRESENTATIVE is
too large for your taste.

I'm not sure all the offset calculation you do is correct, but
since you shouldn't invent a new FIELD_DECL it probably needs
to change anyway ...
I can use the DECL_BIT_FIELD_REPRESENTATIVE, but I'll still need some 
offset calculation/extraction. It's easier to explain with an example:


In vect-bitfield-read-3.c the struct:
typedef struct {
    int  c;
    int  b;
    bool a : 1;
} struct_t;

and field access 'vect_false[i].a' or 'vect_true[i].a' will lead to a 
DECL_BIT_FIELD_REPRESENTATIVE of TYPE_SIZE of 8 (and TYPE_PRECISION is 
also 8 as expected). However, the DECL_FIELD_OFFSET of either the 
original field decl, the actual bitfield member, or the 
DECL_BIT_FIELD_REPRESENTATIVE is 0 and the DECL_FIELD_BIT_OFFSET is 64. 
These will lead to the correct load:

_1 = vect_false[i].D;

D here being the representative is an 8-bit load from vect_false[i] + 
64bits. So all good there. However, when we construct BIT_FIELD_REF we 
can't simply use DECL_FIELD_BIT_OFFSET (field_decl) as the 
BIT_FIELD_REF's bitpos.  During `verify_gimple` it checks that bitpos + 
bitsize < TYPE_SIZE (TREE_TYPE (load)) where BIT_FIELD_REF (load, 
bitsize, bitpos).


So instead I change bitpos such that:
align_of_representative = TYPE_ALIGN (TREE_TYPE (representative));
bitpos -= bitpos.to_constant () / align_of_representative * 
align_of_representative;


I've now rewritten this to:
poly_int64 q, r;
if (can_div_trunc_p (bitpos, align_of_representative, &q, &r))
  bitpos = r;

It makes it slightly clearer, also because I no longer need the changes 
to the original tree offset as I'm just using D for the load.
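
To make that concrete, here is a small scalar sketch of what the lowered 
read of 'vect_false[i].a' amounts to for the struct above (purely for 
illustration, not the actual implementation; it assumes 4-byte int, the 
usual little-endian bitfield layout, and a hypothetical helper name):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    int  c;
    int  b;
    bool a : 1;
} struct_t;

/* The representative load: an 8-bit value that sits 64 bits
   (2 * sizeof (int) bytes) into the struct, i.e. the scalar equivalent
   of the 'vect_false[i].D' load shown above.  */
static unsigned char
load_representative (const struct_t *p)
{
  unsigned char rep;
  memcpy (&rep, (const unsigned char *) p + 2 * sizeof (int), 1);
  return rep;
}

int
main (void)
{
  struct_t x = { 1, 2, true };
  /* BIT_FIELD_REF <rep, 1, 0>: the original bit position of 64 has been
     reduced relative to the representative, so we extract bit 0.  */
  unsigned char rep = load_representative (&x);
  printf ("a = %u\n", rep & 1u);
  return 0;
}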

Note that for optimization it will be important that all
accesses to the bitfield members of the same bitfield use the
same underlying area (CSE and store-forwarding will thank you).

Re: [PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-08-09 Thread Richard Biener via Gcc-patches
On Mon, 8 Aug 2022, Andre Vieira (lists) wrote:

> Hi,
> 
> So I've changed the approach from the RFC as suggested, moving the bitfield
> lowering to the if-convert pass.
> 
> So to reiterate, ifcvt will lower COMPONENT_REF's with DECL_BIT_FIELD field's
> to either BIT_FIELD_REF if they are reads or BIT_INSERT_EXPR if they are
> writes, using loads and writes of 'representatives' that are big enough to
> contain the bitfield value.
> 
> In vect_recog I added two patterns to replace these BIT_FIELD_REF and
> BIT_INSERT_EXPR with shift's and masks as appropriate.
> 
> I'd like to see if it was possible to remove the 'load' part of a
> BIT_INSERT_EXPR if the representative write didn't change any relevant bits. 
> For example:
> 
> struct s{
> int dont_care;
> char a : 3;
> };
> 
> s.a = ;
> 
> Should not require a load & write cycle, in fact it wouldn't even require any
> masking either. Though to achieve this we'd need to make sure the
> representative didn't overlap with any other field. Any suggestions on how to
> do this would be great, though I don't think we need to wait for that, as
> that's merely a nice-to-have optimization I guess?

Hmm.  I'm not sure the middle-end can simply ignore padding.  If
some language standard says that would be OK then I think we should
exploit this during lowering when the frontend is still around to
ask - which means somewhen during early optimization.

> I am not sure where I should 'document' this change of behavior to ifcvt,
> and/or we should change the name of the pass, since it's doing more than
> if-conversion now?

It's preparation for vectorization anyway since it will emit
.MASK_LOAD/STORE and friends already.  So I don't think anything
needs to change there.


@@ -2998,7 +3013,7 @@ ifcvt_split_critical_edges (class loop *loop, bool 
aggressive_if_conv)
   auto_vec<edge> critical_edges;

   /* Loop is not well formed.  */
-  if (num <= 2 || loop->inner || !single_exit (loop))
+  if (num <= 2 || loop->inner)
 return false;

   body = get_loop_body (loop);

this doesn't appear in the ChangeLog nor is it clear to me why it's
needed?  Likewise

-  /* Save BB->aux around loop_version as that uses the same field.  */
-  save_length = loop->inner ? loop->inner->num_nodes : loop->num_nodes;
-  void **saved_preds = XALLOCAVEC (void *, save_length);
-  for (unsigned i = 0; i < save_length; i++)
-saved_preds[i] = ifc_bbs[i]->aux;
+  void **saved_preds = NULL;
+  if (any_complicated_phi || need_to_predicate)
+{
+  /* Save BB->aux around loop_version as that uses the same field.  */
+  save_length = loop->inner ? loop->inner->num_nodes : loop->num_nodes;
+  saved_preds = XALLOCAVEC (void *, save_length);
+  for (unsigned i = 0; i < save_length; i++)
+   saved_preds[i] = ifc_bbs[i]->aux;
+}

is that just premature optimization?

+  /* BITSTART and BITEND describe the region we can safely load from inside the
+ structure.  BITPOS is the bit position of the value inside the
+ representative that we will end up loading OFFSET bytes from the start
+ of the struct.  BEST_MODE is the mode describing the optimal size of the
+ representative chunk we load.  If this is a write we will store the same
+ sized representative back, after we have changed the appropriate bits.  */
+  get_bit_range (&bitstart, &bitend, comp_ref, &bitpos, &offset);

I think you need to give up when get_bit_range sets bitstart = bitend to 
zero

+  if (get_best_mode (bitsize, bitpos.to_constant (), bitstart, bitend,
+TYPE_ALIGN (TREE_TYPE (struct_expr)),
+INT_MAX, false, &best_mode))

+  tree rep_decl = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+ NULL_TREE, rep_type);
+  /* Load from the start of 'offset + bitpos % alignment'.  */
+  uint64_t extra_offset = bitpos.to_constant ();

you shouldn't build a new FIELD_DECL.  Either you use
DECL_BIT_FIELD_REPRESENTATIVE directly or you use a
BIT_FIELD_REF accessing the "representative".
DECL_BIT_FIELD_REPRESENTATIVE exists so it can maintain
a variable field offset, you can also subset that with an
intermediate BIT_FIELD_REF if DECL_BIT_FIELD_REPRESENTATIVE is
too large for your taste.

I'm not sure all the offset calculation you do is correct, but
since you shouldn't invent a new FIELD_DECL it probably needs
to change anyway ...

Note that for optimization it will be important that all
accesses to the bitfield members of the same bitfield use the
same underlying area (CSE and store-forwarding will thank you).

+
+  need_to_lower_bitfields = bitfields_to_lower_p (loop, &bitfields_to_lower);
+  if (!ifcvt_split_critical_edges (loop, aggressive_if_conv)
+  && !need_to_lower_bitfields)
 goto cleanup;

so we lower bitfields even when we cannot split critical edges?
why?

+  need_to_ifcvt
+= if_convertible_loop_p (loop) && dbg_cnt (if_conversion_tree);
+  if (!need_to_ifcvt && !need_to_lower_bitfields)
 goto cleanup;

likewise - if_convertible_loop_p performs other checks, 

[PATCH] Teach vectorizer to deal with bitfield accesses (was: [RFC] Teach vectorizer to deal with bitfield reads)

2022-08-08 Thread Andre Vieira (lists) via Gcc-patches

Hi,

So I've changed the approach from the RFC as suggested, moving the 
bitfield lowering to the if-convert pass.


So to reiterate, ifcvt will lower COMPONENT_REF's with DECL_BIT_FIELD 
field's to either BIT_FIELD_REF if they are reads or BIT_INSERT_EXPR if 
they are writes, using loads and writes of 'representatives' that are 
big enough to contain the bitfield value.


In vect_recog I added two patterns to replace these BIT_FIELD_REF and 
BIT_INSERT_EXPR with shifts and masks as appropriate.
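
For reference, a rough scalar illustration of the kind of shift-and-mask 
sequence meant here (the layout is hypothetical: an unsigned 7-bit field at 
bit offset 3 of a 32-bit representative; a signed field would additionally 
need a sign-extension step):

#include <stdint.h>

/* Extract an unsigned 7-bit field stored at bit offset 3 of the 32-bit
   representative REP: shift the field down to bit 0 and mask off the
   bits that belong to other fields.  */
static uint32_t
read_bitfield (uint32_t rep)
{
  return (rep >> 3) & ((1u << 7) - 1);
}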


I'd like to see if it was possible to remove the 'load' part of a 
BIT_INSERT_EXPR if the representative write didn't change any relevant 
bits.  For example:


struct s{
int dont_care;
char a : 3;
};

s.a = ;

Should not require a load & write cycle, in fact it wouldn't even 
require any masking either. Though to achieve this we'd need to make 
sure the representative didn't overlap with any other field. Any 
suggestions on how to do this would be great, though I don't think we 
need to wait for that, as that's merely a nice-to-have optimization I guess?
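
For the record, here is what the read-modify-write being discussed looks 
like in scalar form (a hypothetical 'char a : 3' at bit offset 0 of an 
8-bit representative, just to illustrate the shape of the lowering):

#include <stdint.h>

/* Lowered store to 'a': load the representative, clear the bits that
   belong to 'a', OR in the new value and store the whole byte back.  */
static uint8_t
write_bitfield_a (uint8_t rep, uint8_t val)
{
  const uint8_t mask = (1u << 3) - 1;   /* bits occupied by 'a' */
  return (uint8_t) ((rep & ~mask) | (val & mask));
}

If the representative were known to cover only 'a' and padding, the load 
and the '(rep & ~mask)' part could be dropped and the store would reduce 
to writing 'val & mask' directly, which is the optimization described above.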


I am not sure where I should 'document' this change of behavior to 
ifcvt, and/or we should change the name of the pass, since it's doing 
more than if-conversion now?


Bootstrapped and regression tested this patch on aarch64-none-linux-gnu.

gcc/ChangeLog:
2022-08-08  Andre Vieira  

    * tree-if-conv.cc (includes): Add expr.h and langhooks.h to 
list of includes.

    (need_to_lower_bitfields): New static bool.
    (need_to_ifcvt): Likewise.
    (version_loop_for_if_conversion): Adapt to work for bitfield 
lowering-only path.

    (bitfield_data_t): New typedef.
    (get_bitfield_data): New function.
    (lower_bitfield): New function.
    (bitfields_to_lower_p): New function.
    (tree_if_conversion): Change to lower-bitfields too.
    * tree-vect-data-refs.cc (vect_find_stmt_data_reference): 
Modify dump message to be more accurate.

    * tree-vect-patterns.cc (includes): Add gimplify-me.h include.
    (vect_recog_bitfield_ref_pattern): New function.
    (vect_recog_bit_insert_pattern): New function.
    (vect_vect_recog_func_ptrs): Add two new patterns.

gcc/testsuite/ChangeLog:
2022-08-08  Andre Vieira  

    * gcc.dg/vect/vect-bitfield-read-1.c: New test.
    * gcc.dg/vect/vect-bitfield-read-2.c: New test.
    * gcc.dg/vect/vect-bitfield-read-3.c: New test.
    * gcc.dg/vect/vect-bitfield-read-4.c: New test.
    * gcc.dg/vect/vect-bitfield-write-1.c: New test.
    * gcc.dg/vect/vect-bitfield-write-2.c: New test.
    * gcc.dg/vect/vect-bitfield-write-3.c: New test.

Kind regards,
Andre

diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
new file mode 100644
index 
..01cf34fb44484ca926ca5de99eef76dd99b69e92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define N 32
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].i;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
new file mode 100644
index 
..1a4a1579c1478b9407ad21b19e8fbdca9f674b42
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
@@ -0,0 +1,43 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s {
+unsigned i : 31;
+char a : 4;
+};
+
+#define N 32
+#define ELT0 {0x7FFFUL, 0}
+#define ELT1 {0x7FFFUL, 1}
+#define ELT2 {0x7FFFUL, 2}
+#define ELT3 {0x7FFFUL, 3}
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int __attribute__ ((noipa))
+f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].a;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */

Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-08-01 Thread Richard Biener via Gcc-patches
On Mon, 1 Aug 2022, Andre Vieira (lists) wrote:

> 
> On 29/07/2022 11:52, Richard Biener wrote:
> > On Fri, 29 Jul 2022, Jakub Jelinek wrote:
> >
> >> On Fri, Jul 29, 2022 at 09:57:29AM +0100, Andre Vieira (lists) via
> >> Gcc-patches wrote:
> >>> The 'only on the vectorized code path' remains the same though as
> >>> vect_recog
> >>> also only happens on the vectorized code path right?
> >> if conversion (in some cases) duplicates a loop and guards one copy with
> >> an ifn which resolves to true if that particular loop is vectorized and
> >> false otherwise.  So, then changes that shouldn't be done in case of
> >> vectorization failure can be done on the for vectorizer only copy of the
> >> loop.
> > And just to mention, one issue with lowering of bitfield accesses
> > is bitfield inserts which, on some architectures (hello m68k) have
> > instructions operating on memory directly.  For those it's difficult
> > to not regress in code quality if a bitfield store becomes a
> > read-modify-write cycle.  That's one of the things holding this
> > back.  One idea would be to lower to .INSV directly for those targets
> > (but presence of insv isn't necessarily indicating support for
> > memory destinations).
> >
> > Richard.
> Should I account for that when vectorizing though? From what I can tell (no
> TARGET_VECTOR_* hooks implemented) m68k does not have vectorization support.

No.

> So the question is, are there currently any targets that vectorize and have
> vector bitfield-insert/extract support? If they don't exist I suggest we worry
> about it when it comes around, if not just for the fact that we wouldn't be
> able to test it right now.
> 
> If this is about not lowering on the non-vectorized path, see my previous
> reply, I never intended to do that in the vectorizer. I just thought it was
> the plan to do lowering eventually.

Yes, for the vectorized path this all isn't an issue - and btw the
advantage with if-conversion is that you get VN of the result
"for free", the RMW cycle of bitfield stores likely have reads to
share (and also redundant stores in the end, but ...).
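
As a small illustration of that sharing (hypothetical struct; both fields 
live in the same representative byte):

struct two_fields { char a : 3; char b : 3; };

void
set_both (struct two_fields *p)
{
  p->a = 1;   /* lowered: load rep, insert bits of 'a', store rep        */
  p->b = 2;   /* the load here can reuse the value just stored, and the
                 first store becomes redundant once both are lowered     */
}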

And yes, the plan was to do lowering generally.  Just the simplistic
approaches (my last one was a lowering pass somewhen after IPA, IIRC
combined with SRA) run into some issues, like that on m68k, but IIRC
also some others.  So I wouldn't hold my breath, but then just somebody
needs to do the work and think about how to deal with m68k and the
likes...

Richard.


Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-08-01 Thread Andre Vieira (lists) via Gcc-patches



On 29/07/2022 11:52, Richard Biener wrote:

On Fri, 29 Jul 2022, Jakub Jelinek wrote:


On Fri, Jul 29, 2022 at 09:57:29AM +0100, Andre Vieira (lists) via Gcc-patches 
wrote:

The 'only on the vectorized code path' remains the same though as vect_recog
also only happens on the vectorized code path right?

if conversion (in some cases) duplicates a loop and guards one copy with
an ifn which resolves to true if that particular loop is vectorized and
false otherwise.  So, then changes that shouldn't be done in case of
vectorization failure can be done on the for vectorizer only copy of the
loop.

And just to mention, one issue with lowering of bitfield accesses
is bitfield inserts which, on some architectures (hello m68k) have
instructions operating on memory directly.  For those it's difficult
to not regress in code quality if a bitfield store becomes a
read-modify-write cycle.  That's one of the things holding this
back.  One idea would be to lower to .INSV directly for those targets
(but presence of insv isn't necessarily indicating support for
memory destinations).

Richard.
Should I account for that when vectorizing though? From what I can tell 
(no TARGET_VECTOR_* hooks implemented) m68k does not have vectorization 
support. So the question is, are there currently any targets that 
vectorize and have vector bitfield-insert/extract support? If they don't 
exist I suggest we worry about it when it comes around, if not just for 
the fact that we wouldn't be able to test it right now.


If this is about not lowering on the non-vectorized path, see my 
previous reply, I never intended to do that in the vectorizer. I just 
thought it was the plan to do lowering eventually.




Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-08-01 Thread Andre Vieira (lists) via Gcc-patches



On 29/07/2022 11:31, Jakub Jelinek wrote:

On Fri, Jul 29, 2022 at 09:57:29AM +0100, Andre Vieira (lists) via Gcc-patches 
wrote:

The 'only on the vectorized code path' remains the same though as vect_recog
also only happens on the vectorized code path right?

if conversion (in some cases) duplicates a loop and guards one copy with
an ifn which resolves to true if that particular loop is vectorized and
false otherwise.  So, then changes that shouldn't be done in case of
vectorization failure can be done on the for vectorizer only copy of the
loop.

Jakub
I'm pretty sure vect_recog patterns have no effect on scalar codegen if 
the vectorization fails too. The patterns live as new vect_stmt_info's 
and no changes are actually done to the scalar loop. That was the point 
I was trying to make, but it doesn't matter that much, as I said I am 
happy to do this in if convert.




Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-07-29 Thread Richard Biener via Gcc-patches
On Fri, 29 Jul 2022, Jakub Jelinek wrote:

> On Fri, Jul 29, 2022 at 09:57:29AM +0100, Andre Vieira (lists) via 
> Gcc-patches wrote:
> > The 'only on the vectorized code path' remains the same though as vect_recog
> > also only happens on the vectorized code path right?
> 
> if conversion (in some cases) duplicates a loop and guards one copy with
> an ifn which resolves to true if that particular loop is vectorized and
> false otherwise.  So, then changes that shouldn't be done in case of
> vectorization failure can be done on the for vectorizer only copy of the
> loop.

And just to mention, one issue with lowering of bitfield accesses
is bitfield inserts which, on some architectures (hello m68k) have
instructions operating on memory directly.  For those it's difficult
to not regress in code quality if a bitfield store becomes a
read-modify-write cycle.  That's one of the things holding this
back.  One idea would be to lower to .INSV directly for those targets
(but presence of insv isn't necessarily indicating support for
memory destinations).

Richard.


Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-07-29 Thread Jakub Jelinek via Gcc-patches
On Fri, Jul 29, 2022 at 09:57:29AM +0100, Andre Vieira (lists) via Gcc-patches 
wrote:
> The 'only on the vectorized code path' remains the same though as vect_recog
> also only happens on the vectorized code path right?

if conversion (in some cases) duplicates a loop and guards one copy with
an ifn which resolves to true if that particular loop is vectorized and
false otherwise.  So, then changes that shouldn't be done in case of
vectorization failure can be done on the for vectorizer only copy of the
loop.

Jakub



Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-07-29 Thread Richard Biener via Gcc-patches
On Fri, 29 Jul 2022, Andre Vieira (lists) wrote:

> Hi Richard,
> 
> Thanks for the review, I don't completely understand all of the below, so I
> added some extra questions to help me understand :)
> 
> On 27/07/2022 12:37, Richard Biener wrote:
> > On Tue, 26 Jul 2022, Andre Vieira (lists) wrote:
> >
> > I don't think this is a good approach for what you gain and how
> > necessarily limited it will be.  Similar to the recent experiment with
> > handling _Complex loads/stores this is much better tackled by lowering
> > things earlier (it will be lowered at RTL expansion time).
> I assume the approach you are referring to here is the lowering of the
> BIT_FIELD_DECL to BIT_FIELD_REF in the vect_recog part of the vectorizer. I am
> all for lowering earlier, the reason I did it there was as a 'temporary'
> approach until we have that earlier loading.

I understood, but "temporary" things in GCC tend to be still around
10 years later, so ...

> >
> > One place to do this experimentation would be to piggy-back on the
> > if-conversion pass so the lowering would happen only on the
> > vectorized code path.
> This was one of my initial thoughts, though the if-conversion changes are a
> bit more intrusive for a temporary approach and not that much earlier. It does
> however have the added benefit of not having to make any changes to the
> vectorizer itself later if we do do the earlier lowering, assuming the
> lowering results in the same.
> 
> The 'only on the vectorized code path' remains the same though as vect_recog
> also only happens on the vectorized code path right?
> >   Note that the desired lowering would look like
> > the following for reads:
> >
> >_1 = a.b;
> >
> > to
> >
> >_2 = a.<representative for b>;
> >_1 = BIT_FIELD_REF <_2, ...>; // extract bits
> I don't yet have a well formed idea of what '<representative for b>' is
> supposed to look like in terms of tree expressions. I understand what it's
> supposed to be representing, the 'larger than bit-field'-load. But is it going
> to be a COMPONENT_REF with a fake 'FIELD_DECL' with the larger size? Like I
> said on IRC, the description of BIT_FIELD_REF makes it sound like this isn't
> how we are supposed to use it, are we intending to make a change to that here?

<representative for b> is what DECL_BIT_FIELD_REPRESENTATIVE 
(FIELD_DECL-for-b) gives you, it is a "fake" FIELD_DECL for the underlying
storage, conveniently made available to you by the middle-end.  For
your 31bit field it would be simply 'int' typed.

The BIT_FIELD_REF then extracts the actual bitfield from the underlying
storage, but it's now no longer operating on memory but on the register
holding the underlying data.  To the vectorizer we'd probably have to
pattern-match this to shifts & masks and hope for the conversion to
combine with a later one.
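
As a concrete scalar illustration for that 31-bit case (assuming the usual 
little-endian bitfield layout and an arithmetic right shift for signed 
types; names are only for illustration):

#include <stdint.h>

/* 'struct s { int i : 31; }': the representative is a plain 32-bit int,
   and extracting the signed 31-bit field is a sign-extension of the low
   31 bits of the loaded representative.  */
static int32_t
read_i31 (int32_t rep)
{
  return (int32_t) ((uint32_t) rep << 1) >> 1;
}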

> > and for writes:
> >
> >a.b = _1;
> >
> > to
> >
> >_2 = a.<representative for b>;
> >_3 = BIT_INSERT_EXPR <_2, _1, ...>; // insert bits
> >a.<representative for b> = _3;
> I was going to avoid writes for now because they are somewhat more
> complicated, but maybe it's not that bad, I'll add them too.

Only handling loads at start is probably fine as experiment, but
handling stores should be straight forward - of course the
BIT_INSERT_EXPR lowering to shifts & masks will be more
complicated.

> > so you trade now handled loads/stores with not handled
> > BIT_FIELD_REF / BIT_INSERT_EXPR which you would then need to
> > pattern match to shifts and logical ops in the vectorizer.
> Yeah that vect_recog pattern already exists in my RFC patch, though I can
> probably simplify it by moving the bit-field-ref stuff to ifcvt.
> >
> > There's a separate thing of actually promoting all uses, for
> > example
> >
> > struct { long long x : 33; } a;
> >
> >   a.x = a.x + 1;
> >
> > will get you 33bit precision adds (for bit fields less than 32bits
> > they get promoted to int but not for larger bit fields).  RTL
> > expansion again will rewrite this into larger ops plus masking.
> Not sure I understand why this is relevant here? The current way I am doing
> this would likely lower a  bit-field like that to a 64-bit load  followed by
> the masking away of the top 31 bits, same would happen with a ifcvt-lowering
> approach.

Yes, but if there's anything besides loading or storing you will have
a conversion from, say int:31 to int in the IL before any arithmetic.
I've not looked but your patch probably handles conversions to/from
bitfield types by masking / extending.  What I've mentioned with the
33bit example is that with that you can have arithmetic in 33 bits
_without_ intermediate conversions, so you'd have to properly truncate
after every such operation (or choose not to vectorize which I think
is what would happen now).
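
A scalar sketch of that truncation issue for the 33-bit example (assuming an 
unsigned interpretation and a 64-bit representative, purely for illustration):

#include <stdint.h>

static uint64_t
add_one_33bit (uint64_t rep)
{
  const uint64_t mask33 = (1ull << 33) - 1;
  uint64_t field = rep & mask33;      /* extract the 33-bit value        */
  field = (field + 1) & mask33;       /* truncate again after the add    */
  return (rep & ~mask33) | field;     /* insert back into representative */
}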

> >
> > So I think the time is better spent in working on the lowering of
> > bitfield accesses, if sufficiently separated it could be used
> > from if-conversion by working on loop SEME regions.
> I will start to look at modifying ifcvt to add the lowering there. Will likely
> require two pass though 

Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-07-29 Thread Andre Vieira (lists) via Gcc-patches

Hi Richard,

Thanks for the review, I don't completely understand all of the below, 
so I added some extra questions to help me understand :)


On 27/07/2022 12:37, Richard Biener wrote:

On Tue, 26 Jul 2022, Andre Vieira (lists) wrote:

I don't think this is a good approach for what you gain and how
necessarily limited it will be.  Similar to the recent experiment with
handling _Complex loads/stores this is much better tackled by lowering
things earlier (it will be lowered at RTL expansion time).
I assume the approach you are referring to here is the lowering of the 
BIT_FIELD_DECL to BIT_FIELD_REF in the vect_recog part of the 
vectorizer. I am all for lowering earlier, the reason I did it there was 
as a 'temporary' approach until we have that earlier loading.


One place to do this experimentation would be to piggy-back on the
if-conversion pass so the lowering would happen only on the
vectorized code path.
This was one of my initial thoughts, though the if-conversion changes 
are a bit more intrusive for a temporary approach and not that much 
earlier. It does however have the added benefit of not having to make 
any changes to the vectorizer itself later if we do do the earlier 
lowering, assuming the lowering results in the same.


The 'only on the vectorized code path' remains the same though as 
vect_recog also only happens on the vectorized code path right?

  Note that the desired lowering would look like
the following for reads:

   _1 = a.b;

to

   _2 = a.<representative for b>;
   _1 = BIT_FIELD_REF <_2, ...>; // extract bits
I don't yet have a well formed idea of what '<representative for b>' is 
supposed to look like in terms of tree expressions. I understand what 
it's supposed to be representing, the 'larger than bit-field'-load. But 
is it going to be a COMPONENT_REF with a fake 'FIELD_DECL' with the 
larger size? Like I said on IRC, the description of BIT_FIELD_REF makes 
it sound like this isn't how we are supposed to use it, are we intending 
to make a change to that here?



and for writes:

   a.b = _1;

to

   _2 = a.<representative for b>;
   _3 = BIT_INSERT_EXPR <_2, _1, ...>; // insert bits
   a.<representative for b> = _3;
I was going to avoid writes for now because they are somewhat more 
complicated, but maybe it's not that bad, I'll add them too.

so you trade now handled loads/stores with not handled
BIT_FIELD_REF / BIT_INSERT_EXPR which you would then need to
pattern match to shifts and logical ops in the vectorizer.
Yeah that vect_recog pattern already exists in my RFC patch, though I 
can probably simplify it by moving the bit-field-ref stuff to ifcvt.


There's a separate thing of actually promoting all uses, for
example

struct { long long x : 33; } a;

  a.x = a.x + 1;

will get you 33bit precision adds (for bit fields less than 32bits
they get promoted to int but not for larger bit fields).  RTL
expansion again will rewrite this into larger ops plus masking.
Not sure I understand why this is relevant here? The current way I am 
doing this would likely lower a  bit-field like that to a 64-bit load  
followed by the masking away of the top 31 bits, same would happen with 
a ifcvt-lowering approach.


So I think the time is better spent in working on the lowering of
bitfield accesses, if sufficiently separated it could be used
from if-conversion by working on loop SEME regions.
I will start to look at modifying ifcvt to add the lowering there. Will 
likely require two pass though because we can no longer look at the 
number of BBs to determine whether ifcvt is even needed, so we will 
first need to look for bit-field-decls, then version the loops and then 
look for them again for transformation, but I guess that's fine?

The patches
doing previous implementations are probably not too useful anymore
(I find one from 2011 and one from 2016, both pre-dating BIT_INSERT_EXPR)

Richard.


Re: [RFC] Teach vectorizer to deal with bitfield reads

2022-07-27 Thread Richard Biener via Gcc-patches
On Tue, 26 Jul 2022, Andre Vieira (lists) wrote:

> Hi,
> 
> This is a RFC for my prototype for bitfield read vectorization. This patch
> enables bit-field read vectorization by removing the rejection of bit-field
> read's during DR analysis and by adding two vect patterns. The first one
> transforms TREE_COMPONENT's with BIT_FIELD_DECL's into BIT_FIELD_REF's, this
> is a temporary one as I believe there are plans to do this lowering earlier in
> the compiler. To avoid having to wait for that to happen we decided to
> implement this temporary vect pattern.
> The second one looks for conversions of the result of BIT_FIELD_REF's from a
> 'partial' type to a 'full-type' and transforms it into a 'full-type' load
> followed by the necessary shifting and masking.
> 
> The patch is not perfect, one thing I'd like to change for instance is the way
> the 'full-type' load is represented. I currently abuse the fact that the
> vectorizer transforms the original TREE_COMPONENT with a BIT_FIELD_DECL into a
> full-type vector load, because it uses the smallest mode necessary for that
> precision. The reason I do this is because I was struggling to construct a
> MEM_REF that the vectorizer would accept and this for some reason seemed to
> work ... I'd appreciate some pointers on how to do this properly :)
> 
> Another aspect that I haven't paid much attention to yet is costing, I've
> noticed some testcases fail to vectorize due to costing where I think it might
> be wrong, but like I said, I haven't paid much attention to it.
> 
> Finally another aspect I'd eventually like to tackle is the sinking of the
> masking when possible, for instance in bit-field-read-3.c the 'masking' does
> not need to be inside the loop because we are doing bitwise operations. Though
> I suspect we are better off looking at things like this elsewhere, maybe where
> we do codegen for the reduction... Haven't looked at this at all yet.
> 
> Let me know if you believe this is a good approach? I've ran regression tests
> and this hasn't broken anything so far...

I don't think this is a good approach for what you gain and how 
necessarily limited it will be.  Similar to the recent experiment with
handling _Complex loads/stores this is much better tackled by lowering
things earlier (it will be lowered at RTL expansion time).

One place to do this experimentation would be to piggy-back on the
if-conversion pass so the lowering would happen only on the
vectorized code path.  Note that the desired lowering would look like
the following for reads:

  _1 = a.b;

to

  _2 = a.<representative for b>;
  _1 = BIT_FIELD_REF <_2, ...>; // extract bits

and for writes:

  a.b = _1;

to

  _2 = a.<representative for b>;
  _3 = BIT_INSERT_EXPR <_2, _1, ...>; // insert bits
  a.<representative for b> = _3;

so you trade now handled loads/stores with not handled
BIT_FIELD_REF / BIT_INSERT_EXPR which you would then need to
pattern match to shifts and logical ops in the vectorizer.

There's a separate thing of actually promoting all uses, for
example

struct { long long x : 33; } a;

 a.x = a.x + 1;

will get you 33bit precision adds (for bit fields less than 32bits
they get promoted to int but not for larger bit fields).  RTL
expansion again will rewrite this into larger ops plus masking.

So I think the time is better spent in working on the lowering of
bitfield accesses, if sufficiently separated it could be used
from if-conversion by working on loop SEME regions.  The patches
doing previous implementations are probably not too useful anymore
(I find one from 2011 and one from 2016, both pre-dating BIT_INSERT_EXPR)

Richard.


[RFC] Teach vectorizer to deal with bitfield reads

2022-07-26 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This is a RFC for my prototype for bitfield read vectorization. This 
patch enables bit-field read vectorization by removing the rejection of 
bit-field read's during DR analysis and by adding two vect patterns. The 
first one transforms TREE_COMPONENT's with BIT_FIELD_DECL's into 
BIT_FIELD_REF's, this is a temporary one as I believe there are plans to 
do this lowering earlier in the compiler. To avoid having to wait for 
that to happen we decided to implement this temporary vect pattern.
The second one looks for conversions of the result of BIT_FIELD_REF's 
from a 'partial' type to a 'full-type' and transforms it into a 
'full-type' load followed by the necessary shifting and masking.


The patch is not perfect, one thing I'd like to change for instance is 
the way the 'full-type' load is represented. I currently abuse the fact 
that the vectorizer transforms the original TREE_COMPONENT with a 
BIT_FIELD_DECL into a full-type vector load, because it uses the 
smallest mode necessary for that precision. The reason I do this is 
because I was struggling to construct a MEM_REF that the vectorizer 
would accept and this for some reason seemed to work ... I'd appreciate 
some pointers on how to do this properly :)


Another aspect that I haven't paid much attention to yet is costing, 
I've noticed some testcases fail to vectorize due to costing where I 
think it might be wrong, but like I said, I haven't paid much attention 
to it.


Finally another aspect I'd eventually like to tackle is the sinking of 
the masking when possible, for instance in bit-field-read-3.c the 
'masking' does not need to be inside the loop because we are doing 
bitwise operations. Though I suspect we are better off looking at things 
like this elsewhere, maybe where we do codegen for the reduction... 
Haven't looked at this at all yet.
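
To illustrate the mask-sinking idea in scalar terms: when the reduction is a 
bitwise OR of the extracted field, the per-iteration mask can be applied once 
to the accumulated representative after the loop instead (hypothetical 3-bit 
field at bit offset 0, names are only for illustration):

#include <stdint.h>

static uint8_t
or_reduce (const uint8_t *reps, unsigned n)
{
  uint8_t acc = 0;
  for (unsigned i = 0; i < n; ++i)
    acc |= reps[i];              /* no per-iteration masking      */
  return acc & ((1u << 3) - 1);  /* mask once, after the loop     */
}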


Let me know if you believe this is a good approach? I've run regression 
tests and this hasn't broken anything so far...


Kind regards,
Andre

PS: Once we do have lowering of BIT_FIELD_DECL's to BIT_FIELD_REF's 
earlier in the compiler I suspect we will require some further changes 
to the DR analysis part, but that's difficult to test right now.
diff --git a/gcc/testsuite/gcc.dg/vect/bitfield-read-1.c 
b/gcc/testsuite/gcc.dg/vect/bitfield-read-1.c
new file mode 100644
index 
..c30aad365c40474109748bd03c3a5ca1d10723ed
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bitfield-read-1.c
@@ -0,0 +1,38 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s { int i : 31; };
+
+#define ELT0 {0}
+#define ELT1 {1}
+#define ELT2 {2}
+#define ELT3 {3}
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].i;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bitfield-read-2.c 
b/gcc/testsuite/gcc.dg/vect/bitfield-read-2.c
new file mode 100644
index 
..ab82ff347c55e78d098d194d739bcd9d7737f777
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bitfield-read-2.c
@@ -0,0 +1,42 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+extern void abort(void);
+
+struct s {
+unsigned i : 31;
+char a : 4;
+};
+
+#define N 32
+#define ELT0 {0x7FFFUL, 0}
+#define ELT1 {0x7FFFUL, 1}
+#define ELT2 {0x7FFFUL, 2}
+#define ELT3 {0x7FFFUL, 3}
+#define RES 48
+struct s A[N]
+  = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
+  ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
+
+int f(struct s *ptr, unsigned n) {
+int res = 0;
+for (int i = 0; i < n; ++i)
+  res += ptr[i].a;
+return res;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  if (f(&A[0], N) != RES)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bitfield-read-3.c 
b/gcc/testsuite/gcc.dg/vect/bitfield-read-3.c
new file mode 100644
index 
..216611a29fd8bbfbafdbdb79d790e520f44ba672
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bitfield-read-3.c
@@ -0,0 +1,43 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+#include <stdbool.h>
+
+extern void abort(void);
+
+typedef struct {
+int  c;
+int  b;
+bool a : 1;
+} struct_t;
+
+#define N 16
+#define ELT_F { 0x, 0x,