This is a modified version of the previous patch, with the documentation
and read-md.c fixes removed. Those fixes have been submitted separately
and approved.
This patch is designed to address code that was not being vectorized due
to missing widening patterns in the ARM backend, for example:
int t6(int len, void * dummy, short * __restrict x)
{
len = len & ~31;
int result = 0;
__asm volatile ("");
for (int i = 0; i < len; i++)
result += x[i];
return result;
}
Validated on arm-none-eabi, arm-none-linux-gnueabi,
arm-none-linux-gnueabihf, and armeb-none-linux-gnueabihf.
2015-09-22 Michael Collison <michael.colli...@linaro.org>
* config/arm/neon.md (widen_<us>sum<mode>3): New patterns
where mode is VQI to improve mixed mode add vectorization.
--
Michael Collison
Linaro Toolchain Working Group
michael.colli...@linaro.org
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..54623fe 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1174,6 +1174,57 @@
;; Widening operations
+(define_expand "widen_ssum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+        (plus:<V_double_width> (sign_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+        (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    /* Add each half of operand 1, sign-extended, into the accumulator.  */
+    const int half = <V_mode_nunits> / 2;
+    rtvec lo_lanes = rtvec_alloc (half);
+    rtvec hi_lanes = rtvec_alloc (half);
+    rtx lo_sel, hi_sel;
+    int lane;
+    /* Lanes 0 .. half-1 select the low half, half .. nunits-1 the high.  */
+    for (lane = 0; lane < half; lane++)
+      {
+        RTVEC_ELT (lo_lanes, lane) = GEN_INT (lane);
+        RTVEC_ELT (hi_lanes, lane) = GEN_INT (lane + half);
+      }
+    lo_sel = gen_rtx_PARALLEL (GET_MODE (operands[1]), lo_lanes);
+    hi_sel = gen_rtx_PARALLEL (GET_MODE (operands[1]), hi_lanes);
+    /* Both insns below read and write operand 0, so seed it first.  */
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+    emit_insn (gen_vec_sel_widen_ssum_lo<mode><V_half>3 (operands[0], operands[1], lo_sel, operands[0]));
+    emit_insn (gen_vec_sel_widen_ssum_hi<mode><V_half>3 (operands[0], operands[1], hi_sel, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_ssum_lo<VQI:mode><VW:mode>3" ; op0 = op3 + sign_extend (selected half of op1)
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+        (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+        (match_operand:VQI 2 "vect_par_constant_low" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %e1"
+  ;; The template is a single VADDW, so no "length" override is needed.
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "vec_sel_widen_ssum_hi<VQI:mode><VW:mode>3" ; op0 = op3 + sign_extend (selected half of op1)
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+        (plus:<VW:V_widen> (sign_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+        (match_operand:VQI 2 "vect_par_constant_high" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_s_elem>\t%q0, %q3, %f1"
+  ;; The template is a single VADDW, so no "length" override is needed.
+  [(set_attr "type" "neon_add_widen")]
+)
+
(define_insn "widen_ssum<mode>3"
[(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
(plus:<V_widen> (sign_extend:<V_widen>
@@ -1184,4 +1235,55 @@
[(set_attr "type" "neon_add_widen")]
)
+(define_expand "widen_usum<mode>3"
+  [(set (match_operand:<V_double_width> 0 "s_register_operand" "")
+        (plus:<V_double_width> (zero_extend:<V_double_width> (match_operand:VQI 1 "s_register_operand" ""))
+        (match_operand:<V_double_width> 2 "s_register_operand" "")))]
+  "TARGET_NEON"
+  {
+    /* Add each half of operand 1, zero-extended, into the accumulator.  */
+    const int half = <V_mode_nunits> / 2;
+    rtvec lo_lanes = rtvec_alloc (half);
+    rtvec hi_lanes = rtvec_alloc (half);
+    rtx lo_sel, hi_sel;
+    int lane;
+    /* Lanes 0 .. half-1 select the low half, half .. nunits-1 the high.  */
+    for (lane = 0; lane < half; lane++)
+      {
+        RTVEC_ELT (lo_lanes, lane) = GEN_INT (lane);
+        RTVEC_ELT (hi_lanes, lane) = GEN_INT (lane + half);
+      }
+    lo_sel = gen_rtx_PARALLEL (GET_MODE (operands[1]), lo_lanes);
+    hi_sel = gen_rtx_PARALLEL (GET_MODE (operands[1]), hi_lanes);
+    /* Both insns below read and write operand 0, so seed it first.  */
+    if (operands[0] != operands[2])
+      emit_move_insn (operands[0], operands[2]);
+    emit_insn (gen_vec_sel_widen_usum_lo<mode><V_half>3 (operands[0], operands[1], lo_sel, operands[0]));
+    emit_insn (gen_vec_sel_widen_usum_hi<mode><V_half>3 (operands[0], operands[1], hi_sel, operands[0]));
+    DONE;
+  }
+)
+
+(define_insn "vec_sel_widen_usum_lo<VQI:mode><VW:mode>3" ; op0 = op3 + zero_extend (selected half of op1)
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+        (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+        (match_operand:VQI 2 "vect_par_constant_low" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %e1"
+  ;; The template is a single VADDW, so no "length" override is needed.
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "vec_sel_widen_usum_hi<VQI:mode><VW:mode>3" ; op0 = op3 + zero_extend (selected half of op1)
+  [(set (match_operand:<VW:V_widen> 0 "s_register_operand" "=w")
+        (plus:<VW:V_widen> (zero_extend:<VW:V_widen> (vec_select:VW (match_operand:VQI 1 "s_register_operand" "%w")
+        (match_operand:VQI 2 "vect_par_constant_high" "")))
+        (match_operand:<VW:V_widen> 3 "s_register_operand" "0")))]
+  "TARGET_NEON"
+  "vaddw.<V_u_elem>\t%q0, %q3, %f1"
+  ;; The template is a single VADDW, so no "length" override is needed.
+  [(set_attr "type" "neon_add_widen")]
+)
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws16.c b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
new file mode 100644
index 0000000..ed10669
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+/* Sum shorts into an int; the loop should use a widening add (vaddw.s16).  */
+int
+t6 (int len, void *dummy, short *__restrict x)
+{
+  len = len & ~31;  /* Clear the low five bits: a multiple of 32.  */
+  int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\\.s16" } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddws32.c b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
new file mode 100644
index 0000000..94bf0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddws32.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+/* Sum ints into a long long; the loop should use vaddw.s32.  */
+int
+t6 (int len, void *dummy, int *__restrict x)
+{
+  len = len & ~31;  /* Clear the low five bits: a multiple of 32.  */
+  long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\\.s32" } } */
+
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
new file mode 100644
index 0000000..98f8768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu16.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+/* Sum unsigned shorts into an unsigned int; the loop should use vaddw.u16.  */
+int
+t6 (int len, void *dummy, unsigned short *__restrict x)
+{
+  len = len & ~31;  /* Clear the low five bits: a multiple of 32.  */
+  unsigned int result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\\.u16" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
new file mode 100644
index 0000000..2e9af56
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu32.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+/* Sum unsigned ints into an unsigned long long; the loop should use vaddw.u32.  */
+int
+t6 (int len, void *dummy, unsigned int *__restrict x)
+{
+  len = len & ~31;  /* Clear the low five bits: a multiple of 32.  */
+  unsigned long long result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\\.u32" } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
new file mode 100644
index 0000000..de2ad8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vaddwu8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3" } */
+/* { dg-add-options arm_neon } */
+
+/* Sum unsigned chars into an unsigned short; the loop should use vaddw.u8.  */
+int
+t6 (int len, void *dummy, unsigned char *__restrict x)
+{
+  len = len & ~31;  /* Clear the low five bits: a multiple of 32.  */
+  unsigned short result = 0;
+  __asm volatile ("");
+  for (int i = 0; i < len; i++)
+    result += x[i];  /* Explicitly unsigned elements, independent of -fsigned-char.  */
+  return result;
+}
+
+/* { dg-final { scan-assembler "vaddw\\.u8" } } */
+
+
+
--
1.9.1