Hi All, Here is a patch for popcount builtin detection similar to LLVM. I would like to queue this for review for next stage 1.
1. This is done part of loop-distribution and effective for -O3 and above. 2. This does not distribute loop to detect popcount (like memcpy/memmove). I dont think that happens in practice. Please correct me if I am wrong. Bootstrapped and regression tested on aarch64-linux-gnu with no new regressions. Thanks, Kugan gcc/ChangeLog: 2018-01-25 Kugan Vivekanandarajah <kug...@linaro.org> PR middle-end/82479 * tree-loop-distribution.c (handle_popcount): New. (pass_loop_distribution::execute): Use handle_popcount. gcc/testsuite/ChangeLog: 2018-01-25 Kugan Vivekanandarajah <kug...@linaro.org> PR middle-end/82479 * gcc.dg/tree-ssa/popcount.c: New test.
From 9fa09af4b7013c6207e59a4920c82f089bfe45c2 Mon Sep 17 00:00:00 2001 From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> Date: Wed, 24 Jan 2018 08:50:08 +1100 Subject: [PATCH] pocount builtin detection Change-Id: Ic6e175f9cc9a69bd417936a4845c2c046fd446b4 Change-Id: I680eb107445660c60a5d38f5d7300ab1a3243bf5 Change-Id: Ia9f0df89e05520091dc7797195098118768c7ac2 --- gcc/testsuite/gcc.dg/tree-ssa/popcount.c | 41 +++++++++ gcc/tree-loop-distribution.c | 145 +++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/popcount.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c new file mode 100644 index 0000000..86a66cb --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-optimized" } */ + +extern int foo (int); + +int PopCount (long b) { + int c = 0; + b++; + + while (b) { + b &= b - 1; + c++; + } + return c; +} +int PopCount2 (long b) { + int c = 0; + + while (b) { + b &= b - 1; + c++; + } + foo (c); + return foo (c); +} + +void PopCount3 (long b1) { + + for (long i = 0; i < b1; ++i) + { + long b = i; + int c = 0; + while (b) { + b &= b - 1; + c++; + } + foo (c); + } +} + +/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index a3d76e4..1060700 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -1585,6 +1585,148 @@ classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, return; } +/* See if loop is a popcout implementation of the form + + int c = 0; + while (b) { + b = b & (b - 1); + c++; + } + + If so, convert this into c = __builtin_popcount (b) + return true if we did, false otherwise. */ + + +static bool +handle_popcount (loop_p loop) +{ + tree lhs, rhs; + tree dest, src; + gimple *and_minus_one; + int count = 0; + gphi *count_phi; + gimple *fn_call; + gimple *use_stmt; + use_operand_p use_p; + imm_use_iterator iter; + gimple_stmt_iterator gsi; + + /* Check loop terminating branch is like + if (b != 0). */ + gimple *stmt = last_stmt (loop->header); + if (!stmt + || gimple_code (stmt) != GIMPLE_COND + || !zerop (gimple_cond_rhs (stmt))) + return false; + + /* Cheeck "b = b & (b - 1)" is calculated. */ + lhs = gimple_cond_lhs (stmt); + gimple *and_stmt = SSA_NAME_DEF_STMT (lhs); + if (gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR) + return false; + lhs = gimple_assign_rhs1 (and_stmt); + rhs = gimple_assign_rhs2 (and_stmt); + if (TREE_CODE (lhs) == SSA_NAME + && (and_minus_one = SSA_NAME_DEF_STMT (lhs)) + && is_gimple_assign (and_minus_one) + && (gimple_assign_rhs_code (and_minus_one) == PLUS_EXPR) + && integer_minus_onep (gimple_assign_rhs2 (and_minus_one))) + lhs = rhs; + else if (TREE_CODE (rhs) == SSA_NAME + && (and_minus_one = SSA_NAME_DEF_STMT (rhs)) + && is_gimple_assign (and_minus_one) + && (gimple_assign_rhs_code (and_minus_one) == PLUS_EXPR) + && integer_minus_onep (gimple_assign_rhs2 (and_minus_one))) + ; + else + return false; + if ((gimple_assign_rhs1 (and_stmt) != gimple_assign_rhs1 (and_minus_one)) + && (gimple_assign_rhs2 (and_stmt) != gimple_assign_rhs1 (and_minus_one))) + return false; + + /* Check the recurrence. */ + gimple *phi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (and_minus_one)); + gimple *src_phi = SSA_NAME_DEF_STMT (lhs); + if (gimple_code (phi) != GIMPLE_PHI + || gimple_code (src_phi) != GIMPLE_PHI) + return false; + + /* Check the loop closed SSA definition for just the variable c defined in + loop. */ + src = gimple_phi_arg_def (src_phi, loop_preheader_edge (loop)->dest_idx); + basic_block bb = single_exit (loop)->dest; + for (gphi_iterator gpi = gsi_start_phis (bb); + !gsi_end_p (gpi); gsi_next (&gpi)) + { + count_phi = gpi.phi (); + count++; + } + if (count != 1) + return false; + + /* Make sure we have a count by one and it starts from zero. */ + rhs = gimple_phi_arg_def (count_phi, 0); + stmt = SSA_NAME_DEF_STMT (rhs); + if (!is_gimple_assign (stmt) + || (gimple_assign_rhs_code (stmt) != PLUS_EXPR) + || !integer_onep (gimple_assign_rhs2 (stmt))) + return false; + rhs = gimple_assign_rhs1 (stmt); + stmt = SSA_NAME_DEF_STMT (rhs); + if (gimple_code (stmt) != GIMPLE_PHI + || !integer_zerop (gimple_phi_arg_def (stmt, loop_preheader_edge (loop)->dest_idx))) + return false; + + /* Create a var for builtin_popcount result and update the uses. */ + dest = gimple_phi_result (count_phi); + tree var = make_ssa_name (TREE_TYPE (dest), NULL); + FOR_EACH_IMM_USE_STMT (use_stmt, iter, dest) + { + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, var); + } + dest = var; + + /* Remove the loop closed PHI stmt. */ + gsi = gsi_after_labels (gimple_bb (count_phi)); + gphi_iterator gsi_phi = gsi_for_phi (count_phi); + remove_phi_node (&gsi_phi, true); + + /* Insert the builtin. */ + gsi = gsi_after_labels (loop_preheader_edge (loop)->src); + src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_POPCOUNT)); + fn_call = gimple_build_call (fn, 1, src); + gimple_call_set_lhs (fn_call, dest); + gsi_insert_before (&gsi, fn_call, GSI_CONTINUE_LINKING); + + /* In most cases, we will have if (b != 0) preceeding the loop in + the form + if (b != 0) + { + loop; + } + + Since __builtin_popcount (0) is defined, we can remove this condition + by making it always true. */ + + stmt = last_stmt (EDGE_PRED (loop_preheader_edge (loop)->src, 0)->src); + if (stmt + && gimple_code (stmt) == GIMPLE_COND + && (gimple_cond_lhs (stmt) == src) + && zerop (gimple_cond_rhs (stmt))) + { + gimple_cond_set_lhs (as_a <gcond *> (stmt), + build_int_cst (TREE_TYPE (src), 1)); + update_stmt (stmt); + } + + /* Finaly remove the loop. */ + destroy_loop (loop); + return true; +} + /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. For the moment we detect memset, memcpy and memmove patterns. Bitmap STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */ @@ -3032,6 +3174,9 @@ pass_loop_distribution::execute (function *fun) || !optimize_loop_for_speed_p (loop)) continue; + if (handle_popcount (loop)) + continue; + /* Don't distribute loop if niters is unknown. */ tree niters = number_of_latch_executions (loop); if (niters == NULL_TREE || niters == chrec_dont_know) -- 2.7.4