https://gcc.gnu.org/g:7034f85fb6577ac48693f0606b7e33fef41a1b74
commit r17-567-g7034f85fb6577ac48693f0606b7e33fef41a1b74 Author: Kito Cheng <[email protected]> Date: Fri Nov 28 17:58:21 2025 +0800 middle-end: Optimize reversed CRC table-based implementation The previous reversed CRC implementation used explicit bit reflection before and after the CRC computation: reflect(crc_init); reflect(data); for (int i = 0; i < data_bit_size / 8; i++) crc = (crc << 8) ^ table[(crc >> (crc_bit_size - 8)) ^ (data >> (data_bit_size - (i+1) * 8) & 0xFF)]; reflect(crc); This patch generates a reversed polynomial lookup table directly, eliminating the need for bit reflection operations. The new algorithm: for (int i = 0; i < data_bit_size / 8; i++) crc = (crc >> 8) ^ table[(crc ^ (data >> (i * 8))) & 0xFF]; This improves code generation for all targets using table-based reversed CRC, as it removes the overhead of reflecting input data and CRC values. Note on code size: one could imagine sharing a single (non-reversed) table between programs that compute both reversed and non-reversed CRCs in order to save space under -Os. A survey of ~10k Fedora packages (by Mariam and Jeff Law) found no package that uses both flavors in the same binary, so this case is not worth optimizing for. Ref: [1] "Reversing CRC - Theory and Practice" https://sar.informatik.hu-berlin.de/research/publications/SAR-PR-2006-05/SAR-PR-2006-05_.pdf gcc/ChangeLog: * expr.cc (calculate_reversed_crc): New function. (assemble_reversed_crc_table): New function. (generate_reversed_crc_table): New function. (calculate_table_based_reversed_CRC): New function. (expand_reversed_crc_table_based): Remove gen_reflecting_code parameter. Use calculate_table_based_reversed_CRC. * expr.h (expand_reversed_crc_table_based): Update prototype. * builtins.cc (expand_builtin_crc_table_based): Update call. * internal-fn.cc (expand_crc_optab_fn): Update call. * config/aarch64/aarch64.md (crc_rev<ALLI:mode><ALLX:mode>4): Update call. * config/i386/i386.md (crc_rev<SWI124:mode>si4): Update call. * config/loongarch/loongarch.md (crc_rev<mode>si4): Update call. Remove local rbit lambda. * config/riscv/bitmanip.md (crc_rev<ANYI1:mode><ANYI:mode>4): Update call. Remove TARGET_ZBKB case. * config/riscv/riscv.cc (generate_reflecting_code_using_brev): Remove. * config/riscv/riscv-protos.h (generate_reflecting_code_using_brev): Remove declaration. Diff: --- gcc/builtins.cc | 3 +- gcc/config/aarch64/aarch64.md | 3 +- gcc/config/i386/i386.md | 3 +- gcc/config/loongarch/loongarch.md | 17 +--- gcc/config/riscv/bitmanip.md | 13 +-- gcc/config/riscv/riscv-protos.h | 1 - gcc/config/riscv/riscv.cc | 33 -------- gcc/expr.cc | 163 +++++++++++++++++++++++++++++++++----- gcc/expr.h | 3 +- gcc/internal-fn.cc | 3 +- 10 files changed, 151 insertions(+), 91 deletions(-) diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 72310f1c928e..05d5fb13bfce 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -7828,8 +7828,7 @@ expand_builtin_crc_table_based (internal_fn fn, scalar_mode crc_mode, else /* If it's IFN_CRC_REV generate bit-reversed CRC. */ expand_reversed_crc_table_based (target, op1, op2, op3, - data_mode, - generate_reflecting_code_standard); + data_mode); return target; } diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 0ef01e889a6d..f7e2e4be10e0 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4988,8 +4988,7 @@ else /* Otherwise, generate table-based CRC. */ expand_reversed_crc_table_based (operands[0], operands[1], operands[2], - operands[3], <ALLI:MODE>mode, - generate_reflecting_code_standard); + operands[3], <ALLI:MODE>mode); DONE; } ) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 051105d5c1f0..5ed7ea4599cf 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -29899,8 +29899,7 @@ emit_insn (gen_sse4_2_crc32<mode> (operands[0], operands[1], operands[2])); else expand_reversed_crc_table_based (operands[0], operands[1], operands[2], - operands[3], <SWI124:MODE>mode, - generate_reflecting_code_standard); + operands[3], <SWI124:MODE>mode); DONE; }) diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index 32d837d1e2da..297c7e414921 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -4958,23 +4958,8 @@ } else { - /* No CRC instruction is suitable, use the generic table-based - implementation but optimize bit reversion. */ - auto rbit = [](rtx *r) - { - /* Well, this is ugly. The problem is - expand_reversed_crc_table_based only accepts one helper - for reversing data elements and CRC states. */ - auto mode = GET_MODE (*r); - auto rbit = (mode == <MODE>mode ? gen_rbit<mode> : gen_rbitsi); - rtx out = gen_reg_rtx (mode); - - emit_insn (rbit (out, *r)); - *r = out; - }; expand_reversed_crc_table_based (operands[0], operands[1], - msg, operands[3], <MODE>mode, - rbit); + msg, operands[3], <MODE>mode); } DONE; }) diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md index a0ee2943983c..c1a106b50c90 100644 --- a/gcc/config/riscv/bitmanip.md +++ b/gcc/config/riscv/bitmanip.md @@ -1237,25 +1237,16 @@ it is possible to store the quotient within a single variable (E.g. CRC64's quotient may need 65 bits, we can't keep it in 64 bit variable.) - then use clmul instruction to implement the CRC, - otherwise (TARGET_ZBKB) generate table based using brev. */ + then use clmul instruction to implement the CRC. */ if ((TARGET_ZBKC || TARGET_ZBC || TARGET_ZVBC) && <ANYI:MODE>mode < word_mode) expand_reversed_crc_using_clmul (<ANYI:MODE>mode, <ANYI1:MODE>mode, operands); - else if (TARGET_ZBKB) - /* Generate table-based CRC. - To reflect values use brev and bswap instructions. */ - expand_reversed_crc_table_based (operands[0], operands[1], - operands[2], operands[3], - GET_MODE (operands[2]), - generate_reflecting_code_using_brev); else /* Generate table-based CRC. To reflect values use standard reflecting algorithm. */ expand_reversed_crc_table_based (operands[0], operands[1], operands[2], operands[3], - GET_MODE (operands[2]), - generate_reflecting_code_standard); + GET_MODE (operands[2])); DONE; }) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 8b362e323d98..f8eee0fd8448 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -181,7 +181,6 @@ extern bool riscv_reg_frame_related (rtx); extern void riscv_split_sum_of_two_s12 (HOST_WIDE_INT, HOST_WIDE_INT *, HOST_WIDE_INT *); extern bool riscv_vector_float_type_p (const_tree type); -extern void generate_reflecting_code_using_brev (rtx *); extern void expand_crc_using_clmul (scalar_mode, scalar_mode, rtx *); extern void expand_reversed_crc_using_clmul (scalar_mode, scalar_mode, rtx *); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 9a0b5c296d91..2a2193767c7c 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -15052,39 +15052,6 @@ riscv_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, return default_use_by_pieces_infrastructure_p (size, alignment, op, speed_p); } -/* Generate instruction sequence - which reflects the value of the OP using bswap and brev8 instructions. - OP's mode may be less than word_mode, to get the correct number, - after reflecting we shift right the value by SHIFT_VAL. - E.g. we have 1111 0001, after reflection (target 32-bit) we will get - 1000 1111 0000 0000, if we shift-out 16 bits, - we will get the desired one: 1000 1111. */ - -void -generate_reflecting_code_using_brev (rtx *op_p) -{ - rtx op = *op_p; - machine_mode op_mode = GET_MODE (op); - - /* OP may be smaller than a word. We can use a paradoxical subreg - to compensate for that. It should never be larger than a word - for RISC-V. */ - gcc_assert (op_mode <= word_mode); - if (op_mode != word_mode) - op = gen_lowpart (word_mode, op); - - HOST_WIDE_INT shift_val = (BITS_PER_WORD - - GET_MODE_BITSIZE (op_mode).to_constant ()); - riscv_expand_op (BSWAP, word_mode, op, op, op); - riscv_expand_op (LSHIFTRT, word_mode, op, op, - gen_int_mode (shift_val, word_mode)); - if (TARGET_64BIT) - emit_insn (gen_riscv_brev8_di (op, op)); - else - emit_insn (gen_riscv_brev8_si (op, op)); -} - - /* Generate assembly to calculate CRC using clmul instruction. The following code will be generated when the CRC and data sizes are equal: li a4,quotient diff --git a/gcc/expr.cc b/gcc/expr.cc index d59a5463de4c..1c68055d45f8 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -14515,6 +14515,75 @@ generate_crc_table (unsigned HOST_WIDE_INT polynom, unsigned short crc_bits) return assemble_crc_table (polynom, crc_bits); } +/* Calculate CRC for the initial CRC and given POLYNOMIAL. + CRC_BITS is CRC size. */ + +static unsigned HOST_WIDE_INT +calculate_reversed_crc (unsigned HOST_WIDE_INT crc, + unsigned HOST_WIDE_INT polynomial, + unsigned short crc_bits) +{ + unsigned HOST_WIDE_INT rev_polynom = reflect_hwi (polynomial, crc_bits); + for (int j = 0; j < 8; j++) + { + if (crc & 1) + crc = (crc >> 1) ^ rev_polynom; + else + crc >>= 1; + } + /* Zero out bits in crc beyond the specified number of crc_bits. */ + if (crc_bits < sizeof (crc) * CHAR_BIT) + crc &= (HOST_WIDE_INT_1U << crc_bits) - 1; + return crc; +} + +/* Assemble CRC table with 256 elements for the given POLYNOM and CRC_BITS. + POLYNOM is the polynomial used to calculate the CRC table's elements. + CRC_BITS is the size of CRC, may be 8, 16, ... . */ + +static rtx +assemble_reversed_crc_table (unsigned HOST_WIDE_INT polynom, unsigned short crc_bits) +{ + unsigned table_el_n = 0x100; + tree ar = build_array_type (make_unsigned_type (crc_bits), + build_index_type (size_int (table_el_n - 1))); + + /* Initialize the table. */ + vec<tree, va_gc> *initial_values; + vec_alloc (initial_values, table_el_n); + for (size_t i = 0; i < table_el_n; ++i) + { + unsigned HOST_WIDE_INT crc = calculate_reversed_crc (i, polynom, crc_bits); + tree element = build_int_cstu (make_unsigned_type (crc_bits), crc); + vec_safe_push (initial_values, element); + } + tree ctor = build_constructor_from_vec (ar, initial_values); + rtx mem = output_constant_def (ctor, 1); + gcc_assert (MEM_P (mem)); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + ";; emitting reversed crc table crc_%u_polynomial_" + HOST_WIDE_INT_PRINT_HEX " ", + crc_bits, polynom); + print_rtl_single (dump_file, XEXP (mem, 0)); + fprintf (dump_file, "\n"); + } + + return XEXP (mem, 0); +} + +/* Generate reversed CRC table for the given POLYNOM and CRC_BITS. */ + +static rtx +generate_reversed_crc_table (unsigned HOST_WIDE_INT polynom, + unsigned short crc_bits) +{ + gcc_assert (crc_bits <= 64); + + return assemble_reversed_crc_table (polynom, crc_bits); +} + /* Generate table-based CRC code for the given CRC, INPUT_DATA and the POLYNOMIAL (without leading 1). @@ -14594,6 +14663,71 @@ calculate_table_based_CRC (rtx *crc, const rtx &input_data, } } +/* Generate table-based reversed CRC code for the given CRC, INPUT_DATA + and the POLYNOMIAL (without leading 1). + + This function generates code for reversed (bit-reflected) CRC calculation + using a pre-computed lookup table. Unlike the standard CRC calculation, + this processes data from LSB to MSB, eliminating the need for explicit + bit reflection before and after the CRC computation. */ + +static void +calculate_table_based_reversed_CRC (rtx *crc, const rtx &input_data, + const rtx &polynomial, + machine_mode data_mode) +{ + machine_mode mode = GET_MODE (*crc); + unsigned short crc_bit_size = GET_MODE_BITSIZE (mode).to_constant (); + unsigned short data_size = GET_MODE_SIZE (data_mode).to_constant (); + rtx tab = generate_reversed_crc_table (UINTVAL (polynomial), crc_bit_size); + + /* CRC's mode is always at least as wide as INPUT_DATA. Convert + INPUT_DATA into CRC's mode once outside the loop since INPUT_DATA + is loop-invariant. */ + rtx data_in_crc_mode = gen_reg_rtx (mode); + convert_move (data_in_crc_mode, input_data, 1); + + for (unsigned short i = 0; i < data_size; i++) + { + *crc = force_reg (mode, *crc); + + /* data >> (8 * i). */ + unsigned range_8 = 8 * i; + rtx data = expand_shift (RSHIFT_EXPR, mode, data_in_crc_mode, + range_8, NULL_RTX, 1); + + /* crc ^ (data >> (8 * i)). */ + rtx in = expand_binop (mode, xor_optab, *crc, data, + NULL_RTX, 1, OPTAB_WIDEN); + + /* (crc ^ data) & 0xFF. */ + rtx index = expand_and (mode, in, gen_int_mode (255, mode), + NULL_RTX); + int log_crc_size = exact_log2 (GET_MODE_SIZE (mode).to_constant ()); + index = expand_shift (LSHIFT_EXPR, mode, index, + log_crc_size, NULL_RTX, 0); + + rtx addr = gen_reg_rtx (Pmode); + convert_move (addr, index, 1); + addr = expand_binop (Pmode, add_optab, addr, tab, NULL_RTX, + 0, OPTAB_DIRECT); + + /* crc_table[(crc ^ data) & 0xFF]. */ + rtx tab_el = validize_mem (gen_rtx_MEM (mode, addr)); + + /* (crc >> 8) if CRC is larger than 8, otherwise 0. */ + rtx high = NULL_RTX; + if (crc_bit_size != 8) + high = expand_shift (RSHIFT_EXPR, mode, *crc, 8, NULL_RTX, 1); + else + high = gen_int_mode (0, mode); + + /* crc = (crc >> 8) ^ crc_table[(crc ^ data) & 0xFF]. */ + *crc = expand_binop (mode, xor_optab, tab_el, high, NULL_RTX, 1, + OPTAB_WIDEN); + } +} + /* Generate table-based CRC code for the given CRC, INPUT_DATA and the POLYNOMIAL (without leading 1). @@ -14731,41 +14865,30 @@ generate_reflecting_code_standard (rtx *op) the POLYNOMIAL (without leading 1). CRC is OP1, data is OP2 and the polynomial is OP3. - This must generate CRC table and assembly for the following code, + This generates a reversed CRC table and assembly for the following code, where crc_bit_size and data_bit_size may be 8, 16, 32, 64: uint_crc_bit_size_t crc_crc_bit_size (uint_crc_bit_size_t crc_init, - uint_data_bit_size_t data, size_t size) + uint_data_bit_size_t data) { - reflect (crc_init) uint_crc_bit_size_t crc = crc_init; - reflect (data); for (int i = 0; i < data_bit_size / 8; i++) - crc = (crc << 8) ^ crc_table[(crc >> (crc_bit_size - 8)) - ^ (data >> (data_bit_size - (i + 1) * 8) & 0xFF))]; - reflect (crc); + crc = (crc >> 8) ^ crc_table[(crc ^ (data >> (i * 8))) & 0xFF]; return crc; - } */ + } + + This approach uses a pre-computed reversed polynomial table, eliminating + the need for explicit bit reflection before and after the CRC computation. */ void expand_reversed_crc_table_based (rtx op0, rtx op1, rtx op2, rtx op3, - machine_mode data_mode, - void (*gen_reflecting_code) (rtx *op)) + machine_mode data_mode) { gcc_assert (!CONST_INT_P (op0)); gcc_assert (CONST_INT_P (op3)); machine_mode crc_mode = GET_MODE (op0); - rtx crc = gen_reg_rtx (crc_mode); convert_move (crc, op1, 0); - gen_reflecting_code (&crc); - - rtx data = gen_reg_rtx (data_mode); - convert_move (data, op2, 0); - gen_reflecting_code (&data); - - calculate_table_based_CRC (&crc, data, op3, data_mode); - - gen_reflecting_code (&crc); + calculate_table_based_reversed_CRC (&crc, op2, op3, data_mode); convert_move (op0, crc, 0); } diff --git a/gcc/expr.h b/gcc/expr.h index e6502d09ceea..a54984ea2f89 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -404,8 +404,7 @@ gf2n_poly_long_div_quotient (unsigned HOST_WIDE_INT, unsigned short); /* Generate table-based CRC. */ extern void generate_reflecting_code_standard (rtx *); extern void expand_crc_table_based (rtx, rtx, rtx, rtx, machine_mode); -extern void expand_reversed_crc_table_based (rtx, rtx, rtx, rtx, machine_mode, - void (*) (rtx *)); +extern void expand_reversed_crc_table_based (rtx, rtx, rtx, rtx, machine_mode); /* Cache of the "extended" flag in the target's _BitInt description for use during expand. */ diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 65be5aed35ca..cad5c50d83d7 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -4216,8 +4216,7 @@ expand_crc_optab_fn (internal_fn fn, gcall *stmt, convert_optab optab) else /* If it's IFN_CRC_REV generate bit-reversed CRC. */ expand_reversed_crc_table_based (dest, crc, data, polynomial, - TYPE_MODE (data_type), - generate_reflecting_code_standard); + TYPE_MODE (data_type)); /* Now get the return value where it needs to be, taking care to ensure it's promoted appropriately if the ABI demands it.
