On 9 July 2015 at 15:46, Richard Biener <rguent...@suse.de> wrote: > On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote: > >> gcc/ChangeLog >> >> 2015-07-09 Bernhard Reutner-Fischer <al...@gcc.gnu.org> >> >> * builtins.c (fold_builtin_tolower, fold_builtin_toupper): New >> static functions. >> (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER. > > As I read it you fold tolower (X) to (X) >= target_char_set ('A') > && (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') + > target_char_set ('a') : (X); > > I don't think this can be correct for all locales, which need not > have a lower-case character for all upper-case ones, nor do > all letters having one need to be in the range of 'A' to 'Z'. > > Joseph will surely correct me if I am wrong. > > What would work, eventually, is constant folding.
Thinking about it, this is neither tolower_l nor towlower, so it should probably not be concerned about locales at all. Thanks, > > Richard. > >> Signed-off-by: Bernhard Reutner-Fischer <rep.dot....@gmail.com> >> --- >> gcc/builtins.c | 99 >> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 99 insertions(+) >> >> Using the three testcases attached to PR66741 where the -1.c one is using >> builtins >> $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W >> -Wall -Wextra -pedantic -DMAIN -msse4.2;done >> >> pristine (trunk@225368): >> # tolower_strcpy-0 >> >> real 0m6.068s >> user 0m3.204s >> sys 0m2.840s >> # tolower_strcpy-1 >> >> real 0m8.097s >> user 0m5.548s >> sys 0m2.528s >> # tolower_strcpy-2 >> >> real 0m3.568s >> user 0m0.804s >> sys 0m2.748s >> >> trunk@225368 + fold tolower/toupper below >> >> # tolower_strcpy-0 >> >> real 0m6.055s >> user 0m3.212s >> sys 0m2.832s >> # tolower_strcpy-1 >> >> real 0m5.383s >> user 0m2.464s >> sys 0m2.900s >> # tolower_strcpy-2 >> >> real 0m3.605s >> user 0m0.668s >> sys 0m2.924s >> >> The tolower loop now ends up as >> .L5: >> movsbl (%rbx), %edx >> leal 32(%rdx), %ecx >> movl %edx, %eax >> subl $65, %edx >> cmpl $25, %edx >> cmovbe %ecx, %eax >> addq $1, %rbx >> movb %al, -1(%rbx) >> cmpq %rsi, %rbx >> jne .L5 >> >> instead of the former call >> >> .L5: >> movsbl (%rbx), %edi >> addq $1, %rbx >> call tolower >> movb %al, -1(%rbx) >> cmpq %rbp, %rbx >> jne .L5 >> >> Would something like the attached be ok for trunk after proper testing? >> Advice on the questions inline WRT caching lang_hooks intermediate >> results? >> Hints on further steps towards fixing the PR? >> >> I think the next step would be to try to teach graphite to fuse the two >> loops in tolower_strcpy-0.c. Need to look at graphite.. >> Then see how to classify builtins that could be expanded early and what >> breaks if doing so. This sounds like a potential disaster, fun. 
>> Next, see why the vectorizer (or something else) does not pave the way >> to use SSE instruction as the tolower_strcpy-2.c does. >> >> thanks, >> >> diff --git a/gcc/builtins.c b/gcc/builtins.c >> index 5f53342..421c908 100644 >> --- a/gcc/builtins.c >> +++ b/gcc/builtins.c >> @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, >> tree, tree); >> static tree fold_builtin_strspn (location_t, tree, tree); >> static tree fold_builtin_strcspn (location_t, tree, tree); >> >> +static tree fold_builtin_tolower (location_t, tree); >> +static tree fold_builtin_toupper (location_t, tree); >> + >> static rtx expand_builtin_object_size (tree); >> static rtx expand_builtin_memory_chk (tree, rtx, machine_mode, >> enum built_in_function); >> @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree >> arg0) >> case BUILT_IN_ISDIGIT: >> return fold_builtin_isdigit (loc, arg0); >> >> + case BUILT_IN_TOLOWER: >> + return fold_builtin_tolower (loc, arg0); >> + >> + case BUILT_IN_TOUPPER: >> + return fold_builtin_toupper (loc, arg0); >> + >> CASE_FLT_FN (BUILT_IN_FINITE): >> case BUILT_IN_FINITED32: >> case BUILT_IN_FINITED64: >> @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree >> s2) >> } >> } >> >> + >> +/* Simplify a call to the tolower builtin. ARG is the argument to the call. >> + >> + Return NULL_TREE if no simplification was possible, otherwise return the >> + simplified form of the call as a tree. */ >> + >> +static tree >> +fold_builtin_tolower (location_t loc, tree arg) >> +{ >> + if (!validate_arg (arg, INTEGER_TYPE)) >> + return NULL_TREE; >> + >> + /* Transform tolower(c) -> (unsigned)(c) | 0x20. 
>> + >> + More specifically: >> + unsigned tem = arg - 'A'; >> + if (tem <= ('Z' - 'A')) >> + arg += 'a' - 'A'; >> + return arg; >> + */ >> + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); >> + unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z'); >> + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); >> + if (target_A == 0 >> + || target_Z == 0 >> + || target_a == 0) >> + return NULL_TREE; >> + >> + arg = fold_convert_loc (loc, unsigned_type_node, arg); >> + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, >> + build_int_cst (unsigned_type_node, target_A)); >> + /* ??? x19 and x20 would better live in static storage; Think: >> + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; >> + */ >> + unsigned HOST_WIDE_INT x19 = target_Z - target_A; >> + unsigned HOST_WIDE_INT x20 = target_a - target_A; >> + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, >> + build_int_cst (unsigned_type_node, x19)); >> + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, >> + fold_build2 (PLUS_EXPR, unsigned_type_node, arg, >> + build_int_cst (unsigned_type_node, x20)), >> + arg); >> + return fold_convert_loc (loc, integer_type_node, tem); >> +} >> + >> +/* Simplify a call to the toupper builtin. ARG is the argument to the call. >> + >> + Return NULL_TREE if no simplification was possible, otherwise return the >> + simplified form of the call as a tree. */ >> + >> +static tree >> +fold_builtin_toupper (location_t loc, tree arg) >> +{ >> + if (!validate_arg (arg, INTEGER_TYPE)) >> + return NULL_TREE; >> + >> + /* Transform toupper(c) -> (unsigned)(c) ^ 0x20. 
>> + >> + More specifically: >> + unsigned tem = arg - 'a'; >> + if (tem <= ('z' - 'a')) >> + arg -= 'a' - 'A'; >> + return arg; >> + */ >> + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); >> + unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z'); >> + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); >> + if (target_A == 0 >> + || target_z == 0 >> + || target_a == 0) >> + return NULL_TREE; >> + >> + arg = fold_convert_loc (loc, unsigned_type_node, arg); >> + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, >> + build_int_cst (unsigned_type_node, target_a)); >> + /* ??? x19 and x20 would better live in static storage; Think: >> + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; >> + */ >> + unsigned HOST_WIDE_INT x19 = target_z - target_a; >> + unsigned HOST_WIDE_INT x20 = target_a - target_A; >> + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, >> + build_int_cst (unsigned_type_node, x19)); >> + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, >> + fold_build2 (MINUS_EXPR, unsigned_type_node, arg, >> + build_int_cst (unsigned_type_node, x20)), >> + arg); >> + return fold_convert_loc (loc, integer_type_node, tem); >> +} >> + >> + >> /* Fold the next_arg or va_start call EXP. Returns true if there was an >> error >> produced. False otherwise. This is done so that we don't output the >> error >> or warning twice or three times. */ >> > > -- > Richard Biener <rguent...@suse.de> > SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham > Norton, HRB 21284 (AG Nuernberg)