[Bug middle-end/105032] Compiling inline ASM x86 causing GCC stuck in an endless loop with 100% CPU usage

2022-03-23 Thread ammarfaizi2 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105032

--- Comment #6 from Ammar Faizi  ---
(In reply to Jakub Jelinek from comment #4)
> If this is a macro that users should use in arbitrary user code, there is
> another problem, if something is vectorized in the function, either using
> AVX or later or -mstackrealign is used, another register is needed for the
> stack realignment (DRAP register).


I don't really understand the stack realignment part, so I have a question:
which register is the "another register" here? Is it %ebp?

If we have %ebp as a stack frame pointer, can't the compiler just use it for
the realignment?

I am not sure what the DRAP register really means. I searched for it online,
but did not find anything relevant.

[Bug middle-end/105032] Compiling inline ASM x86 causing GCC stuck in an endless loop with 100% CPU usage

2022-03-23 Thread ammarfaizi2 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105032

--- Comment #5 from Ammar Faizi  ---
(In reply to Jakub Jelinek from comment #3)
> This has been hanging or ICEing on and off since forever.
> E.g. even r105000 ICEs, r20 works, r21 ICEs, r10-5912 works, r11-1
> hangs, so does current trunk.
> The first revision after r10-5912 to start hanging was
> r10-6326-gbcf3fa7cf5a3d024b507.
> Note, without optimizations, the inline asm is on or beyond the border what
> can be handled, it uses 6 of the 8 GPRs the arch has, the further two are
> the stack pointer and when not optimizing or if frame pointer is for
> whatever reason needed frame pointer.  The asm also has a memory input.  So,
> it fully depends on optimization (which isn't done with -O0 generally) that
> the address of the
> _arg6 variable can be expressed as offset(%esp) or offset(%ebp).  If it is
> not (and -O0 asks for no optimizations), then there are no registers left
> how to describe the input.

Interestingly, changing the my_syscall6() macro to this one works nicely.

/*
 * my_syscall6() - issue a six-argument system call with `int $0x80`
 * (32-bit x86, GNU C statement expression).
 *
 * num         -> %eax (read/write operand; also carries the result back)
 * arg1..arg5  -> %ebx, %ecx, %edx, %esi, %edi via the "b", "c", "d", "S", "D"
 *                single-register constraints
 * arg6        -> copied to a local and passed as an "m" (memory) operand
 *
 * The asm body pushes the sixth argument, saves %ebp, loads the pushed value
 * into %ebp from 4(%esp), executes `int $0x80`, then restores %ebp and drops
 * the pushed slot again, so %esp and %ebp are unchanged on exit.
 *
 * The value of the expression is whatever is left in %eax after the trap;
 * it is not decoded or translated here.
 */
#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
	long _eax  = (long)(num); \
	long _arg6 = (long)(arg6); /* Always in memory */ \
	__asm__ volatile ( \
		"pushl  %[_arg6]\n\t"        /* stage arg6 on the stack      */ \
		"pushl  %%ebp\n\t"           /* save caller's %ebp           */ \
		"movl   4(%%esp),%%ebp\n\t"  /* %ebp = staged arg6           */ \
		"int    $0x80\n\t"           /* mnemonic and operand must be    \
		                                separated by whitespace      */ \
		"popl   %%ebp\n\t"           /* restore caller's %ebp        */ \
		"addl   $4,%%esp\n\t"        /* discard staged arg6          */ \
		: "+a"(_eax)        /* %eax */ \
		: "b"(arg1),        /* %ebx */ \
		  "c"(arg2),        /* %ecx */ \
		  "d"(arg3),        /* %edx */ \
		  "S"(arg4),        /* %esi */ \
		  "D"(arg5),        /* %edi */ \
		  [_arg6]"m"(_arg6) /* memory */ \
		: "memory", "cc" \
	); \
	_eax; \
})

Link: https://godbolt.org/z/hdsffvr1d

What could possibly be wrong here?
I am not sure what the behavioral difference is between this macro and the
one previously posted.

[Bug c/105032] New: Compiling inline ASM x86 causing GCC stuck in an endless loop with 100% CPU usage

2022-03-22 Thread ammarfaizi2 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105032

Bug ID: 105032
   Summary: Compiling inline ASM x86 causing GCC stuck in an
endless loop with 100% CPU usage
   Product: gcc
   Version: 11.2.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: ammarfaizi2 at gmail dot com
  Target Milestone: ---

Full story:
https://lore.kernel.org/lkml/9cfcb296-9dfe-aef1-4209-20a3a95c5...@gnuweeb.org/

Compiling x86 inline asm causes GCC to get stuck in an endless loop with 100%
CPU usage.

-
ammarfaizi2@integral2:/tmp$ gcc --version
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ammarfaizi2@integral2:/tmp$ gcc -m32 -o test test.c # stuck here...
^C
ammarfaizi2@integral2:/tmp$ 
-



Reproducer:
-
#include 
#include 
#include 

/*
 * my_syscall6() - six-argument system call via `int $0x80` (32-bit x86).
 *
 * NOTE: this is the form from the bug report that makes `gcc -m32` hang;
 * apart from restoring the whitespace in "int $0x80" the operands and
 * constraints are intentionally kept exactly as reported.
 *
 * num and arg1..arg5 are bound to %eax, %ebx, %ecx, %edx, %esi and %edi with
 * explicit register variables and then passed as generic "r" inputs; arg6 is
 * copied to a plain local and passed as an "m" (memory) input.  The result is
 * returned through a separate "=a" output.
 *
 * The asm body pushes arg6, saves %ebp, loads the pushed value into %ebp from
 * 4(%esp), executes `int $0x80`, restores %ebp and pops the staged slot.
 */
#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
	long _ret; \
	register long _num  asm("eax") = (num); \
	register long _arg1 asm("ebx") = (long)(arg1); \
	register long _arg2 asm("ecx") = (long)(arg2); \
	register long _arg3 asm("edx") = (long)(arg3); \
	register long _arg4 asm("esi") = (long)(arg4); \
	register long _arg5 asm("edi") = (long)(arg5); \
	long _arg6 = (long)(arg6); /* Always be in memory */ \
	\
	asm volatile ( \
		"pushl  %[_arg6]\n\t"         /* stage arg6 on the stack */ \
		"pushl  %%ebp\n\t"            /* save caller's %ebp      */ \
		"movl   4(%%esp), %%ebp\n\t"  /* %ebp = staged arg6      */ \
		"int    $0x80\n\t"            /* needs a space before $  */ \
		"popl   %%ebp\n\t"            /* restore caller's %ebp   */ \
		"addl   $4,%%esp\n\t"         /* discard staged arg6     */ \
		: "=a"(_ret) \
		: "r"(_num), "r"(_arg1), "r"(_arg2), "r"(_arg3), \
		  "r"(_arg4),"r"(_arg5), [_arg6]"m"(_arg6) \
		: "memory", "cc" \
	); \
	_ret; \
})


/*
 * Reproducer helper: invoke syscall number __NR_mmap2 through my_syscall6().
 *
 * The offset is shifted right by 12 bits (i.e. divided by 4096) before it is
 * passed as the sixth argument -- presumably because mmap2 expects its offset
 * in 4096-byte units; confirm against the kernel ABI.
 *
 * Returns the raw value produced by my_syscall6() cast to void *; no error
 * decoding is performed here.
 */
static void *__sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
off_t offset)
{
/* Convert the offset to the unit passed to the syscall. */
offset >>= 12;
return (void *)my_syscall6(__NR_mmap2, addr, length, prot, flags, fd,
   offset);
}

/*
 * Minimal driver for the reproducer: makes a single __sys_mmap() call asking
 * for 0x1000 bytes, read/write, anonymous and private, with fd -1 and offset
 * 0.  The result is ignored; the call exists only so that my_syscall6() gets
 * expanded and compiled.
 */
int main(void)
{
__sys_mmap(NULL, 0x1000, PROT_READ|PROT_WRITE,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
return 0;
}

[Bug tree-optimization/100247] New: x86-64 bad register allocation for unsigned type

2021-04-24 Thread ammarfaizi2 at gmail dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100247

Bug ID: 100247
   Summary: x86-64 bad register allocation for unsigned type
   Product: gcc
   Version: 12.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: ammarfaizi2 at gmail dot com
  Target Milestone: ---

GCC 12.0.0 allocates an unnecessary register for the unsigned type. Below is
the reproduction code:

Compile with: `gcc -Wall -Wextra -O3 -fno-tree-vectorize -fno-unroll-loops`

--
#include 

/*
 * Signed variant of the test case: returns the sum of the first `num`
 * elements of arr1 plus the first `num` elements of arr2, accumulated in a
 * `long`.  Returns 0 when num is 0.
 *
 * Kept exactly as reported, since the bug is about the code generated for
 * this source.
 */
long add_arrays(long *arr1, long *arr2, size_t num) {
size_t i = 0;
long sum = 0;

/* Two separate additions per iteration, one per input array. */
for (i = 0; i < num; ++i) {
sum += arr1[i];
sum += arr2[i];
}

return sum;
}

/*
 * Unsigned variant of add_arrays(): same loop, but the elements and the
 * accumulator are `unsigned long`.  Returns 0 when num is 0.
 *
 * This is the function for which the report shows GCC 12 using an extra
 * register (%r8) for `sum`; the source is kept exactly as reported.
 */
unsigned long unsigned_add_arrays(unsigned long *arr1, unsigned long *arr2,
size_t num) {
size_t i = 0;
unsigned long sum = 0;

/* Two separate additions per iteration, one per input array. */
for (i = 0; i < num; ++i) {
sum += arr1[i];
sum += arr2[i];
}

return sum;
}

/*
 * Two-level stringification: XSTR() lets its argument be macro-expanded
 * before PSTR() applies the # operator to it.
 */
#define PSTR(P) #P
#define XSTR(P) PSTR(P)

/*
 * Compiler version string built from the predefined __GNUC__ macros, so the
 * generated assembly records which GCC produced it (see the `.string` lines
 * in the listings).
 */
const char gcc_ver[] = 
"It is GCC "
XSTR(__GNUC__) "."
XSTR(__GNUC_MINOR__) "."
XSTR(__GNUC_PATCHLEVEL__);

--
GCC 12.0.0 Result

add_arrays:
xorl%eax, %eax
testq   %rdx, %rdx
je  .L4
xorl%ecx, %ecx
.L3:
addq(%rdi,%rcx,8), %rax
addq(%rsi,%rcx,8), %rax
addq$1, %rcx
cmpq%rcx, %rdx
jne .L3
ret # Why do we even need this ret?
.L4:
ret

unsigned_add_arrays:
xorl%r8d, %r8d  # Using %r8 is unnecessary
testq   %rdx, %rdx
je  .L7
xorl%eax, %eax
.L9:
movq(%rsi,%rax,8), %rcx
addq(%rdi,%rax,8), %rcx
addq$1, %rax
addq%rcx, %r8   # %r8 is used as `sum` variable
cmpq%rax, %rdx
jne .L9
.L7:
movq%r8, %rax   # set return value to %r8
ret

gcc_ver:
.string "It is GCC 12.0.0"

--
GCC 4.6.4 produces a better result here:

add_arrays:
xorl%eax, %eax
testq   %rdx, %rdx
je  .L2
xorl%ecx, %ecx
.L3:
addq(%rdi,%rcx,8), %rax
addq(%rsi,%rcx,8), %rax
addq$1, %rcx
cmpq%rdx, %rcx
jne .L3
.L2:
rep
ret

unsigned_add_arrays:
xorl%eax, %eax
testq   %rdx, %rdx
je  .L8
xorl%ecx, %ecx
.L9:
addq(%rdi,%rcx,8), %rax
addq(%rsi,%rcx,8), %rax
addq$1, %rcx
cmpq%rdx, %rcx
jne .L9
.L8:
rep
ret

gcc_ver:
.string "It is GCC 4.6.4"
--
Godbolt link: https://godbolt.org/z/9Pj5Ph1Gn