Re: amd64 assembly "fixes" for clang

2016-12-19 Thread Mike Larkin
On Sun, Dec 18, 2016 at 07:31:55PM +0100, Mark Kettenis wrote:
> Generating mixed 16-bit/32-bit/64-bit code with clang's integrated
> assembler is a bit tricky.  It supports the .code16, .code32 and
> .code64 directives.  But it doesn't know about the data16/data32 and
> addr16/addr32 instruction prefixes.  Instead it tries to determine
> those from the instruction opcode.  It mostly succeeds, but there are
> a couple of corner cases where clang will generate the "addr32" form
> where gas generates the "addr16" form in .code16 segments.  That
> should be no problem (and just waste a couple of bytes), but it makes
> comparing the generated code a bit difficult.
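> 
> For instance, take one of the lidtl instructions touched below (a
> minimal fragment; the byte encodings are my reading of the opcode
> tables, not captured assembler output):
> 
> 	.code16
> 	lidtl	clean_idt	/* load IDTR from a 6-byte descriptor */
> 
> clean_idt:
> 	.word	0xffff		/* limit */
> 	.long	0		/* base */
> 
> gas encodes the lidtl as 66 0f 01 1e plus a 16-bit displacement
> (the addr16 form), while clang emits 66 67 0f 01 1d plus a 32-bit
> displacement (the addr32 form): same effect, three extra bytes, and
> the two object files no longer compare equal byte for byte.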
> 
> Here is an attempt to make the trampoline code bits compile with both
> gcc/gas and clang.  For clang I simply #define away the addr32 prefix.
> The data32 prefix can be dropped by using a mnemonic that implies the
> size of the opcode.  I also add a few addr32 prefixes to lidtl
> instructions in .code16 blocks where clang will generate an addr32
> prefix.
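> 
> Concretely, for the lgdt case that means going from the gas-only
> spelling to one that both assemblers accept (sketch; tmp_gdt as in
> the diff below):
> 
> 	/* before: both prefixes spelled out; clang doesn't know them */
> 	data32 addr32 lgdt	tmp_gdt
> 
> 	/* after: the 'l' suffix already implies the 32-bit operand
> 	 * size, making data32 redundant; addr32 stays for gas and
> 	 * expands to nothing under clang, which emits it by itself */
> 	addr32 lgdtl	tmp_gdt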
> 
> My x1 still spins up all CPUs and still suspends and resumes with this
> change.  I have not tried hibernate, but the codepath is pretty
> similar to the normal resume code so it should work out fine.
> 
> ok?
> 

This looks ok to me, if not committed already. If ZZZ is broken we can
work on that after.

-ml

> 
> Index: arch/amd64/amd64/acpi_wakecode.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/acpi_wakecode.S,v
> retrieving revision 1.38
> diff -u -p -r1.38 acpi_wakecode.S
> --- arch/amd64/amd64/acpi_wakecode.S	16 May 2016 01:19:27 -0000	1.38
> +++ arch/amd64/amd64/acpi_wakecode.S	18 Dec 2016 18:06:09 -0000
> @@ -54,6 +54,10 @@
>  #include 
>  #include "lapic.h"
>  
> +#ifdef __clang__
> +#define addr32
> +#endif
> +
>  #define _ACPI_TRMP_LABEL(a) a = . - _C_LABEL(acpi_real_mode_resume) + \
>   ACPI_TRAMPOLINE
>  #define _ACPI_TRMP_OFFSET(a) a = . - _C_LABEL(acpi_real_mode_resume)
> @@ -109,7 +113,7 @@ _ACPI_TRMP_OFFSET(acpi_s3_vector_real)
> 	movw	%ax, %ss
> 	movw	%cs, %ax
> 	movw	%ax, %es
> -	lidtl	clean_idt
> +	addr32 lidtl	clean_idt
>  
>   /*
>* Set up stack to grow down from offset 0x0FFE.
> @@ -140,7 +144,7 @@ _ACPI_TRMP_OFFSET(acpi_s3_vector_real)
>* time, until we restore the saved GDT that we had when we went
>* to sleep.
>*/ 
> -	data32 addr32 lgdt	tmp_gdt
> +	addr32 lgdtl	tmp_gdt
>  
>   /*
>* Enable protected mode by setting the PE bit in CR0
> @@ -385,7 +389,7 @@ _ACPI_TRMP_OFFSET(hibernate_resume_vecto
> 	movw	%ax, %fs
> 	movw	%ax, %gs
> 	movl	$0x0FFE, %esp
> -	lidtl	clean_idt
> +	addr32 lidtl	clean_idt
>  
>   /* Jump to the S3 resume vector */
> 	ljmp	$(_ACPI_RM_CODE_SEG), $acpi_s3_vector_real
> @@ -422,7 +426,7 @@ _ACPI_TRMP_OFFSET(hibernate_resume_vecto
> 	movw	%ax, %gs
> 	movw	%ax, %ss
> 	movl	$0x0FFE, %esp
> -	lidtl	clean_idt
> +	addr32 lidtl	clean_idt
>  
>  _ACPI_TRMP_OFFSET(hib_hlt_real)
>   hlt
> Index: arch/amd64/amd64/mptramp.S
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/mptramp.S,v
> retrieving revision 1.13
> diff -u -p -r1.13 mptramp.S
> --- arch/amd64/amd64/mptramp.S	16 May 2016 01:54:15 -0000	1.13
> +++ arch/amd64/amd64/mptramp.S	18 Dec 2016 18:06:09 -0000
> @@ -84,6 +84,10 @@
>  #include 
>  #include 
>  
> +#ifdef __clang__
> +#define addr32
> +#endif
> +
>  #define _RELOC(x)   ((x) - KERNBASE)
>  #define RELOC(x)	_RELOC(_C_LABEL(x))
>  
> @@ -114,7 +118,7 @@ _C_LABEL(cpu_spinup_trampoline):
> 	movw	%cs, %ax
> 	movw	%ax, %es
> 	movw	%ax, %ss
> - data32 addr32 lgdt (mptramp_gdt32_desc)   # load flat descriptor table
> + addr32 lgdtl (mptramp_gdt32_desc)   # load flat descriptor table
> 	movl	%cr0, %eax	# get cr0
> 	orl	$0x1, %eax	# enable protected mode
> 	movl	%eax, %cr0	# doit
> 


