(This patch is the 64-bit variant of commit e75ee97224e5,
"UefiCpuPkg/PiSmmCpuDxeSmm: remove unneeded DBs from IA32 SmmStartup()".)

The SmmStartup() function executes in SMM, which is very similar to real
mode. Add "BITS 16" before it and "BITS 64" after it (just before the
@LongMode label).

Remove the manual 0x66 operand-size override prefixes, for selecting
32-bit operands -- the sizes of our operands trigger NASM to insert the
prefixes automatically in almost every spot. The one place where we have
to add it back manually is the LGDT instruction. In the LGDT instruction
we also replace the binary 0x2E prefix with the normal NASM syntax for CS
segment override.

The stores to the Control Registers were always 32-bit wide; the source
code only used RAX as source operand because it generated the expected
object code (with NASM compiling the source as if in BITS 64). With BITS
16 added, we can use the actual register width in the source operands
(EAX).

This patch causes NASM to generate byte-identical object code (determined
by disassembling both the pre-patch and post-patch versions, and comparing
the listings), except:

> @@ -231,7 +231,7 @@
>  000001D2  6689D3            mov ebx,edx
>  000001D5  66B800000000      mov eax,0x0
>  000001DB  0F22D8            mov cr3,eax
> -000001DE  662E670F0155F6    o32 lgdt [cs:ebp-0xa]
> +000001DE  2E66670F0155F6    o32 lgdt [cs:ebp-0xa]
>  000001E5  66B800000000      mov eax,0x0
>  000001EB  80CC02            or ah,0x2
>  000001EE  0F22E0            mov cr4,eax

The only difference is the prefix list order; it changes from:

- 0x66, 0x2E, 0x67

to

- 0x2E, 0x66, 0x67

Cc: Eric Dong <eric.d...@intel.com>
Cc: Jiewen Yao <jiewen....@intel.com>
Cc: Liming Gao <liming....@intel.com>
Cc: Michael D Kinney <michael.d.kin...@intel.com>
Cc: Ruiyu Ni <ruiyu...@intel.com>
Ref: https://bugzilla.tianocore.org/show_bug.cgi?id=866
Contributed-under: TianoCore Contribution Agreement 1.1
Signed-off-by: Laszlo Ersek <ler...@redhat.com>
 UefiCpuPkg/PiSmmCpuDxeSmm/X64/SmmInit.nasm | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/UefiCpuPkg/PiSmmCpuDxeSmm/X64/SmmInit.nasm b/UefiCpuPkg/PiSmmCpuDxeSmm/X64/SmmInit.nasm
index b147e7218019..2eaf1433dcd6 100644
--- a/UefiCpuPkg/PiSmmCpuDxeSmm/X64/SmmInit.nasm
+++ b/UefiCpuPkg/PiSmmCpuDxeSmm/X64/SmmInit.nasm
@@ -41,26 +41,23 @@ ASM_PFX(gcSmiInitGdtr):
             DQ      0
 global ASM_PFX(SmmStartup)
+BITS 16
-    DB      0x66
     mov     eax, 0x80000001             ; read capability
-    DB      0x66
     mov     ebx, edx                    ; rdmsr will change edx. keep it in ebx
     DB      0x66, 0xb8                   ; mov eax, imm32
 ASM_PFX(gSmmCr3): DD 0
-    mov     cr3, rax
-    DB      0x66, 0x2e
-    lgdt    [ebp + (ASM_PFX(gcSmiInitGdtr) - ASM_PFX(SmmStartup))]
+    mov     cr3, eax
+o32 lgdt    [cs:ebp + (ASM_PFX(gcSmiInitGdtr) - ASM_PFX(SmmStartup))]
     DB      0x66, 0xb8                   ; mov eax, imm32
 ASM_PFX(gSmmCr4): DD 0
     or      ah,  2                      ; enable XMM registers access
-    mov     cr4, rax
-    DB      0x66
+    mov     cr4, eax
     mov     ecx, 0xc0000080             ; IA32_EFER MSR
     or      ah, BIT0                    ; set LME bit
-    DB      0x66
     test    ebx, BIT20                  ; check NXE capability
     jz      .1
     or      ah, BIT3                    ; set NXE bit
@@ -68,9 +65,11 @@ ASM_PFX(gSmmCr4): DD 0
     DB      0x66, 0xb8                   ; mov eax, imm32
 ASM_PFX(gSmmCr0): DD 0
-    mov     cr0, rax                    ; enable protected mode & paging
+    mov     cr0, eax                    ; enable protected mode & paging
     DB      0x66, 0xea                   ; far jmp to long mode
 ASM_PFX(gSmmJmpAddr): DQ 0;@LongMode
+BITS 64
 @LongMode:                              ; long-mode starts here
     DB      0x48, 0xbc                   ; mov rsp, imm64
 ASM_PFX(gSmmInitStack): DQ 0

