Index: compiler/aoptbase.pas
===================================================================
--- compiler/aoptbase.pas	(revision 39235)
+++ compiler/aoptbase.pas	(working copy)
@@ -42,6 +42,15 @@
     { assembler optimizer objects require                              }
     Type
       TAoptBase = class
+        { Flag set to indicate that further optimisations might be possible if the
+          whole code block is scanned again.  Not always used.
+
+          This is different from an individual optimisation method returning
+          True: that return value only instructs the optimiser to evaluate the
+          current instruction again (it may have changed opcodes, for example),
+          whereas this flag requests another scan of the code block. [Kit] }
+        ReoptimizeFlag: Boolean;
+
         { processor independent methods }
 
         constructor create; virtual;
@@ -117,6 +126,7 @@
   constructor taoptbase.create;
     begin
       inherited create;
+      ReoptimizeFlag := False;
     end;
 
 
Index: compiler/aoptobj.pas
===================================================================
--- compiler/aoptobj.pas	(revision 39235)
+++ compiler/aoptobj.pas	(working copy)
@@ -1687,19 +1687,24 @@
       var
         p: tai;
       begin
-        p := BlockStart;
-        ClearUsedRegs;
-        while (p <> BlockEnd) Do
-          begin
-            UpdateUsedRegs(tai(p.next));
-            if PostPeepHoleOptsCpu(p) then
-              continue;
-            if assigned(p) then
-              begin
-                UpdateUsedRegs(p);
-                p:=tai(p.next);
-              end;
-          end;
+        repeat
+          ReoptimizeFlag := False;
+          p := BlockStart;
+          ClearUsedRegs;
+          while (p <> BlockEnd) Do
+            begin
+              UpdateUsedRegs(tai(p.next));
+              if PostPeepHoleOptsCpu(p) then
+                continue;
+              if assigned(p) then
+                begin
+                  UpdateUsedRegs(p);
+                  p:=tai(p.next);
+                end;
+            end;
+
+        until not ReoptimizeFlag or
+          not ((cs_opt_level3 in current_settings.optimizerswitches) or (cs_opt_level4 in current_settings.optimizerswitches));
       end;
 
 
Index: compiler/i386/aoptcpu.pas
===================================================================
--- compiler/i386/aoptcpu.pas	(revision 39235)
+++ compiler/i386/aoptcpu.pas	(working copy)
@@ -1037,88 +1037,93 @@
 var
   p,hp1,hp2: tai;
 begin
-  p := BlockStart;
-  ClearUsedRegs;
-  while (p <> BlockEnd) Do
-    begin
-      UpdateUsedRegs(UsedRegs, tai(p.next));
-      case p.Typ Of
-        Ait_Instruction:
-          begin
-            if InsContainsSegRef(taicpu(p)) then
-              begin
-                p := tai(p.next);
-                continue;
+  repeat
+    ReoptimizeFlag := False;
+    p := BlockStart;
+    ClearUsedRegs;
+    while (p <> BlockEnd) Do
+      begin
+        UpdateUsedRegs(UsedRegs, tai(p.next));
+        case p.Typ Of
+          Ait_Instruction:
+            begin
+              if InsContainsSegRef(taicpu(p)) then
+                begin
+                  p := tai(p.next);
+                  continue;
+                end;
+              case taicpu(p).opcode Of
+                A_CALL:
+                  if PostPeepHoleOptCall(p) then
+                    Continue;
+                A_LEA:
+                  if PostPeepholeOptLea(p) then
+                    Continue;
+                A_CMP:
+                  if PostPeepholeOptCmp(p) then
+                    Continue;
+                A_MOV:
+                  if PostPeepholeOptMov(p) then
+                    Continue;
+                A_MOVZX:
+                  { if register vars are on, it's possible there is code like }
+                  {   "cmpl $3,%eax; movzbl 8(%ebp),%ebx; je .Lxxx"           }
+                  { so we can't safely replace the movzx then with xor/mov,   }
+                  { since that would change the flags (JM)                    }
+                  if not(cs_opt_regvar in current_settings.optimizerswitches) then
+                   begin
+                    if (taicpu(p).oper[1]^.typ = top_reg) then
+                      if (taicpu(p).oper[0]^.typ = top_reg)
+                        then
+                          case taicpu(p).opsize of
+                            S_BL:
+                              begin
+                                if IsGP32Reg(taicpu(p).oper[1]^.reg) and
+                                   not(cs_opt_size in current_settings.optimizerswitches) and
+                                   (current_settings.optimizecputype = cpu_Pentium) then
+                                    {Change "movzbl %reg1, %reg2" to
+                                     "xorl %reg2, %reg2; movb %reg1, %reg2" for Pentium and
+                                     PentiumMMX}
+                                  begin
+                                    hp1 := taicpu.op_reg_reg(A_XOR, S_L,
+                                                taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
+                                    InsertLLItem(p.previous, p, hp1);
+                                    taicpu(p).opcode := A_MOV;
+                                    taicpu(p).changeopsize(S_B);
+                                    setsubreg(taicpu(p).oper[1]^.reg,R_SUBL);
+                                  end;
+                              end;
+                          end
+                        else if (taicpu(p).oper[0]^.typ = top_ref) and
+                            (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
+                            (taicpu(p).oper[0]^.ref^.index <> taicpu(p).oper[1]^.reg) and
+                            not(cs_opt_size in current_settings.optimizerswitches) and
+                            IsGP32Reg(taicpu(p).oper[1]^.reg) and
+                            (current_settings.optimizecputype = cpu_Pentium) and
+                            (taicpu(p).opsize = S_BL) then
+                          {changes "movzbl mem, %reg" to "xorl %reg, %reg; movb mem, %reg8" for
+                            Pentium and PentiumMMX}
+                          begin
+                            hp1 := taicpu.Op_reg_reg(A_XOR, S_L, taicpu(p).oper[1]^.reg,
+                                        taicpu(p).oper[1]^.reg);
+                            taicpu(p).opcode := A_MOV;
+                            taicpu(p).changeopsize(S_B);
+                            setsubreg(taicpu(p).oper[1]^.reg,R_SUBL);
+                            InsertLLItem(p.previous, p, hp1);
+                          end;
+                   end;
+                A_TEST, A_OR:
+                  if PostPeepholeOptTestOr(p) then
+                    Continue;
               end;
-            case taicpu(p).opcode Of
-              A_CALL:
-                if PostPeepHoleOptCall(p) then
-                  Continue;
-              A_LEA:
-                if PostPeepholeOptLea(p) then
-                  Continue;
-              A_CMP:
-                if PostPeepholeOptCmp(p) then
-                  Continue;
-              A_MOV:
-                if PostPeepholeOptMov(p) then
-                  Continue;
-              A_MOVZX:
-                { if register vars are on, it's possible there is code like }
-                {   "cmpl $3,%eax; movzbl 8(%ebp),%ebx; je .Lxxx"           }
-                { so we can't safely replace the movzx then with xor/mov,   }
-                { since that would change the flags (JM)                    }
-                if not(cs_opt_regvar in current_settings.optimizerswitches) then
-                 begin
-                  if (taicpu(p).oper[1]^.typ = top_reg) then
-                    if (taicpu(p).oper[0]^.typ = top_reg)
-                      then
-                        case taicpu(p).opsize of
-                          S_BL:
-                            begin
-                              if IsGP32Reg(taicpu(p).oper[1]^.reg) and
-                                 not(cs_opt_size in current_settings.optimizerswitches) and
-                                 (current_settings.optimizecputype = cpu_Pentium) then
-                                  {Change "movzbl %reg1, %reg2" to
-                                   "xorl %reg2, %reg2; movb %reg1, %reg2" for Pentium and
-                                   PentiumMMX}
-                                begin
-                                  hp1 := taicpu.op_reg_reg(A_XOR, S_L,
-                                              taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
-                                  InsertLLItem(p.previous, p, hp1);
-                                  taicpu(p).opcode := A_MOV;
-                                  taicpu(p).changeopsize(S_B);
-                                  setsubreg(taicpu(p).oper[1]^.reg,R_SUBL);
-                                end;
-                            end;
-                        end
-                      else if (taicpu(p).oper[0]^.typ = top_ref) and
-                          (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
-                          (taicpu(p).oper[0]^.ref^.index <> taicpu(p).oper[1]^.reg) and
-                          not(cs_opt_size in current_settings.optimizerswitches) and
-                          IsGP32Reg(taicpu(p).oper[1]^.reg) and
-                          (current_settings.optimizecputype = cpu_Pentium) and
-                          (taicpu(p).opsize = S_BL) then
-                        {changes "movzbl mem, %reg" to "xorl %reg, %reg; movb mem, %reg8" for
-                          Pentium and PentiumMMX}
-                        begin
-                          hp1 := taicpu.Op_reg_reg(A_XOR, S_L, taicpu(p).oper[1]^.reg,
-                                      taicpu(p).oper[1]^.reg);
-                          taicpu(p).opcode := A_MOV;
-                          taicpu(p).changeopsize(S_B);
-                          setsubreg(taicpu(p).oper[1]^.reg,R_SUBL);
-                          InsertLLItem(p.previous, p, hp1);
-                        end;
-                 end;
-              A_TEST, A_OR:
-                if PostPeepholeOptTestOr(p) then
-                  Continue;
             end;
-          end;
+        end;
+        p := tai(p.next)
       end;
-      p := tai(p.next)
-    end;
-  OptReferences;
+    OptReferences;
+
+  until not ReoptimizeFlag or
+    not ((cs_opt_level3 in current_settings.optimizerswitches) or (cs_opt_level4 in current_settings.optimizerswitches));
 end;
 
 
Index: compiler/x86/aoptx86.pas
===================================================================
--- compiler/x86/aoptx86.pas	(revision 39235)
+++ compiler/x86/aoptx86.pas	(working copy)
@@ -35,11 +35,17 @@
       aopt,aoptobj;
 
     type
+
+      { TX86AsmOptimizer }
+
       TX86AsmOptimizer = class(TAsmOptimizer)
         function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
         function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
         function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
+        function RegModifiedByInstruction(reg : TRegister; hp : tai) : boolean; override;
+        function RegTouchedByInstruction(reg : TRegister; hp : tai) : boolean;
       protected
+
         { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
         function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
         { checks whether reading the value in reg1 depends on the value of reg2. This
@@ -107,6 +113,7 @@
 
   const
     SPeepholeOptimization: string = 'Peephole Optimization: ';
+    SDeepOptimization: string = 'Deep Optimization: ';
 
   implementation
 
@@ -118,7 +125,7 @@
       aasmbase,
       aoptutils,
       symconst,symsym,
-      cgx86,
+      cgx86,aoptbase,
       itcpugas;
 
     function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
@@ -308,6 +315,9 @@
                  )
                );
           end;
+        A_NOP:
+          { Null operation, so no! }
+          Exit;
         else
           begin
             if (p.opcode=A_LEA) and is_segment_reg(reg) then
@@ -339,49 +349,49 @@
                   begin
                     case getsupreg(reg) of
                       RS_EAX:
-                        if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
+                        if [Ch_REAX,Ch_RWEAX,Ch_MEAX,Ch_RRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_ECX:
-                        if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
+                        if [Ch_RECX,Ch_RWECX,Ch_MECX,Ch_RRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_EDX:
-                        if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
+                        if [Ch_REDX,Ch_RWEDX,Ch_MEDX,Ch_RRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_EBX:
-                        if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
+                        if [Ch_REBX,Ch_RWEBX,Ch_MEBX,Ch_RRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_ESP:
-                        if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
+                        if [Ch_RESP,Ch_RWESP,Ch_MESP,Ch_RRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_EBP:
-                        if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
+                        if [Ch_REBP,Ch_RWEBP,Ch_MEBP,Ch_RRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_ESI:
-                        if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
+                        if [Ch_RESI,Ch_RWESI,Ch_MESI,Ch_RRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
                           end;
                       RS_EDI:
-                        if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
+                        if [Ch_REDI,Ch_RWEDI,Ch_MEDI,Ch_RRDI,Ch_RWRDI,Ch_MRDI,Ch_RMemEDI,Ch_WMemEDI]*Ch<>[] then
                           begin
                             RegReadByInstruction := true;
                             exit
@@ -479,16 +489,475 @@
       end;
     end;
 
+    { Returns True when executing instruction hp may change the contents of reg.
+      Entries that are not instructions never modify a register, and a register
+      that only appears inside a memory reference is read, not written. }
+    function TX86AsmOptimizer.RegModifiedByInstruction(reg: TRegister; hp: tai): boolean;
+    var
+      p: taicpu;
+    begin
+      RegModifiedByInstruction := false;
+      if hp.typ <> ait_instruction then
+        exit;
+      p := taicpu(hp);
+      case p.opcode of
+        A_CALL:
+          { A call is reported as modifying every register }
+          RegModifiedByInstruction := true;
+        A_IMUL:
+          case p.ops of
+            1:
+              { One-operand form: implicit destination, same test as A_MUL below }
+              RegModifiedByInstruction :=
+                 (
+                  (getregtype(reg)=R_INTREGISTER) and
+                  ((getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B)))
+                 );
+            2,3:
+              { The destination is the LAST operand: oper[1] for "imul src,dst"
+                but oper[2] for "imul imm,src,dst", where oper[1] is only a source }
+              RegModifiedByInstruction := (p.oper[p.ops-1]^.typ=top_reg) and
+                reginop(reg,p.oper[p.ops-1]^);
+          end;
+        A_MUL,A_IDIV,A_DIV:
+          { Implicit destination: the EAX super-register, plus EDX unless the
+            operation is byte-sized.
+            NOTE(review): these special cases only test integer registers, so flag
+            changes made by the multiply/divide family appear not to be reported. }
+          RegModifiedByInstruction :=
+            (
+              (getregtype(reg)=R_INTREGISTER) and
+              ((getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B)))
+            );
+        A_NOP:
+          { Null operation, so no! }
+          Exit;
+        else
+          begin
+            { special handling for SSE MOVSD }
+            if (p.opcode=A_MOVSD) and (p.ops>0) then
+              begin
+                if p.ops<>2 then
+                  internalerror(2018060140);
+                RegModifiedByInstruction :=
+                  (p.oper[1]^.typ=top_reg) and reginop(reg, p.oper[1]^);
+                exit;
+              end;
 
+            with insprop[p.opcode] do
+              begin
+                if getregtype(reg)=R_INTREGISTER then
+                  begin
+                    case getsupreg(reg) of
+                      RS_EAX:
+                        if [Ch_WEAX,Ch_RWEAX,Ch_MEAX,Ch_WRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_ECX:
+                        if [Ch_WECX,Ch_RWECX,Ch_MECX,Ch_WRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_EDX:
+                        if [Ch_WEDX,Ch_RWEDX,Ch_MEDX,Ch_WRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_EBX:
+                        if [Ch_WEBX,Ch_RWEBX,Ch_MEBX,Ch_WRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_ESP:
+                        if [Ch_WESP,Ch_RWESP,Ch_MESP,Ch_WRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_EBP:
+                        if [Ch_WEBP,Ch_RWEBP,Ch_MEBP,Ch_WRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_ESI:
+                        if [Ch_WESI,Ch_RWESI,Ch_MESI,Ch_WRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                      RS_EDI:
+                        if [Ch_WEDI,Ch_RWEDI,Ch_MEDI,Ch_WRDI,Ch_RWRDI,Ch_MRDI]*Ch<>[] then
+                          begin
+                            RegModifiedByInstruction := true;
+                            exit
+                          end;
+                    end;
+                  end;
+                if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
+                  begin
+                    case getsubreg(reg) of
+                      R_SUBW,R_SUBD,R_SUBQ:
+                        RegModifiedByInstruction :=
+                          [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
+                           Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
+                           Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGCARRY:
+                        RegModifiedByInstruction:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGPARITY:
+                        RegModifiedByInstruction:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGAUXILIARY:
+                        RegModifiedByInstruction:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGZERO:
+                        RegModifiedByInstruction:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGSIGN:
+                        RegModifiedByInstruction:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGOVERFLOW:
+                        RegModifiedByInstruction:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGINTERRUPT:
+                        RegModifiedByInstruction:=[Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      R_SUBFLAGDIRECTION:
+                        RegModifiedByInstruction:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
+                      else
+                        internalerror(2018060110);
+                    end;
+                    exit;
+                  end;
+
+                if (p.ops = 2) and (p.oper[0]^.typ = top_reg) and (p.oper[1]^.typ = top_reg) and
+                   SuperRegistersEqual(reg, p.oper[0]^.reg) and (p.oper[0]^.reg = p.oper[1]^.reg) then
+                  begin
+                    { Check for cases where a write may be a null operation }
+                    case p.opcode of
+                      A_XCHG: { Always false if the registers are the same (even if they're not equal to reg) }
+                        Exit;
+                      A_OR, A_AND, A_MOV:
+                        begin
+                          { Only a no-op for certain at S_Q: a 32-bit write zeroes the upper
+                            half of the register; other sizes are treated as modifying too. }
+                          RegModifiedByInstruction := (p.opsize <> S_Q);
+                          Exit;
+                        end;
+                    end;
+                  end;
+
+                { NOTE: Do not use "reginop(reg,p.oper[#]^)" as RegReadByInstruction does, because a register
+                   appearing in a memory address counts as a read, but never as a write. [Kit] }
+                if ([CH_RWOP1,CH_WOP1,CH_MOP1]*Ch<>[]) and (p.oper[0]^.typ = top_reg) and SuperRegistersEqual(reg,p.oper[0]^.reg) then
+                  begin
+                    RegModifiedByInstruction := true;
+                    exit
+                  end;
+                if ([Ch_RWOP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and (p.oper[1]^.typ = top_reg) and SuperRegistersEqual(reg,p.oper[1]^.reg) then
+                  begin
+                    RegModifiedByInstruction := true;
+                    exit
+                  end;
+                if ([Ch_RWOP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and (p.oper[2]^.typ = top_reg) and SuperRegistersEqual(reg,p.oper[2]^.reg) then
+                  begin
+                    RegModifiedByInstruction := true;
+                    exit
+                  end;
+                if ([Ch_RWOP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and (p.oper[3]^.typ = top_reg) and SuperRegistersEqual(reg,p.oper[3]^.reg) then
+                  begin
+                    RegModifiedByInstruction := true;
+                    exit
+                  end;
+              end;
+          end;
+      end;
+    end;
+
+    { Returns true if the instruction either reads from or writes to the register }
+    function TX86AsmOptimizer.RegTouchedByInstruction(reg: TRegister; hp: tai): boolean;
+      var
+        p: taicpu;
+        opcount: longint;
+      begin
+        RegTouchedByInstruction := false;
+        if hp.typ <> ait_instruction then
+          exit;
+        p := taicpu(hp);
+        case p.opcode of
+          A_CALL:
+            RegTouchedByInstruction := true;
+          A_IMUL:
+            case p.ops of
+              1:
+                RegTouchedByInstruction := RegInOp(reg,p.oper[0]^) or
+                   (
+                    ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
+                    ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
+                   );
+              2,3:
+                RegTouchedByInstruction :=
+                  reginop(reg,p.oper[0]^) or
+                  reginop(reg,p.oper[1]^);
+            end;
+          A_MUL:
+            begin
+              RegTouchedByInstruction := RegInOp(reg,p.oper[0]^) or
+                 (
+                  ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
+                  ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
+                 );
+            end;
+          A_IDIV,A_DIV:
+            begin
+              RegTouchedByInstruction := RegInOp(reg,p.oper[0]^) or
+                 (
+                   (getregtype(reg)=R_INTREGISTER) and
+                   (
+                     (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
+                   )
+                 );
+            end;
+          A_NOP:
+            { Null operation, so no! }
+            Exit;
+          else
+            begin
+              if (p.opcode=A_LEA) and is_segment_reg(reg) then
+                begin
+                  RegTouchedByInstruction := false;
+                  exit;
+                end;
+              for opcount := 0 to p.ops-1 do
+                if (p.oper[opCount]^.typ = top_ref) and
+                   RegInRef(reg,p.oper[opcount]^.ref^) then
+                  begin
+                    RegTouchedByInstruction := true;
+                    exit
+                  end;
+              { special handling for SSE MOVSD }
+              if (p.opcode=A_MOVSD) and (p.ops>0) then
+                begin
+                  if p.ops<>2 then
+                    internalerror(2017042702);
+                  RegTouchedByInstruction := reginop(reg,p.oper[0]^) or
+                    ((p.oper[1]^.typ=top_reg) and reginop(reg, p.oper[1]^));
+                  exit;
+                end;
+              with insprop[p.opcode] do
+                begin
+                  if getregtype(reg)=R_INTREGISTER then
+                    begin
+                      case getsupreg(reg) of
+                        RS_EAX:
+                          if [Ch_REAX,Ch_RWEAX,Ch_WEAX,Ch_MEAX,Ch_RRAX,Ch_RWRAX,Ch_MRAX,Ch_WRAX]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_ECX:
+                          if [Ch_RECX,Ch_RWECX,Ch_WECX,Ch_MECX,Ch_RRCX,Ch_RWRCX,Ch_MRCX,Ch_WRCX]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_EDX:
+                          if [Ch_REDX,Ch_RWEDX,Ch_WEDX,Ch_MEDX,Ch_RRDX,Ch_RWRDX,Ch_MRDX,Ch_WRDX]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_EBX:
+                          if [Ch_REBX,Ch_RWEBX,Ch_WEBX,Ch_MEBX,Ch_RRBX,Ch_RWRBX,Ch_MRBX,Ch_WRBX]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_ESP:
+                          if [Ch_RESP,Ch_RWESP,Ch_WESP,Ch_MESP,Ch_RRSP,Ch_RWRSP,Ch_MRSP,Ch_WRSP]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_EBP:
+                          if [Ch_REBP,Ch_RWEBP,Ch_WEBP,Ch_MEBP,Ch_RRBP,Ch_RWRBP,Ch_MRBP,Ch_WRBP]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_ESI:
+                          if [Ch_RESI,Ch_RWESI,Ch_WESI,Ch_MESI,Ch_RRSI,Ch_RWRSI,Ch_MRSI,Ch_WRSI]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                        RS_EDI:
+                          if [Ch_REDI,Ch_RWEDI,Ch_WEDI,Ch_MEDI,Ch_RRDI,Ch_RWRDI,Ch_MRDI,Ch_WRDI,Ch_RMemEDI,Ch_WMemEDI]*Ch<>[] then
+                            begin
+                              RegTouchedByInstruction := true;
+                              exit
+                            end;
+                      end;
+                    end;
+                  if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
+                    begin
+                      if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
+                        begin
+                          case p.condition of
+                            C_A,C_NBE,       { CF=0 and ZF=0  }
+                            C_BE,C_NA:       { CF=1 or ZF=1   }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
+                            C_AE,C_NB,C_NC,  { CF=0           }
+                            C_B,C_NAE,C_C:   { CF=1           }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
+                            C_NE,C_NZ,       { ZF=0           }
+                            C_E,C_Z:         { ZF=1           }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
+                            C_G,C_NLE,       { ZF=0 and SF=OF }
+                            C_LE,C_NG:       { ZF=1 or SF<>OF }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
+                            C_GE,C_NL,       { SF=OF          }
+                            C_L,C_NGE:       { SF<>OF         }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
+                            C_NO,            { OF=0           }
+                            C_O:             { OF=1           }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
+                            C_NP,C_PO,       { PF=0           }
+                            C_P,C_PE:        { PF=1           }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
+                            C_NS,            { SF=0           }
+                            C_S:             { SF=1           }
+                              RegTouchedByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
+                            else
+                              internalerror(2018060120);
+                          end;
+                          if RegTouchedByInstruction then
+                            exit;
+                        end;
+                      case getsubreg(reg) of
+                        R_SUBW,R_SUBD,R_SUBQ:
+                          RegTouchedByInstruction :=
+                            [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
+                             Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
+                             Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
+                             Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGCARRY:
+                          RegTouchedByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags,Ch_WCarryFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGPARITY:
+                          RegTouchedByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags,Ch_WParityFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGAUXILIARY:
+                          RegTouchedByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags,Ch_WAuxiliaryFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGZERO:
+                          RegTouchedByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags,Ch_WZeroFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGSIGN:
+                          RegTouchedByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags,Ch_WSignFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGOVERFLOW:
+                          RegTouchedByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags,Ch_WOverflowFlag,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGINTERRUPT:
+                          RegTouchedByInstruction:=[Ch_RFlags,Ch_RWFlags,Ch_WFlags]*Ch<>[];
+                        R_SUBFLAGDIRECTION:
+                          RegTouchedByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
+
+                        else
+                          internalerror(2018060130);
+                      end;
+                      exit;
+                    end;
+
+                  if ([Ch_RWOP1,Ch_ROP1,Ch_WOP1,Ch_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
+                    begin
+                      RegTouchedByInstruction := true;
+                      exit
+                    end;
+                  if ([Ch_RWOP2,Ch_ROP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
+                    begin
+                      RegTouchedByInstruction := true;
+                      exit
+                    end;
+                  if ([Ch_RWOP3,Ch_ROP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
+                    begin
+                      RegTouchedByInstruction := true;
+                      exit
+                    end;
+                  if ([Ch_RWOP4,Ch_ROP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
+                    begin
+                      RegTouchedByInstruction := true;
+                      exit
+                    end;
+                end;
+            end;
+        end;
+      end;
+
 {$ifdef DEBUG_AOPTCPU}
     procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
       begin
         asml.insertbefore(tai_comment.Create(strpnew(s)), p);
       end;
+
+    function debug_tostr(i: tcgint): string; inline;
+      begin
+        debug_tostr := tostr(i); { text form of i for use in debug messages }
+      end;
+
+    function debug_regname(r: TRegister): string; inline;
+      begin
+        debug_regname := '%' + std_regname(r); { standard register name with a '%' prefix }
+      end;
+
+    { Debug output function - creates an AT&T-style string for an operand:
+      "$value" for constants, "%reg" for registers and "offset(base,index,scale)"
+      for references; any other operand type yields "[UNKNOWN]" }
+    function debug_operstr(oper: TOper): string;
+      begin
+        case oper.typ of
+          top_const:
+            Result := '$' + debug_tostr(oper.val);
+          top_reg:
+            Result := debug_regname(oper.reg);
+          top_ref:
+            begin
+              { A zero displacement is left out }
+              if oper.ref^.offset <> 0 then
+                Result := debug_tostr(oper.ref^.offset) + '('
+              else
+                Result := '(';
+
+              if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
+                Result := Result + debug_regname(oper.ref^.base);
+
+              { The comma is kept without a base so that the index is not mistaken for one: "(,%index,scale)" }
+              if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
+                Result := Result + ',' + debug_regname(oper.ref^.index);
+
+              if (oper.ref^.scalefactor > 1) then
+                Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
+              else
+                Result := Result + ')';
+            end;
+          else
+            Result := '[UNKNOWN]';
+        end;
+      end;
+
 {$else DEBUG_AOPTCPU}
     procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
       begin
       end;
+
+    function debug_tostr(i: tcgint): string; inline;
+      begin
+        debug_tostr := ''; { stub used when DEBUG_AOPTCPU is not defined }
+      end;
+
+    function debug_regname(r: TRegister): string; inline;
+      begin
+        debug_regname := ''; { stub used when DEBUG_AOPTCPU is not defined }
+      end;
+
+    function debug_operstr(oper: TOper): string; inline;
+      begin
+        debug_operstr := ''; { stub used when DEBUG_AOPTCPU is not defined }
+      end;
 {$endif DEBUG_AOPTCPU}
 
 
@@ -2991,46 +3460,354 @@
 
+    { Post-peephole pass for a MOV instruction whose destination is a register.
+      - "mov $0,%reg" is turned into "xor %reg,%reg" if the flags are not in use;
+        "movq $imm,%reg64" is narrowed to "movl $imm,%reg32" for 32-bit immediates.
+      - With cs_opt_nodedfa enabled, the instructions following p are scanned:
+        later MOVs and address references reading the destination are redirected
+        to p's source, MOVs that repeat p are removed, and p itself is removed if
+        a MOV/LEA/MOVZX/MOVSX(D) overwrites its destination before it is read.
+      Returns True when p was converted to XOR or removed; ReoptimizeFlag is
+      additionally set when p is removed. }
     function TX86AsmOptimizer.PostPeepholeOptMov(const p : tai) : Boolean;
       var
-        Value, RegName: string;
+        RegName, SourceValueStr, Value, OldRef, NewRef: string; { <- Debug strings }
+        X: Integer; mov_val: tcgint;
+        hp1, hp_prev: tai; reg_source, reg_dest, new_reg: TRegister;
+        ConditionalJump: Boolean;
       begin
         Result:=false;
-        if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
+        if (taicpu(p).oper[1]^.typ = top_reg) then
           begin
+            if (taicpu(p).oper[0]^.typ = top_const) then
+              begin
 
-            case taicpu(p).oper[0]^.val of
-            0:
-              { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
-              if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-                begin
-                  { change "mov $0,%reg" into "xor %reg,%reg" }
-                  taicpu(p).opcode := A_XOR;
-                  taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
-                  Result := True;
+                case taicpu(p).oper[0]^.val of
+                0:
+                  { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
+                  if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+                    begin
+                      { change "mov $0,%reg" into "xor %reg,%reg" }
+                      taicpu(p).opcode := A_XOR;
+                      taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
+                      Result := True;
+                      { Don't exit yet - do the deep optimisation if DFA is enabled }
+                    end;
+                $1..$FFFFFFFF:
+                  begin
+                    { Code size reduction by J. Gareth "Kit" Moreton }
+                    { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
+                    case taicpu(p).opsize of
+                    S_Q:
+                      begin
+                        RegName := std_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
+                        Value := tostr(taicpu(p).oper[0]^.val);
+
+                        { The actual optimization }
+                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
+                        taicpu(p).changeopsize(S_L);
+
+                        DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',%' + RegName + ' -> movl $' + Value + ',%' + std_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
+                      end;
+                    end;
+                  end;
                 end;
-            $1..$FFFFFFFF:
+              end;
+
+            if (cs_opt_nodedfa in current_settings.optimizerswitches) then
               begin
-                { Code size reduction by J. Gareth "Kit" Moreton }
-                { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
-                case taicpu(p).opsize of
-                S_Q:
+                { Pipeline stall reduction by J. Gareth "Kit" Moreton }
+                reg_dest := taicpu(p).oper[1]^.reg;
+                RegName := debug_regname(reg_dest);
+
+                hp_prev := p;
+                ConditionalJump := False;
+
+                if (getsupreg(reg_dest) in [RS_EBP, RS_ESP]) then
+                  { Don't play around with stack pointers }
+                  Exit;
+
+                { Given "mov %reg1,%reg2", do a forward search for "mov %reg2,%reg3",
+                  where neither %reg1 nor %reg2 are modified in between (%reg1
+                  is allowed to be read though). This helps to minimise pipeline
+                  stalls if %reg2 is still being written to by the time execution
+                  reaches "mov %reg2,%reg3". }
+
+                if taicpu(p).opcode = A_XOR then
                   begin
-                    RegName := std_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
-                    Value := tostr(taicpu(p).oper[0]^.val);
+                    { Set the conditions for 'mov $0,%reg' }
+{$ifdef DEBUG_AOPTCPU}
+                    SourceValueStr := '$0';
+{$endif DEBUG_AOPTCPU}
+                    reg_source := NR_NO;
+                    mov_val := 0;
+                  end
+                else
+                  begin
+                    SourceValueStr := debug_operstr(taicpu(p).oper[0]^);
 
-                    { The actual optimization }
-                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
-                    taicpu(p).changeopsize(S_L);
+                    case taicpu(p).oper[0]^.typ of
+                      top_reg:
+                        begin
+                          reg_source := taicpu(p).oper[0]^.reg;
+                          mov_val := 0;
+                        end;
+                      top_const:
+                        begin
+                          reg_source := NR_NO;
+                          mov_val := taicpu(p).oper[0]^.val;
+                        end;
+                      top_ref:
+                        begin
+                          { Value initialisation to suppress warnings }
+                          reg_source := NR_NO;
+                          mov_val := 0;
+                        end;
+                      else
+                        InternalError(2018060700);
+                    end;
+                  end;
 
-                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',%' + RegName + ' -> movl $' + Value + ',%' + std_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
-                    Result := True;
+                while GetNextInstruction(hp_prev, hp1) do
+                  begin
+                    { Do not treat identical memory addresses as holding an equal
+                      value, due to the hazards of multithreading. }
+
+                    { TODO: Allow an exception for local memory on the stack, that
+                      is, the reference contains RSP }
+
+                    if hp1.typ <> ait_instruction then
+                      begin
+                        { If an unskippable label is encountered, then it is likely
+                          part of a loop and hence the register state may not be
+                          what's expected when the label is jumped to, so stop. }
+                        if (hp1.typ = ait_label) then
+                          Exit;
+
+                        hp_prev := hp1;
+                        Continue;
+                      end;
+
+                    { Check for the presence of reg_dest in references and replace with reg_source.
+                      Not done once p has become "xor %reg,%reg": oper[0] is a register then, but
+                      reg_source is NR_NO and must not be written into an address }
+                    if (taicpu(p).oper[0]^.typ = top_reg) and (taicpu(p).opcode <> A_XOR) and (taicpu(p).opsize = tcgsize2opsize[OS_ADDR]) then
+                      for X := 0 to taicpu(hp1).ops - 1 do
+                        if (taicpu(hp1).oper[X]^.typ = top_ref) then
+                          begin
+                            OldRef := debug_operstr(taicpu(hp1).oper[X]^);
+
+                            { Check to see if reg_dest is part of the source address }
+                            if (taicpu(hp1).oper[X]^.ref^.base = reg_dest) then
+                              taicpu(hp1).oper[X]^.ref^.base := reg_source;
+
+                            if (taicpu(hp1).oper[X]^.ref^.index = reg_dest) then
+                              taicpu(hp1).oper[X]^.ref^.index := reg_source;
+
+                            NewRef := debug_operstr(taicpu(hp1).oper[X]^);
+
+                            { Only print the debug message if the address actually changed }
+                            if OldRef <> NewRef then
+                              DebugMsg(SDeepOptimization + OldRef + ' -> ' + NewRef + ' to minimise pipeline stall', hp1);
+
+                            { We can break out because there will only be one reference in an instruction }
+                            Break;
+                          end;
+
+                    if taicpu(hp1).opcode = A_Jcc then
+                      { If a conditional jump occurs, the initial MOV must never be removed,
+                        but optimisations can still be made after it }
+                      ConditionalJump := True
+
+                    else if MatchInstruction(hp1, A_CALL, A_JMP, []) then
+                      { Don't trust anything after a call or unconditional jump }
+                      Exit
+
+                    { Handle MOVs }
+                    else if taicpu(hp1).opcode = A_MOV then
+                      begin
+                        if (taicpu(hp1).oper[0]^.typ = top_reg) then
+                          begin
+
+                            { Handle register reads }
+                            if
+                              (taicpu(p).oper[0]^.typ = top_reg) and (taicpu(p).opcode <> A_XOR) and
+{$ifdef x86_64}
+                              (
+                                ((taicpu(p).opsize = S_Q) and (taicpu(hp1).opsize = S_Q)) or
+                                (
+                                  (taicpu(hp1).opsize <> S_Q) and not SuperRegistersEqual(reg_source, taicpu(hp1).oper[0]^.reg) and
+                                  not ((taicpu(hp1).oper[1]^.typ = top_reg) and SuperRegistersEqual(reg_source, taicpu(hp1).oper[1]^.reg))
+                                )
+                              ) and
+                              { NOTE: The above check is required because the following sets the upper 32-bits of
+                                src to zero and will need deeper analysis to determine if this doesn't cause any
+                                side-effects:
+
+                                movl src,dst
+                                movl dst,src
+
+                                Hence it's only safe if both operations work on the full 64 bits of the registers.
+                                The destination of the second MOV is therefore compared with src as well, since
+                                that is the register whose upper half would be cleared.
+                              }
+{$endif x86_64}
+                              Reg1WriteOverwritesReg2Entirely(reg_dest, taicpu(hp1).oper[0]^.reg) then
+                              begin
+                                { Parameter is a subset of reg_dest that can be replaced with reg_source }
+                                new_reg := newreg(R_INTREGISTER, getsupreg(reg_source), getsubreg(taicpu(hp1).oper[0]^.reg));
+
+                                if (taicpu(hp1).oper[1]^.typ = top_reg) and (new_reg = taicpu(hp1).oper[1]^.reg) then
+                                  begin
+                                    { This MOV became a null operation as the value of reg_dest is being
+                                      written to reg_source (or a subset of it) when they are already equal }
+                                    DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(hp1).opsize] + ' ' + debug_regname(taicpu(hp1).oper[0]^.reg) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' removed because ' + RegName + ' and ' + SourceValueStr + ' are already equal in value', hp1);
+
+                                    asml.Remove(hp1);
+                                    hp1.Free;
+                                    hp1 := hp_prev;
+                                    Continue;
+                                  end;
+
+                                  Value := debug_operstr(taicpu(hp1).oper[1]^);
+
+                                  DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(hp1).opsize] + ' ' + debug_regname(taicpu(hp1).oper[0]^.reg) + ',' + Value +
+                                    ' -> mov' + gas_opsize2str[taicpu(hp1).opsize] + ' ' + debug_regname(new_reg) + ',' + Value + ' to minimise pipeline stall', hp1);
+
+                                  { The actual optimisation }
+                                  taicpu(hp1).oper[0]^.reg := new_reg;
+                              end
+                            else if not (cs_opt_size in current_settings.optimizerswitches) and { This optimisation tends to increase the code size }
+                              ((taicpu(p).oper[0]^.typ = top_const) or (taicpu(p).opcode = A_XOR)) and
+{$ifdef x86_64}
+                              { Only a sign-extended 32-bit immediate can be loaded directly into a 64-bit
+                                memory location. }
+                              not ((taicpu(hp1).opsize = S_Q) and (taicpu(hp1).oper[1]^.typ = top_ref) and ((mov_val < 0) or (mov_val > 2147483647))) and
+{$endif x86_64}
+                              Reg1WriteOverwritesReg2Entirely(reg_dest, taicpu(hp1).oper[0]^.reg) then
+                              begin
+                                { Replace register of known content with the value that it's equal to }
+                                OldRef := debug_regname(taicpu(hp1).oper[0]^.reg);
+                                Value := debug_operstr(taicpu(hp1).oper[1]^);
+
+                                { NOTE(review): mov_val is inserted unchanged even when hp1 reads only a
+                                  narrower sub-register of reg_dest - confirm it cannot exceed hp1's operand size }
+                                { The actual optimisation }
+                                taicpu(hp1).loadconst(0, mov_val);
+
+                                DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(hp1).opsize] + ' ' + OldRef + ',' + Value +
+                                  ' -> mov' + gas_opsize2str[taicpu(hp1).opsize] + ' $' + debug_tostr(mov_val) + ',' + Value + ' to minimise pipeline stall', hp1);
+                              end;
+                          end;
+
+                        { Handle register writes }
+                        if (taicpu(hp1).oper[1]^.typ = top_reg) then
+                          begin
+                            if Reg1WriteOverwritesReg2Entirely(reg_dest, taicpu(hp1).oper[1]^.reg) then
+                              begin
+                                if
+{$ifdef x86_64}
+                                  { Returns true if both S_Q or both not S_Q (i.e. the "eqv" operator) }
+                                  not ((taicpu(p).opsize = S_Q) xor (taicpu(hp1).opsize = S_Q)) and
+{$endif x86_64}
+                                  { Both alternatives are bracketed together: "and" binds tighter than "or", so
+                                    without the outer brackets the size check would only guard the constant case }
+                                  (
+                                    (
+                                      (taicpu(hp1).oper[0]^.typ = top_const) and
+                                      (((taicpu(p).oper[0]^.typ = top_const) or (taicpu(p).opcode = A_XOR)) and (taicpu(hp1).oper[0]^.val = mov_val))
+                                    ) or ((taicpu(hp1).oper[0]^.typ = top_reg) and Reg1WriteOverwritesReg2Entirely(reg_source, taicpu(hp1).oper[0]^.reg))
+                                  ) then
+                                  begin
+                                    { p and hp1 are identical instructions, or hp1 writes a small portion
+                                      of the first value to the same part of reg_dest, so remove the second one }
+                                    DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(hp1).opsize] + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' removed because it writes the same value to ' + RegName + ' from earlier', hp1);
+
+                                    asml.Remove(hp1);
+                                    hp1.Free;
+                                    hp1 := hp_prev;
+                                    Continue;
+                                  end
+                                else if Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, reg_dest) and
+                                  not reginop(reg_dest, taicpu(hp1).oper[0]^) and not ConditionalJump then
+                                  begin
+                                    { If the instruction still reads from reg_dest, then it's part of a reference }
+
+                                    { p and hp1 write different values to the destination register, but
+                                      the destination register is not read from in between, so remove the
+                                      first instruction }
+
+                                    DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(p).opsize] + ' ' + SourceValueStr + ',' + RegName + ' removed because this value for ' + RegName + ' is never used (MOV)', p);
+
+                                    if not GetNextInstruction(p, hp1) then
+                                      InternalError(2018060100);
+
+                                    { NOTE(review): p is a "const" parameter here, so "p := hp1" below cannot
+                                      update the caller's pointer, which would still refer to the freed
+                                      instruction - confirm whether p should be a "var" parameter (the same
+                                      applies to the LEA/MOVZX/MOVSX removal further down) }
+                                    asml.Remove(p);
+                                    p.free;
+                                    Result := True;
+                                    p := hp1;
+
+                                    { The instruction's removal may allow for further optimisations earlier in the
+                                      code if it's determined that the values are not used, so try for another scan }
+                                    ReoptimizeFlag := True;
+                                    Exit;
+                                  end;
+                              end
+                          end;
+                      end
+{$ifdef x86_64}
+                    else if MatchInstruction(hp1, [A_LEA, A_MOVZX, A_MOVSX, A_MOVSXD], []) then
+{$else x86_64}
+                    else if MatchInstruction(hp1, [A_LEA, A_MOVZX, A_MOVSX], []) then
+{$endif x86_64}
+                      begin
+                        if not reginop(reg_dest, taicpu(hp1).oper[0]^) and Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, reg_dest) and not ConditionalJump then
+                          begin
+                            { In this instance, LEA acts as a MOV command with some simple arithmetic.
+                              If reg_dest doesn't appear in the reference, it means that the MOV
+                              command earlier writes a value to reg_dest that is never used and can be
+                              safely removed, just so long as there's no conditional jump in between.
+
+                              Similarly with MOVZX, MOVSX and MOVSXD. }
+
+                            DebugMsg(SDeepOptimization + 'mov' + gas_opsize2str[taicpu(p).opsize] + ' ' + SourceValueStr + ',' + RegName + ' removed because this value for ' + RegName + ' is never used (' + UpCase(gas_op2str[taicpu(hp1).opcode]) + ')', p);
+
+                            if not GetNextInstruction(p, hp1) then
+                              InternalError(2018060101);
+
+                            asml.Remove(p);
+                            p.free;
+                            Result := True;
+                            p := hp1;
+
+                            { The instruction's removal may allow for further optimisations earlier in the
+                              code if it's determined that the values are not used, so try for another scan }
+                            ReoptimizeFlag := True;
+                            Exit;
+                          end;
+                      end;
+
+                    if (taicpu(p).oper[0]^.typ = top_reg) and RegModifiedByInstruction(reg_source, hp1) then
+                      { Source value has changed }
+                      Exit;
+
+                    if RegTouchedByInstruction(reg_dest, hp1) then
+                      begin
+                        { If written to, then reg_dest no longer equals reg_source;
+                          if read from despite the above attempts to change it, then
+                          we cannot improve on the pipeline stall any further. }
+                        Exit;
+
+                      end;
+
+                    hp_prev := hp1;
                   end;
-                end;
               end;
             end;
-          end;
-      end;
+        end;
 
-
     function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
       begin
         Result:=false;
