Re: Trapsleds

2017-06-21 Thread Mike Larkin
On Tue, Jun 20, 2017 at 10:34:00PM -0400, Todd Mortimer wrote:
> > 2. This patch also hits NOP sleds > 8 bytes on i386. We could also hit
> > the NOP sleds between 3 and 7 bytes if there are no objections.
> 
> The attached diff implements the same trapsled mechanism for i386 and
> amd64 for all padding sequences between 3 and 15 bytes.
> 
> I have put this through a kernel and base build on i386 without apparent
> ill effect, and the amd64 parts are unchanged from the last diff.
> 
> Todd
> 
> 

reads ok to me, thanks again for your work here.

-ml


> Index: gas/config/tc-i386.c
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/gas/config/tc-i386.c,v
> retrieving revision 1.7
> diff -u -p -u -p -r1.7 tc-i386.c
> --- gas/config/tc-i386.c  4 Jun 2017 20:26:18 -   1.7
> +++ gas/config/tc-i386.c  21 Jun 2017 00:43:14 -
> @@ -505,41 +505,9 @@ i386_align_code (fragP, count)
>  {0x90};  /* nop  */
>static const char f32_2[] =
>  {0x89,0xf6}; /* movl %esi,%esi   */
> -  static const char f32_3[] =
> -{0x8d,0x76,0x00};/* leal 0(%esi),%esi
> */
> -  static const char f32_4[] =
> -{0x8d,0x74,0x26,0x00};   /* leal 0(%esi,1),%esi  */
> -  static const char f32_5[] =
> -{0x90,   /* nop  */
> - 0x8d,0x74,0x26,0x00};   /* leal 0(%esi,1),%esi  */
> -  static const char f32_6[] =
> -{0x8d,0xb6,0x00,0x00,0x00,0x00}; /* leal 0L(%esi),%esi   */
> -  static const char f32_7[] =
> -{0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};/* leal 0L(%esi,1),%esi */
> -  static const char f32_8[] =
> -{0x90,   /* nop  */
> - 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};/* leal 0L(%esi,1),%esi */
> -  static const char f32_9[] =
> -{0x89,0xf6,  /* movl %esi,%esi   
> */
> - 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
> -  static const char f32_10[] =
> -{0x8d,0x76,0x00, /* leal 0(%esi),%esi*/
> - 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
> -  static const char f32_11[] =
> -{0x8d,0x74,0x26,0x00,/* leal 0(%esi,1),%esi  */
> - 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
> -  static const char f32_12[] =
> -{0x8d,0xb6,0x00,0x00,0x00,0x00,  /* leal 0L(%esi),%esi   */
> - 0x8d,0xbf,0x00,0x00,0x00,0x00}; /* leal 0L(%edi),%edi   */
> -  static const char f32_13[] =
> -{0x8d,0xb6,0x00,0x00,0x00,0x00,  /* leal 0L(%esi),%esi   */
> - 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
> -  static const char f32_14[] =
> -{0x8d,0xb4,0x26,0x00,0x00,0x00,0x00, /* leal 0L(%esi,1),%esi */
> - 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
>static const char f32_15[] =
> -{0xeb,0x0d,0x90,0x90,0x90,0x90,0x90, /* jmp .+15; lotsa nops */
> - 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
> +{0xeb,0x0d,0xCC,0xCC,0xCC,0xCC,0xCC, /* jmp .+15; lotsa int3 */
> + 0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC};
>static const char f16_3[] =
>  {0x8d,0x74,0x00};/* lea 0(%esi),%esi 
> */
>static const char f16_4[] =
> @@ -556,40 +524,31 @@ i386_align_code (fragP, count)
>static const char f16_8[] =
>  {0x8d,0xb4,0x00,0x00,/* lea 0w(%si),%si  */
>   0x8d,0xbd,0x00,0x00};   /* lea 0w(%di),%di  */
> +  static const char f64_2[] =
> +{0x66,0x90};/* data16, nop*/
>static const char *const f32_patt[] = {
> -f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
> -f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
> +f32_1, f32_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
> +f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
>};
>static const char *const f16_patt[] = {
>  f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
>  f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
>};
> +  static const char *const f64_patt[] = {
> +f32_1, f64_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
> +f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
> +  };
>  
>if (count <= 0 || count > 15)
>  return;
>  
> -  /* The recommended way to pad 64bit code is to use NOPs preceded by
> - maximally four 0x66 prefixes.  Balance the size of nops.  */
>if (flag_code == CODE_64BIT)
>  {
> -  int i;
> -  int nnops = (count + 3) / 4;
> -  int len = count / nnops;
> -  int remains = count - nnops * len;
> -  int pos = 0;
> -
> -  for (i = 0; i < remains; i++)
> - {
> -   memset (fragP->fr_literal + fragP->fr_fix 

Re: Trapsleds

2017-06-20 Thread Todd Mortimer
> 2. This patch also hits NOP sleds > 8 bytes on i386. We could also hit
> the NOP sleds between 3 and 7 bytes if there are no objections.

The attached diff implements the same trapsled mechanism for i386 and
amd64 for all padding sequences between 3 and 15 bytes.

I have put this through a kernel and base build on i386 without apparent
ill effect, and the amd64 parts are unchanged from the last diff.

Todd


Index: gas/config/tc-i386.c
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/gas/config/tc-i386.c,v
retrieving revision 1.7
diff -u -p -u -p -r1.7 tc-i386.c
--- gas/config/tc-i386.c4 Jun 2017 20:26:18 -   1.7
+++ gas/config/tc-i386.c21 Jun 2017 00:43:14 -
@@ -505,41 +505,9 @@ i386_align_code (fragP, count)
 {0x90};/* nop  */
   static const char f32_2[] =
 {0x89,0xf6};   /* movl %esi,%esi   */
-  static const char f32_3[] =
-{0x8d,0x76,0x00};  /* leal 0(%esi),%esi*/
-  static const char f32_4[] =
-{0x8d,0x74,0x26,0x00}; /* leal 0(%esi,1),%esi  */
-  static const char f32_5[] =
-{0x90, /* nop  */
- 0x8d,0x74,0x26,0x00}; /* leal 0(%esi,1),%esi  */
-  static const char f32_6[] =
-{0x8d,0xb6,0x00,0x00,0x00,0x00};   /* leal 0L(%esi),%esi   */
-  static const char f32_7[] =
-{0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};  /* leal 0L(%esi,1),%esi */
-  static const char f32_8[] =
-{0x90, /* nop  */
- 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};  /* leal 0L(%esi,1),%esi */
-  static const char f32_9[] =
-{0x89,0xf6,/* movl %esi,%esi   
*/
- 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
-  static const char f32_10[] =
-{0x8d,0x76,0x00,   /* leal 0(%esi),%esi*/
- 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
-  static const char f32_11[] =
-{0x8d,0x74,0x26,0x00,  /* leal 0(%esi,1),%esi  */
- 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
-  static const char f32_12[] =
-{0x8d,0xb6,0x00,0x00,0x00,0x00,/* leal 0L(%esi),%esi   */
- 0x8d,0xbf,0x00,0x00,0x00,0x00};   /* leal 0L(%edi),%edi   */
-  static const char f32_13[] =
-{0x8d,0xb6,0x00,0x00,0x00,0x00,/* leal 0L(%esi),%esi   */
- 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
-  static const char f32_14[] =
-{0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,   /* leal 0L(%esi,1),%esi */
- 0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
   static const char f32_15[] =
-{0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,   /* jmp .+15; lotsa nops */
- 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
+{0xeb,0x0d,0xCC,0xCC,0xCC,0xCC,0xCC,   /* jmp .+15; lotsa int3 */
+ 0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC};
   static const char f16_3[] =
 {0x8d,0x74,0x00};  /* lea 0(%esi),%esi */
   static const char f16_4[] =
@@ -556,40 +524,31 @@ i386_align_code (fragP, count)
   static const char f16_8[] =
 {0x8d,0xb4,0x00,0x00,  /* lea 0w(%si),%si  */
  0x8d,0xbd,0x00,0x00}; /* lea 0w(%di),%di  */
+  static const char f64_2[] =
+{0x66,0x90};/* data16, nop*/
   static const char *const f32_patt[] = {
-f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
-f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
+f32_1, f32_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
+f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
   };
   static const char *const f16_patt[] = {
 f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
 f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
   };
+  static const char *const f64_patt[] = {
+f32_1, f64_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
+f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
+  };
 
   if (count <= 0 || count > 15)
 return;
 
-  /* The recommended way to pad 64bit code is to use NOPs preceded by
- maximally four 0x66 prefixes.  Balance the size of nops.  */
   if (flag_code == CODE_64BIT)
 {
-  int i;
-  int nnops = (count + 3) / 4;
-  int len = count / nnops;
-  int remains = count - nnops * len;
-  int pos = 0;
-
-  for (i = 0; i < remains; i++)
-   {
- memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len);
- fragP->fr_literal[fragP->fr_fix + pos + len] = 0x90;
- pos += len + 1;
-   }
-  for (; i < nnops; i++)
-   {
- memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len - 1);
- fragP->fr_literal[fragP->fr_fix + pos +

Re: Trapsleds

2017-06-19 Thread Bryan Steele
On Mon, Jun 19, 2017 at 09:22:57PM -0400, Todd Mortimer wrote:
> Hello tech,
> 
> I have attached a patch that converts NOP padding from the assembler
> into INT3 padding on amd64. The idea is to remove potentially convenient
> NOP sleds from programs and libraries, which makes it harder for an
> attacker to hit any ROP gadgets or other instructions after a NOP sled. 
> 
> NOP sleds are used for text alignment in order to get jump targets onto
> 16 byte boundaries. They can appear both in the middle of a function
> and at the end. The trapsleds implemented in this diff convert NOP sleds
> longer than 2 bytes from a series of 0x6690 instructions to a 2 byte
> short JMP over a series of INT3 instructions that fill the rest of the
> gap. Programs that would have normally just slid through the NOP sled
> will now jump over. An attacker trying to hit the NOP sled will now get
> a core dump.
> 
> I have been running this on my system for over a week without any
> apparent ill effects. Specifically, there don't appear to be any
> performance penalties associated with doing this. A full base build
> on a system completely converted over to this took slightly less time to
> complete than the same build on a normal system, and my synthetic
> testing shows trapsleds perform similarly to nopsleds (performance
> difference was <1%, which is within error over multiple runs).
> 
> If people like this, I can do up the equivalent diff for clang.
> 
> Things that could be improved:
> 
> 1. For padding inserted at the end of a function, the JMP is
> unnecessary, and could also be a 0xCC. I am going to have a go at gcc
> to see if I can coerce it into distinguishing end-of-function padding
> from padding that is intended to be executed. If some kind soul with gcc
> experience knows where I should look, any pointers would be welcome - my
> previous attempt was not fruitful.
> 
> 2. This patch also hits NOP sleds > 8 bytes on i386. We could also hit
> the NOP sleds between 3 and 7 bytes if there are no objections.
> 
> Comments and suggestions are welcome. Thanks to Theo for suggesting it
> in the hallway track at BSDCan. 
> 
> Todd
> 

> Index: gas/config/tc-i386.c
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/gas/config/tc-i386.c,v
> retrieving revision 1.7
> diff -u -p -u -p -r1.7 tc-i386.c
> --- gas/config/tc-i386.c  4 Jun 2017 20:26:18 -   1.7
> +++ gas/config/tc-i386.c  20 Jun 2017 00:36:27 -
> @@ -538,8 +538,8 @@ i386_align_code (fragP, count)
>  {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00, /* leal 0L(%esi,1),%esi */
>   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
>static const char f32_15[] =
> -{0xeb,0x0d,0x90,0x90,0x90,0x90,0x90, /* jmp .+15; lotsa nops */
> - 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
> +{0xeb,0x0d,0xCC,0xCC,0xCC,0xCC,0xCC, /* jmp .+15; lotsa int3 */
> + 0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC};
>static const char f16_3[] =
>  {0x8d,0x74,0x00};/* lea 0(%esi),%esi 
> */
>static const char f16_4[] =
> @@ -556,6 +556,8 @@ i386_align_code (fragP, count)
>static const char f16_8[] =
>  {0x8d,0xb4,0x00,0x00,/* lea 0w(%si),%si  */
>   0x8d,0xbd,0x00,0x00};   /* lea 0w(%di),%di  */
> +  static const char f64_2[] =
> +{0x66,0x90};/* data16, nop*/
>static const char *const f32_patt[] = {
>  f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
>  f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
> @@ -564,32 +566,21 @@ i386_align_code (fragP, count)
>  f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
>  f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
>};
> +  static const char *const f64_patt[] = {
> +f32_1, f64_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
> +f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
> +  };
>  
>if (count <= 0 || count > 15)
>  return;
>  
> -  /* The recommended way to pad 64bit code is to use NOPs preceded by
> - maximally four 0x66 prefixes.  Balance the size of nops.  */
>if (flag_code == CODE_64BIT)
>  {
> -  int i;
> -  int nnops = (count + 3) / 4;
> -  int len = count / nnops;
> -  int remains = count - nnops * len;
> -  int pos = 0;
> -
> -  for (i = 0; i < remains; i++)
> - {
> -   memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len);
> -   fragP->fr_literal[fragP->fr_fix + pos + len] = 0x90;
> -   pos += len + 1;
> - }
> -  for (; i < nnops;

Re: Trapsleds

2017-06-19 Thread Mike Larkin
On Mon, Jun 19, 2017 at 09:22:57PM -0400, Todd Mortimer wrote:
> Hello tech,
> 
> I have attached a patch that converts NOP padding from the assembler
> into INT3 padding on amd64. The idea is to remove potentially convenient
> NOP sleds from programs and libraries, which makes it harder for an
> attacker to hit any ROP gadgets or other instructions after a NOP sled. 
> 
> NOP sleds are used for text alignment in order to get jump targets onto
> 16 byte boundaries. They can appear both in the middle of a function
> and at the end. The trapsleds implemented in this diff convert NOP sleds
> longer than 2 bytes from a series of 0x6690 instructions to a 2 byte
> short JMP over a series of INT3 instructions that fill the rest of the
> gap. Programs that would have normally just slid through the NOP sled
> will now jump over. An attacker trying to hit the NOP sled will now get
> a core dump.
> 
> I have been running this on my system for over a week without any
> apparent ill effects. Specifically, there don't appear to be any
> performance penalties associated with doing this. A full base build
> on a system completely converted over to this took slightly less time to
> complete than the same build on a normal system, and my synthetic
> testing shows trapsleds perform similarly to nopsleds (performance
> difference was <1%, which is within error over multiple runs).
> 
> If people like this, I can do up the equivalent diff for clang.
> 
> Things that could be improved:
> 
> 1. For padding inserted at the end of a function, the JMP is
> unnecessary, and could also be a 0xCC. I am going to have a go at gcc
> to see if I can coerce it into distinguishing end-of-function padding
> from padding that is intended to be executed. If some kind soul with gcc
> experience knows where I should look, any pointers would be welcome - my
> previous attempt was not fruitful.
> 
> 2. This patch also hits NOP sleds > 8 bytes on i386. We could also hit
> the NOP sleds between 3 and 7 bytes if there are no objections.
> 
> Comments and suggestions are welcome. Thanks to Theo for suggesting it
> in the hallway track at BSDCan. 
> 
> Todd
> 

Nice, well done! I had this on my to do list for a while now and I'm happy
to see someone beat me to it.

-ml

> Index: gas/config/tc-i386.c
> ===
> RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/gas/config/tc-i386.c,v
> retrieving revision 1.7
> diff -u -p -u -p -r1.7 tc-i386.c
> --- gas/config/tc-i386.c  4 Jun 2017 20:26:18 -   1.7
> +++ gas/config/tc-i386.c  20 Jun 2017 00:36:27 -
> @@ -538,8 +538,8 @@ i386_align_code (fragP, count)
>  {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00, /* leal 0L(%esi,1),%esi */
>   0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};/* leal 0L(%edi,1),%edi */
>static const char f32_15[] =
> -{0xeb,0x0d,0x90,0x90,0x90,0x90,0x90, /* jmp .+15; lotsa nops */
> - 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
> +{0xeb,0x0d,0xCC,0xCC,0xCC,0xCC,0xCC, /* jmp .+15; lotsa int3 */
> + 0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC};
>static const char f16_3[] =
>  {0x8d,0x74,0x00};/* lea 0(%esi),%esi 
> */
>static const char f16_4[] =
> @@ -556,6 +556,8 @@ i386_align_code (fragP, count)
>static const char f16_8[] =
>  {0x8d,0xb4,0x00,0x00,/* lea 0w(%si),%si  */
>   0x8d,0xbd,0x00,0x00};   /* lea 0w(%di),%di  */
> +  static const char f64_2[] =
> +{0x66,0x90};/* data16, nop*/
>static const char *const f32_patt[] = {
>  f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
>  f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
> @@ -564,32 +566,21 @@ i386_align_code (fragP, count)
>  f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
>  f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
>};
> +  static const char *const f64_patt[] = {
> +f32_1, f64_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
> +f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
> +  };
>  
>if (count <= 0 || count > 15)
>  return;
>  
> -  /* The recommended way to pad 64bit code is to use NOPs preceded by
> - maximally four 0x66 prefixes.  Balance the size of nops.  */
>if (flag_code == CODE_64BIT)
>  {
> -  int i;
> -  int nnops = (count + 3) / 4;
> -  int len = count / nnops;
> -  int remains = count - nnops * len;
> -  int pos = 0;
> -
> -  for (i = 0; i < remains; i++)
> - {
> -   memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len);
> -   fragP->fr_lit

Trapsleds

2017-06-19 Thread Todd Mortimer
Hello tech,

I have attached a patch that converts NOP padding from the assembler
into INT3 padding on amd64. The idea is to remove potentially convenient
NOP sleds from programs and libraries, which makes it harder for an
attacker to hit any ROP gadgets or other instructions after a NOP sled. 

NOP sleds are used for text alignment in order to get jump targets onto
16 byte boundaries. They can appear both in the middle of a function
and at the end. The trapsleds implemented in this diff convert NOP sleds
longer than 2 bytes from a series of 0x6690 instructions to a 2 byte
short JMP over a series of INT3 instructions that fill the rest of the
gap. Programs that would have normally just slid through the NOP sled
will now jump over. An attacker trying to hit the NOP sled will now get
a core dump.

I have been running this on my system for over a week without any
apparent ill effects. Specifically, there don't appear to be any
performance penalties associated with doing this. A full base build
on a system completely converted over to this took slightly less time to
complete than the same build on a normal system, and my synthetic
testing shows trapsleds perform similarly to nopsleds (performance
difference was <1%, which is within error over multiple runs).

If people like this, I can do up the equivalent diff for clang.

Things that could be improved:

1. For padding inserted at the end of a function, the JMP is
unnecessary, and could also be a 0xCC. I am going to have a go at gcc
to see if I can coerce it into distinguishing end-of-function padding
from padding that is intended to be executed. If some kind soul with gcc
experience knows where I should look, any pointers would be welcome - my
previous attempt was not fruitful.

2. This patch also hits NOP sleds > 8 bytes on i386. We could also hit
the NOP sleds between 3 and 7 bytes if there are no objections.

Comments and suggestions are welcome. Thanks to Theo for suggesting it
in the hallway track at BSDCan. 

Todd

Index: gas/config/tc-i386.c
===
RCS file: /cvs/src/gnu/usr.bin/binutils-2.17/gas/config/tc-i386.c,v
retrieving revision 1.7
diff -u -p -u -p -r1.7 tc-i386.c
--- gas/config/tc-i386.c4 Jun 2017 20:26:18 -   1.7
+++ gas/config/tc-i386.c20 Jun 2017 00:36:27 -
@@ -538,8 +538,8 @@ i386_align_code (fragP, count)
 {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,   /* leal 0L(%esi,1),%esi */
  0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};  /* leal 0L(%edi,1),%edi */
   static const char f32_15[] =
-{0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,   /* jmp .+15; lotsa nops */
- 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
+{0xeb,0x0d,0xCC,0xCC,0xCC,0xCC,0xCC,   /* jmp .+15; lotsa int3 */
+ 0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC,0xCC};
   static const char f16_3[] =
 {0x8d,0x74,0x00};  /* lea 0(%esi),%esi */
   static const char f16_4[] =
@@ -556,6 +556,8 @@ i386_align_code (fragP, count)
   static const char f16_8[] =
 {0x8d,0xb4,0x00,0x00,  /* lea 0w(%si),%si  */
  0x8d,0xbd,0x00,0x00}; /* lea 0w(%di),%di  */
+  static const char f64_2[] =
+{0x66,0x90};/* data16, nop*/
   static const char *const f32_patt[] = {
 f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
 f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
@@ -564,32 +566,21 @@ i386_align_code (fragP, count)
 f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
 f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
   };
+  static const char *const f64_patt[] = {
+f32_1, f64_2, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15,
+f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
+  };
 
   if (count <= 0 || count > 15)
 return;
 
-  /* The recommended way to pad 64bit code is to use NOPs preceded by
- maximally four 0x66 prefixes.  Balance the size of nops.  */
   if (flag_code == CODE_64BIT)
 {
-  int i;
-  int nnops = (count + 3) / 4;
-  int len = count / nnops;
-  int remains = count - nnops * len;
-  int pos = 0;
-
-  for (i = 0; i < remains; i++)
-   {
- memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len);
- fragP->fr_literal[fragP->fr_fix + pos + len] = 0x90;
- pos += len + 1;
-   }
-  for (; i < nnops; i++)
-   {
- memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len - 1);
- fragP->fr_literal[fragP->fr_fix + pos + len - 1] = 0x90;
- pos += len;
-   }
+  memcpy(fragP->fr_literal + fragP->fr_fix,
+  f64_patt[count -1], count);
+if (count > 2)
+  /* Adjust jump offset */
+  fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
 }
   else
 if (flag_code == CODE_16BIT)